{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999969938373666, "eval_steps": 500, "global_step": 16632, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.05028445, "auxiliary_loss_mlp": 0.02215396, "balance_loss_clip": 1.76983953, "balance_loss_mlp": 2.43573999, "epoch": 6.012325266796934e-05, "flos": 24456507091200.0, "grad_norm": 55.0182521788336, "language_loss": 2.85272503, "learning_rate": 0.0, "loss": 1.94613922, "num_input_tokens_seen": 19155, "router_z_loss_clip": 4.4375, "router_z_loss_mlp": 26.0, "step": 1, "time_per_iteration": 22.945544719696045 }, { "auxiliary_loss_clip": 0.03380186, "auxiliary_loss_mlp": 0.01460843, "balance_loss_clip": 1.19095325, "balance_loss_mlp": 1.62771606, "epoch": 0.00012024650533593868, "flos": 20225931246720.0, "grad_norm": 34.62643469867135, "language_loss": 1.82596803, "learning_rate": 4e-06, "loss": 1.8743782, "num_input_tokens_seen": 36175, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 17.5, "step": 2, "time_per_iteration": 2.5352511405944824 }, { "auxiliary_loss_clip": 0.03320145, "auxiliary_loss_mlp": 0.01440918, "balance_loss_clip": 1.18857622, "balance_loss_mlp": 1.6254766, "epoch": 0.000180369758003908, "flos": 22309935454080.0, "grad_norm": 33.67025628076908, "language_loss": 1.57590365, "learning_rate": 3.999999964312572e-06, "loss": 1.62351418, "num_input_tokens_seen": 54870, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 17.0, "step": 3, "time_per_iteration": 2.4825499057769775 }, { "auxiliary_loss_clip": 0.03344148, "auxiliary_loss_mlp": 0.01449154, "balance_loss_clip": 1.15599442, "balance_loss_mlp": 1.6226294, "epoch": 0.00024049301067187735, "flos": 22414650577920.0, "grad_norm": 24.01683476173993, "language_loss": 1.37193835, "learning_rate": 3.99999985725029e-06, "loss": 1.41987133, "num_input_tokens_seen": 74575, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 17.25, "step": 4, "time_per_iteration": 2.6461877822875977 }, { "auxiliary_loss_clip": 0.03364734, "auxiliary_loss_mlp": 0.01493453, "balance_loss_clip": 1.21116519, "balance_loss_mlp": 1.62210393, "epoch": 0.0003006162633398467, "flos": 21396978449280.0, "grad_norm": 21.680100443653572, "language_loss": 1.41663432, "learning_rate": 3.999999678813158e-06, "loss": 1.46521616, "num_input_tokens_seen": 92580, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 17.5, "step": 5, "time_per_iteration": 2.7182352542877197 }, { "auxiliary_loss_clip": 0.03289292, "auxiliary_loss_mlp": 0.0149112, "balance_loss_clip": 1.20921433, "balance_loss_mlp": 1.6078527, "epoch": 0.000360739516007816, "flos": 21652375127040.0, "grad_norm": 6.674916681228244, "language_loss": 1.17528927, "learning_rate": 3.999999429001183e-06, "loss": 1.22309351, "num_input_tokens_seen": 109705, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 16.75, "step": 6, "time_per_iteration": 2.7764434814453125 }, { "auxiliary_loss_clip": 0.03215872, "auxiliary_loss_mlp": 0.01438721, "balance_loss_clip": 1.17722356, "balance_loss_mlp": 1.60342479, "epoch": 0.0004208627686757854, "flos": 27159742897920.0, "grad_norm": 4.907029504088935, "language_loss": 1.14647281, "learning_rate": 3.9999991078143714e-06, "loss": 1.19301867, "num_input_tokens_seen": 129425, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 16.125, "step": 7, "time_per_iteration": 2.800668478012085 }, { "auxiliary_loss_clip": 0.03166129, "auxiliary_loss_mlp": 0.0139772, "balance_loss_clip": 1.14404249, "balance_loss_mlp": 1.59850562, "epoch": 0.0004809860213437547, "flos": 31319096135040.0, "grad_norm": 6.631832274448743, "language_loss": 0.95500398, "learning_rate": 3.999998715252736e-06, "loss": 1.00064254, "num_input_tokens_seen": 149210, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 15.6875, "step": 8, "time_per_iteration": 2.9442672729492188 }, { "auxiliary_loss_clip": 0.03180369, "auxiliary_loss_mlp": 0.01429696, "balance_loss_clip": 1.17544663, "balance_loss_mlp": 1.59536433, "epoch": 0.000541109274011724, "flos": 32160411463680.0, "grad_norm": 4.309421462608331, "language_loss": 1.11496198, "learning_rate": 3.999998251316293e-06, "loss": 1.16106272, "num_input_tokens_seen": 169055, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 15.875, "step": 9, "time_per_iteration": 2.8559279441833496 }, { "auxiliary_loss_clip": 0.0306814, "auxiliary_loss_mlp": 0.0140191, "balance_loss_clip": 1.16959453, "balance_loss_mlp": 1.59435844, "epoch": 0.0006012325266796934, "flos": 18916808163840.0, "grad_norm": 3.4472817516934455, "language_loss": 1.06159806, "learning_rate": 3.9999977160050555e-06, "loss": 1.10629857, "num_input_tokens_seen": 188045, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 14.75, "step": 10, "time_per_iteration": 2.7172327041625977 }, { "auxiliary_loss_clip": 0.02965079, "auxiliary_loss_mlp": 0.01368837, "balance_loss_clip": 1.15244818, "balance_loss_mlp": 1.57718801, "epoch": 0.0006613557793476627, "flos": 20774861867520.0, "grad_norm": 5.714859970142246, "language_loss": 1.09359026, "learning_rate": 3.9999971093190445e-06, "loss": 1.13692951, "num_input_tokens_seen": 207035, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 13.875, "step": 11, "time_per_iteration": 2.6762027740478516 }, { "auxiliary_loss_clip": 0.02833837, "auxiliary_loss_mlp": 0.0133198, "balance_loss_clip": 1.12169433, "balance_loss_mlp": 1.557042, "epoch": 0.000721479032015632, "flos": 16581680997120.0, "grad_norm": 3.535022888905977, "language_loss": 1.08959699, "learning_rate": 3.999996431258282e-06, "loss": 1.13125527, "num_input_tokens_seen": 223225, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 12.75, "step": 12, "time_per_iteration": 2.7126104831695557 }, { "auxiliary_loss_clip": 0.02811598, "auxiliary_loss_mlp": 0.01284024, "balance_loss_clip": 1.0953871, "balance_loss_mlp": 1.56122184, "epoch": 0.0007816022846836014, "flos": 23805471144960.0, "grad_norm": 3.0503928722685236, "language_loss": 0.99238098, "learning_rate": 3.999995681822791e-06, "loss": 1.03333712, "num_input_tokens_seen": 242570, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 12.5, "step": 13, "time_per_iteration": 2.6825122833251953 }, { "auxiliary_loss_clip": 0.02745491, "auxiliary_loss_mlp": 0.01324712, "balance_loss_clip": 1.13292789, "balance_loss_mlp": 1.54845357, "epoch": 0.0008417255373515708, "flos": 19172204841600.0, "grad_norm": 2.5452058707440246, "language_loss": 1.05650008, "learning_rate": 3.999994861012598e-06, "loss": 1.09720206, "num_input_tokens_seen": 261215, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 12.0, "step": 14, "time_per_iteration": 2.7167434692382812 }, { "auxiliary_loss_clip": 0.02684302, "auxiliary_loss_mlp": 0.01295707, "balance_loss_clip": 1.11384094, "balance_loss_mlp": 1.5449543, "epoch": 0.00090184879001954, "flos": 26395564026240.0, "grad_norm": 8.263795122382318, "language_loss": 0.98025811, "learning_rate": 3.999993968827733e-06, "loss": 1.02005816, "num_input_tokens_seen": 280035, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 11.375, "step": 15, "time_per_iteration": 2.794675588607788 }, { "auxiliary_loss_clip": 0.02614957, "auxiliary_loss_mlp": 0.01286248, "balance_loss_clip": 1.120785, "balance_loss_mlp": 1.52328587, "epoch": 0.0009619720426875094, "flos": 24679500785280.0, "grad_norm": 2.675297635793713, "language_loss": 0.99156678, "learning_rate": 3.999993005268228e-06, "loss": 1.03057885, "num_input_tokens_seen": 300265, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 10.9375, "step": 16, "time_per_iteration": 2.846781015396118 }, { "auxiliary_loss_clip": 0.02569221, "auxiliary_loss_mlp": 0.01294684, "balance_loss_clip": 1.14057028, "balance_loss_mlp": 1.52408361, "epoch": 0.0010220952953554788, "flos": 18624531196800.0, "grad_norm": 3.1345844494929436, "language_loss": 1.01092076, "learning_rate": 3.999991970334118e-06, "loss": 1.04955983, "num_input_tokens_seen": 317375, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 10.4375, "step": 17, "time_per_iteration": 4.468073606491089 }, { "auxiliary_loss_clip": 0.0239225, "auxiliary_loss_mlp": 0.01279105, "balance_loss_clip": 1.13586295, "balance_loss_mlp": 1.477314, "epoch": 0.001082218548023448, "flos": 26142537646080.0, "grad_norm": 2.1711376461295604, "language_loss": 0.9967609, "learning_rate": 3.999990864025439e-06, "loss": 1.03347445, "num_input_tokens_seen": 337975, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 9.125, "step": 18, "time_per_iteration": 4.234963417053223 }, { "auxiliary_loss_clip": 0.02339843, "auxiliary_loss_mlp": 0.0127945, "balance_loss_clip": 1.12571764, "balance_loss_mlp": 1.46378553, "epoch": 0.0011423418006914173, "flos": 19609776322560.0, "grad_norm": 2.623073884511166, "language_loss": 0.90929437, "learning_rate": 3.99998968634223e-06, "loss": 0.94548726, "num_input_tokens_seen": 356635, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 8.75, "step": 19, "time_per_iteration": 2.803426504135132 }, { "auxiliary_loss_clip": 0.02247358, "auxiliary_loss_mlp": 0.01246729, "balance_loss_clip": 1.1311909, "balance_loss_mlp": 1.45410037, "epoch": 0.0012024650533593868, "flos": 17895365107200.0, "grad_norm": 3.137533965585164, "language_loss": 1.03398967, "learning_rate": 3.999988437284535e-06, "loss": 1.06893063, "num_input_tokens_seen": 375625, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 7.9375, "step": 20, "time_per_iteration": 2.813762903213501 }, { "auxiliary_loss_clip": 0.02198346, "auxiliary_loss_mlp": 0.01282586, "balance_loss_clip": 1.15546119, "balance_loss_mlp": 1.43252063, "epoch": 0.001262588306027356, "flos": 21252043071360.0, "grad_norm": 2.5633740935575497, "language_loss": 0.94354928, "learning_rate": 3.999987116852396e-06, "loss": 0.97835863, "num_input_tokens_seen": 394350, "router_z_loss_clip": 1.2734375, "router_z_loss_mlp": 7.65625, "step": 21, "time_per_iteration": 2.935816764831543 }, { "auxiliary_loss_clip": 0.02130667, "auxiliary_loss_mlp": 0.01263227, "balance_loss_clip": 1.15012169, "balance_loss_mlp": 1.4122498, "epoch": 0.0013227115586953253, "flos": 26104077158400.0, "grad_norm": 2.5492072518428777, "language_loss": 0.96073782, "learning_rate": 3.999985725045861e-06, "loss": 0.99467671, "num_input_tokens_seen": 413255, "router_z_loss_clip": 1.1328125, "router_z_loss_mlp": 7.1875, "step": 22, "time_per_iteration": 2.8830130100250244 }, { "auxiliary_loss_clip": 0.02102982, "auxiliary_loss_mlp": 0.01239944, "balance_loss_clip": 1.14052367, "balance_loss_mlp": 1.40957212, "epoch": 0.0013828348113632948, "flos": 23951376190080.0, "grad_norm": 2.070040743505321, "language_loss": 0.83626318, "learning_rate": 3.999984261864982e-06, "loss": 0.8696925, "num_input_tokens_seen": 433065, "router_z_loss_clip": 0.99609375, "router_z_loss_mlp": 6.9375, "step": 23, "time_per_iteration": 2.853854179382324 }, { "auxiliary_loss_clip": 0.02073312, "auxiliary_loss_mlp": 0.01248126, "balance_loss_clip": 1.14789486, "balance_loss_mlp": 1.40226281, "epoch": 0.001442958064031264, "flos": 15959851724160.0, "grad_norm": 1.929412758292372, "language_loss": 1.01630628, "learning_rate": 3.999982727309807e-06, "loss": 1.04952073, "num_input_tokens_seen": 451175, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 6.6875, "step": 24, "time_per_iteration": 2.7231638431549072 }, { "auxiliary_loss_clip": 0.02002566, "auxiliary_loss_mlp": 0.01273132, "balance_loss_clip": 1.17738354, "balance_loss_mlp": 1.38892961, "epoch": 0.0015030813166992333, "flos": 18108350801280.0, "grad_norm": 2.3836391584346157, "language_loss": 0.93132114, "learning_rate": 3.999981121380394e-06, "loss": 0.96407813, "num_input_tokens_seen": 468775, "router_z_loss_clip": 0.9609375, "router_z_loss_mlp": 6.125, "step": 25, "time_per_iteration": 2.70859694480896 }, { "auxiliary_loss_clip": 0.0196998, "auxiliary_loss_mlp": 0.01237387, "balance_loss_clip": 1.14511895, "balance_loss_mlp": 1.37610817, "epoch": 0.0015632045693672028, "flos": 22234558763520.0, "grad_norm": 2.314741476608532, "language_loss": 1.00620687, "learning_rate": 3.9999794440768e-06, "loss": 1.03828061, "num_input_tokens_seen": 488530, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 5.9375, "step": 26, "time_per_iteration": 2.7603824138641357 }, { "auxiliary_loss_clip": 0.01959178, "auxiliary_loss_mlp": 0.01267969, "balance_loss_clip": 1.1797539, "balance_loss_mlp": 1.37561631, "epoch": 0.001623327822035172, "flos": 23991955580160.0, "grad_norm": 1.917755066435002, "language_loss": 0.89742452, "learning_rate": 3.999977695399084e-06, "loss": 0.92969596, "num_input_tokens_seen": 510495, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 5.8125, "step": 27, "time_per_iteration": 2.8110058307647705 }, { "auxiliary_loss_clip": 0.01936165, "auxiliary_loss_mlp": 0.01251329, "balance_loss_clip": 1.17255592, "balance_loss_mlp": 1.37417376, "epoch": 0.0016834510747031415, "flos": 19677647070720.0, "grad_norm": 2.1707276363139227, "language_loss": 0.99471062, "learning_rate": 3.999975875347308e-06, "loss": 1.02658558, "num_input_tokens_seen": 528605, "router_z_loss_clip": 0.7890625, "router_z_loss_mlp": 5.625, "step": 28, "time_per_iteration": 2.75118088722229 }, { "auxiliary_loss_clip": 0.01924591, "auxiliary_loss_mlp": 0.01223626, "balance_loss_clip": 1.1443758, "balance_loss_mlp": 1.36104393, "epoch": 0.0017435743273711108, "flos": 20923819568640.0, "grad_norm": 1.902902422493553, "language_loss": 0.9687205, "learning_rate": 3.999973983921538e-06, "loss": 1.00020266, "num_input_tokens_seen": 548515, "router_z_loss_clip": 0.7890625, "router_z_loss_mlp": 5.625, "step": 29, "time_per_iteration": 2.877054214477539 }, { "auxiliary_loss_clip": 0.0191659, "auxiliary_loss_mlp": 0.01225737, "balance_loss_clip": 1.14763141, "balance_loss_mlp": 1.35567272, "epoch": 0.00180369758003908, "flos": 19528976678400.0, "grad_norm": 3.1396378111140173, "language_loss": 1.1127882, "learning_rate": 3.9999720211218405e-06, "loss": 1.14421141, "num_input_tokens_seen": 564025, "router_z_loss_clip": 0.78125, "router_z_loss_mlp": 5.625, "step": 30, "time_per_iteration": 2.8587076663970947 }, { "auxiliary_loss_clip": 0.0187344, "auxiliary_loss_mlp": 0.01229359, "balance_loss_clip": 1.15125299, "balance_loss_mlp": 1.34400606, "epoch": 0.0018638208327070496, "flos": 27453169100160.0, "grad_norm": 2.7780918137809363, "language_loss": 0.96741927, "learning_rate": 3.999969986948286e-06, "loss": 0.99844718, "num_input_tokens_seen": 583345, "router_z_loss_clip": 0.78125, "router_z_loss_mlp": 5.3125, "step": 31, "time_per_iteration": 2.757504940032959 }, { "auxiliary_loss_clip": 0.01847128, "auxiliary_loss_mlp": 0.01209681, "balance_loss_clip": 1.13491309, "balance_loss_mlp": 1.33621264, "epoch": 0.0019239440853750188, "flos": 13589460380160.0, "grad_norm": 2.11711976047465, "language_loss": 0.88353336, "learning_rate": 3.999967881400949e-06, "loss": 0.91410142, "num_input_tokens_seen": 600010, "router_z_loss_clip": 0.74609375, "router_z_loss_mlp": 5.125, "step": 32, "time_per_iteration": 2.786022186279297 }, { "auxiliary_loss_clip": 0.01855958, "auxiliary_loss_mlp": 0.01184123, "balance_loss_clip": 1.11154842, "balance_loss_mlp": 1.33720863, "epoch": 0.001984067338042988, "flos": 11253866336640.0, "grad_norm": 2.672325289634158, "language_loss": 0.87235242, "learning_rate": 3.999965704479901e-06, "loss": 0.90275323, "num_input_tokens_seen": 616295, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 5.1875, "step": 33, "time_per_iteration": 2.767817974090576 }, { "auxiliary_loss_clip": 0.01819576, "auxiliary_loss_mlp": 0.01181169, "balance_loss_clip": 1.10940456, "balance_loss_mlp": 1.32752967, "epoch": 0.0020441905907109576, "flos": 22386245898240.0, "grad_norm": 2.5852182879899552, "language_loss": 0.86820865, "learning_rate": 3.999963456185222e-06, "loss": 0.89821601, "num_input_tokens_seen": 637640, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 4.9375, "step": 34, "time_per_iteration": 2.9295778274536133 }, { "auxiliary_loss_clip": 0.01799304, "auxiliary_loss_mlp": 0.01155045, "balance_loss_clip": 1.08170736, "balance_loss_mlp": 1.30863667, "epoch": 0.0021043138433789266, "flos": 49778580337920.0, "grad_norm": 5.293659337790577, "language_loss": 0.70638025, "learning_rate": 3.999961136516991e-06, "loss": 0.73592371, "num_input_tokens_seen": 659710, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 4.90625, "step": 35, "time_per_iteration": 2.967212677001953 }, { "auxiliary_loss_clip": 0.01794737, "auxiliary_loss_mlp": 0.01158157, "balance_loss_clip": 1.08796632, "balance_loss_mlp": 1.30851829, "epoch": 0.002164437096046896, "flos": 20557961591040.0, "grad_norm": 2.088511663374645, "language_loss": 0.84600544, "learning_rate": 3.999958745475293e-06, "loss": 0.87553442, "num_input_tokens_seen": 679670, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 4.875, "step": 36, "time_per_iteration": 2.7419686317443848 }, { "auxiliary_loss_clip": 0.01787935, "auxiliary_loss_mlp": 0.01199155, "balance_loss_clip": 1.12786746, "balance_loss_mlp": 1.30573416, "epoch": 0.0022245603487148656, "flos": 26542295084160.0, "grad_norm": 2.0496488296476394, "language_loss": 0.87570006, "learning_rate": 3.999956283060211e-06, "loss": 0.90557098, "num_input_tokens_seen": 700170, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 4.8125, "step": 37, "time_per_iteration": 2.7328574657440186 }, { "auxiliary_loss_clip": 0.01772152, "auxiliary_loss_mlp": 0.01175964, "balance_loss_clip": 1.10477173, "balance_loss_mlp": 1.30053353, "epoch": 0.0022846836013828346, "flos": 20338188226560.0, "grad_norm": 2.133477039227205, "language_loss": 0.99626315, "learning_rate": 3.9999537492718345e-06, "loss": 1.02574432, "num_input_tokens_seen": 718545, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 4.71875, "step": 38, "time_per_iteration": 2.7216956615448 }, { "auxiliary_loss_clip": 0.01769446, "auxiliary_loss_mlp": 0.01144987, "balance_loss_clip": 1.07184029, "balance_loss_mlp": 1.29865599, "epoch": 0.002344806854050804, "flos": 26247575992320.0, "grad_norm": 2.1380102411451434, "language_loss": 0.81381756, "learning_rate": 3.999951144110252e-06, "loss": 0.84296191, "num_input_tokens_seen": 739865, "router_z_loss_clip": 0.73046875, "router_z_loss_mlp": 4.71875, "step": 39, "time_per_iteration": 2.8758668899536133 }, { "auxiliary_loss_clip": 0.01756833, "auxiliary_loss_mlp": 0.01158922, "balance_loss_clip": 1.08248484, "balance_loss_mlp": 1.28591752, "epoch": 0.0024049301067187736, "flos": 11801539981440.0, "grad_norm": 4.108855608251118, "language_loss": 0.83653128, "learning_rate": 3.999948467575558e-06, "loss": 0.8656888, "num_input_tokens_seen": 755770, "router_z_loss_clip": 0.765625, "router_z_loss_mlp": 4.71875, "step": 40, "time_per_iteration": 2.771930694580078 }, { "auxiliary_loss_clip": 0.01744523, "auxiliary_loss_mlp": 0.01167345, "balance_loss_clip": 1.094437, "balance_loss_mlp": 1.28363156, "epoch": 0.0024650533593867426, "flos": 20631506688000.0, "grad_norm": 2.375880826231286, "language_loss": 0.8880477, "learning_rate": 3.999945719667849e-06, "loss": 0.91716635, "num_input_tokens_seen": 773440, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 4.625, "step": 41, "time_per_iteration": 2.8389899730682373 }, { "auxiliary_loss_clip": 0.01727268, "auxiliary_loss_mlp": 0.01146119, "balance_loss_clip": 1.07697737, "balance_loss_mlp": 1.27592349, "epoch": 0.002525176612054712, "flos": 18406122549120.0, "grad_norm": 2.2399237677022206, "language_loss": 0.92629802, "learning_rate": 3.999942900387221e-06, "loss": 0.95503187, "num_input_tokens_seen": 790455, "router_z_loss_clip": 0.69140625, "router_z_loss_mlp": 4.5, "step": 42, "time_per_iteration": 2.747921943664551 }, { "auxiliary_loss_clip": 0.01716762, "auxiliary_loss_mlp": 0.0116778, "balance_loss_clip": 1.09420443, "balance_loss_mlp": 1.27550626, "epoch": 0.0025852998647226816, "flos": 28184023128960.0, "grad_norm": 2.4392401705887576, "language_loss": 0.93347049, "learning_rate": 3.999940009733775e-06, "loss": 0.96231592, "num_input_tokens_seen": 810645, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 4.40625, "step": 43, "time_per_iteration": 2.8376433849334717 }, { "auxiliary_loss_clip": 0.01721966, "auxiliary_loss_mlp": 0.01162527, "balance_loss_clip": 1.08871293, "balance_loss_mlp": 1.27190924, "epoch": 0.0026454231173906506, "flos": 14283110897280.0, "grad_norm": 2.1210562277100737, "language_loss": 0.89014769, "learning_rate": 3.9999370477076146e-06, "loss": 0.91899264, "num_input_tokens_seen": 827470, "router_z_loss_clip": 0.73828125, "router_z_loss_mlp": 4.5, "step": 44, "time_per_iteration": 2.8068647384643555 }, { "auxiliary_loss_clip": 0.01704942, "auxiliary_loss_mlp": 0.01146433, "balance_loss_clip": 1.0780549, "balance_loss_mlp": 1.2681222, "epoch": 0.00270554637005862, "flos": 22419211605120.0, "grad_norm": 2.400916105632771, "language_loss": 0.95102835, "learning_rate": 3.9999340143088455e-06, "loss": 0.97954208, "num_input_tokens_seen": 847285, "router_z_loss_clip": 0.68359375, "router_z_loss_mlp": 4.375, "step": 45, "time_per_iteration": 2.8055288791656494 }, { "auxiliary_loss_clip": 0.01706244, "auxiliary_loss_mlp": 0.01145199, "balance_loss_clip": 1.07829869, "balance_loss_mlp": 1.26560378, "epoch": 0.0027656696227265896, "flos": 23985778440960.0, "grad_norm": 1.9155035036297179, "language_loss": 0.99967158, "learning_rate": 3.999930909537576e-06, "loss": 1.02818608, "num_input_tokens_seen": 867545, "router_z_loss_clip": 0.66796875, "router_z_loss_mlp": 4.40625, "step": 46, "time_per_iteration": 2.893806219100952 }, { "auxiliary_loss_clip": 0.01683809, "auxiliary_loss_mlp": 0.01159188, "balance_loss_clip": 1.08599329, "balance_loss_mlp": 1.25747252, "epoch": 0.0028257928753945586, "flos": 37669503087360.0, "grad_norm": 2.779721274201443, "language_loss": 0.83829749, "learning_rate": 3.999927733393916e-06, "loss": 0.86672747, "num_input_tokens_seen": 889915, "router_z_loss_clip": 0.73046875, "router_z_loss_mlp": 4.25, "step": 47, "time_per_iteration": 2.882610559463501 }, { "auxiliary_loss_clip": 0.01669397, "auxiliary_loss_mlp": 0.01150768, "balance_loss_clip": 1.07647693, "balance_loss_mlp": 1.25379467, "epoch": 0.002885916128062528, "flos": 22454547609600.0, "grad_norm": 1.9946632290980892, "language_loss": 0.85004634, "learning_rate": 3.99992448587798e-06, "loss": 0.87824798, "num_input_tokens_seen": 908975, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 4.15625, "step": 48, "time_per_iteration": 2.8406484127044678 }, { "auxiliary_loss_clip": 0.01669175, "auxiliary_loss_mlp": 0.01153542, "balance_loss_clip": 1.07886958, "balance_loss_mlp": 1.2494328, "epoch": 0.0029460393807304976, "flos": 27012796358400.0, "grad_norm": 2.024580682805021, "language_loss": 0.86492181, "learning_rate": 3.999921166989884e-06, "loss": 0.89314902, "num_input_tokens_seen": 929810, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 4.1875, "step": 49, "time_per_iteration": 2.7821803092956543 }, { "auxiliary_loss_clip": 0.01657259, "auxiliary_loss_mlp": 0.01177109, "balance_loss_clip": 1.10691833, "balance_loss_mlp": 1.23793948, "epoch": 0.0030061626333984666, "flos": 15851832549120.0, "grad_norm": 2.2174110192789143, "language_loss": 0.88151956, "learning_rate": 3.999917776729746e-06, "loss": 0.90986329, "num_input_tokens_seen": 948650, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 4.1875, "step": 50, "time_per_iteration": 2.7734792232513428 }, { "auxiliary_loss_clip": 0.01661211, "auxiliary_loss_mlp": 0.0113978, "balance_loss_clip": 1.07063818, "balance_loss_mlp": 1.24090695, "epoch": 0.003066285886066436, "flos": 31827052316160.0, "grad_norm": 3.378869147553855, "language_loss": 0.83965456, "learning_rate": 3.999914315097687e-06, "loss": 0.8676644, "num_input_tokens_seen": 966455, "router_z_loss_clip": 0.69140625, "router_z_loss_mlp": 4.1875, "step": 51, "time_per_iteration": 2.9274868965148926 }, { "auxiliary_loss_clip": 0.01636303, "auxiliary_loss_mlp": 0.01161787, "balance_loss_clip": 1.08849752, "balance_loss_mlp": 1.23596096, "epoch": 0.0031264091387344056, "flos": 41427482774400.0, "grad_norm": 1.9637935389376489, "language_loss": 0.91764569, "learning_rate": 3.999910782093829e-06, "loss": 0.9456265, "num_input_tokens_seen": 988110, "router_z_loss_clip": 0.73046875, "router_z_loss_mlp": 4.0, "step": 52, "time_per_iteration": 2.9212095737457275 }, { "auxiliary_loss_clip": 0.01633926, "auxiliary_loss_mlp": 0.01169201, "balance_loss_clip": 1.09643579, "balance_loss_mlp": 1.2293427, "epoch": 0.0031865323914023747, "flos": 23440941970560.0, "grad_norm": 2.060808900019492, "language_loss": 0.8882125, "learning_rate": 3.999907177718301e-06, "loss": 0.91624379, "num_input_tokens_seen": 1008550, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 4.03125, "step": 53, "time_per_iteration": 2.8374691009521484 }, { "auxiliary_loss_clip": 0.01629154, "auxiliary_loss_mlp": 0.01174264, "balance_loss_clip": 1.09754097, "balance_loss_mlp": 1.23428178, "epoch": 0.003246655644070344, "flos": 14429195510400.0, "grad_norm": 2.814742509386654, "language_loss": 0.79488289, "learning_rate": 3.99990350197123e-06, "loss": 0.8229171, "num_input_tokens_seen": 1026840, "router_z_loss_clip": 0.765625, "router_z_loss_mlp": 3.953125, "step": 54, "time_per_iteration": 2.689469575881958 }, { "auxiliary_loss_clip": 0.01623372, "auxiliary_loss_mlp": 0.01167235, "balance_loss_clip": 1.09222853, "balance_loss_mlp": 1.21989584, "epoch": 0.0033067788967383136, "flos": 35918247496320.0, "grad_norm": 2.385537249768405, "language_loss": 0.77464372, "learning_rate": 3.999899754852747e-06, "loss": 0.80254978, "num_input_tokens_seen": 1048875, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 4.03125, "step": 55, "time_per_iteration": 3.025588035583496 }, { "auxiliary_loss_clip": 0.01621212, "auxiliary_loss_mlp": 0.01152561, "balance_loss_clip": 1.08051062, "balance_loss_mlp": 1.22244549, "epoch": 0.003366902149406283, "flos": 22958732862720.0, "grad_norm": 2.7240470138330366, "language_loss": 0.83559608, "learning_rate": 3.999895936362987e-06, "loss": 0.86333382, "num_input_tokens_seen": 1066435, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 4.0, "step": 56, "time_per_iteration": 2.7477524280548096 }, { "auxiliary_loss_clip": 0.01617782, "auxiliary_loss_mlp": 0.01167574, "balance_loss_clip": 1.09623945, "balance_loss_mlp": 1.21659446, "epoch": 0.003427025402074252, "flos": 26582838560640.0, "grad_norm": 1.8713084992503017, "language_loss": 0.90531063, "learning_rate": 3.9998920465020845e-06, "loss": 0.93316418, "num_input_tokens_seen": 1090330, "router_z_loss_clip": 0.71484375, "router_z_loss_mlp": 4.03125, "step": 57, "time_per_iteration": 2.8642702102661133 }, { "auxiliary_loss_clip": 0.01604595, "auxiliary_loss_mlp": 0.01161716, "balance_loss_clip": 1.08923638, "balance_loss_mlp": 1.22386372, "epoch": 0.0034871486547422216, "flos": 23951196622080.0, "grad_norm": 1.9623684666770438, "language_loss": 0.96994984, "learning_rate": 3.999888085270179e-06, "loss": 0.99761295, "num_input_tokens_seen": 1109840, "router_z_loss_clip": 0.72265625, "router_z_loss_mlp": 3.8125, "step": 58, "time_per_iteration": 2.7648754119873047 }, { "auxiliary_loss_clip": 0.01592729, "auxiliary_loss_mlp": 0.01150305, "balance_loss_clip": 1.08164084, "balance_loss_mlp": 1.21068025, "epoch": 0.003547271907410191, "flos": 21214983214080.0, "grad_norm": 2.232862615640545, "language_loss": 0.85619295, "learning_rate": 3.9998840526674135e-06, "loss": 0.88362324, "num_input_tokens_seen": 1128415, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 3.828125, "step": 59, "time_per_iteration": 7.220743417739868 }, { "auxiliary_loss_clip": 0.01605253, "auxiliary_loss_mlp": 0.01139341, "balance_loss_clip": 1.06905556, "balance_loss_mlp": 1.218346, "epoch": 0.00360739516007816, "flos": 16504903676160.0, "grad_norm": 1.9272503289600638, "language_loss": 0.9062767, "learning_rate": 3.999879948693929e-06, "loss": 0.93372262, "num_input_tokens_seen": 1146515, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 3.875, "step": 60, "time_per_iteration": 2.7951700687408447 }, { "auxiliary_loss_clip": 0.01589134, "auxiliary_loss_mlp": 0.01140661, "balance_loss_clip": 1.07585871, "balance_loss_mlp": 1.20530105, "epoch": 0.0036675184127461296, "flos": 19464805031040.0, "grad_norm": 2.3151323009841662, "language_loss": 0.8692798, "learning_rate": 3.999875773349874e-06, "loss": 0.89657784, "num_input_tokens_seen": 1166330, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 3.84375, "step": 61, "time_per_iteration": 2.744251012802124 }, { "auxiliary_loss_clip": 0.01580394, "auxiliary_loss_mlp": 0.01138713, "balance_loss_clip": 1.07338679, "balance_loss_mlp": 1.20532632, "epoch": 0.003727641665414099, "flos": 20957323979520.0, "grad_norm": 1.8590793823566587, "language_loss": 0.86193752, "learning_rate": 3.999871526635397e-06, "loss": 0.88912863, "num_input_tokens_seen": 1186010, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 3.75, "step": 62, "time_per_iteration": 2.7456886768341064 }, { "auxiliary_loss_clip": 0.0157243, "auxiliary_loss_mlp": 0.01142791, "balance_loss_clip": 1.07560515, "balance_loss_mlp": 1.20217133, "epoch": 0.003787764918082068, "flos": 18406050721920.0, "grad_norm": 1.8768039581076839, "language_loss": 0.94133669, "learning_rate": 3.999867208550649e-06, "loss": 0.96848893, "num_input_tokens_seen": 1204985, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 3.703125, "step": 63, "time_per_iteration": 2.7763946056365967 }, { "auxiliary_loss_clip": 0.01578084, "auxiliary_loss_mlp": 0.01161262, "balance_loss_clip": 1.09507728, "balance_loss_mlp": 1.20622158, "epoch": 0.0038478881707500376, "flos": 12459243962880.0, "grad_norm": 5.771264751346797, "language_loss": 0.95654064, "learning_rate": 3.999862819095785e-06, "loss": 0.98393416, "num_input_tokens_seen": 1223545, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 3.71875, "step": 64, "time_per_iteration": 2.768186330795288 }, { "auxiliary_loss_clip": 0.01584773, "auxiliary_loss_mlp": 0.01138134, "balance_loss_clip": 1.07137632, "balance_loss_mlp": 1.20585155, "epoch": 0.003908011423418007, "flos": 13553334276480.0, "grad_norm": 2.3790265391846, "language_loss": 0.82953966, "learning_rate": 3.999858358270962e-06, "loss": 0.85676867, "num_input_tokens_seen": 1241175, "router_z_loss_clip": 0.66796875, "router_z_loss_mlp": 3.796875, "step": 65, "time_per_iteration": 2.6515231132507324 }, { "auxiliary_loss_clip": 0.01570915, "auxiliary_loss_mlp": 0.01143029, "balance_loss_clip": 1.07722533, "balance_loss_mlp": 1.1964587, "epoch": 0.003968134676085976, "flos": 18333475292160.0, "grad_norm": 2.0406519847587923, "language_loss": 0.83225119, "learning_rate": 3.999853826076338e-06, "loss": 0.85939062, "num_input_tokens_seen": 1259315, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 3.75, "step": 66, "time_per_iteration": 2.7542858123779297 }, { "auxiliary_loss_clip": 0.01569493, "auxiliary_loss_mlp": 0.01142574, "balance_loss_clip": 1.07080972, "balance_loss_mlp": 1.19310427, "epoch": 0.004028257928753946, "flos": 20485242506880.0, "grad_norm": 2.4658524035468385, "language_loss": 0.94164139, "learning_rate": 3.999849222512075e-06, "loss": 0.96876198, "num_input_tokens_seen": 1277055, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 3.765625, "step": 67, "time_per_iteration": 2.9491140842437744 }, { "auxiliary_loss_clip": 0.01554922, "auxiliary_loss_mlp": 0.01145984, "balance_loss_clip": 1.07555532, "balance_loss_mlp": 1.19229269, "epoch": 0.004088381181421915, "flos": 18843837684480.0, "grad_norm": 2.085019086405183, "language_loss": 0.9198578, "learning_rate": 3.9998445475783365e-06, "loss": 0.94686687, "num_input_tokens_seen": 1294355, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 3.625, "step": 68, "time_per_iteration": 2.719846725463867 }, { "auxiliary_loss_clip": 0.01545185, "auxiliary_loss_mlp": 0.01145293, "balance_loss_clip": 1.07844019, "balance_loss_mlp": 1.18825841, "epoch": 0.004148504434089885, "flos": 19427817000960.0, "grad_norm": 2.947790174992327, "language_loss": 0.94288182, "learning_rate": 3.999839801275292e-06, "loss": 0.96978652, "num_input_tokens_seen": 1313525, "router_z_loss_clip": 0.66796875, "router_z_loss_mlp": 3.5625, "step": 69, "time_per_iteration": 2.7641184329986572 }, { "auxiliary_loss_clip": 0.0154635, "auxiliary_loss_mlp": 0.01148579, "balance_loss_clip": 1.08568442, "balance_loss_mlp": 1.19253528, "epoch": 0.004208627686757853, "flos": 20811023884800.0, "grad_norm": 2.4623792423478266, "language_loss": 0.96742696, "learning_rate": 3.999834983603108e-06, "loss": 0.99437618, "num_input_tokens_seen": 1330505, "router_z_loss_clip": 0.62890625, "router_z_loss_mlp": 3.53125, "step": 70, "time_per_iteration": 2.784700393676758 }, { "auxiliary_loss_clip": 0.01552881, "auxiliary_loss_mlp": 0.0113086, "balance_loss_clip": 1.06453192, "balance_loss_mlp": 1.18375599, "epoch": 0.004268750939425823, "flos": 19098623831040.0, "grad_norm": 2.5969624222387417, "language_loss": 0.91885793, "learning_rate": 3.9998300945619576e-06, "loss": 0.94569534, "num_input_tokens_seen": 1349615, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 3.6875, "step": 71, "time_per_iteration": 2.898578643798828 }, { "auxiliary_loss_clip": 0.01846831, "auxiliary_loss_mlp": 0.01310361, "balance_loss_clip": 1.27450264, "balance_loss_mlp": 1.52646041, "epoch": 0.004328874192093792, "flos": 52439635514880.0, "grad_norm": 2.260861741495242, "language_loss": 0.65806115, "learning_rate": 3.999825134152016e-06, "loss": 0.68963313, "num_input_tokens_seen": 1410275, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 3.203125, "step": 72, "time_per_iteration": 3.267561197280884 }, { "auxiliary_loss_clip": 0.01792233, "auxiliary_loss_mlp": 0.01237491, "balance_loss_clip": 1.20258701, "balance_loss_mlp": 1.49428391, "epoch": 0.004388997444761762, "flos": 66473239564800.0, "grad_norm": 2.081189478100792, "language_loss": 0.63708186, "learning_rate": 3.999820102373459e-06, "loss": 0.66737914, "num_input_tokens_seen": 1473020, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 2.96875, "step": 73, "time_per_iteration": 3.561401128768921 }, { "auxiliary_loss_clip": 0.01536725, "auxiliary_loss_mlp": 0.01130753, "balance_loss_clip": 1.06642735, "balance_loss_mlp": 1.18122911, "epoch": 0.004449120697429731, "flos": 18952970181120.0, "grad_norm": 2.133672971678585, "language_loss": 0.83411503, "learning_rate": 3.999814999226467e-06, "loss": 0.86078978, "num_input_tokens_seen": 1490385, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 3.5625, "step": 74, "time_per_iteration": 3.2371063232421875 }, { "auxiliary_loss_clip": 0.01543735, "auxiliary_loss_mlp": 0.01155193, "balance_loss_clip": 1.08948529, "balance_loss_mlp": 1.18491483, "epoch": 0.004509243950097701, "flos": 21105491581440.0, "grad_norm": 2.702174212695957, "language_loss": 0.94963861, "learning_rate": 3.999809824711222e-06, "loss": 0.97662789, "num_input_tokens_seen": 1509725, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 3.59375, "step": 75, "time_per_iteration": 2.795450448989868 }, { "auxiliary_loss_clip": 0.01530242, "auxiliary_loss_mlp": 0.01139879, "balance_loss_clip": 1.0774138, "balance_loss_mlp": 1.18250883, "epoch": 0.004569367202765669, "flos": 20698730991360.0, "grad_norm": 2.1834781965394225, "language_loss": 0.86145663, "learning_rate": 3.9998045788279075e-06, "loss": 0.88815784, "num_input_tokens_seen": 1527245, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 3.46875, "step": 76, "time_per_iteration": 2.805821180343628 }, { "auxiliary_loss_clip": 0.01535857, "auxiliary_loss_mlp": 0.01136494, "balance_loss_clip": 1.07121515, "balance_loss_mlp": 1.18527055, "epoch": 0.004629490455433639, "flos": 28658474899200.0, "grad_norm": 2.2656674307113476, "language_loss": 0.90470546, "learning_rate": 3.9997992615767125e-06, "loss": 0.93142903, "num_input_tokens_seen": 1548930, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 3.5, "step": 77, "time_per_iteration": 2.8375539779663086 }, { "auxiliary_loss_clip": 0.01528669, "auxiliary_loss_mlp": 0.01167098, "balance_loss_clip": 1.10119939, "balance_loss_mlp": 1.18725085, "epoch": 0.004689613708101608, "flos": 11072409805440.0, "grad_norm": 2.3188756983125054, "language_loss": 0.89905316, "learning_rate": 3.9997938729578266e-06, "loss": 0.92601085, "num_input_tokens_seen": 1565695, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 3.421875, "step": 78, "time_per_iteration": 2.720586061477661 }, { "auxiliary_loss_clip": 0.01525322, "auxiliary_loss_mlp": 0.01151388, "balance_loss_clip": 1.08696723, "balance_loss_mlp": 1.18258595, "epoch": 0.004749736960769578, "flos": 21799106184960.0, "grad_norm": 3.0030656946980168, "language_loss": 0.80505943, "learning_rate": 3.99978841297144e-06, "loss": 0.83182657, "num_input_tokens_seen": 1582625, "router_z_loss_clip": 0.64453125, "router_z_loss_mlp": 3.421875, "step": 79, "time_per_iteration": 2.715778112411499 }, { "auxiliary_loss_clip": 0.01541368, "auxiliary_loss_mlp": 0.01136898, "balance_loss_clip": 1.07138073, "balance_loss_mlp": 1.18974316, "epoch": 0.004809860213437547, "flos": 19792597570560.0, "grad_norm": 2.3040378369485848, "language_loss": 0.89932275, "learning_rate": 3.99978288161775e-06, "loss": 0.92610544, "num_input_tokens_seen": 1601725, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 3.515625, "step": 80, "time_per_iteration": 2.7648096084594727 }, { "auxiliary_loss_clip": 0.01518964, "auxiliary_loss_mlp": 0.01139298, "balance_loss_clip": 1.07416189, "balance_loss_mlp": 1.18412375, "epoch": 0.004869983466105517, "flos": 26574327037440.0, "grad_norm": 2.5499851290760067, "language_loss": 0.93061912, "learning_rate": 3.999777278896952e-06, "loss": 0.95720172, "num_input_tokens_seen": 1622420, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 3.34375, "step": 81, "time_per_iteration": 2.80580735206604 }, { "auxiliary_loss_clip": 0.01534601, "auxiliary_loss_mlp": 0.01144374, "balance_loss_clip": 1.08262372, "balance_loss_mlp": 1.18943334, "epoch": 0.004930106718773485, "flos": 12823378087680.0, "grad_norm": 3.1112870352321877, "language_loss": 0.94327658, "learning_rate": 3.999771604809249e-06, "loss": 0.97006631, "num_input_tokens_seen": 1640715, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 3.453125, "step": 82, "time_per_iteration": 2.7714664936065674 }, { "auxiliary_loss_clip": 0.01520069, "auxiliary_loss_mlp": 0.01160288, "balance_loss_clip": 1.09415102, "balance_loss_mlp": 1.17844129, "epoch": 0.004990229971441455, "flos": 25774919902080.0, "grad_norm": 2.0089960188936096, "language_loss": 0.85273731, "learning_rate": 3.999765859354839e-06, "loss": 0.87954092, "num_input_tokens_seen": 1662210, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 3.421875, "step": 83, "time_per_iteration": 2.7751569747924805 }, { "auxiliary_loss_clip": 0.01513719, "auxiliary_loss_mlp": 0.01146688, "balance_loss_clip": 1.0816474, "balance_loss_mlp": 1.17907274, "epoch": 0.005050353224109424, "flos": 17457254922240.0, "grad_norm": 2.3528454277983757, "language_loss": 0.90925443, "learning_rate": 3.999760042533931e-06, "loss": 0.93585849, "num_input_tokens_seen": 1681070, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 3.34375, "step": 84, "time_per_iteration": 2.6500656604766846 }, { "auxiliary_loss_clip": 0.01530935, "auxiliary_loss_mlp": 0.01165654, "balance_loss_clip": 1.13589978, "balance_loss_mlp": 1.30583966, "epoch": 0.005110476476777394, "flos": 69805460367360.0, "grad_norm": 1.0804258801985054, "language_loss": 0.61871135, "learning_rate": 3.999754154346731e-06, "loss": 0.64567721, "num_input_tokens_seen": 1747140, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 2.25, "step": 85, "time_per_iteration": 3.395254373550415 }, { "auxiliary_loss_clip": 0.01504274, "auxiliary_loss_mlp": 0.01117787, "balance_loss_clip": 1.05293763, "balance_loss_mlp": 1.16893888, "epoch": 0.005170599729445363, "flos": 24790105739520.0, "grad_norm": 2.3880373648465794, "language_loss": 0.89212108, "learning_rate": 3.999748194793449e-06, "loss": 0.91834164, "num_input_tokens_seen": 1767475, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 3.359375, "step": 86, "time_per_iteration": 2.7309799194335938 }, { "auxiliary_loss_clip": 0.0151016, "auxiliary_loss_mlp": 0.01151889, "balance_loss_clip": 1.0860374, "balance_loss_mlp": 1.17610049, "epoch": 0.005230722982113333, "flos": 23258048895360.0, "grad_norm": 2.6251917901567166, "language_loss": 0.80428815, "learning_rate": 3.999742163874298e-06, "loss": 0.8309086, "num_input_tokens_seen": 1784980, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 3.34375, "step": 87, "time_per_iteration": 2.7968127727508545 }, { "auxiliary_loss_clip": 0.01503241, "auxiliary_loss_mlp": 0.01141884, "balance_loss_clip": 1.07846451, "balance_loss_mlp": 1.17521214, "epoch": 0.005290846234781301, "flos": 16727909264640.0, "grad_norm": 1.9626102270379322, "language_loss": 0.94069648, "learning_rate": 3.999736061589492e-06, "loss": 0.96714777, "num_input_tokens_seen": 1803030, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 3.28125, "step": 88, "time_per_iteration": 2.662675380706787 }, { "auxiliary_loss_clip": 0.015147, "auxiliary_loss_mlp": 0.01138403, "balance_loss_clip": 1.07584178, "balance_loss_mlp": 1.17292678, "epoch": 0.005350969487449271, "flos": 20886077352960.0, "grad_norm": 1.8285908158488517, "language_loss": 0.84165311, "learning_rate": 3.999729887939251e-06, "loss": 0.86818409, "num_input_tokens_seen": 1822865, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 3.421875, "step": 89, "time_per_iteration": 2.731827974319458 }, { "auxiliary_loss_clip": 0.01504045, "auxiliary_loss_mlp": 0.01120027, "balance_loss_clip": 1.05851495, "balance_loss_mlp": 1.1660825, "epoch": 0.00541109274011724, "flos": 26209977431040.0, "grad_norm": 2.3945770559032176, "language_loss": 0.89530885, "learning_rate": 3.9997236429237945e-06, "loss": 0.92154962, "num_input_tokens_seen": 1842435, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 3.390625, "step": 90, "time_per_iteration": 2.7439517974853516 }, { "auxiliary_loss_clip": 0.01491546, "auxiliary_loss_mlp": 0.01122248, "balance_loss_clip": 1.0596869, "balance_loss_mlp": 1.16535187, "epoch": 0.00547121599278521, "flos": 21178569801600.0, "grad_norm": 4.859503687268972, "language_loss": 0.8446964, "learning_rate": 3.999717326543345e-06, "loss": 0.87083435, "num_input_tokens_seen": 1860065, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 3.265625, "step": 91, "time_per_iteration": 2.7039849758148193 }, { "auxiliary_loss_clip": 0.01490279, "auxiliary_loss_mlp": 0.01129494, "balance_loss_clip": 1.06903124, "balance_loss_mlp": 1.15876889, "epoch": 0.005531339245453179, "flos": 19718801078400.0, "grad_norm": 2.052247472391775, "language_loss": 0.8519339, "learning_rate": 3.9997109387981275e-06, "loss": 0.87813163, "num_input_tokens_seen": 1878135, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 3.3125, "step": 92, "time_per_iteration": 2.7737700939178467 }, { "auxiliary_loss_clip": 0.01487439, "auxiliary_loss_mlp": 0.01161849, "balance_loss_clip": 1.09938383, "balance_loss_mlp": 1.15416908, "epoch": 0.005591462498121149, "flos": 17636089760640.0, "grad_norm": 2.6734493874848586, "language_loss": 0.89575541, "learning_rate": 3.99970447968837e-06, "loss": 0.9222483, "num_input_tokens_seen": 1894895, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 3.34375, "step": 93, "time_per_iteration": 2.684605598449707 }, { "auxiliary_loss_clip": 0.01501879, "auxiliary_loss_mlp": 0.01135858, "balance_loss_clip": 1.07034016, "balance_loss_mlp": 1.16155016, "epoch": 0.005651585750789117, "flos": 20011221699840.0, "grad_norm": 2.0527780703219043, "language_loss": 0.85968602, "learning_rate": 3.9996979492143045e-06, "loss": 0.8860634, "num_input_tokens_seen": 1913220, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 3.40625, "step": 94, "time_per_iteration": 2.7623517513275146 }, { "auxiliary_loss_clip": 0.01462238, "auxiliary_loss_mlp": 0.01142006, "balance_loss_clip": 1.11320484, "balance_loss_mlp": 1.25547099, "epoch": 0.005711709003457087, "flos": 59812957981440.0, "grad_norm": 1.1961909857398474, "language_loss": 0.67677665, "learning_rate": 3.999691347376162e-06, "loss": 0.70281911, "num_input_tokens_seen": 1970970, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 2.0625, "step": 95, "time_per_iteration": 3.141388177871704 }, { "auxiliary_loss_clip": 0.01480088, "auxiliary_loss_mlp": 0.0114688, "balance_loss_clip": 1.08517754, "balance_loss_mlp": 1.14839411, "epoch": 0.005771832256125056, "flos": 15559591495680.0, "grad_norm": 3.2604442505222266, "language_loss": 0.88511574, "learning_rate": 3.99968467417418e-06, "loss": 0.91138548, "num_input_tokens_seen": 1988930, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 3.3125, "step": 96, "time_per_iteration": 2.7316365242004395 }, { "auxiliary_loss_clip": 0.01471272, "auxiliary_loss_mlp": 0.01134592, "balance_loss_clip": 1.07782412, "balance_loss_mlp": 1.15014362, "epoch": 0.005831955508793026, "flos": 22528380015360.0, "grad_norm": 2.688580470852486, "language_loss": 0.88103557, "learning_rate": 3.999677929608596e-06, "loss": 0.90709424, "num_input_tokens_seen": 2006285, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 3.203125, "step": 97, "time_per_iteration": 2.7813620567321777 }, { "auxiliary_loss_clip": 0.01460909, "auxiliary_loss_mlp": 0.01141158, "balance_loss_clip": 1.07878828, "balance_loss_mlp": 1.14379871, "epoch": 0.005892078761460995, "flos": 22049834094720.0, "grad_norm": 1.8166832249091076, "language_loss": 0.75106227, "learning_rate": 3.99967111367965e-06, "loss": 0.77708304, "num_input_tokens_seen": 2024905, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 3.15625, "step": 98, "time_per_iteration": 2.676407814025879 }, { "auxiliary_loss_clip": 0.01403753, "auxiliary_loss_mlp": 0.01037869, "balance_loss_clip": 1.01097572, "balance_loss_mlp": 1.21193719, "epoch": 0.005952202014128965, "flos": 68539143317760.0, "grad_norm": 0.9509949366903337, "language_loss": 0.6523295, "learning_rate": 3.999664226387586e-06, "loss": 0.67674577, "num_input_tokens_seen": 2086220, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.921875, "step": 99, "time_per_iteration": 3.3119068145751953 }, { "auxiliary_loss_clip": 0.01464211, "auxiliary_loss_mlp": 0.0114392, "balance_loss_clip": 1.07954657, "balance_loss_mlp": 1.14336777, "epoch": 0.006012325266796933, "flos": 22960887678720.0, "grad_norm": 1.9393130867961403, "language_loss": 0.89398187, "learning_rate": 3.999657267732648e-06, "loss": 0.9200632, "num_input_tokens_seen": 2103365, "router_z_loss_clip": 0.64453125, "router_z_loss_mlp": 3.21875, "step": 100, "time_per_iteration": 2.6694769859313965 }, { "auxiliary_loss_clip": 0.01470955, "auxiliary_loss_mlp": 0.01136173, "balance_loss_clip": 1.07623422, "balance_loss_mlp": 1.14497292, "epoch": 0.006072448519464903, "flos": 17347942857600.0, "grad_norm": 2.0576085647704088, "language_loss": 0.89471412, "learning_rate": 3.999650237715088e-06, "loss": 0.92078543, "num_input_tokens_seen": 2121995, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 3.25, "step": 101, "time_per_iteration": 4.827823877334595 }, { "auxiliary_loss_clip": 0.01466645, "auxiliary_loss_mlp": 0.01148184, "balance_loss_clip": 1.08381069, "balance_loss_mlp": 1.14878225, "epoch": 0.006132571772132872, "flos": 24681116897280.0, "grad_norm": 2.3063060044871055, "language_loss": 0.89110059, "learning_rate": 3.9996431363351536e-06, "loss": 0.91724885, "num_input_tokens_seen": 2141815, "router_z_loss_clip": 0.64453125, "router_z_loss_mlp": 3.1875, "step": 102, "time_per_iteration": 2.810624122619629 }, { "auxiliary_loss_clip": 0.01460785, "auxiliary_loss_mlp": 0.0113287, "balance_loss_clip": 1.07545924, "balance_loss_mlp": 1.14452529, "epoch": 0.006192695024800842, "flos": 21465675210240.0, "grad_norm": 4.103820116094435, "language_loss": 0.86304545, "learning_rate": 3.9996359635931e-06, "loss": 0.88898194, "num_input_tokens_seen": 2161125, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 3.15625, "step": 103, "time_per_iteration": 2.7339346408843994 }, { "auxiliary_loss_clip": 0.0146183, "auxiliary_loss_mlp": 0.01140685, "balance_loss_clip": 1.08127105, "balance_loss_mlp": 1.14677608, "epoch": 0.006252818277468811, "flos": 17420410546560.0, "grad_norm": 2.062436414694029, "language_loss": 0.92736185, "learning_rate": 3.999628719489181e-06, "loss": 0.95338702, "num_input_tokens_seen": 2179510, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 3.15625, "step": 104, "time_per_iteration": 2.672914981842041 }, { "auxiliary_loss_clip": 0.01455842, "auxiliary_loss_mlp": 0.01147529, "balance_loss_clip": 1.09064245, "balance_loss_mlp": 1.13872063, "epoch": 0.006312941530136781, "flos": 19099557584640.0, "grad_norm": 2.5060221267890332, "language_loss": 0.948259, "learning_rate": 3.999621404023658e-06, "loss": 0.97429276, "num_input_tokens_seen": 2197870, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 3.171875, "step": 105, "time_per_iteration": 2.610215902328491 }, { "auxiliary_loss_clip": 0.01451513, "auxiliary_loss_mlp": 0.01145782, "balance_loss_clip": 1.0816952, "balance_loss_mlp": 1.13573456, "epoch": 0.006373064782804749, "flos": 24060831909120.0, "grad_norm": 2.341992952998321, "language_loss": 0.8541944, "learning_rate": 3.9996140171967904e-06, "loss": 0.88016737, "num_input_tokens_seen": 2217495, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 3.15625, "step": 106, "time_per_iteration": 2.721602201461792 }, { "auxiliary_loss_clip": 0.01443811, "auxiliary_loss_mlp": 0.01149823, "balance_loss_clip": 1.08850217, "balance_loss_mlp": 1.12909532, "epoch": 0.006433188035472719, "flos": 18332433797760.0, "grad_norm": 2.1440567425293513, "language_loss": 0.81417787, "learning_rate": 3.9996065590088426e-06, "loss": 0.84011424, "num_input_tokens_seen": 2236520, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 3.15625, "step": 107, "time_per_iteration": 2.6346819400787354 }, { "auxiliary_loss_clip": 0.01351465, "auxiliary_loss_mlp": 0.01129675, "balance_loss_clip": 1.10592842, "balance_loss_mlp": 1.17232561, "epoch": 0.006493311288140688, "flos": 62562387594240.0, "grad_norm": 0.9604503405635797, "language_loss": 0.6458739, "learning_rate": 3.999599029460081e-06, "loss": 0.67068529, "num_input_tokens_seen": 2300140, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.796875, "step": 108, "time_per_iteration": 3.2997775077819824 }, { "auxiliary_loss_clip": 0.01441173, "auxiliary_loss_mlp": 0.01134698, "balance_loss_clip": 1.07266128, "balance_loss_mlp": 1.13397837, "epoch": 0.006553434540808658, "flos": 19500141035520.0, "grad_norm": 2.6159344910379234, "language_loss": 0.96003878, "learning_rate": 3.999591428550772e-06, "loss": 0.98579746, "num_input_tokens_seen": 2317320, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 3.078125, "step": 109, "time_per_iteration": 2.722465991973877 }, { "auxiliary_loss_clip": 0.01440823, "auxiliary_loss_mlp": 0.01133018, "balance_loss_clip": 1.07312691, "balance_loss_mlp": 1.13594222, "epoch": 0.006613557793476627, "flos": 21105132445440.0, "grad_norm": 1.7207049261250957, "language_loss": 0.83228397, "learning_rate": 3.999583756281189e-06, "loss": 0.85802239, "num_input_tokens_seen": 2337820, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 3.0625, "step": 110, "time_per_iteration": 2.647887706756592 }, { "auxiliary_loss_clip": 0.01439758, "auxiliary_loss_mlp": 0.01144515, "balance_loss_clip": 1.08419514, "balance_loss_mlp": 1.13363957, "epoch": 0.006673681046144597, "flos": 26030747543040.0, "grad_norm": 2.2451230473436037, "language_loss": 0.81863892, "learning_rate": 3.999576012651605e-06, "loss": 0.84448159, "num_input_tokens_seen": 2358560, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 3.0625, "step": 111, "time_per_iteration": 2.783092737197876 }, { "auxiliary_loss_clip": 0.0143723, "auxiliary_loss_mlp": 0.01117281, "balance_loss_clip": 1.06082344, "balance_loss_mlp": 1.13319921, "epoch": 0.006733804298812566, "flos": 23147767163520.0, "grad_norm": 2.3685004520228126, "language_loss": 0.9250201, "learning_rate": 3.999568197662297e-06, "loss": 0.95056516, "num_input_tokens_seen": 2379005, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 3.046875, "step": 112, "time_per_iteration": 2.7741987705230713 }, { "auxiliary_loss_clip": 0.01445084, "auxiliary_loss_mlp": 0.01127304, "balance_loss_clip": 1.06979775, "balance_loss_mlp": 1.13437057, "epoch": 0.006793927551480535, "flos": 11764444210560.0, "grad_norm": 2.3247480952184083, "language_loss": 0.77508956, "learning_rate": 3.999560311313543e-06, "loss": 0.80081344, "num_input_tokens_seen": 2395610, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 3.109375, "step": 113, "time_per_iteration": 2.7157163619995117 }, { "auxiliary_loss_clip": 0.01435496, "auxiliary_loss_mlp": 0.01118585, "balance_loss_clip": 1.05836034, "balance_loss_mlp": 1.13198113, "epoch": 0.006854050804148504, "flos": 19171953446400.0, "grad_norm": 4.397569820549149, "language_loss": 0.91775912, "learning_rate": 3.999552353605626e-06, "loss": 0.94329995, "num_input_tokens_seen": 2415005, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 3.03125, "step": 114, "time_per_iteration": 2.6741530895233154 }, { "auxiliary_loss_clip": 0.01442766, "auxiliary_loss_mlp": 0.01134605, "balance_loss_clip": 1.07962561, "balance_loss_mlp": 1.13212395, "epoch": 0.006914174056816474, "flos": 21981891519360.0, "grad_norm": 2.242296540408999, "language_loss": 0.93257785, "learning_rate": 3.999544324538829e-06, "loss": 0.95835149, "num_input_tokens_seen": 2433965, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 3.109375, "step": 115, "time_per_iteration": 2.747790813446045 }, { "auxiliary_loss_clip": 0.01439084, "auxiliary_loss_mlp": 0.01106534, "balance_loss_clip": 1.04602301, "balance_loss_mlp": 1.12853849, "epoch": 0.006974297309484443, "flos": 16289152634880.0, "grad_norm": 2.336764114516812, "language_loss": 0.80202162, "learning_rate": 3.999536224113438e-06, "loss": 0.82747781, "num_input_tokens_seen": 2451605, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 3.109375, "step": 116, "time_per_iteration": 2.6194088459014893 }, { "auxiliary_loss_clip": 0.01433226, "auxiliary_loss_mlp": 0.0111897, "balance_loss_clip": 1.06036687, "balance_loss_mlp": 1.13238227, "epoch": 0.007034420562152413, "flos": 26104005331200.0, "grad_norm": 2.089830076674858, "language_loss": 0.86731577, "learning_rate": 3.9995280523297416e-06, "loss": 0.89283776, "num_input_tokens_seen": 2472035, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 3.0, "step": 117, "time_per_iteration": 2.702129602432251 }, { "auxiliary_loss_clip": 0.01433556, "auxiliary_loss_mlp": 0.01127053, "balance_loss_clip": 1.06706715, "balance_loss_mlp": 1.13227582, "epoch": 0.007094543814820382, "flos": 14204609723520.0, "grad_norm": 3.7435654565568397, "language_loss": 0.82813084, "learning_rate": 3.9995198091880334e-06, "loss": 0.853737, "num_input_tokens_seen": 2489285, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 3.0, "step": 118, "time_per_iteration": 2.6606478691101074 }, { "auxiliary_loss_clip": 0.01441457, "auxiliary_loss_mlp": 0.01148332, "balance_loss_clip": 1.08906102, "balance_loss_mlp": 1.1303699, "epoch": 0.007154667067488351, "flos": 14976007228800.0, "grad_norm": 3.1190591421413183, "language_loss": 0.97957551, "learning_rate": 3.999511494688606e-06, "loss": 1.00547338, "num_input_tokens_seen": 2506460, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 3.109375, "step": 119, "time_per_iteration": 2.6702921390533447 }, { "auxiliary_loss_clip": 0.01445041, "auxiliary_loss_mlp": 0.01108622, "balance_loss_clip": 1.05125821, "balance_loss_mlp": 1.12877131, "epoch": 0.00721479032015632, "flos": 20193288762240.0, "grad_norm": 2.3735393350778002, "language_loss": 0.87781787, "learning_rate": 3.999503108831758e-06, "loss": 0.90335453, "num_input_tokens_seen": 2525565, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 3.15625, "step": 120, "time_per_iteration": 2.8434715270996094 }, { "auxiliary_loss_clip": 0.01435058, "auxiliary_loss_mlp": 0.01119713, "balance_loss_clip": 1.06058526, "balance_loss_mlp": 1.1332736, "epoch": 0.00727491357282429, "flos": 23147228459520.0, "grad_norm": 1.7885213125460429, "language_loss": 0.92224765, "learning_rate": 3.999494651617787e-06, "loss": 0.94779539, "num_input_tokens_seen": 2546605, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 3.015625, "step": 121, "time_per_iteration": 2.7096426486968994 }, { "auxiliary_loss_clip": 0.01435674, "auxiliary_loss_mlp": 0.01142295, "balance_loss_clip": 1.0843116, "balance_loss_mlp": 1.13167048, "epoch": 0.007335036825492259, "flos": 15521669712000.0, "grad_norm": 2.2989304956491994, "language_loss": 0.88753957, "learning_rate": 3.999486123046994e-06, "loss": 0.91331935, "num_input_tokens_seen": 2560730, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 3.03125, "step": 122, "time_per_iteration": 2.7145779132843018 }, { "auxiliary_loss_clip": 0.01432827, "auxiliary_loss_mlp": 0.01129328, "balance_loss_clip": 1.06786418, "balance_loss_mlp": 1.12785029, "epoch": 0.007395160078160229, "flos": 24243365848320.0, "grad_norm": 2.103705334418487, "language_loss": 0.91590428, "learning_rate": 3.999477523119686e-06, "loss": 0.94152582, "num_input_tokens_seen": 2579550, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 3.046875, "step": 123, "time_per_iteration": 2.670454502105713 }, { "auxiliary_loss_clip": 0.01433304, "auxiliary_loss_mlp": 0.01126645, "balance_loss_clip": 1.06880438, "balance_loss_mlp": 1.12406301, "epoch": 0.007455283330828198, "flos": 31759792099200.0, "grad_norm": 1.958676454096908, "language_loss": 0.70151782, "learning_rate": 3.999468851836168e-06, "loss": 0.7271173, "num_input_tokens_seen": 2600390, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 3.09375, "step": 124, "time_per_iteration": 2.842423677444458 }, { "auxiliary_loss_clip": 0.01420664, "auxiliary_loss_mlp": 0.01115767, "balance_loss_clip": 1.05759239, "balance_loss_mlp": 1.1253413, "epoch": 0.007515406583496167, "flos": 26615157822720.0, "grad_norm": 2.3520013774581354, "language_loss": 0.87156767, "learning_rate": 3.999460109196749e-06, "loss": 0.89693195, "num_input_tokens_seen": 2620770, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 2.953125, "step": 125, "time_per_iteration": 2.7109036445617676 }, { "auxiliary_loss_clip": 0.01433678, "auxiliary_loss_mlp": 0.0113141, "balance_loss_clip": 1.07306886, "balance_loss_mlp": 1.12557256, "epoch": 0.007575529836164136, "flos": 18223696350720.0, "grad_norm": 3.7439150385451683, "language_loss": 0.81098872, "learning_rate": 3.999451295201743e-06, "loss": 0.83663964, "num_input_tokens_seen": 2639900, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 3.078125, "step": 126, "time_per_iteration": 2.794557809829712 }, { "auxiliary_loss_clip": 0.01427546, "auxiliary_loss_mlp": 0.01130699, "balance_loss_clip": 1.07371724, "balance_loss_mlp": 1.12323892, "epoch": 0.007635653088832106, "flos": 21580410228480.0, "grad_norm": 2.711258271166943, "language_loss": 0.66358572, "learning_rate": 3.999442409851463e-06, "loss": 0.68916816, "num_input_tokens_seen": 2657450, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 3.046875, "step": 127, "time_per_iteration": 2.690289258956909 }, { "auxiliary_loss_clip": 0.01419939, "auxiliary_loss_mlp": 0.01121269, "balance_loss_clip": 1.06423914, "balance_loss_mlp": 1.12918711, "epoch": 0.007695776341500075, "flos": 25375054723200.0, "grad_norm": 2.00478854174961, "language_loss": 0.87019849, "learning_rate": 3.999433453146227e-06, "loss": 0.89561057, "num_input_tokens_seen": 2678150, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 2.90625, "step": 128, "time_per_iteration": 2.728484630584717 }, { "auxiliary_loss_clip": 0.01420888, "auxiliary_loss_mlp": 0.01132776, "balance_loss_clip": 1.07479262, "balance_loss_mlp": 1.12287581, "epoch": 0.007755899594168045, "flos": 22343906741760.0, "grad_norm": 1.820996488564965, "language_loss": 0.84014088, "learning_rate": 3.9994244250863535e-06, "loss": 0.86567754, "num_input_tokens_seen": 2698290, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 2.96875, "step": 129, "time_per_iteration": 2.715790033340454 }, { "auxiliary_loss_clip": 0.0141668, "auxiliary_loss_mlp": 0.01120656, "balance_loss_clip": 1.06224322, "balance_loss_mlp": 1.12250698, "epoch": 0.007816022846836013, "flos": 22638230784000.0, "grad_norm": 2.6172393089814663, "language_loss": 0.96244353, "learning_rate": 3.999415325672166e-06, "loss": 0.98781687, "num_input_tokens_seen": 2717630, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 2.9375, "step": 130, "time_per_iteration": 2.6531214714050293 }, { "auxiliary_loss_clip": 0.01409819, "auxiliary_loss_mlp": 0.01115721, "balance_loss_clip": 1.05721259, "balance_loss_mlp": 1.12012458, "epoch": 0.007876146099503984, "flos": 18182901479040.0, "grad_norm": 1.960363846159135, "language_loss": 0.80776978, "learning_rate": 3.9994061549039886e-06, "loss": 0.83302516, "num_input_tokens_seen": 2735835, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 2.90625, "step": 131, "time_per_iteration": 2.702486276626587 }, { "auxiliary_loss_clip": 0.01418215, "auxiliary_loss_mlp": 0.01112201, "balance_loss_clip": 1.05712628, "balance_loss_mlp": 1.123209, "epoch": 0.007936269352171952, "flos": 27119486730240.0, "grad_norm": 3.1924715202061553, "language_loss": 0.82006526, "learning_rate": 3.9993969127821485e-06, "loss": 0.84536946, "num_input_tokens_seen": 2756335, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 2.953125, "step": 132, "time_per_iteration": 2.671081066131592 }, { "auxiliary_loss_clip": 0.0141147, "auxiliary_loss_mlp": 0.01116596, "balance_loss_clip": 1.06018662, "balance_loss_mlp": 1.11823273, "epoch": 0.007996392604839923, "flos": 19026335710080.0, "grad_norm": 2.2742314868324285, "language_loss": 0.94053698, "learning_rate": 3.9993875993069755e-06, "loss": 0.96581769, "num_input_tokens_seen": 2775090, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 2.9375, "step": 133, "time_per_iteration": 2.725954055786133 }, { "auxiliary_loss_clip": 0.01408463, "auxiliary_loss_mlp": 0.01113863, "balance_loss_clip": 1.05988538, "balance_loss_mlp": 1.12716937, "epoch": 0.008056515857507891, "flos": 25484151306240.0, "grad_norm": 2.3180000959064038, "language_loss": 0.7220782, "learning_rate": 3.9993782144788025e-06, "loss": 0.74730152, "num_input_tokens_seen": 2795320, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 2.8125, "step": 134, "time_per_iteration": 2.848083972930908 }, { "auxiliary_loss_clip": 0.01406909, "auxiliary_loss_mlp": 0.01119613, "balance_loss_clip": 1.06096244, "balance_loss_mlp": 1.11823106, "epoch": 0.00811663911017586, "flos": 20557566541440.0, "grad_norm": 2.6903763781636, "language_loss": 0.87518322, "learning_rate": 3.999368758297964e-06, "loss": 0.90044844, "num_input_tokens_seen": 2812815, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 2.890625, "step": 135, "time_per_iteration": 2.6804380416870117 }, { "auxiliary_loss_clip": 0.01408375, "auxiliary_loss_mlp": 0.01117878, "balance_loss_clip": 1.05903661, "balance_loss_mlp": 1.1201961, "epoch": 0.00817676236284383, "flos": 18799738761600.0, "grad_norm": 1.904855888328157, "language_loss": 0.87804925, "learning_rate": 3.999359230764798e-06, "loss": 0.90331185, "num_input_tokens_seen": 2830445, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 2.875, "step": 136, "time_per_iteration": 2.6115262508392334 }, { "auxiliary_loss_clip": 0.0140408, "auxiliary_loss_mlp": 0.01097986, "balance_loss_clip": 1.04407918, "balance_loss_mlp": 1.11604476, "epoch": 0.008236885615511799, "flos": 23873593288320.0, "grad_norm": 2.1832972354153046, "language_loss": 0.82507938, "learning_rate": 3.999349631879643e-06, "loss": 0.85010004, "num_input_tokens_seen": 2846965, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 2.875, "step": 137, "time_per_iteration": 2.7002720832824707 }, { "auxiliary_loss_clip": 0.01404792, "auxiliary_loss_mlp": 0.01107398, "balance_loss_clip": 1.05344415, "balance_loss_mlp": 1.11713696, "epoch": 0.00829700886817977, "flos": 24643626076800.0, "grad_norm": 1.7082236172236611, "language_loss": 0.89207155, "learning_rate": 3.9993399616428425e-06, "loss": 0.91719341, "num_input_tokens_seen": 2867520, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 2.875, "step": 138, "time_per_iteration": 2.700784921646118 }, { "auxiliary_loss_clip": 0.01395202, "auxiliary_loss_mlp": 0.01113072, "balance_loss_clip": 1.05513597, "balance_loss_mlp": 1.11645913, "epoch": 0.008357132120847738, "flos": 25262007644160.0, "grad_norm": 2.3238223135771796, "language_loss": 0.90570927, "learning_rate": 3.999330220054742e-06, "loss": 0.93079203, "num_input_tokens_seen": 2885675, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 2.78125, "step": 139, "time_per_iteration": 2.6295700073242188 }, { "auxiliary_loss_clip": 0.01408614, "auxiliary_loss_mlp": 0.01112246, "balance_loss_clip": 1.05607462, "balance_loss_mlp": 1.1231612, "epoch": 0.008417255373515706, "flos": 27344898529920.0, "grad_norm": 2.4719205579957277, "language_loss": 0.84356701, "learning_rate": 3.9993204071156894e-06, "loss": 0.86877573, "num_input_tokens_seen": 2905960, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 2.84375, "step": 140, "time_per_iteration": 2.71671199798584 }, { "auxiliary_loss_clip": 0.01398796, "auxiliary_loss_mlp": 0.01109099, "balance_loss_clip": 1.05402458, "balance_loss_mlp": 1.11855745, "epoch": 0.008477378626183677, "flos": 17639070589440.0, "grad_norm": 3.3605115650410915, "language_loss": 0.83065832, "learning_rate": 3.999310522826034e-06, "loss": 0.85573733, "num_input_tokens_seen": 2922780, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 2.796875, "step": 141, "time_per_iteration": 2.6929945945739746 }, { "auxiliary_loss_clip": 0.01400898, "auxiliary_loss_mlp": 0.01120967, "balance_loss_clip": 1.06579661, "balance_loss_mlp": 1.11573744, "epoch": 0.008537501878851645, "flos": 13881342297600.0, "grad_norm": 2.329217708081014, "language_loss": 0.80357921, "learning_rate": 3.999300567186129e-06, "loss": 0.82879782, "num_input_tokens_seen": 2938765, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 2.84375, "step": 142, "time_per_iteration": 6.141700267791748 }, { "auxiliary_loss_clip": 0.01305808, "auxiliary_loss_mlp": 0.01166152, "balance_loss_clip": 1.14574373, "balance_loss_mlp": 1.14444089, "epoch": 0.008597625131519616, "flos": 71248101281280.0, "grad_norm": 1.0329907478195892, "language_loss": 0.66792417, "learning_rate": 3.999290540196329e-06, "loss": 0.69264376, "num_input_tokens_seen": 3006665, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.609375, "step": 143, "time_per_iteration": 6.006771564483643 }, { "auxiliary_loss_clip": 0.01399864, "auxiliary_loss_mlp": 0.01112588, "balance_loss_clip": 1.05775177, "balance_loss_mlp": 1.11901617, "epoch": 0.008657748384187584, "flos": 17602836744960.0, "grad_norm": 2.8875801743558247, "language_loss": 0.83200538, "learning_rate": 3.999280441856992e-06, "loss": 0.85712987, "num_input_tokens_seen": 3024335, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 2.8125, "step": 144, "time_per_iteration": 3.102023124694824 }, { "auxiliary_loss_clip": 0.0138664, "auxiliary_loss_mlp": 0.01102593, "balance_loss_clip": 1.04618359, "balance_loss_mlp": 1.11123657, "epoch": 0.008717871636855555, "flos": 19715317459200.0, "grad_norm": 1.8158488324454802, "language_loss": 0.87333989, "learning_rate": 3.9992702721684805e-06, "loss": 0.89823228, "num_input_tokens_seen": 3043300, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 2.75, "step": 145, "time_per_iteration": 2.693039894104004 }, { "auxiliary_loss_clip": 0.01400105, "auxiliary_loss_mlp": 0.01107929, "balance_loss_clip": 1.0513289, "balance_loss_mlp": 1.1140573, "epoch": 0.008777994889523523, "flos": 24717422568960.0, "grad_norm": 1.7454261278102383, "language_loss": 0.85707629, "learning_rate": 3.999260031131154e-06, "loss": 0.88215667, "num_input_tokens_seen": 3064610, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 2.859375, "step": 146, "time_per_iteration": 2.727663993835449 }, { "auxiliary_loss_clip": 0.01286643, "auxiliary_loss_mlp": 0.0107654, "balance_loss_clip": 1.05651319, "balance_loss_mlp": 1.13553309, "epoch": 0.008838118142191492, "flos": 70132067758080.0, "grad_norm": 0.8270154012043053, "language_loss": 0.5990988, "learning_rate": 3.999249718745381e-06, "loss": 0.62273061, "num_input_tokens_seen": 3130385, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.515625, "step": 147, "time_per_iteration": 3.317737102508545 }, { "auxiliary_loss_clip": 0.01394769, "auxiliary_loss_mlp": 0.01116464, "balance_loss_clip": 1.06422663, "balance_loss_mlp": 1.11772442, "epoch": 0.008898241394859462, "flos": 20044797937920.0, "grad_norm": 1.7673761656712375, "language_loss": 0.8364346, "learning_rate": 3.999239335011527e-06, "loss": 0.86154699, "num_input_tokens_seen": 3149760, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 2.78125, "step": 148, "time_per_iteration": 2.6634914875030518 }, { "auxiliary_loss_clip": 0.01399783, "auxiliary_loss_mlp": 0.01135076, "balance_loss_clip": 1.07723618, "balance_loss_mlp": 1.1200335, "epoch": 0.008958364647527431, "flos": 10743611685120.0, "grad_norm": 2.4582092336735113, "language_loss": 0.87531507, "learning_rate": 3.999228879929965e-06, "loss": 0.90066367, "num_input_tokens_seen": 3164500, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 2.8125, "step": 149, "time_per_iteration": 2.6620078086853027 }, { "auxiliary_loss_clip": 0.01399453, "auxiliary_loss_mlp": 0.01123451, "balance_loss_clip": 1.07028413, "balance_loss_mlp": 1.11519217, "epoch": 0.009018487900195401, "flos": 29127467802240.0, "grad_norm": 1.9738435062172026, "language_loss": 0.92270708, "learning_rate": 3.999218353501066e-06, "loss": 0.94793612, "num_input_tokens_seen": 3182455, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.84375, "step": 150, "time_per_iteration": 2.710641622543335 }, { "auxiliary_loss_clip": 0.01387084, "auxiliary_loss_mlp": 0.0111407, "balance_loss_clip": 1.06254768, "balance_loss_mlp": 1.1122582, "epoch": 0.00907861115286337, "flos": 32963661354240.0, "grad_norm": 2.282702242971855, "language_loss": 0.73434126, "learning_rate": 3.999207755725208e-06, "loss": 0.7593528, "num_input_tokens_seen": 3203995, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 2.75, "step": 151, "time_per_iteration": 2.739431858062744 }, { "auxiliary_loss_clip": 0.01397666, "auxiliary_loss_mlp": 0.01126358, "balance_loss_clip": 1.07400107, "balance_loss_mlp": 1.11570406, "epoch": 0.009138734405531338, "flos": 21762441377280.0, "grad_norm": 2.1200857691628046, "language_loss": 0.87066239, "learning_rate": 3.999197086602766e-06, "loss": 0.89590263, "num_input_tokens_seen": 3222575, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 2.8125, "step": 152, "time_per_iteration": 2.639108657836914 }, { "auxiliary_loss_clip": 0.01396196, "auxiliary_loss_mlp": 0.01124821, "balance_loss_clip": 1.07158256, "balance_loss_mlp": 1.1208061, "epoch": 0.009198857658199309, "flos": 20842517134080.0, "grad_norm": 2.837424004115326, "language_loss": 0.82224429, "learning_rate": 3.9991863461341234e-06, "loss": 0.84745443, "num_input_tokens_seen": 3240180, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.75, "step": 153, "time_per_iteration": 2.6967201232910156 }, { "auxiliary_loss_clip": 0.01387953, "auxiliary_loss_mlp": 0.0113216, "balance_loss_clip": 1.07687068, "balance_loss_mlp": 1.11198306, "epoch": 0.009258980910867277, "flos": 24827381078400.0, "grad_norm": 2.11552565611495, "language_loss": 0.88587677, "learning_rate": 3.999175534319662e-06, "loss": 0.91107786, "num_input_tokens_seen": 3259800, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 2.75, "step": 154, "time_per_iteration": 2.7899792194366455 }, { "auxiliary_loss_clip": 0.01395636, "auxiliary_loss_mlp": 0.01136944, "balance_loss_clip": 1.08406281, "balance_loss_mlp": 1.11853111, "epoch": 0.009319104163535248, "flos": 28767786963840.0, "grad_norm": 2.262763349198872, "language_loss": 0.88660794, "learning_rate": 3.999164651159769e-06, "loss": 0.91193378, "num_input_tokens_seen": 3280400, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.765625, "step": 155, "time_per_iteration": 2.700101852416992 }, { "auxiliary_loss_clip": 0.01397997, "auxiliary_loss_mlp": 0.01108018, "balance_loss_clip": 1.05594742, "balance_loss_mlp": 1.11348939, "epoch": 0.009379227416203216, "flos": 16582004219520.0, "grad_norm": 2.4549846349796574, "language_loss": 0.8535198, "learning_rate": 3.999153696654832e-06, "loss": 0.87857991, "num_input_tokens_seen": 3297600, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 2.84375, "step": 156, "time_per_iteration": 2.6739721298217773 }, { "auxiliary_loss_clip": 0.01397203, "auxiliary_loss_mlp": 0.0112064, "balance_loss_clip": 1.0678544, "balance_loss_mlp": 1.12004566, "epoch": 0.009439350668871187, "flos": 18329919845760.0, "grad_norm": 2.4029888535747324, "language_loss": 0.98667347, "learning_rate": 3.9991426708052416e-06, "loss": 1.01185191, "num_input_tokens_seen": 3313635, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 2.78125, "step": 157, "time_per_iteration": 2.7071235179901123 }, { "auxiliary_loss_clip": 0.01384193, "auxiliary_loss_mlp": 0.01137399, "balance_loss_clip": 1.08404064, "balance_loss_mlp": 1.1139344, "epoch": 0.009499473921539155, "flos": 24349912565760.0, "grad_norm": 2.3161388855369798, "language_loss": 0.87232322, "learning_rate": 3.999131573611392e-06, "loss": 0.89753908, "num_input_tokens_seen": 3333735, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.703125, "step": 158, "time_per_iteration": 2.78680682182312 }, { "auxiliary_loss_clip": 0.0139251, "auxiliary_loss_mlp": 0.01120986, "balance_loss_clip": 1.06698442, "balance_loss_mlp": 1.11670804, "epoch": 0.009559597174207124, "flos": 16399326625920.0, "grad_norm": 2.1327781159243466, "language_loss": 0.85496902, "learning_rate": 3.999120405073679e-06, "loss": 0.88010395, "num_input_tokens_seen": 3348800, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 2.75, "step": 159, "time_per_iteration": 2.595515012741089 }, { "auxiliary_loss_clip": 0.01381087, "auxiliary_loss_mlp": 0.01127405, "balance_loss_clip": 1.07321191, "balance_loss_mlp": 1.11100841, "epoch": 0.009619720426875094, "flos": 22856890826880.0, "grad_norm": 2.1364698862346576, "language_loss": 0.85567844, "learning_rate": 3.9991091651925014e-06, "loss": 0.88076341, "num_input_tokens_seen": 3368595, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 2.703125, "step": 160, "time_per_iteration": 2.719897747039795 }, { "auxiliary_loss_clip": 0.01390037, "auxiliary_loss_mlp": 0.01116146, "balance_loss_clip": 1.06595898, "balance_loss_mlp": 1.11729836, "epoch": 0.009679843679543063, "flos": 19135001329920.0, "grad_norm": 2.526959979914761, "language_loss": 0.91124856, "learning_rate": 3.999097853968259e-06, "loss": 0.93631041, "num_input_tokens_seen": 3384975, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 2.734375, "step": 161, "time_per_iteration": 2.577406406402588 }, { "auxiliary_loss_clip": 0.01391581, "auxiliary_loss_mlp": 0.01097422, "balance_loss_clip": 1.04754472, "balance_loss_mlp": 1.11681485, "epoch": 0.009739966932211033, "flos": 20302995876480.0, "grad_norm": 1.8037702806972502, "language_loss": 0.90509748, "learning_rate": 3.999086471401357e-06, "loss": 0.92998755, "num_input_tokens_seen": 3404755, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 2.75, "step": 162, "time_per_iteration": 2.7069506645202637 }, { "auxiliary_loss_clip": 0.01274435, "auxiliary_loss_mlp": 0.01096781, "balance_loss_clip": 1.07751644, "balance_loss_mlp": 1.11573315, "epoch": 0.009800090184879002, "flos": 67034234177280.0, "grad_norm": 1.1795932316210986, "language_loss": 0.72119969, "learning_rate": 3.9990750174922005e-06, "loss": 0.74491185, "num_input_tokens_seen": 3467210, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.5859375, "step": 163, "time_per_iteration": 3.1577696800231934 }, { "auxiliary_loss_clip": 0.01379818, "auxiliary_loss_mlp": 0.01106391, "balance_loss_clip": 1.05498779, "balance_loss_mlp": 1.11415792, "epoch": 0.00986021343754697, "flos": 17164690646400.0, "grad_norm": 2.8279482242325185, "language_loss": 0.83509195, "learning_rate": 3.9990634922412e-06, "loss": 0.859954, "num_input_tokens_seen": 3483220, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 2.65625, "step": 164, "time_per_iteration": 2.61991024017334 }, { "auxiliary_loss_clip": 0.01371669, "auxiliary_loss_mlp": 0.01093141, "balance_loss_clip": 1.04357421, "balance_loss_mlp": 1.1068362, "epoch": 0.00992033669021494, "flos": 17749424148480.0, "grad_norm": 2.3096509092012005, "language_loss": 0.88258314, "learning_rate": 3.9990518956487655e-06, "loss": 0.90723127, "num_input_tokens_seen": 3501465, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 2.640625, "step": 165, "time_per_iteration": 2.6344151496887207 }, { "auxiliary_loss_clip": 0.0137746, "auxiliary_loss_mlp": 0.011218, "balance_loss_clip": 1.06732106, "balance_loss_mlp": 1.1083014, "epoch": 0.00998045994288291, "flos": 25297164080640.0, "grad_norm": 2.0217684517878993, "language_loss": 0.79471493, "learning_rate": 3.9990402277153105e-06, "loss": 0.81970745, "num_input_tokens_seen": 3520480, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 2.6875, "step": 166, "time_per_iteration": 2.776238203048706 }, { "auxiliary_loss_clip": 0.01383299, "auxiliary_loss_mlp": 0.01119432, "balance_loss_clip": 1.06650269, "balance_loss_mlp": 1.1133039, "epoch": 0.01004058319555088, "flos": 32298954220800.0, "grad_norm": 2.393734087698293, "language_loss": 0.90845126, "learning_rate": 3.999028488441252e-06, "loss": 0.93347859, "num_input_tokens_seen": 3539570, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.6875, "step": 167, "time_per_iteration": 2.8031821250915527 }, { "auxiliary_loss_clip": 0.01375116, "auxiliary_loss_mlp": 0.01130915, "balance_loss_clip": 1.08020306, "balance_loss_mlp": 1.11159492, "epoch": 0.010100706448218848, "flos": 11319941404800.0, "grad_norm": 2.5681052042883383, "language_loss": 0.89236975, "learning_rate": 3.999016677827009e-06, "loss": 0.9174301, "num_input_tokens_seen": 3555465, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 2.640625, "step": 168, "time_per_iteration": 2.5622477531433105 }, { "auxiliary_loss_clip": 0.01370354, "auxiliary_loss_mlp": 0.01107675, "balance_loss_clip": 1.05641508, "balance_loss_mlp": 1.10702693, "epoch": 0.010160829700886819, "flos": 29719491765120.0, "grad_norm": 1.5582480348588723, "language_loss": 0.8633033, "learning_rate": 3.999004795873003e-06, "loss": 0.88808364, "num_input_tokens_seen": 3578970, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 2.625, "step": 169, "time_per_iteration": 2.755807399749756 }, { "auxiliary_loss_clip": 0.01372742, "auxiliary_loss_mlp": 0.0111318, "balance_loss_clip": 1.06106114, "balance_loss_mlp": 1.10921848, "epoch": 0.010220952953554787, "flos": 20412343854720.0, "grad_norm": 2.3416864782099704, "language_loss": 0.8382448, "learning_rate": 3.998992842579657e-06, "loss": 0.86310405, "num_input_tokens_seen": 3597275, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 2.625, "step": 170, "time_per_iteration": 2.5785999298095703 }, { "auxiliary_loss_clip": 0.01383729, "auxiliary_loss_mlp": 0.01135213, "balance_loss_clip": 1.08326197, "balance_loss_mlp": 1.11193466, "epoch": 0.010281076206222756, "flos": 31285124847360.0, "grad_norm": 2.255386631168423, "language_loss": 0.88955176, "learning_rate": 3.9989808179474e-06, "loss": 0.91474116, "num_input_tokens_seen": 3618905, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 2.71875, "step": 171, "time_per_iteration": 2.766789674758911 }, { "auxiliary_loss_clip": 0.01373972, "auxiliary_loss_mlp": 0.01109752, "balance_loss_clip": 1.06070971, "balance_loss_mlp": 1.11149907, "epoch": 0.010341199458890726, "flos": 21982286568960.0, "grad_norm": 1.8489480813912669, "language_loss": 0.88286769, "learning_rate": 3.998968721976658e-06, "loss": 0.90770495, "num_input_tokens_seen": 3639610, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 2.625, "step": 172, "time_per_iteration": 2.608543634414673 }, { "auxiliary_loss_clip": 0.01363483, "auxiliary_loss_mlp": 0.01117224, "balance_loss_clip": 1.06760859, "balance_loss_mlp": 1.10592604, "epoch": 0.010401322711558695, "flos": 30810529422720.0, "grad_norm": 1.6719819245678589, "language_loss": 0.79986799, "learning_rate": 3.998956554667865e-06, "loss": 0.82467502, "num_input_tokens_seen": 3664030, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 2.578125, "step": 173, "time_per_iteration": 2.7515130043029785 }, { "auxiliary_loss_clip": 0.013761, "auxiliary_loss_mlp": 0.01101434, "balance_loss_clip": 1.05417967, "balance_loss_mlp": 1.10988772, "epoch": 0.010461445964226665, "flos": 24715124098560.0, "grad_norm": 1.7692424649667882, "language_loss": 0.82044339, "learning_rate": 3.998944316021455e-06, "loss": 0.84521866, "num_input_tokens_seen": 3683615, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 2.65625, "step": 174, "time_per_iteration": 2.6574337482452393 }, { "auxiliary_loss_clip": 0.01371757, "auxiliary_loss_mlp": 0.01119781, "balance_loss_clip": 1.06980872, "balance_loss_mlp": 1.10787868, "epoch": 0.010521569216894634, "flos": 27710361457920.0, "grad_norm": 18.812260624825086, "language_loss": 0.71771884, "learning_rate": 3.9989320060378634e-06, "loss": 0.74263418, "num_input_tokens_seen": 3704540, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 2.640625, "step": 175, "time_per_iteration": 2.7188217639923096 }, { "auxiliary_loss_clip": 0.01373982, "auxiliary_loss_mlp": 0.01127365, "balance_loss_clip": 1.07593846, "balance_loss_mlp": 1.10918069, "epoch": 0.010581692469562603, "flos": 12458346122880.0, "grad_norm": 2.490237014761603, "language_loss": 0.96319127, "learning_rate": 3.998919624717531e-06, "loss": 0.98820472, "num_input_tokens_seen": 3721320, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 2.640625, "step": 176, "time_per_iteration": 2.566922187805176 }, { "auxiliary_loss_clip": 0.01359823, "auxiliary_loss_mlp": 0.01126252, "balance_loss_clip": 1.07806802, "balance_loss_mlp": 1.10520124, "epoch": 0.010641815722230573, "flos": 19427601519360.0, "grad_norm": 2.2418619623423566, "language_loss": 0.75806653, "learning_rate": 3.998907172060898e-06, "loss": 0.78292727, "num_input_tokens_seen": 3739385, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 2.546875, "step": 177, "time_per_iteration": 2.631993293762207 }, { "auxiliary_loss_clip": 0.01371999, "auxiliary_loss_mlp": 0.011017, "balance_loss_clip": 1.05275273, "balance_loss_mlp": 1.10530579, "epoch": 0.010701938974898541, "flos": 18332577452160.0, "grad_norm": 3.289812131571032, "language_loss": 0.75523168, "learning_rate": 3.9988946480684115e-06, "loss": 0.77996868, "num_input_tokens_seen": 3756360, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 2.65625, "step": 178, "time_per_iteration": 2.6030099391937256 }, { "auxiliary_loss_clip": 0.01378303, "auxiliary_loss_mlp": 0.01102032, "balance_loss_clip": 1.05170226, "balance_loss_mlp": 1.11041093, "epoch": 0.010762062227566512, "flos": 19203985399680.0, "grad_norm": 2.4235668129283496, "language_loss": 0.83332664, "learning_rate": 3.998882052740516e-06, "loss": 0.85813004, "num_input_tokens_seen": 3773930, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 2.6875, "step": 179, "time_per_iteration": 2.5807971954345703 }, { "auxiliary_loss_clip": 0.01369973, "auxiliary_loss_mlp": 0.01105832, "balance_loss_clip": 1.05481005, "balance_loss_mlp": 1.10527623, "epoch": 0.01082218548023448, "flos": 31425427370880.0, "grad_norm": 2.100364084704256, "language_loss": 0.83241969, "learning_rate": 3.9988693860776616e-06, "loss": 0.85717773, "num_input_tokens_seen": 3793630, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 2.65625, "step": 180, "time_per_iteration": 2.7799878120422363 }, { "auxiliary_loss_clip": 0.01371011, "auxiliary_loss_mlp": 0.01108768, "balance_loss_clip": 1.05877137, "balance_loss_mlp": 1.10853648, "epoch": 0.01088230873290245, "flos": 25046436170880.0, "grad_norm": 2.259445016539282, "language_loss": 0.76592493, "learning_rate": 3.998856648080301e-06, "loss": 0.79072273, "num_input_tokens_seen": 3813610, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 2.625, "step": 181, "time_per_iteration": 2.7116408348083496 }, { "auxiliary_loss_clip": 0.01366059, "auxiliary_loss_mlp": 0.01115511, "balance_loss_clip": 1.06634927, "balance_loss_mlp": 1.10489225, "epoch": 0.01094243198557042, "flos": 22893411980160.0, "grad_norm": 2.95709000282301, "language_loss": 0.75793421, "learning_rate": 3.998843838748888e-06, "loss": 0.78274989, "num_input_tokens_seen": 3831390, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 2.609375, "step": 182, "time_per_iteration": 2.6523945331573486 }, { "auxiliary_loss_clip": 0.0136619, "auxiliary_loss_mlp": 0.01116345, "balance_loss_clip": 1.06556177, "balance_loss_mlp": 1.10477555, "epoch": 0.011002555238238388, "flos": 17165049782400.0, "grad_norm": 2.4475734241451943, "language_loss": 0.86826599, "learning_rate": 3.9988309580838796e-06, "loss": 0.89309138, "num_input_tokens_seen": 3849705, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 2.625, "step": 183, "time_per_iteration": 2.571237564086914 }, { "auxiliary_loss_clip": 0.0136636, "auxiliary_loss_mlp": 0.01102075, "balance_loss_clip": 1.05615592, "balance_loss_mlp": 1.10646892, "epoch": 0.011062678490906358, "flos": 22310150935680.0, "grad_norm": 1.941015990967445, "language_loss": 0.85478359, "learning_rate": 3.998818006085736e-06, "loss": 0.87946796, "num_input_tokens_seen": 3869230, "router_z_loss_clip": 0.45898438, "router_z_loss_mlp": 2.59375, "step": 184, "time_per_iteration": 4.297726631164551 }, { "auxiliary_loss_clip": 0.01364012, "auxiliary_loss_mlp": 0.01097193, "balance_loss_clip": 1.04812658, "balance_loss_mlp": 1.10819769, "epoch": 0.011122801743574327, "flos": 24388373053440.0, "grad_norm": 1.7404746703919853, "language_loss": 0.82655197, "learning_rate": 3.99880498275492e-06, "loss": 0.85116398, "num_input_tokens_seen": 3889735, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 2.5625, "step": 185, "time_per_iteration": 2.79290509223938 }, { "auxiliary_loss_clip": 0.01370455, "auxiliary_loss_mlp": 0.01100666, "balance_loss_clip": 1.0487628, "balance_loss_mlp": 1.10472703, "epoch": 0.011182924996242297, "flos": 18150258994560.0, "grad_norm": 2.271541390295322, "language_loss": 0.70406753, "learning_rate": 3.9987918880918946e-06, "loss": 0.72877872, "num_input_tokens_seen": 3908855, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 2.65625, "step": 186, "time_per_iteration": 2.6574244499206543 }, { "auxiliary_loss_clip": 0.01368179, "auxiliary_loss_mlp": 0.01100928, "balance_loss_clip": 1.05445993, "balance_loss_mlp": 1.10592735, "epoch": 0.011243048248910266, "flos": 15486800584320.0, "grad_norm": 2.3255256453738204, "language_loss": 1.01440227, "learning_rate": 3.9987787220971295e-06, "loss": 1.03909326, "num_input_tokens_seen": 3923865, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 2.625, "step": 187, "time_per_iteration": 2.5812692642211914 }, { "auxiliary_loss_clip": 0.01358215, "auxiliary_loss_mlp": 0.01103758, "balance_loss_clip": 1.05392814, "balance_loss_mlp": 1.10476327, "epoch": 0.011303171501578235, "flos": 40916868986880.0, "grad_norm": 1.741356031643303, "language_loss": 0.74055755, "learning_rate": 3.9987654847710925e-06, "loss": 0.76517725, "num_input_tokens_seen": 3946870, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 2.53125, "step": 188, "time_per_iteration": 2.7662599086761475 }, { "auxiliary_loss_clip": 0.01260304, "auxiliary_loss_mlp": 0.01036555, "balance_loss_clip": 1.01919806, "balance_loss_mlp": 1.11925244, "epoch": 0.011363294754246205, "flos": 66302697790080.0, "grad_norm": 0.7870803897235196, "language_loss": 0.5615294, "learning_rate": 3.998752176114257e-06, "loss": 0.58449799, "num_input_tokens_seen": 4010005, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.40625, "step": 189, "time_per_iteration": 3.253859519958496 }, { "auxiliary_loss_clip": 0.0135888, "auxiliary_loss_mlp": 0.01110999, "balance_loss_clip": 1.06148005, "balance_loss_mlp": 1.10301471, "epoch": 0.011423418006914174, "flos": 24900279730560.0, "grad_norm": 2.4683198047193495, "language_loss": 0.93683612, "learning_rate": 3.998738796127097e-06, "loss": 0.96153498, "num_input_tokens_seen": 4029035, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 2.5625, "step": 190, "time_per_iteration": 2.5690853595733643 }, { "auxiliary_loss_clip": 0.01356988, "auxiliary_loss_mlp": 0.01095747, "balance_loss_clip": 1.04882586, "balance_loss_mlp": 1.10243607, "epoch": 0.011483541259582144, "flos": 19791879298560.0, "grad_norm": 4.559762604482707, "language_loss": 0.84164697, "learning_rate": 3.998725344810092e-06, "loss": 0.86617434, "num_input_tokens_seen": 4046995, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 2.546875, "step": 191, "time_per_iteration": 2.616806983947754 }, { "auxiliary_loss_clip": 0.0135654, "auxiliary_loss_mlp": 0.01109232, "balance_loss_clip": 1.06307399, "balance_loss_mlp": 1.09897208, "epoch": 0.011543664512250112, "flos": 26176939896960.0, "grad_norm": 1.8150692990523398, "language_loss": 0.91409481, "learning_rate": 3.99871182216372e-06, "loss": 0.93875247, "num_input_tokens_seen": 4065865, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 2.578125, "step": 192, "time_per_iteration": 2.60302472114563 }, { "auxiliary_loss_clip": 0.01355959, "auxiliary_loss_mlp": 0.01114625, "balance_loss_clip": 1.06279325, "balance_loss_mlp": 1.10229254, "epoch": 0.011603787764918083, "flos": 23768985905280.0, "grad_norm": 2.7729427283035224, "language_loss": 0.85993862, "learning_rate": 3.998698228188465e-06, "loss": 0.88464445, "num_input_tokens_seen": 4085305, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 2.53125, "step": 193, "time_per_iteration": 2.6046299934387207 }, { "auxiliary_loss_clip": 0.01357993, "auxiliary_loss_mlp": 0.01095324, "balance_loss_clip": 1.0485698, "balance_loss_mlp": 1.09996557, "epoch": 0.011663911017586051, "flos": 25954688494080.0, "grad_norm": 2.464508436999364, "language_loss": 0.91767246, "learning_rate": 3.9986845628848115e-06, "loss": 0.94220567, "num_input_tokens_seen": 4105185, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 2.578125, "step": 194, "time_per_iteration": 2.63761043548584 }, { "auxiliary_loss_clip": 0.01363892, "auxiliary_loss_mlp": 0.01102938, "balance_loss_clip": 1.05530179, "balance_loss_mlp": 1.10539877, "epoch": 0.01172403427025402, "flos": 17895149625600.0, "grad_norm": 2.2504451249378565, "language_loss": 0.89255893, "learning_rate": 3.998670826253246e-06, "loss": 0.91722715, "num_input_tokens_seen": 4123160, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 2.578125, "step": 195, "time_per_iteration": 2.575007438659668 }, { "auxiliary_loss_clip": 0.0135793, "auxiliary_loss_mlp": 0.01107734, "balance_loss_clip": 1.05778515, "balance_loss_mlp": 1.10434282, "epoch": 0.01178415752292199, "flos": 17894539094400.0, "grad_norm": 2.970184625680167, "language_loss": 0.84670502, "learning_rate": 3.998657018294261e-06, "loss": 0.87136161, "num_input_tokens_seen": 4140425, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 2.53125, "step": 196, "time_per_iteration": 2.5740020275115967 }, { "auxiliary_loss_clip": 0.01355317, "auxiliary_loss_mlp": 0.01095486, "balance_loss_clip": 1.04634833, "balance_loss_mlp": 1.10058844, "epoch": 0.011844280775589959, "flos": 22893555634560.0, "grad_norm": 2.2701546485961566, "language_loss": 0.92401749, "learning_rate": 3.998643139008348e-06, "loss": 0.94852543, "num_input_tokens_seen": 4159555, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 2.5625, "step": 197, "time_per_iteration": 2.7370705604553223 }, { "auxiliary_loss_clip": 0.01355351, "auxiliary_loss_mlp": 0.01103731, "balance_loss_clip": 1.05626154, "balance_loss_mlp": 1.10263824, "epoch": 0.01190440402825793, "flos": 26980333441920.0, "grad_norm": 1.9147903819998393, "language_loss": 0.78117055, "learning_rate": 3.998629188396002e-06, "loss": 0.80576134, "num_input_tokens_seen": 4180480, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 2.53125, "step": 198, "time_per_iteration": 2.8101677894592285 }, { "auxiliary_loss_clip": 0.01351338, "auxiliary_loss_mlp": 0.01109881, "balance_loss_clip": 1.06329393, "balance_loss_mlp": 1.10289168, "epoch": 0.011964527280925898, "flos": 20521584092160.0, "grad_norm": 1.8287133240487217, "language_loss": 0.87577945, "learning_rate": 3.9986151664577225e-06, "loss": 0.90039164, "num_input_tokens_seen": 4198835, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 2.484375, "step": 199, "time_per_iteration": 2.62125825881958 }, { "auxiliary_loss_clip": 0.013609, "auxiliary_loss_mlp": 0.01116653, "balance_loss_clip": 1.06479716, "balance_loss_mlp": 1.10506129, "epoch": 0.012024650533593867, "flos": 27745984771200.0, "grad_norm": 3.361079243340175, "language_loss": 0.80542624, "learning_rate": 3.998601073194007e-06, "loss": 0.8302018, "num_input_tokens_seen": 4219335, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 2.5625, "step": 200, "time_per_iteration": 2.7025911808013916 }, { "auxiliary_loss_clip": 0.01359434, "auxiliary_loss_mlp": 0.01093732, "balance_loss_clip": 1.04461765, "balance_loss_mlp": 1.10191822, "epoch": 0.012084773786261837, "flos": 10452017076480.0, "grad_norm": 2.208920034688615, "language_loss": 0.8689245, "learning_rate": 3.998586908605362e-06, "loss": 0.8934561, "num_input_tokens_seen": 4236940, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 2.5625, "step": 201, "time_per_iteration": 2.5781807899475098 }, { "auxiliary_loss_clip": 0.01360857, "auxiliary_loss_mlp": 0.01099684, "balance_loss_clip": 1.0506413, "balance_loss_mlp": 1.1072228, "epoch": 0.012144897038929806, "flos": 23105751229440.0, "grad_norm": 1.8090528034923534, "language_loss": 0.83623797, "learning_rate": 3.99857267269229e-06, "loss": 0.86084342, "num_input_tokens_seen": 4256755, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 2.53125, "step": 202, "time_per_iteration": 2.7291107177734375 }, { "auxiliary_loss_clip": 0.01350009, "auxiliary_loss_mlp": 0.01102273, "balance_loss_clip": 1.05487537, "balance_loss_mlp": 1.09802556, "epoch": 0.012205020291597776, "flos": 21033203460480.0, "grad_norm": 1.756176868899251, "language_loss": 0.89093304, "learning_rate": 3.9985583654553e-06, "loss": 0.91545594, "num_input_tokens_seen": 4276505, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 2.515625, "step": 203, "time_per_iteration": 2.7477006912231445 }, { "auxiliary_loss_clip": 0.01238208, "auxiliary_loss_mlp": 0.01028022, "balance_loss_clip": 1.01066554, "balance_loss_mlp": 1.10461235, "epoch": 0.012265143544265745, "flos": 68447785075200.0, "grad_norm": 0.9804370447114354, "language_loss": 0.61100376, "learning_rate": 3.998543986894904e-06, "loss": 0.6336661, "num_input_tokens_seen": 4330965, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.3359375, "step": 204, "time_per_iteration": 3.2370388507843018 }, { "auxiliary_loss_clip": 0.01358864, "auxiliary_loss_mlp": 0.01107629, "balance_loss_clip": 1.06023169, "balance_loss_mlp": 1.10176194, "epoch": 0.012325266796933715, "flos": 17019252478080.0, "grad_norm": 2.6980325949236934, "language_loss": 0.90699828, "learning_rate": 3.9985295370116135e-06, "loss": 0.93166322, "num_input_tokens_seen": 4348200, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 2.578125, "step": 205, "time_per_iteration": 2.657144069671631 }, { "auxiliary_loss_clip": 0.0136534, "auxiliary_loss_mlp": 0.01109289, "balance_loss_clip": 1.06108069, "balance_loss_mlp": 1.10596991, "epoch": 0.012385390049601683, "flos": 20190056538240.0, "grad_norm": 1.991272317331622, "language_loss": 0.88931787, "learning_rate": 3.998515015805945e-06, "loss": 0.91406417, "num_input_tokens_seen": 4365460, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 2.59375, "step": 206, "time_per_iteration": 2.6542673110961914 }, { "auxiliary_loss_clip": 0.01354952, "auxiliary_loss_mlp": 0.01096125, "balance_loss_clip": 1.04956186, "balance_loss_mlp": 1.09964085, "epoch": 0.012445513302269652, "flos": 16253134272000.0, "grad_norm": 2.4522007422903167, "language_loss": 0.94599247, "learning_rate": 3.998500423278416e-06, "loss": 0.97050333, "num_input_tokens_seen": 4383650, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 2.5625, "step": 207, "time_per_iteration": 2.618166208267212 }, { "auxiliary_loss_clip": 0.01356855, "auxiliary_loss_mlp": 0.01105728, "balance_loss_clip": 1.05868793, "balance_loss_mlp": 1.10578156, "epoch": 0.012505636554937622, "flos": 23769380954880.0, "grad_norm": 2.022092173184776, "language_loss": 0.74996626, "learning_rate": 3.998485759429547e-06, "loss": 0.77459216, "num_input_tokens_seen": 4403765, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 2.5, "step": 208, "time_per_iteration": 2.6778981685638428 }, { "auxiliary_loss_clip": 0.01343472, "auxiliary_loss_mlp": 0.01096059, "balance_loss_clip": 1.04913843, "balance_loss_mlp": 1.09933937, "epoch": 0.012565759807605591, "flos": 30591546157440.0, "grad_norm": 2.0787104438167296, "language_loss": 0.97701281, "learning_rate": 3.998471024259863e-06, "loss": 1.0014081, "num_input_tokens_seen": 4421935, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 2.4375, "step": 209, "time_per_iteration": 2.7084176540374756 }, { "auxiliary_loss_clip": 0.01351088, "auxiliary_loss_mlp": 0.01105765, "balance_loss_clip": 1.05874884, "balance_loss_mlp": 1.10099268, "epoch": 0.012625883060273561, "flos": 40113511355520.0, "grad_norm": 2.7801746751476255, "language_loss": 0.84705329, "learning_rate": 3.998456217769888e-06, "loss": 0.87162185, "num_input_tokens_seen": 4441470, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 2.5, "step": 210, "time_per_iteration": 2.767183542251587 }, { "auxiliary_loss_clip": 0.01349706, "auxiliary_loss_mlp": 0.01110072, "balance_loss_clip": 1.06486797, "balance_loss_mlp": 1.09999716, "epoch": 0.01268600631294153, "flos": 27089178629760.0, "grad_norm": 2.0928100782793004, "language_loss": 0.95970249, "learning_rate": 3.998441339960152e-06, "loss": 0.98430026, "num_input_tokens_seen": 4459950, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 2.5, "step": 211, "time_per_iteration": 2.741319179534912 }, { "auxiliary_loss_clip": 0.01359534, "auxiliary_loss_mlp": 0.0110481, "balance_loss_clip": 1.05672121, "balance_loss_mlp": 1.105721, "epoch": 0.012746129565609499, "flos": 16982767238400.0, "grad_norm": 2.105916306465752, "language_loss": 0.9434346, "learning_rate": 3.998426390831185e-06, "loss": 0.96807808, "num_input_tokens_seen": 4478390, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 2.53125, "step": 212, "time_per_iteration": 2.6713292598724365 }, { "auxiliary_loss_clip": 0.01347659, "auxiliary_loss_mlp": 0.01104983, "balance_loss_clip": 1.05813348, "balance_loss_mlp": 1.10162258, "epoch": 0.012806252818277469, "flos": 46533476995200.0, "grad_norm": 1.7419373918139236, "language_loss": 0.75567389, "learning_rate": 3.998411370383521e-06, "loss": 0.78020036, "num_input_tokens_seen": 4501665, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 2.453125, "step": 213, "time_per_iteration": 2.8910908699035645 }, { "auxiliary_loss_clip": 0.01352463, "auxiliary_loss_mlp": 0.01097068, "balance_loss_clip": 1.05122018, "balance_loss_mlp": 1.10190821, "epoch": 0.012866376070945438, "flos": 14388616120320.0, "grad_norm": 2.2667324708791536, "language_loss": 0.86005366, "learning_rate": 3.9983962786176945e-06, "loss": 0.88454902, "num_input_tokens_seen": 4519055, "router_z_loss_clip": 0.45898438, "router_z_loss_mlp": 2.5, "step": 214, "time_per_iteration": 2.6951987743377686 }, { "auxiliary_loss_clip": 0.01349267, "auxiliary_loss_mlp": 0.01120384, "balance_loss_clip": 1.07141268, "balance_loss_mlp": 1.10584664, "epoch": 0.012926499323613408, "flos": 26140813793280.0, "grad_norm": 2.5372855307550215, "language_loss": 0.767775, "learning_rate": 3.9983811155342465e-06, "loss": 0.79247153, "num_input_tokens_seen": 4540870, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 2.4375, "step": 215, "time_per_iteration": 2.7182514667510986 }, { "auxiliary_loss_clip": 0.01355317, "auxiliary_loss_mlp": 0.01100928, "balance_loss_clip": 1.05503249, "balance_loss_mlp": 1.1035893, "epoch": 0.012986622576281377, "flos": 30117202128000.0, "grad_norm": 2.04518922180088, "language_loss": 0.88842207, "learning_rate": 3.998365881133717e-06, "loss": 0.91298449, "num_input_tokens_seen": 4560395, "router_z_loss_clip": 0.45898438, "router_z_loss_mlp": 2.515625, "step": 216, "time_per_iteration": 2.679992198944092 }, { "auxiliary_loss_clip": 0.0134888, "auxiliary_loss_mlp": 0.01094521, "balance_loss_clip": 1.04726696, "balance_loss_mlp": 1.09808528, "epoch": 0.013046745828949347, "flos": 13954025468160.0, "grad_norm": 2.8497485586245057, "language_loss": 0.92712307, "learning_rate": 3.998350575416648e-06, "loss": 0.95155716, "num_input_tokens_seen": 4575785, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 2.515625, "step": 217, "time_per_iteration": 2.688990354537964 }, { "auxiliary_loss_clip": 0.01344433, "auxiliary_loss_mlp": 0.01096962, "balance_loss_clip": 1.04920697, "balance_loss_mlp": 1.09836316, "epoch": 0.013106869081617315, "flos": 17347835116800.0, "grad_norm": 2.058442367628473, "language_loss": 0.92828798, "learning_rate": 3.9983351983835885e-06, "loss": 0.95270193, "num_input_tokens_seen": 4594985, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 2.46875, "step": 218, "time_per_iteration": 2.648418426513672 }, { "auxiliary_loss_clip": 0.01342942, "auxiliary_loss_mlp": 0.01095072, "balance_loss_clip": 1.04993927, "balance_loss_mlp": 1.09918439, "epoch": 0.013166992334285284, "flos": 25884914325120.0, "grad_norm": 1.8342505425861282, "language_loss": 0.85490549, "learning_rate": 3.998319750035087e-06, "loss": 0.87928563, "num_input_tokens_seen": 4616125, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 2.4375, "step": 219, "time_per_iteration": 2.744497776031494 }, { "auxiliary_loss_clip": 0.01347684, "auxiliary_loss_mlp": 0.01091856, "balance_loss_clip": 1.04603171, "balance_loss_mlp": 1.10072303, "epoch": 0.013227115586953254, "flos": 31175956437120.0, "grad_norm": 2.296867961550453, "language_loss": 0.87035191, "learning_rate": 3.998304230371692e-06, "loss": 0.89474726, "num_input_tokens_seen": 4637795, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 2.46875, "step": 220, "time_per_iteration": 2.7499942779541016 }, { "auxiliary_loss_clip": 0.01340398, "auxiliary_loss_mlp": 0.01098488, "balance_loss_clip": 1.05624008, "balance_loss_mlp": 1.09749794, "epoch": 0.013287238839621223, "flos": 20409470766720.0, "grad_norm": 1.8776152150829077, "language_loss": 0.85943246, "learning_rate": 3.99828863939396e-06, "loss": 0.88382125, "num_input_tokens_seen": 4656835, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 2.4375, "step": 221, "time_per_iteration": 2.5735232830047607 }, { "auxiliary_loss_clip": 0.01344801, "auxiliary_loss_mlp": 0.01104296, "balance_loss_clip": 1.05840063, "balance_loss_mlp": 1.09567738, "epoch": 0.013347362092289193, "flos": 14137134024960.0, "grad_norm": 2.1042958134360172, "language_loss": 0.91329271, "learning_rate": 3.998272977102448e-06, "loss": 0.93778366, "num_input_tokens_seen": 4673015, "router_z_loss_clip": 0.45898438, "router_z_loss_mlp": 2.484375, "step": 222, "time_per_iteration": 2.5573227405548096 }, { "auxiliary_loss_clip": 0.01342664, "auxiliary_loss_mlp": 0.01096893, "balance_loss_clip": 1.04780221, "balance_loss_mlp": 1.09853864, "epoch": 0.013407485344957162, "flos": 21797705554560.0, "grad_norm": 2.226945278962744, "language_loss": 0.94954634, "learning_rate": 3.998257243497712e-06, "loss": 0.97394186, "num_input_tokens_seen": 4692355, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 2.4375, "step": 223, "time_per_iteration": 2.5592222213745117 }, { "auxiliary_loss_clip": 0.01342869, "auxiliary_loss_mlp": 0.01096027, "balance_loss_clip": 1.0509181, "balance_loss_mlp": 1.09942162, "epoch": 0.013467608597625132, "flos": 18621622195200.0, "grad_norm": 2.341588656390952, "language_loss": 0.87233496, "learning_rate": 3.998241438580316e-06, "loss": 0.89672399, "num_input_tokens_seen": 4710080, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 2.4375, "step": 224, "time_per_iteration": 2.669471263885498 }, { "auxiliary_loss_clip": 0.0134262, "auxiliary_loss_mlp": 0.01099918, "balance_loss_clip": 1.05104184, "balance_loss_mlp": 1.09807456, "epoch": 0.013527731850293101, "flos": 18552314903040.0, "grad_norm": 2.0380329720281636, "language_loss": 0.88783944, "learning_rate": 3.998225562350823e-06, "loss": 0.91226482, "num_input_tokens_seen": 4728980, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 2.453125, "step": 225, "time_per_iteration": 5.626880168914795 }, { "auxiliary_loss_clip": 0.01340484, "auxiliary_loss_mlp": 0.01108489, "balance_loss_clip": 1.06140113, "balance_loss_mlp": 1.09829807, "epoch": 0.01358785510296107, "flos": 19165381257600.0, "grad_norm": 2.0211078614342775, "language_loss": 0.98115206, "learning_rate": 3.998209614809799e-06, "loss": 1.00564182, "num_input_tokens_seen": 4747020, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 2.421875, "step": 226, "time_per_iteration": 2.5498337745666504 }, { "auxiliary_loss_clip": 0.01344862, "auxiliary_loss_mlp": 0.01103932, "balance_loss_clip": 1.05488896, "balance_loss_mlp": 1.10060549, "epoch": 0.01364797835562904, "flos": 23329941966720.0, "grad_norm": 2.518548629204583, "language_loss": 0.90308344, "learning_rate": 3.9981935959578145e-06, "loss": 0.92757142, "num_input_tokens_seen": 4765000, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 2.453125, "step": 227, "time_per_iteration": 2.7390103340148926 }, { "auxiliary_loss_clip": 0.01229125, "auxiliary_loss_mlp": 0.01061804, "balance_loss_clip": 1.04501939, "balance_loss_mlp": 1.0906322, "epoch": 0.013708101608297009, "flos": 70993746097920.0, "grad_norm": 0.9804401327481929, "language_loss": 0.57551652, "learning_rate": 3.99817750579544e-06, "loss": 0.59842581, "num_input_tokens_seen": 4833210, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 1.3828125, "step": 228, "time_per_iteration": 3.411332607269287 }, { "auxiliary_loss_clip": 0.01336355, "auxiliary_loss_mlp": 0.01101575, "balance_loss_clip": 1.05677617, "balance_loss_mlp": 1.09722161, "epoch": 0.013768224860964979, "flos": 16325170997760.0, "grad_norm": 2.386297287636576, "language_loss": 0.86496961, "learning_rate": 3.998161344323251e-06, "loss": 0.88934898, "num_input_tokens_seen": 4850120, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 2.390625, "step": 229, "time_per_iteration": 2.649449110031128 }, { "auxiliary_loss_clip": 0.01342705, "auxiliary_loss_mlp": 0.01092498, "balance_loss_clip": 1.04593444, "balance_loss_mlp": 1.09688771, "epoch": 0.013828348113632948, "flos": 20193037367040.0, "grad_norm": 2.630252270951573, "language_loss": 0.83325756, "learning_rate": 3.998145111541823e-06, "loss": 0.85760951, "num_input_tokens_seen": 4866215, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 2.46875, "step": 230, "time_per_iteration": 2.6746835708618164 }, { "auxiliary_loss_clip": 0.01342999, "auxiliary_loss_mlp": 0.01104391, "balance_loss_clip": 1.0587095, "balance_loss_mlp": 1.09925997, "epoch": 0.013888471366300916, "flos": 20741070147840.0, "grad_norm": 5.990915197948855, "language_loss": 0.89811885, "learning_rate": 3.998128807451736e-06, "loss": 0.92259276, "num_input_tokens_seen": 4885630, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 2.4375, "step": 231, "time_per_iteration": 2.6738955974578857 }, { "auxiliary_loss_clip": 0.01346586, "auxiliary_loss_mlp": 0.01101227, "balance_loss_clip": 1.05635715, "balance_loss_mlp": 1.10238731, "epoch": 0.013948594618968886, "flos": 22090628966400.0, "grad_norm": 2.6464337079285976, "language_loss": 0.82910693, "learning_rate": 3.9981124320535715e-06, "loss": 0.85358506, "num_input_tokens_seen": 4905570, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 2.4375, "step": 232, "time_per_iteration": 2.7001991271972656 }, { "auxiliary_loss_clip": 0.01347344, "auxiliary_loss_mlp": 0.01096168, "balance_loss_clip": 1.04953337, "balance_loss_mlp": 1.09456992, "epoch": 0.014008717871636855, "flos": 19063108258560.0, "grad_norm": 2.8400235210056786, "language_loss": 0.73404449, "learning_rate": 3.998095985347915e-06, "loss": 0.7584796, "num_input_tokens_seen": 4923535, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 2.53125, "step": 233, "time_per_iteration": 2.5842394828796387 }, { "auxiliary_loss_clip": 0.01349532, "auxiliary_loss_mlp": 0.01104801, "balance_loss_clip": 1.05766559, "balance_loss_mlp": 1.10171032, "epoch": 0.014068841124304825, "flos": 14530822064640.0, "grad_norm": 2.101759204429222, "language_loss": 0.85046613, "learning_rate": 3.998079467335351e-06, "loss": 0.87500954, "num_input_tokens_seen": 4939200, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 2.46875, "step": 234, "time_per_iteration": 2.6161227226257324 }, { "auxiliary_loss_clip": 0.01343743, "auxiliary_loss_mlp": 0.01102314, "balance_loss_clip": 1.05780137, "balance_loss_mlp": 1.10160279, "epoch": 0.014128964376972794, "flos": 18077396256000.0, "grad_norm": 2.367509026755564, "language_loss": 0.88227135, "learning_rate": 3.998062878016471e-06, "loss": 0.9067319, "num_input_tokens_seen": 4956620, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 2.421875, "step": 235, "time_per_iteration": 2.574087619781494 }, { "auxiliary_loss_clip": 0.01343908, "auxiliary_loss_mlp": 0.01105478, "balance_loss_clip": 1.05929637, "balance_loss_mlp": 1.1035893, "epoch": 0.014189087629640764, "flos": 25334331678720.0, "grad_norm": 2.7014849077545215, "language_loss": 0.84932697, "learning_rate": 3.998046217391867e-06, "loss": 0.87382078, "num_input_tokens_seen": 4975650, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 2.40625, "step": 236, "time_per_iteration": 2.655557155609131 }, { "auxiliary_loss_clip": 0.01342577, "auxiliary_loss_mlp": 0.010923, "balance_loss_clip": 1.04695296, "balance_loss_mlp": 1.09761775, "epoch": 0.014249210882308733, "flos": 36139744713600.0, "grad_norm": 2.0548827206545717, "language_loss": 0.8194797, "learning_rate": 3.9980294854621325e-06, "loss": 0.8438285, "num_input_tokens_seen": 4997415, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 2.453125, "step": 237, "time_per_iteration": 2.7256317138671875 }, { "auxiliary_loss_clip": 0.0133905, "auxiliary_loss_mlp": 0.01113748, "balance_loss_clip": 1.06630254, "balance_loss_mlp": 1.09877086, "epoch": 0.014309334134976702, "flos": 12932977461120.0, "grad_norm": 2.6048215858158668, "language_loss": 0.76266474, "learning_rate": 3.998012682227866e-06, "loss": 0.7871927, "num_input_tokens_seen": 5013905, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 2.40625, "step": 238, "time_per_iteration": 2.6472091674804688 }, { "auxiliary_loss_clip": 0.01337797, "auxiliary_loss_mlp": 0.01103408, "balance_loss_clip": 1.06085014, "balance_loss_mlp": 1.10171103, "epoch": 0.014369457387644672, "flos": 20777519473920.0, "grad_norm": 1.836320223401573, "language_loss": 0.86389363, "learning_rate": 3.9979958076896655e-06, "loss": 0.88830566, "num_input_tokens_seen": 5033645, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 2.375, "step": 239, "time_per_iteration": 2.649803876876831 }, { "auxiliary_loss_clip": 0.01323922, "auxiliary_loss_mlp": 0.01090404, "balance_loss_clip": 1.04677296, "balance_loss_mlp": 1.09204066, "epoch": 0.01442958064031264, "flos": 25848536826240.0, "grad_norm": 4.014832541850425, "language_loss": 0.92399836, "learning_rate": 3.997978861848135e-06, "loss": 0.94814157, "num_input_tokens_seen": 5052875, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 2.3125, "step": 240, "time_per_iteration": 2.6678740978240967 }, { "auxiliary_loss_clip": 0.01333172, "auxiliary_loss_mlp": 0.01094524, "balance_loss_clip": 1.05234766, "balance_loss_mlp": 1.09780443, "epoch": 0.014489703892980611, "flos": 28219718269440.0, "grad_norm": 2.4329224248189645, "language_loss": 0.84676456, "learning_rate": 3.997961844703877e-06, "loss": 0.87104154, "num_input_tokens_seen": 5075005, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 2.34375, "step": 241, "time_per_iteration": 2.631643533706665 }, { "auxiliary_loss_clip": 0.01332389, "auxiliary_loss_mlp": 0.01111683, "balance_loss_clip": 1.06621635, "balance_loss_mlp": 1.10342312, "epoch": 0.01454982714564858, "flos": 22490925108480.0, "grad_norm": 1.7603881281107623, "language_loss": 0.87822193, "learning_rate": 3.997944756257501e-06, "loss": 0.90266263, "num_input_tokens_seen": 5091875, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 2.28125, "step": 242, "time_per_iteration": 2.585980176925659 }, { "auxiliary_loss_clip": 0.01331639, "auxiliary_loss_mlp": 0.01087377, "balance_loss_clip": 1.04191077, "balance_loss_mlp": 1.09549737, "epoch": 0.014609950398316548, "flos": 21653201139840.0, "grad_norm": 2.3928919893429357, "language_loss": 0.85364699, "learning_rate": 3.997927596509616e-06, "loss": 0.87783712, "num_input_tokens_seen": 5111290, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 2.34375, "step": 243, "time_per_iteration": 2.5738987922668457 }, { "auxiliary_loss_clip": 0.01337945, "auxiliary_loss_mlp": 0.01101117, "balance_loss_clip": 1.05462503, "balance_loss_mlp": 1.09921432, "epoch": 0.014670073650984519, "flos": 21869993675520.0, "grad_norm": 1.5746945863929847, "language_loss": 0.84170151, "learning_rate": 3.997910365460834e-06, "loss": 0.86609221, "num_input_tokens_seen": 5132265, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 2.390625, "step": 244, "time_per_iteration": 2.591207981109619 }, { "auxiliary_loss_clip": 0.0134417, "auxiliary_loss_mlp": 0.01112503, "balance_loss_clip": 1.06551051, "balance_loss_mlp": 1.10127628, "epoch": 0.014730196903652487, "flos": 23183713699200.0, "grad_norm": 2.3215585506954346, "language_loss": 0.78511149, "learning_rate": 3.9978930631117705e-06, "loss": 0.8096782, "num_input_tokens_seen": 5148575, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 2.4375, "step": 245, "time_per_iteration": 2.606776237487793 }, { "auxiliary_loss_clip": 0.01343793, "auxiliary_loss_mlp": 0.01099006, "balance_loss_clip": 1.04965377, "balance_loss_mlp": 1.09667051, "epoch": 0.014790320156320457, "flos": 23222605150080.0, "grad_norm": 1.9697614204465739, "language_loss": 0.83568478, "learning_rate": 3.997875689463043e-06, "loss": 0.86011279, "num_input_tokens_seen": 5170415, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 2.46875, "step": 246, "time_per_iteration": 2.5965535640716553 }, { "auxiliary_loss_clip": 0.01338278, "auxiliary_loss_mlp": 0.01091577, "balance_loss_clip": 1.04561031, "balance_loss_mlp": 1.09623802, "epoch": 0.014850443408988426, "flos": 15815490963840.0, "grad_norm": 2.064213723127464, "language_loss": 0.89281297, "learning_rate": 3.9978582445152705e-06, "loss": 0.91711152, "num_input_tokens_seen": 5188565, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 2.421875, "step": 247, "time_per_iteration": 2.5912694931030273 }, { "auxiliary_loss_clip": 0.01337456, "auxiliary_loss_mlp": 0.0109181, "balance_loss_clip": 1.04758334, "balance_loss_mlp": 1.09274912, "epoch": 0.014910566661656396, "flos": 22781657790720.0, "grad_norm": 2.101872304181628, "language_loss": 0.77352369, "learning_rate": 3.997840728269077e-06, "loss": 0.79781628, "num_input_tokens_seen": 5207810, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 2.453125, "step": 248, "time_per_iteration": 2.6411383152008057 }, { "auxiliary_loss_clip": 0.01339285, "auxiliary_loss_mlp": 0.01094648, "balance_loss_clip": 1.04987347, "balance_loss_mlp": 1.10080206, "epoch": 0.014970689914324365, "flos": 26865023806080.0, "grad_norm": 2.20501805853465, "language_loss": 0.83047169, "learning_rate": 3.997823140725088e-06, "loss": 0.85481107, "num_input_tokens_seen": 5226210, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 2.390625, "step": 249, "time_per_iteration": 2.607417583465576 }, { "auxiliary_loss_clip": 0.01333709, "auxiliary_loss_mlp": 0.0109667, "balance_loss_clip": 1.05320597, "balance_loss_mlp": 1.09516203, "epoch": 0.015030813166992334, "flos": 13985662371840.0, "grad_norm": 2.1546007809184275, "language_loss": 0.9265247, "learning_rate": 3.997805481883929e-06, "loss": 0.95082849, "num_input_tokens_seen": 5241660, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 2.390625, "step": 250, "time_per_iteration": 2.511246919631958 }, { "auxiliary_loss_clip": 0.01341131, "auxiliary_loss_mlp": 0.0111359, "balance_loss_clip": 1.06616843, "balance_loss_mlp": 1.0996387, "epoch": 0.015090936419660304, "flos": 24717817618560.0, "grad_norm": 2.4496288495395406, "language_loss": 0.96364802, "learning_rate": 3.997787751746231e-06, "loss": 0.98819524, "num_input_tokens_seen": 5261090, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 2.40625, "step": 251, "time_per_iteration": 2.6839914321899414 }, { "auxiliary_loss_clip": 0.0133574, "auxiliary_loss_mlp": 0.01096732, "balance_loss_clip": 1.05176628, "balance_loss_mlp": 1.09419215, "epoch": 0.015151059672328273, "flos": 25738793798400.0, "grad_norm": 1.831422969065553, "language_loss": 0.83595204, "learning_rate": 3.997769950312628e-06, "loss": 0.8602767, "num_input_tokens_seen": 5279175, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 2.421875, "step": 252, "time_per_iteration": 2.6178534030914307 }, { "auxiliary_loss_clip": 0.01330183, "auxiliary_loss_mlp": 0.01109322, "balance_loss_clip": 1.06473744, "balance_loss_mlp": 1.09669745, "epoch": 0.015211182924996243, "flos": 21871214737920.0, "grad_norm": 2.2118900776316033, "language_loss": 0.97728664, "learning_rate": 3.997752077583753e-06, "loss": 1.00168169, "num_input_tokens_seen": 5296975, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 2.328125, "step": 253, "time_per_iteration": 2.7065653800964355 }, { "auxiliary_loss_clip": 0.01232591, "auxiliary_loss_mlp": 0.01055834, "balance_loss_clip": 1.03981256, "balance_loss_mlp": 1.10836601, "epoch": 0.015271306177664212, "flos": 66895080888960.0, "grad_norm": 0.8424410258362646, "language_loss": 0.5558998, "learning_rate": 3.997734133560246e-06, "loss": 0.57878405, "num_input_tokens_seen": 5358375, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 1.2421875, "step": 254, "time_per_iteration": 3.1872172355651855 }, { "auxiliary_loss_clip": 0.0133414, "auxiliary_loss_mlp": 0.01108173, "balance_loss_clip": 1.06292081, "balance_loss_mlp": 1.09231114, "epoch": 0.01533142943033218, "flos": 26834069260800.0, "grad_norm": 2.531499383911013, "language_loss": 0.89958721, "learning_rate": 3.997716118242746e-06, "loss": 0.9240104, "num_input_tokens_seen": 5377255, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 2.421875, "step": 255, "time_per_iteration": 2.6951472759246826 }, { "auxiliary_loss_clip": 0.01330912, "auxiliary_loss_mlp": 0.01112528, "balance_loss_clip": 1.06799173, "balance_loss_mlp": 1.0942874, "epoch": 0.01539155268300015, "flos": 20813753318400.0, "grad_norm": 2.2827602923329198, "language_loss": 0.84561682, "learning_rate": 3.997698031631898e-06, "loss": 0.87005126, "num_input_tokens_seen": 5395320, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 2.359375, "step": 256, "time_per_iteration": 2.6353836059570312 }, { "auxiliary_loss_clip": 0.01340448, "auxiliary_loss_mlp": 0.01111915, "balance_loss_clip": 1.06640053, "balance_loss_mlp": 1.09716558, "epoch": 0.01545167593566812, "flos": 15961862885760.0, "grad_norm": 2.3504796727977224, "language_loss": 0.70944142, "learning_rate": 3.997679873728344e-06, "loss": 0.73396498, "num_input_tokens_seen": 5411970, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 2.4375, "step": 257, "time_per_iteration": 2.697669744491577 }, { "auxiliary_loss_clip": 0.01331768, "auxiliary_loss_mlp": 0.01097532, "balance_loss_clip": 1.05366278, "balance_loss_mlp": 1.0954268, "epoch": 0.01551179918833609, "flos": 22601745544320.0, "grad_norm": 1.9895129300035912, "language_loss": 0.9399358, "learning_rate": 3.9976616445327355e-06, "loss": 0.96422887, "num_input_tokens_seen": 5430245, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 2.359375, "step": 258, "time_per_iteration": 2.6537466049194336 }, { "auxiliary_loss_clip": 0.01329671, "auxiliary_loss_mlp": 0.01103053, "balance_loss_clip": 1.05947018, "balance_loss_mlp": 1.09240294, "epoch": 0.015571922441004058, "flos": 22816706486400.0, "grad_norm": 2.362055538224131, "language_loss": 0.92159891, "learning_rate": 3.9976433440457205e-06, "loss": 0.94592613, "num_input_tokens_seen": 5448905, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 2.375, "step": 259, "time_per_iteration": 2.6555843353271484 }, { "auxiliary_loss_clip": 0.01326148, "auxiliary_loss_mlp": 0.01097094, "balance_loss_clip": 1.05606198, "balance_loss_mlp": 1.09715724, "epoch": 0.015632045693672027, "flos": 18947439486720.0, "grad_norm": 1.799427721837976, "language_loss": 0.97034281, "learning_rate": 3.997624972267954e-06, "loss": 0.99457526, "num_input_tokens_seen": 5466405, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 2.28125, "step": 260, "time_per_iteration": 2.613474130630493 }, { "auxiliary_loss_clip": 0.01339075, "auxiliary_loss_mlp": 0.01104922, "balance_loss_clip": 1.06160092, "balance_loss_mlp": 1.09662342, "epoch": 0.015692168946339995, "flos": 29971728046080.0, "grad_norm": 2.0445240434367027, "language_loss": 0.87273872, "learning_rate": 3.99760652920009e-06, "loss": 0.89717865, "num_input_tokens_seen": 5487055, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 2.421875, "step": 261, "time_per_iteration": 2.6415975093841553 }, { "auxiliary_loss_clip": 0.01331291, "auxiliary_loss_mlp": 0.01097652, "balance_loss_clip": 1.05290055, "balance_loss_mlp": 1.09364939, "epoch": 0.015752292199007967, "flos": 19392085946880.0, "grad_norm": 2.3982039322221005, "language_loss": 0.66797501, "learning_rate": 3.997588014842788e-06, "loss": 0.69226444, "num_input_tokens_seen": 5506600, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 2.375, "step": 262, "time_per_iteration": 2.7318918704986572 }, { "auxiliary_loss_clip": 0.01328141, "auxiliary_loss_mlp": 0.01110188, "balance_loss_clip": 1.06608009, "balance_loss_mlp": 1.09413111, "epoch": 0.015812415451675936, "flos": 20339804338560.0, "grad_norm": 2.345134909225795, "language_loss": 0.67320395, "learning_rate": 3.997569429196708e-06, "loss": 0.69758725, "num_input_tokens_seen": 5524350, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 2.34375, "step": 263, "time_per_iteration": 2.726876735687256 }, { "auxiliary_loss_clip": 0.01329767, "auxiliary_loss_mlp": 0.01100706, "balance_loss_clip": 1.05702758, "balance_loss_mlp": 1.09036946, "epoch": 0.015872538704343905, "flos": 17525412979200.0, "grad_norm": 2.442326351371745, "language_loss": 0.84338039, "learning_rate": 3.997550772262513e-06, "loss": 0.86768514, "num_input_tokens_seen": 5542145, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 2.390625, "step": 264, "time_per_iteration": 2.7284371852874756 }, { "auxiliary_loss_clip": 0.01334389, "auxiliary_loss_mlp": 0.01092335, "balance_loss_clip": 1.05030155, "balance_loss_mlp": 1.09648263, "epoch": 0.015932661957011873, "flos": 15260490944640.0, "grad_norm": 2.5169451561219827, "language_loss": 1.03589034, "learning_rate": 3.997532044040869e-06, "loss": 1.06015754, "num_input_tokens_seen": 5557920, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 2.375, "step": 265, "time_per_iteration": 2.7547338008880615 }, { "auxiliary_loss_clip": 0.0133636, "auxiliary_loss_mlp": 0.01108269, "balance_loss_clip": 1.06068087, "balance_loss_mlp": 1.09737575, "epoch": 0.015992785209679845, "flos": 20302528999680.0, "grad_norm": 2.6464176660171312, "language_loss": 0.74561679, "learning_rate": 3.997513244532445e-06, "loss": 0.77006304, "num_input_tokens_seen": 5576290, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 2.390625, "step": 266, "time_per_iteration": 2.73130464553833 }, { "auxiliary_loss_clip": 0.01324626, "auxiliary_loss_mlp": 0.01091433, "balance_loss_clip": 1.04763508, "balance_loss_mlp": 1.09470797, "epoch": 0.016052908462347814, "flos": 23362368969600.0, "grad_norm": 2.2959907645942037, "language_loss": 0.90054798, "learning_rate": 3.997494373737912e-06, "loss": 0.9247086, "num_input_tokens_seen": 5595205, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 2.296875, "step": 267, "time_per_iteration": 5.685004472732544 }, { "auxiliary_loss_clip": 0.0133527, "auxiliary_loss_mlp": 0.01108514, "balance_loss_clip": 1.06552672, "balance_loss_mlp": 1.09615862, "epoch": 0.016113031715015783, "flos": 21286588976640.0, "grad_norm": 3.4347597389956097, "language_loss": 0.85074139, "learning_rate": 3.997475431657943e-06, "loss": 0.87517923, "num_input_tokens_seen": 5612645, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 2.390625, "step": 268, "time_per_iteration": 2.560570240020752 }, { "auxiliary_loss_clip": 0.0132865, "auxiliary_loss_mlp": 0.0109113, "balance_loss_clip": 1.0483582, "balance_loss_mlp": 1.09794247, "epoch": 0.01617315496768375, "flos": 18914689261440.0, "grad_norm": 2.183838497796688, "language_loss": 0.87636745, "learning_rate": 3.9974564182932135e-06, "loss": 0.90056527, "num_input_tokens_seen": 5628345, "router_z_loss_clip": 0.42773438, "router_z_loss_mlp": 2.3125, "step": 269, "time_per_iteration": 2.5324227809906006 }, { "auxiliary_loss_clip": 0.01330847, "auxiliary_loss_mlp": 0.01095604, "balance_loss_clip": 1.04854047, "balance_loss_mlp": 1.0938077, "epoch": 0.01623327822035172, "flos": 16546488647040.0, "grad_norm": 2.0988041826680948, "language_loss": 0.96281421, "learning_rate": 3.997437333644403e-06, "loss": 0.98707873, "num_input_tokens_seen": 5645940, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 2.375, "step": 270, "time_per_iteration": 2.6169915199279785 }, { "auxiliary_loss_clip": 0.0133205, "auxiliary_loss_mlp": 0.01108302, "balance_loss_clip": 1.06488562, "balance_loss_mlp": 1.09934378, "epoch": 0.016293401473019692, "flos": 23513481486720.0, "grad_norm": 1.998009594234162, "language_loss": 0.85278732, "learning_rate": 3.9974181777121915e-06, "loss": 0.87719083, "num_input_tokens_seen": 5665690, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 2.328125, "step": 271, "time_per_iteration": 2.5958306789398193 }, { "auxiliary_loss_clip": 0.01330256, "auxiliary_loss_mlp": 0.01091493, "balance_loss_clip": 1.04881585, "balance_loss_mlp": 1.09144616, "epoch": 0.01635352472568766, "flos": 29016072748800.0, "grad_norm": 4.228506345685007, "language_loss": 0.81063259, "learning_rate": 3.997398950497263e-06, "loss": 0.83485007, "num_input_tokens_seen": 5683190, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 2.390625, "step": 272, "time_per_iteration": 2.6072776317596436 }, { "auxiliary_loss_clip": 0.01324728, "auxiliary_loss_mlp": 0.01115118, "balance_loss_clip": 1.07363296, "balance_loss_mlp": 1.09408391, "epoch": 0.01641364797835563, "flos": 13370513028480.0, "grad_norm": 2.200548114666878, "language_loss": 0.80089444, "learning_rate": 3.9973796520003044e-06, "loss": 0.82529283, "num_input_tokens_seen": 5699780, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 2.3125, "step": 273, "time_per_iteration": 2.5180585384368896 }, { "auxiliary_loss_clip": 0.01323046, "auxiliary_loss_mlp": 0.01095656, "balance_loss_clip": 1.05319381, "balance_loss_mlp": 1.09001303, "epoch": 0.016473771231023598, "flos": 18878239935360.0, "grad_norm": 2.290833922401326, "language_loss": 0.90788639, "learning_rate": 3.997360282222004e-06, "loss": 0.93207341, "num_input_tokens_seen": 5716980, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 2.328125, "step": 274, "time_per_iteration": 2.539487600326538 }, { "auxiliary_loss_clip": 0.01321827, "auxiliary_loss_mlp": 0.01099436, "balance_loss_clip": 1.05606747, "balance_loss_mlp": 1.09155095, "epoch": 0.016533894483691566, "flos": 22601637803520.0, "grad_norm": 1.8798181899112463, "language_loss": 0.8732301, "learning_rate": 3.997340841163053e-06, "loss": 0.89744276, "num_input_tokens_seen": 5737780, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 2.3125, "step": 275, "time_per_iteration": 2.6877739429473877 }, { "auxiliary_loss_clip": 0.01325114, "auxiliary_loss_mlp": 0.01097336, "balance_loss_clip": 1.051965, "balance_loss_mlp": 1.09200346, "epoch": 0.01659401773635954, "flos": 21507188353920.0, "grad_norm": 1.6111534497167528, "language_loss": 0.80424142, "learning_rate": 3.9973213288241445e-06, "loss": 0.82846594, "num_input_tokens_seen": 5758330, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 2.328125, "step": 276, "time_per_iteration": 2.6243934631347656 }, { "auxiliary_loss_clip": 0.01316995, "auxiliary_loss_mlp": 0.01095608, "balance_loss_clip": 1.05514789, "balance_loss_mlp": 1.09155297, "epoch": 0.016654140989027507, "flos": 32850973411200.0, "grad_norm": 2.0065769792560872, "language_loss": 0.80730081, "learning_rate": 3.997301745205976e-06, "loss": 0.83142674, "num_input_tokens_seen": 5778340, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 2.25, "step": 277, "time_per_iteration": 2.650425434112549 }, { "auxiliary_loss_clip": 0.01316965, "auxiliary_loss_mlp": 0.01089087, "balance_loss_clip": 1.04593301, "balance_loss_mlp": 1.08657908, "epoch": 0.016714264241695476, "flos": 12306228024960.0, "grad_norm": 3.2339010360139526, "language_loss": 0.79775238, "learning_rate": 3.997282090309246e-06, "loss": 0.82181293, "num_input_tokens_seen": 5794295, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 2.3125, "step": 278, "time_per_iteration": 2.5992372035980225 }, { "auxiliary_loss_clip": 0.0131955, "auxiliary_loss_mlp": 0.0109382, "balance_loss_clip": 1.05254984, "balance_loss_mlp": 1.09040999, "epoch": 0.016774387494363444, "flos": 27123796362240.0, "grad_norm": 1.881572117984159, "language_loss": 0.9035027, "learning_rate": 3.9972623641346555e-06, "loss": 0.92763638, "num_input_tokens_seen": 5814405, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 2.296875, "step": 279, "time_per_iteration": 2.619685173034668 }, { "auxiliary_loss_clip": 0.01322637, "auxiliary_loss_mlp": 0.01106325, "balance_loss_clip": 1.06386256, "balance_loss_mlp": 1.09007108, "epoch": 0.016834510747031413, "flos": 20191493082240.0, "grad_norm": 8.873164033820073, "language_loss": 0.9369666, "learning_rate": 3.9972425666829085e-06, "loss": 0.96125615, "num_input_tokens_seen": 5832795, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 2.328125, "step": 280, "time_per_iteration": 2.622232675552368 }, { "auxiliary_loss_clip": 0.01324004, "auxiliary_loss_mlp": 0.01095591, "balance_loss_clip": 1.05229449, "balance_loss_mlp": 1.08826756, "epoch": 0.016894633999699385, "flos": 27274262434560.0, "grad_norm": 2.0214823357264535, "language_loss": 0.73567635, "learning_rate": 3.997222697954712e-06, "loss": 0.75987232, "num_input_tokens_seen": 5855750, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 2.359375, "step": 281, "time_per_iteration": 2.717432975769043 }, { "auxiliary_loss_clip": 0.01322838, "auxiliary_loss_mlp": 0.01110182, "balance_loss_clip": 1.06845915, "balance_loss_mlp": 1.09269083, "epoch": 0.016954757252367354, "flos": 14902964922240.0, "grad_norm": 2.414723937168111, "language_loss": 0.79856253, "learning_rate": 3.997202757950775e-06, "loss": 0.82289279, "num_input_tokens_seen": 5872610, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 2.296875, "step": 282, "time_per_iteration": 2.6688809394836426 }, { "auxiliary_loss_clip": 0.01326069, "auxiliary_loss_mlp": 0.01113773, "balance_loss_clip": 1.07081032, "balance_loss_mlp": 1.09302831, "epoch": 0.017014880505035322, "flos": 21358805270400.0, "grad_norm": 1.899151692020778, "language_loss": 0.77383268, "learning_rate": 3.997182746671809e-06, "loss": 0.79823112, "num_input_tokens_seen": 5892985, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 2.328125, "step": 283, "time_per_iteration": 2.601694345474243 }, { "auxiliary_loss_clip": 0.01324375, "auxiliary_loss_mlp": 0.01084751, "balance_loss_clip": 1.04464853, "balance_loss_mlp": 1.09311223, "epoch": 0.01707500375770329, "flos": 35333154858240.0, "grad_norm": 2.098300043921156, "language_loss": 0.83716559, "learning_rate": 3.997162664118528e-06, "loss": 0.86125684, "num_input_tokens_seen": 5914060, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 2.3125, "step": 284, "time_per_iteration": 2.880491018295288 }, { "auxiliary_loss_clip": 0.01316762, "auxiliary_loss_mlp": 0.01094154, "balance_loss_clip": 1.05088127, "balance_loss_mlp": 1.08850288, "epoch": 0.01713512701037126, "flos": 23582070506880.0, "grad_norm": 4.021906810764536, "language_loss": 0.96516901, "learning_rate": 3.99714251029165e-06, "loss": 0.98927814, "num_input_tokens_seen": 5932860, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 2.28125, "step": 285, "time_per_iteration": 2.662238121032715 }, { "auxiliary_loss_clip": 0.01318643, "auxiliary_loss_mlp": 0.0108882, "balance_loss_clip": 1.04852724, "balance_loss_mlp": 1.09050608, "epoch": 0.01719525026303923, "flos": 27634661544960.0, "grad_norm": 2.052421742998031, "language_loss": 0.93243122, "learning_rate": 3.997122285191892e-06, "loss": 0.95650578, "num_input_tokens_seen": 5952725, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 2.28125, "step": 286, "time_per_iteration": 2.6657190322875977 }, { "auxiliary_loss_clip": 0.01311355, "auxiliary_loss_mlp": 0.01081792, "balance_loss_clip": 1.03987861, "balance_loss_mlp": 1.08632541, "epoch": 0.0172553735157072, "flos": 26979722910720.0, "grad_norm": 2.008851809466569, "language_loss": 0.91628647, "learning_rate": 3.997101988819976e-06, "loss": 0.94021797, "num_input_tokens_seen": 5970560, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 2.25, "step": 287, "time_per_iteration": 2.6737489700317383 }, { "auxiliary_loss_clip": 0.01315209, "auxiliary_loss_mlp": 0.010849, "balance_loss_clip": 1.0435822, "balance_loss_mlp": 1.0857017, "epoch": 0.01731549676837517, "flos": 14056621689600.0, "grad_norm": 4.0487803859103835, "language_loss": 1.01115155, "learning_rate": 3.997081621176629e-06, "loss": 1.03515255, "num_input_tokens_seen": 5982980, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 2.296875, "step": 288, "time_per_iteration": 2.568899393081665 }, { "auxiliary_loss_clip": 0.0119939, "auxiliary_loss_mlp": 0.0104195, "balance_loss_clip": 1.0292666, "balance_loss_mlp": 1.07766604, "epoch": 0.017375620021043137, "flos": 66510694471680.0, "grad_norm": 0.8955218302345069, "language_loss": 0.63943136, "learning_rate": 3.997061182262575e-06, "loss": 0.66184473, "num_input_tokens_seen": 6049445, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 1.21875, "step": 289, "time_per_iteration": 3.3559372425079346 }, { "auxiliary_loss_clip": 0.01317764, "auxiliary_loss_mlp": 0.01102276, "balance_loss_clip": 1.0621264, "balance_loss_mlp": 1.09054708, "epoch": 0.01743574327371111, "flos": 15225154940160.0, "grad_norm": 2.258674261165782, "language_loss": 0.88101155, "learning_rate": 3.997040672078545e-06, "loss": 0.90521193, "num_input_tokens_seen": 6064150, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 2.28125, "step": 290, "time_per_iteration": 2.564758777618408 }, { "auxiliary_loss_clip": 0.01315653, "auxiliary_loss_mlp": 0.01086716, "balance_loss_clip": 1.04485011, "balance_loss_mlp": 1.08876002, "epoch": 0.017495866526379078, "flos": 25373869574400.0, "grad_norm": 1.9635253036405966, "language_loss": 0.83832991, "learning_rate": 3.997020090625269e-06, "loss": 0.86235368, "num_input_tokens_seen": 6083920, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 2.265625, "step": 291, "time_per_iteration": 2.6508572101593018 }, { "auxiliary_loss_clip": 0.01322096, "auxiliary_loss_mlp": 0.01104477, "balance_loss_clip": 1.0599885, "balance_loss_mlp": 1.09686637, "epoch": 0.017555989779047047, "flos": 26359473836160.0, "grad_norm": 1.7400390466473412, "language_loss": 0.72272015, "learning_rate": 3.996999437903485e-06, "loss": 0.74698591, "num_input_tokens_seen": 6105460, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 2.25, "step": 292, "time_per_iteration": 2.6479482650756836 }, { "auxiliary_loss_clip": 0.01310615, "auxiliary_loss_mlp": 0.0109789, "balance_loss_clip": 1.05552316, "balance_loss_mlp": 1.08848882, "epoch": 0.017616113031715015, "flos": 22338807010560.0, "grad_norm": 2.85408293967315, "language_loss": 0.86419868, "learning_rate": 3.996978713913927e-06, "loss": 0.88828373, "num_input_tokens_seen": 6122890, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 2.21875, "step": 293, "time_per_iteration": 2.682191848754883 }, { "auxiliary_loss_clip": 0.01309131, "auxiliary_loss_mlp": 0.01093443, "balance_loss_clip": 1.05226779, "balance_loss_mlp": 1.08665133, "epoch": 0.017676236284382984, "flos": 20156911263360.0, "grad_norm": 2.0610052587433554, "language_loss": 0.80281186, "learning_rate": 3.996957918657335e-06, "loss": 0.82683754, "num_input_tokens_seen": 6142890, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 2.234375, "step": 294, "time_per_iteration": 2.602905035018921 }, { "auxiliary_loss_clip": 0.01309127, "auxiliary_loss_mlp": 0.01104934, "balance_loss_clip": 1.06316328, "balance_loss_mlp": 1.08346772, "epoch": 0.017736359537050956, "flos": 25223331674880.0, "grad_norm": 1.934622238144232, "language_loss": 0.83750343, "learning_rate": 3.996937052134452e-06, "loss": 0.86164403, "num_input_tokens_seen": 6162030, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 2.25, "step": 295, "time_per_iteration": 2.7121920585632324 }, { "auxiliary_loss_clip": 0.01316173, "auxiliary_loss_mlp": 0.01103482, "balance_loss_clip": 1.0623548, "balance_loss_mlp": 1.09397125, "epoch": 0.017796482789718925, "flos": 20338798757760.0, "grad_norm": 1.938004935542463, "language_loss": 0.83734357, "learning_rate": 3.996916114346023e-06, "loss": 0.86154008, "num_input_tokens_seen": 6180540, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 2.21875, "step": 296, "time_per_iteration": 2.6468639373779297 }, { "auxiliary_loss_clip": 0.01314952, "auxiliary_loss_mlp": 0.0109612, "balance_loss_clip": 1.05570769, "balance_loss_mlp": 1.08916283, "epoch": 0.017856606042386893, "flos": 22379206832640.0, "grad_norm": 2.7215825718995954, "language_loss": 0.87464118, "learning_rate": 3.996895105292794e-06, "loss": 0.89875197, "num_input_tokens_seen": 6199425, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 2.265625, "step": 297, "time_per_iteration": 2.647000312805176 }, { "auxiliary_loss_clip": 0.01312109, "auxiliary_loss_mlp": 0.01096347, "balance_loss_clip": 1.05624557, "balance_loss_mlp": 1.08718848, "epoch": 0.017916729295054862, "flos": 20230061310720.0, "grad_norm": 2.1806547744629747, "language_loss": 0.88141668, "learning_rate": 3.996874024975515e-06, "loss": 0.90550125, "num_input_tokens_seen": 6219170, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 2.25, "step": 298, "time_per_iteration": 2.613859176635742 }, { "auxiliary_loss_clip": 0.01307927, "auxiliary_loss_mlp": 0.01111536, "balance_loss_clip": 1.069098, "balance_loss_mlp": 1.08730257, "epoch": 0.01797685254772283, "flos": 19390972625280.0, "grad_norm": 3.5111014004899, "language_loss": 0.8814947, "learning_rate": 3.996852873394939e-06, "loss": 0.90568924, "num_input_tokens_seen": 6237930, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 2.21875, "step": 299, "time_per_iteration": 2.5755691528320312 }, { "auxiliary_loss_clip": 0.01316258, "auxiliary_loss_mlp": 0.01092821, "balance_loss_clip": 1.05085921, "balance_loss_mlp": 1.08907545, "epoch": 0.018036975800390802, "flos": 24426007528320.0, "grad_norm": 8.394737207703779, "language_loss": 0.63999122, "learning_rate": 3.996831650551821e-06, "loss": 0.66408199, "num_input_tokens_seen": 6257170, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 2.265625, "step": 300, "time_per_iteration": 2.566180944442749 }, { "auxiliary_loss_clip": 0.01314716, "auxiliary_loss_mlp": 0.0109952, "balance_loss_clip": 1.05777287, "balance_loss_mlp": 1.09113884, "epoch": 0.01809709905305877, "flos": 15778933896960.0, "grad_norm": 5.390447341026992, "language_loss": 0.87867808, "learning_rate": 3.996810356446917e-06, "loss": 0.90282053, "num_input_tokens_seen": 6274780, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 2.234375, "step": 301, "time_per_iteration": 2.5326147079467773 }, { "auxiliary_loss_clip": 0.01195044, "auxiliary_loss_mlp": 0.01031025, "balance_loss_clip": 1.01853204, "balance_loss_mlp": 1.08023894, "epoch": 0.01815722230572674, "flos": 67348382526720.0, "grad_norm": 0.8544634960590001, "language_loss": 0.6220907, "learning_rate": 3.996788991080988e-06, "loss": 0.64435136, "num_input_tokens_seen": 6340435, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 1.1484375, "step": 302, "time_per_iteration": 3.342679500579834 }, { "auxiliary_loss_clip": 0.0130412, "auxiliary_loss_mlp": 0.01098458, "balance_loss_clip": 1.05728316, "balance_loss_mlp": 1.0835197, "epoch": 0.01821734555839471, "flos": 15485615435520.0, "grad_norm": 1.9945499347759685, "language_loss": 0.89004242, "learning_rate": 3.996767554454796e-06, "loss": 0.91406822, "num_input_tokens_seen": 6358160, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 2.203125, "step": 303, "time_per_iteration": 2.542099714279175 }, { "auxiliary_loss_clip": 0.01313439, "auxiliary_loss_mlp": 0.01110319, "balance_loss_clip": 1.06759429, "balance_loss_mlp": 1.09064817, "epoch": 0.018277468811062677, "flos": 24097424889600.0, "grad_norm": 1.801713682311557, "language_loss": 0.79728448, "learning_rate": 3.996746046569107e-06, "loss": 0.821522, "num_input_tokens_seen": 6378485, "router_z_loss_clip": 0.42773438, "router_z_loss_mlp": 2.234375, "step": 304, "time_per_iteration": 2.730572462081909 }, { "auxiliary_loss_clip": 0.01308462, "auxiliary_loss_mlp": 0.01084688, "balance_loss_clip": 1.04592085, "balance_loss_mlp": 1.09243035, "epoch": 0.01833759206373065, "flos": 20959335141120.0, "grad_norm": 1.6325557531539698, "language_loss": 0.82361007, "learning_rate": 3.996724467424687e-06, "loss": 0.84754157, "num_input_tokens_seen": 6397845, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 2.15625, "step": 305, "time_per_iteration": 2.5148913860321045 }, { "auxiliary_loss_clip": 0.01308051, "auxiliary_loss_mlp": 0.01085634, "balance_loss_clip": 1.04665208, "balance_loss_mlp": 1.08507192, "epoch": 0.018397715316398618, "flos": 19390757143680.0, "grad_norm": 3.42454620127531, "language_loss": 0.90061152, "learning_rate": 3.996702817022308e-06, "loss": 0.92454839, "num_input_tokens_seen": 6416475, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 2.234375, "step": 306, "time_per_iteration": 2.5996899604797363 }, { "auxiliary_loss_clip": 0.01303779, "auxiliary_loss_mlp": 0.01084421, "balance_loss_clip": 1.04560661, "balance_loss_mlp": 1.08340514, "epoch": 0.018457838569066586, "flos": 29132531619840.0, "grad_norm": 3.1102010611883384, "language_loss": 0.86104894, "learning_rate": 3.996681095362741e-06, "loss": 0.88493097, "num_input_tokens_seen": 6437520, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 2.21875, "step": 307, "time_per_iteration": 2.6317741870880127 }, { "auxiliary_loss_clip": 0.01305395, "auxiliary_loss_mlp": 0.01094731, "balance_loss_clip": 1.05138683, "balance_loss_mlp": 1.08778942, "epoch": 0.018517961821734555, "flos": 19208654167680.0, "grad_norm": 3.0096289246351193, "language_loss": 0.7113359, "learning_rate": 3.996659302446762e-06, "loss": 0.73533714, "num_input_tokens_seen": 6455680, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 2.171875, "step": 308, "time_per_iteration": 5.551621913909912 }, { "auxiliary_loss_clip": 0.01307876, "auxiliary_loss_mlp": 0.01096039, "balance_loss_clip": 1.05584145, "balance_loss_mlp": 1.08336496, "epoch": 0.018578085074402523, "flos": 19863018184320.0, "grad_norm": 2.4622401626573827, "language_loss": 0.91653508, "learning_rate": 3.996637438275148e-06, "loss": 0.94057417, "num_input_tokens_seen": 6474880, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 2.25, "step": 309, "time_per_iteration": 4.07793927192688 }, { "auxiliary_loss_clip": 0.01317042, "auxiliary_loss_mlp": 0.01093271, "balance_loss_clip": 1.05140507, "balance_loss_mlp": 1.08683729, "epoch": 0.018638208327070496, "flos": 29606947476480.0, "grad_norm": 2.093776882007538, "language_loss": 0.72045308, "learning_rate": 3.99661550284868e-06, "loss": 0.74455625, "num_input_tokens_seen": 6495945, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 2.3125, "step": 310, "time_per_iteration": 2.6745288372039795 }, { "auxiliary_loss_clip": 0.01308832, "auxiliary_loss_mlp": 0.01100933, "balance_loss_clip": 1.06130803, "balance_loss_mlp": 1.0896095, "epoch": 0.018698331579738464, "flos": 45731555907840.0, "grad_norm": 2.411033931770992, "language_loss": 0.73620152, "learning_rate": 3.996593496168141e-06, "loss": 0.76029921, "num_input_tokens_seen": 6519930, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 2.1875, "step": 311, "time_per_iteration": 2.7694578170776367 }, { "auxiliary_loss_clip": 0.01315355, "auxiliary_loss_mlp": 0.0109879, "balance_loss_clip": 1.05935526, "balance_loss_mlp": 1.0868082, "epoch": 0.018758454832406433, "flos": 20483662308480.0, "grad_norm": 7.062513486345706, "language_loss": 0.90941608, "learning_rate": 3.996571418234316e-06, "loss": 0.93355757, "num_input_tokens_seen": 6535070, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 2.28125, "step": 312, "time_per_iteration": 2.629993438720703 }, { "auxiliary_loss_clip": 0.01315063, "auxiliary_loss_mlp": 0.01099681, "balance_loss_clip": 1.05829191, "balance_loss_mlp": 1.08866048, "epoch": 0.0188185780850744, "flos": 15777784661760.0, "grad_norm": 2.023607521292682, "language_loss": 0.89136022, "learning_rate": 3.996549269047992e-06, "loss": 0.91550767, "num_input_tokens_seen": 6554135, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 2.265625, "step": 313, "time_per_iteration": 2.5152945518493652 }, { "auxiliary_loss_clip": 0.01318111, "auxiliary_loss_mlp": 0.01088227, "balance_loss_clip": 1.04700494, "balance_loss_mlp": 1.08808112, "epoch": 0.018878701337742373, "flos": 22455732758400.0, "grad_norm": 2.9043290267043487, "language_loss": 0.7241922, "learning_rate": 3.996527048609961e-06, "loss": 0.74825555, "num_input_tokens_seen": 6572275, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 2.296875, "step": 314, "time_per_iteration": 2.594820499420166 }, { "auxiliary_loss_clip": 0.0130837, "auxiliary_loss_mlp": 0.01100081, "balance_loss_clip": 1.06078982, "balance_loss_mlp": 1.08536816, "epoch": 0.018938824590410342, "flos": 30993530238720.0, "grad_norm": 2.3714548118295427, "language_loss": 0.88484573, "learning_rate": 3.996504756921015e-06, "loss": 0.9089303, "num_input_tokens_seen": 6594520, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 2.234375, "step": 315, "time_per_iteration": 2.587191104888916 }, { "auxiliary_loss_clip": 0.01308656, "auxiliary_loss_mlp": 0.0108638, "balance_loss_clip": 1.04572988, "balance_loss_mlp": 1.08659256, "epoch": 0.01899894784307831, "flos": 23258910821760.0, "grad_norm": 1.987575867690697, "language_loss": 0.8026908, "learning_rate": 3.996482393981951e-06, "loss": 0.8266412, "num_input_tokens_seen": 6614245, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 2.21875, "step": 316, "time_per_iteration": 2.559706211090088 }, { "auxiliary_loss_clip": 0.01306576, "auxiliary_loss_mlp": 0.01084524, "balance_loss_clip": 1.04547071, "balance_loss_mlp": 1.08545589, "epoch": 0.01905907109574628, "flos": 17457901367040.0, "grad_norm": 1.8713345292680401, "language_loss": 0.89968061, "learning_rate": 3.996459959793564e-06, "loss": 0.92359161, "num_input_tokens_seen": 6632015, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 2.21875, "step": 317, "time_per_iteration": 2.5836169719696045 }, { "auxiliary_loss_clip": 0.01303845, "auxiliary_loss_mlp": 0.01090998, "balance_loss_clip": 1.05115867, "balance_loss_mlp": 1.08390307, "epoch": 0.019119194348414248, "flos": 14970225139200.0, "grad_norm": 2.5307678446185653, "language_loss": 0.90419543, "learning_rate": 3.996437454356658e-06, "loss": 0.92814392, "num_input_tokens_seen": 6649015, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 2.1875, "step": 318, "time_per_iteration": 2.5630390644073486 }, { "auxiliary_loss_clip": 0.01301791, "auxiliary_loss_mlp": 0.01080931, "balance_loss_clip": 1.04278421, "balance_loss_mlp": 1.08246756, "epoch": 0.01917931760108222, "flos": 25482822503040.0, "grad_norm": 4.473369803341623, "language_loss": 0.93013334, "learning_rate": 3.996414877672034e-06, "loss": 0.9539606, "num_input_tokens_seen": 6669225, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 2.1875, "step": 319, "time_per_iteration": 2.559276819229126 }, { "auxiliary_loss_clip": 0.01178518, "auxiliary_loss_mlp": 0.01015568, "balance_loss_clip": 1.00455308, "balance_loss_mlp": 1.06691146, "epoch": 0.01923944085375019, "flos": 71556967353600.0, "grad_norm": 0.894056706900591, "language_loss": 0.59697652, "learning_rate": 3.996392229740498e-06, "loss": 0.61891735, "num_input_tokens_seen": 6725775, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 1.1171875, "step": 320, "time_per_iteration": 3.0775701999664307 }, { "auxiliary_loss_clip": 0.01301209, "auxiliary_loss_mlp": 0.01090526, "balance_loss_clip": 1.05221176, "balance_loss_mlp": 1.08297038, "epoch": 0.019299564106418157, "flos": 19682495406720.0, "grad_norm": 2.3997574362834753, "language_loss": 0.89051211, "learning_rate": 3.99636951056286e-06, "loss": 0.91442943, "num_input_tokens_seen": 6744170, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 2.1875, "step": 321, "time_per_iteration": 2.518080711364746 }, { "auxiliary_loss_clip": 0.01311928, "auxiliary_loss_mlp": 0.01108271, "balance_loss_clip": 1.06800234, "balance_loss_mlp": 1.08718443, "epoch": 0.019359687359086126, "flos": 24387151991040.0, "grad_norm": 2.241055932274248, "language_loss": 0.82277524, "learning_rate": 3.996346720139928e-06, "loss": 0.84697729, "num_input_tokens_seen": 6764565, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 2.25, "step": 322, "time_per_iteration": 2.623497247695923 }, { "auxiliary_loss_clip": 0.01313283, "auxiliary_loss_mlp": 0.01088239, "balance_loss_clip": 1.05040193, "balance_loss_mlp": 1.0864222, "epoch": 0.019419810611754094, "flos": 23951376190080.0, "grad_norm": 2.0197806471714634, "language_loss": 0.72001493, "learning_rate": 3.996323858472518e-06, "loss": 0.74403012, "num_input_tokens_seen": 6785310, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 2.265625, "step": 323, "time_per_iteration": 2.5912911891937256 }, { "auxiliary_loss_clip": 0.01301267, "auxiliary_loss_mlp": 0.01084289, "balance_loss_clip": 1.04475892, "balance_loss_mlp": 1.08064079, "epoch": 0.019479933864422067, "flos": 22160223567360.0, "grad_norm": 1.8363304181349946, "language_loss": 0.91717589, "learning_rate": 3.996300925561445e-06, "loss": 0.94103146, "num_input_tokens_seen": 6803290, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 2.203125, "step": 324, "time_per_iteration": 2.5433850288391113 }, { "auxiliary_loss_clip": 0.01309147, "auxiliary_loss_mlp": 0.01079977, "balance_loss_clip": 1.04309392, "balance_loss_mlp": 1.08677936, "epoch": 0.019540057117090035, "flos": 22236821320320.0, "grad_norm": 3.8932951615024542, "language_loss": 0.6439684, "learning_rate": 3.996277921407525e-06, "loss": 0.66785967, "num_input_tokens_seen": 6822570, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 2.21875, "step": 325, "time_per_iteration": 2.597839593887329 }, { "auxiliary_loss_clip": 0.01308762, "auxiliary_loss_mlp": 0.01096164, "balance_loss_clip": 1.05775476, "balance_loss_mlp": 1.09159052, "epoch": 0.019600180369758004, "flos": 23076771932160.0, "grad_norm": 1.9160942711173397, "language_loss": 0.76406336, "learning_rate": 3.996254846011582e-06, "loss": 0.78811258, "num_input_tokens_seen": 6841910, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 2.1875, "step": 326, "time_per_iteration": 2.5959973335266113 }, { "auxiliary_loss_clip": 0.01303228, "auxiliary_loss_mlp": 0.01100191, "balance_loss_clip": 1.06068468, "balance_loss_mlp": 1.08643568, "epoch": 0.019660303622425972, "flos": 25410857604480.0, "grad_norm": 2.7188525601074574, "language_loss": 0.78588557, "learning_rate": 3.99623169937444e-06, "loss": 0.80991977, "num_input_tokens_seen": 6862480, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 2.171875, "step": 327, "time_per_iteration": 2.5897626876831055 }, { "auxiliary_loss_clip": 0.01307278, "auxiliary_loss_mlp": 0.01096581, "balance_loss_clip": 1.05836213, "balance_loss_mlp": 1.08751094, "epoch": 0.01972042687509394, "flos": 23657519024640.0, "grad_norm": 2.2085804416351293, "language_loss": 0.80564022, "learning_rate": 3.996208481496923e-06, "loss": 0.82967883, "num_input_tokens_seen": 6882015, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 2.1875, "step": 328, "time_per_iteration": 2.5976943969726562 }, { "auxiliary_loss_clip": 0.0130704, "auxiliary_loss_mlp": 0.01090265, "balance_loss_clip": 1.05001974, "balance_loss_mlp": 1.08299792, "epoch": 0.019780550127761913, "flos": 18223480869120.0, "grad_norm": 2.417427223458816, "language_loss": 0.92934602, "learning_rate": 3.996185192379858e-06, "loss": 0.95331907, "num_input_tokens_seen": 6899785, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 2.234375, "step": 329, "time_per_iteration": 2.549769878387451 }, { "auxiliary_loss_clip": 0.01312925, "auxiliary_loss_mlp": 0.01099347, "balance_loss_clip": 1.06057978, "balance_loss_mlp": 1.08762372, "epoch": 0.01984067338042988, "flos": 22418780641920.0, "grad_norm": 2.5870134803971636, "language_loss": 0.74536669, "learning_rate": 3.996161832024081e-06, "loss": 0.76948947, "num_input_tokens_seen": 6918575, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 2.25, "step": 330, "time_per_iteration": 2.5247230529785156 }, { "auxiliary_loss_clip": 0.01308355, "auxiliary_loss_mlp": 0.01103083, "balance_loss_clip": 1.06457853, "balance_loss_mlp": 1.08542395, "epoch": 0.01990079663309785, "flos": 17055199013760.0, "grad_norm": 2.5699495297743113, "language_loss": 0.93396461, "learning_rate": 3.996138400430422e-06, "loss": 0.9580791, "num_input_tokens_seen": 6936965, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 2.21875, "step": 331, "time_per_iteration": 2.589275598526001 }, { "auxiliary_loss_clip": 0.01296652, "auxiliary_loss_mlp": 0.01078469, "balance_loss_clip": 1.04177654, "balance_loss_mlp": 1.08410263, "epoch": 0.01996091988576582, "flos": 15961791058560.0, "grad_norm": 12.595818008145388, "language_loss": 0.92313915, "learning_rate": 3.996114897599718e-06, "loss": 0.94689029, "num_input_tokens_seen": 6953475, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 2.125, "step": 332, "time_per_iteration": 2.507871150970459 }, { "auxiliary_loss_clip": 0.01304419, "auxiliary_loss_mlp": 0.01079013, "balance_loss_clip": 1.04050875, "balance_loss_mlp": 1.08868861, "epoch": 0.02002104313843379, "flos": 23586451966080.0, "grad_norm": 6.81647987901557, "language_loss": 0.7509802, "learning_rate": 3.996091323532807e-06, "loss": 0.77481455, "num_input_tokens_seen": 6971630, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 2.15625, "step": 333, "time_per_iteration": 2.6540489196777344 }, { "auxiliary_loss_clip": 0.01303969, "auxiliary_loss_mlp": 0.01081187, "balance_loss_clip": 1.04363656, "balance_loss_mlp": 1.08403778, "epoch": 0.02008116639110176, "flos": 34094883352320.0, "grad_norm": 2.1925376787827604, "language_loss": 0.78173566, "learning_rate": 3.996067678230532e-06, "loss": 0.80558729, "num_input_tokens_seen": 6992775, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 2.203125, "step": 334, "time_per_iteration": 2.664149522781372 }, { "auxiliary_loss_clip": 0.01304286, "auxiliary_loss_mlp": 0.01093156, "balance_loss_clip": 1.05286336, "balance_loss_mlp": 1.08075523, "epoch": 0.020141289643769728, "flos": 19683716469120.0, "grad_norm": 1.7686284560730514, "language_loss": 0.82786554, "learning_rate": 3.996043961693736e-06, "loss": 0.85184002, "num_input_tokens_seen": 7011425, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 2.234375, "step": 335, "time_per_iteration": 2.623809814453125 }, { "auxiliary_loss_clip": 0.01302859, "auxiliary_loss_mlp": 0.01079958, "balance_loss_clip": 1.04216862, "balance_loss_mlp": 1.08505869, "epoch": 0.020201412896437697, "flos": 20740567357440.0, "grad_norm": 2.7250218784817384, "language_loss": 0.91918254, "learning_rate": 3.996020173923266e-06, "loss": 0.94301069, "num_input_tokens_seen": 7029450, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 2.1875, "step": 336, "time_per_iteration": 2.577200174331665 }, { "auxiliary_loss_clip": 0.01299558, "auxiliary_loss_mlp": 0.01074795, "balance_loss_clip": 1.0372442, "balance_loss_mlp": 1.08013225, "epoch": 0.020261536149105665, "flos": 20266510636800.0, "grad_norm": 1.713382620224874, "language_loss": 0.87799799, "learning_rate": 3.99599631491997e-06, "loss": 0.90174162, "num_input_tokens_seen": 7047555, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 2.1875, "step": 337, "time_per_iteration": 2.633516788482666 }, { "auxiliary_loss_clip": 0.01292433, "auxiliary_loss_mlp": 0.01097384, "balance_loss_clip": 1.05842614, "balance_loss_mlp": 1.07895708, "epoch": 0.020321659401773638, "flos": 25848752307840.0, "grad_norm": 1.5204681707118783, "language_loss": 0.89787173, "learning_rate": 3.995972384684699e-06, "loss": 0.92176992, "num_input_tokens_seen": 7068185, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 2.125, "step": 338, "time_per_iteration": 2.651075839996338 }, { "auxiliary_loss_clip": 0.01301595, "auxiliary_loss_mlp": 0.01091129, "balance_loss_clip": 1.05272019, "balance_loss_mlp": 1.08209753, "epoch": 0.020381782654441606, "flos": 17495033051520.0, "grad_norm": 2.3148897430879636, "language_loss": 0.84514648, "learning_rate": 3.995948383218309e-06, "loss": 0.86907369, "num_input_tokens_seen": 7085955, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 2.1875, "step": 339, "time_per_iteration": 2.5827178955078125 }, { "auxiliary_loss_clip": 0.01309191, "auxiliary_loss_mlp": 0.01087033, "balance_loss_clip": 1.04824257, "balance_loss_mlp": 1.08660853, "epoch": 0.020441905907109575, "flos": 24243940465920.0, "grad_norm": 1.8093266019970038, "language_loss": 0.88632244, "learning_rate": 3.995924310521655e-06, "loss": 0.91028464, "num_input_tokens_seen": 7106345, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 2.21875, "step": 340, "time_per_iteration": 2.6051225662231445 }, { "auxiliary_loss_clip": 0.01304518, "auxiliary_loss_mlp": 0.01075985, "balance_loss_clip": 1.03571641, "balance_loss_mlp": 1.08427286, "epoch": 0.020502029159777543, "flos": 22233301787520.0, "grad_norm": 2.3894307263635337, "language_loss": 0.87866282, "learning_rate": 3.995900166595596e-06, "loss": 0.90246785, "num_input_tokens_seen": 7125070, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 2.203125, "step": 341, "time_per_iteration": 2.625159502029419 }, { "auxiliary_loss_clip": 0.01305338, "auxiliary_loss_mlp": 0.01093933, "balance_loss_clip": 1.05538118, "balance_loss_mlp": 1.08132017, "epoch": 0.020562152412445512, "flos": 23987861429760.0, "grad_norm": 3.320448261165454, "language_loss": 0.80185449, "learning_rate": 3.995875951440995e-06, "loss": 0.82584715, "num_input_tokens_seen": 7144675, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 2.234375, "step": 342, "time_per_iteration": 2.726144552230835 }, { "auxiliary_loss_clip": 0.01296306, "auxiliary_loss_mlp": 0.0109266, "balance_loss_clip": 1.05134225, "balance_loss_mlp": 1.08175886, "epoch": 0.020622275665113484, "flos": 26975305537920.0, "grad_norm": 1.9358334916160238, "language_loss": 0.88751888, "learning_rate": 3.995851665058715e-06, "loss": 0.91140854, "num_input_tokens_seen": 7165505, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 2.15625, "step": 343, "time_per_iteration": 2.660553455352783 }, { "auxiliary_loss_clip": 0.01305373, "auxiliary_loss_mlp": 0.01090517, "balance_loss_clip": 1.05306125, "balance_loss_mlp": 1.08756602, "epoch": 0.020682398917781453, "flos": 22600704049920.0, "grad_norm": 2.1256902728156932, "language_loss": 0.77530479, "learning_rate": 3.995827307449623e-06, "loss": 0.79926372, "num_input_tokens_seen": 7184605, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 2.1875, "step": 344, "time_per_iteration": 2.5972039699554443 }, { "auxiliary_loss_clip": 0.01292332, "auxiliary_loss_mlp": 0.01090193, "balance_loss_clip": 1.05104411, "balance_loss_mlp": 1.08105671, "epoch": 0.02074252217044942, "flos": 15013605790080.0, "grad_norm": 1.8298995820269282, "language_loss": 0.74482918, "learning_rate": 3.995802878614588e-06, "loss": 0.76865441, "num_input_tokens_seen": 7203065, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 2.109375, "step": 345, "time_per_iteration": 2.5927846431732178 }, { "auxiliary_loss_clip": 0.01297588, "auxiliary_loss_mlp": 0.01088454, "balance_loss_clip": 1.05023551, "balance_loss_mlp": 1.08476233, "epoch": 0.02080264542311739, "flos": 25337958952320.0, "grad_norm": 2.7683397337420717, "language_loss": 0.90453386, "learning_rate": 3.995778378554483e-06, "loss": 0.92839432, "num_input_tokens_seen": 7222995, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 2.125, "step": 346, "time_per_iteration": 2.698592185974121 }, { "auxiliary_loss_clip": 0.01295583, "auxiliary_loss_mlp": 0.01084596, "balance_loss_clip": 1.04876149, "balance_loss_mlp": 1.08099508, "epoch": 0.02086276867578536, "flos": 24388804016640.0, "grad_norm": 2.017597875394816, "language_loss": 0.78605306, "learning_rate": 3.99575380727018e-06, "loss": 0.80985487, "num_input_tokens_seen": 7244625, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 2.15625, "step": 347, "time_per_iteration": 2.5793559551239014 }, { "auxiliary_loss_clip": 0.01298767, "auxiliary_loss_mlp": 0.01088223, "balance_loss_clip": 1.04950356, "balance_loss_mlp": 1.08591509, "epoch": 0.02092289192845333, "flos": 24462205459200.0, "grad_norm": 5.239834075434785, "language_loss": 0.70420963, "learning_rate": 3.995729164762559e-06, "loss": 0.72807944, "num_input_tokens_seen": 7263255, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 2.125, "step": 348, "time_per_iteration": 2.6289217472076416 }, { "auxiliary_loss_clip": 0.01301467, "auxiliary_loss_mlp": 0.01090649, "balance_loss_clip": 1.05264544, "balance_loss_mlp": 1.08295035, "epoch": 0.0209830151811213, "flos": 17451185523840.0, "grad_norm": 2.608547168979691, "language_loss": 0.76338971, "learning_rate": 3.995704451032496e-06, "loss": 0.78731084, "num_input_tokens_seen": 7279275, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 2.1875, "step": 349, "time_per_iteration": 2.5405266284942627 }, { "auxiliary_loss_clip": 0.01287051, "auxiliary_loss_mlp": 0.01095146, "balance_loss_clip": 1.05809593, "balance_loss_mlp": 1.08148372, "epoch": 0.021043138433789268, "flos": 24573995562240.0, "grad_norm": 7.967012324848996, "language_loss": 0.84915042, "learning_rate": 3.995679666080876e-06, "loss": 0.87297237, "num_input_tokens_seen": 7300180, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 2.0625, "step": 350, "time_per_iteration": 5.582220792770386 }, { "auxiliary_loss_clip": 0.01294986, "auxiliary_loss_mlp": 0.01081267, "balance_loss_clip": 1.04631519, "balance_loss_mlp": 1.08518004, "epoch": 0.021103261686457236, "flos": 24454053072000.0, "grad_norm": 2.647525637894033, "language_loss": 0.79771632, "learning_rate": 3.995654809908581e-06, "loss": 0.82147884, "num_input_tokens_seen": 7317430, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 2.09375, "step": 351, "time_per_iteration": 2.6318933963775635 }, { "auxiliary_loss_clip": 0.01300414, "auxiliary_loss_mlp": 0.01093992, "balance_loss_clip": 1.05505836, "balance_loss_mlp": 1.08570981, "epoch": 0.021163384939125205, "flos": 14683083816960.0, "grad_norm": 2.135237691038618, "language_loss": 0.87079144, "learning_rate": 3.9956298825165005e-06, "loss": 0.89473546, "num_input_tokens_seen": 7334875, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 2.15625, "step": 352, "time_per_iteration": 2.504642963409424 }, { "auxiliary_loss_clip": 0.01296143, "auxiliary_loss_mlp": 0.01089491, "balance_loss_clip": 1.05112958, "balance_loss_mlp": 1.08596051, "epoch": 0.021223508191793177, "flos": 24493195918080.0, "grad_norm": 2.62934447630278, "language_loss": 0.82292032, "learning_rate": 3.995604883905522e-06, "loss": 0.84677672, "num_input_tokens_seen": 7355185, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 2.09375, "step": 353, "time_per_iteration": 2.651881217956543 }, { "auxiliary_loss_clip": 0.01291532, "auxiliary_loss_mlp": 0.01082936, "balance_loss_clip": 1.04743576, "balance_loss_mlp": 1.08263278, "epoch": 0.021283631444461146, "flos": 24126978804480.0, "grad_norm": 2.108515494896807, "language_loss": 0.80576992, "learning_rate": 3.995579814076539e-06, "loss": 0.82951462, "num_input_tokens_seen": 7374425, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 2.09375, "step": 354, "time_per_iteration": 2.5747287273406982 }, { "auxiliary_loss_clip": 0.01298616, "auxiliary_loss_mlp": 0.01084234, "balance_loss_clip": 1.04575288, "balance_loss_mlp": 1.08263862, "epoch": 0.021343754697129114, "flos": 25192233475200.0, "grad_norm": 1.938462698786331, "language_loss": 0.8040458, "learning_rate": 3.9955546730304455e-06, "loss": 0.8278743, "num_input_tokens_seen": 7394175, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 2.15625, "step": 355, "time_per_iteration": 2.6085288524627686 }, { "auxiliary_loss_clip": 0.01294779, "auxiliary_loss_mlp": 0.01082496, "balance_loss_clip": 1.04539835, "balance_loss_mlp": 1.08146048, "epoch": 0.021403877949797083, "flos": 17274182279040.0, "grad_norm": 2.8630463873847143, "language_loss": 0.88818896, "learning_rate": 3.995529460768139e-06, "loss": 0.91196173, "num_input_tokens_seen": 7412645, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 2.140625, "step": 356, "time_per_iteration": 2.5399763584136963 }, { "auxiliary_loss_clip": 0.01292372, "auxiliary_loss_mlp": 0.01078752, "balance_loss_clip": 1.04008055, "balance_loss_mlp": 1.08330834, "epoch": 0.021464001202465055, "flos": 30917435276160.0, "grad_norm": 2.2509422948456184, "language_loss": 0.79880279, "learning_rate": 3.995504177290519e-06, "loss": 0.82251406, "num_input_tokens_seen": 7432275, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 2.09375, "step": 357, "time_per_iteration": 2.8393023014068604 }, { "auxiliary_loss_clip": 0.01294013, "auxiliary_loss_mlp": 0.01076724, "balance_loss_clip": 1.04203379, "balance_loss_mlp": 1.08094406, "epoch": 0.021524124455133024, "flos": 18186385098240.0, "grad_norm": 3.888562010963965, "language_loss": 0.75968081, "learning_rate": 3.995478822598488e-06, "loss": 0.7833882, "num_input_tokens_seen": 7450245, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 2.125, "step": 358, "time_per_iteration": 2.6483571529388428 }, { "auxiliary_loss_clip": 0.01293212, "auxiliary_loss_mlp": 0.01082673, "balance_loss_clip": 1.04364347, "balance_loss_mlp": 1.08009887, "epoch": 0.021584247707800992, "flos": 13805786039040.0, "grad_norm": 2.0296195864815645, "language_loss": 0.8830989, "learning_rate": 3.995453396692951e-06, "loss": 0.90685773, "num_input_tokens_seen": 7466845, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 2.125, "step": 359, "time_per_iteration": 2.617278814315796 }, { "auxiliary_loss_clip": 0.01301051, "auxiliary_loss_mlp": 0.0108176, "balance_loss_clip": 1.04673576, "balance_loss_mlp": 1.08588231, "epoch": 0.02164437096046896, "flos": 23294713703040.0, "grad_norm": 2.6013515232275166, "language_loss": 0.75988942, "learning_rate": 3.995427899574816e-06, "loss": 0.78371757, "num_input_tokens_seen": 7485450, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 2.15625, "step": 360, "time_per_iteration": 2.6739697456359863 }, { "auxiliary_loss_clip": 0.01183744, "auxiliary_loss_mlp": 0.01010369, "balance_loss_clip": 1.00045109, "balance_loss_mlp": 1.07365727, "epoch": 0.02170449421313693, "flos": 68899578341760.0, "grad_norm": 0.8298259914602092, "language_loss": 0.64922184, "learning_rate": 3.99540233124499e-06, "loss": 0.67116296, "num_input_tokens_seen": 7553780, "router_z_loss_clip": 0.09912109, "router_z_loss_mlp": 1.1015625, "step": 361, "time_per_iteration": 3.323498249053955 }, { "auxiliary_loss_clip": 0.01291224, "auxiliary_loss_mlp": 0.01076081, "balance_loss_clip": 1.0394125, "balance_loss_mlp": 1.07785702, "epoch": 0.0217646174658049, "flos": 25228539146880.0, "grad_norm": 2.646120634778381, "language_loss": 0.77783626, "learning_rate": 3.995376691704389e-06, "loss": 0.80150938, "num_input_tokens_seen": 7574155, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 2.125, "step": 362, "time_per_iteration": 2.651461124420166 }, { "auxiliary_loss_clip": 0.01292054, "auxiliary_loss_mlp": 0.01081305, "balance_loss_clip": 1.04578042, "balance_loss_mlp": 1.07978308, "epoch": 0.02182474071847287, "flos": 22893124671360.0, "grad_norm": 2.7356008080307275, "language_loss": 0.92394012, "learning_rate": 3.995350980953926e-06, "loss": 0.94767374, "num_input_tokens_seen": 7592320, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 2.125, "step": 363, "time_per_iteration": 2.5744152069091797 }, { "auxiliary_loss_clip": 0.01289908, "auxiliary_loss_mlp": 0.0108498, "balance_loss_clip": 1.05000401, "balance_loss_mlp": 1.07968056, "epoch": 0.02188486397114084, "flos": 23658991482240.0, "grad_norm": 2.178745419876089, "language_loss": 0.89261591, "learning_rate": 3.99532519899452e-06, "loss": 0.91636479, "num_input_tokens_seen": 7611185, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 2.09375, "step": 364, "time_per_iteration": 2.6391937732696533 }, { "auxiliary_loss_clip": 0.01295078, "auxiliary_loss_mlp": 0.01082774, "balance_loss_clip": 1.04627216, "balance_loss_mlp": 1.08361435, "epoch": 0.021944987223808807, "flos": 21543637680000.0, "grad_norm": 1.8810277121577246, "language_loss": 0.78939515, "learning_rate": 3.99529934582709e-06, "loss": 0.81317365, "num_input_tokens_seen": 7631970, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 2.109375, "step": 365, "time_per_iteration": 2.5556628704071045 }, { "auxiliary_loss_clip": 0.0129013, "auxiliary_loss_mlp": 0.01089459, "balance_loss_clip": 1.05190873, "balance_loss_mlp": 1.08165419, "epoch": 0.022005110476476776, "flos": 16070887641600.0, "grad_norm": 1.946559091184121, "language_loss": 0.84045327, "learning_rate": 3.995273421452558e-06, "loss": 0.86424923, "num_input_tokens_seen": 7649745, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 2.078125, "step": 366, "time_per_iteration": 2.5886080265045166 }, { "auxiliary_loss_clip": 0.01293131, "auxiliary_loss_mlp": 0.01077726, "balance_loss_clip": 1.04348922, "balance_loss_mlp": 1.08104432, "epoch": 0.022065233729144748, "flos": 21433715084160.0, "grad_norm": 3.193728668868375, "language_loss": 0.86262542, "learning_rate": 3.995247425871851e-06, "loss": 0.88633406, "num_input_tokens_seen": 7668830, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 2.125, "step": 367, "time_per_iteration": 2.52200984954834 }, { "auxiliary_loss_clip": 0.01297501, "auxiliary_loss_mlp": 0.01089169, "balance_loss_clip": 1.05161834, "balance_loss_mlp": 1.08107543, "epoch": 0.022125356981812717, "flos": 21543709507200.0, "grad_norm": 2.759137797599, "language_loss": 0.84406793, "learning_rate": 3.995221359085895e-06, "loss": 0.8679347, "num_input_tokens_seen": 7687240, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 2.15625, "step": 368, "time_per_iteration": 2.5732245445251465 }, { "auxiliary_loss_clip": 0.0129628, "auxiliary_loss_mlp": 0.01071252, "balance_loss_clip": 1.0364188, "balance_loss_mlp": 1.07920671, "epoch": 0.022185480234480685, "flos": 20704153944960.0, "grad_norm": 2.5461144055746012, "language_loss": 0.75409406, "learning_rate": 3.995195221095621e-06, "loss": 0.77776939, "num_input_tokens_seen": 7704440, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 2.171875, "step": 369, "time_per_iteration": 2.5730104446411133 }, { "auxiliary_loss_clip": 0.01291554, "auxiliary_loss_mlp": 0.01086889, "balance_loss_clip": 1.05126929, "balance_loss_mlp": 1.08190656, "epoch": 0.022245603487148654, "flos": 25193203142400.0, "grad_norm": 2.0177544787728388, "language_loss": 0.81937075, "learning_rate": 3.995169011901963e-06, "loss": 0.84315521, "num_input_tokens_seen": 7727160, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 2.09375, "step": 370, "time_per_iteration": 2.6064085960388184 }, { "auxiliary_loss_clip": 0.01289892, "auxiliary_loss_mlp": 0.01088264, "balance_loss_clip": 1.05223906, "balance_loss_mlp": 1.08164668, "epoch": 0.022305726739816623, "flos": 21395936954880.0, "grad_norm": 1.7773794689340752, "language_loss": 0.81366289, "learning_rate": 3.995142731505854e-06, "loss": 0.83744442, "num_input_tokens_seen": 7747730, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 2.078125, "step": 371, "time_per_iteration": 2.653964042663574 }, { "auxiliary_loss_clip": 0.01295528, "auxiliary_loss_mlp": 0.01081447, "balance_loss_clip": 1.0444206, "balance_loss_mlp": 1.08596957, "epoch": 0.022365849992484595, "flos": 22492146170880.0, "grad_norm": 2.140174443561109, "language_loss": 0.8296895, "learning_rate": 3.995116379908234e-06, "loss": 0.85345924, "num_input_tokens_seen": 7766765, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 2.09375, "step": 372, "time_per_iteration": 2.6804873943328857 }, { "auxiliary_loss_clip": 0.01286537, "auxiliary_loss_mlp": 0.01080353, "balance_loss_clip": 1.0443759, "balance_loss_mlp": 1.08019328, "epoch": 0.022425973245152563, "flos": 17856581397120.0, "grad_norm": 4.29565891784552, "language_loss": 0.78201628, "learning_rate": 3.995089957110041e-06, "loss": 0.80568522, "num_input_tokens_seen": 7784010, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 2.0625, "step": 373, "time_per_iteration": 2.574737071990967 }, { "auxiliary_loss_clip": 0.01293905, "auxiliary_loss_mlp": 0.01078463, "balance_loss_clip": 1.04129338, "balance_loss_mlp": 1.08310819, "epoch": 0.022486096497820532, "flos": 15483029656320.0, "grad_norm": 3.31029884588786, "language_loss": 0.77394879, "learning_rate": 3.995063463112221e-06, "loss": 0.79767251, "num_input_tokens_seen": 7801305, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 2.109375, "step": 374, "time_per_iteration": 2.562690258026123 }, { "auxiliary_loss_clip": 0.01285439, "auxiliary_loss_mlp": 0.01071174, "balance_loss_clip": 1.03362274, "balance_loss_mlp": 1.07504296, "epoch": 0.0225462197504885, "flos": 27784157950080.0, "grad_norm": 1.6545148374534708, "language_loss": 0.85971594, "learning_rate": 3.995036897915717e-06, "loss": 0.88328212, "num_input_tokens_seen": 7823965, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 2.109375, "step": 375, "time_per_iteration": 2.629866361618042 }, { "auxiliary_loss_clip": 0.01292247, "auxiliary_loss_mlp": 0.01085977, "balance_loss_clip": 1.04830682, "balance_loss_mlp": 1.08432388, "epoch": 0.02260634300315647, "flos": 19975490645760.0, "grad_norm": 2.273183497073172, "language_loss": 0.8848784, "learning_rate": 3.995010261521478e-06, "loss": 0.90866059, "num_input_tokens_seen": 7842115, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 2.078125, "step": 376, "time_per_iteration": 2.558739423751831 }, { "auxiliary_loss_clip": 0.01290791, "auxiliary_loss_mlp": 0.01079633, "balance_loss_clip": 1.04546773, "balance_loss_mlp": 1.07832837, "epoch": 0.02266646625582444, "flos": 16028189349120.0, "grad_norm": 2.324379686498739, "language_loss": 0.74864209, "learning_rate": 3.9949835539304545e-06, "loss": 0.77234638, "num_input_tokens_seen": 7857830, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 2.125, "step": 377, "time_per_iteration": 2.537673234939575 }, { "auxiliary_loss_clip": 0.01284396, "auxiliary_loss_mlp": 0.0108682, "balance_loss_clip": 1.05046177, "balance_loss_mlp": 1.08218074, "epoch": 0.02272658950849241, "flos": 20404622430720.0, "grad_norm": 2.57881871739473, "language_loss": 0.99049628, "learning_rate": 3.9949567751436e-06, "loss": 1.01420856, "num_input_tokens_seen": 7875840, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 2.015625, "step": 378, "time_per_iteration": 2.578077793121338 }, { "auxiliary_loss_clip": 0.01173773, "auxiliary_loss_mlp": 0.01035227, "balance_loss_clip": 1.02492738, "balance_loss_mlp": 1.06650805, "epoch": 0.02278671276116038, "flos": 69847332647040.0, "grad_norm": 0.9587755569604567, "language_loss": 0.75480258, "learning_rate": 3.99492992516187e-06, "loss": 0.77689254, "num_input_tokens_seen": 7940190, "router_z_loss_clip": 0.10302734, "router_z_loss_mlp": 1.0703125, "step": 379, "time_per_iteration": 3.2196271419525146 }, { "auxiliary_loss_clip": 0.01293328, "auxiliary_loss_mlp": 0.01079646, "balance_loss_clip": 1.04486132, "balance_loss_mlp": 1.0779686, "epoch": 0.022846836013828347, "flos": 38508771340800.0, "grad_norm": 2.4391937443700527, "language_loss": 0.78144944, "learning_rate": 3.994903003986222e-06, "loss": 0.80517924, "num_input_tokens_seen": 7960840, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 2.15625, "step": 380, "time_per_iteration": 2.712125539779663 }, { "auxiliary_loss_clip": 0.01285343, "auxiliary_loss_mlp": 0.01082945, "balance_loss_clip": 1.04877996, "balance_loss_mlp": 1.07928276, "epoch": 0.02290695926649632, "flos": 20959478795520.0, "grad_norm": 2.1936262506762314, "language_loss": 0.96035719, "learning_rate": 3.9948760116176174e-06, "loss": 0.98404002, "num_input_tokens_seen": 7975500, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 2.0625, "step": 381, "time_per_iteration": 2.617497444152832 }, { "auxiliary_loss_clip": 0.01294152, "auxiliary_loss_mlp": 0.01081993, "balance_loss_clip": 1.04623032, "balance_loss_mlp": 1.08035088, "epoch": 0.022967082519164288, "flos": 24022407335040.0, "grad_norm": 2.496288475478205, "language_loss": 0.87266213, "learning_rate": 3.994848948057019e-06, "loss": 0.89642358, "num_input_tokens_seen": 7993880, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 2.125, "step": 382, "time_per_iteration": 2.5606253147125244 }, { "auxiliary_loss_clip": 0.01286983, "auxiliary_loss_mlp": 0.01090821, "balance_loss_clip": 1.05620313, "balance_loss_mlp": 1.08088613, "epoch": 0.023027205771832256, "flos": 20997149184000.0, "grad_norm": 1.9719606127138456, "language_loss": 0.84435439, "learning_rate": 3.994821813305394e-06, "loss": 0.86813247, "num_input_tokens_seen": 8012730, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 2.0625, "step": 383, "time_per_iteration": 2.606414556503296 }, { "auxiliary_loss_clip": 0.01284831, "auxiliary_loss_mlp": 0.01103716, "balance_loss_clip": 1.06897831, "balance_loss_mlp": 1.08062959, "epoch": 0.023087329024500225, "flos": 21360816432000.0, "grad_norm": 2.9270534013001126, "language_loss": 0.83105916, "learning_rate": 3.99479460736371e-06, "loss": 0.85494471, "num_input_tokens_seen": 8031275, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 2.046875, "step": 384, "time_per_iteration": 2.5678811073303223 }, { "auxiliary_loss_clip": 0.01285109, "auxiliary_loss_mlp": 0.01082885, "balance_loss_clip": 1.05010271, "balance_loss_mlp": 1.08344531, "epoch": 0.023147452277168194, "flos": 21872435800320.0, "grad_norm": 1.8477357088751194, "language_loss": 0.88657379, "learning_rate": 3.994767330232937e-06, "loss": 0.91025376, "num_input_tokens_seen": 8051600, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 2.015625, "step": 385, "time_per_iteration": 2.6522154808044434 }, { "auxiliary_loss_clip": 0.01288308, "auxiliary_loss_mlp": 0.01103217, "balance_loss_clip": 1.06917083, "balance_loss_mlp": 1.08126402, "epoch": 0.023207575529836166, "flos": 18916700423040.0, "grad_norm": 1.729947896642467, "language_loss": 0.69431186, "learning_rate": 3.994739981914049e-06, "loss": 0.71822709, "num_input_tokens_seen": 8070600, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 2.0625, "step": 386, "time_per_iteration": 2.540104866027832 }, { "auxiliary_loss_clip": 0.01288243, "auxiliary_loss_mlp": 0.01083113, "balance_loss_clip": 1.048805, "balance_loss_mlp": 1.08163524, "epoch": 0.023267698782504134, "flos": 25046005207680.0, "grad_norm": 2.1755209658625296, "language_loss": 0.87698954, "learning_rate": 3.994712562408022e-06, "loss": 0.90070307, "num_input_tokens_seen": 8090680, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 2.0625, "step": 387, "time_per_iteration": 2.613739490509033 }, { "auxiliary_loss_clip": 0.01288479, "auxiliary_loss_mlp": 0.01071138, "balance_loss_clip": 1.03578079, "balance_loss_mlp": 1.08082867, "epoch": 0.023327822035172103, "flos": 28879217930880.0, "grad_norm": 1.9714744218845865, "language_loss": 0.82944596, "learning_rate": 3.994685071715835e-06, "loss": 0.85304213, "num_input_tokens_seen": 8114610, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 2.0625, "step": 388, "time_per_iteration": 2.6252188682556152 }, { "auxiliary_loss_clip": 0.01286224, "auxiliary_loss_mlp": 0.01080931, "balance_loss_clip": 1.04776669, "balance_loss_mlp": 1.08020306, "epoch": 0.02338794528784007, "flos": 27121533805440.0, "grad_norm": 2.7700265618789404, "language_loss": 0.93532634, "learning_rate": 3.9946575098384686e-06, "loss": 0.95899785, "num_input_tokens_seen": 8133975, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 2.0625, "step": 389, "time_per_iteration": 2.677266836166382 }, { "auxiliary_loss_clip": 0.01279849, "auxiliary_loss_mlp": 0.0107533, "balance_loss_clip": 1.0408783, "balance_loss_mlp": 1.07992601, "epoch": 0.02344806854050804, "flos": 21322355944320.0, "grad_norm": 2.223999879110182, "language_loss": 0.87407374, "learning_rate": 3.9946298767769065e-06, "loss": 0.89762551, "num_input_tokens_seen": 8153570, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 2.0, "step": 390, "time_per_iteration": 2.550992488861084 }, { "auxiliary_loss_clip": 0.01284155, "auxiliary_loss_mlp": 0.010809, "balance_loss_clip": 1.04837942, "balance_loss_mlp": 1.0808742, "epoch": 0.023508191793176012, "flos": 24789997998720.0, "grad_norm": 2.184336350350803, "language_loss": 0.88968456, "learning_rate": 3.994602172532135e-06, "loss": 0.91333508, "num_input_tokens_seen": 8170075, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 2.03125, "step": 391, "time_per_iteration": 5.560213088989258 }, { "auxiliary_loss_clip": 0.01281157, "auxiliary_loss_mlp": 0.0106494, "balance_loss_clip": 1.03101289, "balance_loss_mlp": 1.0778501, "epoch": 0.02356831504584398, "flos": 25995375624960.0, "grad_norm": 5.405579926729326, "language_loss": 0.857669, "learning_rate": 3.994574397105143e-06, "loss": 0.88112998, "num_input_tokens_seen": 8190420, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 2.03125, "step": 392, "time_per_iteration": 4.08501124382019 }, { "auxiliary_loss_clip": 0.01277246, "auxiliary_loss_mlp": 0.01078958, "balance_loss_clip": 1.04531741, "balance_loss_mlp": 1.07725334, "epoch": 0.02362843829851195, "flos": 19062461813760.0, "grad_norm": 2.100887557354998, "language_loss": 0.88495052, "learning_rate": 3.994546550496921e-06, "loss": 0.90851259, "num_input_tokens_seen": 8208790, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 2.0, "step": 393, "time_per_iteration": 2.629486322402954 }, { "auxiliary_loss_clip": 0.01288438, "auxiliary_loss_mlp": 0.01096964, "balance_loss_clip": 1.0612489, "balance_loss_mlp": 1.08384693, "epoch": 0.023688561551179918, "flos": 16071031296000.0, "grad_norm": 2.057306790344946, "language_loss": 0.81102377, "learning_rate": 3.994518632708464e-06, "loss": 0.83487785, "num_input_tokens_seen": 8226885, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 2.046875, "step": 394, "time_per_iteration": 2.6831724643707275 }, { "auxiliary_loss_clip": 0.01279739, "auxiliary_loss_mlp": 0.01079138, "balance_loss_clip": 1.04430544, "balance_loss_mlp": 1.07684553, "epoch": 0.023748684803847887, "flos": 21724375939200.0, "grad_norm": 2.079597249632602, "language_loss": 0.85524642, "learning_rate": 3.994490643740766e-06, "loss": 0.8788352, "num_input_tokens_seen": 8246825, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 2.03125, "step": 395, "time_per_iteration": 2.6408917903900146 }, { "auxiliary_loss_clip": 0.01168479, "auxiliary_loss_mlp": 0.01036502, "balance_loss_clip": 1.02677441, "balance_loss_mlp": 1.06136823, "epoch": 0.02380880805651586, "flos": 61926192881280.0, "grad_norm": 0.9244176000033065, "language_loss": 0.63747132, "learning_rate": 3.994462583594828e-06, "loss": 0.6595211, "num_input_tokens_seen": 8302835, "router_z_loss_clip": 0.09716797, "router_z_loss_mlp": 1.0703125, "step": 396, "time_per_iteration": 2.9982683658599854 }, { "auxiliary_loss_clip": 0.01276346, "auxiliary_loss_mlp": 0.01066787, "balance_loss_clip": 1.03302741, "balance_loss_mlp": 1.07570386, "epoch": 0.023868931309183827, "flos": 20266331068800.0, "grad_norm": 3.0712396901478933, "language_loss": 0.83596432, "learning_rate": 3.994434452271651e-06, "loss": 0.85939574, "num_input_tokens_seen": 8320745, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 2.0, "step": 397, "time_per_iteration": 2.5918874740600586 }, { "auxiliary_loss_clip": 0.0128128, "auxiliary_loss_mlp": 0.01079813, "balance_loss_clip": 1.04550433, "balance_loss_mlp": 1.07919657, "epoch": 0.023929054561851796, "flos": 21139103733120.0, "grad_norm": 4.006483702217074, "language_loss": 0.84197581, "learning_rate": 3.994406249772239e-06, "loss": 0.86558676, "num_input_tokens_seen": 8339540, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 2.015625, "step": 398, "time_per_iteration": 2.6303577423095703 }, { "auxiliary_loss_clip": 0.01281887, "auxiliary_loss_mlp": 0.01074362, "balance_loss_clip": 1.03883815, "balance_loss_mlp": 1.0765779, "epoch": 0.023989177814519765, "flos": 13698521049600.0, "grad_norm": 2.5804037627050715, "language_loss": 0.85595894, "learning_rate": 3.994377976097598e-06, "loss": 0.87952149, "num_input_tokens_seen": 8354890, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 2.0625, "step": 399, "time_per_iteration": 2.595515727996826 }, { "auxiliary_loss_clip": 0.01277253, "auxiliary_loss_mlp": 0.01085256, "balance_loss_clip": 1.05149603, "balance_loss_mlp": 1.07828379, "epoch": 0.024049301067187733, "flos": 26322018929280.0, "grad_norm": 1.9752203485474735, "language_loss": 0.85840309, "learning_rate": 3.9943496312487365e-06, "loss": 0.88202822, "num_input_tokens_seen": 8375845, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.984375, "step": 400, "time_per_iteration": 2.6792919635772705 }, { "auxiliary_loss_clip": 0.01282818, "auxiliary_loss_mlp": 0.01075728, "balance_loss_clip": 1.04246843, "balance_loss_mlp": 1.08064592, "epoch": 0.024109424319855705, "flos": 24425432910720.0, "grad_norm": 1.9261789072210453, "language_loss": 0.78977346, "learning_rate": 3.994321215226667e-06, "loss": 0.81335896, "num_input_tokens_seen": 8395240, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 2.03125, "step": 401, "time_per_iteration": 2.6035869121551514 }, { "auxiliary_loss_clip": 0.01277542, "auxiliary_loss_mlp": 0.0108781, "balance_loss_clip": 1.05464578, "balance_loss_mlp": 1.07666707, "epoch": 0.024169547572523674, "flos": 29604397610880.0, "grad_norm": 2.5480592119544214, "language_loss": 0.78164035, "learning_rate": 3.994292728032404e-06, "loss": 0.80529386, "num_input_tokens_seen": 8416950, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 2.0, "step": 402, "time_per_iteration": 2.6646203994750977 }, { "auxiliary_loss_clip": 0.01280617, "auxiliary_loss_mlp": 0.0107319, "balance_loss_clip": 1.0375464, "balance_loss_mlp": 1.0752579, "epoch": 0.024229670825191642, "flos": 22601458235520.0, "grad_norm": 2.2677542254489738, "language_loss": 0.943524, "learning_rate": 3.994264169666963e-06, "loss": 0.96706206, "num_input_tokens_seen": 8433660, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 2.0625, "step": 403, "time_per_iteration": 2.597003221511841 }, { "auxiliary_loss_clip": 0.0128433, "auxiliary_loss_mlp": 0.01086119, "balance_loss_clip": 1.05061889, "balance_loss_mlp": 1.07547355, "epoch": 0.02428979407785961, "flos": 18150258994560.0, "grad_norm": 2.8392246782312833, "language_loss": 0.99742877, "learning_rate": 3.994235540131364e-06, "loss": 1.0211333, "num_input_tokens_seen": 8450180, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 2.09375, "step": 404, "time_per_iteration": 2.5909135341644287 }, { "auxiliary_loss_clip": 0.01284847, "auxiliary_loss_mlp": 0.01094304, "balance_loss_clip": 1.06009114, "balance_loss_mlp": 1.07952499, "epoch": 0.024349917330527583, "flos": 15304984917120.0, "grad_norm": 2.8772548500088204, "language_loss": 0.87323928, "learning_rate": 3.994206839426627e-06, "loss": 0.89703077, "num_input_tokens_seen": 8467775, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 2.046875, "step": 405, "time_per_iteration": 2.7902843952178955 }, { "auxiliary_loss_clip": 0.01291321, "auxiliary_loss_mlp": 0.01095776, "balance_loss_clip": 1.06072855, "balance_loss_mlp": 1.08287525, "epoch": 0.024410040583195552, "flos": 20773892200320.0, "grad_norm": 2.6796953914095307, "language_loss": 0.9286356, "learning_rate": 3.994178067553779e-06, "loss": 0.95250654, "num_input_tokens_seen": 8486765, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 2.078125, "step": 406, "time_per_iteration": 2.720015525817871 }, { "auxiliary_loss_clip": 0.01284803, "auxiliary_loss_mlp": 0.01081086, "balance_loss_clip": 1.04568088, "balance_loss_mlp": 1.08111715, "epoch": 0.02447016383586352, "flos": 21798854789760.0, "grad_norm": 2.1485692511761356, "language_loss": 0.86815321, "learning_rate": 3.994149224513846e-06, "loss": 0.89181203, "num_input_tokens_seen": 8506515, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 2.046875, "step": 407, "time_per_iteration": 2.653592348098755 }, { "auxiliary_loss_clip": 0.0128114, "auxiliary_loss_mlp": 0.01079201, "balance_loss_clip": 1.04536974, "balance_loss_mlp": 1.08100152, "epoch": 0.02453028708853149, "flos": 33948116380800.0, "grad_norm": 2.246568050956846, "language_loss": 0.73455095, "learning_rate": 3.994120310307856e-06, "loss": 0.75815439, "num_input_tokens_seen": 8528035, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 2.0, "step": 408, "time_per_iteration": 2.7537200450897217 }, { "auxiliary_loss_clip": 0.01285858, "auxiliary_loss_mlp": 0.01086844, "balance_loss_clip": 1.05179656, "balance_loss_mlp": 1.08016443, "epoch": 0.024590410341199458, "flos": 21793000872960.0, "grad_norm": 3.425147864424178, "language_loss": 0.92000484, "learning_rate": 3.994091324936841e-06, "loss": 0.94373196, "num_input_tokens_seen": 8546455, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 2.046875, "step": 409, "time_per_iteration": 2.5980618000030518 }, { "auxiliary_loss_clip": 0.01284681, "auxiliary_loss_mlp": 0.01071171, "balance_loss_clip": 1.03946114, "balance_loss_mlp": 1.08091927, "epoch": 0.02465053359386743, "flos": 35114782124160.0, "grad_norm": 2.0488499602585986, "language_loss": 0.82290703, "learning_rate": 3.994062268401836e-06, "loss": 0.84646559, "num_input_tokens_seen": 8568450, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 2.03125, "step": 410, "time_per_iteration": 2.743696689605713 }, { "auxiliary_loss_clip": 0.01285257, "auxiliary_loss_mlp": 0.01084778, "balance_loss_clip": 1.05063653, "balance_loss_mlp": 1.0801028, "epoch": 0.0247106568465354, "flos": 27451409333760.0, "grad_norm": 2.7443908889799844, "language_loss": 0.7755304, "learning_rate": 3.994033140703878e-06, "loss": 0.79923075, "num_input_tokens_seen": 8589340, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 2.046875, "step": 411, "time_per_iteration": 2.666862964630127 }, { "auxiliary_loss_clip": 0.01283892, "auxiliary_loss_mlp": 0.01090653, "balance_loss_clip": 1.05481851, "balance_loss_mlp": 1.07727277, "epoch": 0.024770780099203367, "flos": 20703794808960.0, "grad_norm": 2.206025414642123, "language_loss": 0.86392498, "learning_rate": 3.994003941844007e-06, "loss": 0.8876704, "num_input_tokens_seen": 8607150, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 2.0625, "step": 412, "time_per_iteration": 2.541682004928589 }, { "auxiliary_loss_clip": 0.0117626, "auxiliary_loss_mlp": 0.01111872, "balance_loss_clip": 1.10114348, "balance_loss_mlp": 1.06838822, "epoch": 0.024830903351871336, "flos": 69551859369600.0, "grad_norm": 0.8445259456708579, "language_loss": 0.58502519, "learning_rate": 3.993974671823265e-06, "loss": 0.60790658, "num_input_tokens_seen": 8669865, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 1.078125, "step": 413, "time_per_iteration": 3.2222797870635986 }, { "auxiliary_loss_clip": 0.01278623, "auxiliary_loss_mlp": 0.01086057, "balance_loss_clip": 1.04886413, "balance_loss_mlp": 1.07666492, "epoch": 0.024891026604539304, "flos": 32270477713920.0, "grad_norm": 1.7539553243322552, "language_loss": 0.79994309, "learning_rate": 3.9939453306426955e-06, "loss": 0.82358992, "num_input_tokens_seen": 8690235, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 2.015625, "step": 414, "time_per_iteration": 2.7271218299865723 }, { "auxiliary_loss_clip": 0.01273153, "auxiliary_loss_mlp": 0.01085032, "balance_loss_clip": 1.05155826, "balance_loss_mlp": 1.07460475, "epoch": 0.024951149857207276, "flos": 18840282238080.0, "grad_norm": 3.1312001576600816, "language_loss": 0.80140519, "learning_rate": 3.9939159183033466e-06, "loss": 0.82498705, "num_input_tokens_seen": 8706295, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 1.984375, "step": 415, "time_per_iteration": 2.566307544708252 }, { "auxiliary_loss_clip": 0.01290605, "auxiliary_loss_mlp": 0.01080173, "balance_loss_clip": 1.04548311, "balance_loss_mlp": 1.08101785, "epoch": 0.025011273109875245, "flos": 15377201210880.0, "grad_norm": 2.5047481007271277, "language_loss": 0.95811254, "learning_rate": 3.9938864348062675e-06, "loss": 0.98182034, "num_input_tokens_seen": 8724200, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 2.09375, "step": 416, "time_per_iteration": 2.6122195720672607 }, { "auxiliary_loss_clip": 0.01280054, "auxiliary_loss_mlp": 0.01075277, "balance_loss_clip": 1.03889418, "balance_loss_mlp": 1.07628143, "epoch": 0.025071396362543213, "flos": 18915515274240.0, "grad_norm": 1.7251335356118607, "language_loss": 0.77070832, "learning_rate": 3.993856880152509e-06, "loss": 0.79426157, "num_input_tokens_seen": 8744170, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 2.03125, "step": 417, "time_per_iteration": 2.663254976272583 }, { "auxiliary_loss_clip": 0.01278979, "auxiliary_loss_mlp": 0.01087789, "balance_loss_clip": 1.05297983, "balance_loss_mlp": 1.08080757, "epoch": 0.025131519615211182, "flos": 25337958952320.0, "grad_norm": 1.5841231137374339, "language_loss": 0.76667655, "learning_rate": 3.9938272543431286e-06, "loss": 0.7903443, "num_input_tokens_seen": 8765120, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.984375, "step": 418, "time_per_iteration": 2.6066932678222656 }, { "auxiliary_loss_clip": 0.01286606, "auxiliary_loss_mlp": 0.01073305, "balance_loss_clip": 1.03944933, "balance_loss_mlp": 1.0808804, "epoch": 0.02519164286787915, "flos": 18953149749120.0, "grad_norm": 3.0628752902091643, "language_loss": 0.804618, "learning_rate": 3.993797557379182e-06, "loss": 0.82821709, "num_input_tokens_seen": 8783500, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 2.0625, "step": 419, "time_per_iteration": 2.636824369430542 }, { "auxiliary_loss_clip": 0.01279641, "auxiliary_loss_mlp": 0.010822, "balance_loss_clip": 1.04555535, "balance_loss_mlp": 1.07953191, "epoch": 0.025251766120547123, "flos": 17421092904960.0, "grad_norm": 1.7823168507053047, "language_loss": 0.73286635, "learning_rate": 3.9937677892617295e-06, "loss": 0.75648475, "num_input_tokens_seen": 8801175, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 2.0, "step": 420, "time_per_iteration": 2.5576694011688232 }, { "auxiliary_loss_clip": 0.0128325, "auxiliary_loss_mlp": 0.01096339, "balance_loss_clip": 1.06212592, "balance_loss_mlp": 1.07828522, "epoch": 0.02531188937321509, "flos": 25045430590080.0, "grad_norm": 1.8897098589818027, "language_loss": 0.78504872, "learning_rate": 3.993737949991833e-06, "loss": 0.80884469, "num_input_tokens_seen": 8820215, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 2.046875, "step": 421, "time_per_iteration": 2.6502654552459717 }, { "auxiliary_loss_clip": 0.01280754, "auxiliary_loss_mlp": 0.01088084, "balance_loss_clip": 1.05153394, "balance_loss_mlp": 1.07970524, "epoch": 0.02537201262588306, "flos": 30592228515840.0, "grad_norm": 2.0628122081415414, "language_loss": 0.81413257, "learning_rate": 3.993708039570557e-06, "loss": 0.83782095, "num_input_tokens_seen": 8839660, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 2.015625, "step": 422, "time_per_iteration": 2.635955810546875 }, { "auxiliary_loss_clip": 0.01284147, "auxiliary_loss_mlp": 0.01080745, "balance_loss_clip": 1.04719996, "balance_loss_mlp": 1.07778227, "epoch": 0.02543213587855103, "flos": 26065365275520.0, "grad_norm": 4.096448640971264, "language_loss": 0.83488017, "learning_rate": 3.99367805799897e-06, "loss": 0.85852909, "num_input_tokens_seen": 8859280, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 2.0625, "step": 423, "time_per_iteration": 2.6315553188323975 }, { "auxiliary_loss_clip": 0.01277787, "auxiliary_loss_mlp": 0.01075402, "balance_loss_clip": 1.04128397, "balance_loss_mlp": 1.07530379, "epoch": 0.025492259131218997, "flos": 36022818965760.0, "grad_norm": 2.6448719965576255, "language_loss": 0.73950183, "learning_rate": 3.993648005278142e-06, "loss": 0.76303375, "num_input_tokens_seen": 8880560, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 2.03125, "step": 424, "time_per_iteration": 2.734670400619507 }, { "auxiliary_loss_clip": 0.01285152, "auxiliary_loss_mlp": 0.01091406, "balance_loss_clip": 1.05814648, "balance_loss_mlp": 1.08063579, "epoch": 0.02555238238388697, "flos": 18588045957120.0, "grad_norm": 2.64515646473531, "language_loss": 0.82752228, "learning_rate": 3.993617881409143e-06, "loss": 0.85128784, "num_input_tokens_seen": 8899155, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 2.03125, "step": 425, "time_per_iteration": 2.603703260421753 }, { "auxiliary_loss_clip": 0.01282022, "auxiliary_loss_mlp": 0.01087615, "balance_loss_clip": 1.05101764, "balance_loss_mlp": 1.07325208, "epoch": 0.025612505636554938, "flos": 24243186280320.0, "grad_norm": 2.2109826592411745, "language_loss": 0.85185319, "learning_rate": 3.993587686393052e-06, "loss": 0.87554961, "num_input_tokens_seen": 8917890, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 2.09375, "step": 426, "time_per_iteration": 2.568300724029541 }, { "auxiliary_loss_clip": 0.01171489, "auxiliary_loss_mlp": 0.01044309, "balance_loss_clip": 1.0358212, "balance_loss_mlp": 1.06651425, "epoch": 0.025672628889222907, "flos": 60586941265920.0, "grad_norm": 0.8832851603992004, "language_loss": 0.57187533, "learning_rate": 3.993557420230944e-06, "loss": 0.59403336, "num_input_tokens_seen": 8978260, "router_z_loss_clip": 0.08496094, "router_z_loss_mlp": 1.046875, "step": 427, "time_per_iteration": 3.2203924655914307 }, { "auxiliary_loss_clip": 0.01282071, "auxiliary_loss_mlp": 0.01088975, "balance_loss_clip": 1.05490541, "balance_loss_mlp": 1.08165419, "epoch": 0.025732752141890875, "flos": 19573255169280.0, "grad_norm": 2.4471870465307566, "language_loss": 0.8836087, "learning_rate": 3.9935270829239e-06, "loss": 0.90731919, "num_input_tokens_seen": 8994460, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 2.0, "step": 428, "time_per_iteration": 2.6035211086273193 }, { "auxiliary_loss_clip": 0.01273996, "auxiliary_loss_mlp": 0.01071249, "balance_loss_clip": 1.03455639, "balance_loss_mlp": 1.07508755, "epoch": 0.025792875394558847, "flos": 31284262920960.0, "grad_norm": 1.5784199705891946, "language_loss": 0.8537693, "learning_rate": 3.993496674473002e-06, "loss": 0.8772217, "num_input_tokens_seen": 9016670, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 1.984375, "step": 429, "time_per_iteration": 2.625396966934204 }, { "auxiliary_loss_clip": 0.01279389, "auxiliary_loss_mlp": 0.01081011, "balance_loss_clip": 1.04453301, "balance_loss_mlp": 1.07446671, "epoch": 0.025852998647226816, "flos": 32379610210560.0, "grad_norm": 2.3646710162781037, "language_loss": 0.88028872, "learning_rate": 3.993466194879335e-06, "loss": 0.9038927, "num_input_tokens_seen": 9039720, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 2.046875, "step": 430, "time_per_iteration": 2.7203330993652344 }, { "auxiliary_loss_clip": 0.01281684, "auxiliary_loss_mlp": 0.01078864, "balance_loss_clip": 1.04598665, "balance_loss_mlp": 1.08209848, "epoch": 0.025913121899894784, "flos": 20193288762240.0, "grad_norm": 2.111769122836529, "language_loss": 0.83727193, "learning_rate": 3.993435644143989e-06, "loss": 0.86087739, "num_input_tokens_seen": 9059850, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.9921875, "step": 431, "time_per_iteration": 2.596349000930786 }, { "auxiliary_loss_clip": 0.01277484, "auxiliary_loss_mlp": 0.01070851, "balance_loss_clip": 1.03909338, "balance_loss_mlp": 1.07745016, "epoch": 0.025973245152562753, "flos": 14720430983040.0, "grad_norm": 2.975061446394602, "language_loss": 0.86085308, "learning_rate": 3.993405022268051e-06, "loss": 0.88433647, "num_input_tokens_seen": 9077590, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 2.0, "step": 432, "time_per_iteration": 2.5673394203186035 }, { "auxiliary_loss_clip": 0.01281329, "auxiliary_loss_mlp": 0.01072193, "balance_loss_clip": 1.04041219, "balance_loss_mlp": 1.07981646, "epoch": 0.02603336840523072, "flos": 30992991534720.0, "grad_norm": 2.0399829014424586, "language_loss": 0.75886059, "learning_rate": 3.993374329252616e-06, "loss": 0.78239578, "num_input_tokens_seen": 9099880, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 2.015625, "step": 433, "time_per_iteration": 5.5416553020477295 }, { "auxiliary_loss_clip": 0.0128029, "auxiliary_loss_mlp": 0.01091101, "balance_loss_clip": 1.05431294, "balance_loss_mlp": 1.07636881, "epoch": 0.026093491657898694, "flos": 17674262939520.0, "grad_norm": 1.972423242596645, "language_loss": 0.89519501, "learning_rate": 3.993343565098778e-06, "loss": 0.91890895, "num_input_tokens_seen": 9118620, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 2.03125, "step": 434, "time_per_iteration": 4.406802177429199 }, { "auxiliary_loss_clip": 0.01280109, "auxiliary_loss_mlp": 0.01096882, "balance_loss_clip": 1.06040406, "balance_loss_mlp": 1.08111835, "epoch": 0.026153614910566662, "flos": 17857874286720.0, "grad_norm": 2.4292432297615116, "language_loss": 0.79631919, "learning_rate": 3.993312729807637e-06, "loss": 0.8200891, "num_input_tokens_seen": 9135655, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 1.984375, "step": 435, "time_per_iteration": 2.5871171951293945 }, { "auxiliary_loss_clip": 0.01277481, "auxiliary_loss_mlp": 0.01089436, "balance_loss_clip": 1.05436432, "balance_loss_mlp": 1.07568002, "epoch": 0.02621373816323463, "flos": 20011113959040.0, "grad_norm": 2.515235854651368, "language_loss": 0.86018771, "learning_rate": 3.993281823380292e-06, "loss": 0.88385689, "num_input_tokens_seen": 9153520, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 2.015625, "step": 436, "time_per_iteration": 2.7078778743743896 }, { "auxiliary_loss_clip": 0.01280776, "auxiliary_loss_mlp": 0.01089562, "balance_loss_clip": 1.05515778, "balance_loss_mlp": 1.07955551, "epoch": 0.0262738614159026, "flos": 19281193683840.0, "grad_norm": 2.3254899191753613, "language_loss": 0.74498451, "learning_rate": 3.993250845817845e-06, "loss": 0.76868784, "num_input_tokens_seen": 9170750, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 2.015625, "step": 437, "time_per_iteration": 2.716935634613037 }, { "auxiliary_loss_clip": 0.01280762, "auxiliary_loss_mlp": 0.01095046, "balance_loss_clip": 1.05935454, "balance_loss_mlp": 1.08175683, "epoch": 0.026333984668570568, "flos": 18807208790400.0, "grad_norm": 2.22111420205254, "language_loss": 0.90982068, "learning_rate": 3.9932197971214026e-06, "loss": 0.93357885, "num_input_tokens_seen": 9188430, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 1.9921875, "step": 438, "time_per_iteration": 2.6897263526916504 }, { "auxiliary_loss_clip": 0.01157291, "auxiliary_loss_mlp": 0.01023622, "balance_loss_clip": 1.01556325, "balance_loss_mlp": 1.05732346, "epoch": 0.02639410792123854, "flos": 64572020691840.0, "grad_norm": 0.8567819465196971, "language_loss": 0.62662393, "learning_rate": 3.9931886772920735e-06, "loss": 0.64843303, "num_input_tokens_seen": 9255835, "router_z_loss_clip": 0.08056641, "router_z_loss_mlp": 1.0, "step": 439, "time_per_iteration": 3.3072004318237305 }, { "auxiliary_loss_clip": 0.01286145, "auxiliary_loss_mlp": 0.01094467, "balance_loss_clip": 1.05860913, "balance_loss_mlp": 1.08336294, "epoch": 0.02645423117390651, "flos": 28473462921600.0, "grad_norm": 1.7598333211055652, "language_loss": 0.75457722, "learning_rate": 3.993157486330967e-06, "loss": 0.77838337, "num_input_tokens_seen": 9276835, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 2.03125, "step": 440, "time_per_iteration": 2.722867488861084 }, { "auxiliary_loss_clip": 0.01278574, "auxiliary_loss_mlp": 0.01075859, "balance_loss_clip": 1.04150319, "balance_loss_mlp": 1.07969713, "epoch": 0.026514354426574478, "flos": 18551237495040.0, "grad_norm": 1.864536183682119, "language_loss": 0.82896566, "learning_rate": 3.993126224239198e-06, "loss": 0.85251004, "num_input_tokens_seen": 9295075, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.984375, "step": 441, "time_per_iteration": 2.6590723991394043 }, { "auxiliary_loss_clip": 0.01278625, "auxiliary_loss_mlp": 0.01081364, "balance_loss_clip": 1.04552937, "balance_loss_mlp": 1.07610011, "epoch": 0.026574477679242446, "flos": 20667812359680.0, "grad_norm": 1.8223481819854233, "language_loss": 0.78716129, "learning_rate": 3.99309489101788e-06, "loss": 0.81076121, "num_input_tokens_seen": 9314205, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 2.03125, "step": 442, "time_per_iteration": 2.5804991722106934 }, { "auxiliary_loss_clip": 0.01152102, "auxiliary_loss_mlp": 0.01011852, "balance_loss_clip": 1.00360286, "balance_loss_mlp": 1.05325341, "epoch": 0.026634600931910415, "flos": 57956125340160.0, "grad_norm": 0.9513942629367617, "language_loss": 0.64487636, "learning_rate": 3.993063486668132e-06, "loss": 0.66651595, "num_input_tokens_seen": 9367395, "router_z_loss_clip": 0.08251953, "router_z_loss_mlp": 0.98828125, "step": 443, "time_per_iteration": 3.0153071880340576 }, { "auxiliary_loss_clip": 0.01278968, "auxiliary_loss_mlp": 0.01083243, "balance_loss_clip": 1.0490061, "balance_loss_mlp": 1.08031225, "epoch": 0.026694724184578387, "flos": 15815131827840.0, "grad_norm": 1.9710104187976332, "language_loss": 0.82039022, "learning_rate": 3.993032011191076e-06, "loss": 0.84401232, "num_input_tokens_seen": 9385185, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.984375, "step": 444, "time_per_iteration": 2.51112961769104 }, { "auxiliary_loss_clip": 0.01281792, "auxiliary_loss_mlp": 0.01077403, "balance_loss_clip": 1.04018557, "balance_loss_mlp": 1.07873487, "epoch": 0.026754847437246355, "flos": 23440259612160.0, "grad_norm": 2.2540893716676536, "language_loss": 0.95432687, "learning_rate": 3.993000464587833e-06, "loss": 0.97791886, "num_input_tokens_seen": 9403225, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 2.03125, "step": 445, "time_per_iteration": 2.633692979812622 }, { "auxiliary_loss_clip": 0.01279929, "auxiliary_loss_mlp": 0.01091752, "balance_loss_clip": 1.05474937, "balance_loss_mlp": 1.08045864, "epoch": 0.026814970689914324, "flos": 17341801632000.0, "grad_norm": 2.273156556325993, "language_loss": 0.91057557, "learning_rate": 3.9929688468595305e-06, "loss": 0.93429238, "num_input_tokens_seen": 9420540, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 1.9921875, "step": 446, "time_per_iteration": 2.5008766651153564 }, { "auxiliary_loss_clip": 0.01275881, "auxiliary_loss_mlp": 0.01078724, "balance_loss_clip": 1.0439868, "balance_loss_mlp": 1.07886243, "epoch": 0.026875093942582293, "flos": 17894718662400.0, "grad_norm": 2.3280938574362695, "language_loss": 0.79368699, "learning_rate": 3.992937158007296e-06, "loss": 0.81723309, "num_input_tokens_seen": 9438840, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.96875, "step": 447, "time_per_iteration": 2.5728917121887207 }, { "auxiliary_loss_clip": 0.01274308, "auxiliary_loss_mlp": 0.01069677, "balance_loss_clip": 1.03765738, "balance_loss_mlp": 1.07704556, "epoch": 0.026935217195250265, "flos": 21723980889600.0, "grad_norm": 2.1250152232365767, "language_loss": 0.85796428, "learning_rate": 3.992905398032262e-06, "loss": 0.88140416, "num_input_tokens_seen": 9457215, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.96875, "step": 448, "time_per_iteration": 2.5378434658050537 }, { "auxiliary_loss_clip": 0.01269069, "auxiliary_loss_mlp": 0.0108348, "balance_loss_clip": 1.05177081, "balance_loss_mlp": 1.07572222, "epoch": 0.026995340447918233, "flos": 23622685810560.0, "grad_norm": 1.8496275306235823, "language_loss": 0.88582039, "learning_rate": 3.992873566935559e-06, "loss": 0.90934587, "num_input_tokens_seen": 9475615, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.9375, "step": 449, "time_per_iteration": 2.631025552749634 }, { "auxiliary_loss_clip": 0.01279697, "auxiliary_loss_mlp": 0.0108389, "balance_loss_clip": 1.04912889, "balance_loss_mlp": 1.0778296, "epoch": 0.027055463700586202, "flos": 17931275729280.0, "grad_norm": 2.1570382318307573, "language_loss": 0.80135626, "learning_rate": 3.992841664718326e-06, "loss": 0.82499218, "num_input_tokens_seen": 9493975, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 2.015625, "step": 450, "time_per_iteration": 2.7755703926086426 }, { "auxiliary_loss_clip": 0.01272677, "auxiliary_loss_mlp": 0.01073366, "balance_loss_clip": 1.03953493, "balance_loss_mlp": 1.07983613, "epoch": 0.02711558695325417, "flos": 25118903859840.0, "grad_norm": 1.6026612480382807, "language_loss": 0.81309509, "learning_rate": 3.9928096913817e-06, "loss": 0.83655554, "num_input_tokens_seen": 9514810, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.921875, "step": 451, "time_per_iteration": 2.6444599628448486 }, { "auxiliary_loss_clip": 0.01270985, "auxiliary_loss_mlp": 0.01085979, "balance_loss_clip": 1.05047882, "balance_loss_mlp": 1.07771266, "epoch": 0.02717571020592214, "flos": 24239559006720.0, "grad_norm": 1.604512792289794, "language_loss": 0.7664398, "learning_rate": 3.992777646926822e-06, "loss": 0.79000944, "num_input_tokens_seen": 9533635, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 1.9375, "step": 452, "time_per_iteration": 2.663912057876587 }, { "auxiliary_loss_clip": 0.01273396, "auxiliary_loss_mlp": 0.0107885, "balance_loss_clip": 1.04466057, "balance_loss_mlp": 1.07803655, "epoch": 0.02723583345859011, "flos": 25118939773440.0, "grad_norm": 1.7046747032750555, "language_loss": 0.72274292, "learning_rate": 3.992745531354836e-06, "loss": 0.74626541, "num_input_tokens_seen": 9555420, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.953125, "step": 453, "time_per_iteration": 2.6327056884765625 }, { "auxiliary_loss_clip": 0.01268221, "auxiliary_loss_mlp": 0.01074468, "balance_loss_clip": 1.0426625, "balance_loss_mlp": 1.07512164, "epoch": 0.02729595671125808, "flos": 42741597847680.0, "grad_norm": 3.4028905498879443, "language_loss": 0.81877625, "learning_rate": 3.992713344666888e-06, "loss": 0.84220314, "num_input_tokens_seen": 9578950, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.9296875, "step": 454, "time_per_iteration": 2.7495479583740234 }, { "auxiliary_loss_clip": 0.01274678, "auxiliary_loss_mlp": 0.01071762, "balance_loss_clip": 1.0406003, "balance_loss_mlp": 1.07922304, "epoch": 0.02735607996392605, "flos": 21430985650560.0, "grad_norm": 2.0068462479376374, "language_loss": 0.75469863, "learning_rate": 3.992681086864125e-06, "loss": 0.77816296, "num_input_tokens_seen": 9598160, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.953125, "step": 455, "time_per_iteration": 2.614346981048584 }, { "auxiliary_loss_clip": 0.01271482, "auxiliary_loss_mlp": 0.01083132, "balance_loss_clip": 1.05130327, "balance_loss_mlp": 1.07512176, "epoch": 0.027416203216594017, "flos": 20851280052480.0, "grad_norm": 1.9772220739048663, "language_loss": 0.80345118, "learning_rate": 3.992648757947702e-06, "loss": 0.82699728, "num_input_tokens_seen": 9616010, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.9609375, "step": 456, "time_per_iteration": 2.644509792327881 }, { "auxiliary_loss_clip": 0.01266484, "auxiliary_loss_mlp": 0.0107075, "balance_loss_clip": 1.03772902, "balance_loss_mlp": 1.0746969, "epoch": 0.027476326469261986, "flos": 13224500242560.0, "grad_norm": 2.376885285566527, "language_loss": 0.81475222, "learning_rate": 3.99261635791877e-06, "loss": 0.83812463, "num_input_tokens_seen": 9634000, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 1.9140625, "step": 457, "time_per_iteration": 2.4993345737457275 }, { "auxiliary_loss_clip": 0.01271966, "auxiliary_loss_mlp": 0.01074884, "balance_loss_clip": 1.04205358, "balance_loss_mlp": 1.07595301, "epoch": 0.027536449721929958, "flos": 24024526237440.0, "grad_norm": 2.0394272134825537, "language_loss": 0.9364748, "learning_rate": 3.992583886778485e-06, "loss": 0.95994323, "num_input_tokens_seen": 9653455, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.9609375, "step": 458, "time_per_iteration": 2.6028313636779785 }, { "auxiliary_loss_clip": 0.0126975, "auxiliary_loss_mlp": 0.01089049, "balance_loss_clip": 1.05714858, "balance_loss_mlp": 1.07462251, "epoch": 0.027596572974597926, "flos": 13006055681280.0, "grad_norm": 2.4614514663687386, "language_loss": 0.78442138, "learning_rate": 3.9925513445280075e-06, "loss": 0.80800939, "num_input_tokens_seen": 9669650, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.953125, "step": 459, "time_per_iteration": 2.506410837173462 }, { "auxiliary_loss_clip": 0.01273607, "auxiliary_loss_mlp": 0.01076182, "balance_loss_clip": 1.04049122, "balance_loss_mlp": 1.08039427, "epoch": 0.027656696227265895, "flos": 26143076350080.0, "grad_norm": 2.0354278570975466, "language_loss": 0.83240676, "learning_rate": 3.9925187311684975e-06, "loss": 0.85590464, "num_input_tokens_seen": 9691415, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 1.9296875, "step": 460, "time_per_iteration": 2.674160957336426 }, { "auxiliary_loss_clip": 0.01144608, "auxiliary_loss_mlp": 0.01030666, "balance_loss_clip": 1.02251232, "balance_loss_mlp": 1.04971623, "epoch": 0.027716819479933864, "flos": 60697222997760.0, "grad_norm": 1.5966930804987634, "language_loss": 0.73595583, "learning_rate": 3.9924860467011195e-06, "loss": 0.75770855, "num_input_tokens_seen": 9755605, "router_z_loss_clip": 0.08154297, "router_z_loss_mlp": 0.94921875, "step": 461, "time_per_iteration": 3.127223253250122 }, { "auxiliary_loss_clip": 0.01269013, "auxiliary_loss_mlp": 0.01076621, "balance_loss_clip": 1.04631841, "balance_loss_mlp": 1.08126545, "epoch": 0.027776942732601832, "flos": 31211938886400.0, "grad_norm": 2.4766798975392943, "language_loss": 0.80657166, "learning_rate": 3.99245329112704e-06, "loss": 0.83002794, "num_input_tokens_seen": 9776270, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.875, "step": 462, "time_per_iteration": 2.6850132942199707 }, { "auxiliary_loss_clip": 0.01268117, "auxiliary_loss_mlp": 0.01068037, "balance_loss_clip": 1.03830647, "balance_loss_mlp": 1.07724881, "epoch": 0.027837065985269804, "flos": 22674644196480.0, "grad_norm": 2.243413274003846, "language_loss": 0.89966971, "learning_rate": 3.992420464447427e-06, "loss": 0.92303121, "num_input_tokens_seen": 9794465, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.90625, "step": 463, "time_per_iteration": 2.5672359466552734 }, { "auxiliary_loss_clip": 0.01140688, "auxiliary_loss_mlp": 0.01008813, "balance_loss_clip": 1.00099301, "balance_loss_mlp": 1.04523492, "epoch": 0.027897189237937773, "flos": 62182487399040.0, "grad_norm": 0.8924790264603296, "language_loss": 0.59079617, "learning_rate": 3.992387566663454e-06, "loss": 0.61229122, "num_input_tokens_seen": 9849685, "router_z_loss_clip": 0.078125, "router_z_loss_mlp": 0.953125, "step": 464, "time_per_iteration": 3.1203877925872803 }, { "auxiliary_loss_clip": 0.01280497, "auxiliary_loss_mlp": 0.01089763, "balance_loss_clip": 1.05652702, "balance_loss_mlp": 1.08103228, "epoch": 0.02795731249060574, "flos": 24493160004480.0, "grad_norm": 2.440293911714608, "language_loss": 0.80808675, "learning_rate": 3.992354597776293e-06, "loss": 0.83178937, "num_input_tokens_seen": 9869505, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 1.984375, "step": 465, "time_per_iteration": 2.595494031906128 }, { "auxiliary_loss_clip": 0.01265651, "auxiliary_loss_mlp": 0.01077668, "balance_loss_clip": 1.0456965, "balance_loss_mlp": 1.0746603, "epoch": 0.02801743574327371, "flos": 23733003456000.0, "grad_norm": 1.9101847827120362, "language_loss": 0.78115946, "learning_rate": 3.992321557787121e-06, "loss": 0.80459267, "num_input_tokens_seen": 9890950, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.90625, "step": 466, "time_per_iteration": 2.6586616039276123 }, { "auxiliary_loss_clip": 0.01265126, "auxiliary_loss_mlp": 0.01077727, "balance_loss_clip": 1.0462079, "balance_loss_mlp": 1.07613409, "epoch": 0.02807755899594168, "flos": 20629100476800.0, "grad_norm": 1.9736928336436528, "language_loss": 0.87262666, "learning_rate": 3.992288446697118e-06, "loss": 0.89605528, "num_input_tokens_seen": 9911265, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.890625, "step": 467, "time_per_iteration": 2.5953357219696045 }, { "auxiliary_loss_clip": 0.01271977, "auxiliary_loss_mlp": 0.01093467, "balance_loss_clip": 1.0599457, "balance_loss_mlp": 1.07769907, "epoch": 0.02813768224860965, "flos": 19244564789760.0, "grad_norm": 2.221794726752906, "language_loss": 0.85536504, "learning_rate": 3.9922552645074644e-06, "loss": 0.8790195, "num_input_tokens_seen": 9929025, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 1.9453125, "step": 468, "time_per_iteration": 2.584991931915283 }, { "auxiliary_loss_clip": 0.01267305, "auxiliary_loss_mlp": 0.01074262, "balance_loss_clip": 1.04126525, "balance_loss_mlp": 1.07496929, "epoch": 0.02819780550127762, "flos": 20813968800000.0, "grad_norm": 2.142733164303962, "language_loss": 0.91412187, "learning_rate": 3.992222011219346e-06, "loss": 0.93753755, "num_input_tokens_seen": 9945190, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 1.921875, "step": 469, "time_per_iteration": 2.5608303546905518 }, { "auxiliary_loss_clip": 0.01267615, "auxiliary_loss_mlp": 0.0108333, "balance_loss_clip": 1.05047548, "balance_loss_mlp": 1.07667112, "epoch": 0.028257928753945588, "flos": 19974125928960.0, "grad_norm": 2.5164534207381823, "language_loss": 0.81026566, "learning_rate": 3.992188686833948e-06, "loss": 0.8337751, "num_input_tokens_seen": 9962820, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.90625, "step": 470, "time_per_iteration": 2.5482566356658936 }, { "auxiliary_loss_clip": 0.01266841, "auxiliary_loss_mlp": 0.01080552, "balance_loss_clip": 1.04643404, "balance_loss_mlp": 1.07466173, "epoch": 0.028318052006613557, "flos": 20484488321280.0, "grad_norm": 1.8602389342909746, "language_loss": 0.92670971, "learning_rate": 3.992155291352461e-06, "loss": 0.95018363, "num_input_tokens_seen": 9982595, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 1.921875, "step": 471, "time_per_iteration": 2.5993399620056152 }, { "auxiliary_loss_clip": 0.01263688, "auxiliary_loss_mlp": 0.01085326, "balance_loss_clip": 1.05263877, "balance_loss_mlp": 1.07400143, "epoch": 0.02837817525928153, "flos": 28514832410880.0, "grad_norm": 2.393270008473059, "language_loss": 0.75672716, "learning_rate": 3.992121824776075e-06, "loss": 0.78021729, "num_input_tokens_seen": 10004645, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 1.8984375, "step": 472, "time_per_iteration": 2.642068862915039 }, { "auxiliary_loss_clip": 0.01269997, "auxiliary_loss_mlp": 0.01069264, "balance_loss_clip": 1.03822243, "balance_loss_mlp": 1.0737195, "epoch": 0.028438298511949497, "flos": 18551668458240.0, "grad_norm": 1.8311437034388416, "language_loss": 0.93315881, "learning_rate": 3.9920882871059865e-06, "loss": 0.95655143, "num_input_tokens_seen": 10022555, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.9609375, "step": 473, "time_per_iteration": 2.6169991493225098 }, { "auxiliary_loss_clip": 0.01275919, "auxiliary_loss_mlp": 0.01102864, "balance_loss_clip": 1.07160783, "balance_loss_mlp": 1.07881141, "epoch": 0.028498421764617466, "flos": 16910227722240.0, "grad_norm": 2.9185063864910066, "language_loss": 0.88351882, "learning_rate": 3.992054678343391e-06, "loss": 0.90730667, "num_input_tokens_seen": 10041025, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.96875, "step": 474, "time_per_iteration": 4.118648290634155 }, { "auxiliary_loss_clip": 0.01268394, "auxiliary_loss_mlp": 0.01081049, "balance_loss_clip": 1.04862356, "balance_loss_mlp": 1.07688713, "epoch": 0.028558545017285435, "flos": 27778699082880.0, "grad_norm": 2.029214895929726, "language_loss": 0.78587425, "learning_rate": 3.992020998489488e-06, "loss": 0.80936861, "num_input_tokens_seen": 10060775, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.9140625, "step": 475, "time_per_iteration": 5.534535884857178 }, { "auxiliary_loss_clip": 0.01141006, "auxiliary_loss_mlp": 0.0103692, "balance_loss_clip": 1.02909994, "balance_loss_mlp": 1.04544473, "epoch": 0.028618668269953403, "flos": 65654367258240.0, "grad_norm": 0.9178364990929183, "language_loss": 0.66914415, "learning_rate": 3.991987247545479e-06, "loss": 0.69092333, "num_input_tokens_seen": 10120225, "router_z_loss_clip": 0.078125, "router_z_loss_mlp": 0.95703125, "step": 476, "time_per_iteration": 3.110283613204956 }, { "auxiliary_loss_clip": 0.01270241, "auxiliary_loss_mlp": 0.01076841, "balance_loss_clip": 1.04408193, "balance_loss_mlp": 1.07833278, "epoch": 0.028678791522621375, "flos": 21937074324480.0, "grad_norm": 2.1874753742211483, "language_loss": 0.83778822, "learning_rate": 3.99195342551257e-06, "loss": 0.86125898, "num_input_tokens_seen": 10137880, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.921875, "step": 477, "time_per_iteration": 2.62290358543396 }, { "auxiliary_loss_clip": 0.01274584, "auxiliary_loss_mlp": 0.01082279, "balance_loss_clip": 1.04925823, "balance_loss_mlp": 1.08062041, "epoch": 0.028738914775289344, "flos": 24572128055040.0, "grad_norm": 2.378293029008002, "language_loss": 0.81691694, "learning_rate": 3.991919532391967e-06, "loss": 0.84048557, "num_input_tokens_seen": 10156930, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 1.9375, "step": 478, "time_per_iteration": 2.571470022201538 }, { "auxiliary_loss_clip": 0.01268403, "auxiliary_loss_mlp": 0.01075114, "balance_loss_clip": 1.0428561, "balance_loss_mlp": 1.07714844, "epoch": 0.028799038027957313, "flos": 23257977068160.0, "grad_norm": 2.1858077934771507, "language_loss": 0.80697882, "learning_rate": 3.991885568184879e-06, "loss": 0.830414, "num_input_tokens_seen": 10176295, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 1.90625, "step": 479, "time_per_iteration": 2.623443603515625 }, { "auxiliary_loss_clip": 0.01264929, "auxiliary_loss_mlp": 0.01076539, "balance_loss_clip": 1.0418967, "balance_loss_mlp": 1.0753063, "epoch": 0.02885916128062528, "flos": 22164102236160.0, "grad_norm": 4.143560826112622, "language_loss": 0.73364878, "learning_rate": 3.991851532892521e-06, "loss": 0.75706339, "num_input_tokens_seen": 10195790, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.8984375, "step": 480, "time_per_iteration": 2.641725540161133 }, { "auxiliary_loss_clip": 0.012631, "auxiliary_loss_mlp": 0.01067676, "balance_loss_clip": 1.03779054, "balance_loss_mlp": 1.07639325, "epoch": 0.02891928453329325, "flos": 22932842135040.0, "grad_norm": 1.8465113425149888, "language_loss": 0.87424988, "learning_rate": 3.991817426516103e-06, "loss": 0.89755762, "num_input_tokens_seen": 10218405, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.8671875, "step": 481, "time_per_iteration": 2.662597417831421 }, { "auxiliary_loss_clip": 0.01136984, "auxiliary_loss_mlp": 0.01014866, "balance_loss_clip": 1.00771379, "balance_loss_mlp": 1.044505, "epoch": 0.028979407785961222, "flos": 57432941792640.0, "grad_norm": 0.8226979034209231, "language_loss": 0.66096973, "learning_rate": 3.991783249056846e-06, "loss": 0.6824882, "num_input_tokens_seen": 10271005, "router_z_loss_clip": 0.07128906, "router_z_loss_mlp": 0.92578125, "step": 482, "time_per_iteration": 2.951549530029297 }, { "auxiliary_loss_clip": 0.01279027, "auxiliary_loss_mlp": 0.01074533, "balance_loss_clip": 1.04442024, "balance_loss_mlp": 1.08026564, "epoch": 0.02903953103862919, "flos": 16722737706240.0, "grad_norm": 2.6115039246413327, "language_loss": 0.78260314, "learning_rate": 3.991749000515968e-06, "loss": 0.80613875, "num_input_tokens_seen": 10288405, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.984375, "step": 483, "time_per_iteration": 2.6460282802581787 }, { "auxiliary_loss_clip": 0.01272455, "auxiliary_loss_mlp": 0.01086674, "balance_loss_clip": 1.05474973, "balance_loss_mlp": 1.07999969, "epoch": 0.02909965429129716, "flos": 16763640318720.0, "grad_norm": 2.3677345147857225, "language_loss": 0.75258029, "learning_rate": 3.991714680894691e-06, "loss": 0.77617157, "num_input_tokens_seen": 10306875, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.921875, "step": 484, "time_per_iteration": 2.610383987426758 }, { "auxiliary_loss_clip": 0.01269182, "auxiliary_loss_mlp": 0.01073, "balance_loss_clip": 1.04241109, "balance_loss_mlp": 1.0772121, "epoch": 0.029159777543965128, "flos": 19785343023360.0, "grad_norm": 2.1899666353050966, "language_loss": 0.83394492, "learning_rate": 3.991680290194241e-06, "loss": 0.85736674, "num_input_tokens_seen": 10323965, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.921875, "step": 485, "time_per_iteration": 2.5718681812286377 }, { "auxiliary_loss_clip": 0.01270511, "auxiliary_loss_mlp": 0.01068784, "balance_loss_clip": 1.03790927, "balance_loss_mlp": 1.08111501, "epoch": 0.029219900796633096, "flos": 19642670202240.0, "grad_norm": 2.121772048389168, "language_loss": 0.83935213, "learning_rate": 3.991645828415844e-06, "loss": 0.86274505, "num_input_tokens_seen": 10342620, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.890625, "step": 486, "time_per_iteration": 2.538158655166626 }, { "auxiliary_loss_clip": 0.01266502, "auxiliary_loss_mlp": 0.01078035, "balance_loss_clip": 1.04577708, "balance_loss_mlp": 1.07490158, "epoch": 0.02928002404930107, "flos": 25885704424320.0, "grad_norm": 1.7986929339974556, "language_loss": 0.88291824, "learning_rate": 3.991611295560732e-06, "loss": 0.90636361, "num_input_tokens_seen": 10364610, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 1.9140625, "step": 487, "time_per_iteration": 2.5889124870300293 }, { "auxiliary_loss_clip": 0.01270928, "auxiliary_loss_mlp": 0.01063187, "balance_loss_clip": 1.0338974, "balance_loss_mlp": 1.07929063, "epoch": 0.029340147301969037, "flos": 20660234590080.0, "grad_norm": 2.418813182054693, "language_loss": 0.88252211, "learning_rate": 3.991576691630134e-06, "loss": 0.90586329, "num_input_tokens_seen": 10380910, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.9140625, "step": 488, "time_per_iteration": 2.5326194763183594 }, { "auxiliary_loss_clip": 0.01265531, "auxiliary_loss_mlp": 0.01070957, "balance_loss_clip": 1.0396769, "balance_loss_mlp": 1.07670343, "epoch": 0.029400270554637006, "flos": 24428018689920.0, "grad_norm": 1.9280903566405285, "language_loss": 0.88703811, "learning_rate": 3.991542016625289e-06, "loss": 0.91040301, "num_input_tokens_seen": 10400665, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.890625, "step": 489, "time_per_iteration": 2.6273245811462402 }, { "auxiliary_loss_clip": 0.01262008, "auxiliary_loss_mlp": 0.01072768, "balance_loss_clip": 1.04115343, "balance_loss_mlp": 1.07282043, "epoch": 0.029460393807304974, "flos": 20120892900480.0, "grad_norm": 1.8878027049624986, "language_loss": 0.88456929, "learning_rate": 3.99150727054743e-06, "loss": 0.90791702, "num_input_tokens_seen": 10420150, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.890625, "step": 490, "time_per_iteration": 2.579040050506592 }, { "auxiliary_loss_clip": 0.01267483, "auxiliary_loss_mlp": 0.01075293, "balance_loss_clip": 1.04420328, "balance_loss_mlp": 1.07466209, "epoch": 0.029520517059972943, "flos": 17675914965120.0, "grad_norm": 2.8898009726632923, "language_loss": 0.91097873, "learning_rate": 3.9914724533978e-06, "loss": 0.93440652, "num_input_tokens_seen": 10438210, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.921875, "step": 491, "time_per_iteration": 2.607588291168213 }, { "auxiliary_loss_clip": 0.01259457, "auxiliary_loss_mlp": 0.01066204, "balance_loss_clip": 1.03630614, "balance_loss_mlp": 1.07433343, "epoch": 0.029580640312640915, "flos": 18953185662720.0, "grad_norm": 5.871702506706982, "language_loss": 0.84954953, "learning_rate": 3.991437565177642e-06, "loss": 0.87280613, "num_input_tokens_seen": 10455125, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.84375, "step": 492, "time_per_iteration": 2.6303911209106445 }, { "auxiliary_loss_clip": 0.01270784, "auxiliary_loss_mlp": 0.01086232, "balance_loss_clip": 1.05459404, "balance_loss_mlp": 1.08114946, "epoch": 0.029640763565308884, "flos": 18726121837440.0, "grad_norm": 3.0071536942488124, "language_loss": 0.84133261, "learning_rate": 3.991402605888198e-06, "loss": 0.86490279, "num_input_tokens_seen": 10470990, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.8984375, "step": 493, "time_per_iteration": 2.559354066848755 }, { "auxiliary_loss_clip": 0.01265173, "auxiliary_loss_mlp": 0.01073899, "balance_loss_clip": 1.04321456, "balance_loss_mlp": 1.07475019, "epoch": 0.029700886817976852, "flos": 20595308757120.0, "grad_norm": 1.8499514647776734, "language_loss": 0.86542088, "learning_rate": 3.991367575530719e-06, "loss": 0.88881159, "num_input_tokens_seen": 10490685, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.90625, "step": 494, "time_per_iteration": 2.5775961875915527 }, { "auxiliary_loss_clip": 0.01271576, "auxiliary_loss_mlp": 0.01078192, "balance_loss_clip": 1.0494988, "balance_loss_mlp": 1.07909167, "epoch": 0.02976101007064482, "flos": 22236857233920.0, "grad_norm": 2.6872985349220015, "language_loss": 0.86668718, "learning_rate": 3.9913324741064535e-06, "loss": 0.89018488, "num_input_tokens_seen": 10509435, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.921875, "step": 495, "time_per_iteration": 2.6465163230895996 }, { "auxiliary_loss_clip": 0.01260076, "auxiliary_loss_mlp": 0.01072299, "balance_loss_clip": 1.0427115, "balance_loss_mlp": 1.07552433, "epoch": 0.029821133323312793, "flos": 23732644320000.0, "grad_norm": 1.8923121003422538, "language_loss": 0.62006187, "learning_rate": 3.991297301616653e-06, "loss": 0.64338559, "num_input_tokens_seen": 10530050, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.84375, "step": 496, "time_per_iteration": 2.5976948738098145 }, { "auxiliary_loss_clip": 0.01263286, "auxiliary_loss_mlp": 0.01080687, "balance_loss_clip": 1.05018139, "balance_loss_mlp": 1.07961154, "epoch": 0.02988125657598076, "flos": 22419498913920.0, "grad_norm": 1.819522778955576, "language_loss": 0.88320404, "learning_rate": 3.991262058062575e-06, "loss": 0.90664375, "num_input_tokens_seen": 10551370, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.8359375, "step": 497, "time_per_iteration": 2.5672082901000977 }, { "auxiliary_loss_clip": 0.01269814, "auxiliary_loss_mlp": 0.01079737, "balance_loss_clip": 1.04879022, "balance_loss_mlp": 1.07938719, "epoch": 0.02994137982864873, "flos": 13845108453120.0, "grad_norm": 2.808122272612253, "language_loss": 0.84562957, "learning_rate": 3.991226743445477e-06, "loss": 0.86912507, "num_input_tokens_seen": 10569225, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.90625, "step": 498, "time_per_iteration": 2.5603291988372803 }, { "auxiliary_loss_clip": 0.01267933, "auxiliary_loss_mlp": 0.0107668, "balance_loss_clip": 1.04728293, "balance_loss_mlp": 1.07812536, "epoch": 0.0300015030813167, "flos": 23908354675200.0, "grad_norm": 7.8426008988642995, "language_loss": 0.77358139, "learning_rate": 3.991191357766617e-06, "loss": 0.79702753, "num_input_tokens_seen": 10586170, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.8984375, "step": 499, "time_per_iteration": 2.5460760593414307 }, { "auxiliary_loss_clip": 0.01266923, "auxiliary_loss_mlp": 0.01076772, "balance_loss_clip": 1.04699302, "balance_loss_mlp": 1.0788492, "epoch": 0.030061626333984667, "flos": 22016796560640.0, "grad_norm": 1.611978752384653, "language_loss": 0.82742977, "learning_rate": 3.991155901027261e-06, "loss": 0.85086679, "num_input_tokens_seen": 10606205, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.875, "step": 500, "time_per_iteration": 2.710304021835327 }, { "auxiliary_loss_clip": 0.01260384, "auxiliary_loss_mlp": 0.01082903, "balance_loss_clip": 1.04957247, "balance_loss_mlp": 1.07436156, "epoch": 0.03012174958665264, "flos": 23039747988480.0, "grad_norm": 3.4059598968711424, "language_loss": 0.87644482, "learning_rate": 3.991120373228672e-06, "loss": 0.89987773, "num_input_tokens_seen": 10625995, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 1.859375, "step": 501, "time_per_iteration": 2.5515859127044678 }, { "auxiliary_loss_clip": 0.01267043, "auxiliary_loss_mlp": 0.01068637, "balance_loss_clip": 1.04016972, "balance_loss_mlp": 1.07690835, "epoch": 0.030181872839320608, "flos": 18953257489920.0, "grad_norm": 2.4097782139032953, "language_loss": 0.86428857, "learning_rate": 3.991084774372118e-06, "loss": 0.88764536, "num_input_tokens_seen": 10644105, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.90625, "step": 502, "time_per_iteration": 2.6532981395721436 }, { "auxiliary_loss_clip": 0.01262339, "auxiliary_loss_mlp": 0.01072739, "balance_loss_clip": 1.04260242, "balance_loss_mlp": 1.07683492, "epoch": 0.030241996091988577, "flos": 16728017005440.0, "grad_norm": 4.1367290850796214, "language_loss": 0.84397769, "learning_rate": 3.991049104458871e-06, "loss": 0.86732841, "num_input_tokens_seen": 10661090, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.8515625, "step": 503, "time_per_iteration": 2.525132417678833 }, { "auxiliary_loss_clip": 0.01258981, "auxiliary_loss_mlp": 0.01077985, "balance_loss_clip": 1.04800367, "balance_loss_mlp": 1.07713199, "epoch": 0.030302119344656545, "flos": 28621271387520.0, "grad_norm": 3.245665511599393, "language_loss": 0.87943637, "learning_rate": 3.991013363490202e-06, "loss": 0.90280604, "num_input_tokens_seen": 10682380, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.8203125, "step": 504, "time_per_iteration": 2.634183168411255 }, { "auxiliary_loss_clip": 0.01260483, "auxiliary_loss_mlp": 0.01081878, "balance_loss_clip": 1.05245686, "balance_loss_mlp": 1.0750339, "epoch": 0.030362242597324514, "flos": 15669334523520.0, "grad_norm": 2.1410322074568486, "language_loss": 0.77616012, "learning_rate": 3.9909775514673885e-06, "loss": 0.79958367, "num_input_tokens_seen": 10699925, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.859375, "step": 505, "time_per_iteration": 2.5135395526885986 }, { "auxiliary_loss_clip": 0.01133155, "auxiliary_loss_mlp": 0.01012113, "balance_loss_clip": 1.00562787, "balance_loss_mlp": 1.04340053, "epoch": 0.030422365849992486, "flos": 72125973676800.0, "grad_norm": 0.8526616970340299, "language_loss": 0.54976088, "learning_rate": 3.990941668391708e-06, "loss": 0.57121354, "num_input_tokens_seen": 10766525, "router_z_loss_clip": 0.06494141, "router_z_loss_mlp": 0.8984375, "step": 506, "time_per_iteration": 3.3473641872406006 }, { "auxiliary_loss_clip": 0.01261029, "auxiliary_loss_mlp": 0.01084149, "balance_loss_clip": 1.0533694, "balance_loss_mlp": 1.07463408, "epoch": 0.030482489102660455, "flos": 19427817000960.0, "grad_norm": 2.2158300023107116, "language_loss": 0.82806712, "learning_rate": 3.99090571426444e-06, "loss": 0.85151887, "num_input_tokens_seen": 10786725, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.859375, "step": 507, "time_per_iteration": 2.5672008991241455 }, { "auxiliary_loss_clip": 0.01266024, "auxiliary_loss_mlp": 0.01070391, "balance_loss_clip": 1.04068446, "balance_loss_mlp": 1.0763104, "epoch": 0.030542612355328423, "flos": 20375822701440.0, "grad_norm": 2.2195221510908754, "language_loss": 0.88456488, "learning_rate": 3.990869689086868e-06, "loss": 0.907929, "num_input_tokens_seen": 10805390, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.890625, "step": 508, "time_per_iteration": 2.587887763977051 }, { "auxiliary_loss_clip": 0.01267816, "auxiliary_loss_mlp": 0.01058479, "balance_loss_clip": 1.02817559, "balance_loss_mlp": 1.07715583, "epoch": 0.030602735607996392, "flos": 34677354297600.0, "grad_norm": 1.7450869927488095, "language_loss": 0.71357775, "learning_rate": 3.990833592860279e-06, "loss": 0.73684072, "num_input_tokens_seen": 10828030, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.90625, "step": 509, "time_per_iteration": 2.6902570724487305 }, { "auxiliary_loss_clip": 0.01259693, "auxiliary_loss_mlp": 0.010637, "balance_loss_clip": 1.03536379, "balance_loss_mlp": 1.07626128, "epoch": 0.03066285886066436, "flos": 23658668259840.0, "grad_norm": 1.9643776088153468, "language_loss": 0.82193542, "learning_rate": 3.990797425585959e-06, "loss": 0.84516937, "num_input_tokens_seen": 10845240, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.828125, "step": 510, "time_per_iteration": 2.5524251461029053 }, { "auxiliary_loss_clip": 0.01264041, "auxiliary_loss_mlp": 0.01071536, "balance_loss_clip": 1.0412333, "balance_loss_mlp": 1.0797255, "epoch": 0.030722982113332332, "flos": 23002975440000.0, "grad_norm": 2.2215921594745263, "language_loss": 0.83543026, "learning_rate": 3.9907611872652e-06, "loss": 0.85878605, "num_input_tokens_seen": 10864325, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.84375, "step": 511, "time_per_iteration": 2.558598518371582 }, { "auxiliary_loss_clip": 0.01264435, "auxiliary_loss_mlp": 0.01070649, "balance_loss_clip": 1.04067981, "balance_loss_mlp": 1.07510555, "epoch": 0.0307831053660003, "flos": 24750855152640.0, "grad_norm": 2.776353408832487, "language_loss": 0.8206259, "learning_rate": 3.990724877899296e-06, "loss": 0.84397662, "num_input_tokens_seen": 10883860, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.890625, "step": 512, "time_per_iteration": 2.551600933074951 }, { "auxiliary_loss_clip": 0.01260148, "auxiliary_loss_mlp": 0.01074098, "balance_loss_clip": 1.04372382, "balance_loss_mlp": 1.07371211, "epoch": 0.03084322861866827, "flos": 26140885620480.0, "grad_norm": 1.796732741760336, "language_loss": 0.86551672, "learning_rate": 3.990688497489541e-06, "loss": 0.88885915, "num_input_tokens_seen": 10904555, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.8671875, "step": 513, "time_per_iteration": 2.6016769409179688 }, { "auxiliary_loss_clip": 0.01269111, "auxiliary_loss_mlp": 0.01071841, "balance_loss_clip": 1.04141903, "balance_loss_mlp": 1.07890224, "epoch": 0.03090335187133624, "flos": 18771298168320.0, "grad_norm": 1.76889271491263, "language_loss": 0.79081583, "learning_rate": 3.990652046037234e-06, "loss": 0.81422538, "num_input_tokens_seen": 10923700, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.8984375, "step": 514, "time_per_iteration": 2.5773892402648926 }, { "auxiliary_loss_clip": 0.01260755, "auxiliary_loss_mlp": 0.01059486, "balance_loss_clip": 1.03097105, "balance_loss_mlp": 1.07699966, "epoch": 0.030963475124004207, "flos": 23221886878080.0, "grad_norm": 3.885118090672634, "language_loss": 0.7741459, "learning_rate": 3.990615523543677e-06, "loss": 0.79734838, "num_input_tokens_seen": 10942730, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.8359375, "step": 515, "time_per_iteration": 2.6014583110809326 }, { "auxiliary_loss_clip": 0.01258594, "auxiliary_loss_mlp": 0.01064571, "balance_loss_clip": 1.03376758, "balance_loss_mlp": 1.07195985, "epoch": 0.03102359837667218, "flos": 42525595411200.0, "grad_norm": 1.7888930929494051, "language_loss": 0.82411611, "learning_rate": 3.990578930010171e-06, "loss": 0.84734774, "num_input_tokens_seen": 10967120, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.8671875, "step": 516, "time_per_iteration": 5.637955904006958 }, { "auxiliary_loss_clip": 0.01256698, "auxiliary_loss_mlp": 0.01067542, "balance_loss_clip": 1.03728628, "balance_loss_mlp": 1.0746994, "epoch": 0.031083721629340148, "flos": 21176953689600.0, "grad_norm": 1.801707568430303, "language_loss": 0.78725398, "learning_rate": 3.990542265438024e-06, "loss": 0.81049633, "num_input_tokens_seen": 10986775, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.8203125, "step": 517, "time_per_iteration": 4.055310010910034 }, { "auxiliary_loss_clip": 0.0125392, "auxiliary_loss_mlp": 0.01063941, "balance_loss_clip": 1.03459144, "balance_loss_mlp": 1.0742352, "epoch": 0.031143844882008116, "flos": 29716187713920.0, "grad_norm": 1.8543030064432067, "language_loss": 0.90259701, "learning_rate": 3.990505529828544e-06, "loss": 0.92577565, "num_input_tokens_seen": 11011360, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.796875, "step": 518, "time_per_iteration": 2.6018354892730713 }, { "auxiliary_loss_clip": 0.01266517, "auxiliary_loss_mlp": 0.01088977, "balance_loss_clip": 1.05769634, "balance_loss_mlp": 1.08200765, "epoch": 0.031203968134676085, "flos": 23112467072640.0, "grad_norm": 2.152472406784086, "language_loss": 0.85917723, "learning_rate": 3.9904687231830424e-06, "loss": 0.88273215, "num_input_tokens_seen": 11030150, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.84375, "step": 519, "time_per_iteration": 2.620866537094116 }, { "auxiliary_loss_clip": 0.01260168, "auxiliary_loss_mlp": 0.01086638, "balance_loss_clip": 1.05714607, "balance_loss_mlp": 1.07225251, "epoch": 0.03126409138734405, "flos": 20954379064320.0, "grad_norm": 2.039814563862096, "language_loss": 0.86534911, "learning_rate": 3.990431845502831e-06, "loss": 0.88881719, "num_input_tokens_seen": 11049145, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.8828125, "step": 520, "time_per_iteration": 2.556694746017456 }, { "auxiliary_loss_clip": 0.01264766, "auxiliary_loss_mlp": 0.01082429, "balance_loss_clip": 1.05181575, "balance_loss_mlp": 1.07581186, "epoch": 0.031324214640012026, "flos": 21650112570240.0, "grad_norm": 1.7128309079850434, "language_loss": 0.8927986, "learning_rate": 3.990394896789228e-06, "loss": 0.91627061, "num_input_tokens_seen": 11068835, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.890625, "step": 521, "time_per_iteration": 2.5922186374664307 }, { "auxiliary_loss_clip": 0.0125919, "auxiliary_loss_mlp": 0.01073432, "balance_loss_clip": 1.04181802, "balance_loss_mlp": 1.0755868, "epoch": 0.03138433789267999, "flos": 23441337020160.0, "grad_norm": 2.081017604216626, "language_loss": 0.70900619, "learning_rate": 3.9903578770435505e-06, "loss": 0.73233241, "num_input_tokens_seen": 11088980, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.8359375, "step": 522, "time_per_iteration": 2.5714194774627686 }, { "auxiliary_loss_clip": 0.01262354, "auxiliary_loss_mlp": 0.01080123, "balance_loss_clip": 1.05069065, "balance_loss_mlp": 1.07433248, "epoch": 0.03144446114534796, "flos": 18982164960000.0, "grad_norm": 2.969613946968611, "language_loss": 0.84635419, "learning_rate": 3.99032078626712e-06, "loss": 0.86977887, "num_input_tokens_seen": 11104300, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.8828125, "step": 523, "time_per_iteration": 2.5851404666900635 }, { "auxiliary_loss_clip": 0.01265715, "auxiliary_loss_mlp": 0.01074024, "balance_loss_clip": 1.04310119, "balance_loss_mlp": 1.07491183, "epoch": 0.031504584398015935, "flos": 22637692080000.0, "grad_norm": 2.2479858831978876, "language_loss": 0.89528251, "learning_rate": 3.990283624461261e-06, "loss": 0.91867995, "num_input_tokens_seen": 11123335, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.90625, "step": 524, "time_per_iteration": 2.6020946502685547 }, { "auxiliary_loss_clip": 0.01270515, "auxiliary_loss_mlp": 0.01074548, "balance_loss_clip": 1.04276705, "balance_loss_mlp": 1.07964361, "epoch": 0.0315647076506839, "flos": 25297056339840.0, "grad_norm": 2.785184212351142, "language_loss": 0.79663026, "learning_rate": 3.9902463916273e-06, "loss": 0.82008088, "num_input_tokens_seen": 11140880, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.90625, "step": 525, "time_per_iteration": 2.6272597312927246 }, { "auxiliary_loss_clip": 0.01262105, "auxiliary_loss_mlp": 0.0107822, "balance_loss_clip": 1.04763055, "balance_loss_mlp": 1.07402825, "epoch": 0.03162483090335187, "flos": 16982839065600.0, "grad_norm": 2.6717314400064356, "language_loss": 0.80528003, "learning_rate": 3.990209087766563e-06, "loss": 0.82868326, "num_input_tokens_seen": 11158710, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.8828125, "step": 526, "time_per_iteration": 2.5017993450164795 }, { "auxiliary_loss_clip": 0.01269118, "auxiliary_loss_mlp": 0.01062972, "balance_loss_clip": 1.03097677, "balance_loss_mlp": 1.0794754, "epoch": 0.03168495415601984, "flos": 18734489706240.0, "grad_norm": 2.00762113535681, "language_loss": 0.81524682, "learning_rate": 3.990171712880383e-06, "loss": 0.83856773, "num_input_tokens_seen": 11177550, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.8984375, "step": 527, "time_per_iteration": 2.58986496925354 }, { "auxiliary_loss_clip": 0.01255138, "auxiliary_loss_mlp": 0.01074224, "balance_loss_clip": 1.04363501, "balance_loss_mlp": 1.07143378, "epoch": 0.03174507740868781, "flos": 21214875473280.0, "grad_norm": 1.8701526715133683, "language_loss": 0.93597484, "learning_rate": 3.990134266970095e-06, "loss": 0.95926857, "num_input_tokens_seen": 11196230, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.8359375, "step": 528, "time_per_iteration": 2.57332706451416 }, { "auxiliary_loss_clip": 0.01258565, "auxiliary_loss_mlp": 0.01066661, "balance_loss_clip": 1.03654873, "balance_loss_mlp": 1.07296944, "epoch": 0.03180520066135578, "flos": 24787663614720.0, "grad_norm": 1.798161746848025, "language_loss": 0.83847803, "learning_rate": 3.9900967500370335e-06, "loss": 0.86173034, "num_input_tokens_seen": 11214935, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.859375, "step": 529, "time_per_iteration": 2.7009239196777344 }, { "auxiliary_loss_clip": 0.01261875, "auxiliary_loss_mlp": 0.01079063, "balance_loss_clip": 1.04903424, "balance_loss_mlp": 1.07989311, "epoch": 0.03186532391402375, "flos": 24864261367680.0, "grad_norm": 1.9765730997549977, "language_loss": 0.90606391, "learning_rate": 3.990059162082539e-06, "loss": 0.92947328, "num_input_tokens_seen": 11235310, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.8203125, "step": 530, "time_per_iteration": 2.6059765815734863 }, { "auxiliary_loss_clip": 0.01255766, "auxiliary_loss_mlp": 0.01070273, "balance_loss_clip": 1.03875411, "balance_loss_mlp": 1.07009792, "epoch": 0.03192544716669172, "flos": 21215055041280.0, "grad_norm": 2.2151798830393945, "language_loss": 0.75899637, "learning_rate": 3.9900215031079515e-06, "loss": 0.78225678, "num_input_tokens_seen": 11254425, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.859375, "step": 531, "time_per_iteration": 2.6077959537506104 }, { "auxiliary_loss_clip": 0.01255618, "auxiliary_loss_mlp": 0.01060851, "balance_loss_clip": 1.02947462, "balance_loss_mlp": 1.07155848, "epoch": 0.03198557041935969, "flos": 24353216616960.0, "grad_norm": 1.9105582401841896, "language_loss": 0.90134823, "learning_rate": 3.989983773114616e-06, "loss": 0.92451298, "num_input_tokens_seen": 11274595, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.84375, "step": 532, "time_per_iteration": 2.6206889152526855 }, { "auxiliary_loss_clip": 0.01129329, "auxiliary_loss_mlp": 0.01015149, "balance_loss_clip": 1.00861645, "balance_loss_mlp": 1.0428226, "epoch": 0.032045693672027656, "flos": 61827367587840.0, "grad_norm": 0.9290551058757479, "language_loss": 0.57957184, "learning_rate": 3.989945972103877e-06, "loss": 0.60101664, "num_input_tokens_seen": 11336705, "router_z_loss_clip": 0.06542969, "router_z_loss_mlp": 0.8671875, "step": 533, "time_per_iteration": 3.2213804721832275 }, { "auxiliary_loss_clip": 0.01255227, "auxiliary_loss_mlp": 0.01076765, "balance_loss_clip": 1.04693925, "balance_loss_mlp": 1.07163024, "epoch": 0.03210581692469563, "flos": 28401174800640.0, "grad_norm": 1.77515191840988, "language_loss": 0.86576748, "learning_rate": 3.989908100077087e-06, "loss": 0.88908744, "num_input_tokens_seen": 11356820, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.8359375, "step": 534, "time_per_iteration": 2.676973581314087 }, { "auxiliary_loss_clip": 0.01256969, "auxiliary_loss_mlp": 0.01059496, "balance_loss_clip": 1.02993214, "balance_loss_mlp": 1.07377505, "epoch": 0.03216594017736359, "flos": 24717709877760.0, "grad_norm": 2.3271290558115996, "language_loss": 0.77045584, "learning_rate": 3.989870157035594e-06, "loss": 0.79362053, "num_input_tokens_seen": 11376645, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.828125, "step": 535, "time_per_iteration": 2.6185858249664307 }, { "auxiliary_loss_clip": 0.01259618, "auxiliary_loss_mlp": 0.01081116, "balance_loss_clip": 1.0505743, "balance_loss_mlp": 1.07520175, "epoch": 0.032226063430031565, "flos": 31175453646720.0, "grad_norm": 2.9168223568977347, "language_loss": 0.80510807, "learning_rate": 3.989832142980754e-06, "loss": 0.82851541, "num_input_tokens_seen": 11397310, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.84375, "step": 536, "time_per_iteration": 2.676380157470703 }, { "auxiliary_loss_clip": 0.01259717, "auxiliary_loss_mlp": 0.01074583, "balance_loss_clip": 1.04210997, "balance_loss_mlp": 1.07518578, "epoch": 0.03228618668269954, "flos": 32198225506560.0, "grad_norm": 1.9926160477508257, "language_loss": 0.69749749, "learning_rate": 3.989794057913923e-06, "loss": 0.72084045, "num_input_tokens_seen": 11418475, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.84375, "step": 537, "time_per_iteration": 2.638582229614258 }, { "auxiliary_loss_clip": 0.01259879, "auxiliary_loss_mlp": 0.0107352, "balance_loss_clip": 1.0432409, "balance_loss_mlp": 1.07960761, "epoch": 0.0323463099353675, "flos": 22670154996480.0, "grad_norm": 2.0225919745837855, "language_loss": 0.82534647, "learning_rate": 3.9897559018364615e-06, "loss": 0.84868044, "num_input_tokens_seen": 11436630, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.8046875, "step": 538, "time_per_iteration": 2.5811657905578613 }, { "auxiliary_loss_clip": 0.01266521, "auxiliary_loss_mlp": 0.01075429, "balance_loss_clip": 1.04491186, "balance_loss_mlp": 1.07637084, "epoch": 0.032406433188035474, "flos": 26905172232960.0, "grad_norm": 2.8395435128397177, "language_loss": 0.79335338, "learning_rate": 3.98971767474973e-06, "loss": 0.81677288, "num_input_tokens_seen": 11457275, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.90625, "step": 539, "time_per_iteration": 2.5894062519073486 }, { "auxiliary_loss_clip": 0.01260019, "auxiliary_loss_mlp": 0.01066951, "balance_loss_clip": 1.03543162, "balance_loss_mlp": 1.0750916, "epoch": 0.03246655644070344, "flos": 31503928544640.0, "grad_norm": 4.488445308617215, "language_loss": 0.7715764, "learning_rate": 3.989679376655092e-06, "loss": 0.79484612, "num_input_tokens_seen": 11476925, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.8515625, "step": 540, "time_per_iteration": 2.747016429901123 }, { "auxiliary_loss_clip": 0.01269217, "auxiliary_loss_mlp": 0.01071475, "balance_loss_clip": 1.04114866, "balance_loss_mlp": 1.07954526, "epoch": 0.03252667969337141, "flos": 23218331431680.0, "grad_norm": 2.163196359751297, "language_loss": 0.85147625, "learning_rate": 3.989641007553916e-06, "loss": 0.87488317, "num_input_tokens_seen": 11496830, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.8984375, "step": 541, "time_per_iteration": 2.54268741607666 }, { "auxiliary_loss_clip": 0.01258318, "auxiliary_loss_mlp": 0.01079904, "balance_loss_clip": 1.04774117, "balance_loss_mlp": 1.0778898, "epoch": 0.032586802946039384, "flos": 14757454926720.0, "grad_norm": 2.358869005492393, "language_loss": 0.88503587, "learning_rate": 3.989602567447569e-06, "loss": 0.90841806, "num_input_tokens_seen": 11515605, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.8046875, "step": 542, "time_per_iteration": 2.5552852153778076 }, { "auxiliary_loss_clip": 0.01259398, "auxiliary_loss_mlp": 0.010804, "balance_loss_clip": 1.04983425, "balance_loss_mlp": 1.07527542, "epoch": 0.03264692619870735, "flos": 24280677100800.0, "grad_norm": 1.9880460832081763, "language_loss": 0.70593548, "learning_rate": 3.989564056337426e-06, "loss": 0.72933346, "num_input_tokens_seen": 11536230, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.8359375, "step": 543, "time_per_iteration": 2.5863869190216064 }, { "auxiliary_loss_clip": 0.01256449, "auxiliary_loss_mlp": 0.01080884, "balance_loss_clip": 1.05050921, "balance_loss_mlp": 1.07010627, "epoch": 0.03270704945137532, "flos": 22893160584960.0, "grad_norm": 5.076361814755986, "language_loss": 0.91545188, "learning_rate": 3.989525474224858e-06, "loss": 0.93882519, "num_input_tokens_seen": 11554715, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.859375, "step": 544, "time_per_iteration": 2.620103597640991 }, { "auxiliary_loss_clip": 0.01261268, "auxiliary_loss_mlp": 0.01080316, "balance_loss_clip": 1.05165792, "balance_loss_mlp": 1.07700849, "epoch": 0.032767172704043286, "flos": 18041018757120.0, "grad_norm": 2.3712862323444592, "language_loss": 0.65189195, "learning_rate": 3.989486821111244e-06, "loss": 0.67530775, "num_input_tokens_seen": 11571370, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.84375, "step": 545, "time_per_iteration": 2.5005979537963867 }, { "auxiliary_loss_clip": 0.01258269, "auxiliary_loss_mlp": 0.01068044, "balance_loss_clip": 1.03867078, "balance_loss_mlp": 1.07372105, "epoch": 0.03282729595671126, "flos": 22528739151360.0, "grad_norm": 2.1451058623289847, "language_loss": 0.91776311, "learning_rate": 3.9894480969979635e-06, "loss": 0.94102621, "num_input_tokens_seen": 11588560, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.84375, "step": 546, "time_per_iteration": 2.6217074394226074 }, { "auxiliary_loss_clip": 0.0125657, "auxiliary_loss_mlp": 0.01070939, "balance_loss_clip": 1.03932428, "balance_loss_mlp": 1.07113242, "epoch": 0.03288741920937923, "flos": 20410620001920.0, "grad_norm": 3.9939058327767167, "language_loss": 0.8145631, "learning_rate": 3.989409301886398e-06, "loss": 0.83783817, "num_input_tokens_seen": 11605685, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.8515625, "step": 547, "time_per_iteration": 2.554671287536621 }, { "auxiliary_loss_clip": 0.0125851, "auxiliary_loss_mlp": 0.01072132, "balance_loss_clip": 1.0411613, "balance_loss_mlp": 1.07462907, "epoch": 0.032947542462047195, "flos": 20777986350720.0, "grad_norm": 1.7690012924364047, "language_loss": 0.81186306, "learning_rate": 3.989370435777931e-06, "loss": 0.83516949, "num_input_tokens_seen": 11626290, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.84375, "step": 548, "time_per_iteration": 2.7230050563812256 }, { "auxiliary_loss_clip": 0.01258957, "auxiliary_loss_mlp": 0.01074269, "balance_loss_clip": 1.04263103, "balance_loss_mlp": 1.07479036, "epoch": 0.03300766571471517, "flos": 19901263190400.0, "grad_norm": 2.558678116586822, "language_loss": 0.66799796, "learning_rate": 3.989331498673951e-06, "loss": 0.69133019, "num_input_tokens_seen": 11643950, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.84375, "step": 549, "time_per_iteration": 2.5464487075805664 }, { "auxiliary_loss_clip": 0.01253767, "auxiliary_loss_mlp": 0.01074497, "balance_loss_clip": 1.04340768, "balance_loss_mlp": 1.07398057, "epoch": 0.03306778896738313, "flos": 17967760968960.0, "grad_norm": 1.9468395590512864, "language_loss": 0.85867715, "learning_rate": 3.9892924905758475e-06, "loss": 0.88195986, "num_input_tokens_seen": 11662560, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.796875, "step": 550, "time_per_iteration": 2.5837109088897705 }, { "auxiliary_loss_clip": 0.01259208, "auxiliary_loss_mlp": 0.01085846, "balance_loss_clip": 1.05547118, "balance_loss_mlp": 1.07769179, "epoch": 0.033127912220051105, "flos": 21653380707840.0, "grad_norm": 1.7801880295698413, "language_loss": 0.81216693, "learning_rate": 3.989253411485011e-06, "loss": 0.83561748, "num_input_tokens_seen": 11682265, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.8125, "step": 551, "time_per_iteration": 2.5639636516571045 }, { "auxiliary_loss_clip": 0.01265603, "auxiliary_loss_mlp": 0.01081278, "balance_loss_clip": 1.04987872, "balance_loss_mlp": 1.07689881, "epoch": 0.03318803547271908, "flos": 30188376927360.0, "grad_norm": 2.6586083460935277, "language_loss": 0.86052299, "learning_rate": 3.989214261402838e-06, "loss": 0.88399184, "num_input_tokens_seen": 11699300, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.890625, "step": 552, "time_per_iteration": 2.6226749420166016 }, { "auxiliary_loss_clip": 0.01265327, "auxiliary_loss_mlp": 0.01074847, "balance_loss_clip": 1.04227912, "balance_loss_mlp": 1.07999098, "epoch": 0.03324815872538704, "flos": 20376038183040.0, "grad_norm": 2.4600768369722847, "language_loss": 0.92553222, "learning_rate": 3.989175040330724e-06, "loss": 0.94893396, "num_input_tokens_seen": 11716955, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 1.8515625, "step": 553, "time_per_iteration": 2.548058271408081 }, { "auxiliary_loss_clip": 0.01259295, "auxiliary_loss_mlp": 0.01071593, "balance_loss_clip": 1.03904843, "balance_loss_mlp": 1.07543862, "epoch": 0.033308281978055014, "flos": 24494560634880.0, "grad_norm": 2.8343024300039663, "language_loss": 0.78525537, "learning_rate": 3.98913574827007e-06, "loss": 0.80856431, "num_input_tokens_seen": 11736130, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.8359375, "step": 554, "time_per_iteration": 2.600719690322876 }, { "auxiliary_loss_clip": 0.01255109, "auxiliary_loss_mlp": 0.01080896, "balance_loss_clip": 1.0475409, "balance_loss_mlp": 1.0749495, "epoch": 0.03336840523072298, "flos": 23400326666880.0, "grad_norm": 2.147498238622775, "language_loss": 0.81732023, "learning_rate": 3.989096385222278e-06, "loss": 0.8406803, "num_input_tokens_seen": 11754425, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 1.796875, "step": 555, "time_per_iteration": 2.627650737762451 }, { "auxiliary_loss_clip": 0.01261801, "auxiliary_loss_mlp": 0.01083507, "balance_loss_clip": 1.05108202, "balance_loss_mlp": 1.07504845, "epoch": 0.03342852848339095, "flos": 30550571717760.0, "grad_norm": 2.148322989492087, "language_loss": 0.88565683, "learning_rate": 3.989056951188753e-06, "loss": 0.90910995, "num_input_tokens_seen": 11772845, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.8671875, "step": 556, "time_per_iteration": 2.668663740158081 }, { "auxiliary_loss_clip": 0.01257486, "auxiliary_loss_mlp": 0.010787, "balance_loss_clip": 1.04584599, "balance_loss_mlp": 1.0748471, "epoch": 0.03348865173605892, "flos": 22893304239360.0, "grad_norm": 2.104324395654354, "language_loss": 0.83287287, "learning_rate": 3.989017446170901e-06, "loss": 0.85623467, "num_input_tokens_seen": 11792850, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.828125, "step": 557, "time_per_iteration": 5.648193597793579 }, { "auxiliary_loss_clip": 0.01265961, "auxiliary_loss_mlp": 0.01063473, "balance_loss_clip": 1.03286004, "balance_loss_mlp": 1.08076453, "epoch": 0.03354877498872689, "flos": 17676022705920.0, "grad_norm": 2.680616556440576, "language_loss": 0.9399389, "learning_rate": 3.988977870170133e-06, "loss": 0.96323317, "num_input_tokens_seen": 11809670, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.8515625, "step": 558, "time_per_iteration": 3.9046638011932373 }, { "auxiliary_loss_clip": 0.01256048, "auxiliary_loss_mlp": 0.01070398, "balance_loss_clip": 1.03887856, "balance_loss_mlp": 1.07334423, "epoch": 0.03360889824139486, "flos": 21652985658240.0, "grad_norm": 2.21108526479943, "language_loss": 0.76590109, "learning_rate": 3.988938223187861e-06, "loss": 0.7891655, "num_input_tokens_seen": 11829665, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.828125, "step": 559, "time_per_iteration": 4.060724973678589 }, { "auxiliary_loss_clip": 0.01257054, "auxiliary_loss_mlp": 0.01080658, "balance_loss_clip": 1.04954481, "balance_loss_mlp": 1.07360518, "epoch": 0.033669021494062826, "flos": 21795730306560.0, "grad_norm": 2.009241823464463, "language_loss": 0.87438536, "learning_rate": 3.9888985052255005e-06, "loss": 0.89776254, "num_input_tokens_seen": 11848190, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.8359375, "step": 560, "time_per_iteration": 2.6723945140838623 }, { "auxiliary_loss_clip": 0.01254428, "auxiliary_loss_mlp": 0.01064179, "balance_loss_clip": 1.03590286, "balance_loss_mlp": 1.07401705, "epoch": 0.0337291447467308, "flos": 21866222747520.0, "grad_norm": 2.3127620321042826, "language_loss": 0.81106544, "learning_rate": 3.988858716284468e-06, "loss": 0.83425152, "num_input_tokens_seen": 11864795, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.8046875, "step": 561, "time_per_iteration": 2.5878806114196777 }, { "auxiliary_loss_clip": 0.01253596, "auxiliary_loss_mlp": 0.01072409, "balance_loss_clip": 1.04234469, "balance_loss_mlp": 1.07385218, "epoch": 0.03378926799939877, "flos": 24245951627520.0, "grad_norm": 1.7897799363861522, "language_loss": 0.82135189, "learning_rate": 3.988818856366184e-06, "loss": 0.84461194, "num_input_tokens_seen": 11885275, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.796875, "step": 562, "time_per_iteration": 2.5871329307556152 }, { "auxiliary_loss_clip": 0.01263675, "auxiliary_loss_mlp": 0.0107999, "balance_loss_clip": 1.04844689, "balance_loss_mlp": 1.07816195, "epoch": 0.033849391252066735, "flos": 16507812677760.0, "grad_norm": 1.951612843278937, "language_loss": 0.83727503, "learning_rate": 3.9887789254720704e-06, "loss": 0.86071169, "num_input_tokens_seen": 11903595, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.859375, "step": 563, "time_per_iteration": 2.5728065967559814 }, { "auxiliary_loss_clip": 0.01258384, "auxiliary_loss_mlp": 0.01083104, "balance_loss_clip": 1.05229998, "balance_loss_mlp": 1.07646847, "epoch": 0.03390951450473471, "flos": 15669298609920.0, "grad_norm": 2.315942613155138, "language_loss": 0.9339813, "learning_rate": 3.988738923603553e-06, "loss": 0.95739615, "num_input_tokens_seen": 11917815, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.8203125, "step": 564, "time_per_iteration": 2.476961135864258 }, { "auxiliary_loss_clip": 0.01258558, "auxiliary_loss_mlp": 0.0107003, "balance_loss_clip": 1.03808224, "balance_loss_mlp": 1.07399964, "epoch": 0.03396963775740267, "flos": 22674787850880.0, "grad_norm": 2.5173072577854474, "language_loss": 0.93834895, "learning_rate": 3.98869885076206e-06, "loss": 0.96163487, "num_input_tokens_seen": 11936305, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.84375, "step": 565, "time_per_iteration": 2.5713024139404297 }, { "auxiliary_loss_clip": 0.01126574, "auxiliary_loss_mlp": 0.0102866, "balance_loss_clip": 1.02286673, "balance_loss_mlp": 1.04249215, "epoch": 0.034029761010070644, "flos": 64392683063040.0, "grad_norm": 1.1268311288974795, "language_loss": 0.54866433, "learning_rate": 3.9886587069490195e-06, "loss": 0.57021666, "num_input_tokens_seen": 11998940, "router_z_loss_clip": 0.05786133, "router_z_loss_mlp": 0.83984375, "step": 566, "time_per_iteration": 3.1825897693634033 }, { "auxiliary_loss_clip": 0.01260726, "auxiliary_loss_mlp": 0.01068542, "balance_loss_clip": 1.03573549, "balance_loss_mlp": 1.07706189, "epoch": 0.034089884262738616, "flos": 25004204755200.0, "grad_norm": 4.245925060936882, "language_loss": 0.76406085, "learning_rate": 3.988618492165865e-06, "loss": 0.78735358, "num_input_tokens_seen": 12018860, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.8359375, "step": 567, "time_per_iteration": 2.6059341430664062 }, { "auxiliary_loss_clip": 0.01252843, "auxiliary_loss_mlp": 0.01086024, "balance_loss_clip": 1.05603051, "balance_loss_mlp": 1.07503176, "epoch": 0.03415000751540658, "flos": 28439096584320.0, "grad_norm": 2.052115385445112, "language_loss": 0.8043313, "learning_rate": 3.988578206414032e-06, "loss": 0.82771999, "num_input_tokens_seen": 12039675, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.78125, "step": 568, "time_per_iteration": 2.6420023441314697 }, { "auxiliary_loss_clip": 0.01255384, "auxiliary_loss_mlp": 0.01079647, "balance_loss_clip": 1.04839039, "balance_loss_mlp": 1.07799995, "epoch": 0.034210130768074554, "flos": 21468727866240.0, "grad_norm": 1.7628249573922368, "language_loss": 0.86359704, "learning_rate": 3.988537849694959e-06, "loss": 0.88694739, "num_input_tokens_seen": 12057680, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.7734375, "step": 569, "time_per_iteration": 2.542300224304199 }, { "auxiliary_loss_clip": 0.01255714, "auxiliary_loss_mlp": 0.01073745, "balance_loss_clip": 1.04317951, "balance_loss_mlp": 1.07514322, "epoch": 0.03427025402074252, "flos": 18697501676160.0, "grad_norm": 1.7366384950231661, "language_loss": 0.95509076, "learning_rate": 3.988497422010084e-06, "loss": 0.97838539, "num_input_tokens_seen": 12076135, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.8046875, "step": 570, "time_per_iteration": 2.6214234828948975 }, { "auxiliary_loss_clip": 0.01256409, "auxiliary_loss_mlp": 0.01076072, "balance_loss_clip": 1.04231191, "balance_loss_mlp": 1.0714488, "epoch": 0.03433037727341049, "flos": 20849987162880.0, "grad_norm": 2.04266887740158, "language_loss": 0.79405427, "learning_rate": 3.988456923360852e-06, "loss": 0.81737906, "num_input_tokens_seen": 12094785, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.8515625, "step": 571, "time_per_iteration": 2.522082805633545 }, { "auxiliary_loss_clip": 0.01255802, "auxiliary_loss_mlp": 0.01083245, "balance_loss_clip": 1.05113006, "balance_loss_mlp": 1.07597351, "epoch": 0.03439050052607846, "flos": 25410282986880.0, "grad_norm": 2.3120342827942366, "language_loss": 0.80133945, "learning_rate": 3.988416353748707e-06, "loss": 0.82472986, "num_input_tokens_seen": 12114590, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.796875, "step": 572, "time_per_iteration": 2.640536308288574 }, { "auxiliary_loss_clip": 0.01263801, "auxiliary_loss_mlp": 0.01071701, "balance_loss_clip": 1.04101658, "balance_loss_mlp": 1.0779314, "epoch": 0.03445062377874643, "flos": 17640147997440.0, "grad_norm": 2.0396669469756703, "language_loss": 0.84083164, "learning_rate": 3.988375713175097e-06, "loss": 0.86418664, "num_input_tokens_seen": 12132390, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.859375, "step": 573, "time_per_iteration": 2.6076817512512207 }, { "auxiliary_loss_clip": 0.01256991, "auxiliary_loss_mlp": 0.01065045, "balance_loss_clip": 1.03560019, "balance_loss_mlp": 1.07239509, "epoch": 0.0345107470314144, "flos": 16764502245120.0, "grad_norm": 2.322869397710353, "language_loss": 0.76208258, "learning_rate": 3.988335001641473e-06, "loss": 0.78530288, "num_input_tokens_seen": 12149035, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.8515625, "step": 574, "time_per_iteration": 2.664034843444824 }, { "auxiliary_loss_clip": 0.01255845, "auxiliary_loss_mlp": 0.01063954, "balance_loss_clip": 1.03462839, "balance_loss_mlp": 1.07440901, "epoch": 0.03457087028408237, "flos": 14684448533760.0, "grad_norm": 2.5290445346177712, "language_loss": 0.84063077, "learning_rate": 3.988294219149287e-06, "loss": 0.86382884, "num_input_tokens_seen": 12167530, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.8125, "step": 575, "time_per_iteration": 2.6016790866851807 }, { "auxiliary_loss_clip": 0.01250394, "auxiliary_loss_mlp": 0.01069719, "balance_loss_clip": 1.03967834, "balance_loss_mlp": 1.07450104, "epoch": 0.03463099353675034, "flos": 20011293527040.0, "grad_norm": 2.360372753406771, "language_loss": 0.84040666, "learning_rate": 3.9882533656999945e-06, "loss": 0.86360782, "num_input_tokens_seen": 12186340, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.7578125, "step": 576, "time_per_iteration": 2.657914638519287 }, { "auxiliary_loss_clip": 0.01255583, "auxiliary_loss_mlp": 0.01075404, "balance_loss_clip": 1.04567337, "balance_loss_mlp": 1.0797987, "epoch": 0.03469111678941831, "flos": 25301150490240.0, "grad_norm": 2.221085238122376, "language_loss": 0.86295062, "learning_rate": 3.988212441295054e-06, "loss": 0.88626051, "num_input_tokens_seen": 12204090, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.7578125, "step": 577, "time_per_iteration": 2.5982823371887207 }, { "auxiliary_loss_clip": 0.01254248, "auxiliary_loss_mlp": 0.01070372, "balance_loss_clip": 1.04040313, "balance_loss_mlp": 1.07701862, "epoch": 0.034751240042086275, "flos": 23259413612160.0, "grad_norm": 1.8144058714807623, "language_loss": 0.71667302, "learning_rate": 3.9881714459359255e-06, "loss": 0.73991919, "num_input_tokens_seen": 12224850, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.7734375, "step": 578, "time_per_iteration": 2.651177167892456 }, { "auxiliary_loss_clip": 0.01252834, "auxiliary_loss_mlp": 0.010757, "balance_loss_clip": 1.04408586, "balance_loss_mlp": 1.07316303, "epoch": 0.03481136329475425, "flos": 23769237300480.0, "grad_norm": 1.7115528571130005, "language_loss": 0.77305579, "learning_rate": 3.988130379624073e-06, "loss": 0.79634112, "num_input_tokens_seen": 12244935, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.796875, "step": 579, "time_per_iteration": 2.557021379470825 }, { "auxiliary_loss_clip": 0.01252929, "auxiliary_loss_mlp": 0.01077209, "balance_loss_clip": 1.04485583, "balance_loss_mlp": 1.07542765, "epoch": 0.03487148654742222, "flos": 20157521794560.0, "grad_norm": 2.016440925449469, "language_loss": 0.86392891, "learning_rate": 3.988089242360961e-06, "loss": 0.88723028, "num_input_tokens_seen": 12262140, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.7734375, "step": 580, "time_per_iteration": 2.5865638256073 }, { "auxiliary_loss_clip": 0.01258261, "auxiliary_loss_mlp": 0.01061549, "balance_loss_clip": 1.03343928, "balance_loss_mlp": 1.07540512, "epoch": 0.034931609800090184, "flos": 15669585918720.0, "grad_norm": 2.4310321614496595, "language_loss": 0.82044172, "learning_rate": 3.988048034148057e-06, "loss": 0.84363985, "num_input_tokens_seen": 12280930, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.828125, "step": 581, "time_per_iteration": 2.536848306655884 }, { "auxiliary_loss_clip": 0.0125555, "auxiliary_loss_mlp": 0.01072014, "balance_loss_clip": 1.04264069, "balance_loss_mlp": 1.07769275, "epoch": 0.034991733052758156, "flos": 16362374509440.0, "grad_norm": 2.2384844718619274, "language_loss": 0.77242154, "learning_rate": 3.988006754986834e-06, "loss": 0.79569721, "num_input_tokens_seen": 12299125, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.78125, "step": 582, "time_per_iteration": 2.5745701789855957 }, { "auxiliary_loss_clip": 0.01253605, "auxiliary_loss_mlp": 0.01078858, "balance_loss_clip": 1.04763722, "balance_loss_mlp": 1.07812428, "epoch": 0.03505185630542612, "flos": 19387309438080.0, "grad_norm": 1.9765778929831261, "language_loss": 0.87503493, "learning_rate": 3.987965404878763e-06, "loss": 0.8983596, "num_input_tokens_seen": 12316905, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.7578125, "step": 583, "time_per_iteration": 2.5360467433929443 }, { "auxiliary_loss_clip": 0.0125876, "auxiliary_loss_mlp": 0.01087096, "balance_loss_clip": 1.05607748, "balance_loss_mlp": 1.07582653, "epoch": 0.03511197955809409, "flos": 21323828401920.0, "grad_norm": 2.9038608173796314, "language_loss": 0.80884755, "learning_rate": 3.987923983825321e-06, "loss": 0.83230609, "num_input_tokens_seen": 12335070, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.828125, "step": 584, "time_per_iteration": 2.6747288703918457 }, { "auxiliary_loss_clip": 0.01254894, "auxiliary_loss_mlp": 0.01070997, "balance_loss_clip": 1.04062295, "balance_loss_mlp": 1.07348275, "epoch": 0.035172102810762065, "flos": 14136595320960.0, "grad_norm": 2.8886426050675262, "language_loss": 0.92062879, "learning_rate": 3.9878824918279845e-06, "loss": 0.94388771, "num_input_tokens_seen": 12350315, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.8125, "step": 585, "time_per_iteration": 2.5379321575164795 }, { "auxiliary_loss_clip": 0.01254764, "auxiliary_loss_mlp": 0.01072793, "balance_loss_clip": 1.04241872, "balance_loss_mlp": 1.07538533, "epoch": 0.03523222606343003, "flos": 20296890564480.0, "grad_norm": 2.1092000545298277, "language_loss": 0.87376904, "learning_rate": 3.9878409288882364e-06, "loss": 0.89704466, "num_input_tokens_seen": 12366030, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.796875, "step": 586, "time_per_iteration": 2.6093432903289795 }, { "auxiliary_loss_clip": 0.0126058, "auxiliary_loss_mlp": 0.01076126, "balance_loss_clip": 1.04672849, "balance_loss_mlp": 1.07813263, "epoch": 0.035292349316098, "flos": 20375822701440.0, "grad_norm": 2.0867426805524483, "language_loss": 0.7615248, "learning_rate": 3.987799295007558e-06, "loss": 0.78489184, "num_input_tokens_seen": 12384895, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.828125, "step": 587, "time_per_iteration": 2.5078279972076416 }, { "auxiliary_loss_clip": 0.01259371, "auxiliary_loss_mlp": 0.01064539, "balance_loss_clip": 1.03280544, "balance_loss_mlp": 1.07510114, "epoch": 0.03535247256876597, "flos": 21468871520640.0, "grad_norm": 1.7115232132964906, "language_loss": 0.78523242, "learning_rate": 3.987757590187436e-06, "loss": 0.80847156, "num_input_tokens_seen": 12404980, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.84375, "step": 588, "time_per_iteration": 2.5528829097747803 }, { "auxiliary_loss_clip": 0.01257323, "auxiliary_loss_mlp": 0.01074101, "balance_loss_clip": 1.04067504, "balance_loss_mlp": 1.07407963, "epoch": 0.03541259582143394, "flos": 23623044946560.0, "grad_norm": 3.188774345250744, "language_loss": 0.93589193, "learning_rate": 3.987715814429359e-06, "loss": 0.9592061, "num_input_tokens_seen": 12423835, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 1.828125, "step": 589, "time_per_iteration": 2.532121181488037 }, { "auxiliary_loss_clip": 0.0125695, "auxiliary_loss_mlp": 0.01074832, "balance_loss_clip": 1.04481471, "balance_loss_mlp": 1.07394385, "epoch": 0.03547271907410191, "flos": 33726367768320.0, "grad_norm": 2.392619330089972, "language_loss": 0.835181, "learning_rate": 3.987673967734818e-06, "loss": 0.85849875, "num_input_tokens_seen": 12443135, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.828125, "step": 590, "time_per_iteration": 2.851731777191162 }, { "auxiliary_loss_clip": 0.01250621, "auxiliary_loss_mlp": 0.01064111, "balance_loss_clip": 1.03573966, "balance_loss_mlp": 1.07481027, "epoch": 0.03553284232676988, "flos": 21142695093120.0, "grad_norm": 2.106073243781163, "language_loss": 0.86674321, "learning_rate": 3.987632050105306e-06, "loss": 0.88989055, "num_input_tokens_seen": 12462895, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.7578125, "step": 591, "time_per_iteration": 2.595834970474243 }, { "auxiliary_loss_clip": 0.0125366, "auxiliary_loss_mlp": 0.01077788, "balance_loss_clip": 1.04655504, "balance_loss_mlp": 1.07201409, "epoch": 0.03559296557943785, "flos": 20046593617920.0, "grad_norm": 1.8290125648519058, "language_loss": 0.826406, "learning_rate": 3.987590061542319e-06, "loss": 0.84972054, "num_input_tokens_seen": 12481515, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.8125, "step": 592, "time_per_iteration": 2.5674922466278076 }, { "auxiliary_loss_clip": 0.01124476, "auxiliary_loss_mlp": 0.01005617, "balance_loss_clip": 1.00025272, "balance_loss_mlp": 1.04158902, "epoch": 0.035653088832105814, "flos": 60334597244160.0, "grad_norm": 0.8734091938727091, "language_loss": 0.5988313, "learning_rate": 3.987548002047354e-06, "loss": 0.62013227, "num_input_tokens_seen": 12548220, "router_z_loss_clip": 0.05371094, "router_z_loss_mlp": 0.828125, "step": 593, "time_per_iteration": 3.184910297393799 }, { "auxiliary_loss_clip": 0.01254459, "auxiliary_loss_mlp": 0.01071039, "balance_loss_clip": 1.03999734, "balance_loss_mlp": 1.07876348, "epoch": 0.035713212084773786, "flos": 20113135562880.0, "grad_norm": 1.8785343920072017, "language_loss": 0.86807835, "learning_rate": 3.987505871621915e-06, "loss": 0.89133334, "num_input_tokens_seen": 12566105, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.7578125, "step": 594, "time_per_iteration": 2.5623509883880615 }, { "auxiliary_loss_clip": 0.01252901, "auxiliary_loss_mlp": 0.0107616, "balance_loss_clip": 1.04595208, "balance_loss_mlp": 1.0722363, "epoch": 0.03577333533744176, "flos": 26285785084800.0, "grad_norm": 1.981014882682513, "language_loss": 0.83206594, "learning_rate": 3.987463670267502e-06, "loss": 0.85535657, "num_input_tokens_seen": 12586680, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.8046875, "step": 595, "time_per_iteration": 2.726280450820923 }, { "auxiliary_loss_clip": 0.01252039, "auxiliary_loss_mlp": 0.01075668, "balance_loss_clip": 1.04517448, "balance_loss_mlp": 1.07537842, "epoch": 0.035833458590109724, "flos": 10889732211840.0, "grad_norm": 2.3920979328145013, "language_loss": 0.95476907, "learning_rate": 3.987421397985625e-06, "loss": 0.97804606, "num_input_tokens_seen": 12601605, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.765625, "step": 596, "time_per_iteration": 2.623023748397827 }, { "auxiliary_loss_clip": 0.01254721, "auxiliary_loss_mlp": 0.01071942, "balance_loss_clip": 1.04330778, "balance_loss_mlp": 1.07654369, "epoch": 0.035893581842777696, "flos": 22090198003200.0, "grad_norm": 2.2185762948991683, "language_loss": 0.8221637, "learning_rate": 3.98737905477779e-06, "loss": 0.84543031, "num_input_tokens_seen": 12620365, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.78125, "step": 597, "time_per_iteration": 2.7191662788391113 }, { "auxiliary_loss_clip": 0.01250743, "auxiliary_loss_mlp": 0.0106928, "balance_loss_clip": 1.03957331, "balance_loss_mlp": 1.07673788, "epoch": 0.03595370509544566, "flos": 23038347358080.0, "grad_norm": 1.5544051352076438, "language_loss": 0.8114022, "learning_rate": 3.987336640645508e-06, "loss": 0.83460248, "num_input_tokens_seen": 12641140, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.734375, "step": 598, "time_per_iteration": 2.587156295776367 }, { "auxiliary_loss_clip": 0.01244923, "auxiliary_loss_mlp": 0.01067666, "balance_loss_clip": 1.03705323, "balance_loss_mlp": 1.07537317, "epoch": 0.03601382834811363, "flos": 20777734955520.0, "grad_norm": 1.7996084766273048, "language_loss": 0.81064773, "learning_rate": 3.987294155590295e-06, "loss": 0.83377361, "num_input_tokens_seen": 12661080, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.6953125, "step": 599, "time_per_iteration": 5.519532203674316 }, { "auxiliary_loss_clip": 0.01253921, "auxiliary_loss_mlp": 0.01074083, "balance_loss_clip": 1.04500782, "balance_loss_mlp": 1.07392299, "epoch": 0.036073951600781605, "flos": 23951627585280.0, "grad_norm": 2.0231494304494766, "language_loss": 0.85986555, "learning_rate": 3.987251599613664e-06, "loss": 0.88314557, "num_input_tokens_seen": 12678270, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.796875, "step": 600, "time_per_iteration": 3.999175786972046 }, { "auxiliary_loss_clip": 0.01248724, "auxiliary_loss_mlp": 0.01083079, "balance_loss_clip": 1.05277634, "balance_loss_mlp": 1.07220018, "epoch": 0.03613407485344957, "flos": 18912283050240.0, "grad_norm": 1.9506583047170583, "language_loss": 0.81551719, "learning_rate": 3.987208972717135e-06, "loss": 0.83883518, "num_input_tokens_seen": 12697295, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.765625, "step": 601, "time_per_iteration": 4.959498167037964 }, { "auxiliary_loss_clip": 0.0124463, "auxiliary_loss_mlp": 0.01063359, "balance_loss_clip": 1.03343797, "balance_loss_mlp": 1.07146096, "epoch": 0.03619419810611754, "flos": 23038526926080.0, "grad_norm": 4.016573903919817, "language_loss": 0.74605942, "learning_rate": 3.987166274902231e-06, "loss": 0.76913941, "num_input_tokens_seen": 12716165, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.734375, "step": 602, "time_per_iteration": 2.7005722522735596 }, { "auxiliary_loss_clip": 0.01243003, "auxiliary_loss_mlp": 0.01057861, "balance_loss_clip": 1.02855897, "balance_loss_mlp": 1.07026458, "epoch": 0.03625432135878551, "flos": 29457774293760.0, "grad_norm": 1.8381539610488435, "language_loss": 0.79941344, "learning_rate": 3.987123506170473e-06, "loss": 0.82242215, "num_input_tokens_seen": 12735475, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.7265625, "step": 603, "time_per_iteration": 2.6152331829071045 }, { "auxiliary_loss_clip": 0.01248646, "auxiliary_loss_mlp": 0.01060773, "balance_loss_clip": 1.03216243, "balance_loss_mlp": 1.07572985, "epoch": 0.03631444461145348, "flos": 23508525409920.0, "grad_norm": 2.204649803027775, "language_loss": 0.86293834, "learning_rate": 3.987080666523389e-06, "loss": 0.88603258, "num_input_tokens_seen": 12754540, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.7265625, "step": 604, "time_per_iteration": 2.6081552505493164 }, { "auxiliary_loss_clip": 0.01251843, "auxiliary_loss_mlp": 0.01065747, "balance_loss_clip": 1.03451395, "balance_loss_mlp": 1.07665348, "epoch": 0.03637456786412145, "flos": 16618130323200.0, "grad_norm": 3.6455571623211145, "language_loss": 0.80466646, "learning_rate": 3.987037755962506e-06, "loss": 0.82784235, "num_input_tokens_seen": 12773050, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.75, "step": 605, "time_per_iteration": 2.5297956466674805 }, { "auxiliary_loss_clip": 0.01247056, "auxiliary_loss_mlp": 0.01067745, "balance_loss_clip": 1.03976619, "balance_loss_mlp": 1.07304382, "epoch": 0.03643469111678942, "flos": 15851832549120.0, "grad_norm": 2.5405085617992547, "language_loss": 0.85647821, "learning_rate": 3.986994774489359e-06, "loss": 0.87962627, "num_input_tokens_seen": 12791240, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.7421875, "step": 606, "time_per_iteration": 2.6443703174591064 }, { "auxiliary_loss_clip": 0.01252965, "auxiliary_loss_mlp": 0.01072994, "balance_loss_clip": 1.04288125, "balance_loss_mlp": 1.07535303, "epoch": 0.03649481436945739, "flos": 23620387340160.0, "grad_norm": 1.996721871719595, "language_loss": 0.82262874, "learning_rate": 3.986951722105479e-06, "loss": 0.84588826, "num_input_tokens_seen": 12812245, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.7734375, "step": 607, "time_per_iteration": 2.6321306228637695 }, { "auxiliary_loss_clip": 0.01247452, "auxiliary_loss_mlp": 0.0106507, "balance_loss_clip": 1.03655529, "balance_loss_mlp": 1.07425964, "epoch": 0.036554937622125354, "flos": 21755581879680.0, "grad_norm": 2.3288600929306478, "language_loss": 0.83045191, "learning_rate": 3.986908598812402e-06, "loss": 0.85357714, "num_input_tokens_seen": 12831085, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.734375, "step": 608, "time_per_iteration": 2.5705649852752686 }, { "auxiliary_loss_clip": 0.01250951, "auxiliary_loss_mlp": 0.01063226, "balance_loss_clip": 1.03146839, "balance_loss_mlp": 1.07616568, "epoch": 0.036615060874793326, "flos": 17819772935040.0, "grad_norm": 2.1823678328483953, "language_loss": 0.81750864, "learning_rate": 3.986865404611669e-06, "loss": 0.84065044, "num_input_tokens_seen": 12849115, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.75, "step": 609, "time_per_iteration": 2.4959473609924316 }, { "auxiliary_loss_clip": 0.01258082, "auxiliary_loss_mlp": 0.0108221, "balance_loss_clip": 1.05381453, "balance_loss_mlp": 1.08276927, "epoch": 0.0366751841274613, "flos": 26753808320640.0, "grad_norm": 1.97555122511207, "language_loss": 0.7924962, "learning_rate": 3.98682213950482e-06, "loss": 0.81589913, "num_input_tokens_seen": 12868005, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.75, "step": 610, "time_per_iteration": 2.6227381229400635 }, { "auxiliary_loss_clip": 0.01257819, "auxiliary_loss_mlp": 0.01078682, "balance_loss_clip": 1.04876018, "balance_loss_mlp": 1.07960105, "epoch": 0.03673530738012926, "flos": 22196960202240.0, "grad_norm": 1.9560682213369323, "language_loss": 0.87392241, "learning_rate": 3.986778803493401e-06, "loss": 0.89728737, "num_input_tokens_seen": 12886890, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.78125, "step": 611, "time_per_iteration": 2.569667100906372 }, { "auxiliary_loss_clip": 0.01251092, "auxiliary_loss_mlp": 0.01089766, "balance_loss_clip": 1.06024957, "balance_loss_mlp": 1.0779655, "epoch": 0.036795430632797235, "flos": 24681655601280.0, "grad_norm": 2.2611612668333234, "language_loss": 0.7278322, "learning_rate": 3.986735396578956e-06, "loss": 0.75124079, "num_input_tokens_seen": 12906130, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.734375, "step": 612, "time_per_iteration": 2.613240957260132 }, { "auxiliary_loss_clip": 0.01251273, "auxiliary_loss_mlp": 0.01068677, "balance_loss_clip": 1.0388751, "balance_loss_mlp": 1.07561207, "epoch": 0.0368555538854652, "flos": 17748921358080.0, "grad_norm": 3.2271709962898183, "language_loss": 0.78807843, "learning_rate": 3.986691918763034e-06, "loss": 0.81127799, "num_input_tokens_seen": 12925260, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.7578125, "step": 613, "time_per_iteration": 2.523176908493042 }, { "auxiliary_loss_clip": 0.01245631, "auxiliary_loss_mlp": 0.01077629, "balance_loss_clip": 1.04965091, "balance_loss_mlp": 1.0748136, "epoch": 0.03691567713813317, "flos": 20594554571520.0, "grad_norm": 1.8458518196252214, "language_loss": 0.93698722, "learning_rate": 3.98664837004719e-06, "loss": 0.96021974, "num_input_tokens_seen": 12944590, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.7109375, "step": 614, "time_per_iteration": 2.5507683753967285 }, { "auxiliary_loss_clip": 0.01257998, "auxiliary_loss_mlp": 0.0107764, "balance_loss_clip": 1.0458585, "balance_loss_mlp": 1.08083868, "epoch": 0.036975800390801145, "flos": 33650380546560.0, "grad_norm": 2.8852163202186567, "language_loss": 0.72728878, "learning_rate": 3.986604750432974e-06, "loss": 0.75064516, "num_input_tokens_seen": 12964785, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.7734375, "step": 615, "time_per_iteration": 2.6319663524627686 }, { "auxiliary_loss_clip": 0.01251778, "auxiliary_loss_mlp": 0.01068086, "balance_loss_clip": 1.03831923, "balance_loss_mlp": 1.07460856, "epoch": 0.03703592364346911, "flos": 28293694329600.0, "grad_norm": 1.8325130201385704, "language_loss": 0.81394964, "learning_rate": 3.986561059921947e-06, "loss": 0.83714825, "num_input_tokens_seen": 12986705, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.7734375, "step": 616, "time_per_iteration": 2.5895326137542725 }, { "auxiliary_loss_clip": 0.01250991, "auxiliary_loss_mlp": 0.01067151, "balance_loss_clip": 1.03913641, "balance_loss_mlp": 1.07636094, "epoch": 0.03709604689613708, "flos": 31504215853440.0, "grad_norm": 2.01443959285179, "language_loss": 0.67716378, "learning_rate": 3.986517298515664e-06, "loss": 0.70034522, "num_input_tokens_seen": 13010560, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.75, "step": 617, "time_per_iteration": 2.676222324371338 }, { "auxiliary_loss_clip": 0.01254947, "auxiliary_loss_mlp": 0.01072442, "balance_loss_clip": 1.04120898, "balance_loss_mlp": 1.07986903, "epoch": 0.03715617014880505, "flos": 19609381272960.0, "grad_norm": 2.004023106516082, "language_loss": 0.80116558, "learning_rate": 3.9864734662156884e-06, "loss": 0.82443947, "num_input_tokens_seen": 13028935, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.75, "step": 618, "time_per_iteration": 2.5659215450286865 }, { "auxiliary_loss_clip": 0.01256262, "auxiliary_loss_mlp": 0.01074665, "balance_loss_clip": 1.0427649, "balance_loss_mlp": 1.07802653, "epoch": 0.03721629340147302, "flos": 15924192497280.0, "grad_norm": 2.0730634155402963, "language_loss": 0.91515291, "learning_rate": 3.986429563023585e-06, "loss": 0.93846214, "num_input_tokens_seen": 13046000, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.78125, "step": 619, "time_per_iteration": 2.562534809112549 }, { "auxiliary_loss_clip": 0.01251215, "auxiliary_loss_mlp": 0.01081344, "balance_loss_clip": 1.05223322, "balance_loss_mlp": 1.07864904, "epoch": 0.03727641665414099, "flos": 21104090951040.0, "grad_norm": 2.1985938834612906, "language_loss": 0.9406476, "learning_rate": 3.986385588940921e-06, "loss": 0.96397316, "num_input_tokens_seen": 13062995, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.7265625, "step": 620, "time_per_iteration": 2.565897226333618 }, { "auxiliary_loss_clip": 0.0124985, "auxiliary_loss_mlp": 0.01076282, "balance_loss_clip": 1.04492998, "balance_loss_mlp": 1.07573009, "epoch": 0.037336539906808956, "flos": 24131683486080.0, "grad_norm": 2.053562896586196, "language_loss": 0.76765794, "learning_rate": 3.986341543969264e-06, "loss": 0.79091924, "num_input_tokens_seen": 13084120, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.7421875, "step": 621, "time_per_iteration": 2.6217050552368164 }, { "auxiliary_loss_clip": 0.01246823, "auxiliary_loss_mlp": 0.01059443, "balance_loss_clip": 1.02942586, "balance_loss_mlp": 1.07682157, "epoch": 0.03739666315947693, "flos": 22346384780160.0, "grad_norm": 3.679124322035662, "language_loss": 0.86665416, "learning_rate": 3.986297428110187e-06, "loss": 0.88971674, "num_input_tokens_seen": 13100035, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.703125, "step": 622, "time_per_iteration": 2.576777458190918 }, { "auxiliary_loss_clip": 0.01247252, "auxiliary_loss_mlp": 0.01063766, "balance_loss_clip": 1.03409529, "balance_loss_mlp": 1.07478857, "epoch": 0.0374567864121449, "flos": 20449511452800.0, "grad_norm": 4.016174248563173, "language_loss": 0.89910966, "learning_rate": 3.986253241365264e-06, "loss": 0.92221981, "num_input_tokens_seen": 13118070, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.71875, "step": 623, "time_per_iteration": 2.6833608150482178 }, { "auxiliary_loss_clip": 0.0125151, "auxiliary_loss_mlp": 0.01083053, "balance_loss_clip": 1.05210674, "balance_loss_mlp": 1.0786463, "epoch": 0.037516909664812866, "flos": 19208043636480.0, "grad_norm": 1.8358446408368834, "language_loss": 0.83858961, "learning_rate": 3.986208983736073e-06, "loss": 0.86193526, "num_input_tokens_seen": 13136355, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.7265625, "step": 624, "time_per_iteration": 2.6495885848999023 }, { "auxiliary_loss_clip": 0.01253262, "auxiliary_loss_mlp": 0.01070455, "balance_loss_clip": 1.03989029, "balance_loss_mlp": 1.07559419, "epoch": 0.03757703291748084, "flos": 35005218664320.0, "grad_norm": 2.125493977213546, "language_loss": 0.66987264, "learning_rate": 3.986164655224191e-06, "loss": 0.69310987, "num_input_tokens_seen": 13155435, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.78125, "step": 625, "time_per_iteration": 2.753251075744629 }, { "auxiliary_loss_clip": 0.01244996, "auxiliary_loss_mlp": 0.01063374, "balance_loss_clip": 1.03259444, "balance_loss_mlp": 1.07525873, "epoch": 0.0376371561701488, "flos": 25483899911040.0, "grad_norm": 2.201002688946779, "language_loss": 0.77225691, "learning_rate": 3.986120255831202e-06, "loss": 0.7953406, "num_input_tokens_seen": 13174295, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.703125, "step": 626, "time_per_iteration": 2.581388235092163 }, { "auxiliary_loss_clip": 0.01250062, "auxiliary_loss_mlp": 0.01070661, "balance_loss_clip": 1.0412643, "balance_loss_mlp": 1.07675552, "epoch": 0.037697279422816775, "flos": 18185630912640.0, "grad_norm": 2.557235574195682, "language_loss": 0.81166303, "learning_rate": 3.986075785558691e-06, "loss": 0.83487022, "num_input_tokens_seen": 13192500, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.734375, "step": 627, "time_per_iteration": 2.5725533962249756 }, { "auxiliary_loss_clip": 0.01253343, "auxiliary_loss_mlp": 0.01079322, "balance_loss_clip": 1.04931676, "balance_loss_mlp": 1.07894433, "epoch": 0.03775740267548475, "flos": 24644272521600.0, "grad_norm": 1.631280397167843, "language_loss": 0.88448453, "learning_rate": 3.986031244408243e-06, "loss": 0.90781116, "num_input_tokens_seen": 13213470, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.75, "step": 628, "time_per_iteration": 2.617056369781494 }, { "auxiliary_loss_clip": 0.01246402, "auxiliary_loss_mlp": 0.01071071, "balance_loss_clip": 1.04179347, "balance_loss_mlp": 1.07220054, "epoch": 0.03781752592815271, "flos": 21288205088640.0, "grad_norm": 2.2649971783375475, "language_loss": 0.79783881, "learning_rate": 3.985986632381449e-06, "loss": 0.82101351, "num_input_tokens_seen": 13232365, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.7421875, "step": 629, "time_per_iteration": 2.600954055786133 }, { "auxiliary_loss_clip": 0.012479, "auxiliary_loss_mlp": 0.01062757, "balance_loss_clip": 1.03464782, "balance_loss_mlp": 1.07465816, "epoch": 0.037877649180820684, "flos": 22089623385600.0, "grad_norm": 1.7078636044782585, "language_loss": 0.76839006, "learning_rate": 3.9859419494799e-06, "loss": 0.79149663, "num_input_tokens_seen": 13251920, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.734375, "step": 630, "time_per_iteration": 2.627903699874878 }, { "auxiliary_loss_clip": 0.01253161, "auxiliary_loss_mlp": 0.01063501, "balance_loss_clip": 1.03508186, "balance_loss_mlp": 1.07688606, "epoch": 0.03793777243348865, "flos": 14501339976960.0, "grad_norm": 2.437429811776212, "language_loss": 0.90812445, "learning_rate": 3.985897195705192e-06, "loss": 0.93129098, "num_input_tokens_seen": 13267440, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.765625, "step": 631, "time_per_iteration": 2.5817153453826904 }, { "auxiliary_loss_clip": 0.01242061, "auxiliary_loss_mlp": 0.01079265, "balance_loss_clip": 1.0482471, "balance_loss_mlp": 1.07334578, "epoch": 0.03799789568615662, "flos": 21908418249600.0, "grad_norm": 1.6138215366737259, "language_loss": 0.92008108, "learning_rate": 3.985852371058921e-06, "loss": 0.94329441, "num_input_tokens_seen": 13287850, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.6875, "step": 632, "time_per_iteration": 2.5959043502807617 }, { "auxiliary_loss_clip": 0.0124633, "auxiliary_loss_mlp": 0.01062804, "balance_loss_clip": 1.03450394, "balance_loss_mlp": 1.07576632, "epoch": 0.03805801893882459, "flos": 24827021942400.0, "grad_norm": 1.9416139464929654, "language_loss": 0.83055454, "learning_rate": 3.985807475542687e-06, "loss": 0.85364586, "num_input_tokens_seen": 13307760, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.703125, "step": 633, "time_per_iteration": 2.7053534984588623 }, { "auxiliary_loss_clip": 0.01246472, "auxiliary_loss_mlp": 0.01064497, "balance_loss_clip": 1.03504062, "balance_loss_mlp": 1.0733633, "epoch": 0.03811814219149256, "flos": 30482952364800.0, "grad_norm": 1.580312738588131, "language_loss": 0.69583958, "learning_rate": 3.985762509158093e-06, "loss": 0.71894932, "num_input_tokens_seen": 13331230, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.734375, "step": 634, "time_per_iteration": 2.7008888721466064 }, { "auxiliary_loss_clip": 0.01123403, "auxiliary_loss_mlp": 0.01006682, "balance_loss_clip": 1.00131798, "balance_loss_mlp": 1.04435003, "epoch": 0.03817826544416053, "flos": 66992577379200.0, "grad_norm": 0.8716037276095885, "language_loss": 0.61634952, "learning_rate": 3.985717471906742e-06, "loss": 0.63765043, "num_input_tokens_seen": 13394760, "router_z_loss_clip": 0.05371094, "router_z_loss_mlp": 0.7890625, "step": 635, "time_per_iteration": 3.1649985313415527 }, { "auxiliary_loss_clip": 0.01243022, "auxiliary_loss_mlp": 0.01066858, "balance_loss_clip": 1.03847384, "balance_loss_mlp": 1.07240057, "epoch": 0.038238388696828496, "flos": 20485350247680.0, "grad_norm": 2.253237364878562, "language_loss": 0.83707213, "learning_rate": 3.985672363790243e-06, "loss": 0.86017096, "num_input_tokens_seen": 13412775, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.703125, "step": 636, "time_per_iteration": 2.577866792678833 }, { "auxiliary_loss_clip": 0.0124749, "auxiliary_loss_mlp": 0.01080514, "balance_loss_clip": 1.05170107, "balance_loss_mlp": 1.07584453, "epoch": 0.03829851194949647, "flos": 17965893461760.0, "grad_norm": 2.468337173306768, "language_loss": 0.79303133, "learning_rate": 3.985627184810206e-06, "loss": 0.81631142, "num_input_tokens_seen": 13427835, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.71875, "step": 637, "time_per_iteration": 2.5275189876556396 }, { "auxiliary_loss_clip": 0.01245178, "auxiliary_loss_mlp": 0.01077131, "balance_loss_clip": 1.04782987, "balance_loss_mlp": 1.07182407, "epoch": 0.03835863520216444, "flos": 22456522857600.0, "grad_norm": 1.9710040869476455, "language_loss": 0.83637178, "learning_rate": 3.985581934968241e-06, "loss": 0.85959488, "num_input_tokens_seen": 13447295, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.734375, "step": 638, "time_per_iteration": 2.6683359146118164 }, { "auxiliary_loss_clip": 0.01251155, "auxiliary_loss_mlp": 0.01068345, "balance_loss_clip": 1.03857791, "balance_loss_mlp": 1.07452559, "epoch": 0.038418758454832405, "flos": 22164425458560.0, "grad_norm": 2.139445115217317, "language_loss": 0.69460773, "learning_rate": 3.985536614265964e-06, "loss": 0.71780276, "num_input_tokens_seen": 13468455, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.765625, "step": 639, "time_per_iteration": 2.6038737297058105 }, { "auxiliary_loss_clip": 0.01247341, "auxiliary_loss_mlp": 0.01076491, "balance_loss_clip": 1.0464263, "balance_loss_mlp": 1.07420635, "epoch": 0.03847888170750038, "flos": 22747435107840.0, "grad_norm": 2.953632012364999, "language_loss": 0.85188246, "learning_rate": 3.985491222704994e-06, "loss": 0.87512076, "num_input_tokens_seen": 13489085, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.7265625, "step": 640, "time_per_iteration": 5.512648820877075 }, { "auxiliary_loss_clip": 0.01247775, "auxiliary_loss_mlp": 0.01074574, "balance_loss_clip": 1.04300737, "balance_loss_mlp": 1.07339716, "epoch": 0.03853900496016834, "flos": 22711201263360.0, "grad_norm": 2.0474761756140736, "language_loss": 0.81865066, "learning_rate": 3.985445760286949e-06, "loss": 0.84187412, "num_input_tokens_seen": 13509120, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.7421875, "step": 641, "time_per_iteration": 4.1141016483306885 }, { "auxiliary_loss_clip": 0.0111886, "auxiliary_loss_mlp": 0.01008235, "balance_loss_clip": 1.00265586, "balance_loss_mlp": 1.04150319, "epoch": 0.038599128212836314, "flos": 70399136355840.0, "grad_norm": 0.9268600538029453, "language_loss": 0.65525591, "learning_rate": 3.985400227013452e-06, "loss": 0.6765269, "num_input_tokens_seen": 13562005, "router_z_loss_clip": 0.0559082, "router_z_loss_mlp": 0.7734375, "step": 642, "time_per_iteration": 4.689119338989258 }, { "auxiliary_loss_clip": 0.01242457, "auxiliary_loss_mlp": 0.01057351, "balance_loss_clip": 1.02959931, "balance_loss_mlp": 1.07111597, "epoch": 0.03865925146550429, "flos": 23295144666240.0, "grad_norm": 2.1746473285813983, "language_loss": 0.79471529, "learning_rate": 3.985354622886128e-06, "loss": 0.81771332, "num_input_tokens_seen": 13582185, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.7109375, "step": 643, "time_per_iteration": 2.5945208072662354 }, { "auxiliary_loss_clip": 0.01240323, "auxiliary_loss_mlp": 0.01072743, "balance_loss_clip": 1.04558754, "balance_loss_mlp": 1.06959307, "epoch": 0.03871937471817225, "flos": 21430446946560.0, "grad_norm": 1.7181746798581157, "language_loss": 0.82690012, "learning_rate": 3.985308947906604e-06, "loss": 0.85003078, "num_input_tokens_seen": 13599555, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.703125, "step": 644, "time_per_iteration": 2.5668118000030518 }, { "auxiliary_loss_clip": 0.01244028, "auxiliary_loss_mlp": 0.01060877, "balance_loss_clip": 1.0318737, "balance_loss_mlp": 1.0713197, "epoch": 0.038779497970840224, "flos": 34277309550720.0, "grad_norm": 2.237294656420358, "language_loss": 0.82143736, "learning_rate": 3.985263202076511e-06, "loss": 0.84448642, "num_input_tokens_seen": 13621160, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.7265625, "step": 645, "time_per_iteration": 2.739872455596924 }, { "auxiliary_loss_clip": 0.01247727, "auxiliary_loss_mlp": 0.01066697, "balance_loss_clip": 1.03760958, "balance_loss_mlp": 1.07302999, "epoch": 0.03883962122350819, "flos": 22748189293440.0, "grad_norm": 2.3832241315466187, "language_loss": 0.81823689, "learning_rate": 3.985217385397481e-06, "loss": 0.84138119, "num_input_tokens_seen": 13641915, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.75, "step": 646, "time_per_iteration": 2.5950140953063965 }, { "auxiliary_loss_clip": 0.01247073, "auxiliary_loss_mlp": 0.01071634, "balance_loss_clip": 1.04383445, "balance_loss_mlp": 1.07640052, "epoch": 0.03889974447617616, "flos": 21945837242880.0, "grad_norm": 2.2351376408101613, "language_loss": 0.81671929, "learning_rate": 3.985171497871149e-06, "loss": 0.83990639, "num_input_tokens_seen": 13661410, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.703125, "step": 647, "time_per_iteration": 2.6046721935272217 }, { "auxiliary_loss_clip": 0.01243385, "auxiliary_loss_mlp": 0.01065753, "balance_loss_clip": 1.03868055, "balance_loss_mlp": 1.0724957, "epoch": 0.03895986772884413, "flos": 31504826384640.0, "grad_norm": 2.0400827847582605, "language_loss": 0.83933848, "learning_rate": 3.985125539499152e-06, "loss": 0.86242986, "num_input_tokens_seen": 13681705, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.7109375, "step": 648, "time_per_iteration": 2.632535696029663 }, { "auxiliary_loss_clip": 0.01242812, "auxiliary_loss_mlp": 0.01069609, "balance_loss_clip": 1.04126143, "balance_loss_mlp": 1.07241559, "epoch": 0.0390199909815121, "flos": 19901011795200.0, "grad_norm": 2.124505419062883, "language_loss": 0.84159529, "learning_rate": 3.9850795102831315e-06, "loss": 0.86471951, "num_input_tokens_seen": 13700400, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.703125, "step": 649, "time_per_iteration": 2.6083931922912598 }, { "auxiliary_loss_clip": 0.01244291, "auxiliary_loss_mlp": 0.0106308, "balance_loss_clip": 1.03461242, "balance_loss_mlp": 1.07169914, "epoch": 0.03908011423418007, "flos": 21612478095360.0, "grad_norm": 1.7472079318039213, "language_loss": 0.79691672, "learning_rate": 3.9850334102247295e-06, "loss": 0.81999052, "num_input_tokens_seen": 13720145, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.7265625, "step": 650, "time_per_iteration": 2.583803653717041 }, { "auxiliary_loss_clip": 0.01237695, "auxiliary_loss_mlp": 0.01068799, "balance_loss_clip": 1.03953314, "balance_loss_mlp": 1.06756854, "epoch": 0.039140237486848035, "flos": 18661411486080.0, "grad_norm": 2.0665042804737697, "language_loss": 0.78664184, "learning_rate": 3.984987239325592e-06, "loss": 0.80970681, "num_input_tokens_seen": 13737500, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.703125, "step": 651, "time_per_iteration": 2.6574289798736572 }, { "auxiliary_loss_clip": 0.01241066, "auxiliary_loss_mlp": 0.01074458, "balance_loss_clip": 1.04346418, "balance_loss_mlp": 1.06999183, "epoch": 0.03920036073951601, "flos": 18661124177280.0, "grad_norm": 3.903234604318685, "language_loss": 0.87339497, "learning_rate": 3.984940997587364e-06, "loss": 0.89655024, "num_input_tokens_seen": 13754750, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.7109375, "step": 652, "time_per_iteration": 2.545186758041382 }, { "auxiliary_loss_clip": 0.01235715, "auxiliary_loss_mlp": 0.01062013, "balance_loss_clip": 1.03449976, "balance_loss_mlp": 1.06988072, "epoch": 0.03926048399218398, "flos": 31354468053120.0, "grad_norm": 2.989574164529167, "language_loss": 0.78954268, "learning_rate": 3.984894685011699e-06, "loss": 0.81251991, "num_input_tokens_seen": 13771990, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.65625, "step": 653, "time_per_iteration": 2.6410961151123047 }, { "auxiliary_loss_clip": 0.01248636, "auxiliary_loss_mlp": 0.01067434, "balance_loss_clip": 1.03901434, "balance_loss_mlp": 1.07302451, "epoch": 0.039320607244851945, "flos": 29603499770880.0, "grad_norm": 2.4170928073718723, "language_loss": 0.85770726, "learning_rate": 3.984848301600248e-06, "loss": 0.88086796, "num_input_tokens_seen": 13792750, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.7578125, "step": 654, "time_per_iteration": 2.708017110824585 }, { "auxiliary_loss_clip": 0.01113446, "auxiliary_loss_mlp": 0.0101238, "balance_loss_clip": 1.00658631, "balance_loss_mlp": 1.03559041, "epoch": 0.03938073049751992, "flos": 66534609951360.0, "grad_norm": 0.8282034470599247, "language_loss": 0.4992471, "learning_rate": 3.984801847354667e-06, "loss": 0.52050537, "num_input_tokens_seen": 13858570, "router_z_loss_clip": 0.05786133, "router_z_loss_mlp": 0.77734375, "step": 655, "time_per_iteration": 3.253936290740967 }, { "auxiliary_loss_clip": 0.01238904, "auxiliary_loss_mlp": 0.01064468, "balance_loss_clip": 1.03693104, "balance_loss_mlp": 1.07282269, "epoch": 0.03944085375018788, "flos": 23367827836800.0, "grad_norm": 1.990628233032477, "language_loss": 0.8034656, "learning_rate": 3.984755322276614e-06, "loss": 0.82649934, "num_input_tokens_seen": 13876335, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.6640625, "step": 656, "time_per_iteration": 2.5678696632385254 }, { "auxiliary_loss_clip": 0.01247701, "auxiliary_loss_mlp": 0.01085959, "balance_loss_clip": 1.05298543, "balance_loss_mlp": 1.07211685, "epoch": 0.039500977002855854, "flos": 18548292579840.0, "grad_norm": 2.6994149977086903, "language_loss": 0.76219249, "learning_rate": 3.9847087263677485e-06, "loss": 0.78552908, "num_input_tokens_seen": 13892640, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 1.75, "step": 657, "time_per_iteration": 2.482665538787842 }, { "auxiliary_loss_clip": 0.01245407, "auxiliary_loss_mlp": 0.01065785, "balance_loss_clip": 1.03684139, "balance_loss_mlp": 1.07259047, "epoch": 0.039561100255523826, "flos": 25338174433920.0, "grad_norm": 2.9783323955145065, "language_loss": 0.81757933, "learning_rate": 3.984662059629734e-06, "loss": 0.84069121, "num_input_tokens_seen": 13910085, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.734375, "step": 658, "time_per_iteration": 2.5366644859313965 }, { "auxiliary_loss_clip": 0.01237985, "auxiliary_loss_mlp": 0.0107071, "balance_loss_clip": 1.0393343, "balance_loss_mlp": 1.06925571, "epoch": 0.03962122350819179, "flos": 18219889509120.0, "grad_norm": 2.006031182843564, "language_loss": 0.9089309, "learning_rate": 3.984615322064235e-06, "loss": 0.9320178, "num_input_tokens_seen": 13928800, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.6875, "step": 659, "time_per_iteration": 2.577533006668091 }, { "auxiliary_loss_clip": 0.01244405, "auxiliary_loss_mlp": 0.01064868, "balance_loss_clip": 1.03560185, "balance_loss_mlp": 1.07038045, "epoch": 0.03968134676085976, "flos": 20522230536960.0, "grad_norm": 2.0979118767493556, "language_loss": 0.78834689, "learning_rate": 3.9845685136729215e-06, "loss": 0.81143957, "num_input_tokens_seen": 13948325, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.7421875, "step": 660, "time_per_iteration": 2.5905871391296387 }, { "auxiliary_loss_clip": 0.01240732, "auxiliary_loss_mlp": 0.01066016, "balance_loss_clip": 1.03692913, "balance_loss_mlp": 1.07247376, "epoch": 0.03974147001352773, "flos": 22422587483520.0, "grad_norm": 1.659075382352201, "language_loss": 0.81590068, "learning_rate": 3.984521634457461e-06, "loss": 0.83896816, "num_input_tokens_seen": 13969090, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.6875, "step": 661, "time_per_iteration": 2.6344351768493652 }, { "auxiliary_loss_clip": 0.01113215, "auxiliary_loss_mlp": 0.01006055, "balance_loss_clip": 1.00078607, "balance_loss_mlp": 1.03491712, "epoch": 0.0398015932661957, "flos": 71128769322240.0, "grad_norm": 0.913898970100561, "language_loss": 0.69446462, "learning_rate": 3.98447468441953e-06, "loss": 0.71565729, "num_input_tokens_seen": 14037555, "router_z_loss_clip": 0.05273438, "router_z_loss_mlp": 0.78125, "step": 662, "time_per_iteration": 3.33259916305542 }, { "auxiliary_loss_clip": 0.01243828, "auxiliary_loss_mlp": 0.0106784, "balance_loss_clip": 1.03875279, "balance_loss_mlp": 1.07314372, "epoch": 0.03986171651886367, "flos": 16800951571200.0, "grad_norm": 1.8950568315809648, "language_loss": 0.82751703, "learning_rate": 3.984427663560801e-06, "loss": 0.85063374, "num_input_tokens_seen": 14055765, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.703125, "step": 663, "time_per_iteration": 2.519582986831665 }, { "auxiliary_loss_clip": 0.01248106, "auxiliary_loss_mlp": 0.01059854, "balance_loss_clip": 1.02976608, "balance_loss_mlp": 1.07350922, "epoch": 0.03992183977153164, "flos": 24535068197760.0, "grad_norm": 2.2084867119477334, "language_loss": 0.87381887, "learning_rate": 3.984380571882954e-06, "loss": 0.89689845, "num_input_tokens_seen": 14074195, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.7421875, "step": 664, "time_per_iteration": 2.590404510498047 }, { "auxiliary_loss_clip": 0.01235653, "auxiliary_loss_mlp": 0.01058955, "balance_loss_clip": 1.02948618, "balance_loss_mlp": 1.07021153, "epoch": 0.03998196302419961, "flos": 15595897167360.0, "grad_norm": 2.352779520651158, "language_loss": 0.8471725, "learning_rate": 3.984333409387668e-06, "loss": 0.87011856, "num_input_tokens_seen": 14090215, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.65625, "step": 665, "time_per_iteration": 2.4688680171966553 }, { "auxiliary_loss_clip": 0.01251381, "auxiliary_loss_mlp": 0.01083235, "balance_loss_clip": 1.05176425, "balance_loss_mlp": 1.07495725, "epoch": 0.04004208627686758, "flos": 25305065072640.0, "grad_norm": 2.0617261125383295, "language_loss": 0.81476867, "learning_rate": 3.984286176076628e-06, "loss": 0.83811486, "num_input_tokens_seen": 14112150, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.765625, "step": 666, "time_per_iteration": 2.632394313812256 }, { "auxiliary_loss_clip": 0.01239094, "auxiliary_loss_mlp": 0.01071918, "balance_loss_clip": 1.0414722, "balance_loss_mlp": 1.06987059, "epoch": 0.04010220952953555, "flos": 23475847011840.0, "grad_norm": 1.971367992200873, "language_loss": 0.86823475, "learning_rate": 3.984238871951518e-06, "loss": 0.8913449, "num_input_tokens_seen": 14131475, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.6875, "step": 667, "time_per_iteration": 2.5645246505737305 }, { "auxiliary_loss_clip": 0.01239037, "auxiliary_loss_mlp": 0.01064218, "balance_loss_clip": 1.03532219, "balance_loss_mlp": 1.07384288, "epoch": 0.04016233278220352, "flos": 18617025254400.0, "grad_norm": 2.97148501891518, "language_loss": 0.80371606, "learning_rate": 3.984191497014026e-06, "loss": 0.82674861, "num_input_tokens_seen": 14146165, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.6484375, "step": 668, "time_per_iteration": 2.5883522033691406 }, { "auxiliary_loss_clip": 0.01109507, "auxiliary_loss_mlp": 0.01004691, "balance_loss_clip": 0.99944609, "balance_loss_mlp": 1.03064692, "epoch": 0.040222456034871484, "flos": 70906194696960.0, "grad_norm": 0.7851633589008883, "language_loss": 0.60076702, "learning_rate": 3.984144051265844e-06, "loss": 0.6219089, "num_input_tokens_seen": 14215005, "router_z_loss_clip": 0.05249023, "router_z_loss_mlp": 0.7890625, "step": 669, "time_per_iteration": 3.26953125 }, { "auxiliary_loss_clip": 0.01240575, "auxiliary_loss_mlp": 0.01063656, "balance_loss_clip": 1.03475976, "balance_loss_mlp": 1.06897712, "epoch": 0.040282579287539456, "flos": 23764712186880.0, "grad_norm": 1.809646906222931, "language_loss": 0.8599869, "learning_rate": 3.984096534708665e-06, "loss": 0.88302922, "num_input_tokens_seen": 14235510, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.71875, "step": 670, "time_per_iteration": 2.583090305328369 }, { "auxiliary_loss_clip": 0.01239172, "auxiliary_loss_mlp": 0.01069113, "balance_loss_clip": 1.0403955, "balance_loss_mlp": 1.07127297, "epoch": 0.04034270254020743, "flos": 18478518410880.0, "grad_norm": 1.8932376624530007, "language_loss": 0.74935484, "learning_rate": 3.9840489473441835e-06, "loss": 0.77243769, "num_input_tokens_seen": 14254565, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.6796875, "step": 671, "time_per_iteration": 2.5819382667541504 }, { "auxiliary_loss_clip": 0.01247087, "auxiliary_loss_mlp": 0.01071798, "balance_loss_clip": 1.04190052, "balance_loss_mlp": 1.07448709, "epoch": 0.040402825792875394, "flos": 17201858244480.0, "grad_norm": 2.0582063767102197, "language_loss": 0.92015815, "learning_rate": 3.984001289174099e-06, "loss": 0.94334698, "num_input_tokens_seen": 14271885, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.7265625, "step": 672, "time_per_iteration": 2.518118143081665 }, { "auxiliary_loss_clip": 0.01241719, "auxiliary_loss_mlp": 0.01069865, "balance_loss_clip": 1.03791642, "balance_loss_mlp": 1.07181001, "epoch": 0.040462949045543366, "flos": 19172168928000.0, "grad_norm": 2.1345360755471305, "language_loss": 0.89895844, "learning_rate": 3.983953560200113e-06, "loss": 0.92207426, "num_input_tokens_seen": 14289670, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.703125, "step": 673, "time_per_iteration": 2.5383784770965576 }, { "auxiliary_loss_clip": 0.01236113, "auxiliary_loss_mlp": 0.01075759, "balance_loss_clip": 1.0453366, "balance_loss_mlp": 1.06873286, "epoch": 0.04052307229821133, "flos": 24019821555840.0, "grad_norm": 2.1385955046055987, "language_loss": 0.74902678, "learning_rate": 3.983905760423926e-06, "loss": 0.77214545, "num_input_tokens_seen": 14309285, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.671875, "step": 674, "time_per_iteration": 2.6371121406555176 }, { "auxiliary_loss_clip": 0.01247184, "auxiliary_loss_mlp": 0.01061514, "balance_loss_clip": 1.03175974, "balance_loss_mlp": 1.07199442, "epoch": 0.0405831955508793, "flos": 16436601964800.0, "grad_norm": 2.832869998659563, "language_loss": 0.77768517, "learning_rate": 3.983857889847247e-06, "loss": 0.80077213, "num_input_tokens_seen": 14328300, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.75, "step": 675, "time_per_iteration": 2.5843236446380615 }, { "auxiliary_loss_clip": 0.01238782, "auxiliary_loss_mlp": 0.0107301, "balance_loss_clip": 1.04172957, "balance_loss_mlp": 1.06941903, "epoch": 0.040643318803547275, "flos": 24279922915200.0, "grad_norm": 1.7158686632561675, "language_loss": 0.76697409, "learning_rate": 3.983809948471783e-06, "loss": 0.79009199, "num_input_tokens_seen": 14346395, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.6953125, "step": 676, "time_per_iteration": 2.6543962955474854 }, { "auxiliary_loss_clip": 0.0124439, "auxiliary_loss_mlp": 0.01075571, "balance_loss_clip": 1.04534006, "balance_loss_mlp": 1.07181621, "epoch": 0.04070344205621524, "flos": 17712076982400.0, "grad_norm": 2.3244760010018615, "language_loss": 0.849015, "learning_rate": 3.983761936299245e-06, "loss": 0.87221467, "num_input_tokens_seen": 14364605, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.7265625, "step": 677, "time_per_iteration": 2.570965528488159 }, { "auxiliary_loss_clip": 0.01238682, "auxiliary_loss_mlp": 0.01062714, "balance_loss_clip": 1.03285205, "balance_loss_mlp": 1.07118964, "epoch": 0.04076356530888321, "flos": 26177658168960.0, "grad_norm": 1.9413652197631517, "language_loss": 0.7645784, "learning_rate": 3.983713853331345e-06, "loss": 0.78759241, "num_input_tokens_seen": 14385265, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.671875, "step": 678, "time_per_iteration": 2.6181013584136963 }, { "auxiliary_loss_clip": 0.01237251, "auxiliary_loss_mlp": 0.01066415, "balance_loss_clip": 1.03787613, "balance_loss_mlp": 1.06838393, "epoch": 0.04082368856155118, "flos": 35773455772800.0, "grad_norm": 2.65248236382678, "language_loss": 0.82017368, "learning_rate": 3.9836656995698015e-06, "loss": 0.84321034, "num_input_tokens_seen": 14406090, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.6875, "step": 679, "time_per_iteration": 2.731166362762451 }, { "auxiliary_loss_clip": 0.01243551, "auxiliary_loss_mlp": 0.01059493, "balance_loss_clip": 1.0307163, "balance_loss_mlp": 1.07504725, "epoch": 0.04088381181421915, "flos": 28146640049280.0, "grad_norm": 1.6685227261106799, "language_loss": 0.7581954, "learning_rate": 3.983617475016331e-06, "loss": 0.78122586, "num_input_tokens_seen": 14425130, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.6875, "step": 680, "time_per_iteration": 2.6055474281311035 }, { "auxiliary_loss_clip": 0.0124003, "auxiliary_loss_mlp": 0.01066168, "balance_loss_clip": 1.03505397, "balance_loss_mlp": 1.06792986, "epoch": 0.04094393506688712, "flos": 27597673514880.0, "grad_norm": 1.7618044043576149, "language_loss": 0.83264911, "learning_rate": 3.9835691796726555e-06, "loss": 0.8557111, "num_input_tokens_seen": 14447355, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.71875, "step": 681, "time_per_iteration": 2.6415843963623047 }, { "auxiliary_loss_clip": 0.01240291, "auxiliary_loss_mlp": 0.01062418, "balance_loss_clip": 1.03228223, "balance_loss_mlp": 1.06868529, "epoch": 0.04100405831955509, "flos": 23112036109440.0, "grad_norm": 1.5873111873299208, "language_loss": 0.71129274, "learning_rate": 3.9835208135404986e-06, "loss": 0.73431981, "num_input_tokens_seen": 14466790, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.71875, "step": 682, "time_per_iteration": 5.567903995513916 }, { "auxiliary_loss_clip": 0.01240346, "auxiliary_loss_mlp": 0.0106595, "balance_loss_clip": 1.03731561, "balance_loss_mlp": 1.06876349, "epoch": 0.04106418157222306, "flos": 20156731695360.0, "grad_norm": 1.7549952360171095, "language_loss": 0.71957463, "learning_rate": 3.9834723766215865e-06, "loss": 0.74263757, "num_input_tokens_seen": 14485195, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.71875, "step": 683, "time_per_iteration": 5.488450765609741 }, { "auxiliary_loss_clip": 0.01238818, "auxiliary_loss_mlp": 0.01077209, "balance_loss_clip": 1.04849195, "balance_loss_mlp": 1.07477438, "epoch": 0.041124304824891024, "flos": 17420697855360.0, "grad_norm": 3.191190919284635, "language_loss": 0.8134231, "learning_rate": 3.983423868917646e-06, "loss": 0.83658338, "num_input_tokens_seen": 14503370, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.640625, "step": 684, "time_per_iteration": 2.5737624168395996 }, { "auxiliary_loss_clip": 0.01238723, "auxiliary_loss_mlp": 0.01067979, "balance_loss_clip": 1.03724623, "balance_loss_mlp": 1.07157135, "epoch": 0.041184428077558996, "flos": 25780163287680.0, "grad_norm": 1.4628721448706163, "language_loss": 0.90639472, "learning_rate": 3.983375290430411e-06, "loss": 0.92946172, "num_input_tokens_seen": 14526415, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.671875, "step": 685, "time_per_iteration": 2.6543519496917725 }, { "auxiliary_loss_clip": 0.01240244, "auxiliary_loss_mlp": 0.010659, "balance_loss_clip": 1.03640771, "balance_loss_mlp": 1.07101417, "epoch": 0.04124455133022697, "flos": 22964766347520.0, "grad_norm": 4.386517621883777, "language_loss": 0.87851155, "learning_rate": 3.983326641161613e-06, "loss": 0.90157306, "num_input_tokens_seen": 14546595, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.6875, "step": 686, "time_per_iteration": 2.576594114303589 }, { "auxiliary_loss_clip": 0.01244878, "auxiliary_loss_mlp": 0.0106941, "balance_loss_clip": 1.04022741, "balance_loss_mlp": 1.07124579, "epoch": 0.04130467458289493, "flos": 21289067015040.0, "grad_norm": 1.9069406633979404, "language_loss": 0.71463013, "learning_rate": 3.9832779211129894e-06, "loss": 0.73777306, "num_input_tokens_seen": 14566590, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.734375, "step": 687, "time_per_iteration": 2.5526106357574463 }, { "auxiliary_loss_clip": 0.01237503, "auxiliary_loss_mlp": 0.01065925, "balance_loss_clip": 1.03870964, "balance_loss_mlp": 1.07408524, "epoch": 0.041364797835562905, "flos": 19974233669760.0, "grad_norm": 1.5466324645988248, "language_loss": 0.85789335, "learning_rate": 3.983229130286278e-06, "loss": 0.88092756, "num_input_tokens_seen": 14585965, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.6328125, "step": 688, "time_per_iteration": 2.6507174968719482 }, { "auxiliary_loss_clip": 0.01236453, "auxiliary_loss_mlp": 0.01072452, "balance_loss_clip": 1.04398477, "balance_loss_mlp": 1.07192016, "epoch": 0.04142492108823087, "flos": 21906227520000.0, "grad_norm": 2.1968904408841143, "language_loss": 0.8322351, "learning_rate": 3.98318026868322e-06, "loss": 0.85532415, "num_input_tokens_seen": 14606015, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.640625, "step": 689, "time_per_iteration": 2.6081230640411377 }, { "auxiliary_loss_clip": 0.0123627, "auxiliary_loss_mlp": 0.01074615, "balance_loss_clip": 1.04699492, "balance_loss_mlp": 1.06861639, "epoch": 0.04148504434089884, "flos": 27639617621760.0, "grad_norm": 2.725563803970748, "language_loss": 0.68065453, "learning_rate": 3.9831313363055606e-06, "loss": 0.70376337, "num_input_tokens_seen": 14629955, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.671875, "step": 690, "time_per_iteration": 2.69132924079895 }, { "auxiliary_loss_clip": 0.01236285, "auxiliary_loss_mlp": 0.01074147, "balance_loss_clip": 1.0454061, "balance_loss_mlp": 1.07010889, "epoch": 0.041545167593566815, "flos": 20518387781760.0, "grad_norm": 1.7156137070451853, "language_loss": 0.74609196, "learning_rate": 3.9830823331550445e-06, "loss": 0.76919633, "num_input_tokens_seen": 14648000, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.6640625, "step": 691, "time_per_iteration": 2.545285940170288 }, { "auxiliary_loss_clip": 0.0123448, "auxiliary_loss_mlp": 0.01075469, "balance_loss_clip": 1.04572654, "balance_loss_mlp": 1.07014656, "epoch": 0.04160529084623478, "flos": 11868907939200.0, "grad_norm": 1.9744956193165077, "language_loss": 0.83990723, "learning_rate": 3.983033259233421e-06, "loss": 0.86300671, "num_input_tokens_seen": 14662235, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.640625, "step": 692, "time_per_iteration": 2.537715196609497 }, { "auxiliary_loss_clip": 0.01239029, "auxiliary_loss_mlp": 0.01067867, "balance_loss_clip": 1.03706288, "balance_loss_mlp": 1.0674665, "epoch": 0.04166541409890275, "flos": 14828306503680.0, "grad_norm": 2.7970612131160792, "language_loss": 0.88865268, "learning_rate": 3.982984114542442e-06, "loss": 0.91172159, "num_input_tokens_seen": 14676065, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.71875, "step": 693, "time_per_iteration": 2.489797353744507 }, { "auxiliary_loss_clip": 0.01241694, "auxiliary_loss_mlp": 0.01060138, "balance_loss_clip": 1.03117061, "balance_loss_mlp": 1.07279861, "epoch": 0.04172553735157072, "flos": 25808137004160.0, "grad_norm": 2.1333219913843635, "language_loss": 0.81715798, "learning_rate": 3.98293489908386e-06, "loss": 0.84017634, "num_input_tokens_seen": 14694955, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.6875, "step": 694, "time_per_iteration": 2.647353172302246 }, { "auxiliary_loss_clip": 0.01239842, "auxiliary_loss_mlp": 0.01067673, "balance_loss_clip": 1.03899097, "balance_loss_mlp": 1.06738973, "epoch": 0.04178566060423869, "flos": 24279815174400.0, "grad_norm": 1.9415142567760009, "language_loss": 0.8310414, "learning_rate": 3.982885612859432e-06, "loss": 0.85411656, "num_input_tokens_seen": 14715510, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.7265625, "step": 695, "time_per_iteration": 2.5801608562469482 }, { "auxiliary_loss_clip": 0.01237186, "auxiliary_loss_mlp": 0.01068595, "balance_loss_clip": 1.03938842, "balance_loss_mlp": 1.07003927, "epoch": 0.04184578385690666, "flos": 18222008411520.0, "grad_norm": 2.259866607286375, "language_loss": 0.84240317, "learning_rate": 3.982836255870918e-06, "loss": 0.86546099, "num_input_tokens_seen": 14731755, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.671875, "step": 696, "time_per_iteration": 2.622473955154419 }, { "auxiliary_loss_clip": 0.01235812, "auxiliary_loss_mlp": 0.01074722, "balance_loss_clip": 1.04434764, "balance_loss_mlp": 1.06756842, "epoch": 0.041905907109574626, "flos": 22776342577920.0, "grad_norm": 2.042240393462993, "language_loss": 0.92737401, "learning_rate": 3.982786828120078e-06, "loss": 0.95047939, "num_input_tokens_seen": 14750810, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.6875, "step": 697, "time_per_iteration": 2.5942654609680176 }, { "auxiliary_loss_clip": 0.01232764, "auxiliary_loss_mlp": 0.01066423, "balance_loss_clip": 1.03895712, "balance_loss_mlp": 1.068645, "epoch": 0.0419660303622426, "flos": 20156947176960.0, "grad_norm": 2.2434599362266665, "language_loss": 0.83476853, "learning_rate": 3.982737329608676e-06, "loss": 0.85776043, "num_input_tokens_seen": 14768435, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.640625, "step": 698, "time_per_iteration": 2.6186137199401855 }, { "auxiliary_loss_clip": 0.01236333, "auxiliary_loss_mlp": 0.01085082, "balance_loss_clip": 1.05451703, "balance_loss_mlp": 1.06760311, "epoch": 0.042026153614910564, "flos": 23076376882560.0, "grad_norm": 1.855999347739082, "language_loss": 0.9097569, "learning_rate": 3.98268776033848e-06, "loss": 0.932971, "num_input_tokens_seen": 14786690, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.6875, "step": 699, "time_per_iteration": 2.561232805252075 }, { "auxiliary_loss_clip": 0.01109007, "auxiliary_loss_mlp": 0.01025173, "balance_loss_clip": 1.02018976, "balance_loss_mlp": 1.03255558, "epoch": 0.042086276867578536, "flos": 64495243370880.0, "grad_norm": 0.8936361637532635, "language_loss": 0.67974395, "learning_rate": 3.9826381203112575e-06, "loss": 0.70108569, "num_input_tokens_seen": 14853840, "router_z_loss_clip": 0.04980469, "router_z_loss_mlp": 0.765625, "step": 700, "time_per_iteration": 3.2137866020202637 }, { "auxiliary_loss_clip": 0.01241353, "auxiliary_loss_mlp": 0.01077689, "balance_loss_clip": 1.04650402, "balance_loss_mlp": 1.07060039, "epoch": 0.04214640012024651, "flos": 15487016065920.0, "grad_norm": 4.771386023686684, "language_loss": 0.88054347, "learning_rate": 3.98258840952878e-06, "loss": 0.90373391, "num_input_tokens_seen": 14869580, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.7109375, "step": 701, "time_per_iteration": 2.560100555419922 }, { "auxiliary_loss_clip": 0.01237493, "auxiliary_loss_mlp": 0.01076472, "balance_loss_clip": 1.04521501, "balance_loss_mlp": 1.07243347, "epoch": 0.04220652337291447, "flos": 23877040993920.0, "grad_norm": 2.7468370900999615, "language_loss": 0.67007697, "learning_rate": 3.982538627992822e-06, "loss": 0.69321656, "num_input_tokens_seen": 14891065, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.65625, "step": 702, "time_per_iteration": 2.63688325881958 }, { "auxiliary_loss_clip": 0.01100327, "auxiliary_loss_mlp": 0.01006813, "balance_loss_clip": 1.00204432, "balance_loss_mlp": 1.0237782, "epoch": 0.042266646625582445, "flos": 63795451628160.0, "grad_norm": 0.8351119086628682, "language_loss": 0.60755992, "learning_rate": 3.98248877570516e-06, "loss": 0.62863123, "num_input_tokens_seen": 14954815, "router_z_loss_clip": 0.04760742, "router_z_loss_mlp": 0.765625, "step": 703, "time_per_iteration": 3.197526693344116 }, { "auxiliary_loss_clip": 0.01099931, "auxiliary_loss_mlp": 0.01006281, "balance_loss_clip": 1.00141776, "balance_loss_mlp": 1.02323914, "epoch": 0.04232676987825041, "flos": 50018863345920.0, "grad_norm": 1.0646623347993531, "language_loss": 0.57724178, "learning_rate": 3.982438852667574e-06, "loss": 0.59830391, "num_input_tokens_seen": 15003050, "router_z_loss_clip": 0.04858398, "router_z_loss_mlp": 0.765625, "step": 704, "time_per_iteration": 3.027521848678589 }, { "auxiliary_loss_clip": 0.01239295, "auxiliary_loss_mlp": 0.01068556, "balance_loss_clip": 1.03760934, "balance_loss_mlp": 1.06990409, "epoch": 0.04238689313091838, "flos": 21616105368960.0, "grad_norm": 2.1799754273457155, "language_loss": 0.87228143, "learning_rate": 3.982388858881844e-06, "loss": 0.89535999, "num_input_tokens_seen": 15021990, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.6953125, "step": 705, "time_per_iteration": 2.624455451965332 }, { "auxiliary_loss_clip": 0.01225167, "auxiliary_loss_mlp": 0.01063471, "balance_loss_clip": 1.03557646, "balance_loss_mlp": 1.06392848, "epoch": 0.042447016383586354, "flos": 19135109070720.0, "grad_norm": 1.7086990627369683, "language_loss": 0.71024072, "learning_rate": 3.982338794349755e-06, "loss": 0.73312712, "num_input_tokens_seen": 15040700, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.609375, "step": 706, "time_per_iteration": 2.4993553161621094 }, { "auxiliary_loss_clip": 0.01229603, "auxiliary_loss_mlp": 0.01064236, "balance_loss_clip": 1.03419507, "balance_loss_mlp": 1.0663116, "epoch": 0.04250713963625432, "flos": 24426007528320.0, "grad_norm": 2.5536600206338393, "language_loss": 0.93649209, "learning_rate": 3.982288659073094e-06, "loss": 0.95943046, "num_input_tokens_seen": 15056725, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.6328125, "step": 707, "time_per_iteration": 2.653830051422119 }, { "auxiliary_loss_clip": 0.01233296, "auxiliary_loss_mlp": 0.01061805, "balance_loss_clip": 1.03154981, "balance_loss_mlp": 1.06379187, "epoch": 0.04256726288892229, "flos": 30367391333760.0, "grad_norm": 2.6656263516242555, "language_loss": 0.81577647, "learning_rate": 3.98223845305365e-06, "loss": 0.83872741, "num_input_tokens_seen": 15077550, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.6953125, "step": 708, "time_per_iteration": 2.6160590648651123 }, { "auxiliary_loss_clip": 0.01239896, "auxiliary_loss_mlp": 0.01075021, "balance_loss_clip": 1.04357338, "balance_loss_mlp": 1.06894112, "epoch": 0.04262738614159026, "flos": 16362661818240.0, "grad_norm": 2.34338393842511, "language_loss": 0.82177901, "learning_rate": 3.982188176293213e-06, "loss": 0.84492821, "num_input_tokens_seen": 15094955, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.7109375, "step": 709, "time_per_iteration": 2.5545830726623535 }, { "auxiliary_loss_clip": 0.01244593, "auxiliary_loss_mlp": 0.01065013, "balance_loss_clip": 1.03519905, "balance_loss_mlp": 1.07158971, "epoch": 0.04268750939425823, "flos": 20412379768320.0, "grad_norm": 2.6847415105857304, "language_loss": 0.84625489, "learning_rate": 3.982137828793581e-06, "loss": 0.86935103, "num_input_tokens_seen": 15113395, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.7265625, "step": 710, "time_per_iteration": 2.5064525604248047 }, { "auxiliary_loss_clip": 0.01242315, "auxiliary_loss_mlp": 0.01074024, "balance_loss_clip": 1.04459167, "balance_loss_mlp": 1.07317734, "epoch": 0.0427476326469262, "flos": 20302959962880.0, "grad_norm": 2.3073821122011764, "language_loss": 0.84451842, "learning_rate": 3.982087410556547e-06, "loss": 0.86768174, "num_input_tokens_seen": 15132920, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.6875, "step": 711, "time_per_iteration": 2.5966475009918213 }, { "auxiliary_loss_clip": 0.01231252, "auxiliary_loss_mlp": 0.01068822, "balance_loss_clip": 1.03905499, "balance_loss_mlp": 1.06534386, "epoch": 0.042807755899594166, "flos": 21650794928640.0, "grad_norm": 2.4052297092060884, "language_loss": 0.85245454, "learning_rate": 3.982036921583912e-06, "loss": 0.87545526, "num_input_tokens_seen": 15153115, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.6640625, "step": 712, "time_per_iteration": 2.561671018600464 }, { "auxiliary_loss_clip": 0.0123937, "auxiliary_loss_mlp": 0.01068964, "balance_loss_clip": 1.04036558, "balance_loss_mlp": 1.06570315, "epoch": 0.04286787915226214, "flos": 21435007973760.0, "grad_norm": 2.7148370168206086, "language_loss": 0.90984851, "learning_rate": 3.981986361877479e-06, "loss": 0.9329319, "num_input_tokens_seen": 15172770, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.734375, "step": 713, "time_per_iteration": 2.6933233737945557 }, { "auxiliary_loss_clip": 0.01100264, "auxiliary_loss_mlp": 0.01028162, "balance_loss_clip": 1.02265501, "balance_loss_mlp": 1.02602923, "epoch": 0.04292800240493011, "flos": 66397970615040.0, "grad_norm": 0.8766654759851812, "language_loss": 0.63753319, "learning_rate": 3.9819357314390494e-06, "loss": 0.65881741, "num_input_tokens_seen": 15240055, "router_z_loss_clip": 0.05517578, "router_z_loss_mlp": 0.7421875, "step": 714, "time_per_iteration": 3.2581257820129395 }, { "auxiliary_loss_clip": 0.01233318, "auxiliary_loss_mlp": 0.01076579, "balance_loss_clip": 1.04732513, "balance_loss_mlp": 1.06940341, "epoch": 0.042988125657598075, "flos": 31650264552960.0, "grad_norm": 3.8973775609103347, "language_loss": 0.74690938, "learning_rate": 3.981885030270432e-06, "loss": 0.77000833, "num_input_tokens_seen": 15261585, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.640625, "step": 715, "time_per_iteration": 2.6736700534820557 }, { "auxiliary_loss_clip": 0.0123827, "auxiliary_loss_mlp": 0.01072705, "balance_loss_clip": 1.04185319, "balance_loss_mlp": 1.07127309, "epoch": 0.04304824891026605, "flos": 33248468292480.0, "grad_norm": 1.7741865075941958, "language_loss": 0.72318387, "learning_rate": 3.981834258373437e-06, "loss": 0.74629366, "num_input_tokens_seen": 15281160, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.671875, "step": 716, "time_per_iteration": 2.725358009338379 }, { "auxiliary_loss_clip": 0.01095404, "auxiliary_loss_mlp": 0.01008847, "balance_loss_clip": 1.0031724, "balance_loss_mlp": 1.02152324, "epoch": 0.04310837216293401, "flos": 64064782782720.0, "grad_norm": 0.899975739420501, "language_loss": 0.65578115, "learning_rate": 3.981783415749874e-06, "loss": 0.67682362, "num_input_tokens_seen": 15344505, "router_z_loss_clip": 0.05664062, "router_z_loss_mlp": 0.73828125, "step": 717, "time_per_iteration": 3.229902982711792 }, { "auxiliary_loss_clip": 0.01093429, "auxiliary_loss_mlp": 0.01005084, "balance_loss_clip": 0.99964774, "balance_loss_mlp": 1.01943922, "epoch": 0.043168495415601985, "flos": 61343757849600.0, "grad_norm": 0.9714945820548747, "language_loss": 0.58816999, "learning_rate": 3.9817325024015596e-06, "loss": 0.60915512, "num_input_tokens_seen": 15404050, "router_z_loss_clip": 0.05444336, "router_z_loss_mlp": 0.73828125, "step": 718, "time_per_iteration": 3.0665228366851807 }, { "auxiliary_loss_clip": 0.01232962, "auxiliary_loss_mlp": 0.01075048, "balance_loss_clip": 1.04324341, "balance_loss_mlp": 1.06961, "epoch": 0.04322861866826996, "flos": 20704261685760.0, "grad_norm": 2.0341809573861487, "language_loss": 0.91239679, "learning_rate": 3.9816815183303086e-06, "loss": 0.9354769, "num_input_tokens_seen": 15424190, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.6328125, "step": 719, "time_per_iteration": 2.630207061767578 }, { "auxiliary_loss_clip": 0.01229845, "auxiliary_loss_mlp": 0.01072836, "balance_loss_clip": 1.04423809, "balance_loss_mlp": 1.06338811, "epoch": 0.04328874192093792, "flos": 30373352991360.0, "grad_norm": 1.8467190684893646, "language_loss": 0.66454935, "learning_rate": 3.981630463537942e-06, "loss": 0.68757617, "num_input_tokens_seen": 15446500, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.6640625, "step": 720, "time_per_iteration": 2.6404531002044678 }, { "auxiliary_loss_clip": 0.01229996, "auxiliary_loss_mlp": 0.01079004, "balance_loss_clip": 1.04920113, "balance_loss_mlp": 1.06539583, "epoch": 0.043348865173605894, "flos": 21944795748480.0, "grad_norm": 1.8079238054587563, "language_loss": 0.76967287, "learning_rate": 3.981579338026282e-06, "loss": 0.79276288, "num_input_tokens_seen": 15465830, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.6484375, "step": 721, "time_per_iteration": 2.591179370880127 }, { "auxiliary_loss_clip": 0.01233216, "auxiliary_loss_mlp": 0.01089499, "balance_loss_clip": 1.06065011, "balance_loss_mlp": 1.06719756, "epoch": 0.04340898842627386, "flos": 15264225959040.0, "grad_norm": 2.899632455099069, "language_loss": 0.87760019, "learning_rate": 3.981528141797153e-06, "loss": 0.90082735, "num_input_tokens_seen": 15479985, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.65625, "step": 722, "time_per_iteration": 2.5242552757263184 }, { "auxiliary_loss_clip": 0.01238534, "auxiliary_loss_mlp": 0.01074022, "balance_loss_clip": 1.045614, "balance_loss_mlp": 1.06899929, "epoch": 0.04346911167894183, "flos": 27965434913280.0, "grad_norm": 1.7581869091856668, "language_loss": 0.84271276, "learning_rate": 3.981476874852382e-06, "loss": 0.86583829, "num_input_tokens_seen": 15501545, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.6953125, "step": 723, "time_per_iteration": 5.534470319747925 }, { "auxiliary_loss_clip": 0.01239271, "auxiliary_loss_mlp": 0.01083041, "balance_loss_clip": 1.0533576, "balance_loss_mlp": 1.07028699, "epoch": 0.0435292349316098, "flos": 29242202820480.0, "grad_norm": 1.6828397700873847, "language_loss": 0.82310236, "learning_rate": 3.981425537193796e-06, "loss": 0.84632546, "num_input_tokens_seen": 15521725, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.6875, "step": 724, "time_per_iteration": 4.0927207469940186 }, { "auxiliary_loss_clip": 0.01235122, "auxiliary_loss_mlp": 0.01089517, "balance_loss_clip": 1.05862975, "balance_loss_mlp": 1.06838655, "epoch": 0.04358935818427777, "flos": 20558356640640.0, "grad_norm": 2.0086662261787365, "language_loss": 0.79370141, "learning_rate": 3.981374128823232e-06, "loss": 0.81694782, "num_input_tokens_seen": 15540910, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.6640625, "step": 725, "time_per_iteration": 4.33024001121521 }, { "auxiliary_loss_clip": 0.01244818, "auxiliary_loss_mlp": 0.01076438, "balance_loss_clip": 1.04587257, "balance_loss_mlp": 1.07057846, "epoch": 0.04364948143694574, "flos": 14464926564480.0, "grad_norm": 2.7527914293270945, "language_loss": 0.86362934, "learning_rate": 3.981322649742521e-06, "loss": 0.88684189, "num_input_tokens_seen": 15558640, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.7421875, "step": 726, "time_per_iteration": 2.5869431495666504 }, { "auxiliary_loss_clip": 0.01097923, "auxiliary_loss_mlp": 0.01081149, "balance_loss_clip": 1.0752362, "balance_loss_mlp": 1.02425313, "epoch": 0.043709604689613706, "flos": 50067268922880.0, "grad_norm": 0.916257588052339, "language_loss": 0.55921042, "learning_rate": 3.9812710999535005e-06, "loss": 0.58100116, "num_input_tokens_seen": 15612975, "router_z_loss_clip": 0.05908203, "router_z_loss_mlp": 0.734375, "step": 727, "time_per_iteration": 3.363163471221924 }, { "auxiliary_loss_clip": 0.01235204, "auxiliary_loss_mlp": 0.01075958, "balance_loss_clip": 1.04665649, "balance_loss_mlp": 1.0675838, "epoch": 0.04376972794228168, "flos": 13991588115840.0, "grad_norm": 2.1335025863834898, "language_loss": 0.82328498, "learning_rate": 3.981219479458012e-06, "loss": 0.84639668, "num_input_tokens_seen": 15631070, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.671875, "step": 728, "time_per_iteration": 2.640916585922241 }, { "auxiliary_loss_clip": 0.01228385, "auxiliary_loss_mlp": 0.01073878, "balance_loss_clip": 1.04548216, "balance_loss_mlp": 1.06610942, "epoch": 0.04382985119494965, "flos": 22009901149440.0, "grad_norm": 2.128162305194655, "language_loss": 0.76573247, "learning_rate": 3.981167788257896e-06, "loss": 0.78875506, "num_input_tokens_seen": 15647825, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.625, "step": 729, "time_per_iteration": 2.6711153984069824 }, { "auxiliary_loss_clip": 0.01235937, "auxiliary_loss_mlp": 0.01072917, "balance_loss_clip": 1.04287601, "balance_loss_mlp": 1.06519771, "epoch": 0.043889974447617615, "flos": 24206521472640.0, "grad_norm": 2.0350157990735025, "language_loss": 0.95459545, "learning_rate": 3.9811160263549985e-06, "loss": 0.97768402, "num_input_tokens_seen": 15668260, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.703125, "step": 730, "time_per_iteration": 2.7635562419891357 }, { "auxiliary_loss_clip": 0.01231361, "auxiliary_loss_mlp": 0.01066434, "balance_loss_clip": 1.03498662, "balance_loss_mlp": 1.06330216, "epoch": 0.04395009770028559, "flos": 17274541415040.0, "grad_norm": 2.386476642660336, "language_loss": 0.87806749, "learning_rate": 3.981064193751166e-06, "loss": 0.90104544, "num_input_tokens_seen": 15685630, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.6796875, "step": 731, "time_per_iteration": 2.635080575942993 }, { "auxiliary_loss_clip": 0.01235549, "auxiliary_loss_mlp": 0.01071327, "balance_loss_clip": 1.04396892, "balance_loss_mlp": 1.06780362, "epoch": 0.04401022095295355, "flos": 12310286261760.0, "grad_norm": 3.55282343933488, "language_loss": 0.8868143, "learning_rate": 3.981012290448247e-06, "loss": 0.90988302, "num_input_tokens_seen": 15698645, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.671875, "step": 732, "time_per_iteration": 2.6148102283477783 }, { "auxiliary_loss_clip": 0.01233727, "auxiliary_loss_mlp": 0.01076633, "balance_loss_clip": 1.04709327, "balance_loss_mlp": 1.0655961, "epoch": 0.044070344205621524, "flos": 20959658363520.0, "grad_norm": 2.248539202020273, "language_loss": 0.86061311, "learning_rate": 3.980960316448097e-06, "loss": 0.8837167, "num_input_tokens_seen": 15716775, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.6796875, "step": 733, "time_per_iteration": 2.5595288276672363 }, { "auxiliary_loss_clip": 0.01239646, "auxiliary_loss_mlp": 0.0107862, "balance_loss_clip": 1.0471611, "balance_loss_mlp": 1.06818831, "epoch": 0.044130467458289496, "flos": 13845288021120.0, "grad_norm": 2.10549967585714, "language_loss": 0.91152084, "learning_rate": 3.980908271752567e-06, "loss": 0.93470347, "num_input_tokens_seen": 15733320, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.7109375, "step": 734, "time_per_iteration": 2.670717477798462 }, { "auxiliary_loss_clip": 0.01231051, "auxiliary_loss_mlp": 0.01064241, "balance_loss_clip": 1.03648925, "balance_loss_mlp": 1.06730568, "epoch": 0.04419059071095746, "flos": 28655063107200.0, "grad_norm": 1.992600902823137, "language_loss": 0.77783275, "learning_rate": 3.980856156363518e-06, "loss": 0.80078572, "num_input_tokens_seen": 15752705, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.6328125, "step": 735, "time_per_iteration": 2.7138006687164307 }, { "auxiliary_loss_clip": 0.01224637, "auxiliary_loss_mlp": 0.01061997, "balance_loss_clip": 1.03355396, "balance_loss_mlp": 1.06086683, "epoch": 0.04425071396362543, "flos": 28183304856960.0, "grad_norm": 2.2544739565714083, "language_loss": 0.87991178, "learning_rate": 3.980803970282806e-06, "loss": 0.90277803, "num_input_tokens_seen": 15772800, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.640625, "step": 736, "time_per_iteration": 2.6906254291534424 }, { "auxiliary_loss_clip": 0.01228301, "auxiliary_loss_mlp": 0.01073113, "balance_loss_clip": 1.04514611, "balance_loss_mlp": 1.06765282, "epoch": 0.0443108372162934, "flos": 23658452778240.0, "grad_norm": 1.9385820314909974, "language_loss": 0.84123158, "learning_rate": 3.980751713512298e-06, "loss": 0.86424577, "num_input_tokens_seen": 15793665, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.609375, "step": 737, "time_per_iteration": 2.6257784366607666 }, { "auxiliary_loss_clip": 0.01236476, "auxiliary_loss_mlp": 0.01066518, "balance_loss_clip": 1.03473651, "balance_loss_mlp": 1.06815481, "epoch": 0.04437096046896137, "flos": 33979861025280.0, "grad_norm": 2.9834202794382865, "language_loss": 0.84604234, "learning_rate": 3.980699386053855e-06, "loss": 0.8690722, "num_input_tokens_seen": 15813175, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.6875, "step": 738, "time_per_iteration": 2.710813522338867 }, { "auxiliary_loss_clip": 0.01087628, "auxiliary_loss_mlp": 0.01013754, "balance_loss_clip": 1.00915229, "balance_loss_mlp": 1.015589, "epoch": 0.04443108372162934, "flos": 67397506375680.0, "grad_norm": 0.855676907896944, "language_loss": 0.59234411, "learning_rate": 3.9806469879093465e-06, "loss": 0.61335796, "num_input_tokens_seen": 15872050, "router_z_loss_clip": 0.04589844, "router_z_loss_mlp": 0.71875, "step": 739, "time_per_iteration": 3.1366159915924072 }, { "auxiliary_loss_clip": 0.01231697, "auxiliary_loss_mlp": 0.01066921, "balance_loss_clip": 1.03826284, "balance_loss_mlp": 1.06793082, "epoch": 0.04449120697429731, "flos": 29752672953600.0, "grad_norm": 2.036583628258073, "language_loss": 0.90919113, "learning_rate": 3.9805945190806415e-06, "loss": 0.93217742, "num_input_tokens_seen": 15891085, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.640625, "step": 740, "time_per_iteration": 2.6966018676757812 }, { "auxiliary_loss_clip": 0.01235673, "auxiliary_loss_mlp": 0.0106152, "balance_loss_clip": 1.03361297, "balance_loss_mlp": 1.06939197, "epoch": 0.04455133022696528, "flos": 36502119072000.0, "grad_norm": 2.182267100100241, "language_loss": 0.71838772, "learning_rate": 3.980541979569614e-06, "loss": 0.74135971, "num_input_tokens_seen": 15914225, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.6640625, "step": 741, "time_per_iteration": 2.6616477966308594 }, { "auxiliary_loss_clip": 0.01225986, "auxiliary_loss_mlp": 0.01071744, "balance_loss_clip": 1.04340792, "balance_loss_mlp": 1.06375623, "epoch": 0.044611453479633245, "flos": 28803661672320.0, "grad_norm": 1.8377208345665508, "language_loss": 0.88812375, "learning_rate": 3.980489369378136e-06, "loss": 0.91110098, "num_input_tokens_seen": 15934540, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.625, "step": 742, "time_per_iteration": 2.6032004356384277 }, { "auxiliary_loss_clip": 0.01228341, "auxiliary_loss_mlp": 0.01061688, "balance_loss_clip": 1.03212464, "balance_loss_mlp": 1.06457186, "epoch": 0.04467157673230122, "flos": 20010970304640.0, "grad_norm": 2.0341621352836947, "language_loss": 0.83874422, "learning_rate": 3.980436688508087e-06, "loss": 0.86164451, "num_input_tokens_seen": 15952560, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.640625, "step": 743, "time_per_iteration": 2.555476427078247 }, { "auxiliary_loss_clip": 0.01233807, "auxiliary_loss_mlp": 0.01070448, "balance_loss_clip": 1.04183769, "balance_loss_mlp": 1.06731546, "epoch": 0.04473169998496919, "flos": 18004964480640.0, "grad_norm": 2.8093921933166746, "language_loss": 0.79534644, "learning_rate": 3.980383936961348e-06, "loss": 0.818389, "num_input_tokens_seen": 15970620, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.6640625, "step": 744, "time_per_iteration": 2.6788084506988525 }, { "auxiliary_loss_clip": 0.01227083, "auxiliary_loss_mlp": 0.01069531, "balance_loss_clip": 1.04131424, "balance_loss_mlp": 1.06481087, "epoch": 0.044791823237637154, "flos": 20631722169600.0, "grad_norm": 4.078478990523418, "language_loss": 0.85068345, "learning_rate": 3.980331114739799e-06, "loss": 0.8736496, "num_input_tokens_seen": 15987325, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.625, "step": 745, "time_per_iteration": 2.659529685974121 }, { "auxiliary_loss_clip": 0.01225387, "auxiliary_loss_mlp": 0.01064711, "balance_loss_clip": 1.03519464, "balance_loss_mlp": 1.06322825, "epoch": 0.04485194649030513, "flos": 31176171918720.0, "grad_norm": 1.7062478282224003, "language_loss": 0.69110417, "learning_rate": 3.980278221845328e-06, "loss": 0.71400517, "num_input_tokens_seen": 16008310, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.625, "step": 746, "time_per_iteration": 2.6896450519561768 }, { "auxiliary_loss_clip": 0.0123583, "auxiliary_loss_mlp": 0.01073863, "balance_loss_clip": 1.04375052, "balance_loss_mlp": 1.0715816, "epoch": 0.04491206974297309, "flos": 26143291831680.0, "grad_norm": 1.9309746454598562, "language_loss": 0.68185139, "learning_rate": 3.98022525827982e-06, "loss": 0.70494831, "num_input_tokens_seen": 16029620, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.640625, "step": 747, "time_per_iteration": 2.7136688232421875 }, { "auxiliary_loss_clip": 0.01234193, "auxiliary_loss_mlp": 0.01073387, "balance_loss_clip": 1.04325128, "balance_loss_mlp": 1.06781292, "epoch": 0.044972192995641064, "flos": 20667668705280.0, "grad_norm": 2.0082046096011408, "language_loss": 0.66651499, "learning_rate": 3.980172224045168e-06, "loss": 0.68959081, "num_input_tokens_seen": 16049065, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.6640625, "step": 748, "time_per_iteration": 2.6249279975891113 }, { "auxiliary_loss_clip": 0.01236463, "auxiliary_loss_mlp": 0.01080971, "balance_loss_clip": 1.05190802, "balance_loss_mlp": 1.07146931, "epoch": 0.045032316248309036, "flos": 16106834177280.0, "grad_norm": 2.505720985564524, "language_loss": 0.76649046, "learning_rate": 3.980119119143262e-06, "loss": 0.7896648, "num_input_tokens_seen": 16066765, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.6484375, "step": 749, "time_per_iteration": 2.6097569465637207 }, { "auxiliary_loss_clip": 0.01233611, "auxiliary_loss_mlp": 0.01074318, "balance_loss_clip": 1.04606581, "balance_loss_mlp": 1.06909347, "epoch": 0.045092439500977, "flos": 17858843953920.0, "grad_norm": 2.3493473789364394, "language_loss": 0.89286101, "learning_rate": 3.980065943575998e-06, "loss": 0.91594028, "num_input_tokens_seen": 16085980, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.6484375, "step": 750, "time_per_iteration": 2.5692007541656494 }, { "auxiliary_loss_clip": 0.01231692, "auxiliary_loss_mlp": 0.01070516, "balance_loss_clip": 1.03844905, "balance_loss_mlp": 1.06398749, "epoch": 0.04515256275364497, "flos": 24462815990400.0, "grad_norm": 1.7154596123594645, "language_loss": 0.74285978, "learning_rate": 3.9800126973452725e-06, "loss": 0.76588184, "num_input_tokens_seen": 16106260, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.6796875, "step": 751, "time_per_iteration": 2.6376585960388184 }, { "auxiliary_loss_clip": 0.01229081, "auxiliary_loss_mlp": 0.010703, "balance_loss_clip": 1.04177284, "balance_loss_mlp": 1.06511259, "epoch": 0.04521268600631294, "flos": 20916385453440.0, "grad_norm": 2.064800483153111, "language_loss": 0.68568939, "learning_rate": 3.979959380452989e-06, "loss": 0.70868319, "num_input_tokens_seen": 16123475, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.640625, "step": 752, "time_per_iteration": 2.566098213195801 }, { "auxiliary_loss_clip": 0.01229061, "auxiliary_loss_mlp": 0.01058371, "balance_loss_clip": 1.03035641, "balance_loss_mlp": 1.06501532, "epoch": 0.04527280925898091, "flos": 13371374954880.0, "grad_norm": 2.9606275590403603, "language_loss": 0.9207809, "learning_rate": 3.979905992901047e-06, "loss": 0.94365519, "num_input_tokens_seen": 16138335, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.640625, "step": 753, "time_per_iteration": 2.6057581901550293 }, { "auxiliary_loss_clip": 0.01233764, "auxiliary_loss_mlp": 0.01078543, "balance_loss_clip": 1.05017138, "balance_loss_mlp": 1.06863248, "epoch": 0.04533293251164888, "flos": 23254565276160.0, "grad_norm": 2.3304722806444653, "language_loss": 0.91021341, "learning_rate": 3.979852534691353e-06, "loss": 0.9333365, "num_input_tokens_seen": 16157110, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.65625, "step": 754, "time_per_iteration": 2.5776331424713135 }, { "auxiliary_loss_clip": 0.01222353, "auxiliary_loss_mlp": 0.01070693, "balance_loss_clip": 1.0412122, "balance_loss_mlp": 1.06485534, "epoch": 0.04539305576431685, "flos": 12422004537600.0, "grad_norm": 5.409075048608937, "language_loss": 0.78531778, "learning_rate": 3.979799005825816e-06, "loss": 0.80824828, "num_input_tokens_seen": 16174155, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.578125, "step": 755, "time_per_iteration": 2.651078224182129 }, { "auxiliary_loss_clip": 0.01235365, "auxiliary_loss_mlp": 0.01079118, "balance_loss_clip": 1.05050743, "balance_loss_mlp": 1.06968212, "epoch": 0.04545317901698482, "flos": 16070995382400.0, "grad_norm": 2.0297225915591985, "language_loss": 0.7810446, "learning_rate": 3.979745406306345e-06, "loss": 0.8041895, "num_input_tokens_seen": 16192240, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.65625, "step": 756, "time_per_iteration": 2.523350954055786 }, { "auxiliary_loss_clip": 0.01088826, "auxiliary_loss_mlp": 0.01016975, "balance_loss_clip": 1.01265955, "balance_loss_mlp": 1.016469, "epoch": 0.045513302269652785, "flos": 66396139021440.0, "grad_norm": 0.7981720214827482, "language_loss": 0.62736368, "learning_rate": 3.979691736134852e-06, "loss": 0.64842165, "num_input_tokens_seen": 16255775, "router_z_loss_clip": 0.04321289, "router_z_loss_mlp": 0.72265625, "step": 757, "time_per_iteration": 3.2396719455718994 }, { "auxiliary_loss_clip": 0.01227844, "auxiliary_loss_mlp": 0.01066246, "balance_loss_clip": 1.03838646, "balance_loss_mlp": 1.06572354, "epoch": 0.04557342552232076, "flos": 21471169991040.0, "grad_norm": 1.4556944129810792, "language_loss": 0.84035408, "learning_rate": 3.979637995313254e-06, "loss": 0.86329496, "num_input_tokens_seen": 16277015, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.6171875, "step": 758, "time_per_iteration": 2.6907265186309814 }, { "auxiliary_loss_clip": 0.01221948, "auxiliary_loss_mlp": 0.01066603, "balance_loss_clip": 1.03931594, "balance_loss_mlp": 1.06015897, "epoch": 0.04563354877498873, "flos": 23732680233600.0, "grad_norm": 1.743549440415242, "language_loss": 0.88351834, "learning_rate": 3.979584183843468e-06, "loss": 0.9064039, "num_input_tokens_seen": 16296005, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.6171875, "step": 759, "time_per_iteration": 2.7507476806640625 }, { "auxiliary_loss_clip": 0.01233086, "auxiliary_loss_mlp": 0.01065989, "balance_loss_clip": 1.03638983, "balance_loss_mlp": 1.06802654, "epoch": 0.045693672027656694, "flos": 25735741142400.0, "grad_norm": 2.22527613942534, "language_loss": 0.73429084, "learning_rate": 3.979530301727414e-06, "loss": 0.75728166, "num_input_tokens_seen": 16315300, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.6484375, "step": 760, "time_per_iteration": 2.6171295642852783 }, { "auxiliary_loss_clip": 0.01230304, "auxiliary_loss_mlp": 0.01072156, "balance_loss_clip": 1.04368913, "balance_loss_mlp": 1.06942117, "epoch": 0.045753795280324666, "flos": 19719016560000.0, "grad_norm": 2.025961895979912, "language_loss": 0.81992131, "learning_rate": 3.979476348967016e-06, "loss": 0.84294593, "num_input_tokens_seen": 16333820, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.609375, "step": 761, "time_per_iteration": 2.6323068141937256 }, { "auxiliary_loss_clip": 0.01223503, "auxiliary_loss_mlp": 0.01067987, "balance_loss_clip": 1.03925741, "balance_loss_mlp": 1.06357932, "epoch": 0.04581391853299264, "flos": 23255786338560.0, "grad_norm": 1.9180374924288084, "language_loss": 0.79729038, "learning_rate": 3.979422325564199e-06, "loss": 0.82020533, "num_input_tokens_seen": 16355290, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.6015625, "step": 762, "time_per_iteration": 2.707031011581421 }, { "auxiliary_loss_clip": 0.01086267, "auxiliary_loss_mlp": 0.0100584, "balance_loss_clip": 1.00154877, "balance_loss_mlp": 1.01461458, "epoch": 0.0458740417856606, "flos": 64231155601920.0, "grad_norm": 1.0004000971085225, "language_loss": 0.58709812, "learning_rate": 3.979368231520891e-06, "loss": 0.60801917, "num_input_tokens_seen": 16415995, "router_z_loss_clip": 0.04296875, "router_z_loss_mlp": 0.71875, "step": 763, "time_per_iteration": 3.2051517963409424 }, { "auxiliary_loss_clip": 0.01228202, "auxiliary_loss_mlp": 0.01073824, "balance_loss_clip": 1.04505932, "balance_loss_mlp": 1.06503296, "epoch": 0.045934165038328575, "flos": 20770121272320.0, "grad_norm": 2.227188482056197, "language_loss": 0.87971675, "learning_rate": 3.979314066839022e-06, "loss": 0.90273702, "num_input_tokens_seen": 16433120, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.625, "step": 764, "time_per_iteration": 2.6491289138793945 }, { "auxiliary_loss_clip": 0.01230322, "auxiliary_loss_mlp": 0.01076415, "balance_loss_clip": 1.04786444, "balance_loss_mlp": 1.06655252, "epoch": 0.04599428829099654, "flos": 30262891691520.0, "grad_norm": 2.132814739239689, "language_loss": 0.85619223, "learning_rate": 3.979259831520526e-06, "loss": 0.87925965, "num_input_tokens_seen": 16453360, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.640625, "step": 765, "time_per_iteration": 5.6742095947265625 }, { "auxiliary_loss_clip": 0.01233152, "auxiliary_loss_mlp": 0.01069328, "balance_loss_clip": 1.03886962, "balance_loss_mlp": 1.06711662, "epoch": 0.04605441154366451, "flos": 23038921975680.0, "grad_norm": 3.758072537696435, "language_loss": 0.87588334, "learning_rate": 3.979205525567337e-06, "loss": 0.89890814, "num_input_tokens_seen": 16471160, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.65625, "step": 766, "time_per_iteration": 4.054310321807861 }, { "auxiliary_loss_clip": 0.01226324, "auxiliary_loss_mlp": 0.01069071, "balance_loss_clip": 1.04055643, "balance_loss_mlp": 1.06390786, "epoch": 0.046114534796332485, "flos": 22017407091840.0, "grad_norm": 2.6836998412565167, "language_loss": 0.84027207, "learning_rate": 3.979151148981395e-06, "loss": 0.86322612, "num_input_tokens_seen": 16488940, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.625, "step": 767, "time_per_iteration": 4.810234069824219 }, { "auxiliary_loss_clip": 0.01227815, "auxiliary_loss_mlp": 0.01057974, "balance_loss_clip": 1.02894711, "balance_loss_mlp": 1.06730413, "epoch": 0.04617465804900045, "flos": 29862380067840.0, "grad_norm": 2.068321958445888, "language_loss": 0.86398441, "learning_rate": 3.979096701764638e-06, "loss": 0.88684225, "num_input_tokens_seen": 16509505, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.609375, "step": 768, "time_per_iteration": 2.622298240661621 }, { "auxiliary_loss_clip": 0.01223208, "auxiliary_loss_mlp": 0.01066529, "balance_loss_clip": 1.03962374, "balance_loss_mlp": 1.05990958, "epoch": 0.04623478130166842, "flos": 25630056351360.0, "grad_norm": 2.566635505116688, "language_loss": 0.7475698, "learning_rate": 3.979042183919012e-06, "loss": 0.77046716, "num_input_tokens_seen": 16528840, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.6328125, "step": 769, "time_per_iteration": 2.6334710121154785 }, { "auxiliary_loss_clip": 0.01219945, "auxiliary_loss_mlp": 0.01066723, "balance_loss_clip": 1.04034245, "balance_loss_mlp": 1.06277061, "epoch": 0.04629490455433639, "flos": 20449080489600.0, "grad_norm": 1.8761617081459232, "language_loss": 0.85892469, "learning_rate": 3.97898759544646e-06, "loss": 0.88179135, "num_input_tokens_seen": 16548335, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.5703125, "step": 770, "time_per_iteration": 2.5830819606781006 }, { "auxiliary_loss_clip": 0.01230239, "auxiliary_loss_mlp": 0.0106599, "balance_loss_clip": 1.03771377, "balance_loss_mlp": 1.06250381, "epoch": 0.04635502780700436, "flos": 23148736830720.0, "grad_norm": 1.9324741329477726, "language_loss": 0.86626774, "learning_rate": 3.978932936348932e-06, "loss": 0.88923001, "num_input_tokens_seen": 16567725, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.671875, "step": 771, "time_per_iteration": 2.702465534210205 }, { "auxiliary_loss_clip": 0.01230067, "auxiliary_loss_mlp": 0.01076022, "balance_loss_clip": 1.04605281, "balance_loss_mlp": 1.06116462, "epoch": 0.04641515105967233, "flos": 23292020183040.0, "grad_norm": 2.1388179828626765, "language_loss": 0.83225483, "learning_rate": 3.978878206628377e-06, "loss": 0.85531574, "num_input_tokens_seen": 16588175, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.6875, "step": 772, "time_per_iteration": 2.5657811164855957 }, { "auxiliary_loss_clip": 0.01226208, "auxiliary_loss_mlp": 0.01064334, "balance_loss_clip": 1.03746462, "balance_loss_mlp": 1.0652616, "epoch": 0.046475274312340296, "flos": 25115204759040.0, "grad_norm": 1.997466466136796, "language_loss": 0.73305881, "learning_rate": 3.978823406286751e-06, "loss": 0.75596428, "num_input_tokens_seen": 16607735, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.609375, "step": 773, "time_per_iteration": 2.6562552452087402 }, { "auxiliary_loss_clip": 0.01221478, "auxiliary_loss_mlp": 0.0106515, "balance_loss_clip": 1.03608739, "balance_loss_mlp": 1.06325996, "epoch": 0.04653539756500827, "flos": 25264916645760.0, "grad_norm": 2.0364778947999245, "language_loss": 0.78547567, "learning_rate": 3.978768535326006e-06, "loss": 0.80834198, "num_input_tokens_seen": 16627225, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.5859375, "step": 774, "time_per_iteration": 2.6002414226531982 }, { "auxiliary_loss_clip": 0.01220633, "auxiliary_loss_mlp": 0.0106619, "balance_loss_clip": 1.03973794, "balance_loss_mlp": 1.06219327, "epoch": 0.046595520817676234, "flos": 35404150089600.0, "grad_norm": 1.945383618262577, "language_loss": 0.73427081, "learning_rate": 3.978713593748103e-06, "loss": 0.75713903, "num_input_tokens_seen": 16647785, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.5859375, "step": 775, "time_per_iteration": 2.7200169563293457 }, { "auxiliary_loss_clip": 0.01223045, "auxiliary_loss_mlp": 0.01059883, "balance_loss_clip": 1.03148746, "balance_loss_mlp": 1.06248069, "epoch": 0.046655644070344206, "flos": 18112516778880.0, "grad_norm": 1.5782130116906967, "language_loss": 0.77147895, "learning_rate": 3.9786585815550015e-06, "loss": 0.79430819, "num_input_tokens_seen": 16667555, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.609375, "step": 776, "time_per_iteration": 2.586270809173584 }, { "auxiliary_loss_clip": 0.01218066, "auxiliary_loss_mlp": 0.01064079, "balance_loss_clip": 1.03774595, "balance_loss_mlp": 1.06063497, "epoch": 0.04671576732301218, "flos": 29205286617600.0, "grad_norm": 2.237624225084928, "language_loss": 0.70870078, "learning_rate": 3.978603498748664e-06, "loss": 0.73152226, "num_input_tokens_seen": 16686875, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.578125, "step": 777, "time_per_iteration": 2.657862901687622 }, { "auxiliary_loss_clip": 0.0122302, "auxiliary_loss_mlp": 0.01071953, "balance_loss_clip": 1.04379559, "balance_loss_mlp": 1.06376088, "epoch": 0.04677589057568014, "flos": 30478319510400.0, "grad_norm": 1.9509483473536091, "language_loss": 0.76676494, "learning_rate": 3.978548345331058e-06, "loss": 0.78971469, "num_input_tokens_seen": 16706420, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.59375, "step": 778, "time_per_iteration": 2.6345338821411133 }, { "auxiliary_loss_clip": 0.0121849, "auxiliary_loss_mlp": 0.01064156, "balance_loss_clip": 1.03657126, "balance_loss_mlp": 1.06110537, "epoch": 0.046836013828348115, "flos": 20557674282240.0, "grad_norm": 2.0466181693472008, "language_loss": 0.79212582, "learning_rate": 3.978493121304151e-06, "loss": 0.81495225, "num_input_tokens_seen": 16726390, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.578125, "step": 779, "time_per_iteration": 2.545949935913086 }, { "auxiliary_loss_clip": 0.01214847, "auxiliary_loss_mlp": 0.01059015, "balance_loss_clip": 1.03245533, "balance_loss_mlp": 1.06055093, "epoch": 0.04689613708101608, "flos": 25447378757760.0, "grad_norm": 1.8944441121481534, "language_loss": 0.77100325, "learning_rate": 3.978437826669914e-06, "loss": 0.79374182, "num_input_tokens_seen": 16748965, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.546875, "step": 780, "time_per_iteration": 2.6161928176879883 }, { "auxiliary_loss_clip": 0.01218438, "auxiliary_loss_mlp": 0.01065691, "balance_loss_clip": 1.04053783, "balance_loss_mlp": 1.06325424, "epoch": 0.04695626033368405, "flos": 23001395241600.0, "grad_norm": 1.940924268947264, "language_loss": 0.76564914, "learning_rate": 3.9783824614303195e-06, "loss": 0.78849041, "num_input_tokens_seen": 16768620, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.5546875, "step": 781, "time_per_iteration": 2.6705000400543213 }, { "auxiliary_loss_clip": 0.01228536, "auxiliary_loss_mlp": 0.01067472, "balance_loss_clip": 1.03945827, "balance_loss_mlp": 1.06512988, "epoch": 0.047016383586352024, "flos": 29133357632640.0, "grad_norm": 2.1481998572806367, "language_loss": 0.73380804, "learning_rate": 3.978327025587344e-06, "loss": 0.75676811, "num_input_tokens_seen": 16789755, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.6328125, "step": 782, "time_per_iteration": 2.6776344776153564 }, { "auxiliary_loss_clip": 0.01221765, "auxiliary_loss_mlp": 0.01063395, "balance_loss_clip": 1.03803933, "balance_loss_mlp": 1.06449318, "epoch": 0.04707650683901999, "flos": 14976330451200.0, "grad_norm": 2.1496485500510283, "language_loss": 0.7981112, "learning_rate": 3.978271519142967e-06, "loss": 0.82096273, "num_input_tokens_seen": 16807585, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.5703125, "step": 783, "time_per_iteration": 2.5243334770202637 }, { "auxiliary_loss_clip": 0.01214453, "auxiliary_loss_mlp": 0.01056535, "balance_loss_clip": 1.03148937, "balance_loss_mlp": 1.06114757, "epoch": 0.04713663009168796, "flos": 21651118151040.0, "grad_norm": 1.9336778048811976, "language_loss": 0.81242096, "learning_rate": 3.978215942099167e-06, "loss": 0.83513087, "num_input_tokens_seen": 16827220, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.53125, "step": 784, "time_per_iteration": 2.5886294841766357 }, { "auxiliary_loss_clip": 0.01222641, "auxiliary_loss_mlp": 0.01071844, "balance_loss_clip": 1.04591548, "balance_loss_mlp": 1.06072509, "epoch": 0.04719675334435593, "flos": 21325408600320.0, "grad_norm": 2.467721117761624, "language_loss": 0.8017813, "learning_rate": 3.9781602944579285e-06, "loss": 0.8247261, "num_input_tokens_seen": 16846230, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.625, "step": 785, "time_per_iteration": 2.5676052570343018 }, { "auxiliary_loss_clip": 0.01223576, "auxiliary_loss_mlp": 0.01063502, "balance_loss_clip": 1.03832459, "balance_loss_mlp": 1.06712484, "epoch": 0.0472568765970239, "flos": 17931383470080.0, "grad_norm": 2.1563616960188075, "language_loss": 0.89456928, "learning_rate": 3.978104576221238e-06, "loss": 0.91744006, "num_input_tokens_seen": 16865325, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.5625, "step": 786, "time_per_iteration": 2.6095187664031982 }, { "auxiliary_loss_clip": 0.01222908, "auxiliary_loss_mlp": 0.01065913, "balance_loss_clip": 1.0373745, "balance_loss_mlp": 1.05705345, "epoch": 0.04731699984969187, "flos": 18077324428800.0, "grad_norm": 2.6241714066927586, "language_loss": 0.76295549, "learning_rate": 3.978048787391084e-06, "loss": 0.78584373, "num_input_tokens_seen": 16882930, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.65625, "step": 787, "time_per_iteration": 2.5100221633911133 }, { "auxiliary_loss_clip": 0.01227451, "auxiliary_loss_mlp": 0.01062211, "balance_loss_clip": 1.03505528, "balance_loss_mlp": 1.06741881, "epoch": 0.047377123102359836, "flos": 23185078416000.0, "grad_norm": 2.496006568353847, "language_loss": 0.80803299, "learning_rate": 3.9779929279694565e-06, "loss": 0.8309297, "num_input_tokens_seen": 16900710, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.6015625, "step": 788, "time_per_iteration": 2.5555903911590576 }, { "auxiliary_loss_clip": 0.01220348, "auxiliary_loss_mlp": 0.01068835, "balance_loss_clip": 1.04060602, "balance_loss_mlp": 1.06521106, "epoch": 0.04743724635502781, "flos": 22747794243840.0, "grad_norm": 1.9216131638779725, "language_loss": 0.84392571, "learning_rate": 3.977936997958349e-06, "loss": 0.86681747, "num_input_tokens_seen": 16919210, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.5546875, "step": 789, "time_per_iteration": 2.53167986869812 }, { "auxiliary_loss_clip": 0.01224464, "auxiliary_loss_mlp": 0.01068832, "balance_loss_clip": 1.04282033, "balance_loss_mlp": 1.06579733, "epoch": 0.04749736960769577, "flos": 17238702620160.0, "grad_norm": 2.133365724445988, "language_loss": 0.81953454, "learning_rate": 3.977880997359758e-06, "loss": 0.84246755, "num_input_tokens_seen": 16937125, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.5859375, "step": 790, "time_per_iteration": 2.6035332679748535 }, { "auxiliary_loss_clip": 0.01219079, "auxiliary_loss_mlp": 0.01063619, "balance_loss_clip": 1.03589129, "balance_loss_mlp": 1.06125772, "epoch": 0.047557492860363745, "flos": 40479261592320.0, "grad_norm": 2.08218499629929, "language_loss": 0.87980878, "learning_rate": 3.977824926175682e-06, "loss": 0.90263581, "num_input_tokens_seen": 16958610, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.578125, "step": 791, "time_per_iteration": 2.820481061935425 }, { "auxiliary_loss_clip": 0.01221739, "auxiliary_loss_mlp": 0.0106546, "balance_loss_clip": 1.03789902, "balance_loss_mlp": 1.06397343, "epoch": 0.04761761611303172, "flos": 18698004466560.0, "grad_norm": 2.266762579045219, "language_loss": 0.90085649, "learning_rate": 3.977768784408122e-06, "loss": 0.92372847, "num_input_tokens_seen": 16977300, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.578125, "step": 792, "time_per_iteration": 2.61556339263916 }, { "auxiliary_loss_clip": 0.01221734, "auxiliary_loss_mlp": 0.01072747, "balance_loss_clip": 1.04702115, "balance_loss_mlp": 1.06127357, "epoch": 0.04767773936569968, "flos": 20921987975040.0, "grad_norm": 2.068483361931489, "language_loss": 0.73027325, "learning_rate": 3.977712572059081e-06, "loss": 0.75321811, "num_input_tokens_seen": 16994950, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.609375, "step": 793, "time_per_iteration": 2.540168285369873 }, { "auxiliary_loss_clip": 0.01227501, "auxiliary_loss_mlp": 0.01067321, "balance_loss_clip": 1.03934205, "balance_loss_mlp": 1.06563616, "epoch": 0.047737862618367655, "flos": 23732680233600.0, "grad_norm": 1.7466355671068954, "language_loss": 0.85849017, "learning_rate": 3.977656289130567e-06, "loss": 0.88143837, "num_input_tokens_seen": 17014760, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.6171875, "step": 794, "time_per_iteration": 2.678995370864868 }, { "auxiliary_loss_clip": 0.01223712, "auxiliary_loss_mlp": 0.01072168, "balance_loss_clip": 1.04470181, "balance_loss_mlp": 1.0616622, "epoch": 0.04779798587103562, "flos": 23695764030720.0, "grad_norm": 5.689091498861775, "language_loss": 0.69026518, "learning_rate": 3.977599935624586e-06, "loss": 0.71322393, "num_input_tokens_seen": 17032715, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.6171875, "step": 795, "time_per_iteration": 2.7074809074401855 }, { "auxiliary_loss_clip": 0.01221951, "auxiliary_loss_mlp": 0.01076136, "balance_loss_clip": 1.04812193, "balance_loss_mlp": 1.06278753, "epoch": 0.04785810912370359, "flos": 23183641872000.0, "grad_norm": 1.7323479618941913, "language_loss": 0.80868787, "learning_rate": 3.977543511543151e-06, "loss": 0.83166873, "num_input_tokens_seen": 17052215, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.59375, "step": 796, "time_per_iteration": 2.665839672088623 }, { "auxiliary_loss_clip": 0.01220628, "auxiliary_loss_mlp": 0.01064141, "balance_loss_clip": 1.03692603, "balance_loss_mlp": 1.06136799, "epoch": 0.047918232376371564, "flos": 18040623707520.0, "grad_norm": 2.2587675506046025, "language_loss": 0.81221569, "learning_rate": 3.977487016888274e-06, "loss": 0.83506334, "num_input_tokens_seen": 17069225, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.59375, "step": 797, "time_per_iteration": 2.6097166538238525 }, { "auxiliary_loss_clip": 0.0109202, "auxiliary_loss_mlp": 0.0102161, "balance_loss_clip": 1.01674581, "balance_loss_mlp": 1.02337837, "epoch": 0.04797835562903953, "flos": 62442588758400.0, "grad_norm": 0.9207405185143682, "language_loss": 0.6447922, "learning_rate": 3.977430451661972e-06, "loss": 0.6659286, "num_input_tokens_seen": 17126680, "router_z_loss_clip": 0.04858398, "router_z_loss_mlp": 0.6875, "step": 798, "time_per_iteration": 3.076730966567993 }, { "auxiliary_loss_clip": 0.01227642, "auxiliary_loss_mlp": 0.01068544, "balance_loss_clip": 1.04163897, "balance_loss_mlp": 1.06174219, "epoch": 0.0480384788817075, "flos": 21507296094720.0, "grad_norm": 2.1554087714384753, "language_loss": 0.90876818, "learning_rate": 3.9773738158662655e-06, "loss": 0.93173009, "num_input_tokens_seen": 17144835, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.65625, "step": 799, "time_per_iteration": 2.6413748264312744 }, { "auxiliary_loss_clip": 0.01221651, "auxiliary_loss_mlp": 0.01071761, "balance_loss_clip": 1.04399681, "balance_loss_mlp": 1.0657208, "epoch": 0.048098602134375466, "flos": 21726710323200.0, "grad_norm": 2.1069996605969794, "language_loss": 0.86329639, "learning_rate": 3.977317109503172e-06, "loss": 0.88623047, "num_input_tokens_seen": 17165030, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.5625, "step": 800, "time_per_iteration": 2.5716004371643066 }, { "auxiliary_loss_clip": 0.01224535, "auxiliary_loss_mlp": 0.01066947, "balance_loss_clip": 1.04058981, "balance_loss_mlp": 1.06202567, "epoch": 0.04815872538704344, "flos": 22931082368640.0, "grad_norm": 2.3267959397131106, "language_loss": 0.84100062, "learning_rate": 3.977260332574718e-06, "loss": 0.86391538, "num_input_tokens_seen": 17184895, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.625, "step": 801, "time_per_iteration": 2.6069631576538086 }, { "auxiliary_loss_clip": 0.01223706, "auxiliary_loss_mlp": 0.01068085, "balance_loss_clip": 1.04165637, "balance_loss_mlp": 1.06413984, "epoch": 0.04821884863971141, "flos": 43174716042240.0, "grad_norm": 1.760789050193579, "language_loss": 0.79627633, "learning_rate": 3.977203485082928e-06, "loss": 0.8191942, "num_input_tokens_seen": 17208225, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.59375, "step": 802, "time_per_iteration": 2.7247114181518555 }, { "auxiliary_loss_clip": 0.01223705, "auxiliary_loss_mlp": 0.01068774, "balance_loss_clip": 1.04287004, "balance_loss_mlp": 1.06424499, "epoch": 0.048278971892379376, "flos": 18620006083200.0, "grad_norm": 2.1681880719362656, "language_loss": 0.86051071, "learning_rate": 3.977146567029833e-06, "loss": 0.88343561, "num_input_tokens_seen": 17226305, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.59375, "step": 803, "time_per_iteration": 2.552727460861206 }, { "auxiliary_loss_clip": 0.01220511, "auxiliary_loss_mlp": 0.01059588, "balance_loss_clip": 1.03293288, "balance_loss_mlp": 1.0654428, "epoch": 0.04833909514504735, "flos": 20230061310720.0, "grad_norm": 2.2521751253464313, "language_loss": 0.8507694, "learning_rate": 3.977089578417462e-06, "loss": 0.87357044, "num_input_tokens_seen": 17244545, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.546875, "step": 804, "time_per_iteration": 2.5680179595947266 }, { "auxiliary_loss_clip": 0.01223111, "auxiliary_loss_mlp": 0.01065361, "balance_loss_clip": 1.03830028, "balance_loss_mlp": 1.06394958, "epoch": 0.04839921839771532, "flos": 24645170361600.0, "grad_norm": 2.073124420771773, "language_loss": 0.8600809, "learning_rate": 3.9770325192478504e-06, "loss": 0.88296568, "num_input_tokens_seen": 17265730, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.59375, "step": 805, "time_per_iteration": 2.633993625640869 }, { "auxiliary_loss_clip": 0.01218845, "auxiliary_loss_mlp": 0.01064359, "balance_loss_clip": 1.03845453, "balance_loss_mlp": 1.06215215, "epoch": 0.048459341650383285, "flos": 24827452905600.0, "grad_norm": 1.8502511532647603, "language_loss": 0.67732191, "learning_rate": 3.9769753895230324e-06, "loss": 0.70015389, "num_input_tokens_seen": 17284820, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.5703125, "step": 806, "time_per_iteration": 5.631593704223633 }, { "auxiliary_loss_clip": 0.012162, "auxiliary_loss_mlp": 0.0105833, "balance_loss_clip": 1.03212762, "balance_loss_mlp": 1.06229854, "epoch": 0.04851946490305126, "flos": 22163204396160.0, "grad_norm": 2.3776930325971914, "language_loss": 0.76754928, "learning_rate": 3.976918189245049e-06, "loss": 0.79029465, "num_input_tokens_seen": 17305085, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.5390625, "step": 807, "time_per_iteration": 2.5452959537506104 }, { "auxiliary_loss_clip": 0.01214295, "auxiliary_loss_mlp": 0.01068299, "balance_loss_clip": 1.04229891, "balance_loss_mlp": 1.05892563, "epoch": 0.04857958815571922, "flos": 19792022952960.0, "grad_norm": 2.513251492208562, "language_loss": 0.86542022, "learning_rate": 3.9768609184159405e-06, "loss": 0.88824606, "num_input_tokens_seen": 17322715, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.5546875, "step": 808, "time_per_iteration": 3.9438438415527344 }, { "auxiliary_loss_clip": 0.01223098, "auxiliary_loss_mlp": 0.01063175, "balance_loss_clip": 1.03673387, "balance_loss_mlp": 1.06220007, "epoch": 0.048639711408387194, "flos": 18697968552960.0, "grad_norm": 1.7558063897478804, "language_loss": 0.8980208, "learning_rate": 3.976803577037751e-06, "loss": 0.92088354, "num_input_tokens_seen": 17341455, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.609375, "step": 809, "time_per_iteration": 4.182870626449585 }, { "auxiliary_loss_clip": 0.01222834, "auxiliary_loss_mlp": 0.0106486, "balance_loss_clip": 1.03795409, "balance_loss_mlp": 1.06499004, "epoch": 0.048699834661055166, "flos": 24863507182080.0, "grad_norm": 1.924673669071918, "language_loss": 0.84309274, "learning_rate": 3.976746165112527e-06, "loss": 0.86596966, "num_input_tokens_seen": 17360765, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.578125, "step": 810, "time_per_iteration": 2.566676616668701 }, { "auxiliary_loss_clip": 0.01221743, "auxiliary_loss_mlp": 0.01063524, "balance_loss_clip": 1.03688085, "balance_loss_mlp": 1.06188035, "epoch": 0.04875995791372313, "flos": 20704010290560.0, "grad_norm": 2.35011909257841, "language_loss": 0.80595249, "learning_rate": 3.976688682642317e-06, "loss": 0.82880521, "num_input_tokens_seen": 17380625, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.59375, "step": 811, "time_per_iteration": 2.61861252784729 }, { "auxiliary_loss_clip": 0.01213128, "auxiliary_loss_mlp": 0.01065327, "balance_loss_clip": 1.0403049, "balance_loss_mlp": 1.0601275, "epoch": 0.048820081166391104, "flos": 18588297352320.0, "grad_norm": 1.8216041039837787, "language_loss": 0.74260902, "learning_rate": 3.976631129629173e-06, "loss": 0.76539356, "num_input_tokens_seen": 17399355, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.53125, "step": 812, "time_per_iteration": 2.578355073928833 }, { "auxiliary_loss_clip": 0.01217964, "auxiliary_loss_mlp": 0.01076622, "balance_loss_clip": 1.05123091, "balance_loss_mlp": 1.06585872, "epoch": 0.04888020441905907, "flos": 22707322594560.0, "grad_norm": 1.8074585394537936, "language_loss": 0.89900303, "learning_rate": 3.9765735060751475e-06, "loss": 0.92194891, "num_input_tokens_seen": 17418240, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.5234375, "step": 813, "time_per_iteration": 2.635913372039795 }, { "auxiliary_loss_clip": 0.01214838, "auxiliary_loss_mlp": 0.01056112, "balance_loss_clip": 1.03100681, "balance_loss_mlp": 1.06169152, "epoch": 0.04894032767172704, "flos": 22784351310720.0, "grad_norm": 1.8517309932872184, "language_loss": 0.7440846, "learning_rate": 3.976515811982298e-06, "loss": 0.76679409, "num_input_tokens_seen": 17436250, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.53125, "step": 814, "time_per_iteration": 2.6362767219543457 }, { "auxiliary_loss_clip": 0.01219081, "auxiliary_loss_mlp": 0.01070612, "balance_loss_clip": 1.04390955, "balance_loss_mlp": 1.06218302, "epoch": 0.04900045092439501, "flos": 25516147345920.0, "grad_norm": 1.957419863160796, "language_loss": 0.83707392, "learning_rate": 3.976458047352684e-06, "loss": 0.85997087, "num_input_tokens_seen": 17455750, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.5703125, "step": 815, "time_per_iteration": 2.6504178047180176 }, { "auxiliary_loss_clip": 0.01222408, "auxiliary_loss_mlp": 0.01061549, "balance_loss_clip": 1.03529954, "balance_loss_mlp": 1.06433225, "epoch": 0.04906057417706298, "flos": 25958136199680.0, "grad_norm": 2.2381544000442686, "language_loss": 0.90559185, "learning_rate": 3.976400212188366e-06, "loss": 0.92843145, "num_input_tokens_seen": 17474995, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.578125, "step": 816, "time_per_iteration": 2.7453773021698 }, { "auxiliary_loss_clip": 0.01223062, "auxiliary_loss_mlp": 0.01059886, "balance_loss_clip": 1.03385115, "balance_loss_mlp": 1.06425428, "epoch": 0.04912069742973095, "flos": 18624638937600.0, "grad_norm": 2.079124065955999, "language_loss": 0.79839402, "learning_rate": 3.976342306491408e-06, "loss": 0.8212235, "num_input_tokens_seen": 17493395, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.59375, "step": 817, "time_per_iteration": 2.6093616485595703 }, { "auxiliary_loss_clip": 0.01217822, "auxiliary_loss_mlp": 0.01074584, "balance_loss_clip": 1.04798841, "balance_loss_mlp": 1.06318998, "epoch": 0.049180820682398915, "flos": 23699786353920.0, "grad_norm": 1.6526354209819343, "language_loss": 0.84603691, "learning_rate": 3.976284330263878e-06, "loss": 0.86896092, "num_input_tokens_seen": 17514565, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.546875, "step": 818, "time_per_iteration": 2.6574039459228516 }, { "auxiliary_loss_clip": 0.01227175, "auxiliary_loss_mlp": 0.01066787, "balance_loss_clip": 1.03964281, "balance_loss_mlp": 1.06579375, "epoch": 0.04924094393506689, "flos": 22420396753920.0, "grad_norm": 2.308249657407511, "language_loss": 0.75213832, "learning_rate": 3.976226283507843e-06, "loss": 0.77507794, "num_input_tokens_seen": 17534590, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.609375, "step": 819, "time_per_iteration": 2.6399545669555664 }, { "auxiliary_loss_clip": 0.01224474, "auxiliary_loss_mlp": 0.01063435, "balance_loss_clip": 1.03939044, "balance_loss_mlp": 1.06877375, "epoch": 0.04930106718773486, "flos": 15738246766080.0, "grad_norm": 2.4576523354535387, "language_loss": 0.84684688, "learning_rate": 3.976168166225375e-06, "loss": 0.86972594, "num_input_tokens_seen": 17551900, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.5546875, "step": 820, "time_per_iteration": 2.563364028930664 }, { "auxiliary_loss_clip": 0.01220423, "auxiliary_loss_mlp": 0.01063251, "balance_loss_clip": 1.03685808, "balance_loss_mlp": 1.06267226, "epoch": 0.049361190440402825, "flos": 26250628648320.0, "grad_norm": 2.019003085967663, "language_loss": 0.90456063, "learning_rate": 3.976109978418549e-06, "loss": 0.92739737, "num_input_tokens_seen": 17571485, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.578125, "step": 821, "time_per_iteration": 2.592754602432251 }, { "auxiliary_loss_clip": 0.01218969, "auxiliary_loss_mlp": 0.01075682, "balance_loss_clip": 1.05036211, "balance_loss_mlp": 1.06265044, "epoch": 0.0494213136930708, "flos": 21252366293760.0, "grad_norm": 2.288359132132766, "language_loss": 0.89152741, "learning_rate": 3.976051720089441e-06, "loss": 0.91447395, "num_input_tokens_seen": 17591410, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.5625, "step": 822, "time_per_iteration": 2.6135520935058594 }, { "auxiliary_loss_clip": 0.01220823, "auxiliary_loss_mlp": 0.01059655, "balance_loss_clip": 1.03191519, "balance_loss_mlp": 1.06577682, "epoch": 0.04948143694573876, "flos": 27965506740480.0, "grad_norm": 1.9157172171871082, "language_loss": 0.66871732, "learning_rate": 3.9759933912401304e-06, "loss": 0.69152212, "num_input_tokens_seen": 17612010, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.5546875, "step": 823, "time_per_iteration": 2.5529069900512695 }, { "auxiliary_loss_clip": 0.01093888, "auxiliary_loss_mlp": 0.0101337, "balance_loss_clip": 1.00905442, "balance_loss_mlp": 1.02673531, "epoch": 0.049541560198406734, "flos": 66180995533440.0, "grad_norm": 1.3762602657828822, "language_loss": 0.62239635, "learning_rate": 3.975934991872698e-06, "loss": 0.64346898, "num_input_tokens_seen": 17673430, "router_z_loss_clip": 0.04321289, "router_z_loss_mlp": 0.671875, "step": 824, "time_per_iteration": 3.1879143714904785 }, { "auxiliary_loss_clip": 0.01223678, "auxiliary_loss_mlp": 0.01070625, "balance_loss_clip": 1.04307568, "balance_loss_mlp": 1.06428552, "epoch": 0.049601683451074706, "flos": 22892693708160.0, "grad_norm": 1.8587057820337165, "language_loss": 0.90303266, "learning_rate": 3.975876521989229e-06, "loss": 0.92597568, "num_input_tokens_seen": 17689545, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.59375, "step": 825, "time_per_iteration": 2.620405912399292 }, { "auxiliary_loss_clip": 0.01221817, "auxiliary_loss_mlp": 0.01063485, "balance_loss_clip": 1.03537548, "balance_loss_mlp": 1.06359148, "epoch": 0.04966180670374267, "flos": 21433643256960.0, "grad_norm": 2.5183153351109646, "language_loss": 0.66418296, "learning_rate": 3.975817981591809e-06, "loss": 0.68703604, "num_input_tokens_seen": 17705965, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.578125, "step": 826, "time_per_iteration": 2.5806996822357178 }, { "auxiliary_loss_clip": 0.01223442, "auxiliary_loss_mlp": 0.01065564, "balance_loss_clip": 1.03875351, "balance_loss_mlp": 1.06478894, "epoch": 0.04972192995641064, "flos": 23107367341440.0, "grad_norm": 1.912504675282028, "language_loss": 0.78038239, "learning_rate": 3.975759370682528e-06, "loss": 0.80327243, "num_input_tokens_seen": 17724580, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.5859375, "step": 827, "time_per_iteration": 2.724081516265869 }, { "auxiliary_loss_clip": 0.01226352, "auxiliary_loss_mlp": 0.01077794, "balance_loss_clip": 1.05014944, "balance_loss_mlp": 1.06562054, "epoch": 0.04978205320907861, "flos": 40406147458560.0, "grad_norm": 1.6965576505095166, "language_loss": 0.78545958, "learning_rate": 3.975700689263477e-06, "loss": 0.80850101, "num_input_tokens_seen": 17747755, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.609375, "step": 828, "time_per_iteration": 2.9883389472961426 }, { "auxiliary_loss_clip": 0.01212401, "auxiliary_loss_mlp": 0.01056908, "balance_loss_clip": 1.03144479, "balance_loss_mlp": 1.05949879, "epoch": 0.04984217646174658, "flos": 25228539146880.0, "grad_norm": 2.630588224883796, "language_loss": 0.83488059, "learning_rate": 3.97564193733675e-06, "loss": 0.85757369, "num_input_tokens_seen": 17768550, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.53125, "step": 829, "time_per_iteration": 2.6204142570495605 }, { "auxiliary_loss_clip": 0.01219359, "auxiliary_loss_mlp": 0.01078704, "balance_loss_clip": 1.04861593, "balance_loss_mlp": 1.0609771, "epoch": 0.04990229971441455, "flos": 15959636242560.0, "grad_norm": 2.3832948007088466, "language_loss": 0.75154328, "learning_rate": 3.975583114904446e-06, "loss": 0.77452391, "num_input_tokens_seen": 17786080, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.5859375, "step": 830, "time_per_iteration": 2.5666749477386475 }, { "auxiliary_loss_clip": 0.0122048, "auxiliary_loss_mlp": 0.01071309, "balance_loss_clip": 1.04426038, "balance_loss_mlp": 1.06226289, "epoch": 0.04996242296708252, "flos": 18405116968320.0, "grad_norm": 2.4613828701595817, "language_loss": 0.7923488, "learning_rate": 3.975524221968661e-06, "loss": 0.81526667, "num_input_tokens_seen": 17803635, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.5859375, "step": 831, "time_per_iteration": 2.560803174972534 }, { "auxiliary_loss_clip": 0.01259129, "auxiliary_loss_mlp": 0.01071038, "balance_loss_clip": 1.04602838, "balance_loss_mlp": 1.06414771, "epoch": 0.05002254621975049, "flos": 17858053854720.0, "grad_norm": 2.5408071144637794, "language_loss": 0.91532391, "learning_rate": 3.975465258531499e-06, "loss": 0.93862557, "num_input_tokens_seen": 17822190, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.578125, "step": 832, "time_per_iteration": 2.596212148666382 }, { "auxiliary_loss_clip": 0.01233209, "auxiliary_loss_mlp": 0.01077842, "balance_loss_clip": 1.05286789, "balance_loss_mlp": 1.06261575, "epoch": 0.050082669472418455, "flos": 45660273367680.0, "grad_norm": 1.901340797538477, "language_loss": 0.83006704, "learning_rate": 3.9754062245950625e-06, "loss": 0.85317755, "num_input_tokens_seen": 17846915, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.5234375, "step": 833, "time_per_iteration": 2.818922519683838 }, { "auxiliary_loss_clip": 0.01238705, "auxiliary_loss_mlp": 0.01063143, "balance_loss_clip": 1.03572512, "balance_loss_mlp": 1.0619365, "epoch": 0.05014279272508643, "flos": 37962067363200.0, "grad_norm": 2.218390400557543, "language_loss": 0.82524753, "learning_rate": 3.975347120161459e-06, "loss": 0.84826601, "num_input_tokens_seen": 17867270, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.5859375, "step": 834, "time_per_iteration": 2.7465226650238037 }, { "auxiliary_loss_clip": 0.01232804, "auxiliary_loss_mlp": 0.01066072, "balance_loss_clip": 1.03840351, "balance_loss_mlp": 1.06418633, "epoch": 0.0502029159777544, "flos": 20996179516800.0, "grad_norm": 2.380877770173438, "language_loss": 0.91539979, "learning_rate": 3.975287945232799e-06, "loss": 0.93838853, "num_input_tokens_seen": 17884880, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.59375, "step": 835, "time_per_iteration": 2.574397325515747 }, { "auxiliary_loss_clip": 0.01218479, "auxiliary_loss_mlp": 0.01070985, "balance_loss_clip": 1.04254127, "balance_loss_mlp": 1.05822361, "epoch": 0.050263039230422364, "flos": 15888066393600.0, "grad_norm": 2.3533977426708157, "language_loss": 0.75979775, "learning_rate": 3.975228699811193e-06, "loss": 0.78269243, "num_input_tokens_seen": 17903695, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.6015625, "step": 836, "time_per_iteration": 2.6411874294281006 }, { "auxiliary_loss_clip": 0.01223726, "auxiliary_loss_mlp": 0.01060442, "balance_loss_clip": 1.03491974, "balance_loss_mlp": 1.0622977, "epoch": 0.050323162483090336, "flos": 23732752060800.0, "grad_norm": 2.2458028594069726, "language_loss": 0.83600724, "learning_rate": 3.975169383898755e-06, "loss": 0.85884893, "num_input_tokens_seen": 17920745, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.515625, "step": 837, "time_per_iteration": 2.729757785797119 }, { "auxiliary_loss_clip": 0.01236173, "auxiliary_loss_mlp": 0.01317053, "balance_loss_clip": 1.04209363, "balance_loss_mlp": 1.06394851, "epoch": 0.0503832857357583, "flos": 20266223328000.0, "grad_norm": 2.426187109097369, "language_loss": 0.73090374, "learning_rate": 3.975109997497604e-06, "loss": 0.75643599, "num_input_tokens_seen": 17938220, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.546875, "step": 838, "time_per_iteration": 2.6206214427948 }, { "auxiliary_loss_clip": 0.01234799, "auxiliary_loss_mlp": 0.01069783, "balance_loss_clip": 1.04273415, "balance_loss_mlp": 1.06020832, "epoch": 0.05044340898842627, "flos": 17785011548160.0, "grad_norm": 1.8306537799954374, "language_loss": 0.83201313, "learning_rate": 3.975050540609857e-06, "loss": 0.85505891, "num_input_tokens_seen": 17957325, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.5625, "step": 839, "time_per_iteration": 2.7347474098205566 }, { "auxiliary_loss_clip": 0.01216024, "auxiliary_loss_mlp": 0.01064689, "balance_loss_clip": 1.03954816, "balance_loss_mlp": 1.06207037, "epoch": 0.050503532241094246, "flos": 22966526113920.0, "grad_norm": 1.9620474509327226, "language_loss": 0.8516984, "learning_rate": 3.9749910132376355e-06, "loss": 0.87450552, "num_input_tokens_seen": 17975875, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.5390625, "step": 840, "time_per_iteration": 2.5862507820129395 }, { "auxiliary_loss_clip": 0.01238375, "auxiliary_loss_mlp": 0.01060514, "balance_loss_clip": 1.03273809, "balance_loss_mlp": 1.06356406, "epoch": 0.05056365549376221, "flos": 22776989022720.0, "grad_norm": 1.744780567681202, "language_loss": 0.94125366, "learning_rate": 3.974931415383066e-06, "loss": 0.96424258, "num_input_tokens_seen": 17994340, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.5625, "step": 841, "time_per_iteration": 2.6286070346832275 }, { "auxiliary_loss_clip": 0.01216454, "auxiliary_loss_mlp": 0.01067247, "balance_loss_clip": 1.0393281, "balance_loss_mlp": 1.06097054, "epoch": 0.05062377874643018, "flos": 30916968399360.0, "grad_norm": 2.176196335615235, "language_loss": 0.77011275, "learning_rate": 3.974871747048274e-06, "loss": 0.7929498, "num_input_tokens_seen": 18015260, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.5546875, "step": 842, "time_per_iteration": 2.7548210620880127 }, { "auxiliary_loss_clip": 0.01232784, "auxiliary_loss_mlp": 0.01074409, "balance_loss_clip": 1.04710984, "balance_loss_mlp": 1.0642128, "epoch": 0.05068390199909815, "flos": 19647159402240.0, "grad_norm": 2.0257149068911957, "language_loss": 0.78161442, "learning_rate": 3.97481200823539e-06, "loss": 0.80468643, "num_input_tokens_seen": 18033960, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.59375, "step": 843, "time_per_iteration": 2.5499730110168457 }, { "auxiliary_loss_clip": 0.01232559, "auxiliary_loss_mlp": 0.01062425, "balance_loss_clip": 1.03736722, "balance_loss_mlp": 1.06586289, "epoch": 0.05074402525176612, "flos": 37962103276800.0, "grad_norm": 1.8782756674822956, "language_loss": 0.82937777, "learning_rate": 3.974752198946545e-06, "loss": 0.85232753, "num_input_tokens_seen": 18056700, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.578125, "step": 844, "time_per_iteration": 2.7477023601531982 }, { "auxiliary_loss_clip": 0.01225687, "auxiliary_loss_mlp": 0.01061858, "balance_loss_clip": 1.03631151, "balance_loss_mlp": 1.06172442, "epoch": 0.05080414850443409, "flos": 22054610603520.0, "grad_norm": 2.644219541359473, "language_loss": 0.76928109, "learning_rate": 3.974692319183873e-06, "loss": 0.79215658, "num_input_tokens_seen": 18075815, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.546875, "step": 845, "time_per_iteration": 2.558526039123535 }, { "auxiliary_loss_clip": 0.01223354, "auxiliary_loss_mlp": 0.01068144, "balance_loss_clip": 1.04147696, "balance_loss_mlp": 1.05844593, "epoch": 0.05086427175710206, "flos": 20225787592320.0, "grad_norm": 1.7730222380963443, "language_loss": 0.87437266, "learning_rate": 3.974632368949513e-06, "loss": 0.89728761, "num_input_tokens_seen": 18095095, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.5625, "step": 846, "time_per_iteration": 2.6144955158233643 }, { "auxiliary_loss_clip": 0.01235504, "auxiliary_loss_mlp": 0.01068469, "balance_loss_clip": 1.04245722, "balance_loss_mlp": 1.06264186, "epoch": 0.05092439500977003, "flos": 15159223526400.0, "grad_norm": 1.723088870189485, "language_loss": 0.87243199, "learning_rate": 3.974572348245602e-06, "loss": 0.89547175, "num_input_tokens_seen": 18112675, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.546875, "step": 847, "time_per_iteration": 2.593432903289795 }, { "auxiliary_loss_clip": 0.01248353, "auxiliary_loss_mlp": 0.01060798, "balance_loss_clip": 1.03427386, "balance_loss_mlp": 1.05960429, "epoch": 0.050984518262437994, "flos": 22055149307520.0, "grad_norm": 2.3666185032472082, "language_loss": 0.81986654, "learning_rate": 3.974512257074284e-06, "loss": 0.84295809, "num_input_tokens_seen": 18130745, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.5234375, "step": 848, "time_per_iteration": 5.542198896408081 }, { "auxiliary_loss_clip": 0.01218123, "auxiliary_loss_mlp": 0.01070853, "balance_loss_clip": 1.04491293, "balance_loss_mlp": 1.06498647, "epoch": 0.05104464151510597, "flos": 30225329043840.0, "grad_norm": 1.6899789058428758, "language_loss": 0.87021846, "learning_rate": 3.974452095437701e-06, "loss": 0.89310825, "num_input_tokens_seen": 18152410, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.53125, "step": 849, "time_per_iteration": 4.165936470031738 }, { "auxiliary_loss_clip": 0.0121203, "auxiliary_loss_mlp": 0.01059094, "balance_loss_clip": 1.03297484, "balance_loss_mlp": 1.05894923, "epoch": 0.05110476476777394, "flos": 18332900674560.0, "grad_norm": 1.9537278902271145, "language_loss": 0.82805133, "learning_rate": 3.974391863338003e-06, "loss": 0.85076255, "num_input_tokens_seen": 18170870, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.53125, "step": 850, "time_per_iteration": 4.043461799621582 }, { "auxiliary_loss_clip": 0.01225651, "auxiliary_loss_mlp": 0.01063948, "balance_loss_clip": 1.03813875, "balance_loss_mlp": 1.0612855, "epoch": 0.051164888020441904, "flos": 37998732170880.0, "grad_norm": 2.1324500220240874, "language_loss": 0.6454587, "learning_rate": 3.974331560777338e-06, "loss": 0.66835475, "num_input_tokens_seen": 18191555, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.5546875, "step": 851, "time_per_iteration": 2.797919511795044 }, { "auxiliary_loss_clip": 0.01223198, "auxiliary_loss_mlp": 0.01069865, "balance_loss_clip": 1.04160035, "balance_loss_mlp": 1.05901754, "epoch": 0.051225011273109876, "flos": 23038634666880.0, "grad_norm": 3.056257527965623, "language_loss": 0.83253539, "learning_rate": 3.974271187757857e-06, "loss": 0.85546601, "num_input_tokens_seen": 18208620, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.5546875, "step": 852, "time_per_iteration": 2.627997398376465 }, { "auxiliary_loss_clip": 0.01233799, "auxiliary_loss_mlp": 0.01070971, "balance_loss_clip": 1.04363632, "balance_loss_mlp": 1.05879498, "epoch": 0.05128513452577785, "flos": 18259822454400.0, "grad_norm": 2.0679405825930957, "language_loss": 0.80113786, "learning_rate": 3.974210744281717e-06, "loss": 0.82418549, "num_input_tokens_seen": 18226370, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.5625, "step": 853, "time_per_iteration": 2.6963000297546387 }, { "auxiliary_loss_clip": 0.0121136, "auxiliary_loss_mlp": 0.01065815, "balance_loss_clip": 1.03823042, "balance_loss_mlp": 1.05798411, "epoch": 0.05134525777844581, "flos": 27198957571200.0, "grad_norm": 2.5052246935955504, "language_loss": 0.75307345, "learning_rate": 3.974150230351074e-06, "loss": 0.77584529, "num_input_tokens_seen": 18247075, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.53125, "step": 854, "time_per_iteration": 2.677424192428589 }, { "auxiliary_loss_clip": 0.01236149, "auxiliary_loss_mlp": 0.01066886, "balance_loss_clip": 1.03940868, "balance_loss_mlp": 1.06206846, "epoch": 0.051405381031113785, "flos": 28362247436160.0, "grad_norm": 2.4448792321714725, "language_loss": 0.81267667, "learning_rate": 3.974089645968087e-06, "loss": 0.83570707, "num_input_tokens_seen": 18265680, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.5546875, "step": 855, "time_per_iteration": 2.749377727508545 }, { "auxiliary_loss_clip": 0.01120756, "auxiliary_loss_mlp": 0.0125796, "balance_loss_clip": 1.00444698, "balance_loss_mlp": 1.02410209, "epoch": 0.05146550428378175, "flos": 65618169327360.0, "grad_norm": 0.9710582923911042, "language_loss": 0.65637529, "learning_rate": 3.974028991134917e-06, "loss": 0.68016255, "num_input_tokens_seen": 18327015, "router_z_loss_clip": 0.04321289, "router_z_loss_mlp": 0.69140625, "step": 856, "time_per_iteration": 3.214327335357666 }, { "auxiliary_loss_clip": 0.01229661, "auxiliary_loss_mlp": 0.0105489, "balance_loss_clip": 1.0281986, "balance_loss_mlp": 1.06052995, "epoch": 0.05152562753644972, "flos": 22054861998720.0, "grad_norm": 2.303465996025532, "language_loss": 0.76883912, "learning_rate": 3.973968265853732e-06, "loss": 0.79168463, "num_input_tokens_seen": 18345235, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.5078125, "step": 857, "time_per_iteration": 2.7630615234375 }, { "auxiliary_loss_clip": 0.01240249, "auxiliary_loss_mlp": 0.01056067, "balance_loss_clip": 1.02901816, "balance_loss_mlp": 1.05839443, "epoch": 0.051585750789117694, "flos": 18509544783360.0, "grad_norm": 2.091810394761215, "language_loss": 0.88861787, "learning_rate": 3.973907470126697e-06, "loss": 0.91158104, "num_input_tokens_seen": 18362350, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.5390625, "step": 858, "time_per_iteration": 2.6002280712127686 }, { "auxiliary_loss_clip": 0.01231739, "auxiliary_loss_mlp": 0.01055987, "balance_loss_clip": 1.02927232, "balance_loss_mlp": 1.06059563, "epoch": 0.05164587404178566, "flos": 23730238108800.0, "grad_norm": 2.169205187727112, "language_loss": 0.75438732, "learning_rate": 3.973846603955982e-06, "loss": 0.7772646, "num_input_tokens_seen": 18383390, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.53125, "step": 859, "time_per_iteration": 2.679534435272217 }, { "auxiliary_loss_clip": 0.01242204, "auxiliary_loss_mlp": 0.01071839, "balance_loss_clip": 1.04336023, "balance_loss_mlp": 1.06182647, "epoch": 0.05170599729445363, "flos": 16252882876800.0, "grad_norm": 2.632372417803763, "language_loss": 0.9062869, "learning_rate": 3.973785667343758e-06, "loss": 0.92942739, "num_input_tokens_seen": 18399220, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.6171875, "step": 860, "time_per_iteration": 2.691744565963745 }, { "auxiliary_loss_clip": 0.01216832, "auxiliary_loss_mlp": 0.01061351, "balance_loss_clip": 1.03601933, "balance_loss_mlp": 1.06250536, "epoch": 0.0517661205471216, "flos": 23985922095360.0, "grad_norm": 1.9055418146337222, "language_loss": 0.82158178, "learning_rate": 3.973724660292202e-06, "loss": 0.84436357, "num_input_tokens_seen": 18419005, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.546875, "step": 861, "time_per_iteration": 2.691981315612793 }, { "auxiliary_loss_clip": 0.012335, "auxiliary_loss_mlp": 0.01058843, "balance_loss_clip": 1.03309357, "balance_loss_mlp": 1.05936301, "epoch": 0.05182624379978957, "flos": 29277718392960.0, "grad_norm": 1.7032809353873806, "language_loss": 0.78416812, "learning_rate": 3.973663582803489e-06, "loss": 0.80709153, "num_input_tokens_seen": 18440550, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.5625, "step": 862, "time_per_iteration": 2.6832149028778076 }, { "auxiliary_loss_clip": 0.01229049, "auxiliary_loss_mlp": 0.01064806, "balance_loss_clip": 1.0401057, "balance_loss_mlp": 1.0611856, "epoch": 0.05188636705245754, "flos": 24170826332160.0, "grad_norm": 1.934757941310735, "language_loss": 0.88867545, "learning_rate": 3.9736024348798e-06, "loss": 0.911614, "num_input_tokens_seen": 18461950, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 1.5, "step": 863, "time_per_iteration": 2.6468663215637207 }, { "auxiliary_loss_clip": 0.01235558, "auxiliary_loss_mlp": 0.01065463, "balance_loss_clip": 1.03766334, "balance_loss_mlp": 1.06208682, "epoch": 0.051946490305125506, "flos": 26760703731840.0, "grad_norm": 2.021004176035624, "language_loss": 0.75245571, "learning_rate": 3.973541216523316e-06, "loss": 0.77546597, "num_input_tokens_seen": 18480555, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.5546875, "step": 864, "time_per_iteration": 2.6947853565216064 }, { "auxiliary_loss_clip": 0.01235224, "auxiliary_loss_mlp": 0.01067678, "balance_loss_clip": 1.04116583, "balance_loss_mlp": 1.06130087, "epoch": 0.05200661355779348, "flos": 21502519585920.0, "grad_norm": 2.0019449956916535, "language_loss": 0.78885412, "learning_rate": 3.973479927736224e-06, "loss": 0.81188315, "num_input_tokens_seen": 18499645, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.5546875, "step": 865, "time_per_iteration": 2.5963025093078613 }, { "auxiliary_loss_clip": 0.0122383, "auxiliary_loss_mlp": 0.01064148, "balance_loss_clip": 1.03762364, "balance_loss_mlp": 1.05932021, "epoch": 0.05206673681046144, "flos": 18114492026880.0, "grad_norm": 2.7522296827852246, "language_loss": 0.85965323, "learning_rate": 3.973418568520709e-06, "loss": 0.88253301, "num_input_tokens_seen": 18516810, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.5546875, "step": 866, "time_per_iteration": 2.652806043624878 }, { "auxiliary_loss_clip": 0.01245624, "auxiliary_loss_mlp": 0.01063441, "balance_loss_clip": 1.03742993, "balance_loss_mlp": 1.06524706, "epoch": 0.052126860063129415, "flos": 17524191916800.0, "grad_norm": 2.8335733059692263, "language_loss": 0.87115514, "learning_rate": 3.973357138878961e-06, "loss": 0.89424574, "num_input_tokens_seen": 18532510, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.53125, "step": 867, "time_per_iteration": 2.6996090412139893 }, { "auxiliary_loss_clip": 0.01238967, "auxiliary_loss_mlp": 0.01075536, "balance_loss_clip": 1.05198026, "balance_loss_mlp": 1.0630219, "epoch": 0.05218698331579739, "flos": 32598054771840.0, "grad_norm": 1.531635955540602, "language_loss": 0.63623619, "learning_rate": 3.973295638813174e-06, "loss": 0.65938127, "num_input_tokens_seen": 18557380, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.484375, "step": 868, "time_per_iteration": 2.8135530948638916 }, { "auxiliary_loss_clip": 0.01215597, "auxiliary_loss_mlp": 0.01065189, "balance_loss_clip": 1.03748465, "balance_loss_mlp": 1.05976629, "epoch": 0.05224710656846535, "flos": 22127293774080.0, "grad_norm": 1.966531137038951, "language_loss": 0.8310625, "learning_rate": 3.973234068325541e-06, "loss": 0.85387033, "num_input_tokens_seen": 18575720, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.5546875, "step": 869, "time_per_iteration": 2.5431926250457764 }, { "auxiliary_loss_clip": 0.01227292, "auxiliary_loss_mlp": 0.0105899, "balance_loss_clip": 1.03371775, "balance_loss_mlp": 1.0608964, "epoch": 0.052307229821133325, "flos": 11145092976000.0, "grad_norm": 2.0397533156049077, "language_loss": 0.87486124, "learning_rate": 3.973172427418259e-06, "loss": 0.89772403, "num_input_tokens_seen": 18592185, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.5703125, "step": 870, "time_per_iteration": 2.594677686691284 }, { "auxiliary_loss_clip": 0.01215178, "auxiliary_loss_mlp": 0.01063425, "balance_loss_clip": 1.03861749, "balance_loss_mlp": 1.06151307, "epoch": 0.05236735307380129, "flos": 19128070005120.0, "grad_norm": 1.986261548761154, "language_loss": 0.80641806, "learning_rate": 3.97311071609353e-06, "loss": 0.82920408, "num_input_tokens_seen": 18609560, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.5390625, "step": 871, "time_per_iteration": 2.5981903076171875 }, { "auxiliary_loss_clip": 0.01232681, "auxiliary_loss_mlp": 0.01060764, "balance_loss_clip": 1.03617072, "balance_loss_mlp": 1.06280923, "epoch": 0.05242747632646926, "flos": 20960663944320.0, "grad_norm": 1.8954857172168933, "language_loss": 0.81541121, "learning_rate": 3.973048934353554e-06, "loss": 0.83834571, "num_input_tokens_seen": 18629405, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.515625, "step": 872, "time_per_iteration": 2.6272947788238525 }, { "auxiliary_loss_clip": 0.01106705, "auxiliary_loss_mlp": 0.01038291, "balance_loss_clip": 1.03395212, "balance_loss_mlp": 1.02141881, "epoch": 0.052487599579137234, "flos": 65020542842880.0, "grad_norm": 1.4169061097315896, "language_loss": 0.6168412, "learning_rate": 3.972987082200538e-06, "loss": 0.63829112, "num_input_tokens_seen": 18681480, "router_z_loss_clip": 0.04345703, "router_z_loss_mlp": 0.66796875, "step": 873, "time_per_iteration": 3.0818305015563965 }, { "auxiliary_loss_clip": 0.01231873, "auxiliary_loss_mlp": 0.01059154, "balance_loss_clip": 1.03409624, "balance_loss_mlp": 1.06047797, "epoch": 0.0525477228318052, "flos": 23288859786240.0, "grad_norm": 1.88602925140777, "language_loss": 0.8876462, "learning_rate": 3.972925159636687e-06, "loss": 0.91055644, "num_input_tokens_seen": 18700390, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.53125, "step": 874, "time_per_iteration": 2.646359443664551 }, { "auxiliary_loss_clip": 0.01231681, "auxiliary_loss_mlp": 0.01062802, "balance_loss_clip": 1.03748238, "balance_loss_mlp": 1.05889988, "epoch": 0.05260784608447317, "flos": 32230221546240.0, "grad_norm": 1.9022846163185487, "language_loss": 0.7439267, "learning_rate": 3.972863166664212e-06, "loss": 0.76687151, "num_input_tokens_seen": 18721280, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.546875, "step": 875, "time_per_iteration": 2.722834587097168 }, { "auxiliary_loss_clip": 0.01219464, "auxiliary_loss_mlp": 0.0105705, "balance_loss_clip": 1.03031111, "balance_loss_mlp": 1.05895078, "epoch": 0.052667969337141136, "flos": 24463211040000.0, "grad_norm": 1.8412203254791206, "language_loss": 0.92760664, "learning_rate": 3.972801103285326e-06, "loss": 0.9503718, "num_input_tokens_seen": 18741545, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.515625, "step": 876, "time_per_iteration": 2.6882290840148926 }, { "auxiliary_loss_clip": 0.01085229, "auxiliary_loss_mlp": 0.01010208, "balance_loss_clip": 1.00579739, "balance_loss_mlp": 1.02015352, "epoch": 0.05272809258980911, "flos": 57784329567360.0, "grad_norm": 0.8172437914030547, "language_loss": 0.6301527, "learning_rate": 3.9727389695022434e-06, "loss": 0.65110707, "num_input_tokens_seen": 18801400, "router_z_loss_clip": 0.04418945, "router_z_loss_mlp": 0.6484375, "step": 877, "time_per_iteration": 3.1462361812591553 }, { "auxiliary_loss_clip": 0.01229594, "auxiliary_loss_mlp": 0.0106721, "balance_loss_clip": 1.04043508, "balance_loss_mlp": 1.05806708, "epoch": 0.05278821584247708, "flos": 17420805596160.0, "grad_norm": 2.4439501635482914, "language_loss": 0.85351068, "learning_rate": 3.972676765317181e-06, "loss": 0.87647867, "num_input_tokens_seen": 18819670, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.53125, "step": 878, "time_per_iteration": 2.5518856048583984 }, { "auxiliary_loss_clip": 0.01213364, "auxiliary_loss_mlp": 0.01057356, "balance_loss_clip": 1.03166616, "balance_loss_mlp": 1.05870104, "epoch": 0.052848339095145046, "flos": 26137258346880.0, "grad_norm": 1.9335076897908565, "language_loss": 0.82774985, "learning_rate": 3.97261449073236e-06, "loss": 0.85045707, "num_input_tokens_seen": 18840580, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.546875, "step": 879, "time_per_iteration": 2.636770009994507 }, { "auxiliary_loss_clip": 0.0122812, "auxiliary_loss_mlp": 0.01067978, "balance_loss_clip": 1.0425148, "balance_loss_mlp": 1.05849814, "epoch": 0.05290846234781302, "flos": 16472081623680.0, "grad_norm": 2.238216028501252, "language_loss": 0.8411727, "learning_rate": 3.9725521457500005e-06, "loss": 0.86413372, "num_input_tokens_seen": 18859295, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.515625, "step": 880, "time_per_iteration": 2.610524892807007 }, { "auxiliary_loss_clip": 0.01243802, "auxiliary_loss_mlp": 0.01065569, "balance_loss_clip": 1.04001093, "balance_loss_mlp": 1.06160116, "epoch": 0.05296858560048098, "flos": 19865173000320.0, "grad_norm": 2.145785155152331, "language_loss": 0.86306036, "learning_rate": 3.97248973037233e-06, "loss": 0.88615406, "num_input_tokens_seen": 18877485, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.546875, "step": 881, "time_per_iteration": 2.664766788482666 }, { "auxiliary_loss_clip": 0.01219994, "auxiliary_loss_mlp": 0.01066086, "balance_loss_clip": 1.03953815, "balance_loss_mlp": 1.05746686, "epoch": 0.053028708853148955, "flos": 24388588535040.0, "grad_norm": 1.7943770098052925, "language_loss": 0.87734115, "learning_rate": 3.972427244601574e-06, "loss": 0.90020198, "num_input_tokens_seen": 18898275, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.53125, "step": 882, "time_per_iteration": 2.6326236724853516 }, { "auxiliary_loss_clip": 0.01214998, "auxiliary_loss_mlp": 0.0105822, "balance_loss_clip": 1.0316602, "balance_loss_mlp": 1.0586257, "epoch": 0.05308883210581693, "flos": 36393166143360.0, "grad_norm": 2.746438225383856, "language_loss": 0.68908763, "learning_rate": 3.972364688439964e-06, "loss": 0.71181983, "num_input_tokens_seen": 18920665, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.5625, "step": 883, "time_per_iteration": 2.789663076400757 }, { "auxiliary_loss_clip": 0.01237874, "auxiliary_loss_mlp": 0.01057154, "balance_loss_clip": 1.03295481, "balance_loss_mlp": 1.06028545, "epoch": 0.05314895535848489, "flos": 22855095146880.0, "grad_norm": 3.021817592698074, "language_loss": 0.7585482, "learning_rate": 3.9723020618897325e-06, "loss": 0.78149843, "num_input_tokens_seen": 18939835, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.5, "step": 884, "time_per_iteration": 2.6227755546569824 }, { "auxiliary_loss_clip": 0.01246685, "auxiliary_loss_mlp": 0.01060356, "balance_loss_clip": 1.03588223, "balance_loss_mlp": 1.06049061, "epoch": 0.053209078611152864, "flos": 12860330204160.0, "grad_norm": 2.0019515476542167, "language_loss": 0.85247546, "learning_rate": 3.972239364953113e-06, "loss": 0.87554586, "num_input_tokens_seen": 18958405, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.5, "step": 885, "time_per_iteration": 2.6027536392211914 }, { "auxiliary_loss_clip": 0.01230795, "auxiliary_loss_mlp": 0.01058355, "balance_loss_clip": 1.03293943, "balance_loss_mlp": 1.0582453, "epoch": 0.05326920186382083, "flos": 12164596698240.0, "grad_norm": 3.781721230284744, "language_loss": 0.86039507, "learning_rate": 3.9721765976323435e-06, "loss": 0.8832866, "num_input_tokens_seen": 18975445, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.5390625, "step": 886, "time_per_iteration": 2.6296181678771973 }, { "auxiliary_loss_clip": 0.01237781, "auxiliary_loss_mlp": 0.01063813, "balance_loss_clip": 1.03718114, "balance_loss_mlp": 1.0592072, "epoch": 0.0533293251164888, "flos": 22704485420160.0, "grad_norm": 2.0773394475161564, "language_loss": 0.89117527, "learning_rate": 3.972113759929665e-06, "loss": 0.91419113, "num_input_tokens_seen": 18991930, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.515625, "step": 887, "time_per_iteration": 2.633816719055176 }, { "auxiliary_loss_clip": 0.01245935, "auxiliary_loss_mlp": 0.01065882, "balance_loss_clip": 1.03870213, "balance_loss_mlp": 1.05560958, "epoch": 0.053389448369156774, "flos": 26940939200640.0, "grad_norm": 1.8500688305386483, "language_loss": 0.74860632, "learning_rate": 3.9720508518473186e-06, "loss": 0.77172446, "num_input_tokens_seen": 19009790, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.53125, "step": 888, "time_per_iteration": 2.7740957736968994 }, { "auxiliary_loss_clip": 0.01246651, "auxiliary_loss_mlp": 0.01070011, "balance_loss_clip": 1.04284286, "balance_loss_mlp": 1.05848587, "epoch": 0.05344957162182474, "flos": 25556331686400.0, "grad_norm": 1.853578408885756, "language_loss": 0.88568771, "learning_rate": 3.97198787338755e-06, "loss": 0.90885437, "num_input_tokens_seen": 19030170, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.515625, "step": 889, "time_per_iteration": 5.7096946239471436 }, { "auxiliary_loss_clip": 0.01206274, "auxiliary_loss_mlp": 0.0104921, "balance_loss_clip": 1.02451015, "balance_loss_mlp": 1.05739737, "epoch": 0.05350969487449271, "flos": 19719591177600.0, "grad_norm": 2.298715981716306, "language_loss": 0.87908089, "learning_rate": 3.971924824552607e-06, "loss": 0.90163577, "num_input_tokens_seen": 19048075, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 1.484375, "step": 890, "time_per_iteration": 4.197216510772705 }, { "auxiliary_loss_clip": 0.0122855, "auxiliary_loss_mlp": 0.01062465, "balance_loss_clip": 1.03874278, "balance_loss_mlp": 1.05876017, "epoch": 0.053569818127160676, "flos": 27016351804800.0, "grad_norm": 2.2143661514492172, "language_loss": 0.92851686, "learning_rate": 3.97186170534474e-06, "loss": 0.95142698, "num_input_tokens_seen": 19067465, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.515625, "step": 891, "time_per_iteration": 2.7676618099212646 }, { "auxiliary_loss_clip": 0.01217949, "auxiliary_loss_mlp": 0.01061007, "balance_loss_clip": 1.03469753, "balance_loss_mlp": 1.05821741, "epoch": 0.05362994137982865, "flos": 13188338225280.0, "grad_norm": 2.145481312436288, "language_loss": 0.71635282, "learning_rate": 3.9717985157662e-06, "loss": 0.73914242, "num_input_tokens_seen": 19085505, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.5078125, "step": 892, "time_per_iteration": 4.9363884925842285 }, { "auxiliary_loss_clip": 0.01228132, "auxiliary_loss_mlp": 0.01315961, "balance_loss_clip": 1.04256082, "balance_loss_mlp": 1.05767441, "epoch": 0.05369006463249662, "flos": 28658008022400.0, "grad_norm": 30.073399256599945, "language_loss": 0.82462549, "learning_rate": 3.971735255819244e-06, "loss": 0.85006648, "num_input_tokens_seen": 19104360, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.515625, "step": 893, "time_per_iteration": 2.6858503818511963 }, { "auxiliary_loss_clip": 0.01229172, "auxiliary_loss_mlp": 0.010594, "balance_loss_clip": 1.0354389, "balance_loss_mlp": 1.05736291, "epoch": 0.053750187885164585, "flos": 28913153304960.0, "grad_norm": 2.302244570289395, "language_loss": 0.81490564, "learning_rate": 3.971671925506129e-06, "loss": 0.83779132, "num_input_tokens_seen": 19124680, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.53125, "step": 894, "time_per_iteration": 2.691497802734375 }, { "auxiliary_loss_clip": 0.01217672, "auxiliary_loss_mlp": 0.01059541, "balance_loss_clip": 1.03467453, "balance_loss_mlp": 1.05805206, "epoch": 0.05381031113783256, "flos": 15158828476800.0, "grad_norm": 2.43415745348969, "language_loss": 0.75529784, "learning_rate": 3.9716085248291125e-06, "loss": 0.77807003, "num_input_tokens_seen": 19142895, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.5, "step": 895, "time_per_iteration": 2.648869752883911 }, { "auxiliary_loss_clip": 0.01237544, "auxiliary_loss_mlp": 0.0105907, "balance_loss_clip": 1.03413153, "balance_loss_mlp": 1.06214643, "epoch": 0.05387043439050053, "flos": 21835232288640.0, "grad_norm": 2.0487453932300927, "language_loss": 0.86341375, "learning_rate": 3.97154505379046e-06, "loss": 0.88637996, "num_input_tokens_seen": 19163125, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.5703125, "step": 896, "time_per_iteration": 2.628422975540161 }, { "auxiliary_loss_clip": 0.01213306, "auxiliary_loss_mlp": 0.01307478, "balance_loss_clip": 1.03216887, "balance_loss_mlp": 1.05814528, "epoch": 0.053930557643168495, "flos": 17310308382720.0, "grad_norm": 2.3261964246105693, "language_loss": 0.87587571, "learning_rate": 3.971481512392438e-06, "loss": 0.90108353, "num_input_tokens_seen": 19179385, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.5546875, "step": 897, "time_per_iteration": 2.5931026935577393 }, { "auxiliary_loss_clip": 0.01219992, "auxiliary_loss_mlp": 0.0106141, "balance_loss_clip": 1.03411138, "balance_loss_mlp": 1.05931795, "epoch": 0.05399068089583647, "flos": 17348481561600.0, "grad_norm": 1.734324324460211, "language_loss": 0.90146244, "learning_rate": 3.97141790063731e-06, "loss": 0.92427647, "num_input_tokens_seen": 19198725, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.515625, "step": 898, "time_per_iteration": 2.5758047103881836 }, { "auxiliary_loss_clip": 0.01235094, "auxiliary_loss_mlp": 0.01074819, "balance_loss_clip": 1.04979718, "balance_loss_mlp": 1.06044674, "epoch": 0.05405080414850443, "flos": 17486952491520.0, "grad_norm": 4.825430797055667, "language_loss": 0.91927928, "learning_rate": 3.971354218527349e-06, "loss": 0.9423784, "num_input_tokens_seen": 19212380, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.5625, "step": 899, "time_per_iteration": 2.6206767559051514 }, { "auxiliary_loss_clip": 0.01226413, "auxiliary_loss_mlp": 0.01065195, "balance_loss_clip": 1.04095936, "balance_loss_mlp": 1.05785358, "epoch": 0.054110927401172404, "flos": 24496787278080.0, "grad_norm": 6.215766678443315, "language_loss": 0.75527829, "learning_rate": 3.971290466064827e-06, "loss": 0.77819443, "num_input_tokens_seen": 19232235, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.5, "step": 900, "time_per_iteration": 2.6610517501831055 }, { "auxiliary_loss_clip": 0.01222362, "auxiliary_loss_mlp": 0.01056204, "balance_loss_clip": 1.03224301, "balance_loss_mlp": 1.05893731, "epoch": 0.054171050653840376, "flos": 22930040874240.0, "grad_norm": 2.9406338212617804, "language_loss": 0.73445868, "learning_rate": 3.971226643252019e-06, "loss": 0.75724435, "num_input_tokens_seen": 19251460, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.546875, "step": 901, "time_per_iteration": 2.6586012840270996 }, { "auxiliary_loss_clip": 0.01232086, "auxiliary_loss_mlp": 0.01066262, "balance_loss_clip": 1.04308724, "balance_loss_mlp": 1.05862951, "epoch": 0.05423117390650834, "flos": 12933192942720.0, "grad_norm": 2.3555507976839833, "language_loss": 0.85187584, "learning_rate": 3.971162750091202e-06, "loss": 0.87485927, "num_input_tokens_seen": 19269060, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.4609375, "step": 902, "time_per_iteration": 2.5821731090545654 }, { "auxiliary_loss_clip": 0.01243019, "auxiliary_loss_mlp": 0.01061304, "balance_loss_clip": 1.037081, "balance_loss_mlp": 1.05621743, "epoch": 0.05429129715917631, "flos": 19901335017600.0, "grad_norm": 3.166718432353381, "language_loss": 0.86277539, "learning_rate": 3.971098786584657e-06, "loss": 0.88581866, "num_input_tokens_seen": 19288620, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.5078125, "step": 903, "time_per_iteration": 2.707411050796509 }, { "auxiliary_loss_clip": 0.01218771, "auxiliary_loss_mlp": 0.01062314, "balance_loss_clip": 1.03884196, "balance_loss_mlp": 1.0597465, "epoch": 0.05435142041184428, "flos": 16908611610240.0, "grad_norm": 3.564674080859438, "language_loss": 0.74607694, "learning_rate": 3.971034752734668e-06, "loss": 0.76888782, "num_input_tokens_seen": 19306615, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.5, "step": 904, "time_per_iteration": 2.5422139167785645 }, { "auxiliary_loss_clip": 0.01221728, "auxiliary_loss_mlp": 0.0107613, "balance_loss_clip": 1.05245531, "balance_loss_mlp": 1.06050324, "epoch": 0.05441154366451225, "flos": 23948323534080.0, "grad_norm": 2.066537959666049, "language_loss": 0.85509318, "learning_rate": 3.970970648543517e-06, "loss": 0.87807178, "num_input_tokens_seen": 19321680, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.515625, "step": 905, "time_per_iteration": 2.684922218322754 }, { "auxiliary_loss_clip": 0.01228701, "auxiliary_loss_mlp": 0.01066749, "balance_loss_clip": 1.04403996, "balance_loss_mlp": 1.06333303, "epoch": 0.05447166691718022, "flos": 19975382904960.0, "grad_norm": 1.9035858631138278, "language_loss": 0.74245214, "learning_rate": 3.970906474013494e-06, "loss": 0.76540661, "num_input_tokens_seen": 19339760, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.46875, "step": 906, "time_per_iteration": 2.5888924598693848 }, { "auxiliary_loss_clip": 0.0123218, "auxiliary_loss_mlp": 0.01062719, "balance_loss_clip": 1.03842461, "balance_loss_mlp": 1.06122541, "epoch": 0.05453179016984819, "flos": 24936513575040.0, "grad_norm": 2.3206561245113493, "language_loss": 0.87149388, "learning_rate": 3.97084222914689e-06, "loss": 0.89444292, "num_input_tokens_seen": 19359585, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 1.5234375, "step": 907, "time_per_iteration": 2.6219723224639893 }, { "auxiliary_loss_clip": 0.01219295, "auxiliary_loss_mlp": 0.01069907, "balance_loss_clip": 1.04543328, "balance_loss_mlp": 1.06146443, "epoch": 0.05459191342251616, "flos": 18115102558080.0, "grad_norm": 1.7460922163325756, "language_loss": 0.86944407, "learning_rate": 3.970777913945995e-06, "loss": 0.89233613, "num_input_tokens_seen": 19378590, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.484375, "step": 908, "time_per_iteration": 2.651590585708618 }, { "auxiliary_loss_clip": 0.01227575, "auxiliary_loss_mlp": 0.01064424, "balance_loss_clip": 1.03918743, "balance_loss_mlp": 1.06065869, "epoch": 0.054652036675184125, "flos": 19208295031680.0, "grad_norm": 2.1685824508137683, "language_loss": 0.89823687, "learning_rate": 3.970713528413106e-06, "loss": 0.92115688, "num_input_tokens_seen": 19397910, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.484375, "step": 909, "time_per_iteration": 2.651662588119507 }, { "auxiliary_loss_clip": 0.01231189, "auxiliary_loss_mlp": 0.01067518, "balance_loss_clip": 1.04249597, "balance_loss_mlp": 1.06053948, "epoch": 0.0547121599278521, "flos": 16325745615360.0, "grad_norm": 2.314031374016742, "language_loss": 0.70628357, "learning_rate": 3.9706490725505205e-06, "loss": 0.7292707, "num_input_tokens_seen": 19415950, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.5234375, "step": 910, "time_per_iteration": 2.6507797241210938 }, { "auxiliary_loss_clip": 0.01225971, "auxiliary_loss_mlp": 0.0105821, "balance_loss_clip": 1.03482103, "balance_loss_mlp": 1.05942059, "epoch": 0.05477228318052007, "flos": 20339014239360.0, "grad_norm": 1.7920670125599498, "language_loss": 0.83078319, "learning_rate": 3.970584546360539e-06, "loss": 0.85362494, "num_input_tokens_seen": 19435275, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.484375, "step": 911, "time_per_iteration": 2.6474716663360596 }, { "auxiliary_loss_clip": 0.01216813, "auxiliary_loss_mlp": 0.01069449, "balance_loss_clip": 1.04437983, "balance_loss_mlp": 1.05729699, "epoch": 0.054832406433188034, "flos": 21973092687360.0, "grad_norm": 2.534294357486835, "language_loss": 0.75919867, "learning_rate": 3.970519949845464e-06, "loss": 0.78206134, "num_input_tokens_seen": 19452090, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.5, "step": 912, "time_per_iteration": 2.620361804962158 }, { "auxiliary_loss_clip": 0.0122203, "auxiliary_loss_mlp": 0.01053879, "balance_loss_clip": 1.0297395, "balance_loss_mlp": 1.05874419, "epoch": 0.054892529685856006, "flos": 16398931576320.0, "grad_norm": 2.2417163607675685, "language_loss": 0.82695305, "learning_rate": 3.9704552830076005e-06, "loss": 0.84971219, "num_input_tokens_seen": 19470865, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.4453125, "step": 913, "time_per_iteration": 2.5640196800231934 }, { "auxiliary_loss_clip": 0.012157, "auxiliary_loss_mlp": 0.01059082, "balance_loss_clip": 1.03512132, "balance_loss_mlp": 1.06179929, "epoch": 0.05495265293852397, "flos": 23912341084800.0, "grad_norm": 2.117088959325425, "language_loss": 0.83755457, "learning_rate": 3.9703905458492564e-06, "loss": 0.86030239, "num_input_tokens_seen": 19492145, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.4453125, "step": 914, "time_per_iteration": 2.672625780105591 }, { "auxiliary_loss_clip": 0.0122105, "auxiliary_loss_mlp": 0.01305087, "balance_loss_clip": 1.03215933, "balance_loss_mlp": 1.0638355, "epoch": 0.055012776191191944, "flos": 23586954756480.0, "grad_norm": 2.4649295497031276, "language_loss": 0.89928705, "learning_rate": 3.970325738372742e-06, "loss": 0.92454845, "num_input_tokens_seen": 19511015, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.4765625, "step": 915, "time_per_iteration": 2.680938482284546 }, { "auxiliary_loss_clip": 0.01218638, "auxiliary_loss_mlp": 0.0105294, "balance_loss_clip": 1.02924144, "balance_loss_mlp": 1.06195164, "epoch": 0.055072899443859916, "flos": 17528501548800.0, "grad_norm": 1.7468943237872503, "language_loss": 0.89751297, "learning_rate": 3.970260860580371e-06, "loss": 0.92022872, "num_input_tokens_seen": 19529040, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.4765625, "step": 916, "time_per_iteration": 2.6106362342834473 }, { "auxiliary_loss_clip": 0.01239808, "auxiliary_loss_mlp": 0.01067192, "balance_loss_clip": 1.04247963, "balance_loss_mlp": 1.06105018, "epoch": 0.05513302269652788, "flos": 21687172427520.0, "grad_norm": 2.0487008344477107, "language_loss": 0.80018419, "learning_rate": 3.970195912474457e-06, "loss": 0.82325417, "num_input_tokens_seen": 19549540, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.515625, "step": 917, "time_per_iteration": 2.6240224838256836 }, { "auxiliary_loss_clip": 0.01257556, "auxiliary_loss_mlp": 0.0105815, "balance_loss_clip": 1.03266275, "balance_loss_mlp": 1.06017327, "epoch": 0.05519314594919585, "flos": 21613340021760.0, "grad_norm": 1.9256871456332512, "language_loss": 0.79506254, "learning_rate": 3.9701308940573195e-06, "loss": 0.8182196, "num_input_tokens_seen": 19567570, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.515625, "step": 918, "time_per_iteration": 2.6884329319000244 }, { "auxiliary_loss_clip": 0.01215654, "auxiliary_loss_mlp": 0.01053318, "balance_loss_clip": 1.02904677, "balance_loss_mlp": 1.05804777, "epoch": 0.05525326920186382, "flos": 21798567480960.0, "grad_norm": 2.3009638515353643, "language_loss": 0.89064097, "learning_rate": 3.970065805331279e-06, "loss": 0.91333067, "num_input_tokens_seen": 19585330, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 1.484375, "step": 919, "time_per_iteration": 2.596850633621216 }, { "auxiliary_loss_clip": 0.01204696, "auxiliary_loss_mlp": 0.01064906, "balance_loss_clip": 1.03976524, "balance_loss_mlp": 1.05781841, "epoch": 0.05531339245453179, "flos": 28439635288320.0, "grad_norm": 2.422117174871221, "language_loss": 0.8733288, "learning_rate": 3.970000646298656e-06, "loss": 0.89602482, "num_input_tokens_seen": 19604970, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.46875, "step": 920, "time_per_iteration": 2.662721872329712 }, { "auxiliary_loss_clip": 0.01225353, "auxiliary_loss_mlp": 0.01058593, "balance_loss_clip": 1.03433442, "balance_loss_mlp": 1.06101727, "epoch": 0.05537351570719976, "flos": 37375143131520.0, "grad_norm": 2.1134766816372914, "language_loss": 0.65768999, "learning_rate": 3.969935416961778e-06, "loss": 0.68052942, "num_input_tokens_seen": 19626235, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.453125, "step": 921, "time_per_iteration": 2.77988600730896 }, { "auxiliary_loss_clip": 0.01223077, "auxiliary_loss_mlp": 0.01060773, "balance_loss_clip": 1.03374815, "balance_loss_mlp": 1.06276846, "epoch": 0.05543363895986773, "flos": 20084479488000.0, "grad_norm": 2.0732430330446276, "language_loss": 0.71253949, "learning_rate": 3.969870117322973e-06, "loss": 0.73537803, "num_input_tokens_seen": 19644305, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.5078125, "step": 922, "time_per_iteration": 2.6271145343780518 }, { "auxiliary_loss_clip": 0.01230739, "auxiliary_loss_mlp": 0.01065009, "balance_loss_clip": 1.0396297, "balance_loss_mlp": 1.06074941, "epoch": 0.0554937622125357, "flos": 24533200690560.0, "grad_norm": 2.38844389040058, "language_loss": 0.82422143, "learning_rate": 3.96980474738457e-06, "loss": 0.84717894, "num_input_tokens_seen": 19662130, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.515625, "step": 923, "time_per_iteration": 2.6511292457580566 }, { "auxiliary_loss_clip": 0.01215357, "auxiliary_loss_mlp": 0.01062908, "balance_loss_clip": 1.03703952, "balance_loss_mlp": 1.05698812, "epoch": 0.055553885465203665, "flos": 14320063013760.0, "grad_norm": 2.320295198364083, "language_loss": 0.78857815, "learning_rate": 3.969739307148902e-06, "loss": 0.81136084, "num_input_tokens_seen": 19680715, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.4921875, "step": 924, "time_per_iteration": 2.547165632247925 }, { "auxiliary_loss_clip": 0.01233003, "auxiliary_loss_mlp": 0.01055565, "balance_loss_clip": 1.02982783, "balance_loss_mlp": 1.05796301, "epoch": 0.05561400871787164, "flos": 27381132374400.0, "grad_norm": 2.0804084638611378, "language_loss": 1.02076948, "learning_rate": 3.969673796618306e-06, "loss": 1.04365516, "num_input_tokens_seen": 19700535, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.4765625, "step": 925, "time_per_iteration": 2.6500463485717773 }, { "auxiliary_loss_clip": 0.01242893, "auxiliary_loss_mlp": 0.01047899, "balance_loss_clip": 1.02353239, "balance_loss_mlp": 1.05593157, "epoch": 0.05567413197053961, "flos": 23221096778880.0, "grad_norm": 1.8776819571286139, "language_loss": 0.81188488, "learning_rate": 3.969608215795117e-06, "loss": 0.83479273, "num_input_tokens_seen": 19718825, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 1.5, "step": 926, "time_per_iteration": 2.6450235843658447 }, { "auxiliary_loss_clip": 0.01225378, "auxiliary_loss_mlp": 0.01067282, "balance_loss_clip": 1.04038846, "balance_loss_mlp": 1.06149077, "epoch": 0.055734255223207574, "flos": 25264952559360.0, "grad_norm": 3.924552110659169, "language_loss": 0.7321254, "learning_rate": 3.969542564681679e-06, "loss": 0.75505197, "num_input_tokens_seen": 19739080, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.546875, "step": 927, "time_per_iteration": 2.7321693897247314 }, { "auxiliary_loss_clip": 0.01111134, "auxiliary_loss_mlp": 0.01022206, "balance_loss_clip": 1.01626897, "balance_loss_mlp": 1.04395986, "epoch": 0.055794378475875546, "flos": 66503116702080.0, "grad_norm": 0.778867245047978, "language_loss": 0.5987066, "learning_rate": 3.969476843280333e-06, "loss": 0.62004006, "num_input_tokens_seen": 19802960, "router_z_loss_clip": 0.05932617, "router_z_loss_mlp": 0.671875, "step": 928, "time_per_iteration": 3.1645383834838867 }, { "auxiliary_loss_clip": 0.01218209, "auxiliary_loss_mlp": 0.01062528, "balance_loss_clip": 1.03761315, "balance_loss_mlp": 1.06056571, "epoch": 0.05585450172854351, "flos": 25337635729920.0, "grad_norm": 2.737230358633963, "language_loss": 0.95054358, "learning_rate": 3.969411051593424e-06, "loss": 0.97335088, "num_input_tokens_seen": 19822765, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.484375, "step": 929, "time_per_iteration": 2.679734230041504 }, { "auxiliary_loss_clip": 0.0123491, "auxiliary_loss_mlp": 0.01067434, "balance_loss_clip": 1.04011166, "balance_loss_mlp": 1.05798697, "epoch": 0.05591462498121148, "flos": 33911738881920.0, "grad_norm": 2.740190364472513, "language_loss": 0.71843344, "learning_rate": 3.9693451896233e-06, "loss": 0.74145687, "num_input_tokens_seen": 19843590, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.4921875, "step": 930, "time_per_iteration": 4.271701812744141 }, { "auxiliary_loss_clip": 0.01230413, "auxiliary_loss_mlp": 0.01057504, "balance_loss_clip": 1.03232682, "balance_loss_mlp": 1.06247497, "epoch": 0.055974748233879455, "flos": 17930880679680.0, "grad_norm": 3.748987835136519, "language_loss": 0.84968174, "learning_rate": 3.969279257372313e-06, "loss": 0.87256092, "num_input_tokens_seen": 19860230, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.5, "step": 931, "time_per_iteration": 4.018869638442993 }, { "auxiliary_loss_clip": 0.01219887, "auxiliary_loss_mlp": 0.01318105, "balance_loss_clip": 1.04270041, "balance_loss_mlp": 1.05898404, "epoch": 0.05603487148654742, "flos": 24021976371840.0, "grad_norm": 1.7023257259390399, "language_loss": 0.83495128, "learning_rate": 3.969213254842814e-06, "loss": 0.86033124, "num_input_tokens_seen": 19880795, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.515625, "step": 932, "time_per_iteration": 4.056366920471191 }, { "auxiliary_loss_clip": 0.01221013, "auxiliary_loss_mlp": 0.01069782, "balance_loss_clip": 1.04250741, "balance_loss_mlp": 1.06199741, "epoch": 0.05609499473921539, "flos": 17307758517120.0, "grad_norm": 2.225178795821609, "language_loss": 0.73994935, "learning_rate": 3.9691471820371594e-06, "loss": 0.76285732, "num_input_tokens_seen": 19897960, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.5, "step": 933, "time_per_iteration": 2.6475279331207275 }, { "auxiliary_loss_clip": 0.01244451, "auxiliary_loss_mlp": 0.01068883, "balance_loss_clip": 1.04333663, "balance_loss_mlp": 1.05892897, "epoch": 0.05615511799188336, "flos": 20994742972800.0, "grad_norm": 3.1669665439131927, "language_loss": 0.86084807, "learning_rate": 3.969081038957708e-06, "loss": 0.88398147, "num_input_tokens_seen": 19913315, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.484375, "step": 934, "time_per_iteration": 4.209365129470825 }, { "auxiliary_loss_clip": 0.01247615, "auxiliary_loss_mlp": 0.01073394, "balance_loss_clip": 1.04855061, "balance_loss_mlp": 1.06512892, "epoch": 0.05621524124455133, "flos": 17273535834240.0, "grad_norm": 2.3851094659292627, "language_loss": 0.80016339, "learning_rate": 3.969014825606819e-06, "loss": 0.82337356, "num_input_tokens_seen": 19928790, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.4609375, "step": 935, "time_per_iteration": 2.5946176052093506 }, { "auxiliary_loss_clip": 0.0109614, "auxiliary_loss_mlp": 0.01024787, "balance_loss_clip": 1.02023339, "balance_loss_mlp": 1.03072739, "epoch": 0.0562753644972193, "flos": 58719370458240.0, "grad_norm": 0.8254772635211803, "language_loss": 0.69244969, "learning_rate": 3.968948541986855e-06, "loss": 0.71365893, "num_input_tokens_seen": 19988785, "router_z_loss_clip": 0.0456543, "router_z_loss_mlp": 0.65625, "step": 936, "time_per_iteration": 3.1010117530822754 }, { "auxiliary_loss_clip": 0.01208831, "auxiliary_loss_mlp": 0.01063397, "balance_loss_clip": 1.03836334, "balance_loss_mlp": 1.05890131, "epoch": 0.05633548774988727, "flos": 17457039440640.0, "grad_norm": 2.323590387047664, "language_loss": 0.75950384, "learning_rate": 3.968882188100183e-06, "loss": 0.78222609, "num_input_tokens_seen": 20007685, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.5, "step": 937, "time_per_iteration": 2.5466392040252686 }, { "auxiliary_loss_clip": 0.01104749, "auxiliary_loss_mlp": 0.01007665, "balance_loss_clip": 1.00306308, "balance_loss_mlp": 1.02956164, "epoch": 0.05639561100255524, "flos": 70654928083200.0, "grad_norm": 0.8545276570896616, "language_loss": 0.6440618, "learning_rate": 3.9688157639491704e-06, "loss": 0.66518593, "num_input_tokens_seen": 20072750, "router_z_loss_clip": 0.04589844, "router_z_loss_mlp": 0.65625, "step": 938, "time_per_iteration": 3.155595541000366 }, { "auxiliary_loss_clip": 0.01243426, "auxiliary_loss_mlp": 0.01065874, "balance_loss_clip": 1.03960085, "balance_loss_mlp": 1.06068611, "epoch": 0.056455734255223204, "flos": 20485996692480.0, "grad_norm": 2.2786872721712372, "language_loss": 0.79148543, "learning_rate": 3.968749269536188e-06, "loss": 0.81457841, "num_input_tokens_seen": 20089070, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.546875, "step": 939, "time_per_iteration": 2.5664775371551514 }, { "auxiliary_loss_clip": 0.01228214, "auxiliary_loss_mlp": 0.01068577, "balance_loss_clip": 1.04313731, "balance_loss_mlp": 1.06095505, "epoch": 0.056515857507891176, "flos": 22053569109120.0, "grad_norm": 1.8453725206048053, "language_loss": 0.7398718, "learning_rate": 3.9686827048636074e-06, "loss": 0.76283973, "num_input_tokens_seen": 20108790, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.484375, "step": 940, "time_per_iteration": 2.678401231765747 }, { "auxiliary_loss_clip": 0.0121818, "auxiliary_loss_mlp": 0.01064378, "balance_loss_clip": 1.03858101, "balance_loss_mlp": 1.06148839, "epoch": 0.05657598076055915, "flos": 24025316336640.0, "grad_norm": 2.4464206277274285, "language_loss": 0.70426559, "learning_rate": 3.968616069933806e-06, "loss": 0.72709119, "num_input_tokens_seen": 20128455, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.4765625, "step": 941, "time_per_iteration": 2.6652743816375732 }, { "auxiliary_loss_clip": 0.01242397, "auxiliary_loss_mlp": 0.01064803, "balance_loss_clip": 1.03910136, "balance_loss_mlp": 1.06077218, "epoch": 0.05663610401322711, "flos": 20480609652480.0, "grad_norm": 1.943281206007809, "language_loss": 0.80634606, "learning_rate": 3.96854936474916e-06, "loss": 0.82941806, "num_input_tokens_seen": 20145775, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.453125, "step": 942, "time_per_iteration": 2.6379876136779785 }, { "auxiliary_loss_clip": 0.01211142, "auxiliary_loss_mlp": 0.01063138, "balance_loss_clip": 1.03771079, "balance_loss_mlp": 1.05610943, "epoch": 0.056696227265895086, "flos": 21069042255360.0, "grad_norm": 2.1778219226455207, "language_loss": 0.88065374, "learning_rate": 3.968482589312052e-06, "loss": 0.90339649, "num_input_tokens_seen": 20164315, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.4609375, "step": 943, "time_per_iteration": 2.6414597034454346 }, { "auxiliary_loss_clip": 0.01217139, "auxiliary_loss_mlp": 0.01058314, "balance_loss_clip": 1.03353095, "balance_loss_mlp": 1.06238604, "epoch": 0.05675635051856306, "flos": 17821317219840.0, "grad_norm": 2.2135881260006216, "language_loss": 0.74652207, "learning_rate": 3.968415743624863e-06, "loss": 0.76927662, "num_input_tokens_seen": 20182760, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.453125, "step": 944, "time_per_iteration": 2.569330930709839 }, { "auxiliary_loss_clip": 0.01215472, "auxiliary_loss_mlp": 0.01059982, "balance_loss_clip": 1.03624773, "balance_loss_mlp": 1.05849934, "epoch": 0.05681647377123102, "flos": 23114945111040.0, "grad_norm": 1.6420831708758095, "language_loss": 0.79221785, "learning_rate": 3.9683488276899794e-06, "loss": 0.8149724, "num_input_tokens_seen": 20203830, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.4765625, "step": 945, "time_per_iteration": 2.5974881649017334 }, { "auxiliary_loss_clip": 0.01217633, "auxiliary_loss_mlp": 0.01058134, "balance_loss_clip": 1.03302801, "balance_loss_mlp": 1.05901277, "epoch": 0.056876597023898995, "flos": 16070528505600.0, "grad_norm": 2.6671030335222383, "language_loss": 0.82690877, "learning_rate": 3.96828184150979e-06, "loss": 0.84966648, "num_input_tokens_seen": 20220365, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.5, "step": 946, "time_per_iteration": 2.5463972091674805 }, { "auxiliary_loss_clip": 0.01211998, "auxiliary_loss_mlp": 0.01057774, "balance_loss_clip": 1.03170276, "balance_loss_mlp": 1.06103396, "epoch": 0.05693672027656696, "flos": 16835641130880.0, "grad_norm": 2.118605043235412, "language_loss": 0.7858988, "learning_rate": 3.968214785086684e-06, "loss": 0.80859649, "num_input_tokens_seen": 20238640, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.5078125, "step": 947, "time_per_iteration": 2.56113862991333 }, { "auxiliary_loss_clip": 0.01232923, "auxiliary_loss_mlp": 0.01062537, "balance_loss_clip": 1.03756285, "balance_loss_mlp": 1.06321704, "epoch": 0.05699684352923493, "flos": 21389113370880.0, "grad_norm": 3.098989305535126, "language_loss": 0.85285163, "learning_rate": 3.968147658423056e-06, "loss": 0.87580627, "num_input_tokens_seen": 20251025, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.5078125, "step": 948, "time_per_iteration": 2.611226797103882 }, { "auxiliary_loss_clip": 0.0120557, "auxiliary_loss_mlp": 0.01065521, "balance_loss_clip": 1.03813875, "balance_loss_mlp": 1.058254, "epoch": 0.057056966781902904, "flos": 15560309767680.0, "grad_norm": 1.8737896140319108, "language_loss": 0.87011027, "learning_rate": 3.9680804615213e-06, "loss": 0.89282119, "num_input_tokens_seen": 20269775, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.46875, "step": 949, "time_per_iteration": 2.6472291946411133 }, { "auxiliary_loss_clip": 0.01211555, "auxiliary_loss_mlp": 0.01063983, "balance_loss_clip": 1.04011703, "balance_loss_mlp": 1.05845213, "epoch": 0.05711709003457087, "flos": 19937856170880.0, "grad_norm": 2.3589796846205346, "language_loss": 0.78594106, "learning_rate": 3.968013194383815e-06, "loss": 0.80869645, "num_input_tokens_seen": 20287715, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.4375, "step": 950, "time_per_iteration": 2.6007425785064697 }, { "auxiliary_loss_clip": 0.01228777, "auxiliary_loss_mlp": 0.01067261, "balance_loss_clip": 1.04200029, "balance_loss_mlp": 1.06038725, "epoch": 0.05717721328723884, "flos": 30332701774080.0, "grad_norm": 2.1563345371742617, "language_loss": 0.8218956, "learning_rate": 3.967945857013002e-06, "loss": 0.84485608, "num_input_tokens_seen": 20307070, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.5, "step": 951, "time_per_iteration": 2.6913840770721436 }, { "auxiliary_loss_clip": 0.01224766, "auxiliary_loss_mlp": 0.01059528, "balance_loss_clip": 1.03330147, "balance_loss_mlp": 1.05729949, "epoch": 0.05723733653990681, "flos": 23654358627840.0, "grad_norm": 2.616234586584888, "language_loss": 0.86263978, "learning_rate": 3.967878449411263e-06, "loss": 0.88548267, "num_input_tokens_seen": 20324945, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.4921875, "step": 952, "time_per_iteration": 2.5884883403778076 }, { "auxiliary_loss_clip": 0.01227862, "auxiliary_loss_mlp": 0.0106141, "balance_loss_clip": 1.03327632, "balance_loss_mlp": 1.05693793, "epoch": 0.05729745979257478, "flos": 22055759838720.0, "grad_norm": 2.235406219204705, "language_loss": 0.79347742, "learning_rate": 3.967810971581004e-06, "loss": 0.81637013, "num_input_tokens_seen": 20346135, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.5234375, "step": 953, "time_per_iteration": 2.653209924697876 }, { "auxiliary_loss_clip": 0.01229777, "auxiliary_loss_mlp": 0.010627, "balance_loss_clip": 1.03650975, "balance_loss_mlp": 1.06329417, "epoch": 0.05735758304524275, "flos": 19604353368960.0, "grad_norm": 2.176834683475283, "language_loss": 0.86406761, "learning_rate": 3.967743423524633e-06, "loss": 0.88699239, "num_input_tokens_seen": 20364450, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.484375, "step": 954, "time_per_iteration": 2.5526275634765625 }, { "auxiliary_loss_clip": 0.0122064, "auxiliary_loss_mlp": 0.01059981, "balance_loss_clip": 1.03523326, "balance_loss_mlp": 1.06094027, "epoch": 0.057417706297910716, "flos": 19099018880640.0, "grad_norm": 2.254561403755522, "language_loss": 0.87877262, "learning_rate": 3.967675805244562e-06, "loss": 0.9015789, "num_input_tokens_seen": 20383500, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 1.5, "step": 955, "time_per_iteration": 2.6342923641204834 }, { "auxiliary_loss_clip": 0.012245, "auxiliary_loss_mlp": 0.01062932, "balance_loss_clip": 1.03796935, "balance_loss_mlp": 1.0591929, "epoch": 0.05747782955057869, "flos": 16654507822080.0, "grad_norm": 2.2131875994334127, "language_loss": 0.89297926, "learning_rate": 3.967608116743202e-06, "loss": 0.91585362, "num_input_tokens_seen": 20400295, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.46875, "step": 956, "time_per_iteration": 2.525550603866577 }, { "auxiliary_loss_clip": 0.01231156, "auxiliary_loss_mlp": 0.01059541, "balance_loss_clip": 1.03341079, "balance_loss_mlp": 1.06060159, "epoch": 0.05753795280324665, "flos": 14502058248960.0, "grad_norm": 1.7839806350733105, "language_loss": 0.75136328, "learning_rate": 3.96754035802297e-06, "loss": 0.77427024, "num_input_tokens_seen": 20419085, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.4375, "step": 957, "time_per_iteration": 2.620253086090088 }, { "auxiliary_loss_clip": 0.01237282, "auxiliary_loss_mlp": 0.01073561, "balance_loss_clip": 1.04608369, "balance_loss_mlp": 1.05891335, "epoch": 0.057598076055914625, "flos": 18076318848000.0, "grad_norm": 2.0482345061876717, "language_loss": 0.79539919, "learning_rate": 3.967472529086284e-06, "loss": 0.81850767, "num_input_tokens_seen": 20437465, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.5078125, "step": 958, "time_per_iteration": 2.603492498397827 }, { "auxiliary_loss_clip": 0.01222824, "auxiliary_loss_mlp": 0.01058873, "balance_loss_clip": 1.03492427, "balance_loss_mlp": 1.05681992, "epoch": 0.0576581993085826, "flos": 22124600254080.0, "grad_norm": 2.5018758969541524, "language_loss": 0.88283145, "learning_rate": 3.967404629935564e-06, "loss": 0.90564847, "num_input_tokens_seen": 20456235, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.4765625, "step": 959, "time_per_iteration": 2.702171802520752 }, { "auxiliary_loss_clip": 0.0122903, "auxiliary_loss_mlp": 0.01058968, "balance_loss_clip": 1.03391075, "balance_loss_mlp": 1.05945837, "epoch": 0.05771832256125056, "flos": 33181746779520.0, "grad_norm": 2.633863687718275, "language_loss": 0.78456092, "learning_rate": 3.9673366605732335e-06, "loss": 0.80744094, "num_input_tokens_seen": 20476825, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.421875, "step": 960, "time_per_iteration": 2.7090837955474854 }, { "auxiliary_loss_clip": 0.01253093, "auxiliary_loss_mlp": 0.01060591, "balance_loss_clip": 1.03440094, "balance_loss_mlp": 1.05736876, "epoch": 0.057778445813918534, "flos": 24170143973760.0, "grad_norm": 1.973672125628491, "language_loss": 0.92483246, "learning_rate": 3.967268621001718e-06, "loss": 0.94796932, "num_input_tokens_seen": 20496965, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.4921875, "step": 961, "time_per_iteration": 2.7168121337890625 }, { "auxiliary_loss_clip": 0.01223889, "auxiliary_loss_mlp": 0.01067108, "balance_loss_clip": 1.04087055, "balance_loss_mlp": 1.05735147, "epoch": 0.0578385690665865, "flos": 29643037666560.0, "grad_norm": 2.1211653871159157, "language_loss": 0.67590666, "learning_rate": 3.967200511223446e-06, "loss": 0.6988166, "num_input_tokens_seen": 20518035, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.484375, "step": 962, "time_per_iteration": 2.727384328842163 }, { "auxiliary_loss_clip": 0.01231905, "auxiliary_loss_mlp": 0.0106432, "balance_loss_clip": 1.04022765, "balance_loss_mlp": 1.05909526, "epoch": 0.05789869231925447, "flos": 20885430908160.0, "grad_norm": 2.0878430495892992, "language_loss": 0.88380635, "learning_rate": 3.967132331240848e-06, "loss": 0.90676862, "num_input_tokens_seen": 20534740, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.453125, "step": 963, "time_per_iteration": 2.59157657623291 }, { "auxiliary_loss_clip": 0.01230513, "auxiliary_loss_mlp": 0.01057831, "balance_loss_clip": 1.03223693, "balance_loss_mlp": 1.06329894, "epoch": 0.057958815571922444, "flos": 26031106679040.0, "grad_norm": 2.615324101773671, "language_loss": 0.85063493, "learning_rate": 3.9670640810563575e-06, "loss": 0.87351835, "num_input_tokens_seen": 20553485, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.484375, "step": 964, "time_per_iteration": 2.6807398796081543 }, { "auxiliary_loss_clip": 0.01224577, "auxiliary_loss_mlp": 0.01069622, "balance_loss_clip": 1.04367018, "balance_loss_mlp": 1.05914271, "epoch": 0.05801893882459041, "flos": 18077683564800.0, "grad_norm": 3.5840059017887826, "language_loss": 0.77928632, "learning_rate": 3.96699576067241e-06, "loss": 0.80222833, "num_input_tokens_seen": 20572155, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.46875, "step": 965, "time_per_iteration": 2.6038858890533447 }, { "auxiliary_loss_clip": 0.01227324, "auxiliary_loss_mlp": 0.01062117, "balance_loss_clip": 1.03775036, "balance_loss_mlp": 1.05696154, "epoch": 0.05807906207725838, "flos": 17748885444480.0, "grad_norm": 1.9083013158481252, "language_loss": 0.80903137, "learning_rate": 3.966927370091442e-06, "loss": 0.83192581, "num_input_tokens_seen": 20590395, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.4296875, "step": 966, "time_per_iteration": 2.6588029861450195 }, { "auxiliary_loss_clip": 0.01214356, "auxiliary_loss_mlp": 0.01059669, "balance_loss_clip": 1.03551745, "balance_loss_mlp": 1.05881369, "epoch": 0.058139185329926346, "flos": 18040372312320.0, "grad_norm": 1.8381956529056156, "language_loss": 0.76311773, "learning_rate": 3.9668589093158975e-06, "loss": 0.78585804, "num_input_tokens_seen": 20608435, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.46875, "step": 967, "time_per_iteration": 2.5520365238189697 }, { "auxiliary_loss_clip": 0.01108948, "auxiliary_loss_mlp": 0.01039269, "balance_loss_clip": 1.03519237, "balance_loss_mlp": 1.0335083, "epoch": 0.05819930858259432, "flos": 62363297485440.0, "grad_norm": 0.7951380505091629, "language_loss": 0.57322454, "learning_rate": 3.966790378348217e-06, "loss": 0.59470671, "num_input_tokens_seen": 20668575, "router_z_loss_clip": 0.04077148, "router_z_loss_mlp": 0.6640625, "step": 968, "time_per_iteration": 3.1392085552215576 }, { "auxiliary_loss_clip": 0.01228276, "auxiliary_loss_mlp": 0.01060259, "balance_loss_clip": 1.0330795, "balance_loss_mlp": 1.06233788, "epoch": 0.05825943183526229, "flos": 19135360465920.0, "grad_norm": 2.904748371839789, "language_loss": 0.81660473, "learning_rate": 3.966721777190847e-06, "loss": 0.83949006, "num_input_tokens_seen": 20687355, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.4765625, "step": 969, "time_per_iteration": 2.5347609519958496 }, { "auxiliary_loss_clip": 0.01217637, "auxiliary_loss_mlp": 0.01308129, "balance_loss_clip": 1.0333451, "balance_loss_mlp": 1.05937719, "epoch": 0.058319555087930255, "flos": 29022465369600.0, "grad_norm": 2.6226008346734, "language_loss": 0.77634764, "learning_rate": 3.966653105846237e-06, "loss": 0.80160534, "num_input_tokens_seen": 20705710, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.4921875, "step": 970, "time_per_iteration": 2.678245782852173 }, { "auxiliary_loss_clip": 0.01235591, "auxiliary_loss_mlp": 0.01061661, "balance_loss_clip": 1.03553009, "balance_loss_mlp": 1.05985689, "epoch": 0.05837967834059823, "flos": 18879999701760.0, "grad_norm": 2.665806861792512, "language_loss": 0.91797006, "learning_rate": 3.966584364316835e-06, "loss": 0.94094259, "num_input_tokens_seen": 20722405, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.484375, "step": 971, "time_per_iteration": 2.646502733230591 }, { "auxiliary_loss_clip": 0.0122241, "auxiliary_loss_mlp": 0.01057502, "balance_loss_clip": 1.03155041, "balance_loss_mlp": 1.05773294, "epoch": 0.05843980159326619, "flos": 25703062744320.0, "grad_norm": 1.7870724780321798, "language_loss": 0.85694444, "learning_rate": 3.966515552605096e-06, "loss": 0.87974358, "num_input_tokens_seen": 20741480, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.4609375, "step": 972, "time_per_iteration": 4.180164813995361 }, { "auxiliary_loss_clip": 0.01230674, "auxiliary_loss_mlp": 0.0106984, "balance_loss_clip": 1.0457958, "balance_loss_mlp": 1.06003499, "epoch": 0.058499924845934165, "flos": 25552129795200.0, "grad_norm": 2.32347405151251, "language_loss": 0.87182575, "learning_rate": 3.966446670713476e-06, "loss": 0.89483088, "num_input_tokens_seen": 20759685, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.4296875, "step": 973, "time_per_iteration": 4.04780912399292 }, { "auxiliary_loss_clip": 0.01215689, "auxiliary_loss_mlp": 0.01311306, "balance_loss_clip": 1.03585637, "balance_loss_mlp": 1.05927062, "epoch": 0.05856004809860214, "flos": 16436171001600.0, "grad_norm": 2.2807591828876754, "language_loss": 0.75206757, "learning_rate": 3.9663777186444325e-06, "loss": 0.77733755, "num_input_tokens_seen": 20778180, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.4765625, "step": 974, "time_per_iteration": 2.649155616760254 }, { "auxiliary_loss_clip": 0.0121518, "auxiliary_loss_mlp": 0.01311505, "balance_loss_clip": 1.03725719, "balance_loss_mlp": 1.05989766, "epoch": 0.0586201713512701, "flos": 39458824116480.0, "grad_norm": 1.5761080193898613, "language_loss": 0.76549709, "learning_rate": 3.966308696400426e-06, "loss": 0.79076397, "num_input_tokens_seen": 20802705, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.4609375, "step": 975, "time_per_iteration": 4.1736369132995605 }, { "auxiliary_loss_clip": 0.01209385, "auxiliary_loss_mlp": 0.01069415, "balance_loss_clip": 1.04373693, "balance_loss_mlp": 1.05986547, "epoch": 0.058680294603938074, "flos": 23365170230400.0, "grad_norm": 2.5523559902040445, "language_loss": 0.77277291, "learning_rate": 3.96623960398392e-06, "loss": 0.7955609, "num_input_tokens_seen": 20822540, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.4921875, "step": 976, "time_per_iteration": 2.5685079097747803 }, { "auxiliary_loss_clip": 0.01220894, "auxiliary_loss_mlp": 0.01063433, "balance_loss_clip": 1.03860164, "balance_loss_mlp": 1.05614901, "epoch": 0.05874041785660604, "flos": 32232017226240.0, "grad_norm": 2.094392790178978, "language_loss": 0.8719511, "learning_rate": 3.9661704413973805e-06, "loss": 0.8947944, "num_input_tokens_seen": 20844175, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.46875, "step": 977, "time_per_iteration": 2.6039748191833496 }, { "auxiliary_loss_clip": 0.01225469, "auxiliary_loss_mlp": 0.01064127, "balance_loss_clip": 1.03891444, "balance_loss_mlp": 1.06139016, "epoch": 0.05880054110927401, "flos": 22310043194880.0, "grad_norm": 1.8037086440893848, "language_loss": 0.79076844, "learning_rate": 3.966101208643276e-06, "loss": 0.81366444, "num_input_tokens_seen": 20864730, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.453125, "step": 978, "time_per_iteration": 2.579465389251709 }, { "auxiliary_loss_clip": 0.01267227, "auxiliary_loss_mlp": 0.01075178, "balance_loss_clip": 1.04835558, "balance_loss_mlp": 1.06123233, "epoch": 0.05886066436194198, "flos": 27380450016000.0, "grad_norm": 1.873107870107471, "language_loss": 0.80452162, "learning_rate": 3.966031905724076e-06, "loss": 0.82794559, "num_input_tokens_seen": 20885200, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.5078125, "step": 979, "time_per_iteration": 2.640974998474121 }, { "auxiliary_loss_clip": 0.01116449, "auxiliary_loss_mlp": 0.01056977, "balance_loss_clip": 1.05323374, "balance_loss_mlp": 1.03293502, "epoch": 0.05892078761460995, "flos": 59584493525760.0, "grad_norm": 0.9251310253110426, "language_loss": 0.59029341, "learning_rate": 3.965962532642255e-06, "loss": 0.61202765, "num_input_tokens_seen": 20940325, "router_z_loss_clip": 0.03735352, "router_z_loss_mlp": 0.6484375, "step": 980, "time_per_iteration": 3.1248972415924072 }, { "auxiliary_loss_clip": 0.01204002, "auxiliary_loss_mlp": 0.01073726, "balance_loss_clip": 1.04707074, "balance_loss_mlp": 1.05991936, "epoch": 0.05898091086727792, "flos": 15414081500160.0, "grad_norm": 2.1018057503773493, "language_loss": 0.86177039, "learning_rate": 3.9658930894002885e-06, "loss": 0.88454765, "num_input_tokens_seen": 20958220, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.4453125, "step": 981, "time_per_iteration": 2.6404247283935547 }, { "auxiliary_loss_clip": 0.01222247, "auxiliary_loss_mlp": 0.01062922, "balance_loss_clip": 1.03978395, "balance_loss_mlp": 1.05903566, "epoch": 0.059041034119945886, "flos": 23655328295040.0, "grad_norm": 2.3454246654618527, "language_loss": 0.79132408, "learning_rate": 3.965823576000653e-06, "loss": 0.81417572, "num_input_tokens_seen": 20978920, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.4453125, "step": 982, "time_per_iteration": 2.6232428550720215 }, { "auxiliary_loss_clip": 0.01231107, "auxiliary_loss_mlp": 0.01066711, "balance_loss_clip": 1.04087865, "balance_loss_mlp": 1.06074417, "epoch": 0.05910115737261386, "flos": 24754087376640.0, "grad_norm": 3.1081294974818685, "language_loss": 0.84367931, "learning_rate": 3.965753992445833e-06, "loss": 0.8666575, "num_input_tokens_seen": 20999490, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.5234375, "step": 983, "time_per_iteration": 2.6706960201263428 }, { "auxiliary_loss_clip": 0.01214258, "auxiliary_loss_mlp": 0.0106148, "balance_loss_clip": 1.0375545, "balance_loss_mlp": 1.06037951, "epoch": 0.05916128062528183, "flos": 11728749070080.0, "grad_norm": 1.9082764121087232, "language_loss": 0.84503561, "learning_rate": 3.9656843387383075e-06, "loss": 0.86779302, "num_input_tokens_seen": 21017865, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.4453125, "step": 984, "time_per_iteration": 2.525695562362671 }, { "auxiliary_loss_clip": 0.01212566, "auxiliary_loss_mlp": 0.01057315, "balance_loss_clip": 1.03340113, "balance_loss_mlp": 1.06020164, "epoch": 0.059221403877949795, "flos": 21902995296000.0, "grad_norm": 3.015718294016995, "language_loss": 0.77448571, "learning_rate": 3.965614614880566e-06, "loss": 0.79718447, "num_input_tokens_seen": 21035900, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.4296875, "step": 985, "time_per_iteration": 2.657763719558716 }, { "auxiliary_loss_clip": 0.01245376, "auxiliary_loss_mlp": 0.01060433, "balance_loss_clip": 1.0356375, "balance_loss_mlp": 1.06033647, "epoch": 0.05928152713061777, "flos": 20514580940160.0, "grad_norm": 1.8366271622167545, "language_loss": 0.90731323, "learning_rate": 3.965544820875094e-06, "loss": 0.93037134, "num_input_tokens_seen": 21053235, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.484375, "step": 986, "time_per_iteration": 2.5988948345184326 }, { "auxiliary_loss_clip": 0.012176, "auxiliary_loss_mlp": 0.01064182, "balance_loss_clip": 1.03670382, "balance_loss_mlp": 1.05835986, "epoch": 0.05934165038328574, "flos": 24495135252480.0, "grad_norm": 2.2240843691674517, "language_loss": 0.75510526, "learning_rate": 3.965474956724383e-06, "loss": 0.77792305, "num_input_tokens_seen": 21073090, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.5, "step": 987, "time_per_iteration": 2.6480584144592285 }, { "auxiliary_loss_clip": 0.01219579, "auxiliary_loss_mlp": 0.01054317, "balance_loss_clip": 1.02938986, "balance_loss_mlp": 1.05869889, "epoch": 0.059401773635953704, "flos": 38728041914880.0, "grad_norm": 2.0401204252751612, "language_loss": 0.71367693, "learning_rate": 3.965405022430928e-06, "loss": 0.73641592, "num_input_tokens_seen": 21094895, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.515625, "step": 988, "time_per_iteration": 2.7582314014434814 }, { "auxiliary_loss_clip": 0.01119732, "auxiliary_loss_mlp": 0.01016473, "balance_loss_clip": 1.01239586, "balance_loss_mlp": 1.02719235, "epoch": 0.059461896888621676, "flos": 58023565125120.0, "grad_norm": 0.9268236976847627, "language_loss": 0.71102321, "learning_rate": 3.965335017997222e-06, "loss": 0.73238528, "num_input_tokens_seen": 21147555, "router_z_loss_clip": 0.04077148, "router_z_loss_mlp": 0.6484375, "step": 989, "time_per_iteration": 3.1253437995910645 }, { "auxiliary_loss_clip": 0.01248802, "auxiliary_loss_mlp": 0.01068726, "balance_loss_clip": 1.0411644, "balance_loss_mlp": 1.05989861, "epoch": 0.05952202014128964, "flos": 22127760650880.0, "grad_norm": 1.8337286522452738, "language_loss": 0.7811929, "learning_rate": 3.965264943425766e-06, "loss": 0.80436814, "num_input_tokens_seen": 21167845, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.5234375, "step": 990, "time_per_iteration": 2.6040098667144775 }, { "auxiliary_loss_clip": 0.0121256, "auxiliary_loss_mlp": 0.01056244, "balance_loss_clip": 1.03053105, "balance_loss_mlp": 1.05719602, "epoch": 0.059582143393957614, "flos": 20445776438400.0, "grad_norm": 2.5488870660065537, "language_loss": 0.86150455, "learning_rate": 3.965194798719059e-06, "loss": 0.88419259, "num_input_tokens_seen": 21185085, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.4609375, "step": 991, "time_per_iteration": 2.6233417987823486 }, { "auxiliary_loss_clip": 0.012099, "auxiliary_loss_mlp": 0.01066347, "balance_loss_clip": 1.04143214, "balance_loss_mlp": 1.05893707, "epoch": 0.059642266646625586, "flos": 20594877793920.0, "grad_norm": 2.0228145499462893, "language_loss": 0.76537871, "learning_rate": 3.965124583879604e-06, "loss": 0.78814113, "num_input_tokens_seen": 21204230, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.515625, "step": 992, "time_per_iteration": 2.6784207820892334 }, { "auxiliary_loss_clip": 0.01244538, "auxiliary_loss_mlp": 0.01059725, "balance_loss_clip": 1.03577554, "balance_loss_mlp": 1.0607121, "epoch": 0.05970238989929355, "flos": 19352655792000.0, "grad_norm": 2.69792093660772, "language_loss": 0.74379444, "learning_rate": 3.965054298909908e-06, "loss": 0.76683712, "num_input_tokens_seen": 21222655, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.4765625, "step": 993, "time_per_iteration": 2.608604669570923 }, { "auxiliary_loss_clip": 0.01234177, "auxiliary_loss_mlp": 0.01074207, "balance_loss_clip": 1.04762363, "balance_loss_mlp": 1.06033826, "epoch": 0.05976251315196152, "flos": 30264040926720.0, "grad_norm": 1.816655732989781, "language_loss": 0.78519338, "learning_rate": 3.964983943812479e-06, "loss": 0.80827725, "num_input_tokens_seen": 21242310, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.46875, "step": 994, "time_per_iteration": 2.688215732574463 }, { "auxiliary_loss_clip": 0.01207953, "auxiliary_loss_mlp": 0.01080878, "balance_loss_clip": 1.05603433, "balance_loss_mlp": 1.06092227, "epoch": 0.05982263640462949, "flos": 23185150243200.0, "grad_norm": 2.6530585379212686, "language_loss": 0.79694086, "learning_rate": 3.964913518589827e-06, "loss": 0.81982917, "num_input_tokens_seen": 21261410, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.46875, "step": 995, "time_per_iteration": 2.7627687454223633 }, { "auxiliary_loss_clip": 0.01231961, "auxiliary_loss_mlp": 0.01066439, "balance_loss_clip": 1.04029667, "balance_loss_mlp": 1.05676115, "epoch": 0.05988275965729746, "flos": 27850879463040.0, "grad_norm": 2.424134422514706, "language_loss": 0.86971223, "learning_rate": 3.964843023244466e-06, "loss": 0.8926962, "num_input_tokens_seen": 21280080, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.4765625, "step": 996, "time_per_iteration": 2.6478915214538574 }, { "auxiliary_loss_clip": 0.01235363, "auxiliary_loss_mlp": 0.01076008, "balance_loss_clip": 1.04656339, "balance_loss_mlp": 1.0592041, "epoch": 0.05994288290996543, "flos": 24680003575680.0, "grad_norm": 16.872969966343067, "language_loss": 0.88080871, "learning_rate": 3.964772457778912e-06, "loss": 0.90392238, "num_input_tokens_seen": 21296765, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.484375, "step": 997, "time_per_iteration": 2.595156669616699 }, { "auxiliary_loss_clip": 0.01108387, "auxiliary_loss_mlp": 0.01014782, "balance_loss_clip": 1.01082456, "balance_loss_mlp": 1.02657223, "epoch": 0.0600030061626334, "flos": 69929568835200.0, "grad_norm": 1.0035244756167176, "language_loss": 0.7537719, "learning_rate": 3.964701822195683e-06, "loss": 0.77500361, "num_input_tokens_seen": 21363345, "router_z_loss_clip": 0.03955078, "router_z_loss_mlp": 0.6328125, "step": 998, "time_per_iteration": 3.255857229232788 }, { "auxiliary_loss_clip": 0.01233483, "auxiliary_loss_mlp": 0.01062292, "balance_loss_clip": 1.03766358, "balance_loss_mlp": 1.06114423, "epoch": 0.06006312941530137, "flos": 26540140268160.0, "grad_norm": 2.1635381695328553, "language_loss": 0.75563693, "learning_rate": 3.9646311164973e-06, "loss": 0.77859467, "num_input_tokens_seen": 21385290, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.453125, "step": 999, "time_per_iteration": 2.618399143218994 }, { "auxiliary_loss_clip": 0.01217422, "auxiliary_loss_mlp": 0.01060476, "balance_loss_clip": 1.03473878, "balance_loss_mlp": 1.0589149, "epoch": 0.060123252667969335, "flos": 27344000689920.0, "grad_norm": 2.3184063794255265, "language_loss": 0.83008826, "learning_rate": 3.9645603406862846e-06, "loss": 0.85286719, "num_input_tokens_seen": 21407625, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.5, "step": 1000, "time_per_iteration": 2.6240110397338867 }, { "auxiliary_loss_clip": 0.01227629, "auxiliary_loss_mlp": 0.0106156, "balance_loss_clip": 1.03731263, "balance_loss_mlp": 1.0611738, "epoch": 0.06018337592063731, "flos": 27016710940800.0, "grad_norm": 2.551280665498057, "language_loss": 0.86131823, "learning_rate": 3.964489494765166e-06, "loss": 0.88421011, "num_input_tokens_seen": 21426835, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.4765625, "step": 1001, "time_per_iteration": 2.601722002029419 }, { "auxiliary_loss_clip": 0.0122216, "auxiliary_loss_mlp": 0.01058532, "balance_loss_clip": 1.03423703, "balance_loss_mlp": 1.05939019, "epoch": 0.06024349917330528, "flos": 25592960580480.0, "grad_norm": 2.084195618868116, "language_loss": 0.73772097, "learning_rate": 3.96441857873647e-06, "loss": 0.76052791, "num_input_tokens_seen": 21444920, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.4453125, "step": 1002, "time_per_iteration": 2.651949167251587 }, { "auxiliary_loss_clip": 0.01205132, "auxiliary_loss_mlp": 0.01058863, "balance_loss_clip": 1.03368616, "balance_loss_mlp": 1.05882037, "epoch": 0.060303622425973244, "flos": 26133271937280.0, "grad_norm": 2.5647676084429327, "language_loss": 0.75542438, "learning_rate": 3.964347592602728e-06, "loss": 0.77806437, "num_input_tokens_seen": 21463555, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.4609375, "step": 1003, "time_per_iteration": 2.6224913597106934 }, { "auxiliary_loss_clip": 0.01232365, "auxiliary_loss_mlp": 0.01066464, "balance_loss_clip": 1.04085755, "balance_loss_mlp": 1.0631206, "epoch": 0.060363745678641216, "flos": 20377187418240.0, "grad_norm": 2.5986170125132606, "language_loss": 0.69475317, "learning_rate": 3.964276536366473e-06, "loss": 0.71774149, "num_input_tokens_seen": 21481990, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.5078125, "step": 1004, "time_per_iteration": 2.6373350620269775 }, { "auxiliary_loss_clip": 0.01223233, "auxiliary_loss_mlp": 0.01069703, "balance_loss_clip": 1.04433572, "balance_loss_mlp": 1.06383824, "epoch": 0.06042386893130918, "flos": 17749172753280.0, "grad_norm": 4.921407866947859, "language_loss": 0.83185524, "learning_rate": 3.964205410030241e-06, "loss": 0.85478455, "num_input_tokens_seen": 21500385, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.5, "step": 1005, "time_per_iteration": 2.5167696475982666 }, { "auxiliary_loss_clip": 0.01097431, "auxiliary_loss_mlp": 0.01004382, "balance_loss_clip": 1.00020981, "balance_loss_mlp": 1.02587724, "epoch": 0.06048399218397715, "flos": 68538496872960.0, "grad_norm": 0.9036998354132295, "language_loss": 0.59036505, "learning_rate": 3.964134213596571e-06, "loss": 0.61138314, "num_input_tokens_seen": 21561040, "router_z_loss_clip": 0.04174805, "router_z_loss_mlp": 0.625, "step": 1006, "time_per_iteration": 3.214705467224121 }, { "auxiliary_loss_clip": 0.01225421, "auxiliary_loss_mlp": 0.01057317, "balance_loss_clip": 1.03126955, "balance_loss_mlp": 1.05887532, "epoch": 0.060544115436645125, "flos": 23258515772160.0, "grad_norm": 2.222788854131573, "language_loss": 0.74037904, "learning_rate": 3.964062947068003e-06, "loss": 0.76320648, "num_input_tokens_seen": 21580655, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.484375, "step": 1007, "time_per_iteration": 2.654587745666504 }, { "auxiliary_loss_clip": 0.0122615, "auxiliary_loss_mlp": 0.01058593, "balance_loss_clip": 1.03315401, "balance_loss_mlp": 1.0596838, "epoch": 0.06060423868931309, "flos": 23878441624320.0, "grad_norm": 1.7567880017358963, "language_loss": 0.80225092, "learning_rate": 3.9639916104470804e-06, "loss": 0.82509828, "num_input_tokens_seen": 21599650, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.484375, "step": 1008, "time_per_iteration": 2.6821022033691406 }, { "auxiliary_loss_clip": 0.01236901, "auxiliary_loss_mlp": 0.0105822, "balance_loss_clip": 1.03232741, "balance_loss_mlp": 1.06300998, "epoch": 0.06066436194198106, "flos": 18728061171840.0, "grad_norm": 1.8573576399240008, "language_loss": 0.77589655, "learning_rate": 3.9639202037363494e-06, "loss": 0.79884779, "num_input_tokens_seen": 21617550, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.4609375, "step": 1009, "time_per_iteration": 2.6146345138549805 }, { "auxiliary_loss_clip": 0.01215681, "auxiliary_loss_mlp": 0.010566, "balance_loss_clip": 1.03247249, "balance_loss_mlp": 1.06205261, "epoch": 0.06072448519464903, "flos": 24640465680000.0, "grad_norm": 1.7646258304018867, "language_loss": 0.9295274, "learning_rate": 3.9638487269383575e-06, "loss": 0.9522503, "num_input_tokens_seen": 21635865, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.4453125, "step": 1010, "time_per_iteration": 2.626885175704956 }, { "auxiliary_loss_clip": 0.01260613, "auxiliary_loss_mlp": 0.01316776, "balance_loss_clip": 1.0399642, "balance_loss_mlp": 1.06027448, "epoch": 0.060784608447317, "flos": 17378825575680.0, "grad_norm": 2.2435564912290173, "language_loss": 0.7144537, "learning_rate": 3.9637771800556576e-06, "loss": 0.74022758, "num_input_tokens_seen": 21653945, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.546875, "step": 1011, "time_per_iteration": 2.634577512741089 }, { "auxiliary_loss_clip": 0.01218441, "auxiliary_loss_mlp": 0.01068909, "balance_loss_clip": 1.04157472, "balance_loss_mlp": 1.05851626, "epoch": 0.06084473169998497, "flos": 23692208584320.0, "grad_norm": 2.4703345576583384, "language_loss": 0.87123692, "learning_rate": 3.963705563090801e-06, "loss": 0.89411032, "num_input_tokens_seen": 21671230, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.5078125, "step": 1012, "time_per_iteration": 2.590048313140869 }, { "auxiliary_loss_clip": 0.01211951, "auxiliary_loss_mlp": 0.01063863, "balance_loss_clip": 1.03911567, "balance_loss_mlp": 1.05683541, "epoch": 0.06090485495265294, "flos": 23546339452800.0, "grad_norm": 1.793110628419584, "language_loss": 0.76638633, "learning_rate": 3.963633876046344e-06, "loss": 0.78914446, "num_input_tokens_seen": 21691155, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.4609375, "step": 1013, "time_per_iteration": 2.626441478729248 }, { "auxiliary_loss_clip": 0.01220768, "auxiliary_loss_mlp": 0.01313019, "balance_loss_clip": 1.0363214, "balance_loss_mlp": 1.06023932, "epoch": 0.06096497820532091, "flos": 20339301548160.0, "grad_norm": 2.474636220996359, "language_loss": 0.8532533, "learning_rate": 3.963562118924844e-06, "loss": 0.87859118, "num_input_tokens_seen": 21707405, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.515625, "step": 1014, "time_per_iteration": 5.716583251953125 }, { "auxiliary_loss_clip": 0.01224329, "auxiliary_loss_mlp": 0.01074757, "balance_loss_clip": 1.04695737, "balance_loss_mlp": 1.06238675, "epoch": 0.061025101457988874, "flos": 26939035779840.0, "grad_norm": 4.481483870780565, "language_loss": 0.72533947, "learning_rate": 3.963490291728864e-06, "loss": 0.74833035, "num_input_tokens_seen": 21728090, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.53125, "step": 1015, "time_per_iteration": 4.072299242019653 }, { "auxiliary_loss_clip": 0.01221115, "auxiliary_loss_mlp": 0.01063177, "balance_loss_clip": 1.03739238, "balance_loss_mlp": 1.05883276, "epoch": 0.061085224710656846, "flos": 25375054723200.0, "grad_norm": 1.642254933862659, "language_loss": 0.79237723, "learning_rate": 3.963418394460966e-06, "loss": 0.81522018, "num_input_tokens_seen": 21747950, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.4453125, "step": 1016, "time_per_iteration": 2.7332170009613037 }, { "auxiliary_loss_clip": 0.01234602, "auxiliary_loss_mlp": 0.01059076, "balance_loss_clip": 1.03463793, "balance_loss_mlp": 1.05837893, "epoch": 0.06114534796332482, "flos": 24824759385600.0, "grad_norm": 1.6846492804244757, "language_loss": 0.75972342, "learning_rate": 3.9633464271237166e-06, "loss": 0.78266025, "num_input_tokens_seen": 21767900, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.484375, "step": 1017, "time_per_iteration": 4.12010931968689 }, { "auxiliary_loss_clip": 0.0123565, "auxiliary_loss_mlp": 0.01075242, "balance_loss_clip": 1.04875338, "balance_loss_mlp": 1.06361866, "epoch": 0.061205471215992784, "flos": 20631434860800.0, "grad_norm": 2.356639491571373, "language_loss": 0.85502017, "learning_rate": 3.963274389719682e-06, "loss": 0.87812901, "num_input_tokens_seen": 21787375, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.5390625, "step": 1018, "time_per_iteration": 2.660543918609619 }, { "auxiliary_loss_clip": 0.01233104, "auxiliary_loss_mlp": 0.01078135, "balance_loss_clip": 1.05083632, "balance_loss_mlp": 1.0594672, "epoch": 0.061265594468660756, "flos": 16508351381760.0, "grad_norm": 2.504664406875458, "language_loss": 0.76664591, "learning_rate": 3.963202282251436e-06, "loss": 0.78975832, "num_input_tokens_seen": 21806275, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.4609375, "step": 1019, "time_per_iteration": 2.526557207107544 }, { "auxiliary_loss_clip": 0.01244801, "auxiliary_loss_mlp": 0.01062125, "balance_loss_clip": 1.03614902, "balance_loss_mlp": 1.05844247, "epoch": 0.06132571772132872, "flos": 26246211275520.0, "grad_norm": 2.1692980680932368, "language_loss": 0.84293199, "learning_rate": 3.96313010472155e-06, "loss": 0.86600125, "num_input_tokens_seen": 21826430, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.5, "step": 1020, "time_per_iteration": 2.7281930446624756 }, { "auxiliary_loss_clip": 0.01221003, "auxiliary_loss_mlp": 0.01059125, "balance_loss_clip": 1.03328085, "balance_loss_mlp": 1.06277573, "epoch": 0.06138584097399669, "flos": 37414788768000.0, "grad_norm": 1.9377845003288587, "language_loss": 0.79379797, "learning_rate": 3.963057857132601e-06, "loss": 0.81659919, "num_input_tokens_seen": 21847800, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.4921875, "step": 1021, "time_per_iteration": 2.7410073280334473 }, { "auxiliary_loss_clip": 0.01225029, "auxiliary_loss_mlp": 0.01064328, "balance_loss_clip": 1.04153562, "balance_loss_mlp": 1.06000566, "epoch": 0.061445964226664665, "flos": 17420661941760.0, "grad_norm": 1.8671798675168303, "language_loss": 0.87449753, "learning_rate": 3.962985539487165e-06, "loss": 0.89739114, "num_input_tokens_seen": 21863385, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.46875, "step": 1022, "time_per_iteration": 2.5532567501068115 }, { "auxiliary_loss_clip": 0.01227814, "auxiliary_loss_mlp": 0.01067091, "balance_loss_clip": 1.04140186, "balance_loss_mlp": 1.06308174, "epoch": 0.06150608747933263, "flos": 22600021691520.0, "grad_norm": 1.9508531035469834, "language_loss": 0.82821798, "learning_rate": 3.962913151787826e-06, "loss": 0.85116702, "num_input_tokens_seen": 21881880, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.4609375, "step": 1023, "time_per_iteration": 2.588548183441162 }, { "auxiliary_loss_clip": 0.01090275, "auxiliary_loss_mlp": 0.01004701, "balance_loss_clip": 1.00064778, "balance_loss_mlp": 1.02841592, "epoch": 0.0615662107320006, "flos": 56741482005120.0, "grad_norm": 0.8977274679403141, "language_loss": 0.6505568, "learning_rate": 3.962840694037165e-06, "loss": 0.67150664, "num_input_tokens_seen": 21940550, "router_z_loss_clip": 0.04052734, "router_z_loss_mlp": 0.6171875, "step": 1024, "time_per_iteration": 3.2464776039123535 }, { "auxiliary_loss_clip": 0.01229763, "auxiliary_loss_mlp": 0.01064292, "balance_loss_clip": 1.03780329, "balance_loss_mlp": 1.064134, "epoch": 0.06162633398466857, "flos": 22564793427840.0, "grad_norm": 1.9776837780244398, "language_loss": 0.87945616, "learning_rate": 3.962768166237768e-06, "loss": 0.90239668, "num_input_tokens_seen": 21958390, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.46875, "step": 1025, "time_per_iteration": 2.5845236778259277 }, { "auxiliary_loss_clip": 0.01228192, "auxiliary_loss_mlp": 0.01062664, "balance_loss_clip": 1.03782034, "balance_loss_mlp": 1.06160736, "epoch": 0.06168645723733654, "flos": 25593104234880.0, "grad_norm": 2.0188859649276623, "language_loss": 0.84337115, "learning_rate": 3.9626955683922264e-06, "loss": 0.86627972, "num_input_tokens_seen": 21978625, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.484375, "step": 1026, "time_per_iteration": 2.603502035140991 }, { "auxiliary_loss_clip": 0.01235938, "auxiliary_loss_mlp": 0.01061035, "balance_loss_clip": 1.03627527, "balance_loss_mlp": 1.05979025, "epoch": 0.06174658049000451, "flos": 15997917162240.0, "grad_norm": 1.9607504492921002, "language_loss": 0.81993461, "learning_rate": 3.962622900503127e-06, "loss": 0.84290433, "num_input_tokens_seen": 21996035, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.4921875, "step": 1027, "time_per_iteration": 2.6414425373077393 }, { "auxiliary_loss_clip": 0.01243883, "auxiliary_loss_mlp": 0.01059211, "balance_loss_clip": 1.03511918, "balance_loss_mlp": 1.06126034, "epoch": 0.06180670374267248, "flos": 11285970117120.0, "grad_norm": 5.0582755044908065, "language_loss": 0.84087092, "learning_rate": 3.962550162573065e-06, "loss": 0.86390185, "num_input_tokens_seen": 22011625, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.4609375, "step": 1028, "time_per_iteration": 2.591970205307007 }, { "auxiliary_loss_clip": 0.01105681, "auxiliary_loss_mlp": 0.01008519, "balance_loss_clip": 1.00460911, "balance_loss_mlp": 1.02648914, "epoch": 0.06186682699534045, "flos": 65130142216320.0, "grad_norm": 1.4608996745177012, "language_loss": 0.60521525, "learning_rate": 3.962477354604636e-06, "loss": 0.62635726, "num_input_tokens_seen": 22066035, "router_z_loss_clip": 0.0390625, "router_z_loss_mlp": 0.609375, "step": 1029, "time_per_iteration": 3.0255439281463623 }, { "auxiliary_loss_clip": 0.0122114, "auxiliary_loss_mlp": 0.01066624, "balance_loss_clip": 1.04265118, "balance_loss_mlp": 1.05763841, "epoch": 0.061926950248008414, "flos": 21105742976640.0, "grad_norm": 2.227641349012994, "language_loss": 0.82586586, "learning_rate": 3.962404476600438e-06, "loss": 0.8487435, "num_input_tokens_seen": 22085015, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.453125, "step": 1030, "time_per_iteration": 2.584923028945923 }, { "auxiliary_loss_clip": 0.01213081, "auxiliary_loss_mlp": 0.01071778, "balance_loss_clip": 1.04540896, "balance_loss_mlp": 1.06357455, "epoch": 0.061987073500676386, "flos": 17748454481280.0, "grad_norm": 2.6258833779532313, "language_loss": 0.7960546, "learning_rate": 3.962331528563072e-06, "loss": 0.81890315, "num_input_tokens_seen": 22102775, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.5, "step": 1031, "time_per_iteration": 2.6013848781585693 }, { "auxiliary_loss_clip": 0.01225338, "auxiliary_loss_mlp": 0.01075388, "balance_loss_clip": 1.04926968, "balance_loss_mlp": 1.06011379, "epoch": 0.06204719675334436, "flos": 21836237869440.0, "grad_norm": 1.5995063557518507, "language_loss": 0.77238345, "learning_rate": 3.962258510495142e-06, "loss": 0.79539073, "num_input_tokens_seen": 22121680, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.46875, "step": 1032, "time_per_iteration": 2.549928903579712 }, { "auxiliary_loss_clip": 0.01216152, "auxiliary_loss_mlp": 0.01066697, "balance_loss_clip": 1.04049516, "balance_loss_mlp": 1.05873466, "epoch": 0.06210732000601232, "flos": 19353697286400.0, "grad_norm": 2.407441316581249, "language_loss": 0.88917124, "learning_rate": 3.962185422399254e-06, "loss": 0.9119997, "num_input_tokens_seen": 22138155, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.484375, "step": 1033, "time_per_iteration": 2.5907297134399414 }, { "auxiliary_loss_clip": 0.01220336, "auxiliary_loss_mlp": 0.01062818, "balance_loss_clip": 1.03827262, "balance_loss_mlp": 1.05986643, "epoch": 0.062167443258680295, "flos": 24749382695040.0, "grad_norm": 2.2474239039951907, "language_loss": 0.84563768, "learning_rate": 3.962112264278014e-06, "loss": 0.86846918, "num_input_tokens_seen": 22157420, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.5078125, "step": 1034, "time_per_iteration": 2.5751187801361084 }, { "auxiliary_loss_clip": 0.01220123, "auxiliary_loss_mlp": 0.01059042, "balance_loss_clip": 1.03341198, "balance_loss_mlp": 1.06158054, "epoch": 0.06222756651134827, "flos": 34458478773120.0, "grad_norm": 1.8300571916930426, "language_loss": 0.80415004, "learning_rate": 3.962039036134035e-06, "loss": 0.82694173, "num_input_tokens_seen": 22178620, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.40625, "step": 1035, "time_per_iteration": 2.7130491733551025 }, { "auxiliary_loss_clip": 0.01246117, "auxiliary_loss_mlp": 0.01067916, "balance_loss_clip": 1.04091525, "balance_loss_mlp": 1.06294179, "epoch": 0.06228768976401623, "flos": 25666469763840.0, "grad_norm": 3.7559202721763483, "language_loss": 0.78399277, "learning_rate": 3.961965737969931e-06, "loss": 0.80713308, "num_input_tokens_seen": 22197125, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.46875, "step": 1036, "time_per_iteration": 2.651880979537964 }, { "auxiliary_loss_clip": 0.01211334, "auxiliary_loss_mlp": 0.0106765, "balance_loss_clip": 1.04249656, "balance_loss_mlp": 1.05877185, "epoch": 0.062347813016684205, "flos": 25295619795840.0, "grad_norm": 1.95737630353409, "language_loss": 0.86544824, "learning_rate": 3.961892369788315e-06, "loss": 0.88823807, "num_input_tokens_seen": 22217575, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.4375, "step": 1037, "time_per_iteration": 2.6157023906707764 }, { "auxiliary_loss_clip": 0.01213264, "auxiliary_loss_mlp": 0.01057339, "balance_loss_clip": 1.03060079, "balance_loss_mlp": 1.06010318, "epoch": 0.06240793626935217, "flos": 26907039740160.0, "grad_norm": 2.2322050439267236, "language_loss": 0.80472386, "learning_rate": 3.961818931591808e-06, "loss": 0.82742995, "num_input_tokens_seen": 22236840, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.4375, "step": 1038, "time_per_iteration": 2.6615543365478516 }, { "auxiliary_loss_clip": 0.01229417, "auxiliary_loss_mlp": 0.01063798, "balance_loss_clip": 1.03779888, "balance_loss_mlp": 1.0604589, "epoch": 0.06246805952202014, "flos": 21615782146560.0, "grad_norm": 1.9198837979104988, "language_loss": 0.85769969, "learning_rate": 3.961745423383028e-06, "loss": 0.8806318, "num_input_tokens_seen": 22256465, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.4140625, "step": 1039, "time_per_iteration": 2.580744743347168 }, { "auxiliary_loss_clip": 0.01221592, "auxiliary_loss_mlp": 0.01065634, "balance_loss_clip": 1.04055238, "balance_loss_mlp": 1.05893016, "epoch": 0.0625281827746881, "flos": 19311896833920.0, "grad_norm": 1.9265717475675823, "language_loss": 0.80577731, "learning_rate": 3.961671845164602e-06, "loss": 0.82864952, "num_input_tokens_seen": 22274025, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.4453125, "step": 1040, "time_per_iteration": 2.584958553314209 }, { "auxiliary_loss_clip": 0.01207713, "auxiliary_loss_mlp": 0.01065322, "balance_loss_clip": 1.04066992, "balance_loss_mlp": 1.06386316, "epoch": 0.06258830602735609, "flos": 27745769289600.0, "grad_norm": 2.4300139673681294, "language_loss": 0.69444895, "learning_rate": 3.961598196939153e-06, "loss": 0.7171793, "num_input_tokens_seen": 22292245, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.4375, "step": 1041, "time_per_iteration": 2.6377480030059814 }, { "auxiliary_loss_clip": 0.01205833, "auxiliary_loss_mlp": 0.01059722, "balance_loss_clip": 1.03329349, "balance_loss_mlp": 1.05825496, "epoch": 0.06264842928002405, "flos": 23222605150080.0, "grad_norm": 2.6299974653631892, "language_loss": 0.81502271, "learning_rate": 3.961524478709311e-06, "loss": 0.83767825, "num_input_tokens_seen": 22311455, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.4765625, "step": 1042, "time_per_iteration": 2.5872247219085693 }, { "auxiliary_loss_clip": 0.0120326, "auxiliary_loss_mlp": 0.01050158, "balance_loss_clip": 1.02560055, "balance_loss_mlp": 1.05902421, "epoch": 0.06270855253269202, "flos": 38399495189760.0, "grad_norm": 2.214320038759532, "language_loss": 0.75815487, "learning_rate": 3.961450690477705e-06, "loss": 0.780689, "num_input_tokens_seen": 22333750, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.4375, "step": 1043, "time_per_iteration": 2.774864912033081 }, { "auxiliary_loss_clip": 0.01209275, "auxiliary_loss_mlp": 0.0106043, "balance_loss_clip": 1.03620648, "balance_loss_mlp": 1.06109679, "epoch": 0.06276867578535998, "flos": 22453542028800.0, "grad_norm": 1.901256242574698, "language_loss": 0.91772068, "learning_rate": 3.961376832246969e-06, "loss": 0.94041765, "num_input_tokens_seen": 22351940, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.390625, "step": 1044, "time_per_iteration": 2.768627166748047 }, { "auxiliary_loss_clip": 0.0121058, "auxiliary_loss_mlp": 0.01074706, "balance_loss_clip": 1.04968381, "balance_loss_mlp": 1.05781209, "epoch": 0.06282879903802796, "flos": 22930435923840.0, "grad_norm": 1.7617562387732029, "language_loss": 0.86238617, "learning_rate": 3.96130290401974e-06, "loss": 0.88523901, "num_input_tokens_seen": 22372085, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.4375, "step": 1045, "time_per_iteration": 2.6765894889831543 }, { "auxiliary_loss_clip": 0.01218345, "auxiliary_loss_mlp": 0.01065489, "balance_loss_clip": 1.04134881, "balance_loss_mlp": 1.05864072, "epoch": 0.06288892229069593, "flos": 34819237019520.0, "grad_norm": 2.4636596398202246, "language_loss": 0.77861637, "learning_rate": 3.961228905798655e-06, "loss": 0.80145466, "num_input_tokens_seen": 22392020, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.4140625, "step": 1046, "time_per_iteration": 2.7368242740631104 }, { "auxiliary_loss_clip": 0.0121389, "auxiliary_loss_mlp": 0.01069239, "balance_loss_clip": 1.04507565, "balance_loss_mlp": 1.06122875, "epoch": 0.06294904554336389, "flos": 19427134642560.0, "grad_norm": 4.5886092833098555, "language_loss": 0.77173668, "learning_rate": 3.961154837586356e-06, "loss": 0.79456794, "num_input_tokens_seen": 22411180, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.4375, "step": 1047, "time_per_iteration": 2.561077356338501 }, { "auxiliary_loss_clip": 0.01218494, "auxiliary_loss_mlp": 0.01062971, "balance_loss_clip": 1.03785396, "balance_loss_mlp": 1.06065071, "epoch": 0.06300916879603187, "flos": 40661867358720.0, "grad_norm": 2.1858804221208508, "language_loss": 0.7712512, "learning_rate": 3.961080699385484e-06, "loss": 0.79406583, "num_input_tokens_seen": 22435105, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.484375, "step": 1048, "time_per_iteration": 2.819023370742798 }, { "auxiliary_loss_clip": 0.01224002, "auxiliary_loss_mlp": 0.01068292, "balance_loss_clip": 1.04309082, "balance_loss_mlp": 1.05935609, "epoch": 0.06306929204869983, "flos": 23804142341760.0, "grad_norm": 2.1819555173809135, "language_loss": 0.77747065, "learning_rate": 3.961006491198688e-06, "loss": 0.80039364, "num_input_tokens_seen": 22452710, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.4609375, "step": 1049, "time_per_iteration": 2.5701920986175537 }, { "auxiliary_loss_clip": 0.01212441, "auxiliary_loss_mlp": 0.01058903, "balance_loss_clip": 1.0347873, "balance_loss_mlp": 1.05909038, "epoch": 0.0631294153013678, "flos": 18915802583040.0, "grad_norm": 2.0798306081951536, "language_loss": 0.82984966, "learning_rate": 3.960932213028614e-06, "loss": 0.85256308, "num_input_tokens_seen": 22470175, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.4453125, "step": 1050, "time_per_iteration": 2.7383229732513428 }, { "auxiliary_loss_clip": 0.01202274, "auxiliary_loss_mlp": 0.01065206, "balance_loss_clip": 1.0416863, "balance_loss_mlp": 1.06119752, "epoch": 0.06318953855403578, "flos": 24280174310400.0, "grad_norm": 1.9829947528125926, "language_loss": 0.76856816, "learning_rate": 3.960857864877913e-06, "loss": 0.79124296, "num_input_tokens_seen": 22490020, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.4140625, "step": 1051, "time_per_iteration": 2.593632698059082 }, { "auxiliary_loss_clip": 0.01224419, "auxiliary_loss_mlp": 0.01066641, "balance_loss_clip": 1.04188156, "balance_loss_mlp": 1.06174338, "epoch": 0.06324966180670374, "flos": 22528918719360.0, "grad_norm": 2.5061841910112643, "language_loss": 0.80273139, "learning_rate": 3.960783446749239e-06, "loss": 0.82564199, "num_input_tokens_seen": 22509685, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.4453125, "step": 1052, "time_per_iteration": 2.6267268657684326 }, { "auxiliary_loss_clip": 0.01207992, "auxiliary_loss_mlp": 0.01059674, "balance_loss_clip": 1.0352118, "balance_loss_mlp": 1.06242585, "epoch": 0.06330978505937171, "flos": 15778107884160.0, "grad_norm": 3.178365088408801, "language_loss": 0.78102946, "learning_rate": 3.960708958645247e-06, "loss": 0.80370617, "num_input_tokens_seen": 22527905, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.4609375, "step": 1053, "time_per_iteration": 2.4815287590026855 }, { "auxiliary_loss_clip": 0.01224827, "auxiliary_loss_mlp": 0.01303076, "balance_loss_clip": 1.03087246, "balance_loss_mlp": 1.06273484, "epoch": 0.06336990831203967, "flos": 21471098163840.0, "grad_norm": 1.9641632925938177, "language_loss": 0.84621143, "learning_rate": 3.960634400568597e-06, "loss": 0.87149048, "num_input_tokens_seen": 22546335, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.4375, "step": 1054, "time_per_iteration": 2.6103579998016357 }, { "auxiliary_loss_clip": 0.01222957, "auxiliary_loss_mlp": 0.01064317, "balance_loss_clip": 1.04039145, "balance_loss_mlp": 1.0634495, "epoch": 0.06343003156470765, "flos": 18478877546880.0, "grad_norm": 2.232544620607063, "language_loss": 0.85201395, "learning_rate": 3.9605597725219485e-06, "loss": 0.87488669, "num_input_tokens_seen": 22563885, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.4140625, "step": 1055, "time_per_iteration": 4.088316440582275 }, { "auxiliary_loss_clip": 0.01237206, "auxiliary_loss_mlp": 0.01067477, "balance_loss_clip": 1.04268169, "balance_loss_mlp": 1.0622617, "epoch": 0.06349015481737562, "flos": 25154886309120.0, "grad_norm": 2.6987953452573312, "language_loss": 0.80933738, "learning_rate": 3.960485074507964e-06, "loss": 0.83238423, "num_input_tokens_seen": 22583035, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.4765625, "step": 1056, "time_per_iteration": 2.7586634159088135 }, { "auxiliary_loss_clip": 0.01257285, "auxiliary_loss_mlp": 0.01062776, "balance_loss_clip": 1.03646636, "balance_loss_mlp": 1.06074083, "epoch": 0.06355027807004358, "flos": 26871775562880.0, "grad_norm": 2.209322774073181, "language_loss": 0.80495775, "learning_rate": 3.960410306529311e-06, "loss": 0.82815844, "num_input_tokens_seen": 22605055, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.5078125, "step": 1057, "time_per_iteration": 4.130179405212402 }, { "auxiliary_loss_clip": 0.01224695, "auxiliary_loss_mlp": 0.01061095, "balance_loss_clip": 1.03910077, "balance_loss_mlp": 1.05923986, "epoch": 0.06361040132271156, "flos": 21396691140480.0, "grad_norm": 2.2065084868771168, "language_loss": 0.83711803, "learning_rate": 3.960335468588656e-06, "loss": 0.85997593, "num_input_tokens_seen": 22623760, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.3828125, "step": 1058, "time_per_iteration": 2.804180860519409 }, { "auxiliary_loss_clip": 0.01223971, "auxiliary_loss_mlp": 0.01062492, "balance_loss_clip": 1.03538406, "balance_loss_mlp": 1.06106758, "epoch": 0.06367052457537953, "flos": 25733765894400.0, "grad_norm": 2.1361954923810624, "language_loss": 0.87558955, "learning_rate": 3.960260560688672e-06, "loss": 0.89845425, "num_input_tokens_seen": 22643000, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.4453125, "step": 1059, "time_per_iteration": 4.27583646774292 }, { "auxiliary_loss_clip": 0.01243637, "auxiliary_loss_mlp": 0.010593, "balance_loss_clip": 1.03436112, "balance_loss_mlp": 1.06863594, "epoch": 0.0637306478280475, "flos": 17631420992640.0, "grad_norm": 2.8743642821365176, "language_loss": 0.91150743, "learning_rate": 3.96018558283203e-06, "loss": 0.93453676, "num_input_tokens_seen": 22660460, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.46875, "step": 1060, "time_per_iteration": 2.7132747173309326 }, { "auxiliary_loss_clip": 0.01224159, "auxiliary_loss_mlp": 0.0106259, "balance_loss_clip": 1.03727019, "balance_loss_mlp": 1.06206036, "epoch": 0.06379077108071547, "flos": 13662610427520.0, "grad_norm": 2.3285877024162147, "language_loss": 0.86884439, "learning_rate": 3.960110535021406e-06, "loss": 0.89171195, "num_input_tokens_seen": 22679270, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.4375, "step": 1061, "time_per_iteration": 2.654916286468506 }, { "auxiliary_loss_clip": 0.012297, "auxiliary_loss_mlp": 0.01062003, "balance_loss_clip": 1.03476405, "balance_loss_mlp": 1.06052303, "epoch": 0.06385089433338344, "flos": 28478849961600.0, "grad_norm": 2.1173142488046697, "language_loss": 0.77321005, "learning_rate": 3.96003541725948e-06, "loss": 0.79612708, "num_input_tokens_seen": 22699330, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.515625, "step": 1062, "time_per_iteration": 2.7751662731170654 }, { "auxiliary_loss_clip": 0.0123496, "auxiliary_loss_mlp": 0.01062695, "balance_loss_clip": 1.0382328, "balance_loss_mlp": 1.0608356, "epoch": 0.0639110175860514, "flos": 24311057028480.0, "grad_norm": 2.1504806389689617, "language_loss": 0.86279035, "learning_rate": 3.959960229548932e-06, "loss": 0.88576692, "num_input_tokens_seen": 22717945, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.46875, "step": 1063, "time_per_iteration": 2.687791347503662 }, { "auxiliary_loss_clip": 0.01205917, "auxiliary_loss_mlp": 0.01061371, "balance_loss_clip": 1.03603911, "balance_loss_mlp": 1.06030917, "epoch": 0.06397114083871938, "flos": 22090772620800.0, "grad_norm": 2.125809630192689, "language_loss": 0.79860222, "learning_rate": 3.9598849718924456e-06, "loss": 0.82127512, "num_input_tokens_seen": 22736790, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.4609375, "step": 1064, "time_per_iteration": 2.6715850830078125 }, { "auxiliary_loss_clip": 0.01217364, "auxiliary_loss_mlp": 0.01069177, "balance_loss_clip": 1.04296303, "balance_loss_mlp": 1.06180954, "epoch": 0.06403126409138735, "flos": 19572824206080.0, "grad_norm": 2.5224150076648084, "language_loss": 0.84220469, "learning_rate": 3.9598096442927045e-06, "loss": 0.8650701, "num_input_tokens_seen": 22754745, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.4609375, "step": 1065, "time_per_iteration": 2.6769862174987793 }, { "auxiliary_loss_clip": 0.01237601, "auxiliary_loss_mlp": 0.01055425, "balance_loss_clip": 1.03159499, "balance_loss_mlp": 1.06739044, "epoch": 0.06409138734405531, "flos": 40807413267840.0, "grad_norm": 1.6656160590912725, "language_loss": 0.68174398, "learning_rate": 3.959734246752399e-06, "loss": 0.70467412, "num_input_tokens_seen": 22776780, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.421875, "step": 1066, "time_per_iteration": 2.8195106983184814 }, { "auxiliary_loss_clip": 0.01208186, "auxiliary_loss_mlp": 0.01072045, "balance_loss_clip": 1.04579508, "balance_loss_mlp": 1.0647068, "epoch": 0.06415151059672328, "flos": 20441610460800.0, "grad_norm": 1.9289819659393745, "language_loss": 0.90438056, "learning_rate": 3.959658779274219e-06, "loss": 0.92718279, "num_input_tokens_seen": 22793915, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.4375, "step": 1067, "time_per_iteration": 2.7054569721221924 }, { "auxiliary_loss_clip": 0.01226742, "auxiliary_loss_mlp": 0.01064325, "balance_loss_clip": 1.04011405, "balance_loss_mlp": 1.06207442, "epoch": 0.06421163384939126, "flos": 18072045129600.0, "grad_norm": 1.8772788311557065, "language_loss": 0.83035821, "learning_rate": 3.959583241860859e-06, "loss": 0.85326892, "num_input_tokens_seen": 22812670, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.4609375, "step": 1068, "time_per_iteration": 2.600095748901367 }, { "auxiliary_loss_clip": 0.01223254, "auxiliary_loss_mlp": 0.0106304, "balance_loss_clip": 1.03884065, "balance_loss_mlp": 1.06457043, "epoch": 0.06427175710205922, "flos": 25119442563840.0, "grad_norm": 2.467377898153468, "language_loss": 0.89404678, "learning_rate": 3.959507634515013e-06, "loss": 0.91690981, "num_input_tokens_seen": 22832440, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.3984375, "step": 1069, "time_per_iteration": 2.741276741027832 }, { "auxiliary_loss_clip": 0.01229052, "auxiliary_loss_mlp": 0.01073497, "balance_loss_clip": 1.04686582, "balance_loss_mlp": 1.06392682, "epoch": 0.06433188035472719, "flos": 17380549428480.0, "grad_norm": 4.690104509842661, "language_loss": 0.95548975, "learning_rate": 3.95943195723938e-06, "loss": 0.97851521, "num_input_tokens_seen": 22845495, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.46875, "step": 1070, "time_per_iteration": 2.5973317623138428 }, { "auxiliary_loss_clip": 0.01247874, "auxiliary_loss_mlp": 0.01052736, "balance_loss_clip": 1.02667642, "balance_loss_mlp": 1.06438041, "epoch": 0.06439200360739517, "flos": 23546267625600.0, "grad_norm": 1.7667869372279261, "language_loss": 0.88064837, "learning_rate": 3.959356210036661e-06, "loss": 0.90365446, "num_input_tokens_seen": 22865390, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.46875, "step": 1071, "time_per_iteration": 2.7667477130889893 }, { "auxiliary_loss_clip": 0.01228981, "auxiliary_loss_mlp": 0.01050807, "balance_loss_clip": 1.02788293, "balance_loss_mlp": 1.05898535, "epoch": 0.06445212686006313, "flos": 21979772616960.0, "grad_norm": 1.7455426196853059, "language_loss": 0.76308525, "learning_rate": 3.959280392909559e-06, "loss": 0.78588307, "num_input_tokens_seen": 22885495, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.4296875, "step": 1072, "time_per_iteration": 2.664447546005249 }, { "auxiliary_loss_clip": 0.01229817, "auxiliary_loss_mlp": 0.01066264, "balance_loss_clip": 1.04002571, "balance_loss_mlp": 1.06414402, "epoch": 0.0645122501127311, "flos": 25921291824000.0, "grad_norm": 1.923070773311182, "language_loss": 0.80561745, "learning_rate": 3.9592045058607785e-06, "loss": 0.82857829, "num_input_tokens_seen": 22904845, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.46875, "step": 1073, "time_per_iteration": 2.6812384128570557 }, { "auxiliary_loss_clip": 0.01208882, "auxiliary_loss_mlp": 0.01061368, "balance_loss_clip": 1.03673935, "balance_loss_mlp": 1.06030023, "epoch": 0.06457237336539907, "flos": 25626034028160.0, "grad_norm": 1.7835028927502212, "language_loss": 0.8049373, "learning_rate": 3.95912854889303e-06, "loss": 0.82763982, "num_input_tokens_seen": 22925940, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.390625, "step": 1074, "time_per_iteration": 2.771136999130249 }, { "auxiliary_loss_clip": 0.01221687, "auxiliary_loss_mlp": 0.01062054, "balance_loss_clip": 1.03657925, "balance_loss_mlp": 1.05921233, "epoch": 0.06463249661806704, "flos": 19463979018240.0, "grad_norm": 2.3619150791406676, "language_loss": 0.78949606, "learning_rate": 3.959052522009023e-06, "loss": 0.81233346, "num_input_tokens_seen": 22944375, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.4375, "step": 1075, "time_per_iteration": 2.6749613285064697 }, { "auxiliary_loss_clip": 0.01227089, "auxiliary_loss_mlp": 0.01062646, "balance_loss_clip": 1.03946006, "balance_loss_mlp": 1.06468475, "epoch": 0.064692619870735, "flos": 24498044254080.0, "grad_norm": 2.9153481391810896, "language_loss": 0.87990332, "learning_rate": 3.95897642521147e-06, "loss": 0.90280062, "num_input_tokens_seen": 22959145, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.4375, "step": 1076, "time_per_iteration": 2.703871250152588 }, { "auxiliary_loss_clip": 0.01210218, "auxiliary_loss_mlp": 0.01052804, "balance_loss_clip": 1.02890229, "balance_loss_mlp": 1.05951452, "epoch": 0.06475274312340297, "flos": 17018677860480.0, "grad_norm": 1.8437947701076225, "language_loss": 0.80082309, "learning_rate": 3.958900258503089e-06, "loss": 0.82345325, "num_input_tokens_seen": 22978100, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.4140625, "step": 1077, "time_per_iteration": 2.5801377296447754 }, { "auxiliary_loss_clip": 0.01229454, "auxiliary_loss_mlp": 0.01066634, "balance_loss_clip": 1.04027677, "balance_loss_mlp": 1.06150627, "epoch": 0.06481286637607095, "flos": 24572379450240.0, "grad_norm": 1.9802888429508685, "language_loss": 0.91323739, "learning_rate": 3.958824021886595e-06, "loss": 0.93619823, "num_input_tokens_seen": 22997285, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.5, "step": 1078, "time_per_iteration": 2.6635446548461914 }, { "auxiliary_loss_clip": 0.01214498, "auxiliary_loss_mlp": 0.01067148, "balance_loss_clip": 1.04132736, "balance_loss_mlp": 1.06380737, "epoch": 0.06487298962873891, "flos": 21105635235840.0, "grad_norm": 3.9789874511363013, "language_loss": 0.78410709, "learning_rate": 3.9587477153647115e-06, "loss": 0.80692351, "num_input_tokens_seen": 23016285, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.5078125, "step": 1079, "time_per_iteration": 2.5963215827941895 }, { "auxiliary_loss_clip": 0.01233252, "auxiliary_loss_mlp": 0.01065971, "balance_loss_clip": 1.0410682, "balance_loss_mlp": 1.06032789, "epoch": 0.06493311288140688, "flos": 24608182331520.0, "grad_norm": 2.2056074864902313, "language_loss": 0.69042969, "learning_rate": 3.95867133894016e-06, "loss": 0.71342194, "num_input_tokens_seen": 23036420, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.453125, "step": 1080, "time_per_iteration": 2.7164812088012695 }, { "auxiliary_loss_clip": 0.01245041, "auxiliary_loss_mlp": 0.01068742, "balance_loss_clip": 1.04187226, "balance_loss_mlp": 1.06202161, "epoch": 0.06499323613407486, "flos": 25337994865920.0, "grad_norm": 1.8839193281366018, "language_loss": 0.71704519, "learning_rate": 3.958594892615667e-06, "loss": 0.740183, "num_input_tokens_seen": 23056945, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.46875, "step": 1081, "time_per_iteration": 2.7932870388031006 }, { "auxiliary_loss_clip": 0.0124292, "auxiliary_loss_mlp": 0.01063683, "balance_loss_clip": 1.03727853, "balance_loss_mlp": 1.06291151, "epoch": 0.06505335938674282, "flos": 20375714960640.0, "grad_norm": 2.4897222797042993, "language_loss": 0.83511961, "learning_rate": 3.95851837639396e-06, "loss": 0.85818565, "num_input_tokens_seen": 23074940, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.4296875, "step": 1082, "time_per_iteration": 2.7579610347747803 }, { "auxiliary_loss_clip": 0.01225503, "auxiliary_loss_mlp": 0.01067073, "balance_loss_clip": 1.04063296, "balance_loss_mlp": 1.06267226, "epoch": 0.06511348263941079, "flos": 25337923038720.0, "grad_norm": 2.3818320698881763, "language_loss": 0.82619536, "learning_rate": 3.9584417902777695e-06, "loss": 0.84912109, "num_input_tokens_seen": 23093420, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.5390625, "step": 1083, "time_per_iteration": 2.6421115398406982 }, { "auxiliary_loss_clip": 0.01235113, "auxiliary_loss_mlp": 0.01065221, "balance_loss_clip": 1.03864884, "balance_loss_mlp": 1.06233811, "epoch": 0.06517360589207877, "flos": 20332801186560.0, "grad_norm": 2.6770435928986513, "language_loss": 0.79351234, "learning_rate": 3.95836513426983e-06, "loss": 0.81651556, "num_input_tokens_seen": 23111550, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.453125, "step": 1084, "time_per_iteration": 2.793856620788574 }, { "auxiliary_loss_clip": 0.01224707, "auxiliary_loss_mlp": 0.01064251, "balance_loss_clip": 1.03851366, "balance_loss_mlp": 1.06163824, "epoch": 0.06523372914474673, "flos": 31681650061440.0, "grad_norm": 2.1027660686680676, "language_loss": 0.66196424, "learning_rate": 3.958288408372877e-06, "loss": 0.68485379, "num_input_tokens_seen": 23130335, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.4453125, "step": 1085, "time_per_iteration": 2.729581594467163 }, { "auxiliary_loss_clip": 0.01221161, "auxiliary_loss_mlp": 0.01068754, "balance_loss_clip": 1.0444237, "balance_loss_mlp": 1.061481, "epoch": 0.0652938523974147, "flos": 20778165918720.0, "grad_norm": 1.8331211468599955, "language_loss": 0.76515186, "learning_rate": 3.9582116125896474e-06, "loss": 0.78805107, "num_input_tokens_seen": 23152380, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 1.4140625, "step": 1086, "time_per_iteration": 2.725050687789917 }, { "auxiliary_loss_clip": 0.01210351, "auxiliary_loss_mlp": 0.01062735, "balance_loss_clip": 1.03705788, "balance_loss_mlp": 1.05803561, "epoch": 0.06535397565008266, "flos": 16690993061760.0, "grad_norm": 2.0998226989750126, "language_loss": 0.85105121, "learning_rate": 3.958134746922882e-06, "loss": 0.87378204, "num_input_tokens_seen": 23171630, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.4296875, "step": 1087, "time_per_iteration": 2.643465280532837 }, { "auxiliary_loss_clip": 0.01226219, "auxiliary_loss_mlp": 0.01061745, "balance_loss_clip": 1.0367353, "balance_loss_mlp": 1.06302834, "epoch": 0.06541409890275064, "flos": 26868220116480.0, "grad_norm": 1.8427343810837125, "language_loss": 0.77938914, "learning_rate": 3.958057811375325e-06, "loss": 0.80226874, "num_input_tokens_seen": 23192520, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.453125, "step": 1088, "time_per_iteration": 2.748657464981079 }, { "auxiliary_loss_clip": 0.01221838, "auxiliary_loss_mlp": 0.01065287, "balance_loss_clip": 1.04210114, "balance_loss_mlp": 1.06000471, "epoch": 0.06547422215541861, "flos": 20521620005760.0, "grad_norm": 1.7350849046236556, "language_loss": 0.71274561, "learning_rate": 3.957980805949722e-06, "loss": 0.73561686, "num_input_tokens_seen": 23210710, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.4375, "step": 1089, "time_per_iteration": 2.6918342113494873 }, { "auxiliary_loss_clip": 0.01222127, "auxiliary_loss_mlp": 0.01308688, "balance_loss_clip": 1.03584003, "balance_loss_mlp": 1.06209469, "epoch": 0.06553434540808657, "flos": 22016616992640.0, "grad_norm": 1.7342069272863518, "language_loss": 0.85313278, "learning_rate": 3.957903730648819e-06, "loss": 0.87844098, "num_input_tokens_seen": 23230305, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.421875, "step": 1090, "time_per_iteration": 2.6526002883911133 }, { "auxiliary_loss_clip": 0.0120511, "auxiliary_loss_mlp": 0.01060183, "balance_loss_clip": 1.03497052, "balance_loss_mlp": 1.06171334, "epoch": 0.06559446866075455, "flos": 24608649208320.0, "grad_norm": 1.7449989483357364, "language_loss": 0.72065794, "learning_rate": 3.957826585475369e-06, "loss": 0.74331087, "num_input_tokens_seen": 23249015, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.4296875, "step": 1091, "time_per_iteration": 2.699218988418579 }, { "auxiliary_loss_clip": 0.01257212, "auxiliary_loss_mlp": 0.01063061, "balance_loss_clip": 1.03809905, "balance_loss_mlp": 1.05866301, "epoch": 0.06565459191342252, "flos": 24274679529600.0, "grad_norm": 2.095230685092632, "language_loss": 0.82455355, "learning_rate": 3.957749370432124e-06, "loss": 0.84775633, "num_input_tokens_seen": 23265105, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.4375, "step": 1092, "time_per_iteration": 2.7610037326812744 }, { "auxiliary_loss_clip": 0.01207602, "auxiliary_loss_mlp": 0.01065081, "balance_loss_clip": 1.03725767, "balance_loss_mlp": 1.05913007, "epoch": 0.06571471516609048, "flos": 24787124910720.0, "grad_norm": 1.8637933912538842, "language_loss": 0.70764625, "learning_rate": 3.957672085521841e-06, "loss": 0.73037302, "num_input_tokens_seen": 23283950, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.484375, "step": 1093, "time_per_iteration": 2.7109382152557373 }, { "auxiliary_loss_clip": 0.01223474, "auxiliary_loss_mlp": 0.0106204, "balance_loss_clip": 1.03528965, "balance_loss_mlp": 1.06153345, "epoch": 0.06577483841875846, "flos": 26214071581440.0, "grad_norm": 1.6598688825844548, "language_loss": 0.87822491, "learning_rate": 3.957594730747276e-06, "loss": 0.90108001, "num_input_tokens_seen": 23305005, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.4375, "step": 1094, "time_per_iteration": 2.6912343502044678 }, { "auxiliary_loss_clip": 0.0122467, "auxiliary_loss_mlp": 0.01065954, "balance_loss_clip": 1.03851151, "balance_loss_mlp": 1.0620656, "epoch": 0.06583496167142643, "flos": 25080802508160.0, "grad_norm": 2.0179939648623417, "language_loss": 0.81356847, "learning_rate": 3.957517306111191e-06, "loss": 0.83647478, "num_input_tokens_seen": 23323220, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.4453125, "step": 1095, "time_per_iteration": 2.7217588424682617 }, { "auxiliary_loss_clip": 0.01227965, "auxiliary_loss_mlp": 0.01060633, "balance_loss_clip": 1.03682709, "balance_loss_mlp": 1.05835271, "epoch": 0.06589508492409439, "flos": 25629804956160.0, "grad_norm": 2.0152202675974302, "language_loss": 0.69845438, "learning_rate": 3.957439811616349e-06, "loss": 0.7213403, "num_input_tokens_seen": 23342235, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.421875, "step": 1096, "time_per_iteration": 2.6689794063568115 }, { "auxiliary_loss_clip": 0.01215807, "auxiliary_loss_mlp": 0.01071025, "balance_loss_clip": 1.04586029, "balance_loss_mlp": 1.06359339, "epoch": 0.06595520817676236, "flos": 23621249266560.0, "grad_norm": 1.989684629143292, "language_loss": 0.77189142, "learning_rate": 3.957362247265515e-06, "loss": 0.79475975, "num_input_tokens_seen": 23363680, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.4296875, "step": 1097, "time_per_iteration": 4.220184326171875 }, { "auxiliary_loss_clip": 0.01205504, "auxiliary_loss_mlp": 0.01068698, "balance_loss_clip": 1.04361653, "balance_loss_mlp": 1.06051278, "epoch": 0.06601533142943034, "flos": 33801708545280.0, "grad_norm": 2.9412616338056035, "language_loss": 0.78565121, "learning_rate": 3.957284613061456e-06, "loss": 0.80839318, "num_input_tokens_seen": 23385590, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.4453125, "step": 1098, "time_per_iteration": 4.19852614402771 }, { "auxiliary_loss_clip": 0.01213628, "auxiliary_loss_mlp": 0.01076032, "balance_loss_clip": 1.04923368, "balance_loss_mlp": 1.06183326, "epoch": 0.0660754546820983, "flos": 20259184262400.0, "grad_norm": 1.886532120818202, "language_loss": 0.8167063, "learning_rate": 3.957206909006945e-06, "loss": 0.83960295, "num_input_tokens_seen": 23402945, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.421875, "step": 1099, "time_per_iteration": 2.6490867137908936 }, { "auxiliary_loss_clip": 0.01206739, "auxiliary_loss_mlp": 0.01058704, "balance_loss_clip": 1.03369427, "balance_loss_mlp": 1.05684638, "epoch": 0.06613557793476627, "flos": 19354164163200.0, "grad_norm": 1.8764856175861124, "language_loss": 0.82398582, "learning_rate": 3.957129135104754e-06, "loss": 0.84664029, "num_input_tokens_seen": 23421410, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.40625, "step": 1100, "time_per_iteration": 4.832265615463257 }, { "auxiliary_loss_clip": 0.01223724, "auxiliary_loss_mlp": 0.01065032, "balance_loss_clip": 1.03991485, "balance_loss_mlp": 1.06316972, "epoch": 0.06619570118743424, "flos": 13772568936960.0, "grad_norm": 2.1297366825148343, "language_loss": 0.73778462, "learning_rate": 3.957051291357658e-06, "loss": 0.76067221, "num_input_tokens_seen": 23438870, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.421875, "step": 1101, "time_per_iteration": 2.6657040119171143 }, { "auxiliary_loss_clip": 0.01207058, "auxiliary_loss_mlp": 0.01067802, "balance_loss_clip": 1.04339957, "balance_loss_mlp": 1.05700672, "epoch": 0.06625582444010221, "flos": 17857874286720.0, "grad_norm": 2.729213272124456, "language_loss": 0.85920173, "learning_rate": 3.956973377768437e-06, "loss": 0.88195038, "num_input_tokens_seen": 23456975, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.40625, "step": 1102, "time_per_iteration": 2.722254753112793 }, { "auxiliary_loss_clip": 0.01217627, "auxiliary_loss_mlp": 0.01061102, "balance_loss_clip": 1.03544843, "balance_loss_mlp": 1.05864716, "epoch": 0.06631594769277017, "flos": 11838707579520.0, "grad_norm": 1.9662363080308254, "language_loss": 0.81311965, "learning_rate": 3.956895394339869e-06, "loss": 0.83590698, "num_input_tokens_seen": 23473440, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.40625, "step": 1103, "time_per_iteration": 2.5477278232574463 }, { "auxiliary_loss_clip": 0.0121286, "auxiliary_loss_mlp": 0.0131284, "balance_loss_clip": 1.0402168, "balance_loss_mlp": 1.0630765, "epoch": 0.06637607094543815, "flos": 19793351756160.0, "grad_norm": 1.7173600615245106, "language_loss": 0.81846851, "learning_rate": 3.956817341074738e-06, "loss": 0.8437255, "num_input_tokens_seen": 23493880, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.40625, "step": 1104, "time_per_iteration": 2.664788007736206 }, { "auxiliary_loss_clip": 0.01222686, "auxiliary_loss_mlp": 0.01054534, "balance_loss_clip": 1.02803373, "balance_loss_mlp": 1.05600429, "epoch": 0.06643619419810612, "flos": 25485659677440.0, "grad_norm": 1.6297775501449205, "language_loss": 0.80494136, "learning_rate": 3.95673921797583e-06, "loss": 0.82771355, "num_input_tokens_seen": 23514920, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.390625, "step": 1105, "time_per_iteration": 2.670197010040283 }, { "auxiliary_loss_clip": 0.01213775, "auxiliary_loss_mlp": 0.01063312, "balance_loss_clip": 1.04020941, "balance_loss_mlp": 1.05906963, "epoch": 0.06649631745077408, "flos": 16946533393920.0, "grad_norm": 1.8643439237176356, "language_loss": 0.96660393, "learning_rate": 3.956661025045933e-06, "loss": 0.98937488, "num_input_tokens_seen": 23531635, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.3671875, "step": 1106, "time_per_iteration": 2.6800267696380615 }, { "auxiliary_loss_clip": 0.01214021, "auxiliary_loss_mlp": 0.01061011, "balance_loss_clip": 1.03555965, "balance_loss_mlp": 1.05987835, "epoch": 0.06655644070344206, "flos": 17858592558720.0, "grad_norm": 1.7855310956402766, "language_loss": 0.8217119, "learning_rate": 3.9565827622878365e-06, "loss": 0.84446216, "num_input_tokens_seen": 23551020, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.4453125, "step": 1107, "time_per_iteration": 2.621978759765625 }, { "auxiliary_loss_clip": 0.01093777, "auxiliary_loss_mlp": 0.01012115, "balance_loss_clip": 1.00782394, "balance_loss_mlp": 1.02592111, "epoch": 0.06661656395611003, "flos": 61419350021760.0, "grad_norm": 0.8044910979605662, "language_loss": 0.5677861, "learning_rate": 3.956504429704334e-06, "loss": 0.58884501, "num_input_tokens_seen": 23610675, "router_z_loss_clip": 0.04296875, "router_z_loss_mlp": 0.5859375, "step": 1108, "time_per_iteration": 3.1866164207458496 }, { "auxiliary_loss_clip": 0.01218436, "auxiliary_loss_mlp": 0.01062988, "balance_loss_clip": 1.03765571, "balance_loss_mlp": 1.05790353, "epoch": 0.066676687208778, "flos": 20662856282880.0, "grad_norm": 2.88953686407931, "language_loss": 0.72398627, "learning_rate": 3.956426027298221e-06, "loss": 0.74680054, "num_input_tokens_seen": 23628710, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.421875, "step": 1109, "time_per_iteration": 2.6403090953826904 }, { "auxiliary_loss_clip": 0.01233138, "auxiliary_loss_mlp": 0.01066043, "balance_loss_clip": 1.04036534, "balance_loss_mlp": 1.05701065, "epoch": 0.06673681046144596, "flos": 20923280864640.0, "grad_norm": 1.8999553643437765, "language_loss": 0.78283376, "learning_rate": 3.956347555072296e-06, "loss": 0.80582559, "num_input_tokens_seen": 23649160, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.390625, "step": 1110, "time_per_iteration": 2.744929552078247 }, { "auxiliary_loss_clip": 0.01227428, "auxiliary_loss_mlp": 0.01058357, "balance_loss_clip": 1.03452694, "balance_loss_mlp": 1.05948055, "epoch": 0.06679693371411394, "flos": 31065818359680.0, "grad_norm": 1.912028722002255, "language_loss": 0.7144019, "learning_rate": 3.95626901302936e-06, "loss": 0.73725975, "num_input_tokens_seen": 23671995, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.40625, "step": 1111, "time_per_iteration": 2.7555768489837646 }, { "auxiliary_loss_clip": 0.01230617, "auxiliary_loss_mlp": 0.01067685, "balance_loss_clip": 1.04345012, "balance_loss_mlp": 1.06036329, "epoch": 0.0668570569667819, "flos": 21726135705600.0, "grad_norm": 1.8532821242039863, "language_loss": 0.78405738, "learning_rate": 3.956190401172214e-06, "loss": 0.80704033, "num_input_tokens_seen": 23690705, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.4296875, "step": 1112, "time_per_iteration": 2.6687421798706055 }, { "auxiliary_loss_clip": 0.01229205, "auxiliary_loss_mlp": 0.01061215, "balance_loss_clip": 1.03711104, "balance_loss_mlp": 1.06124401, "epoch": 0.06691718021944987, "flos": 22747255539840.0, "grad_norm": 2.204544865722285, "language_loss": 0.7853601, "learning_rate": 3.956111719503664e-06, "loss": 0.80826432, "num_input_tokens_seen": 23709990, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.40625, "step": 1113, "time_per_iteration": 2.7651355266571045 }, { "auxiliary_loss_clip": 0.0123058, "auxiliary_loss_mlp": 0.01051676, "balance_loss_clip": 1.02804852, "balance_loss_mlp": 1.05613565, "epoch": 0.06697730347211785, "flos": 16545626720640.0, "grad_norm": 1.853148575925155, "language_loss": 0.82377946, "learning_rate": 3.956032968026519e-06, "loss": 0.84660196, "num_input_tokens_seen": 23728485, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.375, "step": 1114, "time_per_iteration": 2.5936455726623535 }, { "auxiliary_loss_clip": 0.01121964, "auxiliary_loss_mlp": 0.01004667, "balance_loss_clip": 1.00066209, "balance_loss_mlp": 1.02673101, "epoch": 0.06703742672478581, "flos": 59782326658560.0, "grad_norm": 0.8319730042102305, "language_loss": 0.58226508, "learning_rate": 3.955954146743589e-06, "loss": 0.60353136, "num_input_tokens_seen": 23786650, "router_z_loss_clip": 0.04003906, "router_z_loss_mlp": 0.5859375, "step": 1115, "time_per_iteration": 3.209649085998535 }, { "auxiliary_loss_clip": 0.01217656, "auxiliary_loss_mlp": 0.01059286, "balance_loss_clip": 1.03326309, "balance_loss_mlp": 1.05825365, "epoch": 0.06709754997745378, "flos": 16800197385600.0, "grad_norm": 2.582110924432283, "language_loss": 0.91590989, "learning_rate": 3.9558752556576874e-06, "loss": 0.93867928, "num_input_tokens_seen": 23802555, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.4140625, "step": 1116, "time_per_iteration": 2.601409912109375 }, { "auxiliary_loss_clip": 0.01233183, "auxiliary_loss_mlp": 0.01311957, "balance_loss_clip": 1.03873062, "balance_loss_mlp": 1.06184387, "epoch": 0.06715767323012176, "flos": 22123917895680.0, "grad_norm": 1.9842735162435179, "language_loss": 0.86905122, "learning_rate": 3.955796294771628e-06, "loss": 0.89450264, "num_input_tokens_seen": 23822945, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.4375, "step": 1117, "time_per_iteration": 2.6911087036132812 }, { "auxiliary_loss_clip": 0.01090918, "auxiliary_loss_mlp": 0.01004266, "balance_loss_clip": 1.00047481, "balance_loss_mlp": 1.02327061, "epoch": 0.06721779648278972, "flos": 66618100137600.0, "grad_norm": 0.856222012380048, "language_loss": 0.59828877, "learning_rate": 3.95571726408823e-06, "loss": 0.61924064, "num_input_tokens_seen": 23874075, "router_z_loss_clip": 0.0378418, "router_z_loss_mlp": 0.5859375, "step": 1118, "time_per_iteration": 3.123915195465088 }, { "auxiliary_loss_clip": 0.01217127, "auxiliary_loss_mlp": 0.01055513, "balance_loss_clip": 1.03123033, "balance_loss_mlp": 1.05539036, "epoch": 0.06727791973545769, "flos": 22382474970240.0, "grad_norm": 3.1059898691399273, "language_loss": 0.83725274, "learning_rate": 3.955638163610314e-06, "loss": 0.85997909, "num_input_tokens_seen": 23889720, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 1.4375, "step": 1119, "time_per_iteration": 2.64304780960083 }, { "auxiliary_loss_clip": 0.01214741, "auxiliary_loss_mlp": 0.01061857, "balance_loss_clip": 1.03949332, "balance_loss_mlp": 1.05862999, "epoch": 0.06733804298812565, "flos": 24280210224000.0, "grad_norm": 3.4787545229988446, "language_loss": 0.8426773, "learning_rate": 3.955558993340703e-06, "loss": 0.86544329, "num_input_tokens_seen": 23909385, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.375, "step": 1120, "time_per_iteration": 2.840301752090454 }, { "auxiliary_loss_clip": 0.01206276, "auxiliary_loss_mlp": 0.01065588, "balance_loss_clip": 1.04072118, "balance_loss_mlp": 1.06289697, "epoch": 0.06739816624079363, "flos": 15918230839680.0, "grad_norm": 1.8093799408611995, "language_loss": 0.78279519, "learning_rate": 3.955479753282221e-06, "loss": 0.80551386, "num_input_tokens_seen": 23926830, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.4375, "step": 1121, "time_per_iteration": 2.543463706970215 }, { "auxiliary_loss_clip": 0.01088268, "auxiliary_loss_mlp": 0.01010803, "balance_loss_clip": 1.00722694, "balance_loss_mlp": 1.02160966, "epoch": 0.0674582894934616, "flos": 71398567353600.0, "grad_norm": 0.7515058533257841, "language_loss": 0.58363867, "learning_rate": 3.955400443437696e-06, "loss": 0.6046294, "num_input_tokens_seen": 23992640, "router_z_loss_clip": 0.03564453, "router_z_loss_mlp": 0.57421875, "step": 1122, "time_per_iteration": 3.407900333404541 }, { "auxiliary_loss_clip": 0.01236071, "auxiliary_loss_mlp": 0.01061219, "balance_loss_clip": 1.03712678, "balance_loss_mlp": 1.06012511, "epoch": 0.06751841274612956, "flos": 25264952559360.0, "grad_norm": 1.918434816067421, "language_loss": 0.71134293, "learning_rate": 3.95532106380996e-06, "loss": 0.73431575, "num_input_tokens_seen": 24011135, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.390625, "step": 1123, "time_per_iteration": 2.7069170475006104 }, { "auxiliary_loss_clip": 0.01233793, "auxiliary_loss_mlp": 0.01059941, "balance_loss_clip": 1.03446603, "balance_loss_mlp": 1.05654454, "epoch": 0.06757853599879754, "flos": 23802741711360.0, "grad_norm": 2.22654854154564, "language_loss": 0.78651291, "learning_rate": 3.9552416144018445e-06, "loss": 0.80945027, "num_input_tokens_seen": 24030695, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.40625, "step": 1124, "time_per_iteration": 2.984158754348755 }, { "auxiliary_loss_clip": 0.0120133, "auxiliary_loss_mlp": 0.01057431, "balance_loss_clip": 1.03521013, "balance_loss_mlp": 1.05570805, "epoch": 0.0676386592514655, "flos": 21033742164480.0, "grad_norm": 2.1266968209827786, "language_loss": 0.71411008, "learning_rate": 3.955162095216186e-06, "loss": 0.73669767, "num_input_tokens_seen": 24050680, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.3671875, "step": 1125, "time_per_iteration": 2.6562366485595703 }, { "auxiliary_loss_clip": 0.01224795, "auxiliary_loss_mlp": 0.01067908, "balance_loss_clip": 1.04240918, "balance_loss_mlp": 1.05873036, "epoch": 0.06769878250413347, "flos": 25556331686400.0, "grad_norm": 2.9239388434795393, "language_loss": 0.81635141, "learning_rate": 3.95508250625582e-06, "loss": 0.8392784, "num_input_tokens_seen": 24067205, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.3828125, "step": 1126, "time_per_iteration": 2.709019422531128 }, { "auxiliary_loss_clip": 0.01096493, "auxiliary_loss_mlp": 0.01002821, "balance_loss_clip": 0.99931586, "balance_loss_mlp": 1.02094305, "epoch": 0.06775890575680145, "flos": 70655251305600.0, "grad_norm": 0.7727114723956019, "language_loss": 0.59769058, "learning_rate": 3.95500284752359e-06, "loss": 0.6186837, "num_input_tokens_seen": 24131320, "router_z_loss_clip": 0.03515625, "router_z_loss_mlp": 0.57421875, "step": 1127, "time_per_iteration": 3.229997396469116 }, { "auxiliary_loss_clip": 0.01194964, "auxiliary_loss_mlp": 0.0105682, "balance_loss_clip": 1.0328474, "balance_loss_mlp": 1.05842948, "epoch": 0.06781902900946941, "flos": 24235500769920.0, "grad_norm": 3.3626465183734164, "language_loss": 0.81506503, "learning_rate": 3.954923119022337e-06, "loss": 0.83758283, "num_input_tokens_seen": 24149930, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.3671875, "step": 1128, "time_per_iteration": 2.617730140686035 }, { "auxiliary_loss_clip": 0.0123239, "auxiliary_loss_mlp": 0.01056634, "balance_loss_clip": 1.03095651, "balance_loss_mlp": 1.06150556, "epoch": 0.06787915226213738, "flos": 22417523665920.0, "grad_norm": 2.405051228326887, "language_loss": 0.75467885, "learning_rate": 3.9548433207549065e-06, "loss": 0.77756912, "num_input_tokens_seen": 24169590, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.4296875, "step": 1129, "time_per_iteration": 2.695087194442749 }, { "auxiliary_loss_clip": 0.01252455, "auxiliary_loss_mlp": 0.01060506, "balance_loss_clip": 1.03588915, "balance_loss_mlp": 1.05681825, "epoch": 0.06793927551480534, "flos": 37706922080640.0, "grad_norm": 1.7673834047180073, "language_loss": 0.71329182, "learning_rate": 3.954763452724146e-06, "loss": 0.73642147, "num_input_tokens_seen": 24189965, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.40625, "step": 1130, "time_per_iteration": 2.7996182441711426 }, { "auxiliary_loss_clip": 0.01230491, "auxiliary_loss_mlp": 0.01058386, "balance_loss_clip": 1.03419852, "balance_loss_mlp": 1.05582976, "epoch": 0.06799939876747332, "flos": 20631398947200.0, "grad_norm": 2.2416130181923393, "language_loss": 0.80428255, "learning_rate": 3.954683514932906e-06, "loss": 0.82717133, "num_input_tokens_seen": 24208045, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.3828125, "step": 1131, "time_per_iteration": 2.71690034866333 }, { "auxiliary_loss_clip": 0.01202212, "auxiliary_loss_mlp": 0.01063226, "balance_loss_clip": 1.03918195, "balance_loss_mlp": 1.05732751, "epoch": 0.06805952202014129, "flos": 14864755829760.0, "grad_norm": 2.2387525853491455, "language_loss": 0.8073228, "learning_rate": 3.95460350738404e-06, "loss": 0.82997715, "num_input_tokens_seen": 24223805, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.359375, "step": 1132, "time_per_iteration": 2.602656126022339 }, { "auxiliary_loss_clip": 0.01212196, "auxiliary_loss_mlp": 0.01061491, "balance_loss_clip": 1.03762543, "balance_loss_mlp": 1.05619287, "epoch": 0.06811964527280925, "flos": 48909434947200.0, "grad_norm": 1.4866859716215728, "language_loss": 0.6330176, "learning_rate": 3.954523430080402e-06, "loss": 0.65575445, "num_input_tokens_seen": 24249475, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.375, "step": 1133, "time_per_iteration": 2.8526575565338135 }, { "auxiliary_loss_clip": 0.01242976, "auxiliary_loss_mlp": 0.01059481, "balance_loss_clip": 1.03219354, "balance_loss_mlp": 1.0544982, "epoch": 0.06817976852547723, "flos": 15377273038080.0, "grad_norm": 1.877518219941483, "language_loss": 0.74531108, "learning_rate": 3.9544432830248504e-06, "loss": 0.76833564, "num_input_tokens_seen": 24267980, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.4296875, "step": 1134, "time_per_iteration": 2.6042308807373047 }, { "auxiliary_loss_clip": 0.01222658, "auxiliary_loss_mlp": 0.01305616, "balance_loss_clip": 1.03377366, "balance_loss_mlp": 1.05874777, "epoch": 0.0682398917781452, "flos": 20155690200960.0, "grad_norm": 1.9939010254119292, "language_loss": 0.87151307, "learning_rate": 3.954363066220246e-06, "loss": 0.89679581, "num_input_tokens_seen": 24286805, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.3671875, "step": 1135, "time_per_iteration": 2.6492037773132324 }, { "auxiliary_loss_clip": 0.01211444, "auxiliary_loss_mlp": 0.01298081, "balance_loss_clip": 1.02478385, "balance_loss_mlp": 1.05324972, "epoch": 0.06830001503081316, "flos": 23440618748160.0, "grad_norm": 2.8132926803073914, "language_loss": 0.78065139, "learning_rate": 3.954282779669451e-06, "loss": 0.80574661, "num_input_tokens_seen": 24305855, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.40625, "step": 1136, "time_per_iteration": 2.603717803955078 }, { "auxiliary_loss_clip": 0.01216971, "auxiliary_loss_mlp": 0.01061055, "balance_loss_clip": 1.03687954, "balance_loss_mlp": 1.05900073, "epoch": 0.06836013828348114, "flos": 34349813153280.0, "grad_norm": 2.282465207394852, "language_loss": 0.839531, "learning_rate": 3.95420242337533e-06, "loss": 0.86231124, "num_input_tokens_seen": 24326535, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.3984375, "step": 1137, "time_per_iteration": 2.721789836883545 }, { "auxiliary_loss_clip": 0.01219448, "auxiliary_loss_mlp": 0.01057387, "balance_loss_clip": 1.03347397, "balance_loss_mlp": 1.05569983, "epoch": 0.06842026153614911, "flos": 23148844571520.0, "grad_norm": 2.4997758025067087, "language_loss": 0.78610671, "learning_rate": 3.954121997340752e-06, "loss": 0.80887502, "num_input_tokens_seen": 24345810, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.3671875, "step": 1138, "time_per_iteration": 4.0794548988342285 }, { "auxiliary_loss_clip": 0.01198708, "auxiliary_loss_mlp": 0.01058538, "balance_loss_clip": 1.03404069, "balance_loss_mlp": 1.05800903, "epoch": 0.06848038478881707, "flos": 24608972430720.0, "grad_norm": 2.170215524127931, "language_loss": 0.85350257, "learning_rate": 3.9540415015685855e-06, "loss": 0.87607503, "num_input_tokens_seen": 24366095, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.40625, "step": 1139, "time_per_iteration": 4.278623819351196 }, { "auxiliary_loss_clip": 0.01222583, "auxiliary_loss_mlp": 0.01060342, "balance_loss_clip": 1.03625011, "balance_loss_mlp": 1.05692661, "epoch": 0.06854050804148504, "flos": 40880994278400.0, "grad_norm": 2.565193758779041, "language_loss": 0.74347341, "learning_rate": 3.953960936061706e-06, "loss": 0.7663027, "num_input_tokens_seen": 24388665, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.3828125, "step": 1140, "time_per_iteration": 4.227624893188477 }, { "auxiliary_loss_clip": 0.01209665, "auxiliary_loss_mlp": 0.01309894, "balance_loss_clip": 1.03616214, "balance_loss_mlp": 1.05497992, "epoch": 0.06860063129415302, "flos": 31686354743040.0, "grad_norm": 2.268138207959344, "language_loss": 0.67831922, "learning_rate": 3.9538803008229845e-06, "loss": 0.70351481, "num_input_tokens_seen": 24407705, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.359375, "step": 1141, "time_per_iteration": 2.7127490043640137 }, { "auxiliary_loss_clip": 0.01206961, "auxiliary_loss_mlp": 0.01067441, "balance_loss_clip": 1.04258597, "balance_loss_mlp": 1.05769014, "epoch": 0.06866075454682098, "flos": 26542007775360.0, "grad_norm": 1.9299433005209392, "language_loss": 0.78947294, "learning_rate": 3.953799595855303e-06, "loss": 0.81221694, "num_input_tokens_seen": 24428390, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.40625, "step": 1142, "time_per_iteration": 4.242859601974487 }, { "auxiliary_loss_clip": 0.01214375, "auxiliary_loss_mlp": 0.01061266, "balance_loss_clip": 1.03781724, "balance_loss_mlp": 1.05799174, "epoch": 0.06872087779948895, "flos": 29789768724480.0, "grad_norm": 2.038316633766087, "language_loss": 0.68921161, "learning_rate": 3.953718821161539e-06, "loss": 0.711968, "num_input_tokens_seen": 24450810, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.3828125, "step": 1143, "time_per_iteration": 2.728062868118286 }, { "auxiliary_loss_clip": 0.01197107, "auxiliary_loss_mlp": 0.01055197, "balance_loss_clip": 1.03358459, "balance_loss_mlp": 1.0571698, "epoch": 0.06878100105215693, "flos": 26941118768640.0, "grad_norm": 1.6197767967086913, "language_loss": 0.74030674, "learning_rate": 3.953637976744576e-06, "loss": 0.76282978, "num_input_tokens_seen": 24469965, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.3046875, "step": 1144, "time_per_iteration": 2.680117607116699 }, { "auxiliary_loss_clip": 0.01206226, "auxiliary_loss_mlp": 0.01061301, "balance_loss_clip": 1.03681505, "balance_loss_mlp": 1.05531514, "epoch": 0.06884112430482489, "flos": 10670748946560.0, "grad_norm": 2.762162848308683, "language_loss": 0.91772652, "learning_rate": 3.953557062607299e-06, "loss": 0.94040185, "num_input_tokens_seen": 24486370, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.4140625, "step": 1145, "time_per_iteration": 2.6234371662139893 }, { "auxiliary_loss_clip": 0.01198861, "auxiliary_loss_mlp": 0.01062881, "balance_loss_clip": 1.03865767, "balance_loss_mlp": 1.05657637, "epoch": 0.06890124755749286, "flos": 20193647898240.0, "grad_norm": 2.2972545688735346, "language_loss": 0.82175708, "learning_rate": 3.953476078752595e-06, "loss": 0.84437448, "num_input_tokens_seen": 24503780, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.421875, "step": 1146, "time_per_iteration": 2.572157859802246 }, { "auxiliary_loss_clip": 0.01191148, "auxiliary_loss_mlp": 0.01060287, "balance_loss_clip": 1.03787553, "balance_loss_mlp": 1.05701816, "epoch": 0.06896137081016084, "flos": 20449224144000.0, "grad_norm": 2.1993627188608427, "language_loss": 0.85166579, "learning_rate": 3.953395025183355e-06, "loss": 0.87418008, "num_input_tokens_seen": 24522320, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.34375, "step": 1147, "time_per_iteration": 2.6394498348236084 }, { "auxiliary_loss_clip": 0.01228667, "auxiliary_loss_mlp": 0.01068983, "balance_loss_clip": 1.04522502, "balance_loss_mlp": 1.05455279, "epoch": 0.0690214940628288, "flos": 18368703555840.0, "grad_norm": 1.8632578406203129, "language_loss": 0.85716462, "learning_rate": 3.9533139019024715e-06, "loss": 0.88014114, "num_input_tokens_seen": 24540445, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.375, "step": 1148, "time_per_iteration": 2.606879711151123 }, { "auxiliary_loss_clip": 0.01211651, "auxiliary_loss_mlp": 0.01056378, "balance_loss_clip": 1.0334301, "balance_loss_mlp": 1.05636692, "epoch": 0.06908161731549677, "flos": 20558033418240.0, "grad_norm": 2.0302562716954577, "language_loss": 0.69995558, "learning_rate": 3.953232708912839e-06, "loss": 0.72263592, "num_input_tokens_seen": 24557105, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.3671875, "step": 1149, "time_per_iteration": 2.664736270904541 }, { "auxiliary_loss_clip": 0.01196262, "auxiliary_loss_mlp": 0.01055245, "balance_loss_clip": 1.03078365, "balance_loss_mlp": 1.05580282, "epoch": 0.06914174056816474, "flos": 27563666313600.0, "grad_norm": 1.951962906520407, "language_loss": 0.83799231, "learning_rate": 3.953151446217356e-06, "loss": 0.86050749, "num_input_tokens_seen": 24578240, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.40625, "step": 1150, "time_per_iteration": 2.588046073913574 }, { "auxiliary_loss_clip": 0.01216652, "auxiliary_loss_mlp": 0.01060276, "balance_loss_clip": 1.03637505, "balance_loss_mlp": 1.05876946, "epoch": 0.06920186382083271, "flos": 15304015249920.0, "grad_norm": 3.6590077920293993, "language_loss": 0.81201392, "learning_rate": 3.953070113818921e-06, "loss": 0.8347832, "num_input_tokens_seen": 24593585, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.3984375, "step": 1151, "time_per_iteration": 2.685276746749878 }, { "auxiliary_loss_clip": 0.01194097, "auxiliary_loss_mlp": 0.01059755, "balance_loss_clip": 1.0371058, "balance_loss_mlp": 1.05875158, "epoch": 0.06926198707350067, "flos": 25191227894400.0, "grad_norm": 3.037436893833616, "language_loss": 0.85523987, "learning_rate": 3.952988711720439e-06, "loss": 0.87777841, "num_input_tokens_seen": 24613110, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.3515625, "step": 1152, "time_per_iteration": 2.638411045074463 }, { "auxiliary_loss_clip": 0.01200458, "auxiliary_loss_mlp": 0.01056584, "balance_loss_clip": 1.03428054, "balance_loss_mlp": 1.05679178, "epoch": 0.06932211032616864, "flos": 13256137146240.0, "grad_norm": 2.1145781012296982, "language_loss": 0.90458989, "learning_rate": 3.952907239924813e-06, "loss": 0.92716026, "num_input_tokens_seen": 24628795, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.34375, "step": 1153, "time_per_iteration": 2.692986488342285 }, { "auxiliary_loss_clip": 0.01213092, "auxiliary_loss_mlp": 0.01055659, "balance_loss_clip": 1.03230619, "balance_loss_mlp": 1.05868411, "epoch": 0.06938223357883662, "flos": 24827381078400.0, "grad_norm": 2.264846209169028, "language_loss": 0.81136942, "learning_rate": 3.95282569843495e-06, "loss": 0.83405697, "num_input_tokens_seen": 24645480, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.359375, "step": 1154, "time_per_iteration": 2.591728687286377 }, { "auxiliary_loss_clip": 0.01213521, "auxiliary_loss_mlp": 0.01059575, "balance_loss_clip": 1.03682983, "balance_loss_mlp": 1.0606463, "epoch": 0.06944235683150458, "flos": 27267977554560.0, "grad_norm": 1.7810830539154374, "language_loss": 0.75291067, "learning_rate": 3.952744087253762e-06, "loss": 0.77564168, "num_input_tokens_seen": 24664630, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.34375, "step": 1155, "time_per_iteration": 2.729362964630127 }, { "auxiliary_loss_clip": 0.01196884, "auxiliary_loss_mlp": 0.01306535, "balance_loss_clip": 1.03575373, "balance_loss_mlp": 1.05283391, "epoch": 0.06950248008417255, "flos": 25808065176960.0, "grad_norm": 1.6709989547177497, "language_loss": 0.70525998, "learning_rate": 3.952662406384161e-06, "loss": 0.73029423, "num_input_tokens_seen": 24684210, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.3515625, "step": 1156, "time_per_iteration": 2.6966049671173096 }, { "auxiliary_loss_clip": 0.01212275, "auxiliary_loss_mlp": 0.01313996, "balance_loss_clip": 1.04126525, "balance_loss_mlp": 1.05744839, "epoch": 0.06956260333684053, "flos": 22271546793600.0, "grad_norm": 1.8224071975052956, "language_loss": 0.74502605, "learning_rate": 3.952580655829061e-06, "loss": 0.77028871, "num_input_tokens_seen": 24702490, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.3671875, "step": 1157, "time_per_iteration": 2.69474458694458 }, { "auxiliary_loss_clip": 0.01211106, "auxiliary_loss_mlp": 0.01052605, "balance_loss_clip": 1.02924037, "balance_loss_mlp": 1.05524182, "epoch": 0.0696227265895085, "flos": 29681390413440.0, "grad_norm": 2.037800552652823, "language_loss": 0.71382099, "learning_rate": 3.952498835591381e-06, "loss": 0.73645806, "num_input_tokens_seen": 24724340, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.375, "step": 1158, "time_per_iteration": 2.6660332679748535 }, { "auxiliary_loss_clip": 0.01194035, "auxiliary_loss_mlp": 0.01058587, "balance_loss_clip": 1.03525805, "balance_loss_mlp": 1.05567133, "epoch": 0.06968284984217646, "flos": 25523545547520.0, "grad_norm": 1.8120085630723142, "language_loss": 0.79686844, "learning_rate": 3.952416945674039e-06, "loss": 0.81939471, "num_input_tokens_seen": 24745550, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.3828125, "step": 1159, "time_per_iteration": 2.6390645503997803 }, { "auxiliary_loss_clip": 0.01199507, "auxiliary_loss_mlp": 0.01058923, "balance_loss_clip": 1.03294718, "balance_loss_mlp": 1.06075978, "epoch": 0.06974297309484444, "flos": 20698192287360.0, "grad_norm": 4.332438951688228, "language_loss": 0.80415851, "learning_rate": 3.952334986079957e-06, "loss": 0.82674277, "num_input_tokens_seen": 24762575, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.3828125, "step": 1160, "time_per_iteration": 2.553697347640991 }, { "auxiliary_loss_clip": 0.01199697, "auxiliary_loss_mlp": 0.01055028, "balance_loss_clip": 1.03122211, "balance_loss_mlp": 1.05256617, "epoch": 0.0698030963475124, "flos": 26505199313280.0, "grad_norm": 1.7438411142137649, "language_loss": 0.75702691, "learning_rate": 3.9522529568120635e-06, "loss": 0.77957416, "num_input_tokens_seen": 24782605, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.3828125, "step": 1161, "time_per_iteration": 2.6656150817871094 }, { "auxiliary_loss_clip": 0.01208707, "auxiliary_loss_mlp": 0.01058981, "balance_loss_clip": 1.03475809, "balance_loss_mlp": 1.05340075, "epoch": 0.06986321960018037, "flos": 23040430346880.0, "grad_norm": 1.9023344097634642, "language_loss": 0.82605487, "learning_rate": 3.952170857873283e-06, "loss": 0.84873176, "num_input_tokens_seen": 24802910, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.3671875, "step": 1162, "time_per_iteration": 2.6277756690979004 }, { "auxiliary_loss_clip": 0.01198102, "auxiliary_loss_mlp": 0.01051102, "balance_loss_clip": 1.02751088, "balance_loss_mlp": 1.05401433, "epoch": 0.06992334285284833, "flos": 28584822061440.0, "grad_norm": 2.0571272902043223, "language_loss": 0.79719782, "learning_rate": 3.952088689266547e-06, "loss": 0.81968981, "num_input_tokens_seen": 24823305, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.3515625, "step": 1163, "time_per_iteration": 2.6847147941589355 }, { "auxiliary_loss_clip": 0.01201437, "auxiliary_loss_mlp": 0.01055955, "balance_loss_clip": 1.0319699, "balance_loss_mlp": 1.05479836, "epoch": 0.06998346610551631, "flos": 20595344670720.0, "grad_norm": 2.0102940136123713, "language_loss": 0.79247928, "learning_rate": 3.952006450994786e-06, "loss": 0.81505311, "num_input_tokens_seen": 24842155, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.375, "step": 1164, "time_per_iteration": 2.6663906574249268 }, { "auxiliary_loss_clip": 0.01230016, "auxiliary_loss_mlp": 0.01061403, "balance_loss_clip": 1.03714359, "balance_loss_mlp": 1.05662918, "epoch": 0.07004358935818428, "flos": 22528810978560.0, "grad_norm": 1.4708711112895005, "language_loss": 0.72585595, "learning_rate": 3.951924143060937e-06, "loss": 0.74877012, "num_input_tokens_seen": 24862080, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.3671875, "step": 1165, "time_per_iteration": 2.721311569213867 }, { "auxiliary_loss_clip": 0.01210364, "auxiliary_loss_mlp": 0.01054994, "balance_loss_clip": 1.03221309, "balance_loss_mlp": 1.0567286, "epoch": 0.07010371261085224, "flos": 28949997680640.0, "grad_norm": 1.7218026362422512, "language_loss": 0.81036699, "learning_rate": 3.951841765467935e-06, "loss": 0.83302057, "num_input_tokens_seen": 24886165, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.359375, "step": 1166, "time_per_iteration": 2.821803569793701 }, { "auxiliary_loss_clip": 0.01207989, "auxiliary_loss_mlp": 0.01045651, "balance_loss_clip": 1.02154648, "balance_loss_mlp": 1.0542891, "epoch": 0.07016383586352022, "flos": 23659171050240.0, "grad_norm": 2.291593906180241, "language_loss": 0.84067726, "learning_rate": 3.951759318218722e-06, "loss": 0.86321366, "num_input_tokens_seen": 24905775, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.359375, "step": 1167, "time_per_iteration": 2.6635990142822266 }, { "auxiliary_loss_clip": 0.0120487, "auxiliary_loss_mlp": 0.01059909, "balance_loss_clip": 1.03580475, "balance_loss_mlp": 1.05754054, "epoch": 0.07022395911618819, "flos": 19792130693760.0, "grad_norm": 2.179294538676083, "language_loss": 0.89606839, "learning_rate": 3.951676801316239e-06, "loss": 0.91871619, "num_input_tokens_seen": 24924295, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.3828125, "step": 1168, "time_per_iteration": 2.732194185256958 }, { "auxiliary_loss_clip": 0.01217335, "auxiliary_loss_mlp": 0.01067318, "balance_loss_clip": 1.04025745, "balance_loss_mlp": 1.05710685, "epoch": 0.07028408236885615, "flos": 21689147675520.0, "grad_norm": 2.0824138992425536, "language_loss": 0.88593864, "learning_rate": 3.951594214763431e-06, "loss": 0.90878516, "num_input_tokens_seen": 24943210, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.421875, "step": 1169, "time_per_iteration": 2.615583896636963 }, { "auxiliary_loss_clip": 0.0119682, "auxiliary_loss_mlp": 0.01061638, "balance_loss_clip": 1.03715241, "balance_loss_mlp": 1.05866373, "epoch": 0.07034420562152413, "flos": 25630271832960.0, "grad_norm": 2.1963903750797784, "language_loss": 0.8408469, "learning_rate": 3.951511558563246e-06, "loss": 0.86343145, "num_input_tokens_seen": 24960360, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.3828125, "step": 1170, "time_per_iteration": 2.692652940750122 }, { "auxiliary_loss_clip": 0.01211082, "auxiliary_loss_mlp": 0.01311479, "balance_loss_clip": 1.03864312, "balance_loss_mlp": 1.05682492, "epoch": 0.0704043288741921, "flos": 20810449267200.0, "grad_norm": 2.012886971833418, "language_loss": 0.75441945, "learning_rate": 3.951428832718633e-06, "loss": 0.77964509, "num_input_tokens_seen": 24978290, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.359375, "step": 1171, "time_per_iteration": 2.641371250152588 }, { "auxiliary_loss_clip": 0.01210458, "auxiliary_loss_mlp": 0.01057078, "balance_loss_clip": 1.03312862, "balance_loss_mlp": 1.05700564, "epoch": 0.07046445212686006, "flos": 25593176062080.0, "grad_norm": 1.7278204424133135, "language_loss": 0.88790226, "learning_rate": 3.951346037232546e-06, "loss": 0.91057765, "num_input_tokens_seen": 24997055, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.3515625, "step": 1172, "time_per_iteration": 2.6977617740631104 }, { "auxiliary_loss_clip": 0.0121249, "auxiliary_loss_mlp": 0.01051429, "balance_loss_clip": 1.02690721, "balance_loss_mlp": 1.0545485, "epoch": 0.07052457537952803, "flos": 25556978131200.0, "grad_norm": 1.801424515736457, "language_loss": 0.81888741, "learning_rate": 3.951263172107937e-06, "loss": 0.84152657, "num_input_tokens_seen": 25017490, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.3984375, "step": 1173, "time_per_iteration": 2.649477005004883 }, { "auxiliary_loss_clip": 0.01206934, "auxiliary_loss_mlp": 0.01056576, "balance_loss_clip": 1.0322094, "balance_loss_mlp": 1.05932331, "epoch": 0.070584698632196, "flos": 17968515154560.0, "grad_norm": 2.670509635460703, "language_loss": 0.81958854, "learning_rate": 3.951180237347765e-06, "loss": 0.84222364, "num_input_tokens_seen": 25035660, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.3828125, "step": 1174, "time_per_iteration": 2.624563217163086 }, { "auxiliary_loss_clip": 0.01209716, "auxiliary_loss_mlp": 0.01059197, "balance_loss_clip": 1.03553367, "balance_loss_mlp": 1.05517566, "epoch": 0.07064482188486397, "flos": 25370888745600.0, "grad_norm": 4.772210801383018, "language_loss": 0.85284126, "learning_rate": 3.951097232954989e-06, "loss": 0.87553036, "num_input_tokens_seen": 25054785, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.3671875, "step": 1175, "time_per_iteration": 2.640491485595703 }, { "auxiliary_loss_clip": 0.01207104, "auxiliary_loss_mlp": 0.01069674, "balance_loss_clip": 1.04585648, "balance_loss_mlp": 1.06023979, "epoch": 0.07070494513753194, "flos": 24899848767360.0, "grad_norm": 1.9569132374582714, "language_loss": 0.83016336, "learning_rate": 3.951014158932572e-06, "loss": 0.85293114, "num_input_tokens_seen": 25075180, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.375, "step": 1176, "time_per_iteration": 2.7669177055358887 }, { "auxiliary_loss_clip": 0.01220218, "auxiliary_loss_mlp": 0.01061915, "balance_loss_clip": 1.03741741, "balance_loss_mlp": 1.05805182, "epoch": 0.07076506839019991, "flos": 22338447874560.0, "grad_norm": 2.298694785316557, "language_loss": 0.74170506, "learning_rate": 3.950931015283479e-06, "loss": 0.76452637, "num_input_tokens_seen": 25093035, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.34375, "step": 1177, "time_per_iteration": 2.6767494678497314 }, { "auxiliary_loss_clip": 0.01221131, "auxiliary_loss_mlp": 0.01059876, "balance_loss_clip": 1.03535426, "balance_loss_mlp": 1.06152177, "epoch": 0.07082519164286788, "flos": 18660800954880.0, "grad_norm": 1.8022328749549115, "language_loss": 0.85848349, "learning_rate": 3.950847802010675e-06, "loss": 0.88129354, "num_input_tokens_seen": 25112520, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.4140625, "step": 1178, "time_per_iteration": 2.6265292167663574 }, { "auxiliary_loss_clip": 0.01111287, "auxiliary_loss_mlp": 0.01020666, "balance_loss_clip": 1.01725638, "balance_loss_mlp": 1.03647029, "epoch": 0.07088531489553584, "flos": 63654107610240.0, "grad_norm": 0.8445018932438124, "language_loss": 0.63313234, "learning_rate": 3.950764519117132e-06, "loss": 0.65445185, "num_input_tokens_seen": 25177760, "router_z_loss_clip": 0.03417969, "router_z_loss_mlp": 0.56640625, "step": 1179, "time_per_iteration": 3.313377857208252 }, { "auxiliary_loss_clip": 0.01210291, "auxiliary_loss_mlp": 0.01063365, "balance_loss_clip": 1.03954697, "balance_loss_mlp": 1.06083727, "epoch": 0.07094543814820382, "flos": 21572688804480.0, "grad_norm": 2.2960995766783214, "language_loss": 0.83912629, "learning_rate": 3.9506811666058215e-06, "loss": 0.86186284, "num_input_tokens_seen": 25195260, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.40625, "step": 1180, "time_per_iteration": 5.5434184074401855 }, { "auxiliary_loss_clip": 0.0121257, "auxiliary_loss_mlp": 0.01068082, "balance_loss_clip": 1.04390645, "balance_loss_mlp": 1.05745912, "epoch": 0.07100556140087179, "flos": 22089946608000.0, "grad_norm": 2.738005399202653, "language_loss": 0.88156933, "learning_rate": 3.950597744479717e-06, "loss": 0.90437579, "num_input_tokens_seen": 25212740, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.359375, "step": 1181, "time_per_iteration": 4.033408164978027 }, { "auxiliary_loss_clip": 0.01216056, "auxiliary_loss_mlp": 0.01058338, "balance_loss_clip": 1.03490186, "balance_loss_mlp": 1.06126833, "epoch": 0.07106568465353975, "flos": 47922286400640.0, "grad_norm": 1.7469294229317593, "language_loss": 0.8337332, "learning_rate": 3.950514252741797e-06, "loss": 0.8564772, "num_input_tokens_seen": 25236420, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.359375, "step": 1182, "time_per_iteration": 2.842233419418335 }, { "auxiliary_loss_clip": 0.01202074, "auxiliary_loss_mlp": 0.01060162, "balance_loss_clip": 1.03499746, "balance_loss_mlp": 1.05930018, "epoch": 0.07112580790620772, "flos": 23440798316160.0, "grad_norm": 2.2106505074489986, "language_loss": 0.79473442, "learning_rate": 3.950430691395042e-06, "loss": 0.81735682, "num_input_tokens_seen": 25255120, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.3359375, "step": 1183, "time_per_iteration": 2.676072597503662 }, { "auxiliary_loss_clip": 0.01197426, "auxiliary_loss_mlp": 0.01058414, "balance_loss_clip": 1.03369021, "balance_loss_mlp": 1.05501473, "epoch": 0.0711859311588757, "flos": 31868888682240.0, "grad_norm": 2.022539730650615, "language_loss": 0.78623402, "learning_rate": 3.95034706044243e-06, "loss": 0.80879241, "num_input_tokens_seen": 25275150, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 1.421875, "step": 1184, "time_per_iteration": 4.229240894317627 }, { "auxiliary_loss_clip": 0.01228284, "auxiliary_loss_mlp": 0.01057925, "balance_loss_clip": 1.03399968, "balance_loss_mlp": 1.05756795, "epoch": 0.07124605441154366, "flos": 19610315026560.0, "grad_norm": 1.9498127574117154, "language_loss": 0.76724219, "learning_rate": 3.95026335988695e-06, "loss": 0.79010433, "num_input_tokens_seen": 25293680, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.34375, "step": 1185, "time_per_iteration": 2.7580831050872803 }, { "auxiliary_loss_clip": 0.01195501, "auxiliary_loss_mlp": 0.0106519, "balance_loss_clip": 1.04105043, "balance_loss_mlp": 1.05817437, "epoch": 0.07130617766421163, "flos": 14684448533760.0, "grad_norm": 2.0010810442556726, "language_loss": 0.65425193, "learning_rate": 3.950179589731587e-06, "loss": 0.67685878, "num_input_tokens_seen": 25310050, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.375, "step": 1186, "time_per_iteration": 2.6600871086120605 }, { "auxiliary_loss_clip": 0.01201162, "auxiliary_loss_mlp": 0.01055156, "balance_loss_clip": 1.03146863, "balance_loss_mlp": 1.05714321, "epoch": 0.07136630091687961, "flos": 26067915141120.0, "grad_norm": 2.447732493068023, "language_loss": 0.69403481, "learning_rate": 3.950095749979331e-06, "loss": 0.71659797, "num_input_tokens_seen": 25331020, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.3515625, "step": 1187, "time_per_iteration": 2.640268087387085 }, { "auxiliary_loss_clip": 0.01217105, "auxiliary_loss_mlp": 0.01056601, "balance_loss_clip": 1.03354597, "balance_loss_mlp": 1.05712366, "epoch": 0.07142642416954757, "flos": 15669190869120.0, "grad_norm": 2.397350155476342, "language_loss": 0.78559381, "learning_rate": 3.950011840633174e-06, "loss": 0.80833083, "num_input_tokens_seen": 25347875, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.328125, "step": 1188, "time_per_iteration": 2.68271541595459 }, { "auxiliary_loss_clip": 0.01191151, "auxiliary_loss_mlp": 0.01062201, "balance_loss_clip": 1.03753662, "balance_loss_mlp": 1.05756021, "epoch": 0.07148654742221554, "flos": 19755322231680.0, "grad_norm": 2.8428383342877783, "language_loss": 0.84822094, "learning_rate": 3.9499278616961106e-06, "loss": 0.87075442, "num_input_tokens_seen": 25366715, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.3359375, "step": 1189, "time_per_iteration": 2.640225887298584 }, { "auxiliary_loss_clip": 0.01217642, "auxiliary_loss_mlp": 0.01306783, "balance_loss_clip": 1.03235757, "balance_loss_mlp": 1.05635691, "epoch": 0.07154667067488352, "flos": 23471824688640.0, "grad_norm": 1.704688761485259, "language_loss": 0.7667405, "learning_rate": 3.949843813171137e-06, "loss": 0.79198474, "num_input_tokens_seen": 25385450, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.3359375, "step": 1190, "time_per_iteration": 2.770620107650757 }, { "auxiliary_loss_clip": 0.01194824, "auxiliary_loss_mlp": 0.01066072, "balance_loss_clip": 1.0398581, "balance_loss_mlp": 1.05730152, "epoch": 0.07160679392755148, "flos": 18332936588160.0, "grad_norm": 2.807729780779413, "language_loss": 0.75507528, "learning_rate": 3.949759695061254e-06, "loss": 0.77768421, "num_input_tokens_seen": 25403940, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.375, "step": 1191, "time_per_iteration": 2.574082851409912 }, { "auxiliary_loss_clip": 0.01220312, "auxiliary_loss_mlp": 0.010561, "balance_loss_clip": 1.03130472, "balance_loss_mlp": 1.05628538, "epoch": 0.07166691718021945, "flos": 17747017937280.0, "grad_norm": 2.4663862783328945, "language_loss": 0.74123502, "learning_rate": 3.949675507369463e-06, "loss": 0.7639991, "num_input_tokens_seen": 25420410, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.3671875, "step": 1192, "time_per_iteration": 2.702052354812622 }, { "auxiliary_loss_clip": 0.01190563, "auxiliary_loss_mlp": 0.01054324, "balance_loss_clip": 1.03138793, "balance_loss_mlp": 1.05485868, "epoch": 0.07172704043288743, "flos": 22451925916800.0, "grad_norm": 3.2044262217274566, "language_loss": 0.77948976, "learning_rate": 3.949591250098768e-06, "loss": 0.80193865, "num_input_tokens_seen": 25439415, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.359375, "step": 1193, "time_per_iteration": 2.593231439590454 }, { "auxiliary_loss_clip": 0.01217266, "auxiliary_loss_mlp": 0.01055169, "balance_loss_clip": 1.030195, "balance_loss_mlp": 1.06170559, "epoch": 0.07178716368555539, "flos": 23222210100480.0, "grad_norm": 1.8578557379395624, "language_loss": 0.86018658, "learning_rate": 3.949506923252175e-06, "loss": 0.88291097, "num_input_tokens_seen": 25458715, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.375, "step": 1194, "time_per_iteration": 2.5910630226135254 }, { "auxiliary_loss_clip": 0.01200568, "auxiliary_loss_mlp": 0.01061625, "balance_loss_clip": 1.03733039, "balance_loss_mlp": 1.05668759, "epoch": 0.07184728693822336, "flos": 25150828072320.0, "grad_norm": 1.9401468027038664, "language_loss": 0.8142271, "learning_rate": 3.9494225268326965e-06, "loss": 0.83684909, "num_input_tokens_seen": 25477985, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.34375, "step": 1195, "time_per_iteration": 2.5818376541137695 }, { "auxiliary_loss_clip": 0.01209171, "auxiliary_loss_mlp": 0.01052583, "balance_loss_clip": 1.030756, "balance_loss_mlp": 1.05816555, "epoch": 0.07190741019089132, "flos": 22711237176960.0, "grad_norm": 2.133896177875925, "language_loss": 0.79705393, "learning_rate": 3.949338060843342e-06, "loss": 0.81967151, "num_input_tokens_seen": 25497110, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.328125, "step": 1196, "time_per_iteration": 2.5613226890563965 }, { "auxiliary_loss_clip": 0.01218397, "auxiliary_loss_mlp": 0.01312994, "balance_loss_clip": 1.0388732, "balance_loss_mlp": 1.05797887, "epoch": 0.0719675334435593, "flos": 29349791032320.0, "grad_norm": 2.207725863963237, "language_loss": 0.70840234, "learning_rate": 3.949253525287126e-06, "loss": 0.73371625, "num_input_tokens_seen": 25516555, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.328125, "step": 1197, "time_per_iteration": 2.718334674835205 }, { "auxiliary_loss_clip": 0.01228441, "auxiliary_loss_mlp": 0.01052438, "balance_loss_clip": 1.02826238, "balance_loss_mlp": 1.05824435, "epoch": 0.07202765669622727, "flos": 17639788861440.0, "grad_norm": 2.507219390090115, "language_loss": 0.85710663, "learning_rate": 3.9491689201670655e-06, "loss": 0.87991542, "num_input_tokens_seen": 25533895, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.3359375, "step": 1198, "time_per_iteration": 2.5844147205352783 }, { "auxiliary_loss_clip": 0.01229918, "auxiliary_loss_mlp": 0.01062288, "balance_loss_clip": 1.03786206, "balance_loss_mlp": 1.05983901, "epoch": 0.07208777994889523, "flos": 21434038306560.0, "grad_norm": 2.4783175152238184, "language_loss": 0.83891374, "learning_rate": 3.94908424548618e-06, "loss": 0.86183578, "num_input_tokens_seen": 25554195, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.3359375, "step": 1199, "time_per_iteration": 2.7742886543273926 }, { "auxiliary_loss_clip": 0.01198985, "auxiliary_loss_mlp": 0.0105617, "balance_loss_clip": 1.03340101, "balance_loss_mlp": 1.06359851, "epoch": 0.07214790320156321, "flos": 26940867373440.0, "grad_norm": 2.0494969887592167, "language_loss": 0.76244581, "learning_rate": 3.9489995012474924e-06, "loss": 0.78499734, "num_input_tokens_seen": 25574155, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.3515625, "step": 1200, "time_per_iteration": 2.7393136024475098 }, { "auxiliary_loss_clip": 0.012019, "auxiliary_loss_mlp": 0.01063997, "balance_loss_clip": 1.03983307, "balance_loss_mlp": 1.05897236, "epoch": 0.07220802645423118, "flos": 23879949995520.0, "grad_norm": 2.25064777045305, "language_loss": 0.82478595, "learning_rate": 3.948914687454027e-06, "loss": 0.84744483, "num_input_tokens_seen": 25592735, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.3359375, "step": 1201, "time_per_iteration": 2.7378361225128174 }, { "auxiliary_loss_clip": 0.01205016, "auxiliary_loss_mlp": 0.01058986, "balance_loss_clip": 1.03291452, "balance_loss_mlp": 1.05854344, "epoch": 0.07226814970689914, "flos": 19243631036160.0, "grad_norm": 2.496089768811936, "language_loss": 0.68518686, "learning_rate": 3.948829804108807e-06, "loss": 0.70782685, "num_input_tokens_seen": 25611510, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.375, "step": 1202, "time_per_iteration": 2.574061155319214 }, { "auxiliary_loss_clip": 0.01214432, "auxiliary_loss_mlp": 0.01311716, "balance_loss_clip": 1.03648269, "balance_loss_mlp": 1.06097889, "epoch": 0.07232827295956712, "flos": 19172025273600.0, "grad_norm": 2.0915711967638964, "language_loss": 0.87599504, "learning_rate": 3.948744851214865e-06, "loss": 0.90125656, "num_input_tokens_seen": 25629560, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.34375, "step": 1203, "time_per_iteration": 2.6324269771575928 }, { "auxiliary_loss_clip": 0.01233186, "auxiliary_loss_mlp": 0.01063908, "balance_loss_clip": 1.04001796, "balance_loss_mlp": 1.05801725, "epoch": 0.07238839621223508, "flos": 17639752947840.0, "grad_norm": 2.408103583862151, "language_loss": 0.78056467, "learning_rate": 3.948659828775233e-06, "loss": 0.80353558, "num_input_tokens_seen": 25648330, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.390625, "step": 1204, "time_per_iteration": 2.585773229598999 }, { "auxiliary_loss_clip": 0.01204853, "auxiliary_loss_mlp": 0.01062286, "balance_loss_clip": 1.03834832, "balance_loss_mlp": 1.06053305, "epoch": 0.07244851946490305, "flos": 28292401440000.0, "grad_norm": 1.6286649838144107, "language_loss": 0.82127559, "learning_rate": 3.9485747367929436e-06, "loss": 0.84394693, "num_input_tokens_seen": 25669470, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.3515625, "step": 1205, "time_per_iteration": 2.6406407356262207 }, { "auxiliary_loss_clip": 0.01218034, "auxiliary_loss_mlp": 0.01315412, "balance_loss_clip": 1.04156458, "balance_loss_mlp": 1.06278443, "epoch": 0.07250864271757101, "flos": 22564829341440.0, "grad_norm": 1.6658579399150195, "language_loss": 0.76742202, "learning_rate": 3.948489575271035e-06, "loss": 0.79275644, "num_input_tokens_seen": 25690470, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.3671875, "step": 1206, "time_per_iteration": 2.5995872020721436 }, { "auxiliary_loss_clip": 0.01204349, "auxiliary_loss_mlp": 0.01058578, "balance_loss_clip": 1.03442574, "balance_loss_mlp": 1.06174099, "epoch": 0.072568765970239, "flos": 21762405463680.0, "grad_norm": 2.128176560409858, "language_loss": 0.77352154, "learning_rate": 3.948404344212544e-06, "loss": 0.7961508, "num_input_tokens_seen": 25709205, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.328125, "step": 1207, "time_per_iteration": 2.671420097351074 }, { "auxiliary_loss_clip": 0.01204289, "auxiliary_loss_mlp": 0.01055507, "balance_loss_clip": 1.03222561, "balance_loss_mlp": 1.06194997, "epoch": 0.07262888922290696, "flos": 25519702792320.0, "grad_norm": 2.2073079296172775, "language_loss": 0.80005765, "learning_rate": 3.948319043620516e-06, "loss": 0.82265556, "num_input_tokens_seen": 25728485, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.328125, "step": 1208, "time_per_iteration": 2.6569392681121826 }, { "auxiliary_loss_clip": 0.01201409, "auxiliary_loss_mlp": 0.01050549, "balance_loss_clip": 1.02829266, "balance_loss_mlp": 1.05990458, "epoch": 0.07268901247557492, "flos": 21246548290560.0, "grad_norm": 2.666350395142354, "language_loss": 0.78428388, "learning_rate": 3.948233673497991e-06, "loss": 0.80680341, "num_input_tokens_seen": 25747730, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.3203125, "step": 1209, "time_per_iteration": 2.610685348510742 }, { "auxiliary_loss_clip": 0.0120554, "auxiliary_loss_mlp": 0.01063346, "balance_loss_clip": 1.04019594, "balance_loss_mlp": 1.06174088, "epoch": 0.0727491357282429, "flos": 25479302970240.0, "grad_norm": 2.285655254504253, "language_loss": 0.81046867, "learning_rate": 3.948148233848018e-06, "loss": 0.83315754, "num_input_tokens_seen": 25768050, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.34375, "step": 1210, "time_per_iteration": 2.7201812267303467 }, { "auxiliary_loss_clip": 0.01210535, "auxiliary_loss_mlp": 0.01060413, "balance_loss_clip": 1.03667796, "balance_loss_mlp": 1.06182683, "epoch": 0.07280925898091087, "flos": 24462169545600.0, "grad_norm": 1.8223856853345588, "language_loss": 0.84376371, "learning_rate": 3.948062724673646e-06, "loss": 0.8664732, "num_input_tokens_seen": 25787985, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.3046875, "step": 1211, "time_per_iteration": 2.6408066749572754 }, { "auxiliary_loss_clip": 0.01207373, "auxiliary_loss_mlp": 0.01053496, "balance_loss_clip": 1.0311799, "balance_loss_mlp": 1.05738425, "epoch": 0.07286938223357883, "flos": 18288191220480.0, "grad_norm": 2.5596691532005025, "language_loss": 0.907471, "learning_rate": 3.947977145977927e-06, "loss": 0.9300797, "num_input_tokens_seen": 25803620, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.3125, "step": 1212, "time_per_iteration": 2.6453850269317627 }, { "auxiliary_loss_clip": 0.01190066, "auxiliary_loss_mlp": 0.01055413, "balance_loss_clip": 1.03228629, "balance_loss_mlp": 1.05889249, "epoch": 0.07292950548624681, "flos": 21214803646080.0, "grad_norm": 1.7670606366296333, "language_loss": 0.72065443, "learning_rate": 3.947891497763914e-06, "loss": 0.74310923, "num_input_tokens_seen": 25823315, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.3125, "step": 1213, "time_per_iteration": 2.678762674331665 }, { "auxiliary_loss_clip": 0.01227723, "auxiliary_loss_mlp": 0.01050121, "balance_loss_clip": 1.02707779, "balance_loss_mlp": 1.05678856, "epoch": 0.07298962873891478, "flos": 24642009964800.0, "grad_norm": 2.14414369671807, "language_loss": 0.8354972, "learning_rate": 3.947805780034664e-06, "loss": 0.85827565, "num_input_tokens_seen": 25842605, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.34375, "step": 1214, "time_per_iteration": 2.732168197631836 }, { "auxiliary_loss_clip": 0.01197587, "auxiliary_loss_mlp": 0.01057141, "balance_loss_clip": 1.03356147, "balance_loss_mlp": 1.06123924, "epoch": 0.07304975199158274, "flos": 27052765217280.0, "grad_norm": 2.9510469373313004, "language_loss": 0.83820134, "learning_rate": 3.947719992793236e-06, "loss": 0.86074865, "num_input_tokens_seen": 25863030, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.359375, "step": 1215, "time_per_iteration": 2.6307833194732666 }, { "auxiliary_loss_clip": 0.01220031, "auxiliary_loss_mlp": 0.0105605, "balance_loss_clip": 1.03254199, "balance_loss_mlp": 1.05893445, "epoch": 0.07310987524425071, "flos": 33549544091520.0, "grad_norm": 1.8141181410174412, "language_loss": 0.80960071, "learning_rate": 3.9476341360426924e-06, "loss": 0.83236152, "num_input_tokens_seen": 25888015, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.328125, "step": 1216, "time_per_iteration": 2.8401248455047607 }, { "auxiliary_loss_clip": 0.0121303, "auxiliary_loss_mlp": 0.01056771, "balance_loss_clip": 1.03483677, "balance_loss_mlp": 1.06133008, "epoch": 0.07316999849691869, "flos": 28110944908800.0, "grad_norm": 2.3778467302544244, "language_loss": 0.75677848, "learning_rate": 3.9475482097860955e-06, "loss": 0.77947652, "num_input_tokens_seen": 25908660, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.328125, "step": 1217, "time_per_iteration": 2.631478786468506 }, { "auxiliary_loss_clip": 0.01218633, "auxiliary_loss_mlp": 0.01053689, "balance_loss_clip": 1.03322053, "balance_loss_mlp": 1.05999291, "epoch": 0.07323012174958665, "flos": 14392602529920.0, "grad_norm": 1.9193453508087694, "language_loss": 0.86245602, "learning_rate": 3.947462214026512e-06, "loss": 0.88517928, "num_input_tokens_seen": 25927215, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.3125, "step": 1218, "time_per_iteration": 2.6500139236450195 }, { "auxiliary_loss_clip": 0.0120137, "auxiliary_loss_mlp": 0.01057028, "balance_loss_clip": 1.0337584, "balance_loss_mlp": 1.05756164, "epoch": 0.07329024500225462, "flos": 21616428591360.0, "grad_norm": 2.1496130719992217, "language_loss": 0.86546075, "learning_rate": 3.947376148767013e-06, "loss": 0.88804471, "num_input_tokens_seen": 25945500, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.34375, "step": 1219, "time_per_iteration": 2.5839202404022217 }, { "auxiliary_loss_clip": 0.01189406, "auxiliary_loss_mlp": 0.0105661, "balance_loss_clip": 1.03475893, "balance_loss_mlp": 1.05676818, "epoch": 0.0733503682549226, "flos": 13224141106560.0, "grad_norm": 2.6166628747720067, "language_loss": 0.84476626, "learning_rate": 3.947290014010668e-06, "loss": 0.86722642, "num_input_tokens_seen": 25963105, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.328125, "step": 1220, "time_per_iteration": 2.643087863922119 }, { "auxiliary_loss_clip": 0.01219361, "auxiliary_loss_mlp": 0.01065109, "balance_loss_clip": 1.04251885, "balance_loss_mlp": 1.05866504, "epoch": 0.07341049150759056, "flos": 20886975192960.0, "grad_norm": 2.9831294831931814, "language_loss": 0.76860571, "learning_rate": 3.9472038097605516e-06, "loss": 0.79145038, "num_input_tokens_seen": 25981690, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.328125, "step": 1221, "time_per_iteration": 2.659343957901001 }, { "auxiliary_loss_clip": 0.01231397, "auxiliary_loss_mlp": 0.01057003, "balance_loss_clip": 1.0351162, "balance_loss_mlp": 1.06081915, "epoch": 0.07347061476025853, "flos": 15413614623360.0, "grad_norm": 2.0183845557174354, "language_loss": 0.91523671, "learning_rate": 3.94711753601974e-06, "loss": 0.93812078, "num_input_tokens_seen": 25999890, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.34375, "step": 1222, "time_per_iteration": 5.576252460479736 }, { "auxiliary_loss_clip": 0.01241499, "auxiliary_loss_mlp": 0.01054179, "balance_loss_clip": 1.03244722, "balance_loss_mlp": 1.06040156, "epoch": 0.0735307380129265, "flos": 11108859131520.0, "grad_norm": 2.4618385555546154, "language_loss": 0.9067927, "learning_rate": 3.947031192791312e-06, "loss": 0.92974943, "num_input_tokens_seen": 26016445, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.3515625, "step": 1223, "time_per_iteration": 4.037343263626099 }, { "auxiliary_loss_clip": 0.01191811, "auxiliary_loss_mlp": 0.01067144, "balance_loss_clip": 1.04479241, "balance_loss_mlp": 1.05884039, "epoch": 0.07359086126559447, "flos": 23732392924800.0, "grad_norm": 1.9299013847746542, "language_loss": 0.81914759, "learning_rate": 3.9469447800783485e-06, "loss": 0.84173715, "num_input_tokens_seen": 26036080, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.328125, "step": 1224, "time_per_iteration": 2.6792075634002686 }, { "auxiliary_loss_clip": 0.01190214, "auxiliary_loss_mlp": 0.01054083, "balance_loss_clip": 1.03000236, "balance_loss_mlp": 1.05718994, "epoch": 0.07365098451826244, "flos": 20993270515200.0, "grad_norm": 2.8188073943582412, "language_loss": 0.83058703, "learning_rate": 3.946858297883935e-06, "loss": 0.85302997, "num_input_tokens_seen": 26055805, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.328125, "step": 1225, "time_per_iteration": 2.560964345932007 }, { "auxiliary_loss_clip": 0.01218928, "auxiliary_loss_mlp": 0.01052129, "balance_loss_clip": 1.02953935, "balance_loss_mlp": 1.05747271, "epoch": 0.0737111077709304, "flos": 19933582452480.0, "grad_norm": 2.064038245740939, "language_loss": 0.90136051, "learning_rate": 3.946771746211156e-06, "loss": 0.92407107, "num_input_tokens_seen": 26073905, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 1.34375, "step": 1226, "time_per_iteration": 4.855942487716675 }, { "auxiliary_loss_clip": 0.01198561, "auxiliary_loss_mlp": 0.01044388, "balance_loss_clip": 1.02161908, "balance_loss_mlp": 1.05633569, "epoch": 0.07377123102359838, "flos": 16581537342720.0, "grad_norm": 2.706702133115434, "language_loss": 0.75925511, "learning_rate": 3.946685125063101e-06, "loss": 0.78168464, "num_input_tokens_seen": 26091700, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.421875, "step": 1227, "time_per_iteration": 2.6650028228759766 }, { "auxiliary_loss_clip": 0.01198129, "auxiliary_loss_mlp": 0.0105091, "balance_loss_clip": 1.02859366, "balance_loss_mlp": 1.05980802, "epoch": 0.07383135427626634, "flos": 28328563457280.0, "grad_norm": 1.506368196491225, "language_loss": 0.8532725, "learning_rate": 3.9465984344428615e-06, "loss": 0.87576294, "num_input_tokens_seen": 26114105, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.296875, "step": 1228, "time_per_iteration": 2.6593034267425537 }, { "auxiliary_loss_clip": 0.01201181, "auxiliary_loss_mlp": 0.0105511, "balance_loss_clip": 1.03281772, "balance_loss_mlp": 1.05684066, "epoch": 0.07389147752893431, "flos": 20047168235520.0, "grad_norm": 2.0280635060236643, "language_loss": 0.801561, "learning_rate": 3.946511674353531e-06, "loss": 0.82412386, "num_input_tokens_seen": 26131165, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.3515625, "step": 1229, "time_per_iteration": 2.7054686546325684 }, { "auxiliary_loss_clip": 0.01204408, "auxiliary_loss_mlp": 0.01306943, "balance_loss_clip": 1.03514683, "balance_loss_mlp": 1.05992603, "epoch": 0.07395160078160229, "flos": 18114132890880.0, "grad_norm": 2.129126098373719, "language_loss": 0.77674425, "learning_rate": 3.9464248447982065e-06, "loss": 0.80185777, "num_input_tokens_seen": 26150040, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.359375, "step": 1230, "time_per_iteration": 2.571192502975464 }, { "auxiliary_loss_clip": 0.01191466, "auxiliary_loss_mlp": 0.01048898, "balance_loss_clip": 1.02553272, "balance_loss_mlp": 1.05942035, "epoch": 0.07401172403427025, "flos": 23586918842880.0, "grad_norm": 2.143323905945817, "language_loss": 0.81214774, "learning_rate": 3.946337945779986e-06, "loss": 0.83455139, "num_input_tokens_seen": 26169380, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.3203125, "step": 1231, "time_per_iteration": 2.6770431995391846 }, { "auxiliary_loss_clip": 0.01213465, "auxiliary_loss_mlp": 0.01065667, "balance_loss_clip": 1.04230165, "balance_loss_mlp": 1.05745101, "epoch": 0.07407184728693822, "flos": 26359904799360.0, "grad_norm": 2.275834868319218, "language_loss": 0.95135844, "learning_rate": 3.94625097730197e-06, "loss": 0.97414976, "num_input_tokens_seen": 26189420, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.375, "step": 1232, "time_per_iteration": 2.669807195663452 }, { "auxiliary_loss_clip": 0.01200968, "auxiliary_loss_mlp": 0.01059348, "balance_loss_clip": 1.03787816, "balance_loss_mlp": 1.05837882, "epoch": 0.0741319705396062, "flos": 22200443821440.0, "grad_norm": 1.7391583006116407, "language_loss": 0.80391294, "learning_rate": 3.946163939367264e-06, "loss": 0.82651615, "num_input_tokens_seen": 26209300, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.3359375, "step": 1233, "time_per_iteration": 2.7147583961486816 }, { "auxiliary_loss_clip": 0.01196235, "auxiliary_loss_mlp": 0.01064792, "balance_loss_clip": 1.03957963, "balance_loss_mlp": 1.05721784, "epoch": 0.07419209379227416, "flos": 39200482523520.0, "grad_norm": 2.512331413036233, "language_loss": 0.7042197, "learning_rate": 3.9460768319789724e-06, "loss": 0.72683001, "num_input_tokens_seen": 26228110, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.390625, "step": 1234, "time_per_iteration": 2.785808801651001 }, { "auxiliary_loss_clip": 0.01209795, "auxiliary_loss_mlp": 0.01057127, "balance_loss_clip": 1.03410769, "balance_loss_mlp": 1.05926728, "epoch": 0.07425221704494213, "flos": 22781657790720.0, "grad_norm": 1.8320059091480831, "language_loss": 0.76877213, "learning_rate": 3.945989655140205e-06, "loss": 0.79144138, "num_input_tokens_seen": 26247020, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.3203125, "step": 1235, "time_per_iteration": 2.6198744773864746 }, { "auxiliary_loss_clip": 0.01189339, "auxiliary_loss_mlp": 0.01053144, "balance_loss_clip": 1.03093553, "balance_loss_mlp": 1.05778062, "epoch": 0.0743123402976101, "flos": 22272983337600.0, "grad_norm": 1.8598179547444236, "language_loss": 0.82635492, "learning_rate": 3.945902408854073e-06, "loss": 0.8487798, "num_input_tokens_seen": 26265750, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.3125, "step": 1236, "time_per_iteration": 2.673236846923828 }, { "auxiliary_loss_clip": 0.01221241, "auxiliary_loss_mlp": 0.01303563, "balance_loss_clip": 1.03077543, "balance_loss_mlp": 1.05748868, "epoch": 0.07437246355027807, "flos": 29315029645440.0, "grad_norm": 1.5862034783811774, "language_loss": 0.75415289, "learning_rate": 3.945815093123688e-06, "loss": 0.77940089, "num_input_tokens_seen": 26287905, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.359375, "step": 1237, "time_per_iteration": 2.6571221351623535 }, { "auxiliary_loss_clip": 0.01227507, "auxiliary_loss_mlp": 0.01312466, "balance_loss_clip": 1.04099929, "balance_loss_mlp": 1.05613816, "epoch": 0.07443258680294604, "flos": 31944732249600.0, "grad_norm": 1.687328990961364, "language_loss": 0.77438569, "learning_rate": 3.945727707952168e-06, "loss": 0.79978538, "num_input_tokens_seen": 26311795, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.34375, "step": 1238, "time_per_iteration": 2.7952632904052734 }, { "auxiliary_loss_clip": 0.01215251, "auxiliary_loss_mlp": 0.01056138, "balance_loss_clip": 1.03296328, "balance_loss_mlp": 1.05889094, "epoch": 0.074492710055614, "flos": 22675290641280.0, "grad_norm": 1.9545496982326247, "language_loss": 0.86326599, "learning_rate": 3.945640253342632e-06, "loss": 0.88597989, "num_input_tokens_seen": 26330330, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.3828125, "step": 1239, "time_per_iteration": 2.636054277420044 }, { "auxiliary_loss_clip": 0.01220547, "auxiliary_loss_mlp": 0.01049202, "balance_loss_clip": 1.02450228, "balance_loss_mlp": 1.05808163, "epoch": 0.07455283330828198, "flos": 21284901037440.0, "grad_norm": 1.780204446276732, "language_loss": 0.88852811, "learning_rate": 3.9455527292981996e-06, "loss": 0.91122562, "num_input_tokens_seen": 26348865, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 1.3515625, "step": 1240, "time_per_iteration": 2.7130420207977295 }, { "auxiliary_loss_clip": 0.01213784, "auxiliary_loss_mlp": 0.01059892, "balance_loss_clip": 1.03546619, "balance_loss_mlp": 1.06015003, "epoch": 0.07461295656094995, "flos": 24388408967040.0, "grad_norm": 1.9787915753914491, "language_loss": 0.89044005, "learning_rate": 3.945465135821995e-06, "loss": 0.91317683, "num_input_tokens_seen": 26368210, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.3515625, "step": 1241, "time_per_iteration": 2.6412999629974365 }, { "auxiliary_loss_clip": 0.01104067, "auxiliary_loss_mlp": 0.0101259, "balance_loss_clip": 1.00808394, "balance_loss_mlp": 1.0315516, "epoch": 0.07467307981361791, "flos": 62109660574080.0, "grad_norm": 0.8912015301199451, "language_loss": 0.630808, "learning_rate": 3.9453774729171435e-06, "loss": 0.65197456, "num_input_tokens_seen": 26424890, "router_z_loss_clip": 0.04516602, "router_z_loss_mlp": 0.54296875, "step": 1242, "time_per_iteration": 3.285858631134033 }, { "auxiliary_loss_clip": 0.01232978, "auxiliary_loss_mlp": 0.01060355, "balance_loss_clip": 1.03520203, "balance_loss_mlp": 1.0600574, "epoch": 0.07473320306628589, "flos": 24863148046080.0, "grad_norm": 2.1187505336133357, "language_loss": 0.63079202, "learning_rate": 3.945289740586775e-06, "loss": 0.65372533, "num_input_tokens_seen": 26446405, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.453125, "step": 1243, "time_per_iteration": 2.6683144569396973 }, { "auxiliary_loss_clip": 0.01201579, "auxiliary_loss_mlp": 0.01056283, "balance_loss_clip": 1.03284669, "balance_loss_mlp": 1.05918849, "epoch": 0.07479332631895386, "flos": 24897442556160.0, "grad_norm": 2.5600191651046504, "language_loss": 0.76306033, "learning_rate": 3.945201938834018e-06, "loss": 0.78563893, "num_input_tokens_seen": 26466070, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.328125, "step": 1244, "time_per_iteration": 2.737478733062744 }, { "auxiliary_loss_clip": 0.01216201, "auxiliary_loss_mlp": 0.01308908, "balance_loss_clip": 1.03635979, "balance_loss_mlp": 1.06056488, "epoch": 0.07485344957162182, "flos": 17815247821440.0, "grad_norm": 3.1601741664056355, "language_loss": 0.69028938, "learning_rate": 3.945114067662009e-06, "loss": 0.71554047, "num_input_tokens_seen": 26479350, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.375, "step": 1245, "time_per_iteration": 2.6679739952087402 }, { "auxiliary_loss_clip": 0.0119399, "auxiliary_loss_mlp": 0.01055483, "balance_loss_clip": 1.03104496, "balance_loss_mlp": 1.05937803, "epoch": 0.0749135728242898, "flos": 25010202326400.0, "grad_norm": 1.9792909390775035, "language_loss": 0.88368368, "learning_rate": 3.9450261270738815e-06, "loss": 0.90617836, "num_input_tokens_seen": 26498255, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.34375, "step": 1246, "time_per_iteration": 2.7986955642700195 }, { "auxiliary_loss_clip": 0.01204078, "auxiliary_loss_mlp": 0.01069678, "balance_loss_clip": 1.04336858, "balance_loss_mlp": 1.05910242, "epoch": 0.07497369607695777, "flos": 17822071405440.0, "grad_norm": 2.241708744462668, "language_loss": 0.88098693, "learning_rate": 3.944938117072776e-06, "loss": 0.90372455, "num_input_tokens_seen": 26515375, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.453125, "step": 1247, "time_per_iteration": 2.656874179840088 }, { "auxiliary_loss_clip": 0.01212017, "auxiliary_loss_mlp": 0.01066529, "balance_loss_clip": 1.04227018, "balance_loss_mlp": 1.05910742, "epoch": 0.07503381932962573, "flos": 15121086261120.0, "grad_norm": 2.6946144705355217, "language_loss": 0.64406019, "learning_rate": 3.944850037661831e-06, "loss": 0.66684568, "num_input_tokens_seen": 26533595, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.34375, "step": 1248, "time_per_iteration": 2.56937837600708 }, { "auxiliary_loss_clip": 0.01201559, "auxiliary_loss_mlp": 0.01309938, "balance_loss_clip": 1.03968573, "balance_loss_mlp": 1.06046414, "epoch": 0.0750939425822937, "flos": 12816734071680.0, "grad_norm": 1.933633812263414, "language_loss": 0.74172771, "learning_rate": 3.944761888844191e-06, "loss": 0.76684266, "num_input_tokens_seen": 26549405, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.3203125, "step": 1249, "time_per_iteration": 2.6613364219665527 }, { "auxiliary_loss_clip": 0.01217063, "auxiliary_loss_mlp": 0.01072658, "balance_loss_clip": 1.04810119, "balance_loss_mlp": 1.06056464, "epoch": 0.07515406583496168, "flos": 24206844695040.0, "grad_norm": 2.450837458431254, "language_loss": 0.82168698, "learning_rate": 3.944673670623001e-06, "loss": 0.84458423, "num_input_tokens_seen": 26567200, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.3828125, "step": 1250, "time_per_iteration": 2.6646368503570557 }, { "auxiliary_loss_clip": 0.0123379, "auxiliary_loss_mlp": 0.01063683, "balance_loss_clip": 1.0401988, "balance_loss_mlp": 1.06309164, "epoch": 0.07521418908762964, "flos": 26688164215680.0, "grad_norm": 1.9715108024725467, "language_loss": 0.66386735, "learning_rate": 3.944585383001411e-06, "loss": 0.68684208, "num_input_tokens_seen": 26586190, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.34375, "step": 1251, "time_per_iteration": 2.7630059719085693 }, { "auxiliary_loss_clip": 0.01108597, "auxiliary_loss_mlp": 0.01006366, "balance_loss_clip": 1.00231254, "balance_loss_mlp": 1.02803218, "epoch": 0.0752743123402976, "flos": 59095140589440.0, "grad_norm": 0.9021457859825225, "language_loss": 0.70424062, "learning_rate": 3.944497025982571e-06, "loss": 0.72539026, "num_input_tokens_seen": 26650710, "router_z_loss_clip": 0.04052734, "router_z_loss_mlp": 0.53125, "step": 1252, "time_per_iteration": 3.246471643447876 }, { "auxiliary_loss_clip": 0.01193454, "auxiliary_loss_mlp": 0.01063091, "balance_loss_clip": 1.03938055, "balance_loss_mlp": 1.05734539, "epoch": 0.07533443559296558, "flos": 23477032160640.0, "grad_norm": 2.214913808433388, "language_loss": 0.7941047, "learning_rate": 3.944408599569633e-06, "loss": 0.81667012, "num_input_tokens_seen": 26669000, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.359375, "step": 1253, "time_per_iteration": 2.6717238426208496 }, { "auxiliary_loss_clip": 0.01228869, "auxiliary_loss_mlp": 0.01067583, "balance_loss_clip": 1.04344296, "balance_loss_mlp": 1.06241918, "epoch": 0.07539455884563355, "flos": 20879110114560.0, "grad_norm": 2.692285540397337, "language_loss": 0.93300617, "learning_rate": 3.9443201037657545e-06, "loss": 0.9559707, "num_input_tokens_seen": 26683075, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.390625, "step": 1254, "time_per_iteration": 2.6482348442077637 }, { "auxiliary_loss_clip": 0.0120829, "auxiliary_loss_mlp": 0.01062162, "balance_loss_clip": 1.03846276, "balance_loss_mlp": 1.05828714, "epoch": 0.07545468209830151, "flos": 27672906551040.0, "grad_norm": 1.8409720101894738, "language_loss": 0.88106489, "learning_rate": 3.944231538574092e-06, "loss": 0.90376937, "num_input_tokens_seen": 26701875, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.3203125, "step": 1255, "time_per_iteration": 2.707710027694702 }, { "auxiliary_loss_clip": 0.01192914, "auxiliary_loss_mlp": 0.01064526, "balance_loss_clip": 1.04029131, "balance_loss_mlp": 1.05864978, "epoch": 0.0755148053509695, "flos": 14136990370560.0, "grad_norm": 1.7476482102529662, "language_loss": 0.7951656, "learning_rate": 3.9441429039978086e-06, "loss": 0.81773996, "num_input_tokens_seen": 26719050, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.34375, "step": 1256, "time_per_iteration": 2.5943636894226074 }, { "auxiliary_loss_clip": 0.01095248, "auxiliary_loss_mlp": 0.01005511, "balance_loss_clip": 1.00136209, "balance_loss_mlp": 1.02376652, "epoch": 0.07557492860363746, "flos": 58235257929600.0, "grad_norm": 0.8388773281631099, "language_loss": 0.57984489, "learning_rate": 3.944054200040065e-06, "loss": 0.60085249, "num_input_tokens_seen": 26780650, "router_z_loss_clip": 0.04150391, "router_z_loss_mlp": 0.53125, "step": 1257, "time_per_iteration": 3.2371225357055664 }, { "auxiliary_loss_clip": 0.01231331, "auxiliary_loss_mlp": 0.01059851, "balance_loss_clip": 1.03623605, "balance_loss_mlp": 1.06147051, "epoch": 0.07563505185630542, "flos": 24644380262400.0, "grad_norm": 2.3697070723662303, "language_loss": 0.89869243, "learning_rate": 3.943965426704027e-06, "loss": 0.92160428, "num_input_tokens_seen": 26798725, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.328125, "step": 1258, "time_per_iteration": 2.696664333343506 }, { "auxiliary_loss_clip": 0.01210696, "auxiliary_loss_mlp": 0.01056971, "balance_loss_clip": 1.0347147, "balance_loss_mlp": 1.06208861, "epoch": 0.07569517510897339, "flos": 15522998515200.0, "grad_norm": 2.085616421951971, "language_loss": 0.81206644, "learning_rate": 3.943876583992864e-06, "loss": 0.83474314, "num_input_tokens_seen": 26817005, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.3046875, "step": 1259, "time_per_iteration": 2.579706907272339 }, { "auxiliary_loss_clip": 0.01202835, "auxiliary_loss_mlp": 0.01058397, "balance_loss_clip": 1.03552032, "balance_loss_mlp": 1.05996776, "epoch": 0.07575529836164137, "flos": 22928532503040.0, "grad_norm": 1.8899073069900534, "language_loss": 0.75997353, "learning_rate": 3.943787671909746e-06, "loss": 0.7825858, "num_input_tokens_seen": 26836655, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.3359375, "step": 1260, "time_per_iteration": 2.6293392181396484 }, { "auxiliary_loss_clip": 0.01219669, "auxiliary_loss_mlp": 0.01065702, "balance_loss_clip": 1.04126418, "balance_loss_mlp": 1.05815446, "epoch": 0.07581542161430933, "flos": 19500428344320.0, "grad_norm": 2.0938223534464795, "language_loss": 0.84381402, "learning_rate": 3.943698690457846e-06, "loss": 0.86666775, "num_input_tokens_seen": 26854925, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.3359375, "step": 1261, "time_per_iteration": 2.5722103118896484 }, { "auxiliary_loss_clip": 0.01198455, "auxiliary_loss_mlp": 0.01061463, "balance_loss_clip": 1.03858721, "balance_loss_mlp": 1.06298161, "epoch": 0.0758755448669773, "flos": 24973465691520.0, "grad_norm": 1.8850332712177549, "language_loss": 0.82212579, "learning_rate": 3.943609639640339e-06, "loss": 0.84472495, "num_input_tokens_seen": 26876170, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.3515625, "step": 1262, "time_per_iteration": 2.6689164638519287 }, { "auxiliary_loss_clip": 0.01192356, "auxiliary_loss_mlp": 0.01062693, "balance_loss_clip": 1.03832662, "balance_loss_mlp": 1.05759835, "epoch": 0.07593566811964528, "flos": 22747973811840.0, "grad_norm": 2.4661497638431746, "language_loss": 0.82666022, "learning_rate": 3.943520519460405e-06, "loss": 0.84921074, "num_input_tokens_seen": 26895005, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.34375, "step": 1263, "time_per_iteration": 4.048881530761719 }, { "auxiliary_loss_clip": 0.01234368, "auxiliary_loss_mlp": 0.01058882, "balance_loss_clip": 1.03555298, "balance_loss_mlp": 1.06050611, "epoch": 0.07599579137231324, "flos": 23112395245440.0, "grad_norm": 2.089620503328691, "language_loss": 0.76009989, "learning_rate": 3.943431329921221e-06, "loss": 0.78303242, "num_input_tokens_seen": 26913930, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.375, "step": 1264, "time_per_iteration": 5.5380566120147705 }, { "auxiliary_loss_clip": 0.01215199, "auxiliary_loss_mlp": 0.01066525, "balance_loss_clip": 1.04233789, "balance_loss_mlp": 1.06083465, "epoch": 0.07605591462498121, "flos": 14502058248960.0, "grad_norm": 2.039784418988322, "language_loss": 0.80390978, "learning_rate": 3.943342071025974e-06, "loss": 0.82672703, "num_input_tokens_seen": 26931485, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.359375, "step": 1265, "time_per_iteration": 2.600118398666382 }, { "auxiliary_loss_clip": 0.01222689, "auxiliary_loss_mlp": 0.01059639, "balance_loss_clip": 1.03656006, "balance_loss_mlp": 1.05876541, "epoch": 0.07611603787764919, "flos": 23514199758720.0, "grad_norm": 2.056394080202971, "language_loss": 0.65280366, "learning_rate": 3.9432527427778455e-06, "loss": 0.67562693, "num_input_tokens_seen": 26951670, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.3671875, "step": 1266, "time_per_iteration": 2.6799561977386475 }, { "auxiliary_loss_clip": 0.01221477, "auxiliary_loss_mlp": 0.01061317, "balance_loss_clip": 1.03736782, "balance_loss_mlp": 1.05778658, "epoch": 0.07617616113031715, "flos": 21507188353920.0, "grad_norm": 2.155458516591513, "language_loss": 0.79361355, "learning_rate": 3.943163345180026e-06, "loss": 0.81644148, "num_input_tokens_seen": 26970335, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.359375, "step": 1267, "time_per_iteration": 4.1985838413238525 }, { "auxiliary_loss_clip": 0.01212593, "auxiliary_loss_mlp": 0.01055145, "balance_loss_clip": 1.03280568, "balance_loss_mlp": 1.05870855, "epoch": 0.07623628438298512, "flos": 14573161221120.0, "grad_norm": 2.334641012514603, "language_loss": 0.72988081, "learning_rate": 3.9430738782357054e-06, "loss": 0.75255817, "num_input_tokens_seen": 26986025, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.359375, "step": 1268, "time_per_iteration": 2.551201343536377 }, { "auxiliary_loss_clip": 0.01202913, "auxiliary_loss_mlp": 0.0105928, "balance_loss_clip": 1.03611779, "balance_loss_mlp": 1.05822206, "epoch": 0.07629640763565308, "flos": 14720395069440.0, "grad_norm": 2.1917547069683723, "language_loss": 0.82128668, "learning_rate": 3.9429843419480755e-06, "loss": 0.84390867, "num_input_tokens_seen": 27004045, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.359375, "step": 1269, "time_per_iteration": 2.691769599914551 }, { "auxiliary_loss_clip": 0.01204528, "auxiliary_loss_mlp": 0.01058064, "balance_loss_clip": 1.03385293, "balance_loss_mlp": 1.06081915, "epoch": 0.07635653088832106, "flos": 14902929008640.0, "grad_norm": 3.9748066357446312, "language_loss": 0.88667148, "learning_rate": 3.942894736320334e-06, "loss": 0.90929747, "num_input_tokens_seen": 27022070, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.34375, "step": 1270, "time_per_iteration": 2.589416980743408 }, { "auxiliary_loss_clip": 0.01207873, "auxiliary_loss_mlp": 0.01060329, "balance_loss_clip": 1.03670216, "balance_loss_mlp": 1.06031501, "epoch": 0.07641665414098903, "flos": 26651571235200.0, "grad_norm": 2.2043395704362654, "language_loss": 0.70971256, "learning_rate": 3.942805061355676e-06, "loss": 0.73239458, "num_input_tokens_seen": 27041755, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.3828125, "step": 1271, "time_per_iteration": 2.6915123462677 }, { "auxiliary_loss_clip": 0.01217062, "auxiliary_loss_mlp": 0.01065167, "balance_loss_clip": 1.04213595, "balance_loss_mlp": 1.06104946, "epoch": 0.07647677739365699, "flos": 25192808092800.0, "grad_norm": 1.5647667387523356, "language_loss": 0.824265, "learning_rate": 3.9427153170573026e-06, "loss": 0.84708732, "num_input_tokens_seen": 27061540, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.2890625, "step": 1272, "time_per_iteration": 2.6222610473632812 }, { "auxiliary_loss_clip": 0.01210368, "auxiliary_loss_mlp": 0.01308324, "balance_loss_clip": 1.03659272, "balance_loss_mlp": 1.05728614, "epoch": 0.07653690064632497, "flos": 20558141159040.0, "grad_norm": 1.6333656789012285, "language_loss": 0.7987743, "learning_rate": 3.9426255034284174e-06, "loss": 0.82396114, "num_input_tokens_seen": 27081395, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.34375, "step": 1273, "time_per_iteration": 2.6432149410247803 }, { "auxiliary_loss_clip": 0.0121462, "auxiliary_loss_mlp": 0.01061448, "balance_loss_clip": 1.03701019, "balance_loss_mlp": 1.06009436, "epoch": 0.07659702389899294, "flos": 22269320150400.0, "grad_norm": 2.298135932782344, "language_loss": 0.81282079, "learning_rate": 3.942535620472224e-06, "loss": 0.83558148, "num_input_tokens_seen": 27101175, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.359375, "step": 1274, "time_per_iteration": 2.6287841796875 }, { "auxiliary_loss_clip": 0.01211706, "auxiliary_loss_mlp": 0.01069803, "balance_loss_clip": 1.04473281, "balance_loss_mlp": 1.05959964, "epoch": 0.0766571471516609, "flos": 32636120209920.0, "grad_norm": 1.9051519446695937, "language_loss": 0.73224986, "learning_rate": 3.942445668191932e-06, "loss": 0.75506496, "num_input_tokens_seen": 27124505, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.3359375, "step": 1275, "time_per_iteration": 2.735933303833008 }, { "auxiliary_loss_clip": 0.01195629, "auxiliary_loss_mlp": 0.01064423, "balance_loss_clip": 1.03947246, "balance_loss_mlp": 1.05946243, "epoch": 0.07671727040432888, "flos": 15267386355840.0, "grad_norm": 2.4446917868564304, "language_loss": 0.79461622, "learning_rate": 3.94235564659075e-06, "loss": 0.81721669, "num_input_tokens_seen": 27140960, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.359375, "step": 1276, "time_per_iteration": 2.60235595703125 }, { "auxiliary_loss_clip": 0.01209392, "auxiliary_loss_mlp": 0.01054295, "balance_loss_clip": 1.03141892, "balance_loss_mlp": 1.06310356, "epoch": 0.07677739365699685, "flos": 28184094956160.0, "grad_norm": 2.187443429351998, "language_loss": 0.59029335, "learning_rate": 3.942265555671892e-06, "loss": 0.61293018, "num_input_tokens_seen": 27160985, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.3671875, "step": 1277, "time_per_iteration": 2.712398052215576 }, { "auxiliary_loss_clip": 0.0121076, "auxiliary_loss_mlp": 0.01057501, "balance_loss_clip": 1.03374279, "balance_loss_mlp": 1.06102467, "epoch": 0.07683751690966481, "flos": 18296128126080.0, "grad_norm": 2.310482181543815, "language_loss": 0.74635208, "learning_rate": 3.942175395438572e-06, "loss": 0.76903474, "num_input_tokens_seen": 27178390, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.40625, "step": 1278, "time_per_iteration": 2.5934932231903076 }, { "auxiliary_loss_clip": 0.01209775, "auxiliary_loss_mlp": 0.01058581, "balance_loss_clip": 1.03596652, "balance_loss_mlp": 1.05830598, "epoch": 0.07689764016233278, "flos": 21981101420160.0, "grad_norm": 2.2049643445469185, "language_loss": 0.88535804, "learning_rate": 3.942085165894009e-06, "loss": 0.9080416, "num_input_tokens_seen": 27197505, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.328125, "step": 1279, "time_per_iteration": 2.6168196201324463 }, { "auxiliary_loss_clip": 0.01208783, "auxiliary_loss_mlp": 0.01059475, "balance_loss_clip": 1.03588319, "balance_loss_mlp": 1.0592792, "epoch": 0.07695776341500075, "flos": 22235995307520.0, "grad_norm": 2.311667184431821, "language_loss": 0.82274449, "learning_rate": 3.9419948670414206e-06, "loss": 0.8454271, "num_input_tokens_seen": 27214260, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.3125, "step": 1280, "time_per_iteration": 2.60898756980896 }, { "auxiliary_loss_clip": 0.01200639, "auxiliary_loss_mlp": 0.01059834, "balance_loss_clip": 1.03642106, "balance_loss_mlp": 1.06062496, "epoch": 0.07701788666766872, "flos": 16143750380160.0, "grad_norm": 1.9778149667441935, "language_loss": 0.76090181, "learning_rate": 3.941904498884032e-06, "loss": 0.78350651, "num_input_tokens_seen": 27232525, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.3125, "step": 1281, "time_per_iteration": 2.5495665073394775 }, { "auxiliary_loss_clip": 0.01225428, "auxiliary_loss_mlp": 0.01058503, "balance_loss_clip": 1.03456593, "balance_loss_mlp": 1.058743, "epoch": 0.07707800992033668, "flos": 19463045264640.0, "grad_norm": 1.8349815646414602, "language_loss": 0.75143766, "learning_rate": 3.941814061425067e-06, "loss": 0.77427697, "num_input_tokens_seen": 27249800, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.390625, "step": 1282, "time_per_iteration": 2.5953240394592285 }, { "auxiliary_loss_clip": 0.01193036, "auxiliary_loss_mlp": 0.01066693, "balance_loss_clip": 1.04288673, "balance_loss_mlp": 1.05960679, "epoch": 0.07713813317300466, "flos": 18990281433600.0, "grad_norm": 1.9105431597369023, "language_loss": 0.84156787, "learning_rate": 3.941723554667752e-06, "loss": 0.86416513, "num_input_tokens_seen": 27268895, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.328125, "step": 1283, "time_per_iteration": 2.538964033126831 }, { "auxiliary_loss_clip": 0.01203796, "auxiliary_loss_mlp": 0.01062509, "balance_loss_clip": 1.03617597, "balance_loss_mlp": 1.06023026, "epoch": 0.07719825642567263, "flos": 18113953322880.0, "grad_norm": 1.846823577658069, "language_loss": 0.74578309, "learning_rate": 3.941632978615318e-06, "loss": 0.76844615, "num_input_tokens_seen": 27288180, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.34375, "step": 1284, "time_per_iteration": 2.5761423110961914 }, { "auxiliary_loss_clip": 0.01202023, "auxiliary_loss_mlp": 0.01065039, "balance_loss_clip": 1.04191256, "balance_loss_mlp": 1.05978751, "epoch": 0.0772583796783406, "flos": 42194426993280.0, "grad_norm": 1.6088038153955337, "language_loss": 0.75651127, "learning_rate": 3.941542333270999e-06, "loss": 0.7791819, "num_input_tokens_seen": 27311815, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.328125, "step": 1285, "time_per_iteration": 2.735092878341675 }, { "auxiliary_loss_clip": 0.01201313, "auxiliary_loss_mlp": 0.01063599, "balance_loss_clip": 1.04097319, "balance_loss_mlp": 1.06553912, "epoch": 0.07731850293100857, "flos": 24753692327040.0, "grad_norm": 2.0737896225318395, "language_loss": 0.84077322, "learning_rate": 3.9414516186380275e-06, "loss": 0.86342239, "num_input_tokens_seen": 27331890, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.359375, "step": 1286, "time_per_iteration": 2.6802191734313965 }, { "auxiliary_loss_clip": 0.01219, "auxiliary_loss_mlp": 0.01056362, "balance_loss_clip": 1.03242517, "balance_loss_mlp": 1.06210923, "epoch": 0.07737862618367654, "flos": 17565884628480.0, "grad_norm": 2.1817063901738636, "language_loss": 0.76453334, "learning_rate": 3.941360834719641e-06, "loss": 0.78728694, "num_input_tokens_seen": 27348320, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.3828125, "step": 1287, "time_per_iteration": 2.5329878330230713 }, { "auxiliary_loss_clip": 0.01211586, "auxiliary_loss_mlp": 0.01054567, "balance_loss_clip": 1.03200102, "balance_loss_mlp": 1.06047308, "epoch": 0.0774387494363445, "flos": 25627147349760.0, "grad_norm": 1.712487281869195, "language_loss": 0.84667248, "learning_rate": 3.941269981519081e-06, "loss": 0.86933404, "num_input_tokens_seen": 27367670, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 1.328125, "step": 1288, "time_per_iteration": 2.711174726486206 }, { "auxiliary_loss_clip": 0.01200111, "auxiliary_loss_mlp": 0.01055867, "balance_loss_clip": 1.03285909, "balance_loss_mlp": 1.06221271, "epoch": 0.07749887268901248, "flos": 12239865648000.0, "grad_norm": 2.418302661894452, "language_loss": 0.85847616, "learning_rate": 3.941179059039589e-06, "loss": 0.88103592, "num_input_tokens_seen": 27385485, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.375, "step": 1289, "time_per_iteration": 2.5656356811523438 }, { "auxiliary_loss_clip": 0.01195529, "auxiliary_loss_mlp": 0.01050619, "balance_loss_clip": 1.02861333, "balance_loss_mlp": 1.06098306, "epoch": 0.07755899594168045, "flos": 25081736261760.0, "grad_norm": 1.7856732778513602, "language_loss": 0.85115492, "learning_rate": 3.941088067284409e-06, "loss": 0.87361646, "num_input_tokens_seen": 27405110, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.34375, "step": 1290, "time_per_iteration": 2.6737663745880127 }, { "auxiliary_loss_clip": 0.01209532, "auxiliary_loss_mlp": 0.01060245, "balance_loss_clip": 1.03525829, "balance_loss_mlp": 1.06216347, "epoch": 0.07761911919434841, "flos": 14246410176000.0, "grad_norm": 2.583817916058056, "language_loss": 0.91057259, "learning_rate": 3.9409970062567895e-06, "loss": 0.93327034, "num_input_tokens_seen": 27422855, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.2890625, "step": 1291, "time_per_iteration": 2.606675624847412 }, { "auxiliary_loss_clip": 0.01098473, "auxiliary_loss_mlp": 0.01008998, "balance_loss_clip": 1.00482607, "balance_loss_mlp": 1.0274868, "epoch": 0.07767924244701638, "flos": 67237202954880.0, "grad_norm": 0.8950606545827509, "language_loss": 0.65088832, "learning_rate": 3.94090587595998e-06, "loss": 0.67196298, "num_input_tokens_seen": 27487190, "router_z_loss_clip": 0.04174805, "router_z_loss_mlp": 0.5234375, "step": 1292, "time_per_iteration": 3.3638181686401367 }, { "auxiliary_loss_clip": 0.01204296, "auxiliary_loss_mlp": 0.01050899, "balance_loss_clip": 1.02756953, "balance_loss_mlp": 1.06068826, "epoch": 0.07773936569968436, "flos": 28550635292160.0, "grad_norm": 1.7832058387222676, "language_loss": 0.87622702, "learning_rate": 3.940814676397232e-06, "loss": 0.89877892, "num_input_tokens_seen": 27510465, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.34375, "step": 1293, "time_per_iteration": 2.7919259071350098 }, { "auxiliary_loss_clip": 0.01247734, "auxiliary_loss_mlp": 0.01070647, "balance_loss_clip": 1.04519594, "balance_loss_mlp": 1.06652844, "epoch": 0.07779948895235232, "flos": 27490264871040.0, "grad_norm": 1.9011217210030016, "language_loss": 0.84373462, "learning_rate": 3.940723407571801e-06, "loss": 0.8669185, "num_input_tokens_seen": 27528645, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.359375, "step": 1294, "time_per_iteration": 2.793452501296997 }, { "auxiliary_loss_clip": 0.01224891, "auxiliary_loss_mlp": 0.01056401, "balance_loss_clip": 1.03272653, "balance_loss_mlp": 1.06242859, "epoch": 0.07785961220502029, "flos": 18223301301120.0, "grad_norm": 2.008898025575612, "language_loss": 0.79487717, "learning_rate": 3.9406320694869425e-06, "loss": 0.81769001, "num_input_tokens_seen": 27546165, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.3515625, "step": 1295, "time_per_iteration": 2.689974069595337 }, { "auxiliary_loss_clip": 0.01220988, "auxiliary_loss_mlp": 0.01053935, "balance_loss_clip": 1.02986658, "balance_loss_mlp": 1.05993414, "epoch": 0.07791973545768827, "flos": 24608218245120.0, "grad_norm": 2.049404301456864, "language_loss": 0.87982833, "learning_rate": 3.940540662145918e-06, "loss": 0.90257758, "num_input_tokens_seen": 27566520, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.3359375, "step": 1296, "time_per_iteration": 2.5817737579345703 }, { "auxiliary_loss_clip": 0.01198848, "auxiliary_loss_mlp": 0.0106557, "balance_loss_clip": 1.04055989, "balance_loss_mlp": 1.0635711, "epoch": 0.07797985871035623, "flos": 14282069402880.0, "grad_norm": 2.746210649018795, "language_loss": 0.95902473, "learning_rate": 3.940449185551989e-06, "loss": 0.98166883, "num_input_tokens_seen": 27581960, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.3515625, "step": 1297, "time_per_iteration": 2.5539159774780273 }, { "auxiliary_loss_clip": 0.0121618, "auxiliary_loss_mlp": 0.01057999, "balance_loss_clip": 1.03521824, "balance_loss_mlp": 1.06128585, "epoch": 0.0780399819630242, "flos": 26610453141120.0, "grad_norm": 2.118629611691146, "language_loss": 0.7625978, "learning_rate": 3.94035763970842e-06, "loss": 0.78533959, "num_input_tokens_seen": 27601415, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.3671875, "step": 1298, "time_per_iteration": 2.6309118270874023 }, { "auxiliary_loss_clip": 0.01234705, "auxiliary_loss_mlp": 0.013134, "balance_loss_clip": 1.04153156, "balance_loss_mlp": 1.06468821, "epoch": 0.07810010521569218, "flos": 21834514016640.0, "grad_norm": 1.7522483740282184, "language_loss": 0.80563974, "learning_rate": 3.940266024618478e-06, "loss": 0.83112085, "num_input_tokens_seen": 27621490, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.328125, "step": 1299, "time_per_iteration": 2.7033839225769043 }, { "auxiliary_loss_clip": 0.01217372, "auxiliary_loss_mlp": 0.01061324, "balance_loss_clip": 1.03736329, "balance_loss_mlp": 1.06023788, "epoch": 0.07816022846836014, "flos": 25081233471360.0, "grad_norm": 1.843494184956308, "language_loss": 0.86353838, "learning_rate": 3.940174340285432e-06, "loss": 0.88632536, "num_input_tokens_seen": 27640600, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.296875, "step": 1300, "time_per_iteration": 2.651088237762451 }, { "auxiliary_loss_clip": 0.01192873, "auxiliary_loss_mlp": 0.010532, "balance_loss_clip": 1.02900052, "balance_loss_mlp": 1.05935693, "epoch": 0.0782203517210281, "flos": 40917515431680.0, "grad_norm": 1.7728145588023567, "language_loss": 0.7150929, "learning_rate": 3.940082586712555e-06, "loss": 0.73755366, "num_input_tokens_seen": 27663070, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.3359375, "step": 1301, "time_per_iteration": 2.7667460441589355 }, { "auxiliary_loss_clip": 0.0120923, "auxiliary_loss_mlp": 0.01061991, "balance_loss_clip": 1.03825676, "balance_loss_mlp": 1.06382275, "epoch": 0.07828047497369607, "flos": 41172014269440.0, "grad_norm": 1.53978210565348, "language_loss": 0.70689416, "learning_rate": 3.939990763903122e-06, "loss": 0.72960639, "num_input_tokens_seen": 27686425, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.359375, "step": 1302, "time_per_iteration": 2.739588499069214 }, { "auxiliary_loss_clip": 0.01211456, "auxiliary_loss_mlp": 0.01310502, "balance_loss_clip": 1.03685689, "balance_loss_mlp": 1.06006193, "epoch": 0.07834059822636405, "flos": 23508130360320.0, "grad_norm": 2.4476511212026195, "language_loss": 0.8188194, "learning_rate": 3.939898871860407e-06, "loss": 0.84403896, "num_input_tokens_seen": 27704900, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 1.328125, "step": 1303, "time_per_iteration": 2.675203323364258 }, { "auxiliary_loss_clip": 0.01204101, "auxiliary_loss_mlp": 0.01064016, "balance_loss_clip": 1.04032946, "balance_loss_mlp": 1.05997252, "epoch": 0.07840072147903202, "flos": 20193899293440.0, "grad_norm": 1.7078406423612678, "language_loss": 0.74807441, "learning_rate": 3.939806910587693e-06, "loss": 0.77075565, "num_input_tokens_seen": 27724890, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.3515625, "step": 1304, "time_per_iteration": 2.622744083404541 }, { "auxiliary_loss_clip": 0.0119744, "auxiliary_loss_mlp": 0.01064659, "balance_loss_clip": 1.0402925, "balance_loss_mlp": 1.06410205, "epoch": 0.07846084473169998, "flos": 21360816432000.0, "grad_norm": 1.5663405281145713, "language_loss": 0.76029623, "learning_rate": 3.9397148800882595e-06, "loss": 0.7829172, "num_input_tokens_seen": 27743115, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.328125, "step": 1305, "time_per_iteration": 5.6585328578948975 }, { "auxiliary_loss_clip": 0.01241042, "auxiliary_loss_mlp": 0.01063001, "balance_loss_clip": 1.03877723, "balance_loss_mlp": 1.05979705, "epoch": 0.07852096798436796, "flos": 25410965345280.0, "grad_norm": 1.5978201084535266, "language_loss": 0.8516013, "learning_rate": 3.939622780365391e-06, "loss": 0.87464172, "num_input_tokens_seen": 27763570, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.359375, "step": 1306, "time_per_iteration": 4.217539310455322 }, { "auxiliary_loss_clip": 0.01205934, "auxiliary_loss_mlp": 0.01048152, "balance_loss_clip": 1.02582395, "balance_loss_mlp": 1.06278491, "epoch": 0.07858109123703592, "flos": 24571481610240.0, "grad_norm": 2.546501798224328, "language_loss": 0.90843391, "learning_rate": 3.939530611422375e-06, "loss": 0.93097478, "num_input_tokens_seen": 27780030, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.34375, "step": 1307, "time_per_iteration": 2.7158679962158203 }, { "auxiliary_loss_clip": 0.01228743, "auxiliary_loss_mlp": 0.01056717, "balance_loss_clip": 1.0315876, "balance_loss_mlp": 1.05898809, "epoch": 0.07864121448970389, "flos": 20698874645760.0, "grad_norm": 1.8095568133712159, "language_loss": 0.83137786, "learning_rate": 3.939438373262501e-06, "loss": 0.85423249, "num_input_tokens_seen": 27796225, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.3359375, "step": 1308, "time_per_iteration": 2.604417085647583 }, { "auxiliary_loss_clip": 0.01220764, "auxiliary_loss_mlp": 0.01062572, "balance_loss_clip": 1.03934991, "balance_loss_mlp": 1.05941319, "epoch": 0.07870133774237187, "flos": 22966526113920.0, "grad_norm": 1.5115999609739934, "language_loss": 0.77272558, "learning_rate": 3.93934606588906e-06, "loss": 0.79555893, "num_input_tokens_seen": 27815975, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.3359375, "step": 1309, "time_per_iteration": 4.867414951324463 }, { "auxiliary_loss_clip": 0.0120844, "auxiliary_loss_mlp": 0.01069312, "balance_loss_clip": 1.04357457, "balance_loss_mlp": 1.05966687, "epoch": 0.07876146099503983, "flos": 18842832103680.0, "grad_norm": 1.9941769904216446, "language_loss": 0.80377942, "learning_rate": 3.939253689305346e-06, "loss": 0.82655692, "num_input_tokens_seen": 27832255, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.3984375, "step": 1310, "time_per_iteration": 2.5926544666290283 }, { "auxiliary_loss_clip": 0.01208802, "auxiliary_loss_mlp": 0.01303074, "balance_loss_clip": 1.03100586, "balance_loss_mlp": 1.06147575, "epoch": 0.0788215842477078, "flos": 23805794367360.0, "grad_norm": 1.8445808885095851, "language_loss": 0.73208261, "learning_rate": 3.939161243514657e-06, "loss": 0.75720131, "num_input_tokens_seen": 27852180, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.2890625, "step": 1311, "time_per_iteration": 2.648578643798828 }, { "auxiliary_loss_clip": 0.01203232, "auxiliary_loss_mlp": 0.01081193, "balance_loss_clip": 1.05521703, "balance_loss_mlp": 1.0604856, "epoch": 0.07888170750037576, "flos": 21579907438080.0, "grad_norm": 1.823129079489294, "language_loss": 0.85712796, "learning_rate": 3.939068728520291e-06, "loss": 0.87997222, "num_input_tokens_seen": 27871435, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.3359375, "step": 1312, "time_per_iteration": 2.7284951210021973 }, { "auxiliary_loss_clip": 0.01209326, "auxiliary_loss_mlp": 0.0106451, "balance_loss_clip": 1.04151464, "balance_loss_mlp": 1.06074309, "epoch": 0.07894183075304374, "flos": 19864849777920.0, "grad_norm": 1.829386586930896, "language_loss": 0.81918693, "learning_rate": 3.938976144325549e-06, "loss": 0.84192532, "num_input_tokens_seen": 27890625, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.3046875, "step": 1313, "time_per_iteration": 2.6694040298461914 }, { "auxiliary_loss_clip": 0.01205074, "auxiliary_loss_mlp": 0.0131256, "balance_loss_clip": 1.03815413, "balance_loss_mlp": 1.05689895, "epoch": 0.07900195400571171, "flos": 16143463071360.0, "grad_norm": 2.547033540107611, "language_loss": 0.73436463, "learning_rate": 3.9388834909337375e-06, "loss": 0.75954092, "num_input_tokens_seen": 27906530, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.390625, "step": 1314, "time_per_iteration": 2.575150728225708 }, { "auxiliary_loss_clip": 0.01219981, "auxiliary_loss_mlp": 0.01308741, "balance_loss_clip": 1.03637648, "balance_loss_mlp": 1.05786538, "epoch": 0.07906207725837967, "flos": 23730417676800.0, "grad_norm": 1.4408248116520135, "language_loss": 0.79586571, "learning_rate": 3.938790768348161e-06, "loss": 0.82115293, "num_input_tokens_seen": 27926725, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.3515625, "step": 1315, "time_per_iteration": 2.7338685989379883 }, { "auxiliary_loss_clip": 0.01199324, "auxiliary_loss_mlp": 0.01070857, "balance_loss_clip": 1.04528713, "balance_loss_mlp": 1.05596352, "epoch": 0.07912220051104765, "flos": 24315905364480.0, "grad_norm": 2.325412599626247, "language_loss": 0.73749816, "learning_rate": 3.938697976572129e-06, "loss": 0.7601999, "num_input_tokens_seen": 27947875, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.34375, "step": 1316, "time_per_iteration": 2.604867696762085 }, { "auxiliary_loss_clip": 0.0122259, "auxiliary_loss_mlp": 0.01066578, "balance_loss_clip": 1.04187822, "balance_loss_mlp": 1.05777347, "epoch": 0.07918232376371562, "flos": 18880035615360.0, "grad_norm": 2.1806208818295865, "language_loss": 0.64782512, "learning_rate": 3.938605115608954e-06, "loss": 0.67071676, "num_input_tokens_seen": 27965040, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 1.375, "step": 1317, "time_per_iteration": 2.688981294631958 }, { "auxiliary_loss_clip": 0.01229236, "auxiliary_loss_mlp": 0.01068095, "balance_loss_clip": 1.04253614, "balance_loss_mlp": 1.06132412, "epoch": 0.07924244701638358, "flos": 27376284038400.0, "grad_norm": 2.9372120587445534, "language_loss": 0.73344374, "learning_rate": 3.938512185461948e-06, "loss": 0.75641704, "num_input_tokens_seen": 27985330, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.40625, "step": 1318, "time_per_iteration": 2.642637252807617 }, { "auxiliary_loss_clip": 0.01228902, "auxiliary_loss_mlp": 0.01065181, "balance_loss_clip": 1.04182816, "balance_loss_mlp": 1.05807757, "epoch": 0.07930257026905156, "flos": 25120340403840.0, "grad_norm": 1.6083400539116657, "language_loss": 0.89974755, "learning_rate": 3.938419186134429e-06, "loss": 0.92268836, "num_input_tokens_seen": 28007615, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.34375, "step": 1319, "time_per_iteration": 2.7571916580200195 }, { "auxiliary_loss_clip": 0.01199896, "auxiliary_loss_mlp": 0.01056498, "balance_loss_clip": 1.03289485, "balance_loss_mlp": 1.05489397, "epoch": 0.07936269352171953, "flos": 21834478103040.0, "grad_norm": 2.834939347915997, "language_loss": 0.79772532, "learning_rate": 3.9383261176297155e-06, "loss": 0.82028925, "num_input_tokens_seen": 28027765, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.359375, "step": 1320, "time_per_iteration": 2.624866008758545 }, { "auxiliary_loss_clip": 0.01210281, "auxiliary_loss_mlp": 0.01059665, "balance_loss_clip": 1.03602576, "balance_loss_mlp": 1.05949306, "epoch": 0.07942281677438749, "flos": 16939889377920.0, "grad_norm": 1.9661479239393176, "language_loss": 0.69163823, "learning_rate": 3.938232979951129e-06, "loss": 0.71433771, "num_input_tokens_seen": 28044225, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.328125, "step": 1321, "time_per_iteration": 2.639652729034424 }, { "auxiliary_loss_clip": 0.01198715, "auxiliary_loss_mlp": 0.01058979, "balance_loss_clip": 1.03435016, "balance_loss_mlp": 1.05895185, "epoch": 0.07948294002705546, "flos": 18986941468800.0, "grad_norm": 2.0150558237923915, "language_loss": 0.83704042, "learning_rate": 3.938139773101993e-06, "loss": 0.85961741, "num_input_tokens_seen": 28062915, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.3046875, "step": 1322, "time_per_iteration": 2.551765203475952 }, { "auxiliary_loss_clip": 0.01189375, "auxiliary_loss_mlp": 0.01056859, "balance_loss_clip": 1.03357685, "balance_loss_mlp": 1.05506277, "epoch": 0.07954306327972344, "flos": 21653452535040.0, "grad_norm": 1.8595472291908892, "language_loss": 0.90461242, "learning_rate": 3.938046497085634e-06, "loss": 0.92707479, "num_input_tokens_seen": 28082175, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.34375, "step": 1323, "time_per_iteration": 2.6345953941345215 }, { "auxiliary_loss_clip": 0.0121636, "auxiliary_loss_mlp": 0.0106013, "balance_loss_clip": 1.03666925, "balance_loss_mlp": 1.05776858, "epoch": 0.0796031865323914, "flos": 23220270766080.0, "grad_norm": 1.608507722414036, "language_loss": 0.82446307, "learning_rate": 3.937953151905381e-06, "loss": 0.84722793, "num_input_tokens_seen": 28102645, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.3125, "step": 1324, "time_per_iteration": 2.63753342628479 }, { "auxiliary_loss_clip": 0.01210708, "auxiliary_loss_mlp": 0.01055187, "balance_loss_clip": 1.03020048, "balance_loss_mlp": 1.05751598, "epoch": 0.07966330978505937, "flos": 23294534135040.0, "grad_norm": 5.184405028566896, "language_loss": 0.78618765, "learning_rate": 3.937859737564564e-06, "loss": 0.80884659, "num_input_tokens_seen": 28122805, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.34375, "step": 1325, "time_per_iteration": 2.7017509937286377 }, { "auxiliary_loss_clip": 0.0123752, "auxiliary_loss_mlp": 0.01324233, "balance_loss_clip": 1.04991984, "balance_loss_mlp": 1.05809546, "epoch": 0.07972343303772735, "flos": 18363783392640.0, "grad_norm": 2.076070866020333, "language_loss": 0.88364816, "learning_rate": 3.937766254066519e-06, "loss": 0.90926576, "num_input_tokens_seen": 28140530, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.34375, "step": 1326, "time_per_iteration": 2.6576433181762695 }, { "auxiliary_loss_clip": 0.01186381, "auxiliary_loss_mlp": 0.01059196, "balance_loss_clip": 1.03416252, "balance_loss_mlp": 1.05562615, "epoch": 0.07978355629039531, "flos": 21762513204480.0, "grad_norm": 1.946107338826591, "language_loss": 0.82786196, "learning_rate": 3.937672701414581e-06, "loss": 0.85031772, "num_input_tokens_seen": 28159640, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.3125, "step": 1327, "time_per_iteration": 2.6410646438598633 }, { "auxiliary_loss_clip": 0.01191643, "auxiliary_loss_mlp": 0.01054815, "balance_loss_clip": 1.02962613, "balance_loss_mlp": 1.05695319, "epoch": 0.07984367954306328, "flos": 18551309322240.0, "grad_norm": 2.2112177960896378, "language_loss": 0.78458279, "learning_rate": 3.937579079612087e-06, "loss": 0.80704737, "num_input_tokens_seen": 28177050, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.34375, "step": 1328, "time_per_iteration": 2.5530941486358643 }, { "auxiliary_loss_clip": 0.0120236, "auxiliary_loss_mlp": 0.01058078, "balance_loss_clip": 1.03321075, "balance_loss_mlp": 1.05787063, "epoch": 0.07990380279573125, "flos": 16904050583040.0, "grad_norm": 2.1887316582092873, "language_loss": 0.73812568, "learning_rate": 3.9374853886623805e-06, "loss": 0.76073003, "num_input_tokens_seen": 28193245, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.359375, "step": 1329, "time_per_iteration": 2.597062587738037 }, { "auxiliary_loss_clip": 0.01207504, "auxiliary_loss_mlp": 0.01066619, "balance_loss_clip": 1.04227614, "balance_loss_mlp": 1.05559015, "epoch": 0.07996392604839922, "flos": 24098358643200.0, "grad_norm": 1.4995948572691566, "language_loss": 0.81397665, "learning_rate": 3.937391628568805e-06, "loss": 0.83671784, "num_input_tokens_seen": 28213570, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.3359375, "step": 1330, "time_per_iteration": 2.6828384399414062 }, { "auxiliary_loss_clip": 0.01190979, "auxiliary_loss_mlp": 0.01315029, "balance_loss_clip": 1.04198027, "balance_loss_mlp": 1.05633664, "epoch": 0.08002404930106718, "flos": 14278729438080.0, "grad_norm": 2.8664158356653595, "language_loss": 0.88760912, "learning_rate": 3.937297799334706e-06, "loss": 0.91266924, "num_input_tokens_seen": 28229980, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.34375, "step": 1331, "time_per_iteration": 2.573988914489746 }, { "auxiliary_loss_clip": 0.01205732, "auxiliary_loss_mlp": 0.01314102, "balance_loss_clip": 1.03966367, "balance_loss_mlp": 1.05676436, "epoch": 0.08008417255373516, "flos": 40406219285760.0, "grad_norm": 2.0099513248462144, "language_loss": 0.73204529, "learning_rate": 3.937203900963431e-06, "loss": 0.75724363, "num_input_tokens_seen": 28253840, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.3984375, "step": 1332, "time_per_iteration": 2.830038547515869 }, { "auxiliary_loss_clip": 0.01209381, "auxiliary_loss_mlp": 0.01054881, "balance_loss_clip": 1.03170669, "balance_loss_mlp": 1.05781901, "epoch": 0.08014429580640313, "flos": 18478913460480.0, "grad_norm": 3.0126025329642405, "language_loss": 0.82645547, "learning_rate": 3.9371099334583315e-06, "loss": 0.84909809, "num_input_tokens_seen": 28271675, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.328125, "step": 1333, "time_per_iteration": 2.54748272895813 }, { "auxiliary_loss_clip": 0.01227174, "auxiliary_loss_mlp": 0.01057895, "balance_loss_clip": 1.03375459, "balance_loss_mlp": 1.05653453, "epoch": 0.0802044190590711, "flos": 22053461368320.0, "grad_norm": 2.6079492726776943, "language_loss": 0.74772334, "learning_rate": 3.937015896822762e-06, "loss": 0.77057409, "num_input_tokens_seen": 28291850, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.3359375, "step": 1334, "time_per_iteration": 2.7227091789245605 }, { "auxiliary_loss_clip": 0.012117, "auxiliary_loss_mlp": 0.01067409, "balance_loss_clip": 1.04303122, "balance_loss_mlp": 1.06224084, "epoch": 0.08026454231173906, "flos": 24572128055040.0, "grad_norm": 1.8668368055721405, "language_loss": 0.79874539, "learning_rate": 3.936921791060078e-06, "loss": 0.82153648, "num_input_tokens_seen": 28310780, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.3125, "step": 1335, "time_per_iteration": 2.671596050262451 }, { "auxiliary_loss_clip": 0.01106231, "auxiliary_loss_mlp": 0.01010442, "balance_loss_clip": 1.00677061, "balance_loss_mlp": 1.02787495, "epoch": 0.08032466556440704, "flos": 52581841459200.0, "grad_norm": 0.780565510949891, "language_loss": 0.56034672, "learning_rate": 3.936827616173636e-06, "loss": 0.58151346, "num_input_tokens_seen": 28369985, "router_z_loss_clip": 0.03662109, "router_z_loss_mlp": 0.5078125, "step": 1336, "time_per_iteration": 3.251652479171753 }, { "auxiliary_loss_clip": 0.01208706, "auxiliary_loss_mlp": 0.01061097, "balance_loss_clip": 1.03804243, "balance_loss_mlp": 1.05906677, "epoch": 0.080384788817075, "flos": 23842602829440.0, "grad_norm": 2.216829115635381, "language_loss": 0.6766848, "learning_rate": 3.9367333721668006e-06, "loss": 0.69938278, "num_input_tokens_seen": 28388670, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.3125, "step": 1337, "time_per_iteration": 2.6397409439086914 }, { "auxiliary_loss_clip": 0.01209005, "auxiliary_loss_mlp": 0.01068164, "balance_loss_clip": 1.0445013, "balance_loss_mlp": 1.05908144, "epoch": 0.08044491206974297, "flos": 25300719527040.0, "grad_norm": 2.07255869172485, "language_loss": 0.86657768, "learning_rate": 3.936639059042932e-06, "loss": 0.8893494, "num_input_tokens_seen": 28411845, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.3125, "step": 1338, "time_per_iteration": 2.7222588062286377 }, { "auxiliary_loss_clip": 0.01227134, "auxiliary_loss_mlp": 0.01067827, "balance_loss_clip": 1.04267383, "balance_loss_mlp": 1.05547488, "epoch": 0.08050503532241095, "flos": 22376549226240.0, "grad_norm": 2.551942188803344, "language_loss": 0.87384653, "learning_rate": 3.936544676805397e-06, "loss": 0.89679611, "num_input_tokens_seen": 28427875, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.3515625, "step": 1339, "time_per_iteration": 2.732971429824829 }, { "auxiliary_loss_clip": 0.01194022, "auxiliary_loss_mlp": 0.01059851, "balance_loss_clip": 1.03727269, "balance_loss_mlp": 1.05683756, "epoch": 0.08056515857507891, "flos": 18369421827840.0, "grad_norm": 1.8479058882223036, "language_loss": 0.88910675, "learning_rate": 3.936450225457564e-06, "loss": 0.91164553, "num_input_tokens_seen": 28446615, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.28125, "step": 1340, "time_per_iteration": 2.6169090270996094 }, { "auxiliary_loss_clip": 0.01085902, "auxiliary_loss_mlp": 0.01007267, "balance_loss_clip": 1.0036912, "balance_loss_mlp": 1.02617788, "epoch": 0.08062528182774688, "flos": 51348130980480.0, "grad_norm": 0.8874946698862213, "language_loss": 0.64888966, "learning_rate": 3.936355705002804e-06, "loss": 0.66982138, "num_input_tokens_seen": 28505290, "router_z_loss_clip": 0.03564453, "router_z_loss_mlp": 0.5078125, "step": 1341, "time_per_iteration": 3.1441245079040527 }, { "auxiliary_loss_clip": 0.01204675, "auxiliary_loss_mlp": 0.01066624, "balance_loss_clip": 1.03950381, "balance_loss_mlp": 1.05715728, "epoch": 0.08068540508041486, "flos": 17599712261760.0, "grad_norm": 1.9709142632970957, "language_loss": 0.89685166, "learning_rate": 3.936261115444489e-06, "loss": 0.91956472, "num_input_tokens_seen": 28522735, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.3828125, "step": 1342, "time_per_iteration": 2.533400058746338 }, { "auxiliary_loss_clip": 0.01218022, "auxiliary_loss_mlp": 0.01060745, "balance_loss_clip": 1.03490019, "balance_loss_mlp": 1.06430578, "epoch": 0.08074552833308282, "flos": 10561185486720.0, "grad_norm": 2.6360697069157832, "language_loss": 0.76272839, "learning_rate": 3.936166456785997e-06, "loss": 0.78551614, "num_input_tokens_seen": 28539460, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.3515625, "step": 1343, "time_per_iteration": 2.731874465942383 }, { "auxiliary_loss_clip": 0.01093512, "auxiliary_loss_mlp": 0.01006422, "balance_loss_clip": 1.00296485, "balance_loss_mlp": 1.02487445, "epoch": 0.08080565158575079, "flos": 60840254954880.0, "grad_norm": 0.818872425322249, "language_loss": 0.57444078, "learning_rate": 3.936071729030702e-06, "loss": 0.59544009, "num_input_tokens_seen": 28599855, "router_z_loss_clip": 0.03466797, "router_z_loss_mlp": 0.5, "step": 1344, "time_per_iteration": 3.3305447101593018 }, { "auxiliary_loss_clip": 0.01220214, "auxiliary_loss_mlp": 0.01060251, "balance_loss_clip": 1.03552699, "balance_loss_mlp": 1.0592072, "epoch": 0.08086577483841875, "flos": 18332361970560.0, "grad_norm": 2.535399631734483, "language_loss": 0.86183035, "learning_rate": 3.935976932181989e-06, "loss": 0.88463497, "num_input_tokens_seen": 28617585, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 1.3359375, "step": 1345, "time_per_iteration": 2.977343797683716 }, { "auxiliary_loss_clip": 0.01191094, "auxiliary_loss_mlp": 0.01064306, "balance_loss_clip": 1.04030895, "balance_loss_mlp": 1.05958772, "epoch": 0.08092589809108673, "flos": 21543601766400.0, "grad_norm": 1.6202216195771477, "language_loss": 0.87317079, "learning_rate": 3.935882066243239e-06, "loss": 0.89572477, "num_input_tokens_seen": 28636355, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.3125, "step": 1346, "time_per_iteration": 2.63427996635437 }, { "auxiliary_loss_clip": 0.01199543, "auxiliary_loss_mlp": 0.01058661, "balance_loss_clip": 1.03416312, "balance_loss_mlp": 1.05777454, "epoch": 0.0809860213437547, "flos": 22128012046080.0, "grad_norm": 1.5521602250031483, "language_loss": 0.92672884, "learning_rate": 3.935787131217838e-06, "loss": 0.9493109, "num_input_tokens_seen": 28656260, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.328125, "step": 1347, "time_per_iteration": 4.109941244125366 }, { "auxiliary_loss_clip": 0.01199784, "auxiliary_loss_mlp": 0.01066939, "balance_loss_clip": 1.04082012, "balance_loss_mlp": 1.05970478, "epoch": 0.08104614459642266, "flos": 21725489260800.0, "grad_norm": 1.7224370378783243, "language_loss": 0.89055812, "learning_rate": 3.9356921271091734e-06, "loss": 0.91322535, "num_input_tokens_seen": 28675865, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.3125, "step": 1348, "time_per_iteration": 4.068401575088501 }, { "auxiliary_loss_clip": 0.01224267, "auxiliary_loss_mlp": 0.01062742, "balance_loss_clip": 1.03951955, "balance_loss_mlp": 1.06075048, "epoch": 0.08110626784909064, "flos": 23951878980480.0, "grad_norm": 1.6061678804028263, "language_loss": 0.76359522, "learning_rate": 3.935597053920635e-06, "loss": 0.78646529, "num_input_tokens_seen": 28696255, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.265625, "step": 1349, "time_per_iteration": 2.76700496673584 }, { "auxiliary_loss_clip": 0.01191094, "auxiliary_loss_mlp": 0.01065323, "balance_loss_clip": 1.04076612, "balance_loss_mlp": 1.05809307, "epoch": 0.0811663911017586, "flos": 19025689265280.0, "grad_norm": 1.9149376136452756, "language_loss": 0.88227868, "learning_rate": 3.935501911655618e-06, "loss": 0.90484285, "num_input_tokens_seen": 28713905, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.328125, "step": 1350, "time_per_iteration": 4.815912961959839 }, { "auxiliary_loss_clip": 0.01215678, "auxiliary_loss_mlp": 0.01312318, "balance_loss_clip": 1.03862131, "balance_loss_mlp": 1.0594058, "epoch": 0.08122651435442657, "flos": 15341290588800.0, "grad_norm": 1.9853083281668114, "language_loss": 0.82381374, "learning_rate": 3.935406700317516e-06, "loss": 0.84909368, "num_input_tokens_seen": 28732075, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.2890625, "step": 1351, "time_per_iteration": 2.641805410385132 }, { "auxiliary_loss_clip": 0.01188085, "auxiliary_loss_mlp": 0.01055129, "balance_loss_clip": 1.02929616, "balance_loss_mlp": 1.05531466, "epoch": 0.08128663760709455, "flos": 23221563655680.0, "grad_norm": 2.389406825423874, "language_loss": 0.75108516, "learning_rate": 3.935311419909728e-06, "loss": 0.77351731, "num_input_tokens_seen": 28751150, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.328125, "step": 1352, "time_per_iteration": 2.696599006652832 }, { "auxiliary_loss_clip": 0.01203117, "auxiliary_loss_mlp": 0.01310586, "balance_loss_clip": 1.03532887, "balance_loss_mlp": 1.0591166, "epoch": 0.08134676085976252, "flos": 22965628273920.0, "grad_norm": 1.8495811748291466, "language_loss": 0.82708728, "learning_rate": 3.935216070435652e-06, "loss": 0.85222423, "num_input_tokens_seen": 28773360, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.3515625, "step": 1353, "time_per_iteration": 2.6927578449249268 }, { "auxiliary_loss_clip": 0.01078892, "auxiliary_loss_mlp": 0.01008695, "balance_loss_clip": 1.00540447, "balance_loss_mlp": 1.02002978, "epoch": 0.08140688411243048, "flos": 64322115816960.0, "grad_norm": 0.8462566111298802, "language_loss": 0.59757125, "learning_rate": 3.935120651898694e-06, "loss": 0.61844718, "num_input_tokens_seen": 28833390, "router_z_loss_clip": 0.03295898, "router_z_loss_mlp": 0.49609375, "step": 1354, "time_per_iteration": 3.238065719604492 }, { "auxiliary_loss_clip": 0.01214544, "auxiliary_loss_mlp": 0.0106152, "balance_loss_clip": 1.03766584, "balance_loss_mlp": 1.05836415, "epoch": 0.08146700736509845, "flos": 22491858862080.0, "grad_norm": 1.751615355955736, "language_loss": 0.82684368, "learning_rate": 3.935025164302257e-06, "loss": 0.84960431, "num_input_tokens_seen": 28852430, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.2890625, "step": 1355, "time_per_iteration": 2.6650969982147217 }, { "auxiliary_loss_clip": 0.01199742, "auxiliary_loss_mlp": 0.01064396, "balance_loss_clip": 1.03743088, "balance_loss_mlp": 1.05783105, "epoch": 0.08152713061776642, "flos": 20447823513600.0, "grad_norm": 2.115697702545649, "language_loss": 0.71109688, "learning_rate": 3.934929607649749e-06, "loss": 0.73373818, "num_input_tokens_seen": 28870685, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.328125, "step": 1356, "time_per_iteration": 2.672978162765503 }, { "auxiliary_loss_clip": 0.01198866, "auxiliary_loss_mlp": 0.01061305, "balance_loss_clip": 1.03627145, "balance_loss_mlp": 1.05611038, "epoch": 0.08158725387043439, "flos": 23550218121600.0, "grad_norm": 2.522191699582823, "language_loss": 0.70500761, "learning_rate": 3.934833981944582e-06, "loss": 0.72760934, "num_input_tokens_seen": 28889860, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.3359375, "step": 1357, "time_per_iteration": 2.688230276107788 }, { "auxiliary_loss_clip": 0.01203345, "auxiliary_loss_mlp": 0.01057747, "balance_loss_clip": 1.034024, "balance_loss_mlp": 1.059412, "epoch": 0.08164737712310235, "flos": 22017335264640.0, "grad_norm": 2.8787808628505154, "language_loss": 0.8419379, "learning_rate": 3.934738287190166e-06, "loss": 0.86454886, "num_input_tokens_seen": 28905865, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.34375, "step": 1358, "time_per_iteration": 2.702517509460449 }, { "auxiliary_loss_clip": 0.01210805, "auxiliary_loss_mlp": 0.01058809, "balance_loss_clip": 1.03271437, "balance_loss_mlp": 1.0585109, "epoch": 0.08170750037577033, "flos": 23367827836800.0, "grad_norm": 1.9834759285724108, "language_loss": 1.0231812, "learning_rate": 3.934642523389917e-06, "loss": 1.04587722, "num_input_tokens_seen": 28925250, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.34375, "step": 1359, "time_per_iteration": 2.7342400550842285 }, { "auxiliary_loss_clip": 0.01233842, "auxiliary_loss_mlp": 0.01057731, "balance_loss_clip": 1.03275681, "balance_loss_mlp": 1.05589294, "epoch": 0.0817676236284383, "flos": 28397978490240.0, "grad_norm": 1.8466829274378291, "language_loss": 0.83431411, "learning_rate": 3.934546690547253e-06, "loss": 0.85722983, "num_input_tokens_seen": 28943445, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.3203125, "step": 1360, "time_per_iteration": 2.7836694717407227 }, { "auxiliary_loss_clip": 0.01190971, "auxiliary_loss_mlp": 0.01062942, "balance_loss_clip": 1.03846812, "balance_loss_mlp": 1.05665851, "epoch": 0.08182774688110626, "flos": 19208905562880.0, "grad_norm": 1.9870597292458951, "language_loss": 0.71912026, "learning_rate": 3.934450788665594e-06, "loss": 0.7416594, "num_input_tokens_seen": 28962695, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.34375, "step": 1361, "time_per_iteration": 2.7459380626678467 }, { "auxiliary_loss_clip": 0.01201253, "auxiliary_loss_mlp": 0.01062404, "balance_loss_clip": 1.03590345, "balance_loss_mlp": 1.05327988, "epoch": 0.08188787013377424, "flos": 22784099915520.0, "grad_norm": 2.102111058085073, "language_loss": 0.76622307, "learning_rate": 3.934354817748363e-06, "loss": 0.78885972, "num_input_tokens_seen": 28982120, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.296875, "step": 1362, "time_per_iteration": 2.6558966636657715 }, { "auxiliary_loss_clip": 0.01190072, "auxiliary_loss_mlp": 0.01054983, "balance_loss_clip": 1.03159404, "balance_loss_mlp": 1.05907118, "epoch": 0.08194799338644221, "flos": 16468095214080.0, "grad_norm": 2.1060680141403294, "language_loss": 0.72352839, "learning_rate": 3.934258777798984e-06, "loss": 0.74597889, "num_input_tokens_seen": 28998100, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.3125, "step": 1363, "time_per_iteration": 2.5726537704467773 }, { "auxiliary_loss_clip": 0.01190942, "auxiliary_loss_mlp": 0.01068238, "balance_loss_clip": 1.04183292, "balance_loss_mlp": 1.05996215, "epoch": 0.08200811663911017, "flos": 23913633974400.0, "grad_norm": 1.9973370037373122, "language_loss": 0.76819056, "learning_rate": 3.934162668820884e-06, "loss": 0.79078239, "num_input_tokens_seen": 29017095, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.3125, "step": 1364, "time_per_iteration": 2.6079413890838623 }, { "auxiliary_loss_clip": 0.01228791, "auxiliary_loss_mlp": 0.01065146, "balance_loss_clip": 1.04009974, "balance_loss_mlp": 1.0589335, "epoch": 0.08206823989177814, "flos": 17896550256000.0, "grad_norm": 3.0799704526186447, "language_loss": 0.81634951, "learning_rate": 3.934066490817495e-06, "loss": 0.83928889, "num_input_tokens_seen": 29037240, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.328125, "step": 1365, "time_per_iteration": 2.570411443710327 }, { "auxiliary_loss_clip": 0.012001, "auxiliary_loss_mlp": 0.0106108, "balance_loss_clip": 1.03726196, "balance_loss_mlp": 1.06132793, "epoch": 0.08212836314444612, "flos": 22088186841600.0, "grad_norm": 1.571204041110151, "language_loss": 0.8207407, "learning_rate": 3.9339702437922465e-06, "loss": 0.8433525, "num_input_tokens_seen": 29056250, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.296875, "step": 1366, "time_per_iteration": 2.510232448577881 }, { "auxiliary_loss_clip": 0.01215812, "auxiliary_loss_mlp": 0.01063572, "balance_loss_clip": 1.04058862, "balance_loss_mlp": 1.05604541, "epoch": 0.08218848639711408, "flos": 17597485618560.0, "grad_norm": 1.6782208024856948, "language_loss": 0.81540871, "learning_rate": 3.933873927748575e-06, "loss": 0.8382026, "num_input_tokens_seen": 29073380, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.3203125, "step": 1367, "time_per_iteration": 2.567965030670166 }, { "auxiliary_loss_clip": 0.01192815, "auxiliary_loss_mlp": 0.01061575, "balance_loss_clip": 1.036708, "balance_loss_mlp": 1.05816627, "epoch": 0.08224860964978205, "flos": 17857838373120.0, "grad_norm": 1.9838833208717128, "language_loss": 0.83554763, "learning_rate": 3.933777542689918e-06, "loss": 0.85809153, "num_input_tokens_seen": 29091330, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.34375, "step": 1368, "time_per_iteration": 2.5379419326782227 }, { "auxiliary_loss_clip": 0.01210265, "auxiliary_loss_mlp": 0.0105291, "balance_loss_clip": 1.02967644, "balance_loss_mlp": 1.05895829, "epoch": 0.08230873290245003, "flos": 25227533566080.0, "grad_norm": 1.6083120288214108, "language_loss": 0.80996799, "learning_rate": 3.933681088619715e-06, "loss": 0.83259976, "num_input_tokens_seen": 29110375, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.2421875, "step": 1369, "time_per_iteration": 2.7074978351593018 }, { "auxiliary_loss_clip": 0.01222819, "auxiliary_loss_mlp": 0.01304518, "balance_loss_clip": 1.03349924, "balance_loss_mlp": 1.0590632, "epoch": 0.08236885615511799, "flos": 31759935753600.0, "grad_norm": 2.0831433604257885, "language_loss": 0.74232602, "learning_rate": 3.933584565541407e-06, "loss": 0.76759946, "num_input_tokens_seen": 29129395, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.2734375, "step": 1370, "time_per_iteration": 2.751103401184082 }, { "auxiliary_loss_clip": 0.01227768, "auxiliary_loss_mlp": 0.01069071, "balance_loss_clip": 1.0439539, "balance_loss_mlp": 1.05936384, "epoch": 0.08242897940778596, "flos": 23185832601600.0, "grad_norm": 1.4494800730168742, "language_loss": 0.74413031, "learning_rate": 3.9334879734584405e-06, "loss": 0.76709878, "num_input_tokens_seen": 29148650, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.3203125, "step": 1371, "time_per_iteration": 2.7171828746795654 }, { "auxiliary_loss_clip": 0.01252653, "auxiliary_loss_mlp": 0.01063514, "balance_loss_clip": 1.03867066, "balance_loss_mlp": 1.05808485, "epoch": 0.08248910266045394, "flos": 34491480393600.0, "grad_norm": 2.471575310801341, "language_loss": 0.71110547, "learning_rate": 3.933391312374262e-06, "loss": 0.73426712, "num_input_tokens_seen": 29170785, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.3125, "step": 1372, "time_per_iteration": 2.8840537071228027 }, { "auxiliary_loss_clip": 0.01218129, "auxiliary_loss_mlp": 0.01055416, "balance_loss_clip": 1.03107309, "balance_loss_mlp": 1.05836606, "epoch": 0.0825492259131219, "flos": 13436228960640.0, "grad_norm": 2.275391196344018, "language_loss": 0.88072693, "learning_rate": 3.93329458229232e-06, "loss": 0.90346235, "num_input_tokens_seen": 29185210, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 1.3203125, "step": 1373, "time_per_iteration": 2.6570045948028564 }, { "auxiliary_loss_clip": 0.01217125, "auxiliary_loss_mlp": 0.01060738, "balance_loss_clip": 1.03665817, "balance_loss_mlp": 1.0589931, "epoch": 0.08260934916578987, "flos": 25812446636160.0, "grad_norm": 2.828698224193423, "language_loss": 0.82274514, "learning_rate": 3.933197783216068e-06, "loss": 0.84552377, "num_input_tokens_seen": 29205210, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.3046875, "step": 1374, "time_per_iteration": 2.7723023891448975 }, { "auxiliary_loss_clip": 0.01108135, "auxiliary_loss_mlp": 0.0101286, "balance_loss_clip": 1.00949848, "balance_loss_mlp": 1.02179384, "epoch": 0.08266947241845783, "flos": 63460009491840.0, "grad_norm": 0.825190771707823, "language_loss": 0.60630643, "learning_rate": 3.93310091514896e-06, "loss": 0.62751639, "num_input_tokens_seen": 29265350, "router_z_loss_clip": 0.03369141, "router_z_loss_mlp": 0.49609375, "step": 1375, "time_per_iteration": 3.243380546569824 }, { "auxiliary_loss_clip": 0.01088689, "auxiliary_loss_mlp": 0.01006248, "balance_loss_clip": 1.00293446, "balance_loss_mlp": 1.02058744, "epoch": 0.08272959567112581, "flos": 69993704568960.0, "grad_norm": 0.9117497104524456, "language_loss": 0.62195385, "learning_rate": 3.933003978094452e-06, "loss": 0.64290321, "num_input_tokens_seen": 29321475, "router_z_loss_clip": 0.03320312, "router_z_loss_mlp": 0.49609375, "step": 1376, "time_per_iteration": 3.175685405731201 }, { "auxiliary_loss_clip": 0.01231674, "auxiliary_loss_mlp": 0.01068079, "balance_loss_clip": 1.04353428, "balance_loss_mlp": 1.06218314, "epoch": 0.08278971892379378, "flos": 20413205781120.0, "grad_norm": 1.7861329137042588, "language_loss": 0.82272542, "learning_rate": 3.9329069720560045e-06, "loss": 0.84572291, "num_input_tokens_seen": 29341405, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.328125, "step": 1377, "time_per_iteration": 2.7203845977783203 }, { "auxiliary_loss_clip": 0.01205325, "auxiliary_loss_mlp": 0.01054905, "balance_loss_clip": 1.03201675, "balance_loss_mlp": 1.05753314, "epoch": 0.08284984217646174, "flos": 26250233598720.0, "grad_norm": 1.7943378925830848, "language_loss": 0.8464483, "learning_rate": 3.932809897037079e-06, "loss": 0.8690505, "num_input_tokens_seen": 29361955, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.296875, "step": 1378, "time_per_iteration": 2.679175853729248 }, { "auxiliary_loss_clip": 0.01195976, "auxiliary_loss_mlp": 0.01058789, "balance_loss_clip": 1.03432727, "balance_loss_mlp": 1.05633044, "epoch": 0.08290996542912972, "flos": 27194683852800.0, "grad_norm": 2.1209143212198796, "language_loss": 0.87387538, "learning_rate": 3.932712753041141e-06, "loss": 0.89642304, "num_input_tokens_seen": 29382395, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.3046875, "step": 1379, "time_per_iteration": 2.777778387069702 }, { "auxiliary_loss_clip": 0.0121383, "auxiliary_loss_mlp": 0.01061612, "balance_loss_clip": 1.03898644, "balance_loss_mlp": 1.05911374, "epoch": 0.08297008868179769, "flos": 38618191146240.0, "grad_norm": 2.569776466498695, "language_loss": 0.78518438, "learning_rate": 3.932615540071656e-06, "loss": 0.80793881, "num_input_tokens_seen": 29404460, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.265625, "step": 1380, "time_per_iteration": 2.8539037704467773 }, { "auxiliary_loss_clip": 0.01196933, "auxiliary_loss_mlp": 0.01060074, "balance_loss_clip": 1.0353502, "balance_loss_mlp": 1.06097233, "epoch": 0.08303021193446565, "flos": 19974736460160.0, "grad_norm": 2.1481597797231484, "language_loss": 0.86066139, "learning_rate": 3.932518258132094e-06, "loss": 0.88323146, "num_input_tokens_seen": 29422675, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 1.2734375, "step": 1381, "time_per_iteration": 2.7132058143615723 }, { "auxiliary_loss_clip": 0.01204642, "auxiliary_loss_mlp": 0.01309513, "balance_loss_clip": 1.036057, "balance_loss_mlp": 1.06099606, "epoch": 0.08309033518713363, "flos": 13662646341120.0, "grad_norm": 2.7061107156044613, "language_loss": 0.8784222, "learning_rate": 3.932420907225926e-06, "loss": 0.9035638, "num_input_tokens_seen": 29439840, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.34375, "step": 1382, "time_per_iteration": 2.5739898681640625 }, { "auxiliary_loss_clip": 0.01217928, "auxiliary_loss_mlp": 0.01058762, "balance_loss_clip": 1.03683972, "balance_loss_mlp": 1.06058431, "epoch": 0.0831504584398016, "flos": 17968551068160.0, "grad_norm": 2.316930858264656, "language_loss": 0.77317762, "learning_rate": 3.932323487356626e-06, "loss": 0.79594451, "num_input_tokens_seen": 29457360, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.296875, "step": 1383, "time_per_iteration": 2.7264225482940674 }, { "auxiliary_loss_clip": 0.01202153, "auxiliary_loss_mlp": 0.01057392, "balance_loss_clip": 1.03362179, "balance_loss_mlp": 1.05979013, "epoch": 0.08321058169246956, "flos": 22601386408320.0, "grad_norm": 2.41525678176774, "language_loss": 0.83067834, "learning_rate": 3.932225998527672e-06, "loss": 0.85327381, "num_input_tokens_seen": 29477040, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.328125, "step": 1384, "time_per_iteration": 2.619450807571411 }, { "auxiliary_loss_clip": 0.01233903, "auxiliary_loss_mlp": 0.01054073, "balance_loss_clip": 1.02818108, "balance_loss_mlp": 1.06173146, "epoch": 0.08327070494513754, "flos": 22850426378880.0, "grad_norm": 3.1457082044563376, "language_loss": 0.84801888, "learning_rate": 3.932128440742542e-06, "loss": 0.87089872, "num_input_tokens_seen": 29492010, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.3515625, "step": 1385, "time_per_iteration": 2.7696642875671387 }, { "auxiliary_loss_clip": 0.01222495, "auxiliary_loss_mlp": 0.01062102, "balance_loss_clip": 1.03785539, "balance_loss_mlp": 1.06127775, "epoch": 0.0833308281978055, "flos": 22782986593920.0, "grad_norm": 1.6839304403359994, "language_loss": 0.68301117, "learning_rate": 3.932030814004719e-06, "loss": 0.7058571, "num_input_tokens_seen": 29511850, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.34375, "step": 1386, "time_per_iteration": 2.672715663909912 }, { "auxiliary_loss_clip": 0.01194686, "auxiliary_loss_mlp": 0.01054073, "balance_loss_clip": 1.03000474, "balance_loss_mlp": 1.05470026, "epoch": 0.08339095145047347, "flos": 20812604083200.0, "grad_norm": 1.8471205687104186, "language_loss": 0.8150264, "learning_rate": 3.9319331183176844e-06, "loss": 0.83751404, "num_input_tokens_seen": 29531415, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.3125, "step": 1387, "time_per_iteration": 2.680706262588501 }, { "auxiliary_loss_clip": 0.01209853, "auxiliary_loss_mlp": 0.01066528, "balance_loss_clip": 1.04143405, "balance_loss_mlp": 1.05809259, "epoch": 0.08345107470314143, "flos": 18515326872960.0, "grad_norm": 2.3159802185228306, "language_loss": 0.77088094, "learning_rate": 3.931835353684927e-06, "loss": 0.79364479, "num_input_tokens_seen": 29549525, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.3359375, "step": 1388, "time_per_iteration": 5.584060430526733 }, { "auxiliary_loss_clip": 0.01232427, "auxiliary_loss_mlp": 0.01065662, "balance_loss_clip": 1.04061651, "balance_loss_mlp": 1.0597496, "epoch": 0.08351119795580941, "flos": 18807567926400.0, "grad_norm": 2.1008110619833507, "language_loss": 0.79058015, "learning_rate": 3.931737520109935e-06, "loss": 0.81356102, "num_input_tokens_seen": 29568705, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.265625, "step": 1389, "time_per_iteration": 4.054110288619995 }, { "auxiliary_loss_clip": 0.01212348, "auxiliary_loss_mlp": 0.01060686, "balance_loss_clip": 1.03720129, "balance_loss_mlp": 1.06130457, "epoch": 0.08357132120847738, "flos": 18441817689600.0, "grad_norm": 3.848141851396981, "language_loss": 0.8708837, "learning_rate": 3.931639617596201e-06, "loss": 0.89361399, "num_input_tokens_seen": 29585855, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.328125, "step": 1390, "time_per_iteration": 2.5285804271698 }, { "auxiliary_loss_clip": 0.01207103, "auxiliary_loss_mlp": 0.01058502, "balance_loss_clip": 1.03411245, "balance_loss_mlp": 1.05770957, "epoch": 0.08363144446114534, "flos": 25922333318400.0, "grad_norm": 1.6483149014374465, "language_loss": 0.86744332, "learning_rate": 3.931541646147217e-06, "loss": 0.89009929, "num_input_tokens_seen": 29607280, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.3125, "step": 1391, "time_per_iteration": 2.6551315784454346 }, { "auxiliary_loss_clip": 0.01233993, "auxiliary_loss_mlp": 0.01076626, "balance_loss_clip": 1.0527482, "balance_loss_mlp": 1.06146848, "epoch": 0.08369156771381332, "flos": 18041306065920.0, "grad_norm": 13.650028524469363, "language_loss": 0.87562883, "learning_rate": 3.93144360576648e-06, "loss": 0.89873505, "num_input_tokens_seen": 29624130, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.359375, "step": 1392, "time_per_iteration": 4.183831453323364 }, { "auxiliary_loss_clip": 0.01201196, "auxiliary_loss_mlp": 0.01067728, "balance_loss_clip": 1.04399395, "balance_loss_mlp": 1.05892313, "epoch": 0.08375169096648129, "flos": 22675111073280.0, "grad_norm": 2.2056897240887072, "language_loss": 0.80103576, "learning_rate": 3.931345496457489e-06, "loss": 0.82372499, "num_input_tokens_seen": 29643210, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.328125, "step": 1393, "time_per_iteration": 2.6845829486846924 }, { "auxiliary_loss_clip": 0.01193289, "auxiliary_loss_mlp": 0.01059491, "balance_loss_clip": 1.03604269, "balance_loss_mlp": 1.06425679, "epoch": 0.08381181421914925, "flos": 26103215232000.0, "grad_norm": 1.719759389171028, "language_loss": 0.84791166, "learning_rate": 3.931247318223746e-06, "loss": 0.87043941, "num_input_tokens_seen": 29663920, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.2890625, "step": 1394, "time_per_iteration": 2.6590018272399902 }, { "auxiliary_loss_clip": 0.01229919, "auxiliary_loss_mlp": 0.01054391, "balance_loss_clip": 1.03065693, "balance_loss_mlp": 1.06166553, "epoch": 0.08387193747181723, "flos": 20629782835200.0, "grad_norm": 2.1101459391327624, "language_loss": 0.82667011, "learning_rate": 3.931149071068753e-06, "loss": 0.84951317, "num_input_tokens_seen": 29683825, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.3125, "step": 1395, "time_per_iteration": 2.6723222732543945 }, { "auxiliary_loss_clip": 0.01204465, "auxiliary_loss_mlp": 0.01314522, "balance_loss_clip": 1.03959751, "balance_loss_mlp": 1.0619998, "epoch": 0.0839320607244852, "flos": 13443196199040.0, "grad_norm": 2.003890352140024, "language_loss": 0.82592702, "learning_rate": 3.931050754996018e-06, "loss": 0.8511169, "num_input_tokens_seen": 29698775, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.3359375, "step": 1396, "time_per_iteration": 2.7726709842681885 }, { "auxiliary_loss_clip": 0.01202026, "auxiliary_loss_mlp": 0.01063354, "balance_loss_clip": 1.03790259, "balance_loss_mlp": 1.06412649, "epoch": 0.08399218397715316, "flos": 23477247642240.0, "grad_norm": 1.958605210892923, "language_loss": 0.76214689, "learning_rate": 3.930952370009048e-06, "loss": 0.78480065, "num_input_tokens_seen": 29719430, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.2890625, "step": 1397, "time_per_iteration": 2.7571945190429688 }, { "auxiliary_loss_clip": 0.01221681, "auxiliary_loss_mlp": 0.01051302, "balance_loss_clip": 1.02679205, "balance_loss_mlp": 1.06343162, "epoch": 0.08405230722982113, "flos": 25920717206400.0, "grad_norm": 2.504082297811024, "language_loss": 0.78134882, "learning_rate": 3.930853916111355e-06, "loss": 0.80407864, "num_input_tokens_seen": 29739685, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.3046875, "step": 1398, "time_per_iteration": 2.757019519805908 }, { "auxiliary_loss_clip": 0.01188764, "auxiliary_loss_mlp": 0.01055224, "balance_loss_clip": 1.03177547, "balance_loss_mlp": 1.06096017, "epoch": 0.0841124304824891, "flos": 17967437746560.0, "grad_norm": 2.022924500825401, "language_loss": 0.94766486, "learning_rate": 3.930755393306453e-06, "loss": 0.97010481, "num_input_tokens_seen": 29756165, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.28125, "step": 1399, "time_per_iteration": 2.7320423126220703 }, { "auxiliary_loss_clip": 0.01205792, "auxiliary_loss_mlp": 0.01059209, "balance_loss_clip": 1.03423429, "balance_loss_mlp": 1.06184518, "epoch": 0.08417255373515707, "flos": 25629661301760.0, "grad_norm": 1.751505846641596, "language_loss": 0.81460094, "learning_rate": 3.930656801597857e-06, "loss": 0.83725095, "num_input_tokens_seen": 29776425, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.34375, "step": 1400, "time_per_iteration": 2.754155158996582 }, { "auxiliary_loss_clip": 0.01212613, "auxiliary_loss_mlp": 0.01063254, "balance_loss_clip": 1.03917325, "balance_loss_mlp": 1.06236339, "epoch": 0.08423267698782504, "flos": 26249730808320.0, "grad_norm": 2.5639454016382874, "language_loss": 0.86600322, "learning_rate": 3.930558140989087e-06, "loss": 0.88876188, "num_input_tokens_seen": 29796440, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.3203125, "step": 1401, "time_per_iteration": 2.7952847480773926 }, { "auxiliary_loss_clip": 0.01193502, "auxiliary_loss_mlp": 0.01063746, "balance_loss_clip": 1.03856862, "balance_loss_mlp": 1.06071138, "epoch": 0.08429280024049302, "flos": 20119707751680.0, "grad_norm": 2.6592625249690447, "language_loss": 0.87315214, "learning_rate": 3.930459411483662e-06, "loss": 0.89572459, "num_input_tokens_seen": 29814755, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.328125, "step": 1402, "time_per_iteration": 2.604421615600586 }, { "auxiliary_loss_clip": 0.01216235, "auxiliary_loss_mlp": 0.01309492, "balance_loss_clip": 1.03814888, "balance_loss_mlp": 1.05985367, "epoch": 0.08435292349316098, "flos": 42924526836480.0, "grad_norm": 2.4136637617083507, "language_loss": 0.89004314, "learning_rate": 3.930360613085106e-06, "loss": 0.91530037, "num_input_tokens_seen": 29834785, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.2890625, "step": 1403, "time_per_iteration": 2.8547284603118896 }, { "auxiliary_loss_clip": 0.01212799, "auxiliary_loss_mlp": 0.01060391, "balance_loss_clip": 1.03491616, "balance_loss_mlp": 1.0608995, "epoch": 0.08441304674582895, "flos": 22857285876480.0, "grad_norm": 1.8181204598598195, "language_loss": 0.79961705, "learning_rate": 3.930261745796945e-06, "loss": 0.82234889, "num_input_tokens_seen": 29854695, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.3359375, "step": 1404, "time_per_iteration": 2.6070621013641357 }, { "auxiliary_loss_clip": 0.01221839, "auxiliary_loss_mlp": 0.01075762, "balance_loss_clip": 1.05077565, "balance_loss_mlp": 1.06753743, "epoch": 0.08447316999849692, "flos": 18697501676160.0, "grad_norm": 2.122169351384032, "language_loss": 0.83426583, "learning_rate": 3.930162809622709e-06, "loss": 0.85724181, "num_input_tokens_seen": 29872180, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.359375, "step": 1405, "time_per_iteration": 2.5989725589752197 }, { "auxiliary_loss_clip": 0.01203539, "auxiliary_loss_mlp": 0.01056609, "balance_loss_clip": 1.03379226, "balance_loss_mlp": 1.062253, "epoch": 0.08453329325116489, "flos": 25483971738240.0, "grad_norm": 1.8831686498672298, "language_loss": 0.80020833, "learning_rate": 3.930063804565927e-06, "loss": 0.82280982, "num_input_tokens_seen": 29893205, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.3203125, "step": 1406, "time_per_iteration": 2.611203908920288 }, { "auxiliary_loss_clip": 0.01215515, "auxiliary_loss_mlp": 0.0106488, "balance_loss_clip": 1.03995323, "balance_loss_mlp": 1.06588113, "epoch": 0.08459341650383286, "flos": 20920048640640.0, "grad_norm": 1.774907421917443, "language_loss": 0.79661596, "learning_rate": 3.929964730630132e-06, "loss": 0.81941986, "num_input_tokens_seen": 29911970, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.3125, "step": 1407, "time_per_iteration": 2.676945924758911 }, { "auxiliary_loss_clip": 0.01202795, "auxiliary_loss_mlp": 0.01057804, "balance_loss_clip": 1.03440356, "balance_loss_mlp": 1.06321263, "epoch": 0.08465353975650082, "flos": 13043079624960.0, "grad_norm": 3.6697700511082685, "language_loss": 0.91689914, "learning_rate": 3.92986558781886e-06, "loss": 0.93950516, "num_input_tokens_seen": 29929925, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.3046875, "step": 1408, "time_per_iteration": 2.6187543869018555 }, { "auxiliary_loss_clip": 0.01101235, "auxiliary_loss_mlp": 0.01016947, "balance_loss_clip": 1.01396704, "balance_loss_mlp": 1.03202796, "epoch": 0.0847136630091688, "flos": 60877422552960.0, "grad_norm": 0.8776638848433693, "language_loss": 0.61801088, "learning_rate": 3.92976637613565e-06, "loss": 0.6391927, "num_input_tokens_seen": 29985950, "router_z_loss_clip": 0.02978516, "router_z_loss_mlp": 0.5078125, "step": 1409, "time_per_iteration": 3.2267258167266846 }, { "auxiliary_loss_clip": 0.01190438, "auxiliary_loss_mlp": 0.01059854, "balance_loss_clip": 1.03653622, "balance_loss_mlp": 1.06534743, "epoch": 0.08477378626183676, "flos": 22046530043520.0, "grad_norm": 1.7514950578568893, "language_loss": 0.86394584, "learning_rate": 3.9296670955840415e-06, "loss": 0.88644874, "num_input_tokens_seen": 30004330, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.25, "step": 1410, "time_per_iteration": 2.6950271129608154 }, { "auxiliary_loss_clip": 0.01205817, "auxiliary_loss_mlp": 0.01055648, "balance_loss_clip": 1.0296489, "balance_loss_mlp": 1.06528687, "epoch": 0.08483390951450473, "flos": 16690059308160.0, "grad_norm": 1.9753330716524058, "language_loss": 0.74068701, "learning_rate": 3.929567746167578e-06, "loss": 0.76330161, "num_input_tokens_seen": 30022555, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.3125, "step": 1411, "time_per_iteration": 2.6160073280334473 }, { "auxiliary_loss_clip": 0.01078245, "auxiliary_loss_mlp": 0.01003995, "balance_loss_clip": 1.00106227, "balance_loss_mlp": 1.02799439, "epoch": 0.08489403276717271, "flos": 51584640082560.0, "grad_norm": 0.9051162891172495, "language_loss": 0.56773758, "learning_rate": 3.929468327889805e-06, "loss": 0.58855993, "num_input_tokens_seen": 30077220, "router_z_loss_clip": 0.02929688, "router_z_loss_mlp": 0.5, "step": 1412, "time_per_iteration": 3.1550891399383545 }, { "auxiliary_loss_clip": 0.01208333, "auxiliary_loss_mlp": 0.01066274, "balance_loss_clip": 1.04299235, "balance_loss_mlp": 1.06238604, "epoch": 0.08495415601984067, "flos": 17092330698240.0, "grad_norm": 36.3836412701779, "language_loss": 0.88799691, "learning_rate": 3.9293688407542715e-06, "loss": 0.91074306, "num_input_tokens_seen": 30094600, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.28125, "step": 1413, "time_per_iteration": 2.6331286430358887 }, { "auxiliary_loss_clip": 0.01214324, "auxiliary_loss_mlp": 0.01052698, "balance_loss_clip": 1.02907085, "balance_loss_mlp": 1.06722426, "epoch": 0.08501427927250864, "flos": 23148413608320.0, "grad_norm": 1.8882799833261579, "language_loss": 0.87802851, "learning_rate": 3.929269284764526e-06, "loss": 0.90069866, "num_input_tokens_seen": 30114475, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.2890625, "step": 1414, "time_per_iteration": 2.6649093627929688 }, { "auxiliary_loss_clip": 0.01213068, "auxiliary_loss_mlp": 0.01063499, "balance_loss_clip": 1.04120672, "balance_loss_mlp": 1.06471205, "epoch": 0.08507440252517662, "flos": 19063467394560.0, "grad_norm": 1.8061191776484324, "language_loss": 0.77182114, "learning_rate": 3.929169659924123e-06, "loss": 0.79458678, "num_input_tokens_seen": 30133350, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.296875, "step": 1415, "time_per_iteration": 2.5815818309783936 }, { "auxiliary_loss_clip": 0.01200317, "auxiliary_loss_mlp": 0.01064533, "balance_loss_clip": 1.04257488, "balance_loss_mlp": 1.06467533, "epoch": 0.08513452577784458, "flos": 60182296600320.0, "grad_norm": 3.3276566756067174, "language_loss": 0.70484591, "learning_rate": 3.929069966236617e-06, "loss": 0.72749442, "num_input_tokens_seen": 30159005, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.265625, "step": 1416, "time_per_iteration": 2.9982097148895264 }, { "auxiliary_loss_clip": 0.01215836, "auxiliary_loss_mlp": 0.01066426, "balance_loss_clip": 1.04157066, "balance_loss_mlp": 1.06452394, "epoch": 0.08519464903051255, "flos": 27308485117440.0, "grad_norm": 1.7665882720032866, "language_loss": 0.75075257, "learning_rate": 3.928970203705565e-06, "loss": 0.77357519, "num_input_tokens_seen": 30179450, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.328125, "step": 1417, "time_per_iteration": 2.6596670150756836 }, { "auxiliary_loss_clip": 0.01209238, "auxiliary_loss_mlp": 0.0105636, "balance_loss_clip": 1.03298295, "balance_loss_mlp": 1.06312621, "epoch": 0.08525477228318051, "flos": 20266438809600.0, "grad_norm": 2.6494053975555527, "language_loss": 0.82216525, "learning_rate": 3.92887037233453e-06, "loss": 0.84482121, "num_input_tokens_seen": 30197235, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.28125, "step": 1418, "time_per_iteration": 2.723775625228882 }, { "auxiliary_loss_clip": 0.01099942, "auxiliary_loss_mlp": 0.01018228, "balance_loss_clip": 1.01522398, "balance_loss_mlp": 1.02347279, "epoch": 0.08531489553584849, "flos": 67615017183360.0, "grad_norm": 0.8627505024453171, "language_loss": 0.56612706, "learning_rate": 3.928770472127073e-06, "loss": 0.58730876, "num_input_tokens_seen": 30257410, "router_z_loss_clip": 0.0300293, "router_z_loss_mlp": 0.49023438, "step": 1419, "time_per_iteration": 3.2404749393463135 }, { "auxiliary_loss_clip": 0.01192397, "auxiliary_loss_mlp": 0.01059676, "balance_loss_clip": 1.03628683, "balance_loss_mlp": 1.06242192, "epoch": 0.08537501878851646, "flos": 27526965592320.0, "grad_norm": 2.4429738898733735, "language_loss": 0.69630468, "learning_rate": 3.928670503086758e-06, "loss": 0.7188254, "num_input_tokens_seen": 30277865, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.296875, "step": 1420, "time_per_iteration": 2.7171332836151123 }, { "auxiliary_loss_clip": 0.01215208, "auxiliary_loss_mlp": 0.01055824, "balance_loss_clip": 1.03255439, "balance_loss_mlp": 1.06024694, "epoch": 0.08543514204118442, "flos": 22784243569920.0, "grad_norm": 1.5750672390704503, "language_loss": 0.87882853, "learning_rate": 3.9285704652171545e-06, "loss": 0.90153885, "num_input_tokens_seen": 30298545, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.2734375, "step": 1421, "time_per_iteration": 2.7418770790100098 }, { "auxiliary_loss_clip": 0.01067598, "auxiliary_loss_mlp": 0.0100843, "balance_loss_clip": 1.00547409, "balance_loss_mlp": 1.01905394, "epoch": 0.0854952652938524, "flos": 60990721027200.0, "grad_norm": 0.7986195624066585, "language_loss": 0.63544893, "learning_rate": 3.9284703585218324e-06, "loss": 0.65620923, "num_input_tokens_seen": 30361725, "router_z_loss_clip": 0.02954102, "router_z_loss_mlp": 0.484375, "step": 1422, "time_per_iteration": 3.1269500255584717 }, { "auxiliary_loss_clip": 0.01212282, "auxiliary_loss_mlp": 0.010608, "balance_loss_clip": 1.03750658, "balance_loss_mlp": 1.06153262, "epoch": 0.08555538854652037, "flos": 28038046256640.0, "grad_norm": 1.9234138429729135, "language_loss": 0.83188128, "learning_rate": 3.928370183004363e-06, "loss": 0.85461205, "num_input_tokens_seen": 30382180, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.234375, "step": 1423, "time_per_iteration": 2.685539722442627 }, { "auxiliary_loss_clip": 0.01219426, "auxiliary_loss_mlp": 0.01063985, "balance_loss_clip": 1.04118037, "balance_loss_mlp": 1.0627588, "epoch": 0.08561551179918833, "flos": 23509279595520.0, "grad_norm": 1.6045677732999655, "language_loss": 0.74968326, "learning_rate": 3.9282699386683236e-06, "loss": 0.77251738, "num_input_tokens_seen": 30402980, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.296875, "step": 1424, "time_per_iteration": 2.843622922897339 }, { "auxiliary_loss_clip": 0.01225325, "auxiliary_loss_mlp": 0.0105821, "balance_loss_clip": 1.03526258, "balance_loss_mlp": 1.06114721, "epoch": 0.08567563505185631, "flos": 17926930183680.0, "grad_norm": 1.9916420665008774, "language_loss": 0.75776231, "learning_rate": 3.928169625517289e-06, "loss": 0.78059769, "num_input_tokens_seen": 30420800, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.28125, "step": 1425, "time_per_iteration": 2.8124589920043945 }, { "auxiliary_loss_clip": 0.01185679, "auxiliary_loss_mlp": 0.01052354, "balance_loss_clip": 1.0300858, "balance_loss_mlp": 1.06041348, "epoch": 0.08573575830452428, "flos": 19719519350400.0, "grad_norm": 2.225528772663269, "language_loss": 0.92795551, "learning_rate": 3.9280692435548405e-06, "loss": 0.9503358, "num_input_tokens_seen": 30439620, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.25, "step": 1426, "time_per_iteration": 2.7257068157196045 }, { "auxiliary_loss_clip": 0.01243183, "auxiliary_loss_mlp": 0.01067159, "balance_loss_clip": 1.04119468, "balance_loss_mlp": 1.06548023, "epoch": 0.08579588155719224, "flos": 17931563038080.0, "grad_norm": 2.220805529728573, "language_loss": 0.755481, "learning_rate": 3.927968792784561e-06, "loss": 0.77858448, "num_input_tokens_seen": 30457300, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.3203125, "step": 1427, "time_per_iteration": 2.709411859512329 }, { "auxiliary_loss_clip": 0.01187433, "auxiliary_loss_mlp": 0.01058094, "balance_loss_clip": 1.03603983, "balance_loss_mlp": 1.06102216, "epoch": 0.08585600480986022, "flos": 16033324993920.0, "grad_norm": 2.389969834999752, "language_loss": 0.81589681, "learning_rate": 3.927868273210033e-06, "loss": 0.83835208, "num_input_tokens_seen": 30471580, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.265625, "step": 1428, "time_per_iteration": 2.661634683609009 }, { "auxiliary_loss_clip": 0.01201041, "auxiliary_loss_mlp": 0.01062561, "balance_loss_clip": 1.03876638, "balance_loss_mlp": 1.06079638, "epoch": 0.08591612806252819, "flos": 28657433404800.0, "grad_norm": 1.9482339809934952, "language_loss": 0.79644901, "learning_rate": 3.927767684834847e-06, "loss": 0.81908506, "num_input_tokens_seen": 30492720, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.3125, "step": 1429, "time_per_iteration": 2.755324125289917 }, { "auxiliary_loss_clip": 0.01211279, "auxiliary_loss_mlp": 0.01066074, "balance_loss_clip": 1.04225564, "balance_loss_mlp": 1.06409812, "epoch": 0.08597625131519615, "flos": 20959119659520.0, "grad_norm": 1.915208930533756, "language_loss": 0.87904745, "learning_rate": 3.9276670276625894e-06, "loss": 0.90182096, "num_input_tokens_seen": 30509535, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.2890625, "step": 1430, "time_per_iteration": 5.7765398025512695 }, { "auxiliary_loss_clip": 0.01216673, "auxiliary_loss_mlp": 0.01070349, "balance_loss_clip": 1.04753256, "balance_loss_mlp": 1.06346297, "epoch": 0.08603637456786412, "flos": 23256360956160.0, "grad_norm": 1.672958322692802, "language_loss": 0.81869507, "learning_rate": 3.927566301696856e-06, "loss": 0.84156537, "num_input_tokens_seen": 30529490, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.2578125, "step": 1431, "time_per_iteration": 4.0598227977752686 }, { "auxiliary_loss_clip": 0.01218295, "auxiliary_loss_mlp": 0.01060225, "balance_loss_clip": 1.03666973, "balance_loss_mlp": 1.06325257, "epoch": 0.0860964978205321, "flos": 28694170039680.0, "grad_norm": 1.5821721611077566, "language_loss": 0.77405691, "learning_rate": 3.927465506941238e-06, "loss": 0.7968421, "num_input_tokens_seen": 30550205, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.28125, "step": 1432, "time_per_iteration": 2.9066784381866455 }, { "auxiliary_loss_clip": 0.01206515, "auxiliary_loss_mlp": 0.01060356, "balance_loss_clip": 1.0357039, "balance_loss_mlp": 1.05929041, "epoch": 0.08615662107320006, "flos": 19318397195520.0, "grad_norm": 2.194301286518109, "language_loss": 0.73307234, "learning_rate": 3.927364643399335e-06, "loss": 0.755741, "num_input_tokens_seen": 30568830, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.2890625, "step": 1433, "time_per_iteration": 2.6547093391418457 }, { "auxiliary_loss_clip": 0.01220023, "auxiliary_loss_mlp": 0.01309464, "balance_loss_clip": 1.03468001, "balance_loss_mlp": 1.06241679, "epoch": 0.08621674432586802, "flos": 15851688894720.0, "grad_norm": 3.0283226935925307, "language_loss": 0.85745835, "learning_rate": 3.927263711074745e-06, "loss": 0.88275325, "num_input_tokens_seen": 30585730, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.296875, "step": 1434, "time_per_iteration": 4.849929094314575 }, { "auxiliary_loss_clip": 0.01205927, "auxiliary_loss_mlp": 0.01059655, "balance_loss_clip": 1.03636205, "balance_loss_mlp": 1.05928206, "epoch": 0.086276867578536, "flos": 14100648785280.0, "grad_norm": 2.288980069550643, "language_loss": 0.78572035, "learning_rate": 3.927162709971072e-06, "loss": 0.80837619, "num_input_tokens_seen": 30603180, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.28125, "step": 1435, "time_per_iteration": 2.6316092014312744 }, { "auxiliary_loss_clip": 0.01094694, "auxiliary_loss_mlp": 0.01015364, "balance_loss_clip": 1.01262212, "balance_loss_mlp": 1.01949239, "epoch": 0.08633699083120397, "flos": 70184857772160.0, "grad_norm": 0.8935054295938287, "language_loss": 0.57939065, "learning_rate": 3.927061640091918e-06, "loss": 0.60049129, "num_input_tokens_seen": 30668895, "router_z_loss_clip": 0.02746582, "router_z_loss_mlp": 0.4765625, "step": 1436, "time_per_iteration": 3.296766996383667 }, { "auxiliary_loss_clip": 0.01199735, "auxiliary_loss_mlp": 0.01052204, "balance_loss_clip": 1.02876711, "balance_loss_mlp": 1.06050932, "epoch": 0.08639711408387193, "flos": 30298874140800.0, "grad_norm": 2.3482435499464107, "language_loss": 0.69104981, "learning_rate": 3.926960501440891e-06, "loss": 0.71356916, "num_input_tokens_seen": 30688955, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.296875, "step": 1437, "time_per_iteration": 2.765314817428589 }, { "auxiliary_loss_clip": 0.01235094, "auxiliary_loss_mlp": 0.01051998, "balance_loss_clip": 1.02906203, "balance_loss_mlp": 1.06064844, "epoch": 0.08645723733653991, "flos": 20297680663680.0, "grad_norm": 2.683085500871002, "language_loss": 0.72402418, "learning_rate": 3.9268592940216014e-06, "loss": 0.74689519, "num_input_tokens_seen": 30706095, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.2890625, "step": 1438, "time_per_iteration": 2.7724132537841797 }, { "auxiliary_loss_clip": 0.01197409, "auxiliary_loss_mlp": 0.01056176, "balance_loss_clip": 1.03253675, "balance_loss_mlp": 1.06209862, "epoch": 0.08651736058920788, "flos": 32890583134080.0, "grad_norm": 1.7206222494723944, "language_loss": 0.63702339, "learning_rate": 3.9267580178376596e-06, "loss": 0.65955925, "num_input_tokens_seen": 30729025, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.265625, "step": 1439, "time_per_iteration": 2.7307186126708984 }, { "auxiliary_loss_clip": 0.01217846, "auxiliary_loss_mlp": 0.01058554, "balance_loss_clip": 1.03387785, "balance_loss_mlp": 1.06297743, "epoch": 0.08657748384187584, "flos": 22637368857600.0, "grad_norm": 3.2567000505706454, "language_loss": 0.87012351, "learning_rate": 3.92665667289268e-06, "loss": 0.89288759, "num_input_tokens_seen": 30746155, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.2734375, "step": 1440, "time_per_iteration": 2.780843734741211 }, { "auxiliary_loss_clip": 0.0122131, "auxiliary_loss_mlp": 0.01314357, "balance_loss_clip": 1.04083323, "balance_loss_mlp": 1.06119752, "epoch": 0.08663760709454381, "flos": 23658380951040.0, "grad_norm": 2.0478678435333215, "language_loss": 0.83313799, "learning_rate": 3.92655525919028e-06, "loss": 0.85849464, "num_input_tokens_seen": 30761410, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.328125, "step": 1441, "time_per_iteration": 2.6429452896118164 }, { "auxiliary_loss_clip": 0.01064773, "auxiliary_loss_mlp": 0.01002967, "balance_loss_clip": 0.99974805, "balance_loss_mlp": 1.01651919, "epoch": 0.08669773034721179, "flos": 62686564911360.0, "grad_norm": 0.8411186921008502, "language_loss": 0.60476935, "learning_rate": 3.926453776734078e-06, "loss": 0.6254468, "num_input_tokens_seen": 30823010, "router_z_loss_clip": 0.03222656, "router_z_loss_mlp": 0.48046875, "step": 1442, "time_per_iteration": 3.2911789417266846 }, { "auxiliary_loss_clip": 0.01209517, "auxiliary_loss_mlp": 0.01057082, "balance_loss_clip": 1.03314519, "balance_loss_mlp": 1.05973172, "epoch": 0.08675785359987975, "flos": 20667489137280.0, "grad_norm": 2.8182295766988092, "language_loss": 0.7825098, "learning_rate": 3.9263522255276965e-06, "loss": 0.80517578, "num_input_tokens_seen": 30841980, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.3125, "step": 1443, "time_per_iteration": 2.531569004058838 }, { "auxiliary_loss_clip": 0.01195881, "auxiliary_loss_mlp": 0.01052474, "balance_loss_clip": 1.02887034, "balance_loss_mlp": 1.05706501, "epoch": 0.08681797685254772, "flos": 26941118768640.0, "grad_norm": 3.002133049223849, "language_loss": 0.82154876, "learning_rate": 3.9262506055747596e-06, "loss": 0.84403229, "num_input_tokens_seen": 30863280, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.296875, "step": 1444, "time_per_iteration": 2.6435835361480713 }, { "auxiliary_loss_clip": 0.01190962, "auxiliary_loss_mlp": 0.01052418, "balance_loss_clip": 1.02875507, "balance_loss_mlp": 1.05969286, "epoch": 0.0868781001052157, "flos": 17712831168000.0, "grad_norm": 2.811760937761325, "language_loss": 0.86755759, "learning_rate": 3.926148916878893e-06, "loss": 0.88999134, "num_input_tokens_seen": 30881710, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.3125, "step": 1445, "time_per_iteration": 2.5323097705841064 }, { "auxiliary_loss_clip": 0.01201266, "auxiliary_loss_mlp": 0.01055662, "balance_loss_clip": 1.03365636, "balance_loss_mlp": 1.0633949, "epoch": 0.08693822335788366, "flos": 19896522595200.0, "grad_norm": 1.810985097031027, "language_loss": 0.81676978, "learning_rate": 3.926047159443727e-06, "loss": 0.83933914, "num_input_tokens_seen": 30900225, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.2890625, "step": 1446, "time_per_iteration": 2.6462481021881104 }, { "auxiliary_loss_clip": 0.01072899, "auxiliary_loss_mlp": 0.01002921, "balance_loss_clip": 0.99982148, "balance_loss_mlp": 1.01628327, "epoch": 0.08699834661055163, "flos": 67023747406080.0, "grad_norm": 0.7253504096064177, "language_loss": 0.54758763, "learning_rate": 3.925945333272891e-06, "loss": 0.56834584, "num_input_tokens_seen": 30959580, "router_z_loss_clip": 0.03100586, "router_z_loss_mlp": 0.47460938, "step": 1447, "time_per_iteration": 3.2045156955718994 }, { "auxiliary_loss_clip": 0.01211012, "auxiliary_loss_mlp": 0.01059459, "balance_loss_clip": 1.03667808, "balance_loss_mlp": 1.05868018, "epoch": 0.0870584698632196, "flos": 13480507451520.0, "grad_norm": 4.767855431023927, "language_loss": 0.84644926, "learning_rate": 3.925843438370021e-06, "loss": 0.86915398, "num_input_tokens_seen": 30976775, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.25, "step": 1448, "time_per_iteration": 2.5998802185058594 }, { "auxiliary_loss_clip": 0.0118898, "auxiliary_loss_mlp": 0.01062951, "balance_loss_clip": 1.0393002, "balance_loss_mlp": 1.05907357, "epoch": 0.08711859311588757, "flos": 16107013745280.0, "grad_norm": 3.4659019571422047, "language_loss": 0.80055052, "learning_rate": 3.925741474738752e-06, "loss": 0.82306987, "num_input_tokens_seen": 30990495, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.296875, "step": 1449, "time_per_iteration": 2.742360830307007 }, { "auxiliary_loss_clip": 0.0121361, "auxiliary_loss_mlp": 0.01055736, "balance_loss_clip": 1.03235936, "balance_loss_mlp": 1.06118345, "epoch": 0.08717871636855554, "flos": 38472357928320.0, "grad_norm": 1.6473108874638012, "language_loss": 0.706613, "learning_rate": 3.925639442382724e-06, "loss": 0.72930646, "num_input_tokens_seen": 31014080, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.25, "step": 1450, "time_per_iteration": 2.763427257537842 }, { "auxiliary_loss_clip": 0.01224618, "auxiliary_loss_mlp": 0.01058458, "balance_loss_clip": 1.03596318, "balance_loss_mlp": 1.06166983, "epoch": 0.0872388396212235, "flos": 17600574188160.0, "grad_norm": 1.905135992189888, "language_loss": 0.83518666, "learning_rate": 3.925537341305578e-06, "loss": 0.85801733, "num_input_tokens_seen": 31031210, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.265625, "step": 1451, "time_per_iteration": 2.6721415519714355 }, { "auxiliary_loss_clip": 0.01219915, "auxiliary_loss_mlp": 0.01305505, "balance_loss_clip": 1.03432512, "balance_loss_mlp": 1.05856013, "epoch": 0.08729896287389148, "flos": 25259385951360.0, "grad_norm": 2.341267908650562, "language_loss": 0.7491219, "learning_rate": 3.925435171510957e-06, "loss": 0.77437609, "num_input_tokens_seen": 31049710, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.25, "step": 1452, "time_per_iteration": 2.6455376148223877 }, { "auxiliary_loss_clip": 0.01204046, "auxiliary_loss_mlp": 0.01061373, "balance_loss_clip": 1.03724527, "balance_loss_mlp": 1.05694151, "epoch": 0.08735908612655945, "flos": 15632454234240.0, "grad_norm": 2.6512456897808843, "language_loss": 0.79226601, "learning_rate": 3.925332933002507e-06, "loss": 0.81492031, "num_input_tokens_seen": 31066160, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.2890625, "step": 1453, "time_per_iteration": 2.6307740211486816 }, { "auxiliary_loss_clip": 0.01201459, "auxiliary_loss_mlp": 0.01062164, "balance_loss_clip": 1.03803635, "balance_loss_mlp": 1.05776334, "epoch": 0.08741920937922741, "flos": 20339660684160.0, "grad_norm": 1.8166083832986673, "language_loss": 0.70764357, "learning_rate": 3.925230625783877e-06, "loss": 0.7302798, "num_input_tokens_seen": 31085270, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.25, "step": 1454, "time_per_iteration": 2.590811252593994 }, { "auxiliary_loss_clip": 0.01069765, "auxiliary_loss_mlp": 0.01011186, "balance_loss_clip": 1.00819361, "balance_loss_mlp": 1.01423764, "epoch": 0.08747933263189539, "flos": 62819795433600.0, "grad_norm": 0.7860483759698417, "language_loss": 0.58625674, "learning_rate": 3.925128249858719e-06, "loss": 0.60706627, "num_input_tokens_seen": 31148445, "router_z_loss_clip": 0.02990723, "router_z_loss_mlp": 0.46484375, "step": 1455, "time_per_iteration": 3.2177019119262695 }, { "auxiliary_loss_clip": 0.0118416, "auxiliary_loss_mlp": 0.01064454, "balance_loss_clip": 1.04105353, "balance_loss_mlp": 1.05718756, "epoch": 0.08753945588456336, "flos": 33035877648000.0, "grad_norm": 1.4960324740064503, "language_loss": 0.77362263, "learning_rate": 3.925025805230685e-06, "loss": 0.79610872, "num_input_tokens_seen": 31168770, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.265625, "step": 1456, "time_per_iteration": 2.645615339279175 }, { "auxiliary_loss_clip": 0.01221409, "auxiliary_loss_mlp": 0.01048745, "balance_loss_clip": 1.02579713, "balance_loss_mlp": 1.05776787, "epoch": 0.08759957913723132, "flos": 35547182046720.0, "grad_norm": 1.9744170346477063, "language_loss": 0.71972865, "learning_rate": 3.924923291903433e-06, "loss": 0.74243021, "num_input_tokens_seen": 31189270, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.265625, "step": 1457, "time_per_iteration": 2.766232490539551 }, { "auxiliary_loss_clip": 0.01189196, "auxiliary_loss_mlp": 0.01048736, "balance_loss_clip": 1.02638435, "balance_loss_mlp": 1.05622017, "epoch": 0.0876597023898993, "flos": 23911120022400.0, "grad_norm": 1.5059193805666178, "language_loss": 0.86219394, "learning_rate": 3.924820709880619e-06, "loss": 0.88457328, "num_input_tokens_seen": 31210385, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.234375, "step": 1458, "time_per_iteration": 2.6091468334198 }, { "auxiliary_loss_clip": 0.01207922, "auxiliary_loss_mlp": 0.01060123, "balance_loss_clip": 1.03736639, "balance_loss_mlp": 1.06308222, "epoch": 0.08771982564256726, "flos": 18114025150080.0, "grad_norm": 1.9069607456098066, "language_loss": 0.8029927, "learning_rate": 3.924718059165906e-06, "loss": 0.82567322, "num_input_tokens_seen": 31229745, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.265625, "step": 1459, "time_per_iteration": 2.6571338176727295 }, { "auxiliary_loss_clip": 0.01233392, "auxiliary_loss_mlp": 0.01052864, "balance_loss_clip": 1.02927196, "balance_loss_mlp": 1.05784917, "epoch": 0.08777994889523523, "flos": 17712005155200.0, "grad_norm": 2.548352444443196, "language_loss": 0.83863068, "learning_rate": 3.924615339762956e-06, "loss": 0.86149323, "num_input_tokens_seen": 31248280, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.3046875, "step": 1460, "time_per_iteration": 2.6834933757781982 }, { "auxiliary_loss_clip": 0.01190079, "auxiliary_loss_mlp": 0.01052923, "balance_loss_clip": 1.0312866, "balance_loss_mlp": 1.05633676, "epoch": 0.0878400721479032, "flos": 12819930382080.0, "grad_norm": 3.6207256911771717, "language_loss": 0.80206174, "learning_rate": 3.924512551675435e-06, "loss": 0.82449174, "num_input_tokens_seen": 31262190, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.25, "step": 1461, "time_per_iteration": 2.5834317207336426 }, { "auxiliary_loss_clip": 0.01207254, "auxiliary_loss_mlp": 0.01063266, "balance_loss_clip": 1.04110456, "balance_loss_mlp": 1.06043351, "epoch": 0.08790019540057117, "flos": 26392690938240.0, "grad_norm": 1.607211136047096, "language_loss": 0.75980842, "learning_rate": 3.924409694907011e-06, "loss": 0.78251368, "num_input_tokens_seen": 31283690, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.28125, "step": 1462, "time_per_iteration": 2.7081735134124756 }, { "auxiliary_loss_clip": 0.01206103, "auxiliary_loss_mlp": 0.01061674, "balance_loss_clip": 1.03764105, "balance_loss_mlp": 1.05912256, "epoch": 0.08796031865323914, "flos": 19134031662720.0, "grad_norm": 1.8572463937219306, "language_loss": 0.73971879, "learning_rate": 3.924306769461356e-06, "loss": 0.76239657, "num_input_tokens_seen": 31302505, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.28125, "step": 1463, "time_per_iteration": 2.561609983444214 }, { "auxiliary_loss_clip": 0.01198699, "auxiliary_loss_mlp": 0.01062991, "balance_loss_clip": 1.03835011, "balance_loss_mlp": 1.05770242, "epoch": 0.0880204419059071, "flos": 26064287867520.0, "grad_norm": 1.8996368494550702, "language_loss": 0.83657491, "learning_rate": 3.924203775342142e-06, "loss": 0.85919183, "num_input_tokens_seen": 31323070, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.3203125, "step": 1464, "time_per_iteration": 2.69620418548584 }, { "auxiliary_loss_clip": 0.0120148, "auxiliary_loss_mlp": 0.01059306, "balance_loss_clip": 1.03709698, "balance_loss_mlp": 1.05686557, "epoch": 0.08808056515857508, "flos": 22377842115840.0, "grad_norm": 1.811160502518195, "language_loss": 0.77917016, "learning_rate": 3.924100712553046e-06, "loss": 0.80177802, "num_input_tokens_seen": 31341880, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.265625, "step": 1465, "time_per_iteration": 2.5762643814086914 }, { "auxiliary_loss_clip": 0.01188217, "auxiliary_loss_mlp": 0.010563, "balance_loss_clip": 1.03430629, "balance_loss_mlp": 1.05732048, "epoch": 0.08814068841124305, "flos": 23185293897600.0, "grad_norm": 1.835244433439944, "language_loss": 0.85162246, "learning_rate": 3.923997581097744e-06, "loss": 0.8740676, "num_input_tokens_seen": 31361995, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.3125, "step": 1466, "time_per_iteration": 2.619631290435791 }, { "auxiliary_loss_clip": 0.01218888, "auxiliary_loss_mlp": 0.01306253, "balance_loss_clip": 1.03358674, "balance_loss_mlp": 1.05576372, "epoch": 0.08820081166391101, "flos": 25155281358720.0, "grad_norm": 2.0579257928040255, "language_loss": 0.84213209, "learning_rate": 3.923894380979917e-06, "loss": 0.86738348, "num_input_tokens_seen": 31381515, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.265625, "step": 1467, "time_per_iteration": 2.627092123031616 }, { "auxiliary_loss_clip": 0.01193464, "auxiliary_loss_mlp": 0.01058121, "balance_loss_clip": 1.03476787, "balance_loss_mlp": 1.0572598, "epoch": 0.08826093491657899, "flos": 22231685675520.0, "grad_norm": 1.6961793327627375, "language_loss": 0.75686997, "learning_rate": 3.9237911122032485e-06, "loss": 0.77938581, "num_input_tokens_seen": 31400345, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.265625, "step": 1468, "time_per_iteration": 2.659862518310547 }, { "auxiliary_loss_clip": 0.01189885, "auxiliary_loss_mlp": 0.01052962, "balance_loss_clip": 1.03179085, "balance_loss_mlp": 1.05672681, "epoch": 0.08832105816924696, "flos": 22126826897280.0, "grad_norm": 1.5136919072936827, "language_loss": 0.80186427, "learning_rate": 3.923687774771424e-06, "loss": 0.82429278, "num_input_tokens_seen": 31419620, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.2421875, "step": 1469, "time_per_iteration": 2.638190984725952 }, { "auxiliary_loss_clip": 0.01186477, "auxiliary_loss_mlp": 0.01055701, "balance_loss_clip": 1.03392148, "balance_loss_mlp": 1.05773652, "epoch": 0.08838118142191492, "flos": 17566495159680.0, "grad_norm": 1.9494482611588664, "language_loss": 0.77203673, "learning_rate": 3.923584368688132e-06, "loss": 0.79445851, "num_input_tokens_seen": 31437970, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.28125, "step": 1470, "time_per_iteration": 2.5564796924591064 }, { "auxiliary_loss_clip": 0.0118201, "auxiliary_loss_mlp": 0.01058923, "balance_loss_clip": 1.03535533, "balance_loss_mlp": 1.05815697, "epoch": 0.0884413046745829, "flos": 20777196251520.0, "grad_norm": 1.6731994540498294, "language_loss": 0.83864403, "learning_rate": 3.923480893957061e-06, "loss": 0.86105335, "num_input_tokens_seen": 31457040, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.234375, "step": 1471, "time_per_iteration": 4.056175947189331 }, { "auxiliary_loss_clip": 0.0120514, "auxiliary_loss_mlp": 0.01051392, "balance_loss_clip": 1.03104317, "balance_loss_mlp": 1.05794573, "epoch": 0.08850142792725087, "flos": 22125462180480.0, "grad_norm": 2.2206086570824013, "language_loss": 0.83164346, "learning_rate": 3.923377350581905e-06, "loss": 0.85420883, "num_input_tokens_seen": 31477520, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.1953125, "step": 1472, "time_per_iteration": 4.182045221328735 }, { "auxiliary_loss_clip": 0.0120168, "auxiliary_loss_mlp": 0.01059548, "balance_loss_clip": 1.03677917, "balance_loss_mlp": 1.05873609, "epoch": 0.08856155117991883, "flos": 22418744728320.0, "grad_norm": 2.3088834473739173, "language_loss": 0.82451367, "learning_rate": 3.923273738566359e-06, "loss": 0.84712595, "num_input_tokens_seen": 31495575, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.25, "step": 1473, "time_per_iteration": 4.054985761642456 }, { "auxiliary_loss_clip": 0.01228015, "auxiliary_loss_mlp": 0.01059689, "balance_loss_clip": 1.03917289, "balance_loss_mlp": 1.05691171, "epoch": 0.0886216744325868, "flos": 29937002572800.0, "grad_norm": 1.6324958333293638, "language_loss": 0.78663087, "learning_rate": 3.92317005791412e-06, "loss": 0.80950785, "num_input_tokens_seen": 31520020, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.25, "step": 1474, "time_per_iteration": 2.7333946228027344 }, { "auxiliary_loss_clip": 0.01207965, "auxiliary_loss_mlp": 0.01060712, "balance_loss_clip": 1.03747809, "balance_loss_mlp": 1.05866981, "epoch": 0.08868179768525478, "flos": 23982833525760.0, "grad_norm": 1.691763787791671, "language_loss": 0.78800249, "learning_rate": 3.923066308628889e-06, "loss": 0.81068921, "num_input_tokens_seen": 31539265, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.21875, "step": 1475, "time_per_iteration": 4.217088937759399 }, { "auxiliary_loss_clip": 0.0118176, "auxiliary_loss_mlp": 0.01050258, "balance_loss_clip": 1.02846682, "balance_loss_mlp": 1.05771017, "epoch": 0.08874192093792274, "flos": 43177553216640.0, "grad_norm": 1.629573701800141, "language_loss": 0.7410264, "learning_rate": 3.922962490714368e-06, "loss": 0.76334655, "num_input_tokens_seen": 31563425, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.2421875, "step": 1476, "time_per_iteration": 2.711313247680664 }, { "auxiliary_loss_clip": 0.01201653, "auxiliary_loss_mlp": 0.01060484, "balance_loss_clip": 1.03835845, "balance_loss_mlp": 1.05808878, "epoch": 0.0888020441905907, "flos": 32852445868800.0, "grad_norm": 1.425144551002453, "language_loss": 0.74243325, "learning_rate": 3.922858604174262e-06, "loss": 0.76505464, "num_input_tokens_seen": 31584525, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.25, "step": 1477, "time_per_iteration": 2.720222234725952 }, { "auxiliary_loss_clip": 0.01180974, "auxiliary_loss_mlp": 0.01051511, "balance_loss_clip": 1.02875376, "balance_loss_mlp": 1.05647945, "epoch": 0.08886216744325869, "flos": 23149347361920.0, "grad_norm": 1.8044009896212956, "language_loss": 0.86811167, "learning_rate": 3.922754649012279e-06, "loss": 0.89043653, "num_input_tokens_seen": 31603325, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.2421875, "step": 1478, "time_per_iteration": 2.5521273612976074 }, { "auxiliary_loss_clip": 0.01190475, "auxiliary_loss_mlp": 0.01057155, "balance_loss_clip": 1.03576922, "balance_loss_mlp": 1.05745959, "epoch": 0.08892229069592665, "flos": 23331593992320.0, "grad_norm": 1.5517025386988628, "language_loss": 0.77384222, "learning_rate": 3.922650625232128e-06, "loss": 0.79631853, "num_input_tokens_seen": 31624820, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.234375, "step": 1479, "time_per_iteration": 2.637392044067383 }, { "auxiliary_loss_clip": 0.01197914, "auxiliary_loss_mlp": 0.0105127, "balance_loss_clip": 1.02857268, "balance_loss_mlp": 1.05574012, "epoch": 0.08898241394859462, "flos": 26213784272640.0, "grad_norm": 2.14626290599896, "language_loss": 0.77842534, "learning_rate": 3.922546532837522e-06, "loss": 0.80091715, "num_input_tokens_seen": 31646080, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.234375, "step": 1480, "time_per_iteration": 2.7175652980804443 }, { "auxiliary_loss_clip": 0.01191448, "auxiliary_loss_mlp": 0.01063093, "balance_loss_clip": 1.03954923, "balance_loss_mlp": 1.05519521, "epoch": 0.0890425372012626, "flos": 23550613171200.0, "grad_norm": 1.674856421925172, "language_loss": 0.66403484, "learning_rate": 3.9224423718321756e-06, "loss": 0.68658024, "num_input_tokens_seen": 31665770, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.2734375, "step": 1481, "time_per_iteration": 2.6289470195770264 }, { "auxiliary_loss_clip": 0.01217689, "auxiliary_loss_mlp": 0.01048512, "balance_loss_clip": 1.02780533, "balance_loss_mlp": 1.05914438, "epoch": 0.08910266045393056, "flos": 23002795872000.0, "grad_norm": 1.8352658439820566, "language_loss": 0.96142972, "learning_rate": 3.922338142219806e-06, "loss": 0.98409164, "num_input_tokens_seen": 31683805, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.21875, "step": 1482, "time_per_iteration": 2.676384449005127 }, { "auxiliary_loss_clip": 0.01205552, "auxiliary_loss_mlp": 0.01051073, "balance_loss_clip": 1.02949643, "balance_loss_mlp": 1.05898333, "epoch": 0.08916278370659853, "flos": 31936508035200.0, "grad_norm": 1.956334983033436, "language_loss": 0.78454268, "learning_rate": 3.922233844004133e-06, "loss": 0.807109, "num_input_tokens_seen": 31704630, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.28125, "step": 1483, "time_per_iteration": 2.6765687465667725 }, { "auxiliary_loss_clip": 0.01190346, "auxiliary_loss_mlp": 0.01080786, "balance_loss_clip": 1.0577426, "balance_loss_mlp": 1.05676603, "epoch": 0.08922290695926649, "flos": 17530404969600.0, "grad_norm": 2.0078231952617798, "language_loss": 0.85763741, "learning_rate": 3.922129477188879e-06, "loss": 0.88034868, "num_input_tokens_seen": 31723255, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.2421875, "step": 1484, "time_per_iteration": 2.7219157218933105 }, { "auxiliary_loss_clip": 0.01187631, "auxiliary_loss_mlp": 0.01059769, "balance_loss_clip": 1.03660631, "balance_loss_mlp": 1.05924273, "epoch": 0.08928303021193447, "flos": 32125075459200.0, "grad_norm": 2.169391675523297, "language_loss": 0.8012284, "learning_rate": 3.922025041777768e-06, "loss": 0.82370245, "num_input_tokens_seen": 31747045, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.28125, "step": 1485, "time_per_iteration": 2.774174451828003 }, { "auxiliary_loss_clip": 0.01189874, "auxiliary_loss_mlp": 0.01058152, "balance_loss_clip": 1.03578866, "balance_loss_mlp": 1.05508339, "epoch": 0.08934315346460243, "flos": 22125210785280.0, "grad_norm": 1.8727196744688102, "language_loss": 0.82920325, "learning_rate": 3.921920537774528e-06, "loss": 0.8516835, "num_input_tokens_seen": 31766615, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.2578125, "step": 1486, "time_per_iteration": 2.595766305923462 }, { "auxiliary_loss_clip": 0.01206555, "auxiliary_loss_mlp": 0.01057265, "balance_loss_clip": 1.03529513, "balance_loss_mlp": 1.05648685, "epoch": 0.0894032767172704, "flos": 22565583527040.0, "grad_norm": 1.6512649428667505, "language_loss": 0.76228559, "learning_rate": 3.921815965182887e-06, "loss": 0.78492379, "num_input_tokens_seen": 31785855, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.2265625, "step": 1487, "time_per_iteration": 2.6134426593780518 }, { "auxiliary_loss_clip": 0.01198379, "auxiliary_loss_mlp": 0.01054063, "balance_loss_clip": 1.03116345, "balance_loss_mlp": 1.05639291, "epoch": 0.08946339996993838, "flos": 20193396503040.0, "grad_norm": 1.93284045799955, "language_loss": 0.82749963, "learning_rate": 3.921711324006578e-06, "loss": 0.8500241, "num_input_tokens_seen": 31804210, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.234375, "step": 1488, "time_per_iteration": 2.602076530456543 }, { "auxiliary_loss_clip": 0.01188892, "auxiliary_loss_mlp": 0.01048458, "balance_loss_clip": 1.02729869, "balance_loss_mlp": 1.05768824, "epoch": 0.08952352322260634, "flos": 48360181104000.0, "grad_norm": 1.7565271359703314, "language_loss": 0.7166729, "learning_rate": 3.921606614249335e-06, "loss": 0.73904639, "num_input_tokens_seen": 31826150, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.21875, "step": 1489, "time_per_iteration": 2.7918200492858887 }, { "auxiliary_loss_clip": 0.01202663, "auxiliary_loss_mlp": 0.01052711, "balance_loss_clip": 1.0307765, "balance_loss_mlp": 1.0524888, "epoch": 0.08958364647527431, "flos": 31793081028480.0, "grad_norm": 1.7733208286761504, "language_loss": 0.89550835, "learning_rate": 3.921501835914894e-06, "loss": 0.91806209, "num_input_tokens_seen": 31848060, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.2265625, "step": 1490, "time_per_iteration": 2.719344139099121 }, { "auxiliary_loss_clip": 0.01194094, "auxiliary_loss_mlp": 0.01061841, "balance_loss_clip": 1.03975129, "balance_loss_mlp": 1.05576086, "epoch": 0.08964376972794229, "flos": 23368186972800.0, "grad_norm": 2.353561072629757, "language_loss": 0.73468131, "learning_rate": 3.921396989006997e-06, "loss": 0.75724065, "num_input_tokens_seen": 31870040, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.296875, "step": 1491, "time_per_iteration": 2.5442087650299072 }, { "auxiliary_loss_clip": 0.01178345, "auxiliary_loss_mlp": 0.01298855, "balance_loss_clip": 1.02962279, "balance_loss_mlp": 1.05666959, "epoch": 0.08970389298061025, "flos": 23294785530240.0, "grad_norm": 1.6531837387515596, "language_loss": 0.76945108, "learning_rate": 3.9212920735293824e-06, "loss": 0.79422307, "num_input_tokens_seen": 31890400, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.21875, "step": 1492, "time_per_iteration": 2.604057788848877 }, { "auxiliary_loss_clip": 0.01178848, "auxiliary_loss_mlp": 0.01056263, "balance_loss_clip": 1.03466237, "balance_loss_mlp": 1.05778551, "epoch": 0.08976401623327822, "flos": 33761703772800.0, "grad_norm": 2.0073485997574987, "language_loss": 0.70928133, "learning_rate": 3.921187089485796e-06, "loss": 0.73163241, "num_input_tokens_seen": 31913435, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.2109375, "step": 1493, "time_per_iteration": 2.8057861328125 }, { "auxiliary_loss_clip": 0.01178988, "auxiliary_loss_mlp": 0.01056452, "balance_loss_clip": 1.03458881, "balance_loss_mlp": 1.05632627, "epoch": 0.08982413948594618, "flos": 23911335504000.0, "grad_norm": 1.746736378742042, "language_loss": 0.86989403, "learning_rate": 3.921082036879985e-06, "loss": 0.89224839, "num_input_tokens_seen": 31932435, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.2265625, "step": 1494, "time_per_iteration": 2.6898679733276367 }, { "auxiliary_loss_clip": 0.01232108, "auxiliary_loss_mlp": 0.01059576, "balance_loss_clip": 1.03740263, "balance_loss_mlp": 1.05549037, "epoch": 0.08988426273861416, "flos": 16837544551680.0, "grad_norm": 1.727613009962231, "language_loss": 0.82954651, "learning_rate": 3.9209769157156976e-06, "loss": 0.85246336, "num_input_tokens_seen": 31950125, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.21875, "step": 1495, "time_per_iteration": 2.659390926361084 }, { "auxiliary_loss_clip": 0.01194515, "auxiliary_loss_mlp": 0.01055925, "balance_loss_clip": 1.03450274, "balance_loss_mlp": 1.06192398, "epoch": 0.08994438599128213, "flos": 14793365548800.0, "grad_norm": 1.8550415446129676, "language_loss": 0.69939137, "learning_rate": 3.920871725996685e-06, "loss": 0.72189569, "num_input_tokens_seen": 31968050, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.234375, "step": 1496, "time_per_iteration": 2.632477283477783 }, { "auxiliary_loss_clip": 0.01194609, "auxiliary_loss_mlp": 0.01049909, "balance_loss_clip": 1.03015614, "balance_loss_mlp": 1.05483365, "epoch": 0.09000450924395009, "flos": 17384320356480.0, "grad_norm": 1.5589092462312786, "language_loss": 0.79536903, "learning_rate": 3.920766467726702e-06, "loss": 0.81781423, "num_input_tokens_seen": 31985675, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.2109375, "step": 1497, "time_per_iteration": 2.5338919162750244 }, { "auxiliary_loss_clip": 0.01230029, "auxiliary_loss_mlp": 0.01055341, "balance_loss_clip": 1.03309643, "balance_loss_mlp": 1.057181, "epoch": 0.09006463249661807, "flos": 24280317964800.0, "grad_norm": 2.6416487059132487, "language_loss": 0.82737327, "learning_rate": 3.920661140909505e-06, "loss": 0.85022688, "num_input_tokens_seen": 32005180, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.2734375, "step": 1498, "time_per_iteration": 2.697065830230713 }, { "auxiliary_loss_clip": 0.01201443, "auxiliary_loss_mlp": 0.01063207, "balance_loss_clip": 1.04289329, "balance_loss_mlp": 1.05848658, "epoch": 0.09012475574928604, "flos": 13661928069120.0, "grad_norm": 2.1829030091838035, "language_loss": 0.78755426, "learning_rate": 3.920555745548851e-06, "loss": 0.81020075, "num_input_tokens_seen": 32022970, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.25, "step": 1499, "time_per_iteration": 2.6022374629974365 }, { "auxiliary_loss_clip": 0.0118559, "auxiliary_loss_mlp": 0.01305761, "balance_loss_clip": 1.03543603, "balance_loss_mlp": 1.05709434, "epoch": 0.090184879001954, "flos": 23327751237120.0, "grad_norm": 1.7120868997279513, "language_loss": 0.92847949, "learning_rate": 3.920450281648503e-06, "loss": 0.95339298, "num_input_tokens_seen": 32043055, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.1875, "step": 1500, "time_per_iteration": 2.641038179397583 }, { "auxiliary_loss_clip": 0.01209599, "auxiliary_loss_mlp": 0.01053671, "balance_loss_clip": 1.03061604, "balance_loss_mlp": 1.05762184, "epoch": 0.09024500225462198, "flos": 23002688131200.0, "grad_norm": 2.241728535294002, "language_loss": 0.74223423, "learning_rate": 3.920344749212226e-06, "loss": 0.76486689, "num_input_tokens_seen": 32061900, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.25, "step": 1501, "time_per_iteration": 2.6000092029571533 }, { "auxiliary_loss_clip": 0.01100833, "auxiliary_loss_mlp": 0.01011593, "balance_loss_clip": 1.00874388, "balance_loss_mlp": 1.02758694, "epoch": 0.09030512550728995, "flos": 62189203242240.0, "grad_norm": 0.7294928412311964, "language_loss": 0.583745, "learning_rate": 3.920239148243783e-06, "loss": 0.60486925, "num_input_tokens_seen": 32122745, "router_z_loss_clip": 0.02844238, "router_z_loss_mlp": 0.45898438, "step": 1502, "time_per_iteration": 3.236300468444824 }, { "auxiliary_loss_clip": 0.01191873, "auxiliary_loss_mlp": 0.01051268, "balance_loss_clip": 1.0312171, "balance_loss_mlp": 1.05439544, "epoch": 0.09036524875995791, "flos": 38800689171840.0, "grad_norm": 2.1673613363311643, "language_loss": 0.6936419, "learning_rate": 3.920133478746944e-06, "loss": 0.71607333, "num_input_tokens_seen": 32145125, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.1875, "step": 1503, "time_per_iteration": 2.8427042961120605 }, { "auxiliary_loss_clip": 0.01218851, "auxiliary_loss_mlp": 0.01056495, "balance_loss_clip": 1.03588378, "balance_loss_mlp": 1.05738819, "epoch": 0.09042537201262588, "flos": 21690081429120.0, "grad_norm": 1.9577820693788945, "language_loss": 0.85988641, "learning_rate": 3.920027740725481e-06, "loss": 0.88263988, "num_input_tokens_seen": 32166255, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.25, "step": 1504, "time_per_iteration": 2.6578214168548584 }, { "auxiliary_loss_clip": 0.0121434, "auxiliary_loss_mlp": 0.01062284, "balance_loss_clip": 1.03764355, "balance_loss_mlp": 1.05876493, "epoch": 0.09048549526529386, "flos": 22267021680000.0, "grad_norm": 1.8019451439429464, "language_loss": 0.72392142, "learning_rate": 3.919921934183167e-06, "loss": 0.74668765, "num_input_tokens_seen": 32184010, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.28125, "step": 1505, "time_per_iteration": 2.7104971408843994 }, { "auxiliary_loss_clip": 0.01204775, "auxiliary_loss_mlp": 0.01050927, "balance_loss_clip": 1.02951741, "balance_loss_mlp": 1.05775738, "epoch": 0.09054561851796182, "flos": 14610939350400.0, "grad_norm": 2.3099814915672097, "language_loss": 0.80612564, "learning_rate": 3.919816059123778e-06, "loss": 0.82868266, "num_input_tokens_seen": 32201635, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1953125, "step": 1506, "time_per_iteration": 2.5870208740234375 }, { "auxiliary_loss_clip": 0.01213458, "auxiliary_loss_mlp": 0.01047398, "balance_loss_clip": 1.02720404, "balance_loss_mlp": 1.05724907, "epoch": 0.09060574177062979, "flos": 27636169916160.0, "grad_norm": 1.7306165196095218, "language_loss": 0.75827402, "learning_rate": 3.919710115551092e-06, "loss": 0.7808826, "num_input_tokens_seen": 32221940, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1953125, "step": 1507, "time_per_iteration": 2.739222764968872 }, { "auxiliary_loss_clip": 0.01089076, "auxiliary_loss_mlp": 0.01003664, "balance_loss_clip": 1.00066042, "balance_loss_mlp": 1.02528739, "epoch": 0.09066586502329776, "flos": 66085797513600.0, "grad_norm": 0.7258825150483802, "language_loss": 0.57682723, "learning_rate": 3.91960410346889e-06, "loss": 0.59775466, "num_input_tokens_seen": 32276495, "router_z_loss_clip": 0.0300293, "router_z_loss_mlp": 0.45703125, "step": 1508, "time_per_iteration": 3.0540363788604736 }, { "auxiliary_loss_clip": 0.01199333, "auxiliary_loss_mlp": 0.01052075, "balance_loss_clip": 1.03077269, "balance_loss_mlp": 1.05744028, "epoch": 0.09072598827596573, "flos": 18916449027840.0, "grad_norm": 1.95318910404404, "language_loss": 0.85227382, "learning_rate": 3.919498022880955e-06, "loss": 0.87478787, "num_input_tokens_seen": 32294130, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.234375, "step": 1509, "time_per_iteration": 2.6306240558624268 }, { "auxiliary_loss_clip": 0.01204942, "auxiliary_loss_mlp": 0.01061746, "balance_loss_clip": 1.03861964, "balance_loss_mlp": 1.05857003, "epoch": 0.0907861115286337, "flos": 24821742643200.0, "grad_norm": 2.338461600485905, "language_loss": 0.84012628, "learning_rate": 3.9193918737910735e-06, "loss": 0.86279315, "num_input_tokens_seen": 32313555, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.28125, "step": 1510, "time_per_iteration": 2.6913716793060303 }, { "auxiliary_loss_clip": 0.01197568, "auxiliary_loss_mlp": 0.01050094, "balance_loss_clip": 1.02843356, "balance_loss_mlp": 1.05573344, "epoch": 0.09084623478130167, "flos": 21652842003840.0, "grad_norm": 1.759459674121909, "language_loss": 0.8524487, "learning_rate": 3.919285656203033e-06, "loss": 0.87492532, "num_input_tokens_seen": 32331430, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.234375, "step": 1511, "time_per_iteration": 2.642348051071167 }, { "auxiliary_loss_clip": 0.01206059, "auxiliary_loss_mlp": 0.01048243, "balance_loss_clip": 1.02691603, "balance_loss_mlp": 1.05777514, "epoch": 0.09090635803396964, "flos": 27639258485760.0, "grad_norm": 1.4410243850167341, "language_loss": 0.84860337, "learning_rate": 3.919179370120624e-06, "loss": 0.87114638, "num_input_tokens_seen": 32353705, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.2109375, "step": 1512, "time_per_iteration": 2.730100631713867 }, { "auxiliary_loss_clip": 0.01199404, "auxiliary_loss_mlp": 0.0104789, "balance_loss_clip": 1.02799428, "balance_loss_mlp": 1.05338812, "epoch": 0.0909664812866376, "flos": 17669127294720.0, "grad_norm": 2.3101744549432217, "language_loss": 0.86716259, "learning_rate": 3.919073015547641e-06, "loss": 0.8896355, "num_input_tokens_seen": 32370520, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1875, "step": 1513, "time_per_iteration": 5.475035905838013 }, { "auxiliary_loss_clip": 0.01185212, "auxiliary_loss_mlp": 0.01049235, "balance_loss_clip": 1.02964926, "balance_loss_mlp": 1.05525851, "epoch": 0.09102660453930557, "flos": 23951448017280.0, "grad_norm": 1.9809747487076408, "language_loss": 0.86523765, "learning_rate": 3.918966592487878e-06, "loss": 0.88758218, "num_input_tokens_seen": 32389105, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.203125, "step": 1514, "time_per_iteration": 4.029045343399048 }, { "auxiliary_loss_clip": 0.01215279, "auxiliary_loss_mlp": 0.01056245, "balance_loss_clip": 1.03643203, "balance_loss_mlp": 1.05843866, "epoch": 0.09108672779197355, "flos": 25812949426560.0, "grad_norm": 1.768103913756337, "language_loss": 0.89588761, "learning_rate": 3.918860100945134e-06, "loss": 0.91860276, "num_input_tokens_seen": 32408065, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.203125, "step": 1515, "time_per_iteration": 2.7074902057647705 }, { "auxiliary_loss_clip": 0.01196803, "auxiliary_loss_mlp": 0.01051686, "balance_loss_clip": 1.0292747, "balance_loss_mlp": 1.05538321, "epoch": 0.09114685104464151, "flos": 29639482220160.0, "grad_norm": 1.8126888797020397, "language_loss": 0.85243398, "learning_rate": 3.9187535409232076e-06, "loss": 0.87491888, "num_input_tokens_seen": 32427225, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.234375, "step": 1516, "time_per_iteration": 2.6915950775146484 }, { "auxiliary_loss_clip": 0.01199046, "auxiliary_loss_mlp": 0.01052018, "balance_loss_clip": 1.03033352, "balance_loss_mlp": 1.05490756, "epoch": 0.09120697429730948, "flos": 33729635905920.0, "grad_norm": 2.139564703975357, "language_loss": 0.8116591, "learning_rate": 3.918646912425904e-06, "loss": 0.83416975, "num_input_tokens_seen": 32450510, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.265625, "step": 1517, "time_per_iteration": 5.041289567947388 }, { "auxiliary_loss_clip": 0.01192919, "auxiliary_loss_mlp": 0.01062228, "balance_loss_clip": 1.04048407, "balance_loss_mlp": 1.05815816, "epoch": 0.09126709754997746, "flos": 18401381953920.0, "grad_norm": 1.74463244497498, "language_loss": 0.77864647, "learning_rate": 3.918540215457027e-06, "loss": 0.80119795, "num_input_tokens_seen": 32468425, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.2578125, "step": 1518, "time_per_iteration": 2.6052424907684326 }, { "auxiliary_loss_clip": 0.01185842, "auxiliary_loss_mlp": 0.01061329, "balance_loss_clip": 1.03999066, "balance_loss_mlp": 1.05421817, "epoch": 0.09132722080264542, "flos": 22091957769600.0, "grad_norm": 1.7611536015286442, "language_loss": 0.86272579, "learning_rate": 3.918433450020386e-06, "loss": 0.88519752, "num_input_tokens_seen": 32487510, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.2265625, "step": 1519, "time_per_iteration": 2.6486120223999023 }, { "auxiliary_loss_clip": 0.01195819, "auxiliary_loss_mlp": 0.01050677, "balance_loss_clip": 1.02791989, "balance_loss_mlp": 1.05413926, "epoch": 0.09138734405531339, "flos": 21033131633280.0, "grad_norm": 2.1741631510128236, "language_loss": 0.72682887, "learning_rate": 3.9183266161197885e-06, "loss": 0.7492938, "num_input_tokens_seen": 32507250, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.234375, "step": 1520, "time_per_iteration": 2.771573305130005 }, { "auxiliary_loss_clip": 0.01198423, "auxiliary_loss_mlp": 0.01055041, "balance_loss_clip": 1.03308296, "balance_loss_mlp": 1.05612266, "epoch": 0.09144746730798137, "flos": 20083940784000.0, "grad_norm": 2.782163987146113, "language_loss": 0.85334587, "learning_rate": 3.91821971375905e-06, "loss": 0.87588048, "num_input_tokens_seen": 32526045, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.234375, "step": 1521, "time_per_iteration": 2.7633605003356934 }, { "auxiliary_loss_clip": 0.0122645, "auxiliary_loss_mlp": 0.01058141, "balance_loss_clip": 1.03589678, "balance_loss_mlp": 1.05593801, "epoch": 0.09150759056064933, "flos": 22778210085120.0, "grad_norm": 2.493502219848415, "language_loss": 0.84459984, "learning_rate": 3.918112742941983e-06, "loss": 0.86744571, "num_input_tokens_seen": 32546575, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.25, "step": 1522, "time_per_iteration": 2.7501728534698486 }, { "auxiliary_loss_clip": 0.01172227, "auxiliary_loss_mlp": 0.01061699, "balance_loss_clip": 1.03954995, "balance_loss_mlp": 1.05425119, "epoch": 0.0915677138133173, "flos": 27564205017600.0, "grad_norm": 2.6441735301446467, "language_loss": 0.80731899, "learning_rate": 3.9180057036724066e-06, "loss": 0.82965821, "num_input_tokens_seen": 32568795, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.1796875, "step": 1523, "time_per_iteration": 2.6707334518432617 }, { "auxiliary_loss_clip": 0.01199651, "auxiliary_loss_mlp": 0.01305879, "balance_loss_clip": 1.03591895, "balance_loss_mlp": 1.05750299, "epoch": 0.09162783706598528, "flos": 17674765729920.0, "grad_norm": 2.7456765062022535, "language_loss": 0.74647415, "learning_rate": 3.9178985959541406e-06, "loss": 0.77152944, "num_input_tokens_seen": 32587010, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.234375, "step": 1524, "time_per_iteration": 2.5843186378479004 }, { "auxiliary_loss_clip": 0.0120552, "auxiliary_loss_mlp": 0.01058046, "balance_loss_clip": 1.03570652, "balance_loss_mlp": 1.05558038, "epoch": 0.09168796031865324, "flos": 18478195188480.0, "grad_norm": 1.9943082496627762, "language_loss": 0.85950923, "learning_rate": 3.917791419791006e-06, "loss": 0.88214481, "num_input_tokens_seen": 32602375, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.21875, "step": 1525, "time_per_iteration": 2.713604688644409 }, { "auxiliary_loss_clip": 0.01197058, "auxiliary_loss_mlp": 0.01045505, "balance_loss_clip": 1.02322435, "balance_loss_mlp": 1.05613041, "epoch": 0.0917480835713212, "flos": 29387605075200.0, "grad_norm": 2.274926088453371, "language_loss": 0.74837571, "learning_rate": 3.91768417518683e-06, "loss": 0.77080131, "num_input_tokens_seen": 32621460, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.2265625, "step": 1526, "time_per_iteration": 2.6552484035491943 }, { "auxiliary_loss_clip": 0.01175224, "auxiliary_loss_mlp": 0.01058381, "balance_loss_clip": 1.03708982, "balance_loss_mlp": 1.05499578, "epoch": 0.09180820682398917, "flos": 19829262378240.0, "grad_norm": 8.586554347478938, "language_loss": 0.77113771, "learning_rate": 3.917576862145438e-06, "loss": 0.79347372, "num_input_tokens_seen": 32640440, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.203125, "step": 1527, "time_per_iteration": 2.535776138305664 }, { "auxiliary_loss_clip": 0.01186484, "auxiliary_loss_mlp": 0.01054533, "balance_loss_clip": 1.03147805, "balance_loss_mlp": 1.05395865, "epoch": 0.09186833007665715, "flos": 23841848643840.0, "grad_norm": 1.8412505978389218, "language_loss": 0.78270203, "learning_rate": 3.91746948067066e-06, "loss": 0.80511218, "num_input_tokens_seen": 32660020, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.234375, "step": 1528, "time_per_iteration": 2.6051597595214844 }, { "auxiliary_loss_clip": 0.0120502, "auxiliary_loss_mlp": 0.01050668, "balance_loss_clip": 1.02910292, "balance_loss_mlp": 1.05477071, "epoch": 0.09192845332932512, "flos": 12932726065920.0, "grad_norm": 2.39185043412873, "language_loss": 0.76562703, "learning_rate": 3.91736203076633e-06, "loss": 0.78818393, "num_input_tokens_seen": 32678170, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.2265625, "step": 1529, "time_per_iteration": 2.6372621059417725 }, { "auxiliary_loss_clip": 0.01180804, "auxiliary_loss_mlp": 0.01300709, "balance_loss_clip": 1.02943921, "balance_loss_mlp": 1.05489635, "epoch": 0.09198857658199308, "flos": 24568177559040.0, "grad_norm": 2.568123643946314, "language_loss": 0.82656264, "learning_rate": 3.9172545124362795e-06, "loss": 0.85137784, "num_input_tokens_seen": 32697540, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.2578125, "step": 1530, "time_per_iteration": 2.7410526275634766 }, { "auxiliary_loss_clip": 0.01210863, "auxiliary_loss_mlp": 0.01064224, "balance_loss_clip": 1.04226518, "balance_loss_mlp": 1.05369031, "epoch": 0.09204869983466106, "flos": 20266941600000.0, "grad_norm": 2.0390473439041283, "language_loss": 0.83845305, "learning_rate": 3.9171469256843484e-06, "loss": 0.86120391, "num_input_tokens_seen": 32716805, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.203125, "step": 1531, "time_per_iteration": 2.7047336101531982 }, { "auxiliary_loss_clip": 0.01203843, "auxiliary_loss_mlp": 0.01053023, "balance_loss_clip": 1.03133845, "balance_loss_mlp": 1.05375993, "epoch": 0.09210882308732903, "flos": 20885646389760.0, "grad_norm": 2.0411666913947166, "language_loss": 0.81446362, "learning_rate": 3.917039270514375e-06, "loss": 0.83703232, "num_input_tokens_seen": 32736385, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.2265625, "step": 1532, "time_per_iteration": 2.680023193359375 }, { "auxiliary_loss_clip": 0.01196218, "auxiliary_loss_mlp": 0.01051362, "balance_loss_clip": 1.0290221, "balance_loss_mlp": 1.05528426, "epoch": 0.09216894633999699, "flos": 30956326727040.0, "grad_norm": 1.982132720986281, "language_loss": 0.83722609, "learning_rate": 3.9169315469302e-06, "loss": 0.85970187, "num_input_tokens_seen": 32757140, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.2265625, "step": 1533, "time_per_iteration": 2.7662863731384277 }, { "auxiliary_loss_clip": 0.01208516, "auxiliary_loss_mlp": 0.01049769, "balance_loss_clip": 1.02769101, "balance_loss_mlp": 1.05739355, "epoch": 0.09222906959266497, "flos": 13151565676800.0, "grad_norm": 2.0040892504654266, "language_loss": 0.89795315, "learning_rate": 3.91682375493567e-06, "loss": 0.92053598, "num_input_tokens_seen": 32774860, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.234375, "step": 1534, "time_per_iteration": 2.701331853866577 }, { "auxiliary_loss_clip": 0.01205807, "auxiliary_loss_mlp": 0.01062603, "balance_loss_clip": 1.03883231, "balance_loss_mlp": 1.0545485, "epoch": 0.09228919284533293, "flos": 25994477784960.0, "grad_norm": 2.1129517709665753, "language_loss": 0.75325692, "learning_rate": 3.916715894534631e-06, "loss": 0.77594107, "num_input_tokens_seen": 32795250, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.234375, "step": 1535, "time_per_iteration": 2.6872823238372803 }, { "auxiliary_loss_clip": 0.01180811, "auxiliary_loss_mlp": 0.01044084, "balance_loss_clip": 1.02368772, "balance_loss_mlp": 1.0538559, "epoch": 0.0923493160980009, "flos": 18660800954880.0, "grad_norm": 1.7142839681775073, "language_loss": 0.81390691, "learning_rate": 3.916607965730932e-06, "loss": 0.83615589, "num_input_tokens_seen": 32813805, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.1796875, "step": 1536, "time_per_iteration": 2.70004940032959 }, { "auxiliary_loss_clip": 0.01173054, "auxiliary_loss_mlp": 0.01050987, "balance_loss_clip": 1.03006589, "balance_loss_mlp": 1.05371571, "epoch": 0.09240943935066886, "flos": 21140576190720.0, "grad_norm": 1.892484065647357, "language_loss": 0.88884819, "learning_rate": 3.9164999685284245e-06, "loss": 0.91108859, "num_input_tokens_seen": 32830960, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1953125, "step": 1537, "time_per_iteration": 2.6774396896362305 }, { "auxiliary_loss_clip": 0.01194222, "auxiliary_loss_mlp": 0.01061363, "balance_loss_clip": 1.03895152, "balance_loss_mlp": 1.05430365, "epoch": 0.09246956260333684, "flos": 20592435669120.0, "grad_norm": 2.1357147054674215, "language_loss": 0.80770481, "learning_rate": 3.916391902930963e-06, "loss": 0.83026063, "num_input_tokens_seen": 32848275, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.21875, "step": 1538, "time_per_iteration": 2.618600368499756 }, { "auxiliary_loss_clip": 0.01092566, "auxiliary_loss_mlp": 0.01008066, "balance_loss_clip": 1.00541997, "balance_loss_mlp": 1.02770615, "epoch": 0.09252968585600481, "flos": 67558710614400.0, "grad_norm": 0.7354101814932315, "language_loss": 0.57406497, "learning_rate": 3.916283768942404e-06, "loss": 0.59507126, "num_input_tokens_seen": 32917730, "router_z_loss_clip": 0.02648926, "router_z_loss_mlp": 0.46875, "step": 1539, "time_per_iteration": 3.3881235122680664 }, { "auxiliary_loss_clip": 0.01197213, "auxiliary_loss_mlp": 0.01050278, "balance_loss_clip": 1.02848685, "balance_loss_mlp": 1.05683851, "epoch": 0.09258980910867277, "flos": 17383853479680.0, "grad_norm": 2.761114859515859, "language_loss": 0.67299992, "learning_rate": 3.916175566566607e-06, "loss": 0.69547486, "num_input_tokens_seen": 32934910, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.21875, "step": 1540, "time_per_iteration": 2.5183866024017334 }, { "auxiliary_loss_clip": 0.01204706, "auxiliary_loss_mlp": 0.01046081, "balance_loss_clip": 1.02358663, "balance_loss_mlp": 1.05692935, "epoch": 0.09264993236134075, "flos": 19865927185920.0, "grad_norm": 2.068431378123152, "language_loss": 0.83585155, "learning_rate": 3.916067295807433e-06, "loss": 0.85835946, "num_input_tokens_seen": 32953840, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.203125, "step": 1541, "time_per_iteration": 2.6098227500915527 }, { "auxiliary_loss_clip": 0.01073539, "auxiliary_loss_mlp": 0.01004817, "balance_loss_clip": 1.00208747, "balance_loss_mlp": 1.02727509, "epoch": 0.09271005561400872, "flos": 62284401262080.0, "grad_norm": 0.8881173526395398, "language_loss": 0.61955655, "learning_rate": 3.915958956668745e-06, "loss": 0.64034009, "num_input_tokens_seen": 33011410, "router_z_loss_clip": 0.02734375, "router_z_loss_mlp": 0.46289062, "step": 1542, "time_per_iteration": 3.141490936279297 }, { "auxiliary_loss_clip": 0.01198659, "auxiliary_loss_mlp": 0.01058768, "balance_loss_clip": 1.03793061, "balance_loss_mlp": 1.05769706, "epoch": 0.09277017886667668, "flos": 23329870139520.0, "grad_norm": 1.4836038401040688, "language_loss": 0.82514, "learning_rate": 3.915850549154412e-06, "loss": 0.84771425, "num_input_tokens_seen": 33031675, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.2265625, "step": 1543, "time_per_iteration": 2.6186952590942383 }, { "auxiliary_loss_clip": 0.01200747, "auxiliary_loss_mlp": 0.01054863, "balance_loss_clip": 1.03377461, "balance_loss_mlp": 1.05493903, "epoch": 0.09283030211934466, "flos": 54745169875200.0, "grad_norm": 1.6389062534899683, "language_loss": 0.71992993, "learning_rate": 3.9157420732682995e-06, "loss": 0.742486, "num_input_tokens_seen": 33056355, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1875, "step": 1544, "time_per_iteration": 2.9512293338775635 }, { "auxiliary_loss_clip": 0.0118828, "auxiliary_loss_mlp": 0.01051187, "balance_loss_clip": 1.02914572, "balance_loss_mlp": 1.05581343, "epoch": 0.09289042537201263, "flos": 30334784762880.0, "grad_norm": 1.5549160490136056, "language_loss": 0.77440935, "learning_rate": 3.91563352901428e-06, "loss": 0.79680407, "num_input_tokens_seen": 33079520, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.234375, "step": 1545, "time_per_iteration": 2.712280750274658 }, { "auxiliary_loss_clip": 0.01201561, "auxiliary_loss_mlp": 0.01049748, "balance_loss_clip": 1.02882648, "balance_loss_mlp": 1.05613124, "epoch": 0.09295054862468059, "flos": 17746838369280.0, "grad_norm": 3.3290897865653357, "language_loss": 0.74426329, "learning_rate": 3.915524916396229e-06, "loss": 0.76677638, "num_input_tokens_seen": 33096135, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1796875, "step": 1546, "time_per_iteration": 2.655057668685913 }, { "auxiliary_loss_clip": 0.01192284, "auxiliary_loss_mlp": 0.01058132, "balance_loss_clip": 1.03624547, "balance_loss_mlp": 1.05798602, "epoch": 0.09301067187734856, "flos": 23658021815040.0, "grad_norm": 2.039377726361048, "language_loss": 0.84261435, "learning_rate": 3.91541623541802e-06, "loss": 0.8651185, "num_input_tokens_seen": 33115245, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.25, "step": 1547, "time_per_iteration": 2.6466152667999268 }, { "auxiliary_loss_clip": 0.01203564, "auxiliary_loss_mlp": 0.01051681, "balance_loss_clip": 1.03072429, "balance_loss_mlp": 1.05437434, "epoch": 0.09307079513001654, "flos": 27527719777920.0, "grad_norm": 2.9307167397775893, "language_loss": 0.67295319, "learning_rate": 3.9153074860835326e-06, "loss": 0.69550562, "num_input_tokens_seen": 33136640, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.21875, "step": 1548, "time_per_iteration": 2.7521440982818604 }, { "auxiliary_loss_clip": 0.01223587, "auxiliary_loss_mlp": 0.01058709, "balance_loss_clip": 1.03729916, "balance_loss_mlp": 1.05548859, "epoch": 0.0931309183826845, "flos": 20627340710400.0, "grad_norm": 1.807077189117591, "language_loss": 0.83340555, "learning_rate": 3.915198668396649e-06, "loss": 0.85622847, "num_input_tokens_seen": 33155060, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.21875, "step": 1549, "time_per_iteration": 2.620962619781494 }, { "auxiliary_loss_clip": 0.01187055, "auxiliary_loss_mlp": 0.01052837, "balance_loss_clip": 1.03204715, "balance_loss_mlp": 1.0567348, "epoch": 0.09319104163535247, "flos": 29020921084800.0, "grad_norm": 1.6427308439068187, "language_loss": 0.75796753, "learning_rate": 3.91508978236125e-06, "loss": 0.78036654, "num_input_tokens_seen": 33175420, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.2109375, "step": 1550, "time_per_iteration": 2.663280487060547 }, { "auxiliary_loss_clip": 0.01208058, "auxiliary_loss_mlp": 0.01068588, "balance_loss_clip": 1.04546106, "balance_loss_mlp": 1.0563668, "epoch": 0.09325116488802045, "flos": 25301545539840.0, "grad_norm": 1.849626044742112, "language_loss": 0.82815248, "learning_rate": 3.914980827981223e-06, "loss": 0.85091889, "num_input_tokens_seen": 33194120, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.2421875, "step": 1551, "time_per_iteration": 2.6142828464508057 }, { "auxiliary_loss_clip": 0.01078135, "auxiliary_loss_mlp": 0.01263644, "balance_loss_clip": 1.01187253, "balance_loss_mlp": 1.02343798, "epoch": 0.09331128814068841, "flos": 61536203942400.0, "grad_norm": 0.7509650267937286, "language_loss": 0.61926341, "learning_rate": 3.914871805260456e-06, "loss": 0.64268124, "num_input_tokens_seen": 33261080, "router_z_loss_clip": 0.02978516, "router_z_loss_mlp": 0.45507812, "step": 1552, "time_per_iteration": 3.305417776107788 }, { "auxiliary_loss_clip": 0.01075619, "auxiliary_loss_mlp": 0.01006991, "balance_loss_clip": 1.00393963, "balance_loss_mlp": 1.02100372, "epoch": 0.09337141139335638, "flos": 53293700171520.0, "grad_norm": 0.8361354809023555, "language_loss": 0.59047639, "learning_rate": 3.91476271420284e-06, "loss": 0.6113025, "num_input_tokens_seen": 33330235, "router_z_loss_clip": 0.03051758, "router_z_loss_mlp": 0.453125, "step": 1553, "time_per_iteration": 3.2922205924987793 }, { "auxiliary_loss_clip": 0.01206718, "auxiliary_loss_mlp": 0.01054093, "balance_loss_clip": 1.03208733, "balance_loss_mlp": 1.05434978, "epoch": 0.09343153464602436, "flos": 23476852592640.0, "grad_norm": 1.7110467236349545, "language_loss": 0.87562883, "learning_rate": 3.914653554812269e-06, "loss": 0.89823699, "num_input_tokens_seen": 33349035, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.25, "step": 1554, "time_per_iteration": 4.162382364273071 }, { "auxiliary_loss_clip": 0.01193541, "auxiliary_loss_mlp": 0.01055381, "balance_loss_clip": 1.03412604, "balance_loss_mlp": 1.05763805, "epoch": 0.09349165789869232, "flos": 19353481804800.0, "grad_norm": 2.0980752247422387, "language_loss": 0.81416643, "learning_rate": 3.914544327092637e-06, "loss": 0.83665562, "num_input_tokens_seen": 33368060, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.171875, "step": 1555, "time_per_iteration": 2.686514377593994 }, { "auxiliary_loss_clip": 0.0107374, "auxiliary_loss_mlp": 0.01005108, "balance_loss_clip": 1.00196075, "balance_loss_mlp": 1.01930547, "epoch": 0.09355178115136029, "flos": 67502580635520.0, "grad_norm": 0.8693688834302471, "language_loss": 0.5964843, "learning_rate": 3.914435031047844e-06, "loss": 0.61727285, "num_input_tokens_seen": 33430825, "router_z_loss_clip": 0.03149414, "router_z_loss_mlp": 0.453125, "step": 1556, "time_per_iteration": 4.595407247543335 }, { "auxiliary_loss_clip": 0.01187107, "auxiliary_loss_mlp": 0.0104994, "balance_loss_clip": 1.02949619, "balance_loss_mlp": 1.05831385, "epoch": 0.09361190440402825, "flos": 37341638720640.0, "grad_norm": 1.7453127061707379, "language_loss": 0.84161341, "learning_rate": 3.9143256666817875e-06, "loss": 0.86398393, "num_input_tokens_seen": 33454855, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1953125, "step": 1557, "time_per_iteration": 2.6751766204833984 }, { "auxiliary_loss_clip": 0.01230359, "auxiliary_loss_mlp": 0.01052731, "balance_loss_clip": 1.02946091, "balance_loss_mlp": 1.05525041, "epoch": 0.09367202765669623, "flos": 24899705112960.0, "grad_norm": 1.6272218320144112, "language_loss": 0.77819341, "learning_rate": 3.914216233998373e-06, "loss": 0.80102426, "num_input_tokens_seen": 33476000, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.203125, "step": 1558, "time_per_iteration": 4.276896953582764 }, { "auxiliary_loss_clip": 0.01199245, "auxiliary_loss_mlp": 0.01049428, "balance_loss_clip": 1.028018, "balance_loss_mlp": 1.0567683, "epoch": 0.0937321509093642, "flos": 15705568368000.0, "grad_norm": 2.0544026694777, "language_loss": 0.7985357, "learning_rate": 3.914106733001505e-06, "loss": 0.82102245, "num_input_tokens_seen": 33493845, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.2421875, "step": 1559, "time_per_iteration": 2.631939649581909 }, { "auxiliary_loss_clip": 0.01199236, "auxiliary_loss_mlp": 0.0104999, "balance_loss_clip": 1.02980781, "balance_loss_mlp": 1.05474234, "epoch": 0.09379227416203216, "flos": 20483698222080.0, "grad_norm": 1.9805785828860507, "language_loss": 0.76366353, "learning_rate": 3.9139971636950914e-06, "loss": 0.78615582, "num_input_tokens_seen": 33510850, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.171875, "step": 1560, "time_per_iteration": 2.6055121421813965 }, { "auxiliary_loss_clip": 0.01207853, "auxiliary_loss_mlp": 0.01053644, "balance_loss_clip": 1.03201962, "balance_loss_mlp": 1.05586612, "epoch": 0.09385239741470014, "flos": 24352498344960.0, "grad_norm": 1.7113438220841966, "language_loss": 0.81173885, "learning_rate": 3.913887526083042e-06, "loss": 0.8343538, "num_input_tokens_seen": 33530430, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.2421875, "step": 1561, "time_per_iteration": 2.659679651260376 }, { "auxiliary_loss_clip": 0.01185291, "auxiliary_loss_mlp": 0.01048147, "balance_loss_clip": 1.02612925, "balance_loss_mlp": 1.05309987, "epoch": 0.0939125206673681, "flos": 33291489807360.0, "grad_norm": 2.64920434449781, "language_loss": 0.61083949, "learning_rate": 3.91377782016927e-06, "loss": 0.63317394, "num_input_tokens_seen": 33551975, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.234375, "step": 1562, "time_per_iteration": 2.647430896759033 }, { "auxiliary_loss_clip": 0.01193961, "auxiliary_loss_mlp": 0.01055043, "balance_loss_clip": 1.03480089, "balance_loss_mlp": 1.05833244, "epoch": 0.09397264392003607, "flos": 19244923925760.0, "grad_norm": 2.1755105735478972, "language_loss": 0.84888446, "learning_rate": 3.9136680459576905e-06, "loss": 0.87137461, "num_input_tokens_seen": 33569850, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.171875, "step": 1563, "time_per_iteration": 2.5963969230651855 }, { "auxiliary_loss_clip": 0.01189835, "auxiliary_loss_mlp": 0.01046648, "balance_loss_clip": 1.02669215, "balance_loss_mlp": 1.05347252, "epoch": 0.09403276717270405, "flos": 19317930318720.0, "grad_norm": 1.7506565499906528, "language_loss": 0.75818086, "learning_rate": 3.913558203452221e-06, "loss": 0.78054571, "num_input_tokens_seen": 33590510, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1796875, "step": 1564, "time_per_iteration": 2.604037284851074 }, { "auxiliary_loss_clip": 0.01196399, "auxiliary_loss_mlp": 0.01055086, "balance_loss_clip": 1.03421307, "balance_loss_mlp": 1.05810881, "epoch": 0.09409289042537201, "flos": 23583471137280.0, "grad_norm": 2.0115030449537623, "language_loss": 0.79593891, "learning_rate": 3.913448292656782e-06, "loss": 0.81845379, "num_input_tokens_seen": 33608810, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1953125, "step": 1565, "time_per_iteration": 2.6493258476257324 }, { "auxiliary_loss_clip": 0.0119302, "auxiliary_loss_mlp": 0.01060896, "balance_loss_clip": 1.03917646, "balance_loss_mlp": 1.05379605, "epoch": 0.09415301367803998, "flos": 20078446003200.0, "grad_norm": 2.1550219548105316, "language_loss": 0.75407827, "learning_rate": 3.913338313575295e-06, "loss": 0.77661753, "num_input_tokens_seen": 33627265, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.2109375, "step": 1566, "time_per_iteration": 2.5313010215759277 }, { "auxiliary_loss_clip": 0.01202422, "auxiliary_loss_mlp": 0.01304062, "balance_loss_clip": 1.03356576, "balance_loss_mlp": 1.05451334, "epoch": 0.09421313693070796, "flos": 21062075016960.0, "grad_norm": 1.982614525092009, "language_loss": 0.77914226, "learning_rate": 3.913228266211685e-06, "loss": 0.80420709, "num_input_tokens_seen": 33644810, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.203125, "step": 1567, "time_per_iteration": 2.673175811767578 }, { "auxiliary_loss_clip": 0.01196012, "auxiliary_loss_mlp": 0.01049786, "balance_loss_clip": 1.03012788, "balance_loss_mlp": 1.05520928, "epoch": 0.09427326018337592, "flos": 24316156759680.0, "grad_norm": 2.127080982016844, "language_loss": 0.82324421, "learning_rate": 3.91311815056988e-06, "loss": 0.84570223, "num_input_tokens_seen": 33665665, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.2265625, "step": 1568, "time_per_iteration": 2.5575881004333496 }, { "auxiliary_loss_clip": 0.01178294, "auxiliary_loss_mlp": 0.01050254, "balance_loss_clip": 1.02804542, "balance_loss_mlp": 1.0555768, "epoch": 0.09433338343604389, "flos": 20263888944000.0, "grad_norm": 1.979581083245779, "language_loss": 0.76527196, "learning_rate": 3.9130079666538094e-06, "loss": 0.78755748, "num_input_tokens_seen": 33684760, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.2265625, "step": 1569, "time_per_iteration": 2.567105770111084 }, { "auxiliary_loss_clip": 0.01207699, "auxiliary_loss_mlp": 0.01051095, "balance_loss_clip": 1.02932763, "balance_loss_mlp": 1.05416977, "epoch": 0.09439350668871185, "flos": 12742973493120.0, "grad_norm": 2.1672847071514534, "language_loss": 0.85306966, "learning_rate": 3.912897714467405e-06, "loss": 0.87565756, "num_input_tokens_seen": 33700750, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.171875, "step": 1570, "time_per_iteration": 2.613786458969116 }, { "auxiliary_loss_clip": 0.01213045, "auxiliary_loss_mlp": 0.01048348, "balance_loss_clip": 1.02716482, "balance_loss_mlp": 1.05623972, "epoch": 0.09445362994137983, "flos": 25962266263680.0, "grad_norm": 1.5054633709682614, "language_loss": 0.76122648, "learning_rate": 3.912787394014602e-06, "loss": 0.78384048, "num_input_tokens_seen": 33724430, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.203125, "step": 1571, "time_per_iteration": 2.6861701011657715 }, { "auxiliary_loss_clip": 0.0120851, "auxiliary_loss_mlp": 0.0105567, "balance_loss_clip": 1.03533304, "balance_loss_mlp": 1.05521476, "epoch": 0.0945137531940478, "flos": 19715353372800.0, "grad_norm": 1.6492810783210872, "language_loss": 0.79336548, "learning_rate": 3.912677005299337e-06, "loss": 0.81600738, "num_input_tokens_seen": 33743455, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.171875, "step": 1572, "time_per_iteration": 2.5746843814849854 }, { "auxiliary_loss_clip": 0.01197784, "auxiliary_loss_mlp": 0.01054711, "balance_loss_clip": 1.03408742, "balance_loss_mlp": 1.05323625, "epoch": 0.09457387644671576, "flos": 23617047375360.0, "grad_norm": 2.110740290849247, "language_loss": 0.87573439, "learning_rate": 3.912566548325549e-06, "loss": 0.89825934, "num_input_tokens_seen": 33763435, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.171875, "step": 1573, "time_per_iteration": 2.6271448135375977 }, { "auxiliary_loss_clip": 0.01194543, "auxiliary_loss_mlp": 0.01058418, "balance_loss_clip": 1.03489816, "balance_loss_mlp": 1.05519891, "epoch": 0.09463399969938374, "flos": 26907291135360.0, "grad_norm": 2.6073900677840993, "language_loss": 0.81494427, "learning_rate": 3.912456023097182e-06, "loss": 0.83747393, "num_input_tokens_seen": 33784325, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.2109375, "step": 1574, "time_per_iteration": 2.6165614128112793 }, { "auxiliary_loss_clip": 0.01192679, "auxiliary_loss_mlp": 0.01052801, "balance_loss_clip": 1.03260708, "balance_loss_mlp": 1.05468678, "epoch": 0.0946941229520517, "flos": 23659566099840.0, "grad_norm": 1.648816875128319, "language_loss": 0.80777776, "learning_rate": 3.912345429618178e-06, "loss": 0.83023256, "num_input_tokens_seen": 33802510, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.1953125, "step": 1575, "time_per_iteration": 2.6878163814544678 }, { "auxiliary_loss_clip": 0.01172724, "auxiliary_loss_mlp": 0.01063909, "balance_loss_clip": 1.04278564, "balance_loss_mlp": 1.05466151, "epoch": 0.09475424620471967, "flos": 24134053783680.0, "grad_norm": 2.022582809329554, "language_loss": 0.86250448, "learning_rate": 3.912234767892486e-06, "loss": 0.88487077, "num_input_tokens_seen": 33819980, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1796875, "step": 1576, "time_per_iteration": 2.652390718460083 }, { "auxiliary_loss_clip": 0.01090683, "auxiliary_loss_mlp": 0.01013993, "balance_loss_clip": 1.01070333, "balance_loss_mlp": 1.02749002, "epoch": 0.09481436945738765, "flos": 68426168065920.0, "grad_norm": 0.9807792107974898, "language_loss": 0.65935963, "learning_rate": 3.912124037924053e-06, "loss": 0.68040639, "num_input_tokens_seen": 33878925, "router_z_loss_clip": 0.03295898, "router_z_loss_mlp": 0.44921875, "step": 1577, "time_per_iteration": 3.2490272521972656 }, { "auxiliary_loss_clip": 0.01203313, "auxiliary_loss_mlp": 0.01045822, "balance_loss_clip": 1.02618802, "balance_loss_mlp": 1.05633903, "epoch": 0.09487449271005562, "flos": 16654076858880.0, "grad_norm": 1.9163102882070087, "language_loss": 0.79265481, "learning_rate": 3.912013239716831e-06, "loss": 0.81514615, "num_input_tokens_seen": 33897600, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.1953125, "step": 1578, "time_per_iteration": 2.7137258052825928 }, { "auxiliary_loss_clip": 0.0117271, "auxiliary_loss_mlp": 0.01063, "balance_loss_clip": 1.04184067, "balance_loss_mlp": 1.05349708, "epoch": 0.09493461596272358, "flos": 24275685110400.0, "grad_norm": 1.7694418695027763, "language_loss": 0.77983594, "learning_rate": 3.911902373274776e-06, "loss": 0.80219305, "num_input_tokens_seen": 33917365, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.1953125, "step": 1579, "time_per_iteration": 2.611696481704712 }, { "auxiliary_loss_clip": 0.01199601, "auxiliary_loss_mlp": 0.01054911, "balance_loss_clip": 1.03276169, "balance_loss_mlp": 1.05275261, "epoch": 0.09499473921539155, "flos": 21870173243520.0, "grad_norm": 1.8419252179395915, "language_loss": 0.73080045, "learning_rate": 3.911791438601842e-06, "loss": 0.75334561, "num_input_tokens_seen": 33936680, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.1953125, "step": 1580, "time_per_iteration": 2.760406255722046 }, { "auxiliary_loss_clip": 0.01187585, "auxiliary_loss_mlp": 0.01053706, "balance_loss_clip": 1.03245115, "balance_loss_mlp": 1.0533458, "epoch": 0.09505486246805953, "flos": 33547137880320.0, "grad_norm": 1.9932076006492734, "language_loss": 0.77419043, "learning_rate": 3.91168043570199e-06, "loss": 0.79660332, "num_input_tokens_seen": 33960685, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1640625, "step": 1581, "time_per_iteration": 2.6937971115112305 }, { "auxiliary_loss_clip": 0.01202174, "auxiliary_loss_mlp": 0.01054088, "balance_loss_clip": 1.03320253, "balance_loss_mlp": 1.05450785, "epoch": 0.09511498572072749, "flos": 21215342350080.0, "grad_norm": 1.8785926357241038, "language_loss": 0.87202758, "learning_rate": 3.911569364579181e-06, "loss": 0.8945902, "num_input_tokens_seen": 33980015, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1953125, "step": 1582, "time_per_iteration": 2.649038553237915 }, { "auxiliary_loss_clip": 0.0118353, "auxiliary_loss_mlp": 0.01054265, "balance_loss_clip": 1.03203261, "balance_loss_mlp": 1.0545311, "epoch": 0.09517510897339546, "flos": 14611262572800.0, "grad_norm": 2.311260285283305, "language_loss": 0.66537619, "learning_rate": 3.9114582252373786e-06, "loss": 0.68775415, "num_input_tokens_seen": 33997705, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.203125, "step": 1583, "time_per_iteration": 2.531853437423706 }, { "auxiliary_loss_clip": 0.01185272, "auxiliary_loss_mlp": 0.01049115, "balance_loss_clip": 1.02715695, "balance_loss_mlp": 1.05547416, "epoch": 0.09523523222606343, "flos": 27817339138560.0, "grad_norm": 1.8022202330338462, "language_loss": 0.70341933, "learning_rate": 3.911347017680548e-06, "loss": 0.7257632, "num_input_tokens_seen": 34017465, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.2109375, "step": 1584, "time_per_iteration": 2.6720807552337646 }, { "auxiliary_loss_clip": 0.01211589, "auxiliary_loss_mlp": 0.01053566, "balance_loss_clip": 1.0335871, "balance_loss_mlp": 1.0552423, "epoch": 0.0952953554787314, "flos": 20706272847360.0, "grad_norm": 1.5712387114804212, "language_loss": 0.80674875, "learning_rate": 3.911235741912659e-06, "loss": 0.8294003, "num_input_tokens_seen": 34038550, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.203125, "step": 1585, "time_per_iteration": 2.650160789489746 }, { "auxiliary_loss_clip": 0.01195979, "auxiliary_loss_mlp": 0.01056932, "balance_loss_clip": 1.03331709, "balance_loss_mlp": 1.05383229, "epoch": 0.09535547873139937, "flos": 24787627701120.0, "grad_norm": 1.6082901413384312, "language_loss": 0.71708167, "learning_rate": 3.911124397937683e-06, "loss": 0.73961079, "num_input_tokens_seen": 34058665, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.234375, "step": 1586, "time_per_iteration": 2.684647798538208 }, { "auxiliary_loss_clip": 0.01096212, "auxiliary_loss_mlp": 0.01007446, "balance_loss_clip": 1.00439453, "balance_loss_mlp": 1.02378583, "epoch": 0.09541560198406734, "flos": 71912194905600.0, "grad_norm": 0.8217139667756629, "language_loss": 0.55477285, "learning_rate": 3.911012985759594e-06, "loss": 0.57580948, "num_input_tokens_seen": 34109655, "router_z_loss_clip": 0.03051758, "router_z_loss_mlp": 0.44921875, "step": 1587, "time_per_iteration": 3.0656795501708984 }, { "auxiliary_loss_clip": 0.01205222, "auxiliary_loss_mlp": 0.01053253, "balance_loss_clip": 1.03170037, "balance_loss_mlp": 1.05518246, "epoch": 0.09547572523673531, "flos": 28982604251520.0, "grad_norm": 1.746845678620939, "language_loss": 0.81145406, "learning_rate": 3.910901505382367e-06, "loss": 0.83403885, "num_input_tokens_seen": 34131115, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.2265625, "step": 1588, "time_per_iteration": 2.6688411235809326 }, { "auxiliary_loss_clip": 0.01209532, "auxiliary_loss_mlp": 0.01048885, "balance_loss_clip": 1.02798724, "balance_loss_mlp": 1.05396223, "epoch": 0.09553584848940327, "flos": 24133910129280.0, "grad_norm": 1.5004696179326662, "language_loss": 0.81475663, "learning_rate": 3.910789956809981e-06, "loss": 0.83734077, "num_input_tokens_seen": 34151925, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1875, "step": 1589, "time_per_iteration": 2.770415782928467 }, { "auxiliary_loss_clip": 0.01209437, "auxiliary_loss_mlp": 0.01298358, "balance_loss_clip": 1.02671313, "balance_loss_mlp": 1.05473197, "epoch": 0.09559597174207124, "flos": 42851376789120.0, "grad_norm": 1.3684944413199749, "language_loss": 0.64793324, "learning_rate": 3.910678340046415e-06, "loss": 0.67301118, "num_input_tokens_seen": 34175395, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.1796875, "step": 1590, "time_per_iteration": 2.8441407680511475 }, { "auxiliary_loss_clip": 0.01199039, "auxiliary_loss_mlp": 0.01052336, "balance_loss_clip": 1.03079498, "balance_loss_mlp": 1.05386806, "epoch": 0.09565609499473922, "flos": 32670845683200.0, "grad_norm": 1.59861189947668, "language_loss": 0.83355474, "learning_rate": 3.910566655095655e-06, "loss": 0.85606849, "num_input_tokens_seen": 34197760, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.1796875, "step": 1591, "time_per_iteration": 2.758450984954834 }, { "auxiliary_loss_clip": 0.01192971, "auxiliary_loss_mlp": 0.01059859, "balance_loss_clip": 1.03825796, "balance_loss_mlp": 1.05194545, "epoch": 0.09571621824740718, "flos": 18478410670080.0, "grad_norm": 2.475371847478917, "language_loss": 0.73699546, "learning_rate": 3.9104549019616855e-06, "loss": 0.75952375, "num_input_tokens_seen": 34215330, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.234375, "step": 1592, "time_per_iteration": 2.5650553703308105 }, { "auxiliary_loss_clip": 0.01180921, "auxiliary_loss_mlp": 0.01052451, "balance_loss_clip": 1.03094542, "balance_loss_mlp": 1.05184627, "epoch": 0.09577634150007515, "flos": 29387497334400.0, "grad_norm": 1.7999426256385591, "language_loss": 0.73665243, "learning_rate": 3.910343080648495e-06, "loss": 0.75898618, "num_input_tokens_seen": 34237745, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.203125, "step": 1593, "time_per_iteration": 2.6497867107391357 }, { "auxiliary_loss_clip": 0.01185466, "auxiliary_loss_mlp": 0.01052736, "balance_loss_clip": 1.03120732, "balance_loss_mlp": 1.05676115, "epoch": 0.09583646475274313, "flos": 22747830157440.0, "grad_norm": 1.8045503120874404, "language_loss": 0.69588333, "learning_rate": 3.910231191160074e-06, "loss": 0.71826535, "num_input_tokens_seen": 34256565, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.1953125, "step": 1594, "time_per_iteration": 2.550628900527954 }, { "auxiliary_loss_clip": 0.01183303, "auxiliary_loss_mlp": 0.01054669, "balance_loss_clip": 1.03416538, "balance_loss_mlp": 1.0539453, "epoch": 0.0958965880054111, "flos": 23218367345280.0, "grad_norm": 2.8240593796839875, "language_loss": 0.82509267, "learning_rate": 3.910119233500415e-06, "loss": 0.84747243, "num_input_tokens_seen": 34275970, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.203125, "step": 1595, "time_per_iteration": 2.603003978729248 }, { "auxiliary_loss_clip": 0.01192613, "auxiliary_loss_mlp": 0.01051134, "balance_loss_clip": 1.02871084, "balance_loss_mlp": 1.05403233, "epoch": 0.09595671125807906, "flos": 21324438933120.0, "grad_norm": 1.7822259185196876, "language_loss": 0.84407645, "learning_rate": 3.910007207673514e-06, "loss": 0.86651397, "num_input_tokens_seen": 34295490, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.203125, "step": 1596, "time_per_iteration": 5.553807258605957 }, { "auxiliary_loss_clip": 0.01187326, "auxiliary_loss_mlp": 0.01051775, "balance_loss_clip": 1.0287199, "balance_loss_mlp": 1.05659056, "epoch": 0.09601683451074704, "flos": 39603472185600.0, "grad_norm": 2.092394240234029, "language_loss": 0.69233537, "learning_rate": 3.909895113683369e-06, "loss": 0.71472633, "num_input_tokens_seen": 34319990, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.21875, "step": 1597, "time_per_iteration": 4.192671060562134 }, { "auxiliary_loss_clip": 0.01183037, "auxiliary_loss_mlp": 0.012925, "balance_loss_clip": 1.02216601, "balance_loss_mlp": 1.05138373, "epoch": 0.096076957763415, "flos": 23732716147200.0, "grad_norm": 2.019184112108147, "language_loss": 0.74183905, "learning_rate": 3.9097829515339805e-06, "loss": 0.76659441, "num_input_tokens_seen": 34339225, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.2265625, "step": 1598, "time_per_iteration": 2.598984479904175 }, { "auxiliary_loss_clip": 0.01187634, "auxiliary_loss_mlp": 0.01048481, "balance_loss_clip": 1.02574778, "balance_loss_mlp": 1.05529833, "epoch": 0.09613708101608297, "flos": 34678108483200.0, "grad_norm": 1.7585203147184336, "language_loss": 0.69011986, "learning_rate": 3.909670721229351e-06, "loss": 0.71248102, "num_input_tokens_seen": 34361020, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.234375, "step": 1599, "time_per_iteration": 2.7624576091766357 }, { "auxiliary_loss_clip": 0.01202958, "auxiliary_loss_mlp": 0.01055147, "balance_loss_clip": 1.033988, "balance_loss_mlp": 1.05457854, "epoch": 0.09619720426875093, "flos": 20740028653440.0, "grad_norm": 2.1952541875207694, "language_loss": 0.84043312, "learning_rate": 3.909558422773485e-06, "loss": 0.86301422, "num_input_tokens_seen": 34378630, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.2109375, "step": 1600, "time_per_iteration": 4.07806921005249 }, { "auxiliary_loss_clip": 0.01192383, "auxiliary_loss_mlp": 0.01053034, "balance_loss_clip": 1.03164792, "balance_loss_mlp": 1.05447817, "epoch": 0.09625732752141891, "flos": 13042720488960.0, "grad_norm": 1.9028832729414438, "language_loss": 0.80167437, "learning_rate": 3.909446056170392e-06, "loss": 0.82412851, "num_input_tokens_seen": 34397110, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1953125, "step": 1601, "time_per_iteration": 2.7375330924987793 }, { "auxiliary_loss_clip": 0.01186176, "auxiliary_loss_mlp": 0.01058165, "balance_loss_clip": 1.03605163, "balance_loss_mlp": 1.05454421, "epoch": 0.09631745077408688, "flos": 22273629782400.0, "grad_norm": 2.3622425872415445, "language_loss": 0.82665825, "learning_rate": 3.9093336214240805e-06, "loss": 0.84910166, "num_input_tokens_seen": 34414165, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.2265625, "step": 1602, "time_per_iteration": 2.7234904766082764 }, { "auxiliary_loss_clip": 0.01201409, "auxiliary_loss_mlp": 0.01051987, "balance_loss_clip": 1.02921796, "balance_loss_mlp": 1.0555836, "epoch": 0.09637757402675484, "flos": 24754266944640.0, "grad_norm": 1.8680331989031878, "language_loss": 0.62965316, "learning_rate": 3.9092211185385625e-06, "loss": 0.65218711, "num_input_tokens_seen": 34434445, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.1875, "step": 1603, "time_per_iteration": 2.617295265197754 }, { "auxiliary_loss_clip": 0.01177074, "auxiliary_loss_mlp": 0.01052184, "balance_loss_clip": 1.02884305, "balance_loss_mlp": 1.05574834, "epoch": 0.09643769727942282, "flos": 22525758322560.0, "grad_norm": 2.875050873869056, "language_loss": 0.71289861, "learning_rate": 3.909108547517855e-06, "loss": 0.73519123, "num_input_tokens_seen": 34453095, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.2109375, "step": 1604, "time_per_iteration": 2.667236328125 }, { "auxiliary_loss_clip": 0.01182406, "auxiliary_loss_mlp": 0.01055854, "balance_loss_clip": 1.03448009, "balance_loss_mlp": 1.05439162, "epoch": 0.09649782053209079, "flos": 30921026636160.0, "grad_norm": 2.220188701030473, "language_loss": 0.79652727, "learning_rate": 3.908995908365974e-06, "loss": 0.81890988, "num_input_tokens_seen": 34473680, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1875, "step": 1605, "time_per_iteration": 2.6191694736480713 }, { "auxiliary_loss_clip": 0.0119648, "auxiliary_loss_mlp": 0.01044472, "balance_loss_clip": 1.02302575, "balance_loss_mlp": 1.0542953, "epoch": 0.09655794378475875, "flos": 25337635729920.0, "grad_norm": 1.8847701177375482, "language_loss": 0.74270666, "learning_rate": 3.908883201086939e-06, "loss": 0.76511616, "num_input_tokens_seen": 34492610, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.2421875, "step": 1606, "time_per_iteration": 2.7802419662475586 }, { "auxiliary_loss_clip": 0.01185083, "auxiliary_loss_mlp": 0.01048995, "balance_loss_clip": 1.02773976, "balance_loss_mlp": 1.05571747, "epoch": 0.09661806703742673, "flos": 22346061557760.0, "grad_norm": 1.7494359752398974, "language_loss": 0.75282371, "learning_rate": 3.908770425684774e-06, "loss": 0.77516448, "num_input_tokens_seen": 34511855, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.203125, "step": 1607, "time_per_iteration": 2.6335184574127197 }, { "auxiliary_loss_clip": 0.01204287, "auxiliary_loss_mlp": 0.01044924, "balance_loss_clip": 1.02401495, "balance_loss_mlp": 1.05476582, "epoch": 0.0966781902900947, "flos": 17457578144640.0, "grad_norm": 1.8209621629092878, "language_loss": 0.8634243, "learning_rate": 3.908657582163501e-06, "loss": 0.88591641, "num_input_tokens_seen": 34528905, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.21875, "step": 1608, "time_per_iteration": 2.636918783187866 }, { "auxiliary_loss_clip": 0.01228489, "auxiliary_loss_mlp": 0.01056385, "balance_loss_clip": 1.03473628, "balance_loss_mlp": 1.05653214, "epoch": 0.09673831354276266, "flos": 36903995412480.0, "grad_norm": 2.1849946363351247, "language_loss": 0.71345961, "learning_rate": 3.90854467052715e-06, "loss": 0.7363084, "num_input_tokens_seen": 34548480, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.265625, "step": 1609, "time_per_iteration": 2.800828695297241 }, { "auxiliary_loss_clip": 0.01201092, "auxiliary_loss_mlp": 0.0105517, "balance_loss_clip": 1.03423667, "balance_loss_mlp": 1.05365086, "epoch": 0.09679843679543064, "flos": 20701388597760.0, "grad_norm": 2.648039882709159, "language_loss": 0.84713644, "learning_rate": 3.908431690779748e-06, "loss": 0.869699, "num_input_tokens_seen": 34565410, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.203125, "step": 1610, "time_per_iteration": 2.6365485191345215 }, { "auxiliary_loss_clip": 0.01194996, "auxiliary_loss_mlp": 0.01049543, "balance_loss_clip": 1.02715564, "balance_loss_mlp": 1.05672765, "epoch": 0.0968585600480986, "flos": 23514415240320.0, "grad_norm": 6.679344957774369, "language_loss": 0.67082012, "learning_rate": 3.9083186429253284e-06, "loss": 0.6932655, "num_input_tokens_seen": 34584840, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.203125, "step": 1611, "time_per_iteration": 2.651350736618042 }, { "auxiliary_loss_clip": 0.01212162, "auxiliary_loss_mlp": 0.01049922, "balance_loss_clip": 1.02971649, "balance_loss_mlp": 1.05606258, "epoch": 0.09691868330076657, "flos": 20121072468480.0, "grad_norm": 2.0225808355957513, "language_loss": 0.80937374, "learning_rate": 3.908205526967925e-06, "loss": 0.83199465, "num_input_tokens_seen": 34603360, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.1953125, "step": 1612, "time_per_iteration": 2.7388837337493896 }, { "auxiliary_loss_clip": 0.0118869, "auxiliary_loss_mlp": 0.01060898, "balance_loss_clip": 1.03934515, "balance_loss_mlp": 1.05834532, "epoch": 0.09697880655343454, "flos": 16544692967040.0, "grad_norm": 1.9952627000099612, "language_loss": 0.80585682, "learning_rate": 3.9080923429115755e-06, "loss": 0.82835269, "num_input_tokens_seen": 34620760, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.2109375, "step": 1613, "time_per_iteration": 2.707699775695801 }, { "auxiliary_loss_clip": 0.01203038, "auxiliary_loss_mlp": 0.01053072, "balance_loss_clip": 1.03068495, "balance_loss_mlp": 1.0546726, "epoch": 0.09703892980610251, "flos": 26104184899200.0, "grad_norm": 2.5138021477295793, "language_loss": 0.83975542, "learning_rate": 3.907979090760318e-06, "loss": 0.86231649, "num_input_tokens_seen": 34640695, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.2109375, "step": 1614, "time_per_iteration": 2.605928897857666 }, { "auxiliary_loss_clip": 0.01081637, "auxiliary_loss_mlp": 0.01263145, "balance_loss_clip": 1.01131427, "balance_loss_mlp": 1.02721858, "epoch": 0.09709905305877048, "flos": 60443622000000.0, "grad_norm": 0.7143731564982216, "language_loss": 0.54627448, "learning_rate": 3.907865770518194e-06, "loss": 0.56972235, "num_input_tokens_seen": 34702395, "router_z_loss_clip": 0.03015137, "router_z_loss_mlp": 0.453125, "step": 1615, "time_per_iteration": 3.117008924484253 }, { "auxiliary_loss_clip": 0.01109004, "auxiliary_loss_mlp": 0.01010631, "balance_loss_clip": 1.00761533, "balance_loss_mlp": 1.02720821, "epoch": 0.09715917631143844, "flos": 57639932893440.0, "grad_norm": 0.7621778988627429, "language_loss": 0.58256966, "learning_rate": 3.9077523821892495e-06, "loss": 0.60376602, "num_input_tokens_seen": 34768910, "router_z_loss_clip": 0.03015137, "router_z_loss_mlp": 0.453125, "step": 1616, "time_per_iteration": 3.3548123836517334 }, { "auxiliary_loss_clip": 0.01197917, "auxiliary_loss_mlp": 0.01054383, "balance_loss_clip": 1.0313046, "balance_loss_mlp": 1.05694294, "epoch": 0.09721929956410642, "flos": 20558212986240.0, "grad_norm": 2.053895023022678, "language_loss": 0.69270408, "learning_rate": 3.907638925777529e-06, "loss": 0.71522701, "num_input_tokens_seen": 34787680, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.2265625, "step": 1617, "time_per_iteration": 2.5902299880981445 }, { "auxiliary_loss_clip": 0.0120501, "auxiliary_loss_mlp": 0.01053596, "balance_loss_clip": 1.02927685, "balance_loss_mlp": 1.05531192, "epoch": 0.09727942281677439, "flos": 27344359825920.0, "grad_norm": 1.7877339863183708, "language_loss": 0.80318832, "learning_rate": 3.907525401287082e-06, "loss": 0.82577437, "num_input_tokens_seen": 34808330, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 1.21875, "step": 1618, "time_per_iteration": 2.6946871280670166 }, { "auxiliary_loss_clip": 0.01192456, "auxiliary_loss_mlp": 0.01051705, "balance_loss_clip": 1.03141606, "balance_loss_mlp": 1.05650973, "epoch": 0.09733954606944235, "flos": 24900028335360.0, "grad_norm": 1.5724671748682049, "language_loss": 0.92971218, "learning_rate": 3.907411808721961e-06, "loss": 0.95215386, "num_input_tokens_seen": 34830020, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1796875, "step": 1619, "time_per_iteration": 2.693848133087158 }, { "auxiliary_loss_clip": 0.01185301, "auxiliary_loss_mlp": 0.01052337, "balance_loss_clip": 1.03047419, "balance_loss_mlp": 1.05936694, "epoch": 0.09739966932211033, "flos": 31503928544640.0, "grad_norm": 2.061146148413351, "language_loss": 0.88355219, "learning_rate": 3.907298148086219e-06, "loss": 0.90592855, "num_input_tokens_seen": 34850330, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.1640625, "step": 1620, "time_per_iteration": 2.6683943271636963 }, { "auxiliary_loss_clip": 0.01195044, "auxiliary_loss_mlp": 0.01058627, "balance_loss_clip": 1.03577447, "balance_loss_mlp": 1.05504119, "epoch": 0.0974597925747783, "flos": 23878764846720.0, "grad_norm": 2.0475463131195237, "language_loss": 0.76920199, "learning_rate": 3.907184419383912e-06, "loss": 0.79173875, "num_input_tokens_seen": 34871640, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.21875, "step": 1621, "time_per_iteration": 2.614056348800659 }, { "auxiliary_loss_clip": 0.01201576, "auxiliary_loss_mlp": 0.01068091, "balance_loss_clip": 1.04660916, "balance_loss_mlp": 1.05307543, "epoch": 0.09751991582744626, "flos": 17019575700480.0, "grad_norm": 2.0384133698832194, "language_loss": 0.77646387, "learning_rate": 3.907070622619099e-06, "loss": 0.7991606, "num_input_tokens_seen": 34888100, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.2109375, "step": 1622, "time_per_iteration": 2.624828338623047 }, { "auxiliary_loss_clip": 0.0119547, "auxiliary_loss_mlp": 0.01062138, "balance_loss_clip": 1.04051375, "balance_loss_mlp": 1.05460405, "epoch": 0.09758003908011423, "flos": 28402826826240.0, "grad_norm": 1.936659281890359, "language_loss": 0.85646272, "learning_rate": 3.906956757795841e-06, "loss": 0.87903887, "num_input_tokens_seen": 34910485, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.2265625, "step": 1623, "time_per_iteration": 2.610678195953369 }, { "auxiliary_loss_clip": 0.01183847, "auxiliary_loss_mlp": 0.01061626, "balance_loss_clip": 1.03959608, "balance_loss_mlp": 1.05572939, "epoch": 0.09764016233278221, "flos": 18144297336960.0, "grad_norm": 14.883268220598422, "language_loss": 0.80528939, "learning_rate": 3.906842824918201e-06, "loss": 0.82774407, "num_input_tokens_seen": 34928615, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.1875, "step": 1624, "time_per_iteration": 2.66172456741333 }, { "auxiliary_loss_clip": 0.01182263, "auxiliary_loss_mlp": 0.01051088, "balance_loss_clip": 1.03104901, "balance_loss_mlp": 1.05293894, "epoch": 0.09770028558545017, "flos": 15265842071040.0, "grad_norm": 2.231648005481046, "language_loss": 0.85181653, "learning_rate": 3.906728823990246e-06, "loss": 0.87415004, "num_input_tokens_seen": 34946045, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.203125, "step": 1625, "time_per_iteration": 2.521904468536377 }, { "auxiliary_loss_clip": 0.01190089, "auxiliary_loss_mlp": 0.0106644, "balance_loss_clip": 1.04600787, "balance_loss_mlp": 1.05859077, "epoch": 0.09776040883811814, "flos": 23472435219840.0, "grad_norm": 2.14304828359855, "language_loss": 0.85652667, "learning_rate": 3.906614755016044e-06, "loss": 0.87909192, "num_input_tokens_seen": 34962865, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.2265625, "step": 1626, "time_per_iteration": 2.6078948974609375 }, { "auxiliary_loss_clip": 0.01193547, "auxiliary_loss_mlp": 0.01314526, "balance_loss_clip": 1.04275751, "balance_loss_mlp": 1.05991077, "epoch": 0.09782053209078612, "flos": 24499480798080.0, "grad_norm": 2.3105810227708337, "language_loss": 0.8267113, "learning_rate": 3.9065006179996655e-06, "loss": 0.85179204, "num_input_tokens_seen": 34983505, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.2421875, "step": 1627, "time_per_iteration": 2.585733652114868 }, { "auxiliary_loss_clip": 0.01183419, "auxiliary_loss_mlp": 0.01057562, "balance_loss_clip": 1.03666472, "balance_loss_mlp": 1.05523455, "epoch": 0.09788065534345408, "flos": 21580158833280.0, "grad_norm": 2.0429707929508125, "language_loss": 0.84378868, "learning_rate": 3.906386412945184e-06, "loss": 0.86619848, "num_input_tokens_seen": 35001825, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1875, "step": 1628, "time_per_iteration": 2.596994400024414 }, { "auxiliary_loss_clip": 0.01181854, "auxiliary_loss_mlp": 0.01056733, "balance_loss_clip": 1.03564465, "balance_loss_mlp": 1.05429101, "epoch": 0.09794077859612205, "flos": 23842459175040.0, "grad_norm": 1.6385607137995009, "language_loss": 0.75407851, "learning_rate": 3.906272139856676e-06, "loss": 0.77646434, "num_input_tokens_seen": 35023075, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1796875, "step": 1629, "time_per_iteration": 2.641514539718628 }, { "auxiliary_loss_clip": 0.01214664, "auxiliary_loss_mlp": 0.01055696, "balance_loss_clip": 1.03330898, "balance_loss_mlp": 1.05805206, "epoch": 0.09800090184879003, "flos": 23659889322240.0, "grad_norm": 2.2313461848102922, "language_loss": 0.78383148, "learning_rate": 3.906157798738218e-06, "loss": 0.80653507, "num_input_tokens_seen": 35043480, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.203125, "step": 1630, "time_per_iteration": 2.6980154514312744 }, { "auxiliary_loss_clip": 0.01198805, "auxiliary_loss_mlp": 0.01051865, "balance_loss_clip": 1.02897668, "balance_loss_mlp": 1.0598979, "epoch": 0.09806102510145799, "flos": 17055773631360.0, "grad_norm": 1.90597590506307, "language_loss": 0.86465746, "learning_rate": 3.906043389593892e-06, "loss": 0.88716412, "num_input_tokens_seen": 35061490, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.203125, "step": 1631, "time_per_iteration": 2.5872576236724854 }, { "auxiliary_loss_clip": 0.01193232, "auxiliary_loss_mlp": 0.01056328, "balance_loss_clip": 1.03595495, "balance_loss_mlp": 1.05756068, "epoch": 0.09812114835412596, "flos": 23878477537920.0, "grad_norm": 2.0208943074104435, "language_loss": 0.83147091, "learning_rate": 3.9059289124277804e-06, "loss": 0.85396653, "num_input_tokens_seen": 35079670, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.171875, "step": 1632, "time_per_iteration": 2.657179117202759 }, { "auxiliary_loss_clip": 0.01205673, "auxiliary_loss_mlp": 0.01058617, "balance_loss_clip": 1.03812516, "balance_loss_mlp": 1.05702496, "epoch": 0.09818127160679392, "flos": 20595488325120.0, "grad_norm": 2.850160604088893, "language_loss": 0.78069741, "learning_rate": 3.9058143672439684e-06, "loss": 0.80334032, "num_input_tokens_seen": 35099205, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.2109375, "step": 1633, "time_per_iteration": 2.5817644596099854 }, { "auxiliary_loss_clip": 0.01181655, "auxiliary_loss_mlp": 0.01053737, "balance_loss_clip": 1.03214788, "balance_loss_mlp": 1.05473256, "epoch": 0.0982413948594619, "flos": 15487339288320.0, "grad_norm": 4.365474304726175, "language_loss": 0.72796643, "learning_rate": 3.905699754046544e-06, "loss": 0.75032032, "num_input_tokens_seen": 35115270, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.1796875, "step": 1634, "time_per_iteration": 2.5715417861938477 }, { "auxiliary_loss_clip": 0.01199716, "auxiliary_loss_mlp": 0.01065855, "balance_loss_clip": 1.04356277, "balance_loss_mlp": 1.05526137, "epoch": 0.09830151811212987, "flos": 24207958016640.0, "grad_norm": 2.767467793086232, "language_loss": 0.72684789, "learning_rate": 3.905585072839597e-06, "loss": 0.74950361, "num_input_tokens_seen": 35134065, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.2578125, "step": 1635, "time_per_iteration": 2.6991310119628906 }, { "auxiliary_loss_clip": 0.01201155, "auxiliary_loss_mlp": 0.01059884, "balance_loss_clip": 1.03750849, "balance_loss_mlp": 1.05910599, "epoch": 0.09836164136479783, "flos": 20594590485120.0, "grad_norm": 2.097563306556665, "language_loss": 0.78071296, "learning_rate": 3.905470323627221e-06, "loss": 0.80332333, "num_input_tokens_seen": 35154870, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.234375, "step": 1636, "time_per_iteration": 2.728501081466675 }, { "auxiliary_loss_clip": 0.01194294, "auxiliary_loss_mlp": 0.01056194, "balance_loss_clip": 1.03456926, "balance_loss_mlp": 1.05621767, "epoch": 0.09842176461746581, "flos": 19934157070080.0, "grad_norm": 1.967956293157026, "language_loss": 0.69783449, "learning_rate": 3.9053555064135106e-06, "loss": 0.7203393, "num_input_tokens_seen": 35171850, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.203125, "step": 1637, "time_per_iteration": 2.6168503761291504 }, { "auxiliary_loss_clip": 0.01178134, "auxiliary_loss_mlp": 0.01056194, "balance_loss_clip": 1.03414011, "balance_loss_mlp": 1.05691624, "epoch": 0.09848188787013377, "flos": 21214659991680.0, "grad_norm": 2.121211227469307, "language_loss": 0.76873982, "learning_rate": 3.905240621202563e-06, "loss": 0.79108316, "num_input_tokens_seen": 35188795, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.21875, "step": 1638, "time_per_iteration": 4.066016435623169 }, { "auxiliary_loss_clip": 0.01199345, "auxiliary_loss_mlp": 0.01046423, "balance_loss_clip": 1.02582395, "balance_loss_mlp": 1.05418134, "epoch": 0.09854201112280174, "flos": 30154226071680.0, "grad_norm": 1.4533559793903597, "language_loss": 0.72358549, "learning_rate": 3.905125667998478e-06, "loss": 0.74604315, "num_input_tokens_seen": 35212100, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.1796875, "step": 1639, "time_per_iteration": 4.055445432662964 }, { "auxiliary_loss_clip": 0.01183567, "auxiliary_loss_mlp": 0.01046686, "balance_loss_clip": 1.02450144, "balance_loss_mlp": 1.05428541, "epoch": 0.09860213437546972, "flos": 21795730306560.0, "grad_norm": 1.7963912656992862, "language_loss": 0.88291746, "learning_rate": 3.90501064680536e-06, "loss": 0.90521997, "num_input_tokens_seen": 35230390, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.203125, "step": 1640, "time_per_iteration": 2.6052207946777344 }, { "auxiliary_loss_clip": 0.01185008, "auxiliary_loss_mlp": 0.01041559, "balance_loss_clip": 1.02066159, "balance_loss_mlp": 1.05552411, "epoch": 0.09866225762813768, "flos": 21835555511040.0, "grad_norm": 2.1707880159658695, "language_loss": 0.80668545, "learning_rate": 3.904895557627311e-06, "loss": 0.82895112, "num_input_tokens_seen": 35250405, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.203125, "step": 1641, "time_per_iteration": 2.526620864868164 }, { "auxiliary_loss_clip": 0.01182385, "auxiliary_loss_mlp": 0.01054835, "balance_loss_clip": 1.03176832, "balance_loss_mlp": 1.0536325, "epoch": 0.09872238088080565, "flos": 17599855916160.0, "grad_norm": 2.7098727294737075, "language_loss": 0.85891628, "learning_rate": 3.90478040046844e-06, "loss": 0.88128841, "num_input_tokens_seen": 35262820, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.1953125, "step": 1642, "time_per_iteration": 4.122991561889648 }, { "auxiliary_loss_clip": 0.01203967, "auxiliary_loss_mlp": 0.01051977, "balance_loss_clip": 1.03035223, "balance_loss_mlp": 1.05661535, "epoch": 0.09878250413347361, "flos": 27636134002560.0, "grad_norm": 1.7520734609862787, "language_loss": 0.80433381, "learning_rate": 3.9046651753328565e-06, "loss": 0.82689321, "num_input_tokens_seen": 35284490, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1953125, "step": 1643, "time_per_iteration": 2.668992757797241 }, { "auxiliary_loss_clip": 0.01182574, "auxiliary_loss_mlp": 0.01063573, "balance_loss_clip": 1.04212761, "balance_loss_mlp": 1.05387473, "epoch": 0.0988426273861416, "flos": 16544728880640.0, "grad_norm": 2.281605104663637, "language_loss": 0.82332408, "learning_rate": 3.904549882224672e-06, "loss": 0.84578562, "num_input_tokens_seen": 35302815, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1953125, "step": 1644, "time_per_iteration": 2.5656096935272217 }, { "auxiliary_loss_clip": 0.0117804, "auxiliary_loss_mlp": 0.01296613, "balance_loss_clip": 1.02670407, "balance_loss_mlp": 1.05295563, "epoch": 0.09890275063880956, "flos": 21215270522880.0, "grad_norm": 1.7092202200054878, "language_loss": 0.67970508, "learning_rate": 3.904434521148001e-06, "loss": 0.70445168, "num_input_tokens_seen": 35321175, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.15625, "step": 1645, "time_per_iteration": 2.5547966957092285 }, { "auxiliary_loss_clip": 0.0108893, "auxiliary_loss_mlp": 0.0100654, "balance_loss_clip": 1.00348842, "balance_loss_mlp": 1.02567232, "epoch": 0.09896287389147752, "flos": 59379372910080.0, "grad_norm": 0.8465288848353411, "language_loss": 0.60867333, "learning_rate": 3.904319092106961e-06, "loss": 0.629628, "num_input_tokens_seen": 35381740, "router_z_loss_clip": 0.03051758, "router_z_loss_mlp": 0.45117188, "step": 1646, "time_per_iteration": 3.1074275970458984 }, { "auxiliary_loss_clip": 0.0119927, "auxiliary_loss_mlp": 0.01053226, "balance_loss_clip": 1.03061247, "balance_loss_mlp": 1.05306315, "epoch": 0.0990229971441455, "flos": 29642678530560.0, "grad_norm": 1.753272609533345, "language_loss": 0.732544, "learning_rate": 3.904203595105671e-06, "loss": 0.75506896, "num_input_tokens_seen": 35403760, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 1.1875, "step": 1647, "time_per_iteration": 2.842357873916626 }, { "auxiliary_loss_clip": 0.01199555, "auxiliary_loss_mlp": 0.0105967, "balance_loss_clip": 1.03982162, "balance_loss_mlp": 1.05336094, "epoch": 0.09908312039681347, "flos": 21834873152640.0, "grad_norm": 2.5770499561196054, "language_loss": 0.83426577, "learning_rate": 3.904088030148253e-06, "loss": 0.85685802, "num_input_tokens_seen": 35424050, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.1875, "step": 1648, "time_per_iteration": 2.678703546524048 }, { "auxiliary_loss_clip": 0.01076808, "auxiliary_loss_mlp": 0.01005089, "balance_loss_clip": 1.00218058, "balance_loss_mlp": 1.02312732, "epoch": 0.09914324364948143, "flos": 57564304807680.0, "grad_norm": 0.7285150507017519, "language_loss": 0.55725205, "learning_rate": 3.90397239723883e-06, "loss": 0.578071, "num_input_tokens_seen": 35481690, "router_z_loss_clip": 0.02905273, "router_z_loss_mlp": 0.4453125, "step": 1649, "time_per_iteration": 3.079543113708496 }, { "auxiliary_loss_clip": 0.0117785, "auxiliary_loss_mlp": 0.01041652, "balance_loss_clip": 1.02095687, "balance_loss_mlp": 1.05157208, "epoch": 0.09920336690214941, "flos": 34123934476800.0, "grad_norm": 2.28894535194188, "language_loss": 0.90279037, "learning_rate": 3.903856696381531e-06, "loss": 0.92498541, "num_input_tokens_seen": 35498635, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.171875, "step": 1650, "time_per_iteration": 2.726924180984497 }, { "auxiliary_loss_clip": 0.01099862, "auxiliary_loss_mlp": 0.01254423, "balance_loss_clip": 1.00280905, "balance_loss_mlp": 1.01894045, "epoch": 0.09926349015481738, "flos": 71216428464000.0, "grad_norm": 0.7946863320215308, "language_loss": 0.63719928, "learning_rate": 3.903740927580484e-06, "loss": 0.6607421, "num_input_tokens_seen": 35565720, "router_z_loss_clip": 0.0279541, "router_z_loss_mlp": 0.4453125, "step": 1651, "time_per_iteration": 3.348621129989624 }, { "auxiliary_loss_clip": 0.01180639, "auxiliary_loss_mlp": 0.01060845, "balance_loss_clip": 1.03892255, "balance_loss_mlp": 1.05475307, "epoch": 0.09932361340748534, "flos": 23148700917120.0, "grad_norm": 2.0729673949239404, "language_loss": 0.87456268, "learning_rate": 3.90362509083982e-06, "loss": 0.89697754, "num_input_tokens_seen": 35586000, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.171875, "step": 1652, "time_per_iteration": 2.6789848804473877 }, { "auxiliary_loss_clip": 0.01196107, "auxiliary_loss_mlp": 0.01053667, "balance_loss_clip": 1.03244805, "balance_loss_mlp": 1.05913329, "epoch": 0.09938373666015332, "flos": 19828651847040.0, "grad_norm": 1.8400486213752971, "language_loss": 0.82225662, "learning_rate": 3.903509186163673e-06, "loss": 0.84475434, "num_input_tokens_seen": 35604355, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1875, "step": 1653, "time_per_iteration": 2.6220738887786865 }, { "auxiliary_loss_clip": 0.01187187, "auxiliary_loss_mlp": 0.01303837, "balance_loss_clip": 1.03382695, "balance_loss_mlp": 1.05792499, "epoch": 0.09944385991282129, "flos": 22090664880000.0, "grad_norm": 1.6960461422028597, "language_loss": 0.79112202, "learning_rate": 3.903393213556179e-06, "loss": 0.81603229, "num_input_tokens_seen": 35625495, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.203125, "step": 1654, "time_per_iteration": 2.639896869659424 }, { "auxiliary_loss_clip": 0.01176127, "auxiliary_loss_mlp": 0.01055344, "balance_loss_clip": 1.03548336, "balance_loss_mlp": 1.06089842, "epoch": 0.09950398316548925, "flos": 19828867328640.0, "grad_norm": 1.6732872261976757, "language_loss": 0.80782634, "learning_rate": 3.903277173021479e-06, "loss": 0.83014107, "num_input_tokens_seen": 35645030, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.15625, "step": 1655, "time_per_iteration": 2.5313875675201416 }, { "auxiliary_loss_clip": 0.0117113, "auxiliary_loss_mlp": 0.01054146, "balance_loss_clip": 1.03261662, "balance_loss_mlp": 1.05411959, "epoch": 0.09956410641815722, "flos": 25003701964800.0, "grad_norm": 1.6984572887570129, "language_loss": 0.80194205, "learning_rate": 3.903161064563712e-06, "loss": 0.82419479, "num_input_tokens_seen": 35664305, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.171875, "step": 1656, "time_per_iteration": 2.7190589904785156 }, { "auxiliary_loss_clip": 0.01183205, "auxiliary_loss_mlp": 0.0105786, "balance_loss_clip": 1.03705764, "balance_loss_mlp": 1.0570569, "epoch": 0.0996242296708252, "flos": 19317714837120.0, "grad_norm": 1.7540491032670027, "language_loss": 0.88755631, "learning_rate": 3.9030448881870206e-06, "loss": 0.90996695, "num_input_tokens_seen": 35684060, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.171875, "step": 1657, "time_per_iteration": 2.691423177719116 }, { "auxiliary_loss_clip": 0.01199236, "auxiliary_loss_mlp": 0.0105498, "balance_loss_clip": 1.0316627, "balance_loss_mlp": 1.0567596, "epoch": 0.09968435292349316, "flos": 21871609787520.0, "grad_norm": 2.4723959553963506, "language_loss": 0.84771043, "learning_rate": 3.902928643895554e-06, "loss": 0.87025261, "num_input_tokens_seen": 35703250, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.2421875, "step": 1658, "time_per_iteration": 2.6473708152770996 }, { "auxiliary_loss_clip": 0.01064095, "auxiliary_loss_mlp": 0.01004488, "balance_loss_clip": 1.00150776, "balance_loss_mlp": 1.01918948, "epoch": 0.09974447617616113, "flos": 65384533313280.0, "grad_norm": 0.9021535523969803, "language_loss": 0.60801518, "learning_rate": 3.9028123316934575e-06, "loss": 0.62870097, "num_input_tokens_seen": 35762165, "router_z_loss_clip": 0.02978516, "router_z_loss_mlp": 0.44921875, "step": 1659, "time_per_iteration": 3.2024447917938232 }, { "auxiliary_loss_clip": 0.01192971, "auxiliary_loss_mlp": 0.010527, "balance_loss_clip": 1.03074169, "balance_loss_mlp": 1.05690539, "epoch": 0.0998045994288291, "flos": 23659817495040.0, "grad_norm": 2.334919003485038, "language_loss": 0.84857023, "learning_rate": 3.902695951584885e-06, "loss": 0.87102693, "num_input_tokens_seen": 35781520, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.171875, "step": 1660, "time_per_iteration": 2.681281089782715 }, { "auxiliary_loss_clip": 0.01193647, "auxiliary_loss_mlp": 0.01057465, "balance_loss_clip": 1.03504181, "balance_loss_mlp": 1.05841422, "epoch": 0.09986472268149707, "flos": 19609704495360.0, "grad_norm": 2.5598184344751016, "language_loss": 0.79752064, "learning_rate": 3.902579503573987e-06, "loss": 0.82003176, "num_input_tokens_seen": 35799565, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.171875, "step": 1661, "time_per_iteration": 2.6049258708953857 }, { "auxiliary_loss_clip": 0.01196378, "auxiliary_loss_mlp": 0.01051638, "balance_loss_clip": 1.02884531, "balance_loss_mlp": 1.05375695, "epoch": 0.09992484593416504, "flos": 26213317395840.0, "grad_norm": 1.8613324667926967, "language_loss": 0.83373559, "learning_rate": 3.902462987664922e-06, "loss": 0.85621578, "num_input_tokens_seen": 35821085, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.2421875, "step": 1662, "time_per_iteration": 2.6259560585021973 }, { "auxiliary_loss_clip": 0.01193838, "auxiliary_loss_mlp": 0.01062879, "balance_loss_clip": 1.03896594, "balance_loss_mlp": 1.05767822, "epoch": 0.09998496918683301, "flos": 17493632421120.0, "grad_norm": 2.7195756022730952, "language_loss": 0.8890565, "learning_rate": 3.902346403861846e-06, "loss": 0.91162372, "num_input_tokens_seen": 35839840, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.171875, "step": 1663, "time_per_iteration": 2.660910129547119 }, { "auxiliary_loss_clip": 0.01187662, "auxiliary_loss_mlp": 0.01054396, "balance_loss_clip": 1.03157902, "balance_loss_mlp": 1.05804515, "epoch": 0.10004509243950098, "flos": 22784925928320.0, "grad_norm": 1.9554213187236635, "language_loss": 0.69711268, "learning_rate": 3.9022297521689196e-06, "loss": 0.71953326, "num_input_tokens_seen": 35861545, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.203125, "step": 1664, "time_per_iteration": 2.6926207542419434 }, { "auxiliary_loss_clip": 0.01206111, "auxiliary_loss_mlp": 0.01048866, "balance_loss_clip": 1.027349, "balance_loss_mlp": 1.05980849, "epoch": 0.10010521569216894, "flos": 16253385667200.0, "grad_norm": 2.1974279053080346, "language_loss": 0.78051126, "learning_rate": 3.902113032590307e-06, "loss": 0.80306107, "num_input_tokens_seen": 35878295, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1875, "step": 1665, "time_per_iteration": 2.681427001953125 }, { "auxiliary_loss_clip": 0.01189387, "auxiliary_loss_mlp": 0.0106488, "balance_loss_clip": 1.04350603, "balance_loss_mlp": 1.0610888, "epoch": 0.10016533894483691, "flos": 23402589223680.0, "grad_norm": 1.8832584014596416, "language_loss": 0.69792855, "learning_rate": 3.901996245130174e-06, "loss": 0.72047126, "num_input_tokens_seen": 35898990, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1953125, "step": 1666, "time_per_iteration": 2.596313953399658 }, { "auxiliary_loss_clip": 0.012325, "auxiliary_loss_mlp": 0.01061277, "balance_loss_clip": 1.03810287, "balance_loss_mlp": 1.05626631, "epoch": 0.10022546219750489, "flos": 19354164163200.0, "grad_norm": 1.8421884103017838, "language_loss": 0.78676897, "learning_rate": 3.901879389792686e-06, "loss": 0.80970669, "num_input_tokens_seen": 35916225, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.21875, "step": 1667, "time_per_iteration": 2.7079851627349854 }, { "auxiliary_loss_clip": 0.01193042, "auxiliary_loss_mlp": 0.01054008, "balance_loss_clip": 1.03114343, "balance_loss_mlp": 1.0554595, "epoch": 0.10028558545017285, "flos": 27085766837760.0, "grad_norm": 2.763877828000517, "language_loss": 0.76916385, "learning_rate": 3.9017624665820155e-06, "loss": 0.79163432, "num_input_tokens_seen": 35934630, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.1875, "step": 1668, "time_per_iteration": 2.663860559463501 }, { "auxiliary_loss_clip": 0.01175819, "auxiliary_loss_mlp": 0.01052793, "balance_loss_clip": 1.03031051, "balance_loss_mlp": 1.05646849, "epoch": 0.10034570870284082, "flos": 25847136195840.0, "grad_norm": 1.8033998731868026, "language_loss": 0.77708578, "learning_rate": 3.901645475502334e-06, "loss": 0.79937196, "num_input_tokens_seen": 35953855, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.1875, "step": 1669, "time_per_iteration": 2.707608222961426 }, { "auxiliary_loss_clip": 0.01180541, "auxiliary_loss_mlp": 0.01062757, "balance_loss_clip": 1.03968954, "balance_loss_mlp": 1.05755281, "epoch": 0.1004058319555088, "flos": 26249587153920.0, "grad_norm": 2.575267799090881, "language_loss": 0.85147887, "learning_rate": 3.901528416557817e-06, "loss": 0.87391186, "num_input_tokens_seen": 35974555, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.234375, "step": 1670, "time_per_iteration": 2.5500566959381104 }, { "auxiliary_loss_clip": 0.01165582, "auxiliary_loss_mlp": 0.01051157, "balance_loss_clip": 1.02959204, "balance_loss_mlp": 1.05240905, "epoch": 0.10046595520817676, "flos": 25374480105600.0, "grad_norm": 2.028185380752318, "language_loss": 0.76717836, "learning_rate": 3.901411289752643e-06, "loss": 0.78934574, "num_input_tokens_seen": 35996830, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.1328125, "step": 1671, "time_per_iteration": 2.664431095123291 }, { "auxiliary_loss_clip": 0.0107363, "auxiliary_loss_mlp": 0.01002081, "balance_loss_clip": 0.99955386, "balance_loss_mlp": 1.02145147, "epoch": 0.10052607846084473, "flos": 67461821677440.0, "grad_norm": 0.7758256076224188, "language_loss": 0.60735399, "learning_rate": 3.901294095090991e-06, "loss": 0.62811106, "num_input_tokens_seen": 36054465, "router_z_loss_clip": 0.02526855, "router_z_loss_mlp": 0.4296875, "step": 1672, "time_per_iteration": 3.213768243789673 }, { "auxiliary_loss_clip": 0.01202856, "auxiliary_loss_mlp": 0.01057405, "balance_loss_clip": 1.03483915, "balance_loss_mlp": 1.05403996, "epoch": 0.10058620171351271, "flos": 21360493209600.0, "grad_norm": 2.13813585264044, "language_loss": 0.76765883, "learning_rate": 3.901176832577043e-06, "loss": 0.79026145, "num_input_tokens_seen": 36073480, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.2109375, "step": 1673, "time_per_iteration": 2.6411964893341064 }, { "auxiliary_loss_clip": 0.01200626, "auxiliary_loss_mlp": 0.01053029, "balance_loss_clip": 1.0316906, "balance_loss_mlp": 1.05268145, "epoch": 0.10064632496618067, "flos": 16800125558400.0, "grad_norm": 2.0622275180555594, "language_loss": 0.72780234, "learning_rate": 3.901059502214984e-06, "loss": 0.75033891, "num_input_tokens_seen": 36091830, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.203125, "step": 1674, "time_per_iteration": 2.570789098739624 }, { "auxiliary_loss_clip": 0.01182643, "auxiliary_loss_mlp": 0.0130143, "balance_loss_clip": 1.03113627, "balance_loss_mlp": 1.05295277, "epoch": 0.10070644821884864, "flos": 23624445576960.0, "grad_norm": 4.8705880230535605, "language_loss": 0.79036981, "learning_rate": 3.900942104009003e-06, "loss": 0.81521058, "num_input_tokens_seen": 36111400, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.203125, "step": 1675, "time_per_iteration": 2.7461562156677246 }, { "auxiliary_loss_clip": 0.01171911, "auxiliary_loss_mlp": 0.01065431, "balance_loss_clip": 1.04478359, "balance_loss_mlp": 1.05378294, "epoch": 0.1007665714715166, "flos": 24462564595200.0, "grad_norm": 2.1883526971006786, "language_loss": 0.81050718, "learning_rate": 3.900824637963287e-06, "loss": 0.83288062, "num_input_tokens_seen": 36129345, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1796875, "step": 1676, "time_per_iteration": 2.634748935699463 }, { "auxiliary_loss_clip": 0.01187568, "auxiliary_loss_mlp": 0.01060094, "balance_loss_clip": 1.03877962, "balance_loss_mlp": 1.05825138, "epoch": 0.10082669472418458, "flos": 16799119977600.0, "grad_norm": 2.0470202231046644, "language_loss": 0.8609724, "learning_rate": 3.9007071040820285e-06, "loss": 0.88344908, "num_input_tokens_seen": 36146255, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.203125, "step": 1677, "time_per_iteration": 2.5929925441741943 }, { "auxiliary_loss_clip": 0.01194632, "auxiliary_loss_mlp": 0.01058738, "balance_loss_clip": 1.0367322, "balance_loss_mlp": 1.05722189, "epoch": 0.10088681797685255, "flos": 13553513844480.0, "grad_norm": 2.124700059562905, "language_loss": 0.8599233, "learning_rate": 3.900589502369423e-06, "loss": 0.88245702, "num_input_tokens_seen": 36164050, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.1875, "step": 1678, "time_per_iteration": 2.722170114517212 }, { "auxiliary_loss_clip": 0.01194295, "auxiliary_loss_mlp": 0.01052061, "balance_loss_clip": 1.02937603, "balance_loss_mlp": 1.05681992, "epoch": 0.10094694122952051, "flos": 25265706744960.0, "grad_norm": 2.2057980864075644, "language_loss": 0.89619833, "learning_rate": 3.9004718328296676e-06, "loss": 0.91866183, "num_input_tokens_seen": 36183530, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.1875, "step": 1679, "time_per_iteration": 4.195310831069946 }, { "auxiliary_loss_clip": 0.01068208, "auxiliary_loss_mlp": 0.0100402, "balance_loss_clip": 1.00164795, "balance_loss_mlp": 1.02534497, "epoch": 0.10100706448218849, "flos": 69854299885440.0, "grad_norm": 0.785090756863761, "language_loss": 0.53009844, "learning_rate": 3.900354095466962e-06, "loss": 0.55082077, "num_input_tokens_seen": 36248550, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.4296875, "step": 1680, "time_per_iteration": 4.733456611633301 }, { "auxiliary_loss_clip": 0.01171529, "auxiliary_loss_mlp": 0.0130296, "balance_loss_clip": 1.03307652, "balance_loss_mlp": 1.05513811, "epoch": 0.10106718773485646, "flos": 20007163463040.0, "grad_norm": 2.7553987174377643, "language_loss": 0.7640959, "learning_rate": 3.900236290285506e-06, "loss": 0.78884077, "num_input_tokens_seen": 36266065, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1640625, "step": 1681, "time_per_iteration": 4.018561840057373 }, { "auxiliary_loss_clip": 0.01196445, "auxiliary_loss_mlp": 0.01055128, "balance_loss_clip": 1.03052354, "balance_loss_mlp": 1.05379844, "epoch": 0.10112731098752442, "flos": 13479825093120.0, "grad_norm": 2.2624866634990926, "language_loss": 0.93804252, "learning_rate": 3.900118417289504e-06, "loss": 0.9605583, "num_input_tokens_seen": 36280960, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.25, "step": 1682, "time_per_iteration": 2.632798433303833 }, { "auxiliary_loss_clip": 0.0118308, "auxiliary_loss_mlp": 0.01049041, "balance_loss_clip": 1.02788115, "balance_loss_mlp": 1.05417323, "epoch": 0.1011874342401924, "flos": 18515901490560.0, "grad_norm": 2.0165171511055386, "language_loss": 0.88012505, "learning_rate": 3.900000476483164e-06, "loss": 0.90244627, "num_input_tokens_seen": 36299010, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.1953125, "step": 1683, "time_per_iteration": 4.09153938293457 }, { "auxiliary_loss_clip": 0.01198232, "auxiliary_loss_mlp": 0.01055911, "balance_loss_clip": 1.03481126, "balance_loss_mlp": 1.05514503, "epoch": 0.10124755749286037, "flos": 20702861055360.0, "grad_norm": 3.038614095768891, "language_loss": 0.74436307, "learning_rate": 3.8998824678706946e-06, "loss": 0.76690459, "num_input_tokens_seen": 36318400, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.15625, "step": 1684, "time_per_iteration": 2.6591033935546875 }, { "auxiliary_loss_clip": 0.01084641, "auxiliary_loss_mlp": 0.01002707, "balance_loss_clip": 1.000108, "balance_loss_mlp": 1.0240109, "epoch": 0.10130768074552833, "flos": 56109456247680.0, "grad_norm": 0.7478506818227288, "language_loss": 0.61082828, "learning_rate": 3.899764391456306e-06, "loss": 0.63170183, "num_input_tokens_seen": 36381815, "router_z_loss_clip": 0.02600098, "router_z_loss_mlp": 0.42578125, "step": 1685, "time_per_iteration": 3.2877683639526367 }, { "auxiliary_loss_clip": 0.0119028, "auxiliary_loss_mlp": 0.01056011, "balance_loss_clip": 1.03373051, "balance_loss_mlp": 1.05407786, "epoch": 0.1013678039981963, "flos": 33402346156800.0, "grad_norm": 2.628354329584567, "language_loss": 0.61252773, "learning_rate": 3.8996462472442145e-06, "loss": 0.63499063, "num_input_tokens_seen": 36404320, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.1796875, "step": 1686, "time_per_iteration": 2.6778342723846436 }, { "auxiliary_loss_clip": 0.01202209, "auxiliary_loss_mlp": 0.01053984, "balance_loss_clip": 1.03161979, "balance_loss_mlp": 1.05906987, "epoch": 0.10142792725086427, "flos": 31905338008320.0, "grad_norm": 2.59713171351627, "language_loss": 0.81382108, "learning_rate": 3.8995280352386344e-06, "loss": 0.83638299, "num_input_tokens_seen": 36427510, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.15625, "step": 1687, "time_per_iteration": 2.7131009101867676 }, { "auxiliary_loss_clip": 0.01184959, "auxiliary_loss_mlp": 0.0105384, "balance_loss_clip": 1.03181064, "balance_loss_mlp": 1.05562901, "epoch": 0.10148805050353224, "flos": 28475905046400.0, "grad_norm": 2.356531158896621, "language_loss": 0.72002262, "learning_rate": 3.899409755443785e-06, "loss": 0.7424106, "num_input_tokens_seen": 36448230, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.203125, "step": 1688, "time_per_iteration": 2.597931146621704 }, { "auxiliary_loss_clip": 0.0118966, "auxiliary_loss_mlp": 0.01053971, "balance_loss_clip": 1.03233504, "balance_loss_mlp": 1.05489039, "epoch": 0.1015481737562002, "flos": 25148888737920.0, "grad_norm": 2.5139216835123777, "language_loss": 0.86791313, "learning_rate": 3.899291407863887e-06, "loss": 0.89034951, "num_input_tokens_seen": 36464395, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1640625, "step": 1689, "time_per_iteration": 2.6563007831573486 }, { "auxiliary_loss_clip": 0.01178853, "auxiliary_loss_mlp": 0.01047399, "balance_loss_clip": 1.02782428, "balance_loss_mlp": 1.05247998, "epoch": 0.10160829700886818, "flos": 30882781630080.0, "grad_norm": 2.0023637525818665, "language_loss": 0.87593555, "learning_rate": 3.899172992503165e-06, "loss": 0.89819807, "num_input_tokens_seen": 36486475, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.171875, "step": 1690, "time_per_iteration": 2.648550271987915 }, { "auxiliary_loss_clip": 0.01191, "auxiliary_loss_mlp": 0.01051437, "balance_loss_clip": 1.02906132, "balance_loss_mlp": 1.05344224, "epoch": 0.10166842026153615, "flos": 20412020632320.0, "grad_norm": 1.9955823510019957, "language_loss": 0.83028406, "learning_rate": 3.899054509365843e-06, "loss": 0.8527084, "num_input_tokens_seen": 36505310, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.1953125, "step": 1691, "time_per_iteration": 2.6305713653564453 }, { "auxiliary_loss_clip": 0.01194848, "auxiliary_loss_mlp": 0.0105049, "balance_loss_clip": 1.02848434, "balance_loss_mlp": 1.05590773, "epoch": 0.10172854351420411, "flos": 33476968661760.0, "grad_norm": 1.6144824236278115, "language_loss": 0.66215229, "learning_rate": 3.89893595845615e-06, "loss": 0.6846056, "num_input_tokens_seen": 36529820, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.2109375, "step": 1692, "time_per_iteration": 2.7382917404174805 }, { "auxiliary_loss_clip": 0.01187833, "auxiliary_loss_mlp": 0.01063628, "balance_loss_clip": 1.04220629, "balance_loss_mlp": 1.05186069, "epoch": 0.1017886667668721, "flos": 23550325862400.0, "grad_norm": 1.6744459672543257, "language_loss": 0.75524819, "learning_rate": 3.898817339778319e-06, "loss": 0.77776277, "num_input_tokens_seen": 36549000, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1796875, "step": 1693, "time_per_iteration": 2.600040912628174 }, { "auxiliary_loss_clip": 0.01201448, "auxiliary_loss_mlp": 0.01053903, "balance_loss_clip": 1.03165817, "balance_loss_mlp": 1.05461681, "epoch": 0.10184879001954006, "flos": 23296078419840.0, "grad_norm": 1.6377679326884722, "language_loss": 0.86997789, "learning_rate": 3.898698653336581e-06, "loss": 0.89253139, "num_input_tokens_seen": 36567515, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.1875, "step": 1694, "time_per_iteration": 2.719393730163574 }, { "auxiliary_loss_clip": 0.01215477, "auxiliary_loss_mlp": 0.01053218, "balance_loss_clip": 1.03096199, "balance_loss_mlp": 1.05506325, "epoch": 0.10190891327220802, "flos": 18333116156160.0, "grad_norm": 1.8351031194974032, "language_loss": 0.7909205, "learning_rate": 3.8985798991351715e-06, "loss": 0.81360739, "num_input_tokens_seen": 36586190, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.2421875, "step": 1695, "time_per_iteration": 2.595207929611206 }, { "auxiliary_loss_clip": 0.01209125, "auxiliary_loss_mlp": 0.01058634, "balance_loss_clip": 1.03590059, "balance_loss_mlp": 1.05345309, "epoch": 0.10196903652487599, "flos": 26465374108800.0, "grad_norm": 1.956706495547612, "language_loss": 0.86365545, "learning_rate": 3.898461077178329e-06, "loss": 0.88633299, "num_input_tokens_seen": 36607495, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.1953125, "step": 1696, "time_per_iteration": 2.6320693492889404 }, { "auxiliary_loss_clip": 0.01180591, "auxiliary_loss_mlp": 0.01054229, "balance_loss_clip": 1.03278315, "balance_loss_mlp": 1.05454671, "epoch": 0.10202915977754397, "flos": 21869526798720.0, "grad_norm": 1.856505436755876, "language_loss": 0.81661588, "learning_rate": 3.898342187470296e-06, "loss": 0.8389641, "num_input_tokens_seen": 36628555, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.171875, "step": 1697, "time_per_iteration": 2.5613632202148438 }, { "auxiliary_loss_clip": 0.0117255, "auxiliary_loss_mlp": 0.01050606, "balance_loss_clip": 1.02750278, "balance_loss_mlp": 1.05344987, "epoch": 0.10208928303021193, "flos": 22309755886080.0, "grad_norm": 2.134268792191414, "language_loss": 0.80558974, "learning_rate": 3.898223230015311e-06, "loss": 0.82782131, "num_input_tokens_seen": 36646250, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.1953125, "step": 1698, "time_per_iteration": 2.6020877361297607 }, { "auxiliary_loss_clip": 0.01201825, "auxiliary_loss_mlp": 0.01047332, "balance_loss_clip": 1.02664876, "balance_loss_mlp": 1.05628037, "epoch": 0.1021494062828799, "flos": 22125569921280.0, "grad_norm": 2.8463509655328427, "language_loss": 0.75757802, "learning_rate": 3.8981042048176235e-06, "loss": 0.78006959, "num_input_tokens_seen": 36666675, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1796875, "step": 1699, "time_per_iteration": 2.566159963607788 }, { "auxiliary_loss_clip": 0.01191119, "auxiliary_loss_mlp": 0.01050203, "balance_loss_clip": 1.02888858, "balance_loss_mlp": 1.05629265, "epoch": 0.10220952953554788, "flos": 19646728439040.0, "grad_norm": 1.6477717591108914, "language_loss": 0.79829705, "learning_rate": 3.897985111881478e-06, "loss": 0.82071036, "num_input_tokens_seen": 36685225, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1640625, "step": 1700, "time_per_iteration": 2.616539716720581 }, { "auxiliary_loss_clip": 0.01182772, "auxiliary_loss_mlp": 0.01048215, "balance_loss_clip": 1.02696013, "balance_loss_mlp": 1.05416501, "epoch": 0.10226965278821584, "flos": 29787290686080.0, "grad_norm": 1.9084838590447037, "language_loss": 0.76777327, "learning_rate": 3.897865951211127e-06, "loss": 0.79008317, "num_input_tokens_seen": 36705985, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1953125, "step": 1701, "time_per_iteration": 2.6262047290802 }, { "auxiliary_loss_clip": 0.01185036, "auxiliary_loss_mlp": 0.01052716, "balance_loss_clip": 1.03060257, "balance_loss_mlp": 1.05405617, "epoch": 0.10232977604088381, "flos": 27016818681600.0, "grad_norm": 1.9387465653131704, "language_loss": 0.77756512, "learning_rate": 3.897746722810822e-06, "loss": 0.79994261, "num_input_tokens_seen": 36725815, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.21875, "step": 1702, "time_per_iteration": 2.618931293487549 }, { "auxiliary_loss_clip": 0.01189883, "auxiliary_loss_mlp": 0.01046969, "balance_loss_clip": 1.02621472, "balance_loss_mlp": 1.05427814, "epoch": 0.10238989929355179, "flos": 20777519473920.0, "grad_norm": 2.432724001725919, "language_loss": 0.94531941, "learning_rate": 3.897627426684818e-06, "loss": 0.9676879, "num_input_tokens_seen": 36742345, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.171875, "step": 1703, "time_per_iteration": 2.6236631870269775 }, { "auxiliary_loss_clip": 0.01103947, "auxiliary_loss_mlp": 0.01007302, "balance_loss_clip": 1.00465584, "balance_loss_mlp": 1.02484608, "epoch": 0.10245002254621975, "flos": 57698322451200.0, "grad_norm": 0.8614910356290124, "language_loss": 0.55010152, "learning_rate": 3.897508062837372e-06, "loss": 0.57121408, "num_input_tokens_seen": 36798775, "router_z_loss_clip": 0.02648926, "router_z_loss_mlp": 0.421875, "step": 1704, "time_per_iteration": 3.1739697456359863 }, { "auxiliary_loss_clip": 0.011717, "auxiliary_loss_mlp": 0.01043345, "balance_loss_clip": 1.02192354, "balance_loss_mlp": 1.05349088, "epoch": 0.10251014579888772, "flos": 16800125558400.0, "grad_norm": 2.046468884595722, "language_loss": 0.83330858, "learning_rate": 3.897388631272745e-06, "loss": 0.85545897, "num_input_tokens_seen": 36816295, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1796875, "step": 1705, "time_per_iteration": 2.569603681564331 }, { "auxiliary_loss_clip": 0.01082133, "auxiliary_loss_mlp": 0.01002531, "balance_loss_clip": 0.99976557, "balance_loss_mlp": 1.02200413, "epoch": 0.1025702690515557, "flos": 68565500922240.0, "grad_norm": 0.7643134135696812, "language_loss": 0.60335559, "learning_rate": 3.8972691319951975e-06, "loss": 0.62420225, "num_input_tokens_seen": 36882030, "router_z_loss_clip": 0.02770996, "router_z_loss_mlp": 0.41796875, "step": 1706, "time_per_iteration": 3.2628836631774902 }, { "auxiliary_loss_clip": 0.01202377, "auxiliary_loss_mlp": 0.01045991, "balance_loss_clip": 1.02399635, "balance_loss_mlp": 1.05380154, "epoch": 0.10263039230422366, "flos": 14866623336960.0, "grad_norm": 2.5449195356544654, "language_loss": 0.86322439, "learning_rate": 3.897149565008996e-06, "loss": 0.88570809, "num_input_tokens_seen": 36899245, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.2109375, "step": 1707, "time_per_iteration": 2.659602165222168 }, { "auxiliary_loss_clip": 0.01179358, "auxiliary_loss_mlp": 0.01042616, "balance_loss_clip": 1.02163553, "balance_loss_mlp": 1.05302668, "epoch": 0.10269051555689163, "flos": 25337599816320.0, "grad_norm": 1.673578650469009, "language_loss": 0.73145676, "learning_rate": 3.897029930318406e-06, "loss": 0.75367647, "num_input_tokens_seen": 36920950, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.171875, "step": 1708, "time_per_iteration": 2.584813356399536 }, { "auxiliary_loss_clip": 0.0118788, "auxiliary_loss_mlp": 0.01302422, "balance_loss_clip": 1.03156197, "balance_loss_mlp": 1.05324173, "epoch": 0.10275063880955959, "flos": 21068826773760.0, "grad_norm": 1.9309740731277467, "language_loss": 0.9093529, "learning_rate": 3.8969102279276974e-06, "loss": 0.93425596, "num_input_tokens_seen": 36938900, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.1640625, "step": 1709, "time_per_iteration": 2.634108543395996 }, { "auxiliary_loss_clip": 0.01202067, "auxiliary_loss_mlp": 0.01048324, "balance_loss_clip": 1.02693796, "balance_loss_mlp": 1.05785918, "epoch": 0.10281076206222757, "flos": 30366780802560.0, "grad_norm": 2.406123841516675, "language_loss": 0.87936175, "learning_rate": 3.896790457841142e-06, "loss": 0.90186566, "num_input_tokens_seen": 36957010, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.171875, "step": 1710, "time_per_iteration": 2.686903476715088 }, { "auxiliary_loss_clip": 0.01176021, "auxiliary_loss_mlp": 0.01048461, "balance_loss_clip": 1.02750397, "balance_loss_mlp": 1.05154634, "epoch": 0.10287088531489554, "flos": 22418313765120.0, "grad_norm": 3.7124554588485816, "language_loss": 0.7929734, "learning_rate": 3.896670620063015e-06, "loss": 0.81521821, "num_input_tokens_seen": 36977690, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.15625, "step": 1711, "time_per_iteration": 2.7417428493499756 }, { "auxiliary_loss_clip": 0.01201125, "auxiliary_loss_mlp": 0.01049735, "balance_loss_clip": 1.02777672, "balance_loss_mlp": 1.05434132, "epoch": 0.1029310085675635, "flos": 25115994858240.0, "grad_norm": 1.8872931060142843, "language_loss": 0.73558795, "learning_rate": 3.896550714597592e-06, "loss": 0.75809658, "num_input_tokens_seen": 36997300, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.1953125, "step": 1712, "time_per_iteration": 2.7590391635894775 }, { "auxiliary_loss_clip": 0.01194744, "auxiliary_loss_mlp": 0.01049377, "balance_loss_clip": 1.02892041, "balance_loss_mlp": 1.05363417, "epoch": 0.10299113182023148, "flos": 19865639877120.0, "grad_norm": 1.9080465000132125, "language_loss": 0.86641955, "learning_rate": 3.896430741449153e-06, "loss": 0.88886082, "num_input_tokens_seen": 37016110, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.140625, "step": 1713, "time_per_iteration": 2.6807610988616943 }, { "auxiliary_loss_clip": 0.01175315, "auxiliary_loss_mlp": 0.01052043, "balance_loss_clip": 1.03169346, "balance_loss_mlp": 1.05053413, "epoch": 0.10305125507289944, "flos": 20447608032000.0, "grad_norm": 2.2997357997927375, "language_loss": 0.72667909, "learning_rate": 3.8963107006219785e-06, "loss": 0.74895275, "num_input_tokens_seen": 37036405, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.15625, "step": 1714, "time_per_iteration": 2.646669626235962 }, { "auxiliary_loss_clip": 0.01183189, "auxiliary_loss_mlp": 0.01056606, "balance_loss_clip": 1.0336467, "balance_loss_mlp": 1.05219555, "epoch": 0.10311137832556741, "flos": 26250772302720.0, "grad_norm": 1.8793459120013545, "language_loss": 0.90780461, "learning_rate": 3.896190592120353e-06, "loss": 0.93020248, "num_input_tokens_seen": 37057580, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.21875, "step": 1715, "time_per_iteration": 2.6658294200897217 }, { "auxiliary_loss_clip": 0.01177077, "auxiliary_loss_mlp": 0.01053267, "balance_loss_clip": 1.03147531, "balance_loss_mlp": 1.05107176, "epoch": 0.10317150157823539, "flos": 35298932175360.0, "grad_norm": 2.0966921878702363, "language_loss": 0.75378811, "learning_rate": 3.896070415948563e-06, "loss": 0.77609158, "num_input_tokens_seen": 37079120, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.171875, "step": 1716, "time_per_iteration": 2.706498861312866 }, { "auxiliary_loss_clip": 0.01190356, "auxiliary_loss_mlp": 0.01062908, "balance_loss_clip": 1.04034162, "balance_loss_mlp": 1.0521493, "epoch": 0.10323162483090335, "flos": 25739943033600.0, "grad_norm": 1.8423926146174006, "language_loss": 0.85126984, "learning_rate": 3.895950172110897e-06, "loss": 0.87380242, "num_input_tokens_seen": 37099710, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 1.1953125, "step": 1717, "time_per_iteration": 2.668240547180176 }, { "auxiliary_loss_clip": 0.01198379, "auxiliary_loss_mlp": 0.0106054, "balance_loss_clip": 1.03935671, "balance_loss_mlp": 1.0547626, "epoch": 0.10329174808357132, "flos": 16289870906880.0, "grad_norm": 2.124636436934411, "language_loss": 0.8243438, "learning_rate": 3.895829860611646e-06, "loss": 0.84693301, "num_input_tokens_seen": 37117775, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.1640625, "step": 1718, "time_per_iteration": 2.5826470851898193 }, { "auxiliary_loss_clip": 0.01177775, "auxiliary_loss_mlp": 0.01046482, "balance_loss_clip": 1.02513182, "balance_loss_mlp": 1.053092, "epoch": 0.10335187133623928, "flos": 36687166963200.0, "grad_norm": 1.6643500899652997, "language_loss": 0.73017579, "learning_rate": 3.895709481455105e-06, "loss": 0.75241834, "num_input_tokens_seen": 37140280, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.15625, "step": 1719, "time_per_iteration": 2.675635576248169 }, { "auxiliary_loss_clip": 0.01178606, "auxiliary_loss_mlp": 0.01049947, "balance_loss_clip": 1.02963352, "balance_loss_mlp": 1.05271006, "epoch": 0.10341199458890726, "flos": 14975648092800.0, "grad_norm": 2.1025955720924685, "language_loss": 0.92859632, "learning_rate": 3.895589034645568e-06, "loss": 0.95088184, "num_input_tokens_seen": 37158350, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.171875, "step": 1720, "time_per_iteration": 2.555617570877075 }, { "auxiliary_loss_clip": 0.01193757, "auxiliary_loss_mlp": 0.01053124, "balance_loss_clip": 1.03062904, "balance_loss_mlp": 1.04784679, "epoch": 0.10347211784157523, "flos": 21031587348480.0, "grad_norm": 1.7366682953112982, "language_loss": 0.79414266, "learning_rate": 3.8954685201873344e-06, "loss": 0.81661141, "num_input_tokens_seen": 37177120, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.1875, "step": 1721, "time_per_iteration": 4.093005418777466 }, { "auxiliary_loss_clip": 0.0121093, "auxiliary_loss_mlp": 0.01057214, "balance_loss_clip": 1.03275228, "balance_loss_mlp": 1.05231881, "epoch": 0.1035322410942432, "flos": 19792094780160.0, "grad_norm": 1.8608915232436072, "language_loss": 0.80663216, "learning_rate": 3.895347938084706e-06, "loss": 0.82931364, "num_input_tokens_seen": 37195895, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.21875, "step": 1722, "time_per_iteration": 4.03538966178894 }, { "auxiliary_loss_clip": 0.01076469, "auxiliary_loss_mlp": 0.01010968, "balance_loss_clip": 1.00814247, "balance_loss_mlp": 1.01657963, "epoch": 0.10359236434691117, "flos": 52698874947840.0, "grad_norm": 0.9135964464747034, "language_loss": 0.67128819, "learning_rate": 3.895227288341984e-06, "loss": 0.69216251, "num_input_tokens_seen": 37247270, "router_z_loss_clip": 0.02819824, "router_z_loss_mlp": 0.41796875, "step": 1723, "time_per_iteration": 2.977978467941284 }, { "auxiliary_loss_clip": 0.01179174, "auxiliary_loss_mlp": 0.01055108, "balance_loss_clip": 1.03454411, "balance_loss_mlp": 1.04976583, "epoch": 0.10365248759957914, "flos": 18405404277120.0, "grad_norm": 3.272851016047338, "language_loss": 0.77370942, "learning_rate": 3.8951065709634755e-06, "loss": 0.79605228, "num_input_tokens_seen": 37265595, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.203125, "step": 1724, "time_per_iteration": 4.022023439407349 }, { "auxiliary_loss_clip": 0.0120196, "auxiliary_loss_mlp": 0.01058229, "balance_loss_clip": 1.03786755, "balance_loss_mlp": 1.05364096, "epoch": 0.1037126108522471, "flos": 47553555335040.0, "grad_norm": 1.7238379677976587, "language_loss": 0.74659383, "learning_rate": 3.8949857859534884e-06, "loss": 0.76919574, "num_input_tokens_seen": 37286660, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.2109375, "step": 1725, "time_per_iteration": 2.918224573135376 }, { "auxiliary_loss_clip": 0.01196488, "auxiliary_loss_mlp": 0.01054986, "balance_loss_clip": 1.03405309, "balance_loss_mlp": 1.05208468, "epoch": 0.10377273410491508, "flos": 22816670572800.0, "grad_norm": 2.0302259797747104, "language_loss": 0.7480908, "learning_rate": 3.894864933316333e-06, "loss": 0.77060556, "num_input_tokens_seen": 37304915, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.171875, "step": 1726, "time_per_iteration": 2.7517569065093994 }, { "auxiliary_loss_clip": 0.01190464, "auxiliary_loss_mlp": 0.0105676, "balance_loss_clip": 1.03436065, "balance_loss_mlp": 1.05234325, "epoch": 0.10383285735758305, "flos": 26138694890880.0, "grad_norm": 1.8646114245146483, "language_loss": 0.73410785, "learning_rate": 3.894744013056322e-06, "loss": 0.75658005, "num_input_tokens_seen": 37325265, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.1953125, "step": 1727, "time_per_iteration": 2.667766571044922 }, { "auxiliary_loss_clip": 0.01179023, "auxiliary_loss_mlp": 0.01052163, "balance_loss_clip": 1.03111088, "balance_loss_mlp": 1.05082011, "epoch": 0.10389298061025101, "flos": 17091791994240.0, "grad_norm": 2.3889775560107247, "language_loss": 0.84057045, "learning_rate": 3.894623025177772e-06, "loss": 0.86288232, "num_input_tokens_seen": 37341650, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1875, "step": 1728, "time_per_iteration": 2.6564910411834717 }, { "auxiliary_loss_clip": 0.01187941, "auxiliary_loss_mlp": 0.01046144, "balance_loss_clip": 1.02455485, "balance_loss_mlp": 1.05310667, "epoch": 0.10395310386291898, "flos": 20776513893120.0, "grad_norm": 2.3071749772041876, "language_loss": 0.70562601, "learning_rate": 3.894501969684999e-06, "loss": 0.7279669, "num_input_tokens_seen": 37360270, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.171875, "step": 1729, "time_per_iteration": 2.650330066680908 }, { "auxiliary_loss_clip": 0.01178905, "auxiliary_loss_mlp": 0.01051548, "balance_loss_clip": 1.02856457, "balance_loss_mlp": 1.05125952, "epoch": 0.10401322711558696, "flos": 12820540913280.0, "grad_norm": 2.423512889553257, "language_loss": 0.81519413, "learning_rate": 3.894380846582324e-06, "loss": 0.83749866, "num_input_tokens_seen": 37375225, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.1875, "step": 1730, "time_per_iteration": 2.58780837059021 }, { "auxiliary_loss_clip": 0.01231201, "auxiliary_loss_mlp": 0.01041277, "balance_loss_clip": 1.02175093, "balance_loss_mlp": 1.05019927, "epoch": 0.10407335036825492, "flos": 23184683366400.0, "grad_norm": 1.7078725274036344, "language_loss": 0.75573373, "learning_rate": 3.89425965587407e-06, "loss": 0.77845854, "num_input_tokens_seen": 37395165, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.171875, "step": 1731, "time_per_iteration": 2.624537706375122 }, { "auxiliary_loss_clip": 0.01200231, "auxiliary_loss_mlp": 0.01044372, "balance_loss_clip": 1.02441597, "balance_loss_mlp": 1.05087388, "epoch": 0.10413347362092289, "flos": 26104184899200.0, "grad_norm": 1.8471791548018661, "language_loss": 0.82709801, "learning_rate": 3.894138397564562e-06, "loss": 0.84954405, "num_input_tokens_seen": 37414845, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.125, "step": 1732, "time_per_iteration": 2.70255184173584 }, { "auxiliary_loss_clip": 0.01196004, "auxiliary_loss_mlp": 0.01044805, "balance_loss_clip": 1.02502871, "balance_loss_mlp": 1.05205178, "epoch": 0.10419359687359087, "flos": 12641059630080.0, "grad_norm": 2.0107361851220986, "language_loss": 0.83113527, "learning_rate": 3.894017071658125e-06, "loss": 0.8535434, "num_input_tokens_seen": 37432490, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.1640625, "step": 1733, "time_per_iteration": 2.5370235443115234 }, { "auxiliary_loss_clip": 0.01188877, "auxiliary_loss_mlp": 0.01050879, "balance_loss_clip": 1.03044701, "balance_loss_mlp": 1.05135357, "epoch": 0.10425372012625883, "flos": 12125094716160.0, "grad_norm": 2.84869787108503, "language_loss": 0.76576912, "learning_rate": 3.893895678159092e-06, "loss": 0.78816664, "num_input_tokens_seen": 37449435, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.1875, "step": 1734, "time_per_iteration": 2.6352925300598145 }, { "auxiliary_loss_clip": 0.01191953, "auxiliary_loss_mlp": 0.01044088, "balance_loss_clip": 1.02373934, "balance_loss_mlp": 1.04867172, "epoch": 0.1043138433789268, "flos": 25337563902720.0, "grad_norm": 1.6675901636521544, "language_loss": 0.74493486, "learning_rate": 3.8937742170717935e-06, "loss": 0.76729524, "num_input_tokens_seen": 37469105, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.15625, "step": 1735, "time_per_iteration": 2.7523932456970215 }, { "auxiliary_loss_clip": 0.01178187, "auxiliary_loss_mlp": 0.01044948, "balance_loss_clip": 1.02413392, "balance_loss_mlp": 1.0506053, "epoch": 0.10437396663159478, "flos": 29167149352320.0, "grad_norm": 2.0540063640861375, "language_loss": 0.78607094, "learning_rate": 3.893652688400565e-06, "loss": 0.80830228, "num_input_tokens_seen": 37490540, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.1875, "step": 1736, "time_per_iteration": 2.7867679595947266 }, { "auxiliary_loss_clip": 0.01177046, "auxiliary_loss_mlp": 0.01055886, "balance_loss_clip": 1.03494048, "balance_loss_mlp": 1.05277121, "epoch": 0.10443408988426274, "flos": 25080946162560.0, "grad_norm": 1.9029989199633344, "language_loss": 0.70379376, "learning_rate": 3.893531092149743e-06, "loss": 0.72612298, "num_input_tokens_seen": 37511905, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.15625, "step": 1737, "time_per_iteration": 2.599383592605591 }, { "auxiliary_loss_clip": 0.01198463, "auxiliary_loss_mlp": 0.01051296, "balance_loss_clip": 1.02751327, "balance_loss_mlp": 1.04929614, "epoch": 0.1044942131369307, "flos": 26759662237440.0, "grad_norm": 2.052982349816799, "language_loss": 0.81487703, "learning_rate": 3.893409428323666e-06, "loss": 0.83737469, "num_input_tokens_seen": 37533635, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.21875, "step": 1738, "time_per_iteration": 2.7330996990203857 }, { "auxiliary_loss_clip": 0.0116964, "auxiliary_loss_mlp": 0.01061136, "balance_loss_clip": 1.03772354, "balance_loss_mlp": 1.04914451, "epoch": 0.10455433638959867, "flos": 18442571875200.0, "grad_norm": 2.3182926440706026, "language_loss": 0.74441373, "learning_rate": 3.8932876969266785e-06, "loss": 0.76672149, "num_input_tokens_seen": 37552035, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.203125, "step": 1739, "time_per_iteration": 2.5690150260925293 }, { "auxiliary_loss_clip": 0.01195864, "auxiliary_loss_mlp": 0.01054092, "balance_loss_clip": 1.03309941, "balance_loss_mlp": 1.05135298, "epoch": 0.10461445964226665, "flos": 23218977876480.0, "grad_norm": 1.612783347856334, "language_loss": 0.77039516, "learning_rate": 3.893165897963123e-06, "loss": 0.79289472, "num_input_tokens_seen": 37571540, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.171875, "step": 1740, "time_per_iteration": 2.6228456497192383 }, { "auxiliary_loss_clip": 0.01190182, "auxiliary_loss_mlp": 0.01051894, "balance_loss_clip": 1.03118742, "balance_loss_mlp": 1.05414116, "epoch": 0.10467458289493461, "flos": 24345243797760.0, "grad_norm": 2.193272825348321, "language_loss": 0.8581903, "learning_rate": 3.893044031437346e-06, "loss": 0.88061106, "num_input_tokens_seen": 37588265, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1796875, "step": 1741, "time_per_iteration": 2.707397222518921 }, { "auxiliary_loss_clip": 0.01190365, "auxiliary_loss_mlp": 0.01058045, "balance_loss_clip": 1.03500175, "balance_loss_mlp": 1.05204046, "epoch": 0.10473470614760258, "flos": 21287953693440.0, "grad_norm": 2.2097430365599404, "language_loss": 0.75716972, "learning_rate": 3.892922097353697e-06, "loss": 0.77965379, "num_input_tokens_seen": 37606860, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.203125, "step": 1742, "time_per_iteration": 2.614375591278076 }, { "auxiliary_loss_clip": 0.01207738, "auxiliary_loss_mlp": 0.01060392, "balance_loss_clip": 1.04022193, "balance_loss_mlp": 1.05577612, "epoch": 0.10479482940027056, "flos": 21687208341120.0, "grad_norm": 2.476847696845739, "language_loss": 0.87543786, "learning_rate": 3.8928000957165275e-06, "loss": 0.89811915, "num_input_tokens_seen": 37625210, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.15625, "step": 1743, "time_per_iteration": 2.7061989307403564 }, { "auxiliary_loss_clip": 0.01194646, "auxiliary_loss_mlp": 0.01050777, "balance_loss_clip": 1.02787662, "balance_loss_mlp": 1.05170739, "epoch": 0.10485495265293852, "flos": 21573694385280.0, "grad_norm": 6.482806255196035, "language_loss": 0.75647706, "learning_rate": 3.8926780265301915e-06, "loss": 0.77893126, "num_input_tokens_seen": 37644110, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.15625, "step": 1744, "time_per_iteration": 2.6220414638519287 }, { "auxiliary_loss_clip": 0.01177305, "auxiliary_loss_mlp": 0.01055535, "balance_loss_clip": 1.03599715, "balance_loss_mlp": 1.05357516, "epoch": 0.10491507590560649, "flos": 37961923708800.0, "grad_norm": 2.1223223444359456, "language_loss": 0.78638816, "learning_rate": 3.8925558897990445e-06, "loss": 0.80871654, "num_input_tokens_seen": 37665800, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.1484375, "step": 1745, "time_per_iteration": 2.8542654514312744 }, { "auxiliary_loss_clip": 0.01168499, "auxiliary_loss_mlp": 0.01061072, "balance_loss_clip": 1.03944755, "balance_loss_mlp": 1.05117583, "epoch": 0.10497519915827447, "flos": 26396282298240.0, "grad_norm": 3.0711476786238556, "language_loss": 0.8247956, "learning_rate": 3.892433685527447e-06, "loss": 0.84709132, "num_input_tokens_seen": 37685095, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.171875, "step": 1746, "time_per_iteration": 2.548952341079712 }, { "auxiliary_loss_clip": 0.01186802, "auxiliary_loss_mlp": 0.01056933, "balance_loss_clip": 1.03594077, "balance_loss_mlp": 1.05401039, "epoch": 0.10503532241094243, "flos": 40662190581120.0, "grad_norm": 1.6096622280183408, "language_loss": 0.69971645, "learning_rate": 3.892311413719759e-06, "loss": 0.72215378, "num_input_tokens_seen": 37707445, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.1484375, "step": 1747, "time_per_iteration": 2.7590737342834473 }, { "auxiliary_loss_clip": 0.01204256, "auxiliary_loss_mlp": 0.01059837, "balance_loss_clip": 1.03837919, "balance_loss_mlp": 1.05523896, "epoch": 0.1050954456636104, "flos": 29789409588480.0, "grad_norm": 2.2429692778578643, "language_loss": 0.84722865, "learning_rate": 3.892189074380345e-06, "loss": 0.86986953, "num_input_tokens_seen": 37728325, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.21875, "step": 1748, "time_per_iteration": 2.6624927520751953 }, { "auxiliary_loss_clip": 0.011636, "auxiliary_loss_mlp": 0.01050228, "balance_loss_clip": 1.02797103, "balance_loss_mlp": 1.04891181, "epoch": 0.10515556891627838, "flos": 23948754497280.0, "grad_norm": 1.7477159929011246, "language_loss": 0.71337211, "learning_rate": 3.892066667513569e-06, "loss": 0.73551041, "num_input_tokens_seen": 37748910, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.140625, "step": 1749, "time_per_iteration": 2.6093971729278564 }, { "auxiliary_loss_clip": 0.01184271, "auxiliary_loss_mlp": 0.01062546, "balance_loss_clip": 1.04113626, "balance_loss_mlp": 1.05262089, "epoch": 0.10521569216894634, "flos": 18259606972800.0, "grad_norm": 2.04859345459769, "language_loss": 0.81940031, "learning_rate": 3.891944193123801e-06, "loss": 0.84186852, "num_input_tokens_seen": 37765745, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1328125, "step": 1750, "time_per_iteration": 2.5648748874664307 }, { "auxiliary_loss_clip": 0.01202632, "auxiliary_loss_mlp": 0.01056755, "balance_loss_clip": 1.03558362, "balance_loss_mlp": 1.05710983, "epoch": 0.10527581542161431, "flos": 15630909949440.0, "grad_norm": 4.132191400041771, "language_loss": 0.92256904, "learning_rate": 3.891821651215411e-06, "loss": 0.94516295, "num_input_tokens_seen": 37780520, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.1796875, "step": 1751, "time_per_iteration": 2.6509904861450195 }, { "auxiliary_loss_clip": 0.01173451, "auxiliary_loss_mlp": 0.01044449, "balance_loss_clip": 1.02327776, "balance_loss_mlp": 1.05123091, "epoch": 0.10533593867428227, "flos": 18296559089280.0, "grad_norm": 2.065730470747477, "language_loss": 0.79282153, "learning_rate": 3.8916990417927735e-06, "loss": 0.81500053, "num_input_tokens_seen": 37799515, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.1328125, "step": 1752, "time_per_iteration": 2.6477816104888916 }, { "auxiliary_loss_clip": 0.01203825, "auxiliary_loss_mlp": 0.0104938, "balance_loss_clip": 1.02810144, "balance_loss_mlp": 1.05380499, "epoch": 0.10539606192695025, "flos": 29023219555200.0, "grad_norm": 1.797652010187303, "language_loss": 0.75922507, "learning_rate": 3.891576364860262e-06, "loss": 0.78175712, "num_input_tokens_seen": 37818695, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1328125, "step": 1753, "time_per_iteration": 2.714743137359619 }, { "auxiliary_loss_clip": 0.01200991, "auxiliary_loss_mlp": 0.0105537, "balance_loss_clip": 1.03277993, "balance_loss_mlp": 1.05366373, "epoch": 0.10545618517961822, "flos": 19969313506560.0, "grad_norm": 1.7939665150247366, "language_loss": 0.84050417, "learning_rate": 3.891453620422258e-06, "loss": 0.86306775, "num_input_tokens_seen": 37837860, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.1953125, "step": 1754, "time_per_iteration": 2.6905059814453125 }, { "auxiliary_loss_clip": 0.01210704, "auxiliary_loss_mlp": 0.0105098, "balance_loss_clip": 1.02810359, "balance_loss_mlp": 1.05696487, "epoch": 0.10551630843228618, "flos": 16143427157760.0, "grad_norm": 2.5772059751279746, "language_loss": 0.68989038, "learning_rate": 3.891330808483137e-06, "loss": 0.71250725, "num_input_tokens_seen": 37856260, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.171875, "step": 1755, "time_per_iteration": 2.609027862548828 }, { "auxiliary_loss_clip": 0.01181972, "auxiliary_loss_mlp": 0.01062774, "balance_loss_clip": 1.04135251, "balance_loss_mlp": 1.05455685, "epoch": 0.10557643168495416, "flos": 23440115957760.0, "grad_norm": 2.1285931675435124, "language_loss": 0.76662046, "learning_rate": 3.891207929047286e-06, "loss": 0.78906786, "num_input_tokens_seen": 37876960, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1796875, "step": 1756, "time_per_iteration": 2.662961721420288 }, { "auxiliary_loss_clip": 0.01170346, "auxiliary_loss_mlp": 0.01058645, "balance_loss_clip": 1.03802156, "balance_loss_mlp": 1.05281115, "epoch": 0.10563655493762213, "flos": 21799034357760.0, "grad_norm": 1.7021902950793857, "language_loss": 0.79877663, "learning_rate": 3.8910849821190884e-06, "loss": 0.82106656, "num_input_tokens_seen": 37897070, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.171875, "step": 1757, "time_per_iteration": 2.767442464828491 }, { "auxiliary_loss_clip": 0.0119917, "auxiliary_loss_mlp": 0.01050531, "balance_loss_clip": 1.02980089, "balance_loss_mlp": 1.05484271, "epoch": 0.10569667819029009, "flos": 53800863275520.0, "grad_norm": 3.0487643586645428, "language_loss": 0.78773677, "learning_rate": 3.890961967702933e-06, "loss": 0.81023383, "num_input_tokens_seen": 37923635, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.171875, "step": 1758, "time_per_iteration": 2.9700610637664795 }, { "auxiliary_loss_clip": 0.01194645, "auxiliary_loss_mlp": 0.01052005, "balance_loss_clip": 1.03017759, "balance_loss_mlp": 1.05646086, "epoch": 0.10575680144295807, "flos": 22925515760640.0, "grad_norm": 1.952916396544287, "language_loss": 0.91508365, "learning_rate": 3.890838885803208e-06, "loss": 0.93755019, "num_input_tokens_seen": 37942650, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.203125, "step": 1759, "time_per_iteration": 2.629693031311035 }, { "auxiliary_loss_clip": 0.0119103, "auxiliary_loss_mlp": 0.01052487, "balance_loss_clip": 1.0294435, "balance_loss_mlp": 1.05291641, "epoch": 0.10581692469562604, "flos": 14136667148160.0, "grad_norm": 2.5036003212321174, "language_loss": 0.77266628, "learning_rate": 3.890715736424307e-06, "loss": 0.7951014, "num_input_tokens_seen": 37960660, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.203125, "step": 1760, "time_per_iteration": 2.648693323135376 }, { "auxiliary_loss_clip": 0.01184345, "auxiliary_loss_mlp": 0.01064449, "balance_loss_clip": 1.04090559, "balance_loss_mlp": 1.05492795, "epoch": 0.105877047948294, "flos": 25958674903680.0, "grad_norm": 1.9912638318741231, "language_loss": 0.89121914, "learning_rate": 3.890592519570626e-06, "loss": 0.91370714, "num_input_tokens_seen": 37978625, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.203125, "step": 1761, "time_per_iteration": 2.593125104904175 }, { "auxiliary_loss_clip": 0.01216133, "auxiliary_loss_mlp": 0.01309312, "balance_loss_clip": 1.03944945, "balance_loss_mlp": 1.05366993, "epoch": 0.10593717120096197, "flos": 30664768032000.0, "grad_norm": 3.619791920702402, "language_loss": 0.78257865, "learning_rate": 3.89046923524656e-06, "loss": 0.80783314, "num_input_tokens_seen": 38000005, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.171875, "step": 1762, "time_per_iteration": 5.782445430755615 }, { "auxiliary_loss_clip": 0.01084015, "auxiliary_loss_mlp": 0.01002177, "balance_loss_clip": 0.99933988, "balance_loss_mlp": 1.02283144, "epoch": 0.10599729445362994, "flos": 66436682497920.0, "grad_norm": 0.8142957071676115, "language_loss": 0.60522473, "learning_rate": 3.8903458834565105e-06, "loss": 0.62608665, "num_input_tokens_seen": 38066165, "router_z_loss_clip": 0.02832031, "router_z_loss_mlp": 0.4296875, "step": 1763, "time_per_iteration": 3.400928020477295 }, { "auxiliary_loss_clip": 0.0118852, "auxiliary_loss_mlp": 0.01048377, "balance_loss_clip": 1.02703822, "balance_loss_mlp": 1.05215573, "epoch": 0.10605741770629791, "flos": 23948179879680.0, "grad_norm": 2.306585259784486, "language_loss": 0.79694104, "learning_rate": 3.890222464204879e-06, "loss": 0.81931007, "num_input_tokens_seen": 38086150, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1796875, "step": 1764, "time_per_iteration": 4.004598617553711 }, { "auxiliary_loss_clip": 0.01199461, "auxiliary_loss_mlp": 0.01050202, "balance_loss_clip": 1.02749276, "balance_loss_mlp": 1.05576134, "epoch": 0.10611754095896588, "flos": 19387524919680.0, "grad_norm": 1.7412965076026214, "language_loss": 0.79938328, "learning_rate": 3.89009897749607e-06, "loss": 0.82187992, "num_input_tokens_seen": 38104205, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.1640625, "step": 1765, "time_per_iteration": 2.6198978424072266 }, { "auxiliary_loss_clip": 0.01174353, "auxiliary_loss_mlp": 0.01056739, "balance_loss_clip": 1.03501916, "balance_loss_mlp": 1.0500412, "epoch": 0.10617766421163385, "flos": 22237755073920.0, "grad_norm": 1.7449612928510665, "language_loss": 0.76428682, "learning_rate": 3.88997542333449e-06, "loss": 0.78659773, "num_input_tokens_seen": 38122005, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.1484375, "step": 1766, "time_per_iteration": 5.0131354331970215 }, { "auxiliary_loss_clip": 0.01173244, "auxiliary_loss_mlp": 0.01062847, "balance_loss_clip": 1.04050732, "balance_loss_mlp": 1.05600584, "epoch": 0.10623778746430182, "flos": 28404407024640.0, "grad_norm": 1.725897737329823, "language_loss": 0.77326727, "learning_rate": 3.889851801724549e-06, "loss": 0.79562825, "num_input_tokens_seen": 38143365, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.171875, "step": 1767, "time_per_iteration": 2.6708157062530518 }, { "auxiliary_loss_clip": 0.01126384, "auxiliary_loss_mlp": 0.01002105, "balance_loss_clip": 0.99935132, "balance_loss_mlp": 1.01963794, "epoch": 0.10629791071696978, "flos": 64234639221120.0, "grad_norm": 0.6739616207134669, "language_loss": 0.57905948, "learning_rate": 3.889728112670658e-06, "loss": 0.60034436, "num_input_tokens_seen": 38210035, "router_z_loss_clip": 0.02758789, "router_z_loss_mlp": 0.42578125, "step": 1768, "time_per_iteration": 3.624675750732422 }, { "auxiliary_loss_clip": 0.01198846, "auxiliary_loss_mlp": 0.01046223, "balance_loss_clip": 1.026577, "balance_loss_mlp": 1.05458117, "epoch": 0.10635803396963776, "flos": 22747578762240.0, "grad_norm": 1.4293031321775898, "language_loss": 0.86457479, "learning_rate": 3.8896043561772325e-06, "loss": 0.88702548, "num_input_tokens_seen": 38231230, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.171875, "step": 1769, "time_per_iteration": 2.960343360900879 }, { "auxiliary_loss_clip": 0.01209304, "auxiliary_loss_mlp": 0.01302959, "balance_loss_clip": 1.03243899, "balance_loss_mlp": 1.05400693, "epoch": 0.10641815722230573, "flos": 31395586147200.0, "grad_norm": 1.8723485061749827, "language_loss": 0.6165185, "learning_rate": 3.889480532248688e-06, "loss": 0.64164114, "num_input_tokens_seen": 38253890, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.1875, "step": 1770, "time_per_iteration": 2.716557502746582 }, { "auxiliary_loss_clip": 0.0107059, "auxiliary_loss_mlp": 0.01003094, "balance_loss_clip": 1.00055456, "balance_loss_mlp": 1.01863325, "epoch": 0.1064782804749737, "flos": 58552527784320.0, "grad_norm": 0.8838236833061185, "language_loss": 0.57027936, "learning_rate": 3.889356640889444e-06, "loss": 0.59101617, "num_input_tokens_seen": 38304290, "router_z_loss_clip": 0.02539062, "router_z_loss_mlp": 0.4296875, "step": 1771, "time_per_iteration": 3.0794575214385986 }, { "auxiliary_loss_clip": 0.01196295, "auxiliary_loss_mlp": 0.01060609, "balance_loss_clip": 1.0398308, "balance_loss_mlp": 1.05366158, "epoch": 0.10653840372764166, "flos": 23987825516160.0, "grad_norm": 1.741992330015076, "language_loss": 0.88054872, "learning_rate": 3.8892326821039205e-06, "loss": 0.90311778, "num_input_tokens_seen": 38324725, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.15625, "step": 1772, "time_per_iteration": 2.5955188274383545 }, { "auxiliary_loss_clip": 0.01172964, "auxiliary_loss_mlp": 0.01056612, "balance_loss_clip": 1.03367639, "balance_loss_mlp": 1.05159092, "epoch": 0.10659852698030964, "flos": 18294655668480.0, "grad_norm": 8.42871818128596, "language_loss": 0.76118177, "learning_rate": 3.889108655896542e-06, "loss": 0.78347754, "num_input_tokens_seen": 38340735, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.2109375, "step": 1773, "time_per_iteration": 2.5990991592407227 }, { "auxiliary_loss_clip": 0.01198307, "auxiliary_loss_mlp": 0.01057155, "balance_loss_clip": 1.03437412, "balance_loss_mlp": 1.05436039, "epoch": 0.1066586502329776, "flos": 32160591031680.0, "grad_norm": 1.7413444803904512, "language_loss": 0.82696187, "learning_rate": 3.888984562271736e-06, "loss": 0.84951651, "num_input_tokens_seen": 38361315, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.1640625, "step": 1774, "time_per_iteration": 2.6484506130218506 }, { "auxiliary_loss_clip": 0.01192829, "auxiliary_loss_mlp": 0.01302725, "balance_loss_clip": 1.03172565, "balance_loss_mlp": 1.05308878, "epoch": 0.10671877348564557, "flos": 17785155202560.0, "grad_norm": 2.4544581904209073, "language_loss": 0.75996429, "learning_rate": 3.888860401233929e-06, "loss": 0.7849198, "num_input_tokens_seen": 38377425, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.21875, "step": 1775, "time_per_iteration": 2.5900590419769287 }, { "auxiliary_loss_clip": 0.01056723, "auxiliary_loss_mlp": 0.01003793, "balance_loss_clip": 1.00123048, "balance_loss_mlp": 1.01487315, "epoch": 0.10677889673831355, "flos": 63510177813120.0, "grad_norm": 0.8198036850157904, "language_loss": 0.57474834, "learning_rate": 3.8887361727875535e-06, "loss": 0.59535354, "num_input_tokens_seen": 38440275, "router_z_loss_clip": 0.02563477, "router_z_loss_mlp": 0.41796875, "step": 1776, "time_per_iteration": 3.1899452209472656 }, { "auxiliary_loss_clip": 0.0118853, "auxiliary_loss_mlp": 0.01060701, "balance_loss_clip": 1.03702617, "balance_loss_mlp": 1.05244398, "epoch": 0.10683901999098151, "flos": 22017694400640.0, "grad_norm": 1.5920007367816942, "language_loss": 0.83218241, "learning_rate": 3.888611876937043e-06, "loss": 0.8546747, "num_input_tokens_seen": 38461820, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.1796875, "step": 1777, "time_per_iteration": 2.643632411956787 }, { "auxiliary_loss_clip": 0.01203153, "auxiliary_loss_mlp": 0.01300083, "balance_loss_clip": 1.03018141, "balance_loss_mlp": 1.0537684, "epoch": 0.10689914324364948, "flos": 25042952551680.0, "grad_norm": 1.6968205038980704, "language_loss": 0.86987555, "learning_rate": 3.888487513686832e-06, "loss": 0.89490789, "num_input_tokens_seen": 38482235, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.125, "step": 1778, "time_per_iteration": 2.5928187370300293 }, { "auxiliary_loss_clip": 0.01216963, "auxiliary_loss_mlp": 0.0130614, "balance_loss_clip": 1.03521609, "balance_loss_mlp": 1.05393636, "epoch": 0.10695926649631746, "flos": 16435129507200.0, "grad_norm": 2.121790266451745, "language_loss": 0.84057462, "learning_rate": 3.88836308304136e-06, "loss": 0.86580569, "num_input_tokens_seen": 38500690, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.171875, "step": 1779, "time_per_iteration": 2.6219899654388428 }, { "auxiliary_loss_clip": 0.01184896, "auxiliary_loss_mlp": 0.01299847, "balance_loss_clip": 1.03048205, "balance_loss_mlp": 1.05192161, "epoch": 0.10701938974898542, "flos": 16979211792000.0, "grad_norm": 1.8504992165149632, "language_loss": 0.67311382, "learning_rate": 3.888238585005066e-06, "loss": 0.69796133, "num_input_tokens_seen": 38518405, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.1484375, "step": 1780, "time_per_iteration": 2.521993398666382 }, { "auxiliary_loss_clip": 0.01177499, "auxiliary_loss_mlp": 0.01045969, "balance_loss_clip": 1.02407038, "balance_loss_mlp": 1.05294394, "epoch": 0.10707951300165339, "flos": 15888102307200.0, "grad_norm": 2.3432640527235895, "language_loss": 0.91479558, "learning_rate": 3.888114019582395e-06, "loss": 0.93703026, "num_input_tokens_seen": 38535060, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.15625, "step": 1781, "time_per_iteration": 2.570507287979126 }, { "auxiliary_loss_clip": 0.01206768, "auxiliary_loss_mlp": 0.01049372, "balance_loss_clip": 1.02810538, "balance_loss_mlp": 1.05272603, "epoch": 0.10713963625432135, "flos": 14247164361600.0, "grad_norm": 2.1113736854128407, "language_loss": 0.79028332, "learning_rate": 3.887989386777791e-06, "loss": 0.81284475, "num_input_tokens_seen": 38552855, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.171875, "step": 1782, "time_per_iteration": 2.654642343521118 }, { "auxiliary_loss_clip": 0.01192582, "auxiliary_loss_mlp": 0.01051519, "balance_loss_clip": 1.03009748, "balance_loss_mlp": 1.05427599, "epoch": 0.10719975950698933, "flos": 16756780821120.0, "grad_norm": 2.181188544298854, "language_loss": 0.79405266, "learning_rate": 3.887864686595703e-06, "loss": 0.81649369, "num_input_tokens_seen": 38570075, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1953125, "step": 1783, "time_per_iteration": 2.602684259414673 }, { "auxiliary_loss_clip": 0.01190539, "auxiliary_loss_mlp": 0.01052764, "balance_loss_clip": 1.03041232, "balance_loss_mlp": 1.05319667, "epoch": 0.1072598827596573, "flos": 22710626645760.0, "grad_norm": 2.052783497359654, "language_loss": 0.86187416, "learning_rate": 3.887739919040579e-06, "loss": 0.88430721, "num_input_tokens_seen": 38587970, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.1875, "step": 1784, "time_per_iteration": 2.6216259002685547 }, { "auxiliary_loss_clip": 0.01190264, "auxiliary_loss_mlp": 0.01052292, "balance_loss_clip": 1.03035736, "balance_loss_mlp": 1.05211067, "epoch": 0.10732000601232526, "flos": 23258264376960.0, "grad_norm": 2.4052982756349426, "language_loss": 1.00840735, "learning_rate": 3.887615084116874e-06, "loss": 1.03083289, "num_input_tokens_seen": 38605840, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.203125, "step": 1785, "time_per_iteration": 2.700336217880249 }, { "auxiliary_loss_clip": 0.01177002, "auxiliary_loss_mlp": 0.01051876, "balance_loss_clip": 1.03132403, "balance_loss_mlp": 1.05458009, "epoch": 0.10738012926499324, "flos": 24207060176640.0, "grad_norm": 1.367279751315531, "language_loss": 0.84886014, "learning_rate": 3.887490181829042e-06, "loss": 0.87114888, "num_input_tokens_seen": 38627070, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1328125, "step": 1786, "time_per_iteration": 2.657174825668335 }, { "auxiliary_loss_clip": 0.01170761, "auxiliary_loss_mlp": 0.01057643, "balance_loss_clip": 1.03433716, "balance_loss_mlp": 1.05102932, "epoch": 0.1074402525176612, "flos": 20923065383040.0, "grad_norm": 9.583710137052648, "language_loss": 0.78248382, "learning_rate": 3.887365212181542e-06, "loss": 0.80476791, "num_input_tokens_seen": 38645840, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.1953125, "step": 1787, "time_per_iteration": 2.6651008129119873 }, { "auxiliary_loss_clip": 0.01184167, "auxiliary_loss_mlp": 0.0104644, "balance_loss_clip": 1.02421904, "balance_loss_mlp": 1.05379033, "epoch": 0.10750037577032917, "flos": 16946928443520.0, "grad_norm": 1.838186976054259, "language_loss": 0.82104385, "learning_rate": 3.88724017517883e-06, "loss": 0.84334987, "num_input_tokens_seen": 38664770, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.2109375, "step": 1788, "time_per_iteration": 2.6991965770721436 }, { "auxiliary_loss_clip": 0.0117129, "auxiliary_loss_mlp": 0.01059104, "balance_loss_clip": 1.03832626, "balance_loss_mlp": 1.0537622, "epoch": 0.10756049902299715, "flos": 20266546550400.0, "grad_norm": 2.1007445516458287, "language_loss": 0.77676022, "learning_rate": 3.887115070825373e-06, "loss": 0.79906416, "num_input_tokens_seen": 38683865, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.171875, "step": 1789, "time_per_iteration": 2.610727071762085 }, { "auxiliary_loss_clip": 0.01173535, "auxiliary_loss_mlp": 0.01062715, "balance_loss_clip": 1.0382055, "balance_loss_mlp": 1.05502748, "epoch": 0.10762062227566511, "flos": 23586523793280.0, "grad_norm": 2.5455732398598085, "language_loss": 0.74719799, "learning_rate": 3.886989899125632e-06, "loss": 0.76956046, "num_input_tokens_seen": 38702485, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.1875, "step": 1790, "time_per_iteration": 2.6803202629089355 }, { "auxiliary_loss_clip": 0.0118595, "auxiliary_loss_mlp": 0.01313812, "balance_loss_clip": 1.04064178, "balance_loss_mlp": 1.0564177, "epoch": 0.10768074552833308, "flos": 24310626065280.0, "grad_norm": 2.6175683995491883, "language_loss": 0.78718114, "learning_rate": 3.886864660084075e-06, "loss": 0.81217873, "num_input_tokens_seen": 38722475, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.203125, "step": 1791, "time_per_iteration": 2.610154390335083 }, { "auxiliary_loss_clip": 0.01193941, "auxiliary_loss_mlp": 0.01056113, "balance_loss_clip": 1.03484559, "balance_loss_mlp": 1.05355835, "epoch": 0.10774086878100106, "flos": 25299965341440.0, "grad_norm": 1.8965836987127436, "language_loss": 0.70717227, "learning_rate": 3.886739353705173e-06, "loss": 0.72967291, "num_input_tokens_seen": 38743285, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.125, "step": 1792, "time_per_iteration": 2.730623960494995 }, { "auxiliary_loss_clip": 0.01201904, "auxiliary_loss_mlp": 0.0104823, "balance_loss_clip": 1.02449536, "balance_loss_mlp": 1.05265045, "epoch": 0.10780099203366902, "flos": 22054035985920.0, "grad_norm": 2.02253655334134, "language_loss": 0.75956237, "learning_rate": 3.886613979993396e-06, "loss": 0.78206372, "num_input_tokens_seen": 38763035, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.21875, "step": 1793, "time_per_iteration": 2.6037778854370117 }, { "auxiliary_loss_clip": 0.01207287, "auxiliary_loss_mlp": 0.01056881, "balance_loss_clip": 1.03503013, "balance_loss_mlp": 1.05587292, "epoch": 0.10786111528633699, "flos": 22747471021440.0, "grad_norm": 1.6133813823252605, "language_loss": 0.84971237, "learning_rate": 3.886488538953219e-06, "loss": 0.87235409, "num_input_tokens_seen": 38784900, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.15625, "step": 1794, "time_per_iteration": 2.7071382999420166 }, { "auxiliary_loss_clip": 0.01223881, "auxiliary_loss_mlp": 0.01050376, "balance_loss_clip": 1.02641511, "balance_loss_mlp": 1.05562663, "epoch": 0.10792123853900495, "flos": 20851064570880.0, "grad_norm": 1.7505362357211245, "language_loss": 0.74863023, "learning_rate": 3.8863630305891196e-06, "loss": 0.77137274, "num_input_tokens_seen": 38804695, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.2265625, "step": 1795, "time_per_iteration": 2.6485202312469482 }, { "auxiliary_loss_clip": 0.01212258, "auxiliary_loss_mlp": 0.01060394, "balance_loss_clip": 1.03810179, "balance_loss_mlp": 1.05497468, "epoch": 0.10798136179167293, "flos": 17748705876480.0, "grad_norm": 4.105094677377714, "language_loss": 0.81606197, "learning_rate": 3.8862374549055755e-06, "loss": 0.83878845, "num_input_tokens_seen": 38822395, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.203125, "step": 1796, "time_per_iteration": 2.6941370964050293 }, { "auxiliary_loss_clip": 0.01194737, "auxiliary_loss_mlp": 0.01068545, "balance_loss_clip": 1.04559708, "balance_loss_mlp": 1.05667412, "epoch": 0.1080414850443409, "flos": 13589639948160.0, "grad_norm": 2.2857123942303126, "language_loss": 0.74005723, "learning_rate": 3.886111811907069e-06, "loss": 0.76269007, "num_input_tokens_seen": 38839865, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.1953125, "step": 1797, "time_per_iteration": 2.631866931915283 }, { "auxiliary_loss_clip": 0.01189417, "auxiliary_loss_mlp": 0.010516, "balance_loss_clip": 1.03092921, "balance_loss_mlp": 1.05372238, "epoch": 0.10810160829700886, "flos": 16253421580800.0, "grad_norm": 2.015430898883969, "language_loss": 0.80982852, "learning_rate": 3.885986101598082e-06, "loss": 0.83223879, "num_input_tokens_seen": 38857300, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.171875, "step": 1798, "time_per_iteration": 2.6513869762420654 }, { "auxiliary_loss_clip": 0.01202561, "auxiliary_loss_mlp": 0.01054381, "balance_loss_clip": 1.03094447, "balance_loss_mlp": 1.0556016, "epoch": 0.10816173154967684, "flos": 15158002464000.0, "grad_norm": 2.301057064313211, "language_loss": 0.85310692, "learning_rate": 3.885860323983104e-06, "loss": 0.87567639, "num_input_tokens_seen": 38874960, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.1953125, "step": 1799, "time_per_iteration": 2.6991755962371826 }, { "auxiliary_loss_clip": 0.01188743, "auxiliary_loss_mlp": 0.01061893, "balance_loss_clip": 1.03994608, "balance_loss_mlp": 1.05653024, "epoch": 0.10822185480234481, "flos": 17785334770560.0, "grad_norm": 1.7737821983692972, "language_loss": 0.76556492, "learning_rate": 3.885734479066622e-06, "loss": 0.78807133, "num_input_tokens_seen": 38893610, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.140625, "step": 1800, "time_per_iteration": 2.606853485107422 }, { "auxiliary_loss_clip": 0.01168215, "auxiliary_loss_mlp": 0.01046549, "balance_loss_clip": 1.02574682, "balance_loss_mlp": 1.05245543, "epoch": 0.10828197805501277, "flos": 25556654908800.0, "grad_norm": 1.5412716590729243, "language_loss": 0.72981662, "learning_rate": 3.885608566853126e-06, "loss": 0.75196427, "num_input_tokens_seen": 38913485, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.15625, "step": 1801, "time_per_iteration": 2.6303131580352783 }, { "auxiliary_loss_clip": 0.01182395, "auxiliary_loss_mlp": 0.01053507, "balance_loss_clip": 1.03247833, "balance_loss_mlp": 1.05781674, "epoch": 0.10834210130768075, "flos": 28984435845120.0, "grad_norm": 2.4850590240580126, "language_loss": 0.66007864, "learning_rate": 3.8854825873471115e-06, "loss": 0.68243772, "num_input_tokens_seen": 38935650, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.25, "step": 1802, "time_per_iteration": 2.639606237411499 }, { "auxiliary_loss_clip": 0.01181311, "auxiliary_loss_mlp": 0.01058572, "balance_loss_clip": 1.03647077, "balance_loss_mlp": 1.05526352, "epoch": 0.10840222456034872, "flos": 20264212166400.0, "grad_norm": 19.313529154977747, "language_loss": 0.81472254, "learning_rate": 3.885356540553073e-06, "loss": 0.83712143, "num_input_tokens_seen": 38954130, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.1640625, "step": 1803, "time_per_iteration": 2.5936243534088135 }, { "auxiliary_loss_clip": 0.01178867, "auxiliary_loss_mlp": 0.01058849, "balance_loss_clip": 1.03684282, "balance_loss_mlp": 1.05375767, "epoch": 0.10846234781301668, "flos": 19863054097920.0, "grad_norm": 2.1019053876423928, "language_loss": 0.90786362, "learning_rate": 3.88523042647551e-06, "loss": 0.93024075, "num_input_tokens_seen": 38972905, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.15625, "step": 1804, "time_per_iteration": 5.5774829387664795 }, { "auxiliary_loss_clip": 0.01210491, "auxiliary_loss_mlp": 0.01051377, "balance_loss_clip": 1.02869177, "balance_loss_mlp": 1.05560684, "epoch": 0.10852247106568465, "flos": 26469037296000.0, "grad_norm": 1.9283290665442505, "language_loss": 0.75984967, "learning_rate": 3.885104245118921e-06, "loss": 0.78246832, "num_input_tokens_seen": 38993255, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.1796875, "step": 1805, "time_per_iteration": 2.675459384918213 }, { "auxiliary_loss_clip": 0.01188203, "auxiliary_loss_mlp": 0.01046658, "balance_loss_clip": 1.02669072, "balance_loss_mlp": 1.05466557, "epoch": 0.10858259431835263, "flos": 30081506987520.0, "grad_norm": 2.898233072241593, "language_loss": 0.86024106, "learning_rate": 3.8849779964878125e-06, "loss": 0.8825897, "num_input_tokens_seen": 39012610, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1484375, "step": 1806, "time_per_iteration": 4.090376138687134 }, { "auxiliary_loss_clip": 0.0117961, "auxiliary_loss_mlp": 0.01296762, "balance_loss_clip": 1.02744174, "balance_loss_mlp": 1.05333328, "epoch": 0.10864271757102059, "flos": 19063180085760.0, "grad_norm": 2.4331678313889844, "language_loss": 0.81226695, "learning_rate": 3.884851680586687e-06, "loss": 0.83703071, "num_input_tokens_seen": 39030120, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.171875, "step": 1807, "time_per_iteration": 2.5616471767425537 }, { "auxiliary_loss_clip": 0.01196612, "auxiliary_loss_mlp": 0.01050328, "balance_loss_clip": 1.03036046, "balance_loss_mlp": 1.05516481, "epoch": 0.10870284082368856, "flos": 24715052271360.0, "grad_norm": 1.7244368377574102, "language_loss": 0.78793287, "learning_rate": 3.884725297420053e-06, "loss": 0.81040227, "num_input_tokens_seen": 39049875, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.140625, "step": 1808, "time_per_iteration": 4.635498762130737 }, { "auxiliary_loss_clip": 0.01191427, "auxiliary_loss_mlp": 0.01050443, "balance_loss_clip": 1.02866387, "balance_loss_mlp": 1.05606627, "epoch": 0.10876296407635654, "flos": 20627663932800.0, "grad_norm": 1.7791080193940354, "language_loss": 0.79054713, "learning_rate": 3.884598846992422e-06, "loss": 0.81296581, "num_input_tokens_seen": 39068935, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.171875, "step": 1809, "time_per_iteration": 2.741901397705078 }, { "auxiliary_loss_clip": 0.01169966, "auxiliary_loss_mlp": 0.01056999, "balance_loss_clip": 1.03598273, "balance_loss_mlp": 1.05382919, "epoch": 0.1088230873290245, "flos": 21579835610880.0, "grad_norm": 2.125797517961587, "language_loss": 0.84936404, "learning_rate": 3.884472329308306e-06, "loss": 0.87163365, "num_input_tokens_seen": 39087370, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.15625, "step": 1810, "time_per_iteration": 2.6077637672424316 }, { "auxiliary_loss_clip": 0.01205354, "auxiliary_loss_mlp": 0.01055068, "balance_loss_clip": 1.032251, "balance_loss_mlp": 1.05721509, "epoch": 0.10888321058169247, "flos": 26469037296000.0, "grad_norm": 2.165841510864184, "language_loss": 0.62991136, "learning_rate": 3.8843457443722195e-06, "loss": 0.65251553, "num_input_tokens_seen": 39106635, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.203125, "step": 1811, "time_per_iteration": 2.7730963230133057 }, { "auxiliary_loss_clip": 0.01190417, "auxiliary_loss_mlp": 0.01046566, "balance_loss_clip": 1.0251677, "balance_loss_mlp": 1.0549711, "epoch": 0.10894333383436045, "flos": 25848608653440.0, "grad_norm": 6.562534320366889, "language_loss": 0.74220914, "learning_rate": 3.884219092188681e-06, "loss": 0.76457894, "num_input_tokens_seen": 39126335, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.171875, "step": 1812, "time_per_iteration": 2.691321849822998 }, { "auxiliary_loss_clip": 0.01189646, "auxiliary_loss_mlp": 0.01049719, "balance_loss_clip": 1.02832103, "balance_loss_mlp": 1.05481708, "epoch": 0.10900345708702841, "flos": 19537093152000.0, "grad_norm": 1.7430196715804511, "language_loss": 0.72242904, "learning_rate": 3.884092372762209e-06, "loss": 0.74482274, "num_input_tokens_seen": 39144820, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1640625, "step": 1813, "time_per_iteration": 2.635004758834839 }, { "auxiliary_loss_clip": 0.01187513, "auxiliary_loss_mlp": 0.01047851, "balance_loss_clip": 1.02729964, "balance_loss_mlp": 1.05677366, "epoch": 0.10906358033969638, "flos": 23623296341760.0, "grad_norm": 1.9239005991471585, "language_loss": 0.8253507, "learning_rate": 3.883965586097327e-06, "loss": 0.84770435, "num_input_tokens_seen": 39165945, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.125, "step": 1814, "time_per_iteration": 2.7287893295288086 }, { "auxiliary_loss_clip": 0.01201948, "auxiliary_loss_mlp": 0.01056463, "balance_loss_clip": 1.03623283, "balance_loss_mlp": 1.05638313, "epoch": 0.10912370359236434, "flos": 21214731818880.0, "grad_norm": 2.0020019481506464, "language_loss": 0.84242141, "learning_rate": 3.88383873219856e-06, "loss": 0.86500549, "num_input_tokens_seen": 39183520, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.1875, "step": 1815, "time_per_iteration": 2.644245147705078 }, { "auxiliary_loss_clip": 0.01199632, "auxiliary_loss_mlp": 0.01048548, "balance_loss_clip": 1.02676857, "balance_loss_mlp": 1.05820131, "epoch": 0.10918382684503232, "flos": 13553190622080.0, "grad_norm": 4.005628975907575, "language_loss": 0.70916176, "learning_rate": 3.8837118110704345e-06, "loss": 0.7316435, "num_input_tokens_seen": 39201190, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.140625, "step": 1816, "time_per_iteration": 2.6867504119873047 }, { "auxiliary_loss_clip": 0.01182884, "auxiliary_loss_mlp": 0.01058593, "balance_loss_clip": 1.03622925, "balance_loss_mlp": 1.05571866, "epoch": 0.10924395009770028, "flos": 27964321591680.0, "grad_norm": 2.420029166178284, "language_loss": 0.72612953, "learning_rate": 3.88358482271748e-06, "loss": 0.74854434, "num_input_tokens_seen": 39221210, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.1875, "step": 1817, "time_per_iteration": 2.8041625022888184 }, { "auxiliary_loss_clip": 0.01219743, "auxiliary_loss_mlp": 0.01056259, "balance_loss_clip": 1.03512299, "balance_loss_mlp": 1.05704856, "epoch": 0.10930407335036825, "flos": 25593750679680.0, "grad_norm": 1.7834944083770983, "language_loss": 0.66917086, "learning_rate": 3.883457767144228e-06, "loss": 0.69193083, "num_input_tokens_seen": 39242025, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.171875, "step": 1818, "time_per_iteration": 2.850954532623291 }, { "auxiliary_loss_clip": 0.01191816, "auxiliary_loss_mlp": 0.01055888, "balance_loss_clip": 1.03470421, "balance_loss_mlp": 1.05612206, "epoch": 0.10936419660303623, "flos": 18406194376320.0, "grad_norm": 2.8484490947695273, "language_loss": 0.72935551, "learning_rate": 3.883330644355212e-06, "loss": 0.75183254, "num_input_tokens_seen": 39259870, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.171875, "step": 1819, "time_per_iteration": 2.646371841430664 }, { "auxiliary_loss_clip": 0.01194117, "auxiliary_loss_mlp": 0.01302269, "balance_loss_clip": 1.03356683, "balance_loss_mlp": 1.05988383, "epoch": 0.1094243198557042, "flos": 23840052963840.0, "grad_norm": 2.891812866284084, "language_loss": 0.73750275, "learning_rate": 3.8832034543549716e-06, "loss": 0.76246655, "num_input_tokens_seen": 39278500, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.15625, "step": 1820, "time_per_iteration": 2.7463626861572266 }, { "auxiliary_loss_clip": 0.0117254, "auxiliary_loss_mlp": 0.01053614, "balance_loss_clip": 1.03100014, "balance_loss_mlp": 1.05736709, "epoch": 0.10948444310837216, "flos": 14643940970880.0, "grad_norm": 1.993908507502792, "language_loss": 0.82046473, "learning_rate": 3.883076197148043e-06, "loss": 0.84272623, "num_input_tokens_seen": 39294800, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.15625, "step": 1821, "time_per_iteration": 2.5752573013305664 }, { "auxiliary_loss_clip": 0.01185553, "auxiliary_loss_mlp": 0.01048551, "balance_loss_clip": 1.02855957, "balance_loss_mlp": 1.05377197, "epoch": 0.10954456636104014, "flos": 27818811596160.0, "grad_norm": 2.040765427529535, "language_loss": 0.76203907, "learning_rate": 3.8829488727389684e-06, "loss": 0.7843802, "num_input_tokens_seen": 39314625, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1328125, "step": 1822, "time_per_iteration": 2.797516345977783 }, { "auxiliary_loss_clip": 0.01186359, "auxiliary_loss_mlp": 0.01047152, "balance_loss_clip": 1.02830529, "balance_loss_mlp": 1.05567789, "epoch": 0.1096046896137081, "flos": 33620934372480.0, "grad_norm": 1.8589801922162954, "language_loss": 0.79678875, "learning_rate": 3.882821481132294e-06, "loss": 0.81912386, "num_input_tokens_seen": 39336465, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.125, "step": 1823, "time_per_iteration": 2.7511489391326904 }, { "auxiliary_loss_clip": 0.01186554, "auxiliary_loss_mlp": 0.01045891, "balance_loss_clip": 1.0257926, "balance_loss_mlp": 1.05539358, "epoch": 0.10966481286637607, "flos": 26980010219520.0, "grad_norm": 1.5802018304571641, "language_loss": 0.79266697, "learning_rate": 3.882694022332562e-06, "loss": 0.81499147, "num_input_tokens_seen": 39357930, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1328125, "step": 1824, "time_per_iteration": 2.640843152999878 }, { "auxiliary_loss_clip": 0.01184147, "auxiliary_loss_mlp": 0.01050955, "balance_loss_clip": 1.03103566, "balance_loss_mlp": 1.05416965, "epoch": 0.10972493611904403, "flos": 23036551678080.0, "grad_norm": 1.797826184070038, "language_loss": 0.87562519, "learning_rate": 3.882566496344324e-06, "loss": 0.89797622, "num_input_tokens_seen": 39376380, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1171875, "step": 1825, "time_per_iteration": 2.653904914855957 }, { "auxiliary_loss_clip": 0.01189755, "auxiliary_loss_mlp": 0.01051534, "balance_loss_clip": 1.03113747, "balance_loss_mlp": 1.04987669, "epoch": 0.10978505937171201, "flos": 38104632443520.0, "grad_norm": 2.9444891223012655, "language_loss": 0.75904536, "learning_rate": 3.88243890317213e-06, "loss": 0.7814582, "num_input_tokens_seen": 39399935, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.125, "step": 1826, "time_per_iteration": 2.751182794570923 }, { "auxiliary_loss_clip": 0.01070298, "auxiliary_loss_mlp": 0.01012988, "balance_loss_clip": 1.01052022, "balance_loss_mlp": 1.01963806, "epoch": 0.10984518262437998, "flos": 59849694616320.0, "grad_norm": 0.8260191299056594, "language_loss": 0.54997128, "learning_rate": 3.882311242820534e-06, "loss": 0.57080412, "num_input_tokens_seen": 39460685, "router_z_loss_clip": 0.0246582, "router_z_loss_mlp": 0.41601562, "step": 1827, "time_per_iteration": 3.202707529067993 }, { "auxiliary_loss_clip": 0.01060512, "auxiliary_loss_mlp": 0.01005658, "balance_loss_clip": 1.00309527, "balance_loss_mlp": 1.01888275, "epoch": 0.10990530587704794, "flos": 66719837410560.0, "grad_norm": 1.0186319703211515, "language_loss": 0.55357569, "learning_rate": 3.882183515294092e-06, "loss": 0.57423735, "num_input_tokens_seen": 39524765, "router_z_loss_clip": 0.02563477, "router_z_loss_mlp": 0.41796875, "step": 1828, "time_per_iteration": 3.17287015914917 }, { "auxiliary_loss_clip": 0.01190341, "auxiliary_loss_mlp": 0.01054819, "balance_loss_clip": 1.03230035, "balance_loss_mlp": 1.05439866, "epoch": 0.10996542912971592, "flos": 25447199189760.0, "grad_norm": 2.314982051330621, "language_loss": 0.83811307, "learning_rate": 3.882055720597362e-06, "loss": 0.86056465, "num_input_tokens_seen": 39543640, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.1796875, "step": 1829, "time_per_iteration": 2.5984814167022705 }, { "auxiliary_loss_clip": 0.01207545, "auxiliary_loss_mlp": 0.01049958, "balance_loss_clip": 1.02892983, "balance_loss_mlp": 1.05526316, "epoch": 0.11002555238238389, "flos": 44018186186880.0, "grad_norm": 1.7660058943534778, "language_loss": 0.8858844, "learning_rate": 3.8819278587349045e-06, "loss": 0.90845942, "num_input_tokens_seen": 39567525, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1640625, "step": 1830, "time_per_iteration": 2.8070228099823 }, { "auxiliary_loss_clip": 0.01078769, "auxiliary_loss_mlp": 0.01002295, "balance_loss_clip": 0.99999458, "balance_loss_mlp": 1.0192523, "epoch": 0.11008567563505185, "flos": 54065133590400.0, "grad_norm": 0.6922764823363702, "language_loss": 0.5554291, "learning_rate": 3.881799929711282e-06, "loss": 0.57623971, "num_input_tokens_seen": 39628470, "router_z_loss_clip": 0.02294922, "router_z_loss_mlp": 0.41796875, "step": 1831, "time_per_iteration": 3.1948776245117188 }, { "auxiliary_loss_clip": 0.01192939, "auxiliary_loss_mlp": 0.0106357, "balance_loss_clip": 1.04149246, "balance_loss_mlp": 1.05500531, "epoch": 0.11014579888771983, "flos": 24243150366720.0, "grad_norm": 3.4249782141795597, "language_loss": 0.91199064, "learning_rate": 3.881671933531061e-06, "loss": 0.93455577, "num_input_tokens_seen": 39646670, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.1953125, "step": 1832, "time_per_iteration": 2.70375394821167 }, { "auxiliary_loss_clip": 0.01069679, "auxiliary_loss_mlp": 0.01002877, "balance_loss_clip": 1.00040936, "balance_loss_mlp": 1.01878297, "epoch": 0.1102059221403878, "flos": 57743965658880.0, "grad_norm": 0.7132277413318966, "language_loss": 0.59928864, "learning_rate": 3.881543870198809e-06, "loss": 0.62001419, "num_input_tokens_seen": 39712915, "router_z_loss_clip": 0.0246582, "router_z_loss_mlp": 0.41796875, "step": 1833, "time_per_iteration": 3.2537496089935303 }, { "auxiliary_loss_clip": 0.01176892, "auxiliary_loss_mlp": 0.01046937, "balance_loss_clip": 1.02619481, "balance_loss_mlp": 1.05131745, "epoch": 0.11026604539305576, "flos": 16795923667200.0, "grad_norm": 2.5001358134522973, "language_loss": 0.80079275, "learning_rate": 3.881415739719096e-06, "loss": 0.82303107, "num_input_tokens_seen": 39730650, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1640625, "step": 1834, "time_per_iteration": 2.6693155765533447 }, { "auxiliary_loss_clip": 0.01207997, "auxiliary_loss_mlp": 0.01050025, "balance_loss_clip": 1.02775693, "balance_loss_mlp": 1.05621433, "epoch": 0.11032616864572373, "flos": 23988076911360.0, "grad_norm": 3.191550676392078, "language_loss": 0.90726221, "learning_rate": 3.881287542096494e-06, "loss": 0.92984247, "num_input_tokens_seen": 39751065, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.15625, "step": 1835, "time_per_iteration": 2.710669994354248 }, { "auxiliary_loss_clip": 0.01197748, "auxiliary_loss_mlp": 0.0105303, "balance_loss_clip": 1.03079712, "balance_loss_mlp": 1.05409288, "epoch": 0.1103862918983917, "flos": 19683141851520.0, "grad_norm": 3.070282648053945, "language_loss": 0.63556826, "learning_rate": 3.881159277335581e-06, "loss": 0.65807605, "num_input_tokens_seen": 39769245, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.1640625, "step": 1836, "time_per_iteration": 2.6906561851501465 }, { "auxiliary_loss_clip": 0.01195392, "auxiliary_loss_mlp": 0.01052963, "balance_loss_clip": 1.03293562, "balance_loss_mlp": 1.05239844, "epoch": 0.11044641515105967, "flos": 32160878340480.0, "grad_norm": 1.9718157496228208, "language_loss": 0.73030961, "learning_rate": 3.88103094544093e-06, "loss": 0.75279319, "num_input_tokens_seen": 39790830, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.15625, "step": 1837, "time_per_iteration": 2.7582931518554688 }, { "auxiliary_loss_clip": 0.01183077, "auxiliary_loss_mlp": 0.01054837, "balance_loss_clip": 1.03420174, "balance_loss_mlp": 1.05795038, "epoch": 0.11050653840372764, "flos": 16689233295360.0, "grad_norm": 2.023100070965251, "language_loss": 0.78391004, "learning_rate": 3.880902546417125e-06, "loss": 0.80628914, "num_input_tokens_seen": 39809475, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.15625, "step": 1838, "time_per_iteration": 2.657219171524048 }, { "auxiliary_loss_clip": 0.01188381, "auxiliary_loss_mlp": 0.01062431, "balance_loss_clip": 1.04090166, "balance_loss_mlp": 1.05496454, "epoch": 0.11056666165639562, "flos": 21208877902080.0, "grad_norm": 2.1846285883026724, "language_loss": 0.71645463, "learning_rate": 3.880774080268745e-06, "loss": 0.73896277, "num_input_tokens_seen": 39826355, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.15625, "step": 1839, "time_per_iteration": 2.6865243911743164 }, { "auxiliary_loss_clip": 0.01175945, "auxiliary_loss_mlp": 0.01298024, "balance_loss_clip": 1.02776945, "balance_loss_mlp": 1.05722308, "epoch": 0.11062678490906358, "flos": 19165488998400.0, "grad_norm": 2.029021971924251, "language_loss": 0.78432298, "learning_rate": 3.880645547000377e-06, "loss": 0.80906266, "num_input_tokens_seen": 39845335, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1875, "step": 1840, "time_per_iteration": 2.704533576965332 }, { "auxiliary_loss_clip": 0.01175468, "auxiliary_loss_mlp": 0.01043272, "balance_loss_clip": 1.02303052, "balance_loss_mlp": 1.05324292, "epoch": 0.11068690816173155, "flos": 24895287740160.0, "grad_norm": 1.623025553353892, "language_loss": 0.87616712, "learning_rate": 3.880516946616606e-06, "loss": 0.89835453, "num_input_tokens_seen": 39865065, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.125, "step": 1841, "time_per_iteration": 2.705277681350708 }, { "auxiliary_loss_clip": 0.01195795, "auxiliary_loss_mlp": 0.0105517, "balance_loss_clip": 1.03480923, "balance_loss_mlp": 1.05618262, "epoch": 0.11074703141439952, "flos": 16472368932480.0, "grad_norm": 1.9028797346298785, "language_loss": 0.89767963, "learning_rate": 3.880388279122023e-06, "loss": 0.92018932, "num_input_tokens_seen": 39882780, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.125, "step": 1842, "time_per_iteration": 2.587693929672241 }, { "auxiliary_loss_clip": 0.01185751, "auxiliary_loss_mlp": 0.01050455, "balance_loss_clip": 1.03094053, "balance_loss_mlp": 1.05272806, "epoch": 0.11080715466706749, "flos": 19172420323200.0, "grad_norm": 2.084450070764847, "language_loss": 0.85592669, "learning_rate": 3.880259544521219e-06, "loss": 0.87828875, "num_input_tokens_seen": 39900295, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.1484375, "step": 1843, "time_per_iteration": 2.7247958183288574 }, { "auxiliary_loss_clip": 0.01205401, "auxiliary_loss_mlp": 0.01300319, "balance_loss_clip": 1.0311991, "balance_loss_mlp": 1.05591941, "epoch": 0.11086727791973545, "flos": 27704687109120.0, "grad_norm": 1.5155450687728569, "language_loss": 0.74414361, "learning_rate": 3.880130742818789e-06, "loss": 0.7692008, "num_input_tokens_seen": 39922075, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1328125, "step": 1844, "time_per_iteration": 2.6568496227264404 }, { "auxiliary_loss_clip": 0.01170191, "auxiliary_loss_mlp": 0.01049282, "balance_loss_clip": 1.02750254, "balance_loss_mlp": 1.05386496, "epoch": 0.11092740117240343, "flos": 18514967736960.0, "grad_norm": 2.2414740304889196, "language_loss": 0.75079381, "learning_rate": 3.880001874019328e-06, "loss": 0.77298856, "num_input_tokens_seen": 39940115, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.1640625, "step": 1845, "time_per_iteration": 4.0403478145599365 }, { "auxiliary_loss_clip": 0.01186175, "auxiliary_loss_mlp": 0.01052236, "balance_loss_clip": 1.0325067, "balance_loss_mlp": 1.0541358, "epoch": 0.1109875244250714, "flos": 20522446018560.0, "grad_norm": 1.6756265830881534, "language_loss": 0.76357871, "learning_rate": 3.879872938127438e-06, "loss": 0.78596282, "num_input_tokens_seen": 39959920, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.140625, "step": 1846, "time_per_iteration": 4.293850660324097 }, { "auxiliary_loss_clip": 0.01182225, "auxiliary_loss_mlp": 0.01049958, "balance_loss_clip": 1.0298121, "balance_loss_mlp": 1.05542064, "epoch": 0.11104764767773936, "flos": 14098601710080.0, "grad_norm": 2.801978192642052, "language_loss": 0.85692084, "learning_rate": 3.879743935147717e-06, "loss": 0.87924266, "num_input_tokens_seen": 39974755, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1796875, "step": 1847, "time_per_iteration": 4.040914058685303 }, { "auxiliary_loss_clip": 0.01171259, "auxiliary_loss_mlp": 0.01045167, "balance_loss_clip": 1.02463889, "balance_loss_mlp": 1.05330539, "epoch": 0.11110777093040733, "flos": 20594518657920.0, "grad_norm": 1.97978625858801, "language_loss": 0.76695889, "learning_rate": 3.87961486508477e-06, "loss": 0.78912312, "num_input_tokens_seen": 39993355, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.1796875, "step": 1848, "time_per_iteration": 2.5576424598693848 }, { "auxiliary_loss_clip": 0.01179225, "auxiliary_loss_mlp": 0.01050916, "balance_loss_clip": 1.03173494, "balance_loss_mlp": 1.05990922, "epoch": 0.11116789418307531, "flos": 21870065502720.0, "grad_norm": 2.2060604006082705, "language_loss": 0.7712785, "learning_rate": 3.879485727943204e-06, "loss": 0.79357994, "num_input_tokens_seen": 40012410, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.1015625, "step": 1849, "time_per_iteration": 4.250816583633423 }, { "auxiliary_loss_clip": 0.01199878, "auxiliary_loss_mlp": 0.01303617, "balance_loss_clip": 1.03519368, "balance_loss_mlp": 1.05478644, "epoch": 0.11122801743574327, "flos": 15523106256000.0, "grad_norm": 2.3631368262455807, "language_loss": 0.71285605, "learning_rate": 3.879356523727627e-06, "loss": 0.73789096, "num_input_tokens_seen": 40029315, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.1796875, "step": 1850, "time_per_iteration": 2.6089859008789062 }, { "auxiliary_loss_clip": 0.01182425, "auxiliary_loss_mlp": 0.01046829, "balance_loss_clip": 1.02647972, "balance_loss_mlp": 1.0570333, "epoch": 0.11128814068841124, "flos": 14392279307520.0, "grad_norm": 2.2465680779674884, "language_loss": 0.81387919, "learning_rate": 3.87922725244265e-06, "loss": 0.83617175, "num_input_tokens_seen": 40045765, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1640625, "step": 1851, "time_per_iteration": 2.508101463317871 }, { "auxiliary_loss_clip": 0.0117815, "auxiliary_loss_mlp": 0.01058719, "balance_loss_clip": 1.03892982, "balance_loss_mlp": 1.05471635, "epoch": 0.11134826394107922, "flos": 16653933204480.0, "grad_norm": 2.79079010329326, "language_loss": 0.87915683, "learning_rate": 3.879097914092886e-06, "loss": 0.9015255, "num_input_tokens_seen": 40061660, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.140625, "step": 1852, "time_per_iteration": 2.628222942352295 }, { "auxiliary_loss_clip": 0.01181738, "auxiliary_loss_mlp": 0.01062428, "balance_loss_clip": 1.0411613, "balance_loss_mlp": 1.05726635, "epoch": 0.11140838719374718, "flos": 16690993061760.0, "grad_norm": 2.2513508566804963, "language_loss": 0.71934563, "learning_rate": 3.878968508682952e-06, "loss": 0.74178725, "num_input_tokens_seen": 40080180, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.15625, "step": 1853, "time_per_iteration": 2.562788724899292 }, { "auxiliary_loss_clip": 0.01062725, "auxiliary_loss_mlp": 0.01013586, "balance_loss_clip": 1.01017702, "balance_loss_mlp": 1.02061272, "epoch": 0.11146851044641515, "flos": 60976355587200.0, "grad_norm": 0.7726126654889416, "language_loss": 0.5373795, "learning_rate": 3.878839036217464e-06, "loss": 0.5581426, "num_input_tokens_seen": 40138910, "router_z_loss_clip": 0.03417969, "router_z_loss_mlp": 0.421875, "step": 1854, "time_per_iteration": 3.096808910369873 }, { "auxiliary_loss_clip": 0.01197323, "auxiliary_loss_mlp": 0.01053527, "balance_loss_clip": 1.03183103, "balance_loss_mlp": 1.05659318, "epoch": 0.11152863369908313, "flos": 22193835719040.0, "grad_norm": 2.1828350948581488, "language_loss": 0.84505856, "learning_rate": 3.878709496701045e-06, "loss": 0.86756712, "num_input_tokens_seen": 40157745, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.2265625, "step": 1855, "time_per_iteration": 2.599975109100342 }, { "auxiliary_loss_clip": 0.01191292, "auxiliary_loss_mlp": 0.01057429, "balance_loss_clip": 1.03768814, "balance_loss_mlp": 1.05712152, "epoch": 0.11158875695175109, "flos": 19537524115200.0, "grad_norm": 2.4413421125285626, "language_loss": 0.81335044, "learning_rate": 3.8785798901383155e-06, "loss": 0.83583766, "num_input_tokens_seen": 40175375, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.15625, "step": 1856, "time_per_iteration": 2.6224117279052734 }, { "auxiliary_loss_clip": 0.01180983, "auxiliary_loss_mlp": 0.01045663, "balance_loss_clip": 1.02483678, "balance_loss_mlp": 1.0582031, "epoch": 0.11164888020441906, "flos": 25442709989760.0, "grad_norm": 1.895814785036508, "language_loss": 0.82875752, "learning_rate": 3.878450216533902e-06, "loss": 0.85102403, "num_input_tokens_seen": 40195715, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.140625, "step": 1857, "time_per_iteration": 2.6187660694122314 }, { "auxiliary_loss_clip": 0.01200361, "auxiliary_loss_mlp": 0.01041343, "balance_loss_clip": 1.02201927, "balance_loss_mlp": 1.05541348, "epoch": 0.11170900345708702, "flos": 15632741543040.0, "grad_norm": 2.143310630585745, "language_loss": 0.82724255, "learning_rate": 3.878320475892433e-06, "loss": 0.84965962, "num_input_tokens_seen": 40213975, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.1796875, "step": 1858, "time_per_iteration": 2.646541118621826 }, { "auxiliary_loss_clip": 0.01200722, "auxiliary_loss_mlp": 0.01052666, "balance_loss_clip": 1.03222179, "balance_loss_mlp": 1.0567354, "epoch": 0.111769126709755, "flos": 23039424766080.0, "grad_norm": 2.1174088389257717, "language_loss": 0.90775144, "learning_rate": 3.878190668218537e-06, "loss": 0.93028533, "num_input_tokens_seen": 40233905, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.171875, "step": 1859, "time_per_iteration": 2.628368854522705 }, { "auxiliary_loss_clip": 0.01182798, "auxiliary_loss_mlp": 0.01044771, "balance_loss_clip": 1.02389789, "balance_loss_mlp": 1.05515039, "epoch": 0.11182924996242297, "flos": 20850705434880.0, "grad_norm": 2.396177823845084, "language_loss": 0.81299776, "learning_rate": 3.878060793516847e-06, "loss": 0.8352735, "num_input_tokens_seen": 40252810, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1875, "step": 1860, "time_per_iteration": 2.624880075454712 }, { "auxiliary_loss_clip": 0.01187306, "auxiliary_loss_mlp": 0.0105413, "balance_loss_clip": 1.03394747, "balance_loss_mlp": 1.05626488, "epoch": 0.11188937321509093, "flos": 17455315587840.0, "grad_norm": 1.984051988307078, "language_loss": 0.74520516, "learning_rate": 3.8779308517919995e-06, "loss": 0.76761949, "num_input_tokens_seen": 40272000, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1328125, "step": 1861, "time_per_iteration": 2.647249937057495 }, { "auxiliary_loss_clip": 0.01169218, "auxiliary_loss_mlp": 0.01048427, "balance_loss_clip": 1.0289247, "balance_loss_mlp": 1.05553007, "epoch": 0.11194949646775891, "flos": 24095916518400.0, "grad_norm": 2.9145592684933903, "language_loss": 0.88904899, "learning_rate": 3.87780084304863e-06, "loss": 0.91122544, "num_input_tokens_seen": 40290660, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.140625, "step": 1862, "time_per_iteration": 2.6606671810150146 }, { "auxiliary_loss_clip": 0.01192387, "auxiliary_loss_mlp": 0.01055503, "balance_loss_clip": 1.0350709, "balance_loss_mlp": 1.05807543, "epoch": 0.11200961972042688, "flos": 25153880728320.0, "grad_norm": 2.6987500817232744, "language_loss": 0.86866856, "learning_rate": 3.877670767291379e-06, "loss": 0.89114743, "num_input_tokens_seen": 40307820, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.15625, "step": 1863, "time_per_iteration": 2.7331387996673584 }, { "auxiliary_loss_clip": 0.01190812, "auxiliary_loss_mlp": 0.010485, "balance_loss_clip": 1.02624321, "balance_loss_mlp": 1.05637348, "epoch": 0.11206974297309484, "flos": 21288312829440.0, "grad_norm": 2.0632412230007353, "language_loss": 0.6404866, "learning_rate": 3.877540624524888e-06, "loss": 0.66287971, "num_input_tokens_seen": 40327430, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.1640625, "step": 1864, "time_per_iteration": 2.6494505405426025 }, { "auxiliary_loss_clip": 0.01216197, "auxiliary_loss_mlp": 0.01046712, "balance_loss_clip": 1.02710247, "balance_loss_mlp": 1.05746961, "epoch": 0.11212986622576282, "flos": 18915982151040.0, "grad_norm": 2.1785379359306667, "language_loss": 0.73891038, "learning_rate": 3.877410414753802e-06, "loss": 0.76153946, "num_input_tokens_seen": 40344545, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.1328125, "step": 1865, "time_per_iteration": 2.6741786003112793 }, { "auxiliary_loss_clip": 0.01216511, "auxiliary_loss_mlp": 0.0105641, "balance_loss_clip": 1.03367662, "balance_loss_mlp": 1.05557144, "epoch": 0.11218998947843078, "flos": 22054754257920.0, "grad_norm": 2.3364972240096287, "language_loss": 0.84607089, "learning_rate": 3.877280137982767e-06, "loss": 0.86880016, "num_input_tokens_seen": 40362300, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.15625, "step": 1866, "time_per_iteration": 2.6758549213409424 }, { "auxiliary_loss_clip": 0.01214538, "auxiliary_loss_mlp": 0.01294495, "balance_loss_clip": 1.02489257, "balance_loss_mlp": 1.05424929, "epoch": 0.11225011273109875, "flos": 24571697091840.0, "grad_norm": 1.5862992996716745, "language_loss": 0.81047219, "learning_rate": 3.877149794216433e-06, "loss": 0.83556253, "num_input_tokens_seen": 40384720, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1484375, "step": 1867, "time_per_iteration": 2.756495475769043 }, { "auxiliary_loss_clip": 0.01173909, "auxiliary_loss_mlp": 0.01055086, "balance_loss_clip": 1.03536892, "balance_loss_mlp": 1.05849934, "epoch": 0.11231023598376672, "flos": 28438665621120.0, "grad_norm": 2.248350314607855, "language_loss": 0.86600715, "learning_rate": 3.877019383459451e-06, "loss": 0.88829708, "num_input_tokens_seen": 40404000, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.15625, "step": 1868, "time_per_iteration": 2.6758525371551514 }, { "auxiliary_loss_clip": 0.0119005, "auxiliary_loss_mlp": 0.01299491, "balance_loss_clip": 1.03088784, "balance_loss_mlp": 1.0582273, "epoch": 0.1123703592364347, "flos": 14426466076800.0, "grad_norm": 2.268582780180726, "language_loss": 0.68030787, "learning_rate": 3.876888905716476e-06, "loss": 0.70520329, "num_input_tokens_seen": 40418665, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.1328125, "step": 1869, "time_per_iteration": 2.6341915130615234 }, { "auxiliary_loss_clip": 0.01202058, "auxiliary_loss_mlp": 0.0105689, "balance_loss_clip": 1.03546762, "balance_loss_mlp": 1.05594778, "epoch": 0.11243048248910266, "flos": 22236282616320.0, "grad_norm": 1.673917473099829, "language_loss": 0.77449024, "learning_rate": 3.876758360992165e-06, "loss": 0.79707968, "num_input_tokens_seen": 40437870, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1875, "step": 1870, "time_per_iteration": 2.6332716941833496 }, { "auxiliary_loss_clip": 0.01212085, "auxiliary_loss_mlp": 0.0105197, "balance_loss_clip": 1.0303812, "balance_loss_mlp": 1.05656457, "epoch": 0.11249060574177062, "flos": 18584167288320.0, "grad_norm": 2.338417091720562, "language_loss": 0.75824571, "learning_rate": 3.8766277492911736e-06, "loss": 0.78088629, "num_input_tokens_seen": 40455570, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.1875, "step": 1871, "time_per_iteration": 2.7612144947052 }, { "auxiliary_loss_clip": 0.01220316, "auxiliary_loss_mlp": 0.01049726, "balance_loss_clip": 1.02991319, "balance_loss_mlp": 1.058918, "epoch": 0.1125507289944386, "flos": 22856567604480.0, "grad_norm": 1.979954596926267, "language_loss": 0.8136071, "learning_rate": 3.876497070618166e-06, "loss": 0.83630753, "num_input_tokens_seen": 40473600, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.15625, "step": 1872, "time_per_iteration": 2.6974799633026123 }, { "auxiliary_loss_clip": 0.01174044, "auxiliary_loss_mlp": 0.01055384, "balance_loss_clip": 1.03532147, "balance_loss_mlp": 1.05751932, "epoch": 0.11261085224710657, "flos": 19676390094720.0, "grad_norm": 2.360415910156777, "language_loss": 0.83076346, "learning_rate": 3.876366324977806e-06, "loss": 0.85305768, "num_input_tokens_seen": 40490025, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1640625, "step": 1873, "time_per_iteration": 2.6441450119018555 }, { "auxiliary_loss_clip": 0.01205059, "auxiliary_loss_mlp": 0.01056689, "balance_loss_clip": 1.03465891, "balance_loss_mlp": 1.05738389, "epoch": 0.11267097549977453, "flos": 26063246373120.0, "grad_norm": 1.8872536992520277, "language_loss": 0.91982281, "learning_rate": 3.876235512374757e-06, "loss": 0.94244033, "num_input_tokens_seen": 40511580, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.203125, "step": 1874, "time_per_iteration": 2.7124295234680176 }, { "auxiliary_loss_clip": 0.01198261, "auxiliary_loss_mlp": 0.01062186, "balance_loss_clip": 1.0431006, "balance_loss_mlp": 1.05704546, "epoch": 0.11273109875244251, "flos": 21068036674560.0, "grad_norm": 1.7426530012839994, "language_loss": 0.75217211, "learning_rate": 3.876104632813689e-06, "loss": 0.77477652, "num_input_tokens_seen": 40530155, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.140625, "step": 1875, "time_per_iteration": 2.7331724166870117 }, { "auxiliary_loss_clip": 0.01179929, "auxiliary_loss_mlp": 0.01058053, "balance_loss_clip": 1.0390631, "balance_loss_mlp": 1.05973482, "epoch": 0.11279122200511048, "flos": 27088999061760.0, "grad_norm": 1.8464481740884036, "language_loss": 0.71929514, "learning_rate": 3.875973686299272e-06, "loss": 0.74167502, "num_input_tokens_seen": 40549500, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.1171875, "step": 1876, "time_per_iteration": 2.7348947525024414 }, { "auxiliary_loss_clip": 0.01188684, "auxiliary_loss_mlp": 0.01050351, "balance_loss_clip": 1.0297997, "balance_loss_mlp": 1.05945539, "epoch": 0.11285134525777844, "flos": 20187901722240.0, "grad_norm": 1.973148223012069, "language_loss": 0.7640903, "learning_rate": 3.875842672836182e-06, "loss": 0.78648067, "num_input_tokens_seen": 40567475, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.109375, "step": 1877, "time_per_iteration": 2.599055767059326 }, { "auxiliary_loss_clip": 0.01184284, "auxiliary_loss_mlp": 0.01057608, "balance_loss_clip": 1.03624547, "balance_loss_mlp": 1.05904353, "epoch": 0.11291146851044641, "flos": 12458453863680.0, "grad_norm": 3.258145798170814, "language_loss": 0.87904835, "learning_rate": 3.87571159242909e-06, "loss": 0.9014672, "num_input_tokens_seen": 40583280, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.15625, "step": 1878, "time_per_iteration": 2.629584312438965 }, { "auxiliary_loss_clip": 0.01199436, "auxiliary_loss_mlp": 0.01048128, "balance_loss_clip": 1.02785015, "balance_loss_mlp": 1.05718541, "epoch": 0.11297159176311439, "flos": 23842315520640.0, "grad_norm": 2.175984595889174, "language_loss": 0.81243378, "learning_rate": 3.875580445082677e-06, "loss": 0.83490944, "num_input_tokens_seen": 40603080, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1484375, "step": 1879, "time_per_iteration": 2.767256498336792 }, { "auxiliary_loss_clip": 0.01192041, "auxiliary_loss_mlp": 0.01056075, "balance_loss_clip": 1.03538036, "balance_loss_mlp": 1.05747151, "epoch": 0.11303171501578235, "flos": 29930538124800.0, "grad_norm": 1.8688055301309594, "language_loss": 0.70013225, "learning_rate": 3.875449230801622e-06, "loss": 0.72261345, "num_input_tokens_seen": 40623255, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1640625, "step": 1880, "time_per_iteration": 2.7276663780212402 }, { "auxiliary_loss_clip": 0.01189194, "auxiliary_loss_mlp": 0.01046746, "balance_loss_clip": 1.02663589, "balance_loss_mlp": 1.05725396, "epoch": 0.11309183826845032, "flos": 16180558842240.0, "grad_norm": 1.708906791344601, "language_loss": 0.72053975, "learning_rate": 3.875317949590609e-06, "loss": 0.74289918, "num_input_tokens_seen": 40641570, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1328125, "step": 1881, "time_per_iteration": 2.5574538707733154 }, { "auxiliary_loss_clip": 0.01199841, "auxiliary_loss_mlp": 0.01305632, "balance_loss_clip": 1.0351069, "balance_loss_mlp": 1.05561531, "epoch": 0.1131519615211183, "flos": 12020702814720.0, "grad_norm": 2.318532116610503, "language_loss": 0.74284065, "learning_rate": 3.875186601454322e-06, "loss": 0.7678954, "num_input_tokens_seen": 40658775, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.171875, "step": 1882, "time_per_iteration": 2.67661452293396 }, { "auxiliary_loss_clip": 0.01171172, "auxiliary_loss_mlp": 0.01055832, "balance_loss_clip": 1.03432679, "balance_loss_mlp": 1.05705202, "epoch": 0.11321208477378626, "flos": 26250125857920.0, "grad_norm": 1.9539896127469096, "language_loss": 0.79000753, "learning_rate": 3.8750551863974484e-06, "loss": 0.81227756, "num_input_tokens_seen": 40679555, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.140625, "step": 1883, "time_per_iteration": 2.6656458377838135 }, { "auxiliary_loss_clip": 0.0119897, "auxiliary_loss_mlp": 0.0105425, "balance_loss_clip": 1.03291154, "balance_loss_mlp": 1.05426967, "epoch": 0.11327220802645423, "flos": 13626376583040.0, "grad_norm": 1.8879498317806436, "language_loss": 0.76793468, "learning_rate": 3.874923704424679e-06, "loss": 0.7904669, "num_input_tokens_seen": 40697295, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.171875, "step": 1884, "time_per_iteration": 2.7177963256835938 }, { "auxiliary_loss_clip": 0.01083351, "auxiliary_loss_mlp": 0.01019837, "balance_loss_clip": 1.01666582, "balance_loss_mlp": 1.03089833, "epoch": 0.1133323312791222, "flos": 57191802814080.0, "grad_norm": 0.7927316527398393, "language_loss": 0.55196881, "learning_rate": 3.8747921555407045e-06, "loss": 0.57300067, "num_input_tokens_seen": 40758095, "router_z_loss_clip": 0.03173828, "router_z_loss_mlp": 0.43359375, "step": 1885, "time_per_iteration": 3.1050589084625244 }, { "auxiliary_loss_clip": 0.01217944, "auxiliary_loss_mlp": 0.01046372, "balance_loss_clip": 1.02700007, "balance_loss_mlp": 1.05385971, "epoch": 0.11339245453179017, "flos": 24351708245760.0, "grad_norm": 1.9771281608294604, "language_loss": 0.90263009, "learning_rate": 3.874660539750222e-06, "loss": 0.92527318, "num_input_tokens_seen": 40777140, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.09375, "step": 1886, "time_per_iteration": 2.6961581707000732 }, { "auxiliary_loss_clip": 0.01207829, "auxiliary_loss_mlp": 0.01045739, "balance_loss_clip": 1.02602208, "balance_loss_mlp": 1.0572499, "epoch": 0.11345257778445814, "flos": 22670693700480.0, "grad_norm": 2.53061515785616, "language_loss": 0.84660888, "learning_rate": 3.874528857057926e-06, "loss": 0.86914456, "num_input_tokens_seen": 40797505, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.140625, "step": 1887, "time_per_iteration": 5.658379554748535 }, { "auxiliary_loss_clip": 0.01196346, "auxiliary_loss_mlp": 0.0105215, "balance_loss_clip": 1.0332787, "balance_loss_mlp": 1.05586576, "epoch": 0.11351270103712612, "flos": 20988242611200.0, "grad_norm": 4.071464518977581, "language_loss": 0.76234782, "learning_rate": 3.874397107468516e-06, "loss": 0.78483272, "num_input_tokens_seen": 40812970, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.1328125, "step": 1888, "time_per_iteration": 4.1083009243011475 }, { "auxiliary_loss_clip": 0.01213757, "auxiliary_loss_mlp": 0.010541, "balance_loss_clip": 1.03170037, "balance_loss_mlp": 1.05888104, "epoch": 0.11357282428979408, "flos": 37347923600640.0, "grad_norm": 1.785494259197903, "language_loss": 0.68408895, "learning_rate": 3.874265290986696e-06, "loss": 0.70676756, "num_input_tokens_seen": 40837745, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.1875, "step": 1889, "time_per_iteration": 2.832547664642334 }, { "auxiliary_loss_clip": 0.01170807, "auxiliary_loss_mlp": 0.0104498, "balance_loss_clip": 1.02516723, "balance_loss_mlp": 1.05677295, "epoch": 0.11363294754246205, "flos": 21757018423680.0, "grad_norm": 3.742446931759786, "language_loss": 0.83943081, "learning_rate": 3.874133407617169e-06, "loss": 0.86158872, "num_input_tokens_seen": 40856490, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.140625, "step": 1890, "time_per_iteration": 2.722033739089966 }, { "auxiliary_loss_clip": 0.01185589, "auxiliary_loss_mlp": 0.01052653, "balance_loss_clip": 1.03264964, "balance_loss_mlp": 1.05442739, "epoch": 0.11369307079513001, "flos": 22601637803520.0, "grad_norm": 2.2522961058550814, "language_loss": 0.64858747, "learning_rate": 3.874001457364642e-06, "loss": 0.67096984, "num_input_tokens_seen": 40874070, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.125, "step": 1891, "time_per_iteration": 4.986026048660278 }, { "auxiliary_loss_clip": 0.01200862, "auxiliary_loss_mlp": 0.01041702, "balance_loss_clip": 1.02254534, "balance_loss_mlp": 1.05728006, "epoch": 0.11375319404779799, "flos": 21944257044480.0, "grad_norm": 2.0656496112831864, "language_loss": 0.88294828, "learning_rate": 3.873869440233822e-06, "loss": 0.90537393, "num_input_tokens_seen": 40892425, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.15625, "step": 1892, "time_per_iteration": 2.673732280731201 }, { "auxiliary_loss_clip": 0.01219956, "auxiliary_loss_mlp": 0.01058055, "balance_loss_clip": 1.03756332, "balance_loss_mlp": 1.05772531, "epoch": 0.11381331730046595, "flos": 26395456285440.0, "grad_norm": 2.2188981419845057, "language_loss": 0.72421849, "learning_rate": 3.8737373562294225e-06, "loss": 0.74699855, "num_input_tokens_seen": 40912190, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1640625, "step": 1893, "time_per_iteration": 2.7284011840820312 }, { "auxiliary_loss_clip": 0.01170068, "auxiliary_loss_mlp": 0.01058113, "balance_loss_clip": 1.03638101, "balance_loss_mlp": 1.05565357, "epoch": 0.11387344055313392, "flos": 23804716959360.0, "grad_norm": 2.478054719321421, "language_loss": 0.79702163, "learning_rate": 3.873605205356157e-06, "loss": 0.81930339, "num_input_tokens_seen": 40928395, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1484375, "step": 1894, "time_per_iteration": 2.69269061088562 }, { "auxiliary_loss_clip": 0.01191613, "auxiliary_loss_mlp": 0.01060174, "balance_loss_clip": 1.03909802, "balance_loss_mlp": 1.05433536, "epoch": 0.1139335638058019, "flos": 34522865902080.0, "grad_norm": 2.6123455450184716, "language_loss": 0.7936604, "learning_rate": 3.873472987618742e-06, "loss": 0.81617826, "num_input_tokens_seen": 40946555, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1875, "step": 1895, "time_per_iteration": 2.6833102703094482 }, { "auxiliary_loss_clip": 0.01070036, "auxiliary_loss_mlp": 0.01008706, "balance_loss_clip": 1.00585651, "balance_loss_mlp": 1.0272944, "epoch": 0.11399368705846986, "flos": 70587811520640.0, "grad_norm": 0.8065672700106692, "language_loss": 0.63400304, "learning_rate": 3.873340703021894e-06, "loss": 0.65479052, "num_input_tokens_seen": 41004910, "router_z_loss_clip": 0.02844238, "router_z_loss_mlp": 0.42773438, "step": 1896, "time_per_iteration": 3.2924718856811523 }, { "auxiliary_loss_clip": 0.01188766, "auxiliary_loss_mlp": 0.01057333, "balance_loss_clip": 1.03580379, "balance_loss_mlp": 1.05737555, "epoch": 0.11405381031113783, "flos": 21324259365120.0, "grad_norm": 2.4596604278126755, "language_loss": 0.8514185, "learning_rate": 3.873208351570335e-06, "loss": 0.87387943, "num_input_tokens_seen": 41026385, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1328125, "step": 1897, "time_per_iteration": 2.669633626937866 }, { "auxiliary_loss_clip": 0.011716, "auxiliary_loss_mlp": 0.01306043, "balance_loss_clip": 1.03668952, "balance_loss_mlp": 1.05589557, "epoch": 0.11411393356380581, "flos": 19719627091200.0, "grad_norm": 2.286473941960933, "language_loss": 0.79384953, "learning_rate": 3.873075933268788e-06, "loss": 0.81862593, "num_input_tokens_seen": 41045315, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.15625, "step": 1898, "time_per_iteration": 2.6718766689300537 }, { "auxiliary_loss_clip": 0.01209151, "auxiliary_loss_mlp": 0.01057157, "balance_loss_clip": 1.03498435, "balance_loss_mlp": 1.05519414, "epoch": 0.11417405681647377, "flos": 17530440883200.0, "grad_norm": 1.9836233856187697, "language_loss": 0.72965258, "learning_rate": 3.87294344812198e-06, "loss": 0.75231564, "num_input_tokens_seen": 41063390, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.171875, "step": 1899, "time_per_iteration": 2.680934190750122 }, { "auxiliary_loss_clip": 0.01101073, "auxiliary_loss_mlp": 0.01005732, "balance_loss_clip": 1.00284672, "balance_loss_mlp": 1.02148199, "epoch": 0.11423418006914174, "flos": 59674666619520.0, "grad_norm": 0.9051569885935309, "language_loss": 0.63401723, "learning_rate": 3.8728108961346386e-06, "loss": 0.65508521, "num_input_tokens_seen": 41124180, "router_z_loss_clip": 0.02880859, "router_z_loss_mlp": 0.43359375, "step": 1900, "time_per_iteration": 3.1822762489318848 }, { "auxiliary_loss_clip": 0.01215293, "auxiliary_loss_mlp": 0.01054467, "balance_loss_clip": 1.03376007, "balance_loss_mlp": 1.05410421, "epoch": 0.1142943033218097, "flos": 22963114321920.0, "grad_norm": 1.9608333670634204, "language_loss": 0.78146434, "learning_rate": 3.872678277311493e-06, "loss": 0.80416197, "num_input_tokens_seen": 41143485, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.15625, "step": 1901, "time_per_iteration": 2.7314984798431396 }, { "auxiliary_loss_clip": 0.01178514, "auxiliary_loss_mlp": 0.01055139, "balance_loss_clip": 1.03483737, "balance_loss_mlp": 1.05522037, "epoch": 0.11435442657447768, "flos": 18256267008000.0, "grad_norm": 1.930539293213529, "language_loss": 0.83670974, "learning_rate": 3.872545591657276e-06, "loss": 0.85904622, "num_input_tokens_seen": 41161695, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.140625, "step": 1902, "time_per_iteration": 2.6145472526550293 }, { "auxiliary_loss_clip": 0.01179928, "auxiliary_loss_mlp": 0.01048522, "balance_loss_clip": 1.02649248, "balance_loss_mlp": 1.05462837, "epoch": 0.11441454982714565, "flos": 24061191045120.0, "grad_norm": 1.6575691273905597, "language_loss": 0.77693641, "learning_rate": 3.872412839176725e-06, "loss": 0.79922092, "num_input_tokens_seen": 41181715, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.1640625, "step": 1903, "time_per_iteration": 2.6785800457000732 }, { "auxiliary_loss_clip": 0.01189866, "auxiliary_loss_mlp": 0.01037508, "balance_loss_clip": 1.0198772, "balance_loss_mlp": 1.05639064, "epoch": 0.11447467307981361, "flos": 25337707557120.0, "grad_norm": 1.8013491746532506, "language_loss": 0.75480235, "learning_rate": 3.872280019874576e-06, "loss": 0.77707607, "num_input_tokens_seen": 41201770, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.15625, "step": 1904, "time_per_iteration": 2.606419086456299 }, { "auxiliary_loss_clip": 0.01188294, "auxiliary_loss_mlp": 0.01051931, "balance_loss_clip": 1.03073537, "balance_loss_mlp": 1.05594492, "epoch": 0.11453479633248159, "flos": 21726063878400.0, "grad_norm": 2.0414194376544925, "language_loss": 0.92139405, "learning_rate": 3.872147133755568e-06, "loss": 0.94379628, "num_input_tokens_seen": 41220590, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.140625, "step": 1905, "time_per_iteration": 2.677280902862549 }, { "auxiliary_loss_clip": 0.01188119, "auxiliary_loss_mlp": 0.01046582, "balance_loss_clip": 1.02461219, "balance_loss_mlp": 1.04835677, "epoch": 0.11459491958514956, "flos": 12969714096000.0, "grad_norm": 2.54484427275919, "language_loss": 0.77447051, "learning_rate": 3.872014180824446e-06, "loss": 0.79681754, "num_input_tokens_seen": 41237250, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.2109375, "step": 1906, "time_per_iteration": 2.5947113037109375 }, { "auxiliary_loss_clip": 0.01168762, "auxiliary_loss_mlp": 0.01053039, "balance_loss_clip": 1.03183162, "balance_loss_mlp": 1.0526247, "epoch": 0.11465504283781752, "flos": 22711273090560.0, "grad_norm": 1.9723892607951403, "language_loss": 0.81220806, "learning_rate": 3.8718811610859526e-06, "loss": 0.83442611, "num_input_tokens_seen": 41256680, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1640625, "step": 1907, "time_per_iteration": 2.654599666595459 }, { "auxiliary_loss_clip": 0.0119799, "auxiliary_loss_mlp": 0.01055078, "balance_loss_clip": 1.03521752, "balance_loss_mlp": 1.05624926, "epoch": 0.1147151660904855, "flos": 23398387332480.0, "grad_norm": 5.235723679396975, "language_loss": 0.84645891, "learning_rate": 3.8717480745448356e-06, "loss": 0.86898959, "num_input_tokens_seen": 41270955, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.140625, "step": 1908, "time_per_iteration": 2.6244406700134277 }, { "auxiliary_loss_clip": 0.01101242, "auxiliary_loss_mlp": 0.01019646, "balance_loss_clip": 1.01687992, "balance_loss_mlp": 1.02176404, "epoch": 0.11477528934315347, "flos": 63011843498880.0, "grad_norm": 0.935755819257767, "language_loss": 0.61077714, "learning_rate": 3.871614921205845e-06, "loss": 0.63198596, "num_input_tokens_seen": 41319180, "router_z_loss_clip": 0.02770996, "router_z_loss_mlp": 0.43359375, "step": 1909, "time_per_iteration": 3.0275871753692627 }, { "auxiliary_loss_clip": 0.0117347, "auxiliary_loss_mlp": 0.01050821, "balance_loss_clip": 1.03024507, "balance_loss_mlp": 1.05786419, "epoch": 0.11483541259582143, "flos": 16325601960960.0, "grad_norm": 2.5318782889949984, "language_loss": 0.78702015, "learning_rate": 3.871481701073731e-06, "loss": 0.80926305, "num_input_tokens_seen": 41337480, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.15625, "step": 1910, "time_per_iteration": 2.565884828567505 }, { "auxiliary_loss_clip": 0.01194428, "auxiliary_loss_mlp": 0.01052348, "balance_loss_clip": 1.03106928, "balance_loss_mlp": 1.05896008, "epoch": 0.1148955358484894, "flos": 21580410228480.0, "grad_norm": 2.054220738109863, "language_loss": 0.76993155, "learning_rate": 3.8713484141532505e-06, "loss": 0.79239929, "num_input_tokens_seen": 41354650, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.171875, "step": 1911, "time_per_iteration": 2.7725963592529297 }, { "auxiliary_loss_clip": 0.01183243, "auxiliary_loss_mlp": 0.01049259, "balance_loss_clip": 1.02873158, "balance_loss_mlp": 1.05321252, "epoch": 0.11495565910115738, "flos": 27673696650240.0, "grad_norm": 1.740619282931932, "language_loss": 0.79075593, "learning_rate": 3.871215060449158e-06, "loss": 0.81308097, "num_input_tokens_seen": 41376935, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1171875, "step": 1912, "time_per_iteration": 2.6508774757385254 }, { "auxiliary_loss_clip": 0.01170287, "auxiliary_loss_mlp": 0.01311013, "balance_loss_clip": 1.04047847, "balance_loss_mlp": 1.05588949, "epoch": 0.11501578235382534, "flos": 20632368614400.0, "grad_norm": 1.9922444645270438, "language_loss": 0.78069943, "learning_rate": 3.871081639966213e-06, "loss": 0.80551243, "num_input_tokens_seen": 41396105, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.140625, "step": 1913, "time_per_iteration": 2.611792802810669 }, { "auxiliary_loss_clip": 0.01178614, "auxiliary_loss_mlp": 0.01045594, "balance_loss_clip": 1.02398181, "balance_loss_mlp": 1.053195, "epoch": 0.1150759056064933, "flos": 19829046896640.0, "grad_norm": 2.065995261938078, "language_loss": 0.69869745, "learning_rate": 3.870948152709178e-06, "loss": 0.72093952, "num_input_tokens_seen": 41415600, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.1640625, "step": 1914, "time_per_iteration": 2.7238783836364746 }, { "auxiliary_loss_clip": 0.01110796, "auxiliary_loss_mlp": 0.01002323, "balance_loss_clip": 0.99973631, "balance_loss_mlp": 1.02315998, "epoch": 0.11513602885916129, "flos": 70045776311040.0, "grad_norm": 0.7595832834192717, "language_loss": 0.61020529, "learning_rate": 3.870814598682816e-06, "loss": 0.63133645, "num_input_tokens_seen": 41478760, "router_z_loss_clip": 0.02587891, "router_z_loss_mlp": 0.421875, "step": 1915, "time_per_iteration": 3.617318868637085 }, { "auxiliary_loss_clip": 0.01180769, "auxiliary_loss_mlp": 0.01051168, "balance_loss_clip": 1.03025877, "balance_loss_mlp": 1.05715764, "epoch": 0.11519615211182925, "flos": 15741730385280.0, "grad_norm": 2.360119988098741, "language_loss": 0.93030554, "learning_rate": 3.8706809778918935e-06, "loss": 0.95262492, "num_input_tokens_seen": 41495720, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.140625, "step": 1916, "time_per_iteration": 2.8948097229003906 }, { "auxiliary_loss_clip": 0.01190338, "auxiliary_loss_mlp": 0.01060106, "balance_loss_clip": 1.0369432, "balance_loss_mlp": 1.05652452, "epoch": 0.11525627536449722, "flos": 20667632791680.0, "grad_norm": 1.995080100933946, "language_loss": 0.72727162, "learning_rate": 3.870547290341179e-06, "loss": 0.74977607, "num_input_tokens_seen": 41513585, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.15625, "step": 1917, "time_per_iteration": 2.6231870651245117 }, { "auxiliary_loss_clip": 0.01198765, "auxiliary_loss_mlp": 0.01051634, "balance_loss_clip": 1.02944911, "balance_loss_mlp": 1.05695796, "epoch": 0.1153163986171652, "flos": 20303283185280.0, "grad_norm": 2.135784629389292, "language_loss": 0.73636007, "learning_rate": 3.870413536035442e-06, "loss": 0.75886405, "num_input_tokens_seen": 41533390, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.140625, "step": 1918, "time_per_iteration": 2.7398364543914795 }, { "auxiliary_loss_clip": 0.01199662, "auxiliary_loss_mlp": 0.01047176, "balance_loss_clip": 1.02650523, "balance_loss_mlp": 1.05288148, "epoch": 0.11537652186983316, "flos": 17639321984640.0, "grad_norm": 2.346994569140675, "language_loss": 0.86712551, "learning_rate": 3.870279714979458e-06, "loss": 0.88959384, "num_input_tokens_seen": 41551015, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1953125, "step": 1919, "time_per_iteration": 2.596160411834717 }, { "auxiliary_loss_clip": 0.01195248, "auxiliary_loss_mlp": 0.01059323, "balance_loss_clip": 1.03809166, "balance_loss_mlp": 1.05407143, "epoch": 0.11543664512250112, "flos": 21069401391360.0, "grad_norm": 1.871053316639401, "language_loss": 0.86603737, "learning_rate": 3.870145827178002e-06, "loss": 0.88858306, "num_input_tokens_seen": 41568055, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.140625, "step": 1920, "time_per_iteration": 2.7131540775299072 }, { "auxiliary_loss_clip": 0.01176545, "auxiliary_loss_mlp": 0.01050837, "balance_loss_clip": 1.02991545, "balance_loss_mlp": 1.05460346, "epoch": 0.11549676837516909, "flos": 22747542848640.0, "grad_norm": 2.2968512246849833, "language_loss": 0.78958154, "learning_rate": 3.8700118726358525e-06, "loss": 0.81185532, "num_input_tokens_seen": 41587435, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.125, "step": 1921, "time_per_iteration": 2.6133618354797363 }, { "auxiliary_loss_clip": 0.01181488, "auxiliary_loss_mlp": 0.01062196, "balance_loss_clip": 1.03954625, "balance_loss_mlp": 1.05378437, "epoch": 0.11555689162783707, "flos": 19168972617600.0, "grad_norm": 1.9156630005828794, "language_loss": 0.7849378, "learning_rate": 3.869877851357789e-06, "loss": 0.80737466, "num_input_tokens_seen": 41604975, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.1796875, "step": 1922, "time_per_iteration": 2.6023712158203125 }, { "auxiliary_loss_clip": 0.01237825, "auxiliary_loss_mlp": 0.01061374, "balance_loss_clip": 1.04103673, "balance_loss_mlp": 1.05747175, "epoch": 0.11561701488050503, "flos": 24572056227840.0, "grad_norm": 1.989923921254754, "language_loss": 0.84354132, "learning_rate": 3.869743763348595e-06, "loss": 0.86653328, "num_input_tokens_seen": 41626155, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.171875, "step": 1923, "time_per_iteration": 2.7428700923919678 }, { "auxiliary_loss_clip": 0.01183844, "auxiliary_loss_mlp": 0.01055821, "balance_loss_clip": 1.03396988, "balance_loss_mlp": 1.05840778, "epoch": 0.115677138133173, "flos": 17092546179840.0, "grad_norm": 2.031246937618579, "language_loss": 0.80677593, "learning_rate": 3.869609608613055e-06, "loss": 0.82917261, "num_input_tokens_seen": 41644805, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.1640625, "step": 1924, "time_per_iteration": 2.682589530944824 }, { "auxiliary_loss_clip": 0.01083951, "auxiliary_loss_mlp": 0.01017865, "balance_loss_clip": 1.01524258, "balance_loss_mlp": 1.02312732, "epoch": 0.11573726138584098, "flos": 62703875266560.0, "grad_norm": 0.8102049666362374, "language_loss": 0.61170149, "learning_rate": 3.869475387155958e-06, "loss": 0.63271964, "num_input_tokens_seen": 41709345, "router_z_loss_clip": 0.02624512, "router_z_loss_mlp": 0.4296875, "step": 1925, "time_per_iteration": 3.257267475128174 }, { "auxiliary_loss_clip": 0.01180538, "auxiliary_loss_mlp": 0.01073598, "balance_loss_clip": 1.05150867, "balance_loss_mlp": 1.05505812, "epoch": 0.11579738463850894, "flos": 22601135013120.0, "grad_norm": 1.903545735191954, "language_loss": 0.74879301, "learning_rate": 3.8693410989820925e-06, "loss": 0.77133435, "num_input_tokens_seen": 41730210, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.1640625, "step": 1926, "time_per_iteration": 2.679137706756592 }, { "auxiliary_loss_clip": 0.0121842, "auxiliary_loss_mlp": 0.01305545, "balance_loss_clip": 1.03316116, "balance_loss_mlp": 1.05487561, "epoch": 0.11585750789117691, "flos": 21726135705600.0, "grad_norm": 2.4613149288411416, "language_loss": 0.72249448, "learning_rate": 3.869206744096252e-06, "loss": 0.74773413, "num_input_tokens_seen": 41750270, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.1796875, "step": 1927, "time_per_iteration": 2.6818437576293945 }, { "auxiliary_loss_clip": 0.01187862, "auxiliary_loss_mlp": 0.01055449, "balance_loss_clip": 1.03333569, "balance_loss_mlp": 1.05446744, "epoch": 0.11591763114384489, "flos": 26287544851200.0, "grad_norm": 1.8218801112897696, "language_loss": 0.86650926, "learning_rate": 3.869072322503232e-06, "loss": 0.88894236, "num_input_tokens_seen": 41772975, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.1484375, "step": 1928, "time_per_iteration": 4.113442420959473 }, { "auxiliary_loss_clip": 0.01199441, "auxiliary_loss_mlp": 0.01052479, "balance_loss_clip": 1.03073502, "balance_loss_mlp": 1.05615497, "epoch": 0.11597775439651285, "flos": 23000461488000.0, "grad_norm": 1.7233922190967899, "language_loss": 0.77477157, "learning_rate": 3.868937834207828e-06, "loss": 0.7972908, "num_input_tokens_seen": 41791765, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.1640625, "step": 1929, "time_per_iteration": 4.226290225982666 }, { "auxiliary_loss_clip": 0.01168273, "auxiliary_loss_mlp": 0.01054292, "balance_loss_clip": 1.03480113, "balance_loss_mlp": 1.05503392, "epoch": 0.11603787764918082, "flos": 31941715507200.0, "grad_norm": 1.5985371098362537, "language_loss": 0.76507044, "learning_rate": 3.86880327921484e-06, "loss": 0.78729606, "num_input_tokens_seen": 41815615, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.1328125, "step": 1930, "time_per_iteration": 4.069920778274536 }, { "auxiliary_loss_clip": 0.01188905, "auxiliary_loss_mlp": 0.01051234, "balance_loss_clip": 1.03046727, "balance_loss_mlp": 1.05645919, "epoch": 0.1160980009018488, "flos": 22271654534400.0, "grad_norm": 1.7916406257539674, "language_loss": 0.72084463, "learning_rate": 3.8686686575290695e-06, "loss": 0.74324596, "num_input_tokens_seen": 41834810, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.140625, "step": 1931, "time_per_iteration": 2.614678382873535 }, { "auxiliary_loss_clip": 0.01191374, "auxiliary_loss_mlp": 0.01051734, "balance_loss_clip": 1.02966857, "balance_loss_mlp": 1.05624652, "epoch": 0.11615812415451676, "flos": 22783633038720.0, "grad_norm": 2.2763045109307263, "language_loss": 0.81698579, "learning_rate": 3.868533969155322e-06, "loss": 0.83941686, "num_input_tokens_seen": 41854975, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.1640625, "step": 1932, "time_per_iteration": 4.133213520050049 }, { "auxiliary_loss_clip": 0.01093333, "auxiliary_loss_mlp": 0.0101284, "balance_loss_clip": 1.01002657, "balance_loss_mlp": 1.02370381, "epoch": 0.11621824740718473, "flos": 67146096107520.0, "grad_norm": 0.7634824249450716, "language_loss": 0.61069107, "learning_rate": 3.868399214098404e-06, "loss": 0.63175285, "num_input_tokens_seen": 41911105, "router_z_loss_clip": 0.02807617, "router_z_loss_mlp": 0.421875, "step": 1933, "time_per_iteration": 3.0916383266448975 }, { "auxiliary_loss_clip": 0.01218502, "auxiliary_loss_mlp": 0.01297312, "balance_loss_clip": 1.02789688, "balance_loss_mlp": 1.05505884, "epoch": 0.11627837065985269, "flos": 20375930442240.0, "grad_norm": 2.4161956725753666, "language_loss": 0.85258412, "learning_rate": 3.868264392363124e-06, "loss": 0.87774229, "num_input_tokens_seen": 41931750, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.1796875, "step": 1934, "time_per_iteration": 2.7600862979888916 }, { "auxiliary_loss_clip": 0.01212888, "auxiliary_loss_mlp": 0.01053805, "balance_loss_clip": 1.03146505, "balance_loss_mlp": 1.05780005, "epoch": 0.11633849391252067, "flos": 21725812483200.0, "grad_norm": 2.3019383352497544, "language_loss": 0.65715182, "learning_rate": 3.868129503954293e-06, "loss": 0.67981875, "num_input_tokens_seen": 41949400, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.1875, "step": 1935, "time_per_iteration": 0.12894177436828613 }, { "auxiliary_loss_clip": 0.01174205, "auxiliary_loss_mlp": 0.01045536, "balance_loss_clip": 1.02596164, "balance_loss_mlp": 1.05608118, "epoch": 0.11639861716518864, "flos": 18805341283200.0, "grad_norm": 1.86722404522683, "language_loss": 0.75900483, "learning_rate": 3.867994548876726e-06, "loss": 0.7812022, "num_input_tokens_seen": 41968100, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.1796875, "step": 1936, "time_per_iteration": 2.5961039066314697 }, { "auxiliary_loss_clip": 0.01172239, "auxiliary_loss_mlp": 0.01049803, "balance_loss_clip": 1.02780902, "balance_loss_mlp": 1.0542171, "epoch": 0.1164587404178566, "flos": 21214983214080.0, "grad_norm": 1.891586391750952, "language_loss": 0.84290862, "learning_rate": 3.867859527135238e-06, "loss": 0.86512905, "num_input_tokens_seen": 41986375, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.1796875, "step": 1937, "time_per_iteration": 2.619680643081665 }, { "auxiliary_loss_clip": 0.0118363, "auxiliary_loss_mlp": 0.01042106, "balance_loss_clip": 1.02172136, "balance_loss_mlp": 1.05270219, "epoch": 0.11651886367052458, "flos": 27818632028160.0, "grad_norm": 1.9798755837252917, "language_loss": 0.75859112, "learning_rate": 3.867724438734649e-06, "loss": 0.78084844, "num_input_tokens_seen": 42006055, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.125, "step": 1938, "time_per_iteration": 2.779602289199829 }, { "auxiliary_loss_clip": 0.01174029, "auxiliary_loss_mlp": 0.01050234, "balance_loss_clip": 1.0294919, "balance_loss_mlp": 1.05579627, "epoch": 0.11657898692319255, "flos": 22889569224960.0, "grad_norm": 3.5228576442018715, "language_loss": 0.79230297, "learning_rate": 3.867589283679779e-06, "loss": 0.81454551, "num_input_tokens_seen": 42024995, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1875, "step": 1939, "time_per_iteration": 2.7617766857147217 }, { "auxiliary_loss_clip": 0.0118963, "auxiliary_loss_mlp": 0.01051032, "balance_loss_clip": 1.02927673, "balance_loss_mlp": 1.05541265, "epoch": 0.11663911017586051, "flos": 24315905364480.0, "grad_norm": 2.756527700895119, "language_loss": 0.86350638, "learning_rate": 3.867454061975451e-06, "loss": 0.88591301, "num_input_tokens_seen": 42042640, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.15625, "step": 1940, "time_per_iteration": 2.671757936477661 }, { "auxiliary_loss_clip": 0.01187399, "auxiliary_loss_mlp": 0.01055381, "balance_loss_clip": 1.03549719, "balance_loss_mlp": 1.05712652, "epoch": 0.11669923342852849, "flos": 42340152470400.0, "grad_norm": 1.5527132321497026, "language_loss": 0.75683892, "learning_rate": 3.8673187736264914e-06, "loss": 0.77926672, "num_input_tokens_seen": 42067005, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1171875, "step": 1941, "time_per_iteration": 2.9225330352783203 }, { "auxiliary_loss_clip": 0.01184013, "auxiliary_loss_mlp": 0.01303672, "balance_loss_clip": 1.03373313, "balance_loss_mlp": 1.05168116, "epoch": 0.11675935668119646, "flos": 14642288945280.0, "grad_norm": 2.413976483060917, "language_loss": 0.88122314, "learning_rate": 3.8671834186377275e-06, "loss": 0.90610003, "num_input_tokens_seen": 42082295, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.140625, "step": 1942, "time_per_iteration": 2.532705783843994 }, { "auxiliary_loss_clip": 0.01176305, "auxiliary_loss_mlp": 0.01048692, "balance_loss_clip": 1.02918935, "balance_loss_mlp": 1.05422008, "epoch": 0.11681947993386442, "flos": 35116470063360.0, "grad_norm": 1.829809934381118, "language_loss": 0.67699915, "learning_rate": 3.867047997013991e-06, "loss": 0.69924915, "num_input_tokens_seen": 42105295, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.1328125, "step": 1943, "time_per_iteration": 2.7030341625213623 }, { "auxiliary_loss_clip": 0.0118563, "auxiliary_loss_mlp": 0.0104303, "balance_loss_clip": 1.02237129, "balance_loss_mlp": 1.05396199, "epoch": 0.11687960318653239, "flos": 38983259024640.0, "grad_norm": 1.8973474554283754, "language_loss": 0.68919957, "learning_rate": 3.866912508760114e-06, "loss": 0.71148616, "num_input_tokens_seen": 42125520, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.1328125, "step": 1944, "time_per_iteration": 2.719453811645508 }, { "auxiliary_loss_clip": 0.01173999, "auxiliary_loss_mlp": 0.01047189, "balance_loss_clip": 1.02850926, "balance_loss_mlp": 1.05132663, "epoch": 0.11693972643920036, "flos": 25994980575360.0, "grad_norm": 1.6775261415231386, "language_loss": 0.82526976, "learning_rate": 3.866776953880932e-06, "loss": 0.84748161, "num_input_tokens_seen": 42146335, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.1328125, "step": 1945, "time_per_iteration": 2.6139724254608154 }, { "auxiliary_loss_clip": 0.01183173, "auxiliary_loss_mlp": 0.01055041, "balance_loss_clip": 1.03481126, "balance_loss_mlp": 1.05064189, "epoch": 0.11699984969186833, "flos": 27272107618560.0, "grad_norm": 2.0791287529820406, "language_loss": 0.76051599, "learning_rate": 3.8666413323812825e-06, "loss": 0.78289807, "num_input_tokens_seen": 42165320, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.140625, "step": 1946, "time_per_iteration": 2.6092562675476074 }, { "auxiliary_loss_clip": 0.01193221, "auxiliary_loss_mlp": 0.01049284, "balance_loss_clip": 1.03001976, "balance_loss_mlp": 1.05425143, "epoch": 0.1170599729445363, "flos": 15267853232640.0, "grad_norm": 2.0766489769985226, "language_loss": 0.68497121, "learning_rate": 3.8665056442660055e-06, "loss": 0.70739627, "num_input_tokens_seen": 42182955, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.1171875, "step": 1947, "time_per_iteration": 2.5753655433654785 }, { "auxiliary_loss_clip": 0.01189604, "auxiliary_loss_mlp": 0.01063922, "balance_loss_clip": 1.04339409, "balance_loss_mlp": 1.05688405, "epoch": 0.11712009619720427, "flos": 17164439251200.0, "grad_norm": 4.606595149018074, "language_loss": 0.84605253, "learning_rate": 3.866369889539942e-06, "loss": 0.86858779, "num_input_tokens_seen": 42200760, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.140625, "step": 1948, "time_per_iteration": 2.519335985183716 }, { "auxiliary_loss_clip": 0.0107919, "auxiliary_loss_mlp": 0.01259964, "balance_loss_clip": 1.00912559, "balance_loss_mlp": 1.01882958, "epoch": 0.11718021944987224, "flos": 70940991997440.0, "grad_norm": 0.8125525054610844, "language_loss": 0.65163493, "learning_rate": 3.86623406820794e-06, "loss": 0.67502642, "num_input_tokens_seen": 42265745, "router_z_loss_clip": 0.02392578, "router_z_loss_mlp": 0.421875, "step": 1949, "time_per_iteration": 3.1633310317993164 }, { "auxiliary_loss_clip": 0.01174621, "auxiliary_loss_mlp": 0.01053189, "balance_loss_clip": 1.03309035, "balance_loss_mlp": 1.05365074, "epoch": 0.1172403427025402, "flos": 27453456408960.0, "grad_norm": 1.5184310809925718, "language_loss": 0.71804237, "learning_rate": 3.8660981802748434e-06, "loss": 0.74032044, "num_input_tokens_seen": 42286245, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1171875, "step": 1950, "time_per_iteration": 2.7173280715942383 }, { "auxiliary_loss_clip": 0.01180667, "auxiliary_loss_mlp": 0.01054793, "balance_loss_clip": 1.03405046, "balance_loss_mlp": 1.05445695, "epoch": 0.11730046595520818, "flos": 15668723992320.0, "grad_norm": 2.4951362644190613, "language_loss": 0.76770353, "learning_rate": 3.865962225745504e-06, "loss": 0.79005814, "num_input_tokens_seen": 42302710, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.171875, "step": 1951, "time_per_iteration": 2.4902336597442627 }, { "auxiliary_loss_clip": 0.01205778, "auxiliary_loss_mlp": 0.01058119, "balance_loss_clip": 1.03762674, "balance_loss_mlp": 1.0552783, "epoch": 0.11736058920787615, "flos": 25630164092160.0, "grad_norm": 1.9976221899095516, "language_loss": 0.75729012, "learning_rate": 3.865826204624771e-06, "loss": 0.7799291, "num_input_tokens_seen": 42324115, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.140625, "step": 1952, "time_per_iteration": 2.6573140621185303 }, { "auxiliary_loss_clip": 0.01199013, "auxiliary_loss_mlp": 0.01055815, "balance_loss_clip": 1.03562117, "balance_loss_mlp": 1.05405188, "epoch": 0.11742071246054411, "flos": 21434289701760.0, "grad_norm": 3.2352719537217007, "language_loss": 0.71475208, "learning_rate": 3.865690116917501e-06, "loss": 0.7373004, "num_input_tokens_seen": 42342505, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.171875, "step": 1953, "time_per_iteration": 2.5717809200286865 }, { "auxiliary_loss_clip": 0.01192934, "auxiliary_loss_mlp": 0.01299648, "balance_loss_clip": 1.02869391, "balance_loss_mlp": 1.05402184, "epoch": 0.11748083571321208, "flos": 15997845335040.0, "grad_norm": 2.638501049753012, "language_loss": 0.79340851, "learning_rate": 3.8655539626285505e-06, "loss": 0.81833434, "num_input_tokens_seen": 42360525, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.203125, "step": 1954, "time_per_iteration": 2.5822906494140625 }, { "auxiliary_loss_clip": 0.01218341, "auxiliary_loss_mlp": 0.01050804, "balance_loss_clip": 1.02921486, "balance_loss_mlp": 1.05480504, "epoch": 0.11754095896588006, "flos": 16180056051840.0, "grad_norm": 2.510921087758897, "language_loss": 0.85490429, "learning_rate": 3.865417741762777e-06, "loss": 0.87759566, "num_input_tokens_seen": 42377045, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.1796875, "step": 1955, "time_per_iteration": 2.61346173286438 }, { "auxiliary_loss_clip": 0.01176358, "auxiliary_loss_mlp": 0.01295896, "balance_loss_clip": 1.02704978, "balance_loss_mlp": 1.0509181, "epoch": 0.11760108221854802, "flos": 13261596013440.0, "grad_norm": 2.1692729041661205, "language_loss": 0.77728873, "learning_rate": 3.865281454325043e-06, "loss": 0.80201125, "num_input_tokens_seen": 42393960, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1640625, "step": 1956, "time_per_iteration": 2.5809414386749268 }, { "auxiliary_loss_clip": 0.01178124, "auxiliary_loss_mlp": 0.01049168, "balance_loss_clip": 1.02785325, "balance_loss_mlp": 1.05582523, "epoch": 0.11766120547121599, "flos": 24498439303680.0, "grad_norm": 2.2872168198877154, "language_loss": 0.799052, "learning_rate": 3.865145100320212e-06, "loss": 0.82132494, "num_input_tokens_seen": 42413160, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1328125, "step": 1957, "time_per_iteration": 2.572493553161621 }, { "auxiliary_loss_clip": 0.01184488, "auxiliary_loss_mlp": 0.01049585, "balance_loss_clip": 1.02897358, "balance_loss_mlp": 1.05717611, "epoch": 0.11772132872388397, "flos": 17784005967360.0, "grad_norm": 2.208502887653745, "language_loss": 0.77830672, "learning_rate": 3.86500867975315e-06, "loss": 0.80064744, "num_input_tokens_seen": 42432590, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.1875, "step": 1958, "time_per_iteration": 2.6116905212402344 }, { "auxiliary_loss_clip": 0.0119223, "auxiliary_loss_mlp": 0.01045848, "balance_loss_clip": 1.02408028, "balance_loss_mlp": 1.05791283, "epoch": 0.11778145197655193, "flos": 13217030213760.0, "grad_norm": 2.0162877224652003, "language_loss": 0.76645708, "learning_rate": 3.864872192628725e-06, "loss": 0.78883791, "num_input_tokens_seen": 42450135, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.1640625, "step": 1959, "time_per_iteration": 2.5180134773254395 }, { "auxiliary_loss_clip": 0.01188802, "auxiliary_loss_mlp": 0.01048518, "balance_loss_clip": 1.02735782, "balance_loss_mlp": 1.05420113, "epoch": 0.1178415752292199, "flos": 20230204965120.0, "grad_norm": 1.887916610372283, "language_loss": 0.6970377, "learning_rate": 3.864735638951809e-06, "loss": 0.7194109, "num_input_tokens_seen": 42470050, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.1640625, "step": 1960, "time_per_iteration": 2.6059582233428955 }, { "auxiliary_loss_clip": 0.01192928, "auxiliary_loss_mlp": 0.010481, "balance_loss_clip": 1.02678561, "balance_loss_mlp": 1.05546689, "epoch": 0.11790169848188788, "flos": 13040134709760.0, "grad_norm": 17.16597774485873, "language_loss": 0.8110221, "learning_rate": 3.864599018727275e-06, "loss": 0.83343238, "num_input_tokens_seen": 42484335, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1875, "step": 1961, "time_per_iteration": 2.495661735534668 }, { "auxiliary_loss_clip": 0.01174548, "auxiliary_loss_mlp": 0.01306347, "balance_loss_clip": 1.03767943, "balance_loss_mlp": 1.05441236, "epoch": 0.11796182173455584, "flos": 22265728790400.0, "grad_norm": 1.881362945984494, "language_loss": 0.92288327, "learning_rate": 3.864462331959998e-06, "loss": 0.94769228, "num_input_tokens_seen": 42502720, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.109375, "step": 1962, "time_per_iteration": 2.6059727668762207 }, { "auxiliary_loss_clip": 0.0119726, "auxiliary_loss_mlp": 0.01056328, "balance_loss_clip": 1.03534722, "balance_loss_mlp": 1.05388188, "epoch": 0.1180219449872238, "flos": 10635017892480.0, "grad_norm": 2.016825465983966, "language_loss": 0.87653506, "learning_rate": 3.864325578654856e-06, "loss": 0.89907092, "num_input_tokens_seen": 42519460, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.15625, "step": 1963, "time_per_iteration": 2.554431200027466 }, { "auxiliary_loss_clip": 0.01189439, "auxiliary_loss_mlp": 0.01292515, "balance_loss_clip": 1.02431703, "balance_loss_mlp": 1.05315506, "epoch": 0.11808206823989177, "flos": 20923532259840.0, "grad_norm": 3.359957459049847, "language_loss": 0.83927667, "learning_rate": 3.864188758816731e-06, "loss": 0.86409616, "num_input_tokens_seen": 42539420, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.1796875, "step": 1964, "time_per_iteration": 2.640009641647339 }, { "auxiliary_loss_clip": 0.01200559, "auxiliary_loss_mlp": 0.01059754, "balance_loss_clip": 1.03967953, "balance_loss_mlp": 1.05886889, "epoch": 0.11814219149255975, "flos": 20777770869120.0, "grad_norm": 1.8453776502504735, "language_loss": 0.82954919, "learning_rate": 3.864051872450504e-06, "loss": 0.85215235, "num_input_tokens_seen": 42558225, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.140625, "step": 1965, "time_per_iteration": 2.5549299716949463 }, { "auxiliary_loss_clip": 0.01169491, "auxiliary_loss_mlp": 0.01047777, "balance_loss_clip": 1.02771366, "balance_loss_mlp": 1.05300045, "epoch": 0.11820231474522772, "flos": 48759938542080.0, "grad_norm": 1.557236759246327, "language_loss": 0.74290717, "learning_rate": 3.863914919561059e-06, "loss": 0.76507986, "num_input_tokens_seen": 42580790, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1640625, "step": 1966, "time_per_iteration": 2.805379629135132 }, { "auxiliary_loss_clip": 0.01196173, "auxiliary_loss_mlp": 0.01055675, "balance_loss_clip": 1.03498077, "balance_loss_mlp": 1.05872095, "epoch": 0.11826243799789568, "flos": 16690598012160.0, "grad_norm": 2.4145967242937436, "language_loss": 0.73078817, "learning_rate": 3.863777900153287e-06, "loss": 0.75330663, "num_input_tokens_seen": 42597355, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1953125, "step": 1967, "time_per_iteration": 2.5797033309936523 }, { "auxiliary_loss_clip": 0.01179518, "auxiliary_loss_mlp": 0.01047277, "balance_loss_clip": 1.02647471, "balance_loss_mlp": 1.05518985, "epoch": 0.11832256125056366, "flos": 16909868586240.0, "grad_norm": 3.1460475176456923, "language_loss": 0.88107193, "learning_rate": 3.863640814232076e-06, "loss": 0.90333986, "num_input_tokens_seen": 42616060, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.15625, "step": 1968, "time_per_iteration": 2.593172311782837 }, { "auxiliary_loss_clip": 0.01187092, "auxiliary_loss_mlp": 0.01048635, "balance_loss_clip": 1.0274514, "balance_loss_mlp": 1.0533154, "epoch": 0.11838268450323162, "flos": 22820405587200.0, "grad_norm": 1.929458036797937, "language_loss": 0.66930699, "learning_rate": 3.863503661802317e-06, "loss": 0.69166428, "num_input_tokens_seen": 42636285, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.15625, "step": 1969, "time_per_iteration": 2.6205880641937256 }, { "auxiliary_loss_clip": 0.01198393, "auxiliary_loss_mlp": 0.01053285, "balance_loss_clip": 1.03107619, "balance_loss_mlp": 1.05635452, "epoch": 0.11844280775589959, "flos": 33545844990720.0, "grad_norm": 2.1261150129928574, "language_loss": 0.80975926, "learning_rate": 3.863366442868906e-06, "loss": 0.83227611, "num_input_tokens_seen": 42658320, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.140625, "step": 1970, "time_per_iteration": 4.154564380645752 }, { "auxiliary_loss_clip": 0.01073689, "auxiliary_loss_mlp": 0.01011498, "balance_loss_clip": 1.00866055, "balance_loss_mlp": 1.02204204, "epoch": 0.11850293100856757, "flos": 66350998604160.0, "grad_norm": 0.8009146648842271, "language_loss": 0.66189075, "learning_rate": 3.863229157436741e-06, "loss": 0.6827426, "num_input_tokens_seen": 42721500, "router_z_loss_clip": 0.02832031, "router_z_loss_mlp": 0.42578125, "step": 1971, "time_per_iteration": 6.143375873565674 }, { "auxiliary_loss_clip": 0.01187514, "auxiliary_loss_mlp": 0.01052292, "balance_loss_clip": 1.0319314, "balance_loss_mlp": 1.05177879, "epoch": 0.11856305426123553, "flos": 24681045070080.0, "grad_norm": 2.134415232018262, "language_loss": 0.79354131, "learning_rate": 3.863091805510718e-06, "loss": 0.81593937, "num_input_tokens_seen": 42739825, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.171875, "step": 1972, "time_per_iteration": 2.6483559608459473 }, { "auxiliary_loss_clip": 0.0119818, "auxiliary_loss_mlp": 0.0129499, "balance_loss_clip": 1.0262922, "balance_loss_mlp": 1.05506921, "epoch": 0.1186231775139035, "flos": 24280102483200.0, "grad_norm": 2.6735353610728554, "language_loss": 0.71810073, "learning_rate": 3.862954387095743e-06, "loss": 0.74303246, "num_input_tokens_seen": 42758695, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.15625, "step": 1973, "time_per_iteration": 2.698181390762329 }, { "auxiliary_loss_clip": 0.01193635, "auxiliary_loss_mlp": 0.01045109, "balance_loss_clip": 1.02452195, "balance_loss_mlp": 1.0528717, "epoch": 0.11868330076657148, "flos": 21757413473280.0, "grad_norm": 1.701964526402787, "language_loss": 0.71436083, "learning_rate": 3.862816902196717e-06, "loss": 0.73674834, "num_input_tokens_seen": 42778510, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1328125, "step": 1974, "time_per_iteration": 4.612811803817749 }, { "auxiliary_loss_clip": 0.01195923, "auxiliary_loss_mlp": 0.01043495, "balance_loss_clip": 1.0232414, "balance_loss_mlp": 1.05340719, "epoch": 0.11874342401923944, "flos": 17193274894080.0, "grad_norm": 2.3192653686734657, "language_loss": 0.77342165, "learning_rate": 3.862679350818547e-06, "loss": 0.79581583, "num_input_tokens_seen": 42793995, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1484375, "step": 1975, "time_per_iteration": 2.6306347846984863 }, { "auxiliary_loss_clip": 0.01204184, "auxiliary_loss_mlp": 0.01049298, "balance_loss_clip": 1.02960455, "balance_loss_mlp": 1.05437839, "epoch": 0.11880354727190741, "flos": 15229572312960.0, "grad_norm": 2.786126818928191, "language_loss": 0.74977612, "learning_rate": 3.862541732966144e-06, "loss": 0.77231097, "num_input_tokens_seen": 42809000, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.140625, "step": 1976, "time_per_iteration": 2.6666343212127686 }, { "auxiliary_loss_clip": 0.01176078, "auxiliary_loss_mlp": 0.0105144, "balance_loss_clip": 1.03130579, "balance_loss_mlp": 1.05314088, "epoch": 0.11886367052457537, "flos": 27309706179840.0, "grad_norm": 1.5413288726627656, "language_loss": 0.75026077, "learning_rate": 3.862404048644416e-06, "loss": 0.77253592, "num_input_tokens_seen": 42831585, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.140625, "step": 1977, "time_per_iteration": 2.6934661865234375 }, { "auxiliary_loss_clip": 0.01169406, "auxiliary_loss_mlp": 0.01049263, "balance_loss_clip": 1.02930737, "balance_loss_mlp": 1.05280709, "epoch": 0.11892379377724335, "flos": 21798280172160.0, "grad_norm": 2.122398086219545, "language_loss": 0.74140084, "learning_rate": 3.862266297858279e-06, "loss": 0.76358747, "num_input_tokens_seen": 42848420, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1640625, "step": 1978, "time_per_iteration": 2.6091151237487793 }, { "auxiliary_loss_clip": 0.01194365, "auxiliary_loss_mlp": 0.01048419, "balance_loss_clip": 1.02863073, "balance_loss_mlp": 1.05379725, "epoch": 0.11898391702991132, "flos": 13991013498240.0, "grad_norm": 2.9051467929491155, "language_loss": 0.73133433, "learning_rate": 3.862128480612648e-06, "loss": 0.75376213, "num_input_tokens_seen": 42866645, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.1328125, "step": 1979, "time_per_iteration": 2.61020565032959 }, { "auxiliary_loss_clip": 0.01195996, "auxiliary_loss_mlp": 0.01047419, "balance_loss_clip": 1.02789271, "balance_loss_mlp": 1.0534184, "epoch": 0.11904404028257928, "flos": 32234567091840.0, "grad_norm": 1.778438979029847, "language_loss": 0.98458707, "learning_rate": 3.8619905969124415e-06, "loss": 1.00702119, "num_input_tokens_seen": 42888515, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.15625, "step": 1980, "time_per_iteration": 2.810685396194458 }, { "auxiliary_loss_clip": 0.0118991, "auxiliary_loss_mlp": 0.01046947, "balance_loss_clip": 1.02625227, "balance_loss_mlp": 1.05394125, "epoch": 0.11910416353524726, "flos": 23586272398080.0, "grad_norm": 1.8909891675705255, "language_loss": 0.86187911, "learning_rate": 3.86185264676258e-06, "loss": 0.88424766, "num_input_tokens_seen": 42909035, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1796875, "step": 1981, "time_per_iteration": 2.640270233154297 }, { "auxiliary_loss_clip": 0.0118982, "auxiliary_loss_mlp": 0.01058867, "balance_loss_clip": 1.03791058, "balance_loss_mlp": 1.05598712, "epoch": 0.11916428678791523, "flos": 25333038789120.0, "grad_norm": 1.9650431280796177, "language_loss": 0.85169339, "learning_rate": 3.861714630167987e-06, "loss": 0.87418026, "num_input_tokens_seen": 42927555, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.15625, "step": 1982, "time_per_iteration": 2.6678879261016846 }, { "auxiliary_loss_clip": 0.01185712, "auxiliary_loss_mlp": 0.01046156, "balance_loss_clip": 1.02633166, "balance_loss_mlp": 1.05338204, "epoch": 0.11922441004058319, "flos": 19788431592960.0, "grad_norm": 2.0169946261543736, "language_loss": 0.84806204, "learning_rate": 3.8615765471335874e-06, "loss": 0.8703807, "num_input_tokens_seen": 42945300, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.140625, "step": 1983, "time_per_iteration": 2.681727886199951 }, { "auxiliary_loss_clip": 0.01209185, "auxiliary_loss_mlp": 0.01054385, "balance_loss_clip": 1.03375018, "balance_loss_mlp": 1.05555499, "epoch": 0.11928453329325117, "flos": 21536347219200.0, "grad_norm": 2.774177111633684, "language_loss": 0.76035953, "learning_rate": 3.8614383976643096e-06, "loss": 0.78299522, "num_input_tokens_seen": 42961295, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.171875, "step": 1984, "time_per_iteration": 2.6463918685913086 }, { "auxiliary_loss_clip": 0.01176807, "auxiliary_loss_mlp": 0.01052568, "balance_loss_clip": 1.03325653, "balance_loss_mlp": 1.05351615, "epoch": 0.11934465654591914, "flos": 20815010294400.0, "grad_norm": 1.722607129907654, "language_loss": 0.83177781, "learning_rate": 3.861300181765084e-06, "loss": 0.8540715, "num_input_tokens_seen": 42980330, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.140625, "step": 1985, "time_per_iteration": 2.5672807693481445 }, { "auxiliary_loss_clip": 0.01181698, "auxiliary_loss_mlp": 0.01045366, "balance_loss_clip": 1.02538705, "balance_loss_mlp": 1.05200708, "epoch": 0.1194047797985871, "flos": 19060486565760.0, "grad_norm": 2.1551134207934175, "language_loss": 0.73803288, "learning_rate": 3.861161899440843e-06, "loss": 0.7603035, "num_input_tokens_seen": 42996125, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.109375, "step": 1986, "time_per_iteration": 2.5997774600982666 }, { "auxiliary_loss_clip": 0.01188882, "auxiliary_loss_mlp": 0.01053326, "balance_loss_clip": 1.03381133, "balance_loss_mlp": 1.05524445, "epoch": 0.11946490305125507, "flos": 27190805184000.0, "grad_norm": 2.0336546625580656, "language_loss": 0.71922278, "learning_rate": 3.86102355069652e-06, "loss": 0.74164486, "num_input_tokens_seen": 43014180, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.15625, "step": 1987, "time_per_iteration": 2.627166509628296 }, { "auxiliary_loss_clip": 0.01187303, "auxiliary_loss_mlp": 0.01050528, "balance_loss_clip": 1.02932072, "balance_loss_mlp": 1.05359328, "epoch": 0.11952502630392305, "flos": 21140791672320.0, "grad_norm": 2.674963978710432, "language_loss": 0.71790767, "learning_rate": 3.860885135537054e-06, "loss": 0.74028599, "num_input_tokens_seen": 43032120, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.15625, "step": 1988, "time_per_iteration": 2.6538288593292236 }, { "auxiliary_loss_clip": 0.01175005, "auxiliary_loss_mlp": 0.01064505, "balance_loss_clip": 1.04267824, "balance_loss_mlp": 1.05127072, "epoch": 0.11958514955659101, "flos": 22124241118080.0, "grad_norm": 2.236356633944204, "language_loss": 0.80964345, "learning_rate": 3.860746653967384e-06, "loss": 0.83203852, "num_input_tokens_seen": 43052215, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.1484375, "step": 1989, "time_per_iteration": 2.6243557929992676 }, { "auxiliary_loss_clip": 0.01198506, "auxiliary_loss_mlp": 0.01056438, "balance_loss_clip": 1.03576744, "balance_loss_mlp": 1.05531955, "epoch": 0.11964527280925898, "flos": 17421452040960.0, "grad_norm": 2.6751093548083262, "language_loss": 0.75655133, "learning_rate": 3.860608105992454e-06, "loss": 0.77910078, "num_input_tokens_seen": 43069720, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.15625, "step": 1990, "time_per_iteration": 2.643857479095459 }, { "auxiliary_loss_clip": 0.01085297, "auxiliary_loss_mlp": 0.01023683, "balance_loss_clip": 1.02103662, "balance_loss_mlp": 1.02429318, "epoch": 0.11970539606192696, "flos": 70679741402880.0, "grad_norm": 0.84755028540281, "language_loss": 0.55177671, "learning_rate": 3.860469491617206e-06, "loss": 0.5728665, "num_input_tokens_seen": 43123130, "router_z_loss_clip": 0.02648926, "router_z_loss_mlp": 0.42578125, "step": 1991, "time_per_iteration": 3.170715808868408 }, { "auxiliary_loss_clip": 0.011956, "auxiliary_loss_mlp": 0.01057359, "balance_loss_clip": 1.03730774, "balance_loss_mlp": 1.05558288, "epoch": 0.11976551931459492, "flos": 21215019127680.0, "grad_norm": 2.3642046960634584, "language_loss": 0.77672929, "learning_rate": 3.8603308108465864e-06, "loss": 0.79925889, "num_input_tokens_seen": 43140015, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.125, "step": 1992, "time_per_iteration": 2.601956605911255 }, { "auxiliary_loss_clip": 0.011699, "auxiliary_loss_mlp": 0.0104635, "balance_loss_clip": 1.02607214, "balance_loss_mlp": 1.05230534, "epoch": 0.11982564256726289, "flos": 25989306226560.0, "grad_norm": 2.528478524492239, "language_loss": 0.78942692, "learning_rate": 3.8601920636855466e-06, "loss": 0.81158948, "num_input_tokens_seen": 43160105, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.171875, "step": 1993, "time_per_iteration": 2.650804042816162 }, { "auxiliary_loss_clip": 0.01194194, "auxiliary_loss_mlp": 0.01053943, "balance_loss_clip": 1.03390443, "balance_loss_mlp": 1.05234826, "epoch": 0.11988576581993086, "flos": 21650866755840.0, "grad_norm": 1.7745300161763495, "language_loss": 0.82143891, "learning_rate": 3.860053250139036e-06, "loss": 0.84392023, "num_input_tokens_seen": 43179835, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.140625, "step": 1994, "time_per_iteration": 2.6705706119537354 }, { "auxiliary_loss_clip": 0.01185029, "auxiliary_loss_mlp": 0.01049588, "balance_loss_clip": 1.03040743, "balance_loss_mlp": 1.05404258, "epoch": 0.11994588907259883, "flos": 17857407409920.0, "grad_norm": 2.1505039713488823, "language_loss": 0.88699943, "learning_rate": 3.859914370212011e-06, "loss": 0.90934563, "num_input_tokens_seen": 43197210, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.125, "step": 1995, "time_per_iteration": 2.647404193878174 }, { "auxiliary_loss_clip": 0.01188684, "auxiliary_loss_mlp": 0.01057157, "balance_loss_clip": 1.036129, "balance_loss_mlp": 1.05480039, "epoch": 0.1200060123252668, "flos": 24462744163200.0, "grad_norm": 1.953402483448196, "language_loss": 0.74031925, "learning_rate": 3.859775423909426e-06, "loss": 0.76277769, "num_input_tokens_seen": 43215050, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.15625, "step": 1996, "time_per_iteration": 2.6362898349761963 }, { "auxiliary_loss_clip": 0.01196545, "auxiliary_loss_mlp": 0.01048791, "balance_loss_clip": 1.02763164, "balance_loss_mlp": 1.05443203, "epoch": 0.12006613557793476, "flos": 18732191235840.0, "grad_norm": 1.8934797301115553, "language_loss": 0.87756491, "learning_rate": 3.8596364112362395e-06, "loss": 0.90001822, "num_input_tokens_seen": 43233900, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1484375, "step": 1997, "time_per_iteration": 2.6617281436920166 }, { "auxiliary_loss_clip": 0.01177146, "auxiliary_loss_mlp": 0.0130477, "balance_loss_clip": 1.03489399, "balance_loss_mlp": 1.05327916, "epoch": 0.12012625883060274, "flos": 22267739952000.0, "grad_norm": 2.0749546238592047, "language_loss": 0.78574765, "learning_rate": 3.859497332197413e-06, "loss": 0.81056678, "num_input_tokens_seen": 43252105, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1484375, "step": 1998, "time_per_iteration": 2.6839137077331543 }, { "auxiliary_loss_clip": 0.01180044, "auxiliary_loss_mlp": 0.01299857, "balance_loss_clip": 1.03019714, "balance_loss_mlp": 1.0568912, "epoch": 0.1201863820832707, "flos": 21758885930880.0, "grad_norm": 1.6683373372852957, "language_loss": 0.73307598, "learning_rate": 3.8593581867979105e-06, "loss": 0.75787497, "num_input_tokens_seen": 43270315, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.140625, "step": 1999, "time_per_iteration": 2.6303324699401855 }, { "auxiliary_loss_clip": 0.01170513, "auxiliary_loss_mlp": 0.01059802, "balance_loss_clip": 1.03932142, "balance_loss_mlp": 1.0546515, "epoch": 0.12024650533593867, "flos": 21907987286400.0, "grad_norm": 2.5565493066960734, "language_loss": 0.74358606, "learning_rate": 3.8592189750426965e-06, "loss": 0.76588923, "num_input_tokens_seen": 43289935, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.15625, "step": 2000, "time_per_iteration": 2.6682515144348145 }, { "auxiliary_loss_clip": 0.01187169, "auxiliary_loss_mlp": 0.01049208, "balance_loss_clip": 1.02831078, "balance_loss_mlp": 1.05345583, "epoch": 0.12030662858860665, "flos": 21689219502720.0, "grad_norm": 1.82159817685283, "language_loss": 0.84916961, "learning_rate": 3.85907969693674e-06, "loss": 0.87153333, "num_input_tokens_seen": 43309325, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.15625, "step": 2001, "time_per_iteration": 2.709284543991089 }, { "auxiliary_loss_clip": 0.01186941, "auxiliary_loss_mlp": 0.01049647, "balance_loss_clip": 1.0297395, "balance_loss_mlp": 1.05429006, "epoch": 0.12036675184127461, "flos": 12933228856320.0, "grad_norm": 1.9608281648852173, "language_loss": 0.74208987, "learning_rate": 3.858940352485011e-06, "loss": 0.7644558, "num_input_tokens_seen": 43327010, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.140625, "step": 2002, "time_per_iteration": 2.6564066410064697 }, { "auxiliary_loss_clip": 0.01202694, "auxiliary_loss_mlp": 0.01050387, "balance_loss_clip": 1.02935815, "balance_loss_mlp": 1.0557853, "epoch": 0.12042687509394258, "flos": 20851028657280.0, "grad_norm": 2.679591244321034, "language_loss": 0.77914095, "learning_rate": 3.8588009416924835e-06, "loss": 0.80167174, "num_input_tokens_seen": 43345650, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1953125, "step": 2003, "time_per_iteration": 2.627460241317749 }, { "auxiliary_loss_clip": 0.01174532, "auxiliary_loss_mlp": 0.01049757, "balance_loss_clip": 1.029217, "balance_loss_mlp": 1.05240476, "epoch": 0.12048699834661056, "flos": 23878513451520.0, "grad_norm": 1.747952452597611, "language_loss": 0.71967649, "learning_rate": 3.858661464564131e-06, "loss": 0.7419194, "num_input_tokens_seen": 43365555, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1328125, "step": 2004, "time_per_iteration": 2.6537423133850098 }, { "auxiliary_loss_clip": 0.01192163, "auxiliary_loss_mlp": 0.01059147, "balance_loss_clip": 1.03766537, "balance_loss_mlp": 1.05551016, "epoch": 0.12054712159927852, "flos": 19756363726080.0, "grad_norm": 1.7444715209409307, "language_loss": 0.78165066, "learning_rate": 3.858521921104932e-06, "loss": 0.80416381, "num_input_tokens_seen": 43384990, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1875, "step": 2005, "time_per_iteration": 2.5572800636291504 }, { "auxiliary_loss_clip": 0.01067581, "auxiliary_loss_mlp": 0.01251954, "balance_loss_clip": 1.00093007, "balance_loss_mlp": 1.02460027, "epoch": 0.12060724485194649, "flos": 51672763123200.0, "grad_norm": 0.9232367638108646, "language_loss": 0.58055955, "learning_rate": 3.858382311319866e-06, "loss": 0.60375488, "num_input_tokens_seen": 43436335, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.4296875, "step": 2006, "time_per_iteration": 3.0393176078796387 }, { "auxiliary_loss_clip": 0.01183448, "auxiliary_loss_mlp": 0.01050798, "balance_loss_clip": 1.03074694, "balance_loss_mlp": 1.05409646, "epoch": 0.12066736810461445, "flos": 18990425088000.0, "grad_norm": 1.7319867204749138, "language_loss": 0.76625043, "learning_rate": 3.858242635213917e-06, "loss": 0.78859293, "num_input_tokens_seen": 43456495, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.109375, "step": 2007, "time_per_iteration": 2.610914707183838 }, { "auxiliary_loss_clip": 0.01197629, "auxiliary_loss_mlp": 0.01056396, "balance_loss_clip": 1.03548706, "balance_loss_mlp": 1.05478299, "epoch": 0.12072749135728243, "flos": 16471973882880.0, "grad_norm": 3.6006452307364993, "language_loss": 0.82402194, "learning_rate": 3.858102892792067e-06, "loss": 0.84656215, "num_input_tokens_seen": 43473085, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.15625, "step": 2008, "time_per_iteration": 2.626652956008911 }, { "auxiliary_loss_clip": 0.01167352, "auxiliary_loss_mlp": 0.01048384, "balance_loss_clip": 1.02900076, "balance_loss_mlp": 1.05317783, "epoch": 0.1207876146099504, "flos": 18077108947200.0, "grad_norm": 2.114840639287141, "language_loss": 0.83926076, "learning_rate": 3.857963084059304e-06, "loss": 0.86141813, "num_input_tokens_seen": 43491135, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.140625, "step": 2009, "time_per_iteration": 2.5651848316192627 }, { "auxiliary_loss_clip": 0.01170208, "auxiliary_loss_mlp": 0.0105775, "balance_loss_clip": 1.0360775, "balance_loss_mlp": 1.05593717, "epoch": 0.12084773786261836, "flos": 21871573873920.0, "grad_norm": 1.8788501849811505, "language_loss": 0.83618641, "learning_rate": 3.857823209020619e-06, "loss": 0.85846597, "num_input_tokens_seen": 43510440, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.140625, "step": 2010, "time_per_iteration": 2.6629250049591064 }, { "auxiliary_loss_clip": 0.01192095, "auxiliary_loss_mlp": 0.01304606, "balance_loss_clip": 1.03540826, "balance_loss_mlp": 1.05880082, "epoch": 0.12090786111528634, "flos": 18333044328960.0, "grad_norm": 1.6634634965117021, "language_loss": 0.83962286, "learning_rate": 3.857683267681002e-06, "loss": 0.86458981, "num_input_tokens_seen": 43530145, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.15625, "step": 2011, "time_per_iteration": 4.0260045528411865 }, { "auxiliary_loss_clip": 0.01199783, "auxiliary_loss_mlp": 0.01052565, "balance_loss_clip": 1.03265703, "balance_loss_mlp": 1.05726528, "epoch": 0.1209679843679543, "flos": 21105850717440.0, "grad_norm": 2.0503030547546617, "language_loss": 0.95899606, "learning_rate": 3.857543260045448e-06, "loss": 0.98151946, "num_input_tokens_seen": 43549315, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.15625, "step": 2012, "time_per_iteration": 2.669088840484619 }, { "auxiliary_loss_clip": 0.01182147, "auxiliary_loss_mlp": 0.01044641, "balance_loss_clip": 1.02422071, "balance_loss_mlp": 1.05326629, "epoch": 0.12102810762062227, "flos": 29241053585280.0, "grad_norm": 2.4502149759148852, "language_loss": 0.80358726, "learning_rate": 3.857403186118952e-06, "loss": 0.82585514, "num_input_tokens_seen": 43569240, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.109375, "step": 2013, "time_per_iteration": 5.6763763427734375 }, { "auxiliary_loss_clip": 0.01209501, "auxiliary_loss_mlp": 0.01050263, "balance_loss_clip": 1.02923405, "balance_loss_mlp": 1.05520225, "epoch": 0.12108823087329025, "flos": 17930701111680.0, "grad_norm": 2.4847134733075067, "language_loss": 0.76243401, "learning_rate": 3.857263045906516e-06, "loss": 0.78503162, "num_input_tokens_seen": 43587710, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.171875, "step": 2014, "time_per_iteration": 2.6538431644439697 }, { "auxiliary_loss_clip": 0.01179461, "auxiliary_loss_mlp": 0.01046224, "balance_loss_clip": 1.02579176, "balance_loss_mlp": 1.05719197, "epoch": 0.12114835412595822, "flos": 22091850028800.0, "grad_norm": 2.5708387735958436, "language_loss": 0.87043732, "learning_rate": 3.857122839413138e-06, "loss": 0.89269418, "num_input_tokens_seen": 43606000, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.125, "step": 2015, "time_per_iteration": 4.3061864376068115 }, { "auxiliary_loss_clip": 0.0116153, "auxiliary_loss_mlp": 0.01046368, "balance_loss_clip": 1.02725935, "balance_loss_mlp": 1.05036962, "epoch": 0.12120847737862618, "flos": 20412343854720.0, "grad_norm": 2.5233487058450477, "language_loss": 0.68820786, "learning_rate": 3.856982566643824e-06, "loss": 0.71028692, "num_input_tokens_seen": 43624815, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.109375, "step": 2016, "time_per_iteration": 2.6103529930114746 }, { "auxiliary_loss_clip": 0.01186114, "auxiliary_loss_mlp": 0.01047505, "balance_loss_clip": 1.02617836, "balance_loss_mlp": 1.05435681, "epoch": 0.12126860063129415, "flos": 22309037614080.0, "grad_norm": 2.6492833419179593, "language_loss": 0.79516828, "learning_rate": 3.856842227603578e-06, "loss": 0.81750441, "num_input_tokens_seen": 43643960, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.140625, "step": 2017, "time_per_iteration": 2.6099953651428223 }, { "auxiliary_loss_clip": 0.01177849, "auxiliary_loss_mlp": 0.01050706, "balance_loss_clip": 1.02977288, "balance_loss_mlp": 1.05465531, "epoch": 0.12132872388396213, "flos": 13699275235200.0, "grad_norm": 2.515375999047227, "language_loss": 0.6861347, "learning_rate": 3.856701822297409e-06, "loss": 0.70842028, "num_input_tokens_seen": 43662650, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.140625, "step": 2018, "time_per_iteration": 2.5520052909851074 }, { "auxiliary_loss_clip": 0.01195244, "auxiliary_loss_mlp": 0.01049726, "balance_loss_clip": 1.03102207, "balance_loss_mlp": 1.05607629, "epoch": 0.12138884713663009, "flos": 26466954307200.0, "grad_norm": 2.333079434910049, "language_loss": 0.72520661, "learning_rate": 3.856561350730329e-06, "loss": 0.74765629, "num_input_tokens_seen": 43684205, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.109375, "step": 2019, "time_per_iteration": 2.6793313026428223 }, { "auxiliary_loss_clip": 0.01212388, "auxiliary_loss_mlp": 0.01056486, "balance_loss_clip": 1.03580284, "balance_loss_mlp": 1.05114937, "epoch": 0.12144897038929806, "flos": 26141603892480.0, "grad_norm": 1.9173275533685943, "language_loss": 0.92003685, "learning_rate": 3.856420812907349e-06, "loss": 0.94272566, "num_input_tokens_seen": 43706320, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.15625, "step": 2020, "time_per_iteration": 2.6203465461730957 }, { "auxiliary_loss_clip": 0.01186715, "auxiliary_loss_mlp": 0.01050587, "balance_loss_clip": 1.03077388, "balance_loss_mlp": 1.05508447, "epoch": 0.12150909364196603, "flos": 24717530309760.0, "grad_norm": 2.0887665726815583, "language_loss": 0.77267027, "learning_rate": 3.856280208833486e-06, "loss": 0.79504329, "num_input_tokens_seen": 43724805, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.125, "step": 2021, "time_per_iteration": 2.651113986968994 }, { "auxiliary_loss_clip": 0.01166223, "auxiliary_loss_mlp": 0.01052074, "balance_loss_clip": 1.03257155, "balance_loss_mlp": 1.05418611, "epoch": 0.121569216894634, "flos": 25186990089600.0, "grad_norm": 6.527134508642709, "language_loss": 0.80817437, "learning_rate": 3.856139538513758e-06, "loss": 0.83035737, "num_input_tokens_seen": 43742320, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.125, "step": 2022, "time_per_iteration": 2.5389175415039062 }, { "auxiliary_loss_clip": 0.01192788, "auxiliary_loss_mlp": 0.01055561, "balance_loss_clip": 1.03486609, "balance_loss_mlp": 1.06049776, "epoch": 0.12162934014730196, "flos": 13444094039040.0, "grad_norm": 1.7058522155540297, "language_loss": 0.84868109, "learning_rate": 3.855998801953183e-06, "loss": 0.87116462, "num_input_tokens_seen": 43760665, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.140625, "step": 2023, "time_per_iteration": 2.5840604305267334 }, { "auxiliary_loss_clip": 0.01198137, "auxiliary_loss_mlp": 0.01053445, "balance_loss_clip": 1.03313172, "balance_loss_mlp": 1.0558641, "epoch": 0.12168946339996994, "flos": 16946138344320.0, "grad_norm": 2.1728629215010162, "language_loss": 0.84916878, "learning_rate": 3.855857999156786e-06, "loss": 0.87168467, "num_input_tokens_seen": 43779020, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.15625, "step": 2024, "time_per_iteration": 2.6018431186676025 }, { "auxiliary_loss_clip": 0.01165454, "auxiliary_loss_mlp": 0.01053716, "balance_loss_clip": 1.03392732, "balance_loss_mlp": 1.0509131, "epoch": 0.12174958665263791, "flos": 29821585196160.0, "grad_norm": 2.0344631869115894, "language_loss": 0.7189225, "learning_rate": 3.85571713012959e-06, "loss": 0.74111414, "num_input_tokens_seen": 43798850, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.1484375, "step": 2025, "time_per_iteration": 2.624648332595825 }, { "auxiliary_loss_clip": 0.01186395, "auxiliary_loss_mlp": 0.01049937, "balance_loss_clip": 1.030339, "balance_loss_mlp": 1.0536654, "epoch": 0.12180970990530587, "flos": 24641902224000.0, "grad_norm": 2.1765616115421005, "language_loss": 0.75945485, "learning_rate": 3.855576194876624e-06, "loss": 0.78181815, "num_input_tokens_seen": 43820130, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.140625, "step": 2026, "time_per_iteration": 2.709338665008545 }, { "auxiliary_loss_clip": 0.01187908, "auxiliary_loss_mlp": 0.01049516, "balance_loss_clip": 1.02959609, "balance_loss_mlp": 1.05461156, "epoch": 0.12186983315797385, "flos": 20521691832960.0, "grad_norm": 3.5142075799100025, "language_loss": 0.88400114, "learning_rate": 3.855435193402916e-06, "loss": 0.90637541, "num_input_tokens_seen": 43838485, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1484375, "step": 2027, "time_per_iteration": 2.7924485206604004 }, { "auxiliary_loss_clip": 0.01184059, "auxiliary_loss_mlp": 0.01057616, "balance_loss_clip": 1.03954399, "balance_loss_mlp": 1.05427969, "epoch": 0.12192995641064182, "flos": 27818344719360.0, "grad_norm": 1.835025904751871, "language_loss": 0.75184178, "learning_rate": 3.8552941257135e-06, "loss": 0.77425849, "num_input_tokens_seen": 43859080, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.1171875, "step": 2028, "time_per_iteration": 2.7060399055480957 }, { "auxiliary_loss_clip": 0.01204451, "auxiliary_loss_mlp": 0.01060339, "balance_loss_clip": 1.04052651, "balance_loss_mlp": 1.05401182, "epoch": 0.12199007966330978, "flos": 22017119783040.0, "grad_norm": 2.074452306035403, "language_loss": 0.76576817, "learning_rate": 3.855152991813408e-06, "loss": 0.78841603, "num_input_tokens_seen": 43879030, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.140625, "step": 2029, "time_per_iteration": 2.6330955028533936 }, { "auxiliary_loss_clip": 0.01176113, "auxiliary_loss_mlp": 0.01053094, "balance_loss_clip": 1.03324604, "balance_loss_mlp": 1.05259085, "epoch": 0.12205020291597775, "flos": 23295216493440.0, "grad_norm": 1.8651636102238505, "language_loss": 0.78964394, "learning_rate": 3.855011791707678e-06, "loss": 0.81193602, "num_input_tokens_seen": 43898505, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.140625, "step": 2030, "time_per_iteration": 2.758692741394043 }, { "auxiliary_loss_clip": 0.011674, "auxiliary_loss_mlp": 0.01051332, "balance_loss_clip": 1.0303508, "balance_loss_mlp": 1.05524445, "epoch": 0.12211032616864573, "flos": 26031609469440.0, "grad_norm": 5.177581946673376, "language_loss": 0.73818052, "learning_rate": 3.854870525401349e-06, "loss": 0.76036787, "num_input_tokens_seen": 43917945, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.125, "step": 2031, "time_per_iteration": 2.588315486907959 }, { "auxiliary_loss_clip": 0.01169125, "auxiliary_loss_mlp": 0.01045439, "balance_loss_clip": 1.02714038, "balance_loss_mlp": 1.05670261, "epoch": 0.12217044942131369, "flos": 20410943224320.0, "grad_norm": 1.7252852165239208, "language_loss": 0.74913514, "learning_rate": 3.8547291928994615e-06, "loss": 0.77128077, "num_input_tokens_seen": 43937385, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.125, "step": 2032, "time_per_iteration": 2.614070415496826 }, { "auxiliary_loss_clip": 0.0117084, "auxiliary_loss_mlp": 0.0104441, "balance_loss_clip": 1.02558684, "balance_loss_mlp": 1.05216897, "epoch": 0.12223057267398166, "flos": 22857142222080.0, "grad_norm": 1.7124174250230528, "language_loss": 0.8889553, "learning_rate": 3.8545877942070605e-06, "loss": 0.91110778, "num_input_tokens_seen": 43958130, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.1015625, "step": 2033, "time_per_iteration": 2.5504770278930664 }, { "auxiliary_loss_clip": 0.01193726, "auxiliary_loss_mlp": 0.01049883, "balance_loss_clip": 1.03035617, "balance_loss_mlp": 1.06011999, "epoch": 0.12229069592664964, "flos": 20047563285120.0, "grad_norm": 1.9783497506681886, "language_loss": 0.64849997, "learning_rate": 3.8544463293291914e-06, "loss": 0.67093599, "num_input_tokens_seen": 43976800, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.15625, "step": 2034, "time_per_iteration": 2.6179726123809814 }, { "auxiliary_loss_clip": 0.01189266, "auxiliary_loss_mlp": 0.01056039, "balance_loss_clip": 1.03558278, "balance_loss_mlp": 1.05814612, "epoch": 0.1223508191793176, "flos": 22274240313600.0, "grad_norm": 1.5390218414327983, "language_loss": 0.76559472, "learning_rate": 3.8543047982709035e-06, "loss": 0.78804779, "num_input_tokens_seen": 43996620, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1328125, "step": 2035, "time_per_iteration": 2.5618889331817627 }, { "auxiliary_loss_clip": 0.01173378, "auxiliary_loss_mlp": 0.01053465, "balance_loss_clip": 1.0330205, "balance_loss_mlp": 1.05729508, "epoch": 0.12241094243198557, "flos": 21285978445440.0, "grad_norm": 3.789505503006845, "language_loss": 0.71271515, "learning_rate": 3.854163201037247e-06, "loss": 0.73498356, "num_input_tokens_seen": 44016175, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1640625, "step": 2036, "time_per_iteration": 2.618513584136963 }, { "auxiliary_loss_clip": 0.01188553, "auxiliary_loss_mlp": 0.01056149, "balance_loss_clip": 1.03533483, "balance_loss_mlp": 1.05860007, "epoch": 0.12247106568465355, "flos": 17382381022080.0, "grad_norm": 4.393520306338102, "language_loss": 0.82891798, "learning_rate": 3.854021537633275e-06, "loss": 0.85136497, "num_input_tokens_seen": 44035060, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.1171875, "step": 2037, "time_per_iteration": 2.568434238433838 }, { "auxiliary_loss_clip": 0.01192598, "auxiliary_loss_mlp": 0.01055421, "balance_loss_clip": 1.03385627, "balance_loss_mlp": 1.05862045, "epoch": 0.12253118893732151, "flos": 27045438842880.0, "grad_norm": 3.938582127201731, "language_loss": 0.79548407, "learning_rate": 3.853879808064044e-06, "loss": 0.81796432, "num_input_tokens_seen": 44053330, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.15625, "step": 2038, "time_per_iteration": 2.702587842941284 }, { "auxiliary_loss_clip": 0.0107509, "auxiliary_loss_mlp": 0.01256212, "balance_loss_clip": 1.00538599, "balance_loss_mlp": 1.02575564, "epoch": 0.12259131218998948, "flos": 53861518368000.0, "grad_norm": 0.8182633484877561, "language_loss": 0.58655918, "learning_rate": 3.8537380123346105e-06, "loss": 0.6098721, "num_input_tokens_seen": 44107575, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.40234375, "step": 2039, "time_per_iteration": 3.0767199993133545 }, { "auxiliary_loss_clip": 0.01203069, "auxiliary_loss_mlp": 0.01050857, "balance_loss_clip": 1.02900624, "balance_loss_mlp": 1.05955875, "epoch": 0.12265143544265744, "flos": 17891917401600.0, "grad_norm": 2.194124813493525, "language_loss": 0.79987013, "learning_rate": 3.853596150450037e-06, "loss": 0.82240933, "num_input_tokens_seen": 44126075, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.1640625, "step": 2040, "time_per_iteration": 2.633741855621338 }, { "auxiliary_loss_clip": 0.01185138, "auxiliary_loss_mlp": 0.01049255, "balance_loss_clip": 1.02890611, "balance_loss_mlp": 1.05539763, "epoch": 0.12271155869532542, "flos": 21799932197760.0, "grad_norm": 1.8476504848112494, "language_loss": 0.82291114, "learning_rate": 3.853454222415384e-06, "loss": 0.84525514, "num_input_tokens_seen": 44145605, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1171875, "step": 2041, "time_per_iteration": 2.643054246902466 }, { "auxiliary_loss_clip": 0.01191469, "auxiliary_loss_mlp": 0.01052122, "balance_loss_clip": 1.03108168, "balance_loss_mlp": 1.05699205, "epoch": 0.12277168194799339, "flos": 19828759587840.0, "grad_norm": 1.5697210279821145, "language_loss": 0.66870749, "learning_rate": 3.853312228235717e-06, "loss": 0.69114339, "num_input_tokens_seen": 44164770, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1640625, "step": 2042, "time_per_iteration": 2.6947057247161865 }, { "auxiliary_loss_clip": 0.01198237, "auxiliary_loss_mlp": 0.01055777, "balance_loss_clip": 1.03514171, "balance_loss_mlp": 1.05754483, "epoch": 0.12283180520066135, "flos": 23221024951680.0, "grad_norm": 1.806129911317149, "language_loss": 0.81809694, "learning_rate": 3.853170167916106e-06, "loss": 0.84063709, "num_input_tokens_seen": 44184025, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.140625, "step": 2043, "time_per_iteration": 2.7145211696624756 }, { "auxiliary_loss_clip": 0.0118648, "auxiliary_loss_mlp": 0.01047664, "balance_loss_clip": 1.02690983, "balance_loss_mlp": 1.05412054, "epoch": 0.12289192845332933, "flos": 18588476920320.0, "grad_norm": 2.04964741113747, "language_loss": 0.79867697, "learning_rate": 3.853028041461617e-06, "loss": 0.82101846, "num_input_tokens_seen": 44202950, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.140625, "step": 2044, "time_per_iteration": 2.635061025619507 }, { "auxiliary_loss_clip": 0.0120344, "auxiliary_loss_mlp": 0.01049127, "balance_loss_clip": 1.02918291, "balance_loss_mlp": 1.05716264, "epoch": 0.1229520517059973, "flos": 25769532862080.0, "grad_norm": 1.7590386676664935, "language_loss": 0.78112018, "learning_rate": 3.852885848877323e-06, "loss": 0.80364579, "num_input_tokens_seen": 44221115, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1015625, "step": 2045, "time_per_iteration": 2.732816457748413 }, { "auxiliary_loss_clip": 0.0118401, "auxiliary_loss_mlp": 0.0106131, "balance_loss_clip": 1.04010296, "balance_loss_mlp": 1.05731595, "epoch": 0.12301217495866526, "flos": 20887154760960.0, "grad_norm": 2.1818984607972842, "language_loss": 0.67260569, "learning_rate": 3.852743590168301e-06, "loss": 0.69505894, "num_input_tokens_seen": 44240575, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.171875, "step": 2046, "time_per_iteration": 2.5521419048309326 }, { "auxiliary_loss_clip": 0.01168626, "auxiliary_loss_mlp": 0.01048209, "balance_loss_clip": 1.02760911, "balance_loss_mlp": 1.05699074, "epoch": 0.12307229821133324, "flos": 22378811783040.0, "grad_norm": 2.2486846137930248, "language_loss": 0.72043949, "learning_rate": 3.852601265339625e-06, "loss": 0.74260783, "num_input_tokens_seen": 44257145, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.1171875, "step": 2047, "time_per_iteration": 2.6270899772644043 }, { "auxiliary_loss_clip": 0.01177531, "auxiliary_loss_mlp": 0.01058354, "balance_loss_clip": 1.03649139, "balance_loss_mlp": 1.0571698, "epoch": 0.1231324214640012, "flos": 23367396873600.0, "grad_norm": 1.7922179171075336, "language_loss": 0.7609632, "learning_rate": 3.8524588743963755e-06, "loss": 0.7833221, "num_input_tokens_seen": 44278035, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.109375, "step": 2048, "time_per_iteration": 2.608642101287842 }, { "auxiliary_loss_clip": 0.0118471, "auxiliary_loss_mlp": 0.01048317, "balance_loss_clip": 1.02972078, "balance_loss_mlp": 1.05625594, "epoch": 0.12319254471666917, "flos": 23767154311680.0, "grad_norm": 2.405332362610022, "language_loss": 0.84782684, "learning_rate": 3.852316417343634e-06, "loss": 0.87015712, "num_input_tokens_seen": 44296980, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.1015625, "step": 2049, "time_per_iteration": 2.665112018585205 }, { "auxiliary_loss_clip": 0.01206668, "auxiliary_loss_mlp": 0.01049594, "balance_loss_clip": 1.02919698, "balance_loss_mlp": 1.05677688, "epoch": 0.12325266796933713, "flos": 23550146294400.0, "grad_norm": 1.7221310756924533, "language_loss": 0.75245512, "learning_rate": 3.852173894186484e-06, "loss": 0.77501774, "num_input_tokens_seen": 44318005, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1328125, "step": 2050, "time_per_iteration": 2.7131528854370117 }, { "auxiliary_loss_clip": 0.01196877, "auxiliary_loss_mlp": 0.01058064, "balance_loss_clip": 1.03729784, "balance_loss_mlp": 1.05606043, "epoch": 0.12331279122200511, "flos": 24423996366720.0, "grad_norm": 2.0121627127115125, "language_loss": 0.80652702, "learning_rate": 3.852031304930012e-06, "loss": 0.82907641, "num_input_tokens_seen": 44335260, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.1328125, "step": 2051, "time_per_iteration": 2.6583762168884277 }, { "auxiliary_loss_clip": 0.01168471, "auxiliary_loss_mlp": 0.01298507, "balance_loss_clip": 1.03004611, "balance_loss_mlp": 1.05746627, "epoch": 0.12337291447467308, "flos": 25484294960640.0, "grad_norm": 2.042898840456879, "language_loss": 0.80153239, "learning_rate": 3.851888649579307e-06, "loss": 0.82620215, "num_input_tokens_seen": 44355315, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.109375, "step": 2052, "time_per_iteration": 2.635561466217041 }, { "auxiliary_loss_clip": 0.0118112, "auxiliary_loss_mlp": 0.010563, "balance_loss_clip": 1.03525949, "balance_loss_mlp": 1.05697036, "epoch": 0.12343303772734104, "flos": 23550002640000.0, "grad_norm": 2.323267044723356, "language_loss": 0.73392469, "learning_rate": 3.85174592813946e-06, "loss": 0.7562989, "num_input_tokens_seen": 44373020, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.1484375, "step": 2053, "time_per_iteration": 4.034372329711914 }, { "auxiliary_loss_clip": 0.01165142, "auxiliary_loss_mlp": 0.01297302, "balance_loss_clip": 1.02828264, "balance_loss_mlp": 1.05103207, "epoch": 0.12349316098000902, "flos": 47557074867840.0, "grad_norm": 1.691299180963268, "language_loss": 0.74110091, "learning_rate": 3.851603140615564e-06, "loss": 0.76572537, "num_input_tokens_seen": 44397525, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.140625, "step": 2054, "time_per_iteration": 5.749230623245239 }, { "auxiliary_loss_clip": 0.01183012, "auxiliary_loss_mlp": 0.01040021, "balance_loss_clip": 1.02163875, "balance_loss_mlp": 1.05420613, "epoch": 0.12355328423267699, "flos": 25045969294080.0, "grad_norm": 2.112613120709098, "language_loss": 0.85003531, "learning_rate": 3.851460287012714e-06, "loss": 0.8722657, "num_input_tokens_seen": 44415890, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.109375, "step": 2055, "time_per_iteration": 2.644984006881714 }, { "auxiliary_loss_clip": 0.01205198, "auxiliary_loss_mlp": 0.01048, "balance_loss_clip": 1.02875996, "balance_loss_mlp": 1.05550551, "epoch": 0.12361340748534495, "flos": 27709140395520.0, "grad_norm": 1.9489899963679247, "language_loss": 0.77587855, "learning_rate": 3.85131736733601e-06, "loss": 0.79841053, "num_input_tokens_seen": 44436625, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.1328125, "step": 2056, "time_per_iteration": 2.6252269744873047 }, { "auxiliary_loss_clip": 0.01176516, "auxiliary_loss_mlp": 0.01054009, "balance_loss_clip": 1.03402948, "balance_loss_mlp": 1.05499578, "epoch": 0.12367353073801293, "flos": 26140598311680.0, "grad_norm": 2.0154945756244227, "language_loss": 0.83260912, "learning_rate": 3.851174381590551e-06, "loss": 0.85491437, "num_input_tokens_seen": 44455265, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.125, "step": 2057, "time_per_iteration": 4.09697699546814 }, { "auxiliary_loss_clip": 0.01187761, "auxiliary_loss_mlp": 0.01055543, "balance_loss_clip": 1.03478861, "balance_loss_mlp": 1.05626571, "epoch": 0.1237336539906809, "flos": 25156035544320.0, "grad_norm": 1.7332407731402124, "language_loss": 0.78489184, "learning_rate": 3.85103132978144e-06, "loss": 0.80732483, "num_input_tokens_seen": 44475815, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1328125, "step": 2058, "time_per_iteration": 2.6976165771484375 }, { "auxiliary_loss_clip": 0.01189762, "auxiliary_loss_mlp": 0.01051187, "balance_loss_clip": 1.02983689, "balance_loss_mlp": 1.05661428, "epoch": 0.12379377724334886, "flos": 15304589867520.0, "grad_norm": 2.2568822647245925, "language_loss": 0.82559633, "learning_rate": 3.850888211913782e-06, "loss": 0.84800589, "num_input_tokens_seen": 44494045, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1484375, "step": 2059, "time_per_iteration": 2.5774643421173096 }, { "auxiliary_loss_clip": 0.01197275, "auxiliary_loss_mlp": 0.01057681, "balance_loss_clip": 1.0356636, "balance_loss_mlp": 1.05519295, "epoch": 0.12385390049601683, "flos": 21316717509120.0, "grad_norm": 2.9989353078353465, "language_loss": 0.81398773, "learning_rate": 3.8507450279926856e-06, "loss": 0.83653724, "num_input_tokens_seen": 44509120, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.1484375, "step": 2060, "time_per_iteration": 2.6508500576019287 }, { "auxiliary_loss_clip": 0.01186395, "auxiliary_loss_mlp": 0.01054332, "balance_loss_clip": 1.03271925, "balance_loss_mlp": 1.05421209, "epoch": 0.1239140237486848, "flos": 15116309752320.0, "grad_norm": 2.446288938066283, "language_loss": 0.86224204, "learning_rate": 3.850601778023259e-06, "loss": 0.88464928, "num_input_tokens_seen": 44525780, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.140625, "step": 2061, "time_per_iteration": 2.5523629188537598 }, { "auxiliary_loss_clip": 0.01174254, "auxiliary_loss_mlp": 0.01045119, "balance_loss_clip": 1.02573609, "balance_loss_mlp": 1.05623925, "epoch": 0.12397414700135277, "flos": 21976791788160.0, "grad_norm": 1.8029410258826752, "language_loss": 0.84724355, "learning_rate": 3.850458462010615e-06, "loss": 0.86943734, "num_input_tokens_seen": 44543125, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.09375, "step": 2062, "time_per_iteration": 2.606696844100952 }, { "auxiliary_loss_clip": 0.01196691, "auxiliary_loss_mlp": 0.01056285, "balance_loss_clip": 1.03542304, "balance_loss_mlp": 1.05836368, "epoch": 0.12403427025402074, "flos": 13400892956160.0, "grad_norm": 2.617580647071147, "language_loss": 0.77932346, "learning_rate": 3.850315079959869e-06, "loss": 0.80185318, "num_input_tokens_seen": 44560275, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.109375, "step": 2063, "time_per_iteration": 2.6311702728271484 }, { "auxiliary_loss_clip": 0.01205281, "auxiliary_loss_mlp": 0.01056362, "balance_loss_clip": 1.03447521, "balance_loss_mlp": 1.05640876, "epoch": 0.12409439350668872, "flos": 15304374385920.0, "grad_norm": 2.6333047998881036, "language_loss": 0.7947253, "learning_rate": 3.850171631876137e-06, "loss": 0.81734174, "num_input_tokens_seen": 44577640, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.125, "step": 2064, "time_per_iteration": 2.7204556465148926 }, { "auxiliary_loss_clip": 0.01185583, "auxiliary_loss_mlp": 0.01055846, "balance_loss_clip": 1.03518677, "balance_loss_mlp": 1.05564618, "epoch": 0.12415451675935668, "flos": 25009376313600.0, "grad_norm": 2.324573183841102, "language_loss": 0.92957222, "learning_rate": 3.850028117764539e-06, "loss": 0.95198649, "num_input_tokens_seen": 44594860, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1171875, "step": 2065, "time_per_iteration": 2.628087282180786 }, { "auxiliary_loss_clip": 0.01187828, "auxiliary_loss_mlp": 0.01052461, "balance_loss_clip": 1.03066933, "balance_loss_mlp": 1.05485165, "epoch": 0.12421464001202465, "flos": 23659673840640.0, "grad_norm": 1.7893570010667057, "language_loss": 0.80635524, "learning_rate": 3.849884537630196e-06, "loss": 0.82875812, "num_input_tokens_seen": 44614780, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.1484375, "step": 2066, "time_per_iteration": 2.6887693405151367 }, { "auxiliary_loss_clip": 0.01104964, "auxiliary_loss_mlp": 0.01009914, "balance_loss_clip": 1.00751841, "balance_loss_mlp": 1.02900124, "epoch": 0.12427476326469263, "flos": 65732904345600.0, "grad_norm": 0.8619688163331176, "language_loss": 0.63318944, "learning_rate": 3.849740891478233e-06, "loss": 0.65433824, "num_input_tokens_seen": 44671240, "router_z_loss_clip": 0.02392578, "router_z_loss_mlp": 0.40234375, "step": 2067, "time_per_iteration": 3.2382309436798096 }, { "auxiliary_loss_clip": 0.0120416, "auxiliary_loss_mlp": 0.01053793, "balance_loss_clip": 1.03328919, "balance_loss_mlp": 1.0547508, "epoch": 0.12433488651736059, "flos": 24535427333760.0, "grad_norm": 1.9443835436126804, "language_loss": 0.92913944, "learning_rate": 3.849597179313775e-06, "loss": 0.95171905, "num_input_tokens_seen": 44691050, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1328125, "step": 2068, "time_per_iteration": 2.697105884552002 }, { "auxiliary_loss_clip": 0.01180293, "auxiliary_loss_mlp": 0.01050219, "balance_loss_clip": 1.03132463, "balance_loss_mlp": 1.05876017, "epoch": 0.12439500977002856, "flos": 21031659175680.0, "grad_norm": 1.7547587548310772, "language_loss": 0.80925304, "learning_rate": 3.849453401141952e-06, "loss": 0.83155823, "num_input_tokens_seen": 44709850, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.125, "step": 2069, "time_per_iteration": 2.701845169067383 }, { "auxiliary_loss_clip": 0.01207145, "auxiliary_loss_mlp": 0.01047756, "balance_loss_clip": 1.02757418, "balance_loss_mlp": 1.05734515, "epoch": 0.12445513302269653, "flos": 26830621555200.0, "grad_norm": 1.6481560150648795, "language_loss": 0.77064127, "learning_rate": 3.8493095569678945e-06, "loss": 0.79319024, "num_input_tokens_seen": 44731475, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.125, "step": 2070, "time_per_iteration": 2.6774914264678955 }, { "auxiliary_loss_clip": 0.01183739, "auxiliary_loss_mlp": 0.01051441, "balance_loss_clip": 1.03090096, "balance_loss_mlp": 1.0605576, "epoch": 0.1245152562753645, "flos": 18368919037440.0, "grad_norm": 3.249851161442964, "language_loss": 0.81057084, "learning_rate": 3.849165646796735e-06, "loss": 0.8329227, "num_input_tokens_seen": 44749685, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.140625, "step": 2071, "time_per_iteration": 2.7374422550201416 }, { "auxiliary_loss_clip": 0.01178239, "auxiliary_loss_mlp": 0.01057539, "balance_loss_clip": 1.03537786, "balance_loss_mlp": 1.05878019, "epoch": 0.12457537952803246, "flos": 33107986200960.0, "grad_norm": 1.7377320688259943, "language_loss": 0.782619, "learning_rate": 3.849021670633611e-06, "loss": 0.80497682, "num_input_tokens_seen": 44772165, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.1015625, "step": 2072, "time_per_iteration": 2.7407121658325195 }, { "auxiliary_loss_clip": 0.01188739, "auxiliary_loss_mlp": 0.01054803, "balance_loss_clip": 1.03451347, "balance_loss_mlp": 1.05875826, "epoch": 0.12463550278070043, "flos": 22270217990400.0, "grad_norm": 1.7649025141680543, "language_loss": 0.74707681, "learning_rate": 3.8488776284836595e-06, "loss": 0.76951218, "num_input_tokens_seen": 44790580, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1171875, "step": 2073, "time_per_iteration": 2.7010812759399414 }, { "auxiliary_loss_clip": 0.012133, "auxiliary_loss_mlp": 0.01051702, "balance_loss_clip": 1.03086436, "balance_loss_mlp": 1.05536819, "epoch": 0.12469562603336841, "flos": 14679025580160.0, "grad_norm": 2.289576361905999, "language_loss": 0.90238285, "learning_rate": 3.8487335203520215e-06, "loss": 0.92503285, "num_input_tokens_seen": 44806730, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.125, "step": 2074, "time_per_iteration": 2.590376138687134 }, { "auxiliary_loss_clip": 0.01218557, "auxiliary_loss_mlp": 0.01062212, "balance_loss_clip": 1.03909755, "balance_loss_mlp": 1.05705929, "epoch": 0.12475574928603637, "flos": 24644775312000.0, "grad_norm": 2.8395290935610102, "language_loss": 0.83868301, "learning_rate": 3.84858934624384e-06, "loss": 0.86149067, "num_input_tokens_seen": 44825550, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.15625, "step": 2075, "time_per_iteration": 2.7434914112091064 }, { "auxiliary_loss_clip": 0.01206159, "auxiliary_loss_mlp": 0.01052997, "balance_loss_clip": 1.03114557, "balance_loss_mlp": 1.05634212, "epoch": 0.12481587253870434, "flos": 21762980081280.0, "grad_norm": 3.0614370188575317, "language_loss": 0.73412865, "learning_rate": 3.8484451061642585e-06, "loss": 0.75672024, "num_input_tokens_seen": 44844155, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.1328125, "step": 2076, "time_per_iteration": 2.6418843269348145 }, { "auxiliary_loss_clip": 0.01201989, "auxiliary_loss_mlp": 0.0104765, "balance_loss_clip": 1.02800429, "balance_loss_mlp": 1.05673158, "epoch": 0.12487599579137232, "flos": 21432529935360.0, "grad_norm": 1.8899224316767853, "language_loss": 0.7558049, "learning_rate": 3.8483008001184275e-06, "loss": 0.77830124, "num_input_tokens_seen": 44863780, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.09375, "step": 2077, "time_per_iteration": 2.712678909301758 }, { "auxiliary_loss_clip": 0.01176891, "auxiliary_loss_mlp": 0.01058903, "balance_loss_clip": 1.03676581, "balance_loss_mlp": 1.0550499, "epoch": 0.12493611904404028, "flos": 16107624276480.0, "grad_norm": 2.1655828252682277, "language_loss": 0.82048142, "learning_rate": 3.848156428111495e-06, "loss": 0.84283942, "num_input_tokens_seen": 44881480, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.1328125, "step": 2078, "time_per_iteration": 2.694997549057007 }, { "auxiliary_loss_clip": 0.01189196, "auxiliary_loss_mlp": 0.01049592, "balance_loss_clip": 1.02851629, "balance_loss_mlp": 1.05748963, "epoch": 0.12499624229670825, "flos": 21580266574080.0, "grad_norm": 1.5886732800795078, "language_loss": 0.74832857, "learning_rate": 3.8480119901486135e-06, "loss": 0.77071649, "num_input_tokens_seen": 44900390, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1328125, "step": 2079, "time_per_iteration": 2.6072325706481934 }, { "auxiliary_loss_clip": 0.01184941, "auxiliary_loss_mlp": 0.01055069, "balance_loss_clip": 1.03389776, "balance_loss_mlp": 1.06038177, "epoch": 0.1250563655493762, "flos": 25699040421120.0, "grad_norm": 2.087840732001098, "language_loss": 0.83075273, "learning_rate": 3.847867486234937e-06, "loss": 0.85315275, "num_input_tokens_seen": 44920375, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.15625, "step": 2080, "time_per_iteration": 2.6906309127807617 }, { "auxiliary_loss_clip": 0.01176865, "auxiliary_loss_mlp": 0.0105742, "balance_loss_clip": 1.03654683, "balance_loss_mlp": 1.05558825, "epoch": 0.12511648880204418, "flos": 16909509450240.0, "grad_norm": 1.9511404853868783, "language_loss": 0.84402966, "learning_rate": 3.847722916375624e-06, "loss": 0.86637247, "num_input_tokens_seen": 44938415, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1171875, "step": 2081, "time_per_iteration": 2.5676376819610596 }, { "auxiliary_loss_clip": 0.01176342, "auxiliary_loss_mlp": 0.01047114, "balance_loss_clip": 1.0273844, "balance_loss_mlp": 1.05584455, "epoch": 0.12517661205471217, "flos": 17567500740480.0, "grad_norm": 1.6143893781947947, "language_loss": 0.76566434, "learning_rate": 3.847578280575832e-06, "loss": 0.7878989, "num_input_tokens_seen": 44957135, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.109375, "step": 2082, "time_per_iteration": 2.6714141368865967 }, { "auxiliary_loss_clip": 0.01202334, "auxiliary_loss_mlp": 0.01049908, "balance_loss_clip": 1.02825975, "balance_loss_mlp": 1.06242228, "epoch": 0.12523673530738014, "flos": 16033791870720.0, "grad_norm": 2.2789104944505243, "language_loss": 0.78500557, "learning_rate": 3.847433578840725e-06, "loss": 0.80752802, "num_input_tokens_seen": 44974480, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.2109375, "step": 2083, "time_per_iteration": 2.5934360027313232 }, { "auxiliary_loss_clip": 0.01190154, "auxiliary_loss_mlp": 0.01051903, "balance_loss_clip": 1.02946806, "balance_loss_mlp": 1.05624115, "epoch": 0.1252968585600481, "flos": 18807747494400.0, "grad_norm": 3.48165813637252, "language_loss": 0.90616381, "learning_rate": 3.847288811175465e-06, "loss": 0.9285844, "num_input_tokens_seen": 44990310, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.15625, "step": 2084, "time_per_iteration": 2.6669204235076904 }, { "auxiliary_loss_clip": 0.01197843, "auxiliary_loss_mlp": 0.01051201, "balance_loss_clip": 1.03018475, "balance_loss_mlp": 1.05766344, "epoch": 0.12535698181271607, "flos": 27271568914560.0, "grad_norm": 2.241093111953161, "language_loss": 0.79535592, "learning_rate": 3.84714397758522e-06, "loss": 0.8178463, "num_input_tokens_seen": 45010720, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.125, "step": 2085, "time_per_iteration": 2.61818790435791 }, { "auxiliary_loss_clip": 0.01167984, "auxiliary_loss_mlp": 0.0105316, "balance_loss_clip": 1.03282285, "balance_loss_mlp": 1.0562489, "epoch": 0.12541710506538403, "flos": 22054107813120.0, "grad_norm": 2.0007705922880903, "language_loss": 0.8800925, "learning_rate": 3.846999078075156e-06, "loss": 0.90230393, "num_input_tokens_seen": 45030360, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1171875, "step": 2086, "time_per_iteration": 2.709474802017212 }, { "auxiliary_loss_clip": 0.01182579, "auxiliary_loss_mlp": 0.0104374, "balance_loss_clip": 1.02345085, "balance_loss_mlp": 1.05548692, "epoch": 0.125477228318052, "flos": 12603173760000.0, "grad_norm": 2.174940666239578, "language_loss": 0.87203872, "learning_rate": 3.8468541126504476e-06, "loss": 0.89430189, "num_input_tokens_seen": 45045085, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0859375, "step": 2087, "time_per_iteration": 2.5249946117401123 }, { "auxiliary_loss_clip": 0.01185782, "auxiliary_loss_mlp": 0.01054407, "balance_loss_clip": 1.03275907, "balance_loss_mlp": 1.05597055, "epoch": 0.12553735157071996, "flos": 23878549365120.0, "grad_norm": 1.7799852378447272, "language_loss": 0.73092532, "learning_rate": 3.846709081316266e-06, "loss": 0.75332719, "num_input_tokens_seen": 45065145, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1171875, "step": 2088, "time_per_iteration": 2.7195794582366943 }, { "auxiliary_loss_clip": 0.01073948, "auxiliary_loss_mlp": 0.01003747, "balance_loss_clip": 1.00082588, "balance_loss_mlp": 1.0253011, "epoch": 0.12559747482338796, "flos": 69920841830400.0, "grad_norm": 0.7597940664411269, "language_loss": 0.61741424, "learning_rate": 3.846563984077788e-06, "loss": 0.63819122, "num_input_tokens_seen": 45126230, "router_z_loss_clip": 0.0291748, "router_z_loss_mlp": 0.39453125, "step": 2089, "time_per_iteration": 3.097895383834839 }, { "auxiliary_loss_clip": 0.01185278, "auxiliary_loss_mlp": 0.01057134, "balance_loss_clip": 1.0356766, "balance_loss_mlp": 1.05650365, "epoch": 0.12565759807605592, "flos": 24279563779200.0, "grad_norm": 2.1007419196407757, "language_loss": 0.77545273, "learning_rate": 3.846418820940191e-06, "loss": 0.79787683, "num_input_tokens_seen": 45145545, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.109375, "step": 2090, "time_per_iteration": 2.6576647758483887 }, { "auxiliary_loss_clip": 0.01082708, "auxiliary_loss_mlp": 0.01003377, "balance_loss_clip": 1.00080252, "balance_loss_mlp": 1.02475214, "epoch": 0.12571772132872389, "flos": 56451180286080.0, "grad_norm": 0.7474822422604896, "language_loss": 0.59398031, "learning_rate": 3.846273591908656e-06, "loss": 0.6148411, "num_input_tokens_seen": 45206845, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.3984375, "step": 2091, "time_per_iteration": 3.1020805835723877 }, { "auxiliary_loss_clip": 0.01181449, "auxiliary_loss_mlp": 0.0105058, "balance_loss_clip": 1.03114891, "balance_loss_mlp": 1.06084406, "epoch": 0.12577784458139185, "flos": 41245846675200.0, "grad_norm": 2.3308741637825454, "language_loss": 0.63179517, "learning_rate": 3.846128296988365e-06, "loss": 0.65411544, "num_input_tokens_seen": 45228495, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.1171875, "step": 2092, "time_per_iteration": 2.7206552028656006 }, { "auxiliary_loss_clip": 0.0119115, "auxiliary_loss_mlp": 0.01062124, "balance_loss_clip": 1.0406301, "balance_loss_mlp": 1.05596542, "epoch": 0.12583796783405982, "flos": 19755501799680.0, "grad_norm": 2.2164388853146244, "language_loss": 0.7993778, "learning_rate": 3.845982936184505e-06, "loss": 0.8219105, "num_input_tokens_seen": 45245720, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.171875, "step": 2093, "time_per_iteration": 2.7912676334381104 }, { "auxiliary_loss_clip": 0.01177558, "auxiliary_loss_mlp": 0.01054175, "balance_loss_clip": 1.03387332, "balance_loss_mlp": 1.05705667, "epoch": 0.12589809108672778, "flos": 22602104680320.0, "grad_norm": 1.6866553981965249, "language_loss": 0.75565916, "learning_rate": 3.845837509502262e-06, "loss": 0.77797651, "num_input_tokens_seen": 45265650, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1171875, "step": 2094, "time_per_iteration": 4.07616662979126 }, { "auxiliary_loss_clip": 0.01191765, "auxiliary_loss_mlp": 0.0105443, "balance_loss_clip": 1.03387833, "balance_loss_mlp": 1.05404699, "epoch": 0.12595821433939577, "flos": 45222845541120.0, "grad_norm": 1.55232953439169, "language_loss": 0.76276588, "learning_rate": 3.845692016946826e-06, "loss": 0.78522784, "num_input_tokens_seen": 45287790, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1015625, "step": 2095, "time_per_iteration": 2.853607654571533 }, { "auxiliary_loss_clip": 0.01185437, "auxiliary_loss_mlp": 0.01058099, "balance_loss_clip": 1.03792894, "balance_loss_mlp": 1.05418897, "epoch": 0.12601833759206374, "flos": 14319811618560.0, "grad_norm": 2.1526615067014054, "language_loss": 0.8234297, "learning_rate": 3.845546458523391e-06, "loss": 0.84586501, "num_input_tokens_seen": 45305720, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1328125, "step": 2096, "time_per_iteration": 5.41054105758667 }, { "auxiliary_loss_clip": 0.01177265, "auxiliary_loss_mlp": 0.01050465, "balance_loss_clip": 1.03068781, "balance_loss_mlp": 1.0559299, "epoch": 0.1260784608447317, "flos": 21288241002240.0, "grad_norm": 2.0014047964254216, "language_loss": 0.75500095, "learning_rate": 3.845400834237148e-06, "loss": 0.77727824, "num_input_tokens_seen": 45325290, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.125, "step": 2097, "time_per_iteration": 2.619943380355835 }, { "auxiliary_loss_clip": 0.01176112, "auxiliary_loss_mlp": 0.01059288, "balance_loss_clip": 1.0409174, "balance_loss_mlp": 1.05645573, "epoch": 0.12613858409739967, "flos": 26251311006720.0, "grad_norm": 2.217530569727742, "language_loss": 0.86984867, "learning_rate": 3.8452551440932975e-06, "loss": 0.89220262, "num_input_tokens_seen": 45344465, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.109375, "step": 2098, "time_per_iteration": 4.158971071243286 }, { "auxiliary_loss_clip": 0.01209676, "auxiliary_loss_mlp": 0.01057341, "balance_loss_clip": 1.03429759, "balance_loss_mlp": 1.05711412, "epoch": 0.12619870735006763, "flos": 21579979265280.0, "grad_norm": 3.872546586140851, "language_loss": 0.69271457, "learning_rate": 3.8451093880970365e-06, "loss": 0.71538472, "num_input_tokens_seen": 45362465, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.1640625, "step": 2099, "time_per_iteration": 2.6650710105895996 }, { "auxiliary_loss_clip": 0.01187052, "auxiliary_loss_mlp": 0.01057735, "balance_loss_clip": 1.03611088, "balance_loss_mlp": 1.05508733, "epoch": 0.1262588306027356, "flos": 23367037737600.0, "grad_norm": 2.3838714277594084, "language_loss": 0.81906056, "learning_rate": 3.844963566253569e-06, "loss": 0.84150851, "num_input_tokens_seen": 45382700, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.140625, "step": 2100, "time_per_iteration": 2.7066004276275635 }, { "auxiliary_loss_clip": 0.01188584, "auxiliary_loss_mlp": 0.01054715, "balance_loss_clip": 1.03428221, "balance_loss_mlp": 1.05744135, "epoch": 0.12631895385540357, "flos": 23949185460480.0, "grad_norm": 2.0126200156017395, "language_loss": 0.80351627, "learning_rate": 3.844817678568097e-06, "loss": 0.82594931, "num_input_tokens_seen": 45401005, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.125, "step": 2101, "time_per_iteration": 2.6826577186584473 }, { "auxiliary_loss_clip": 0.01059247, "auxiliary_loss_mlp": 0.01025625, "balance_loss_clip": 1.02279937, "balance_loss_mlp": 1.01941562, "epoch": 0.12637907710807156, "flos": 70282138780800.0, "grad_norm": 0.7006475515779091, "language_loss": 0.57043755, "learning_rate": 3.8446717250458275e-06, "loss": 0.59128618, "num_input_tokens_seen": 45466555, "router_z_loss_clip": 0.02819824, "router_z_loss_mlp": 0.3984375, "step": 2102, "time_per_iteration": 3.218630075454712 }, { "auxiliary_loss_clip": 0.01199111, "auxiliary_loss_mlp": 0.01056974, "balance_loss_clip": 1.03714943, "balance_loss_mlp": 1.05808687, "epoch": 0.12643920036073952, "flos": 18915084311040.0, "grad_norm": 2.0916660100361595, "language_loss": 0.93194914, "learning_rate": 3.844525705691969e-06, "loss": 0.95450997, "num_input_tokens_seen": 45485165, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.140625, "step": 2103, "time_per_iteration": 2.6533589363098145 }, { "auxiliary_loss_clip": 0.01167195, "auxiliary_loss_mlp": 0.01036715, "balance_loss_clip": 1.01857173, "balance_loss_mlp": 1.05215228, "epoch": 0.1264993236134075, "flos": 27782470010880.0, "grad_norm": 2.137170694688075, "language_loss": 0.77664828, "learning_rate": 3.844379620511733e-06, "loss": 0.79868734, "num_input_tokens_seen": 45504630, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0546875, "step": 2104, "time_per_iteration": 2.704937219619751 }, { "auxiliary_loss_clip": 0.01198653, "auxiliary_loss_mlp": 0.01053851, "balance_loss_clip": 1.03488469, "balance_loss_mlp": 1.05949569, "epoch": 0.12655944686607545, "flos": 24754697907840.0, "grad_norm": 1.8520777188268012, "language_loss": 0.80528927, "learning_rate": 3.844233469510333e-06, "loss": 0.82781428, "num_input_tokens_seen": 45524885, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.1171875, "step": 2105, "time_per_iteration": 2.5785446166992188 }, { "auxiliary_loss_clip": 0.01185177, "auxiliary_loss_mlp": 0.01054833, "balance_loss_clip": 1.03356576, "balance_loss_mlp": 1.05921817, "epoch": 0.12661957011874342, "flos": 24133048202880.0, "grad_norm": 2.3391480204326056, "language_loss": 0.83356482, "learning_rate": 3.844087252692984e-06, "loss": 0.85596496, "num_input_tokens_seen": 45545000, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.171875, "step": 2106, "time_per_iteration": 2.5991339683532715 }, { "auxiliary_loss_clip": 0.01205021, "auxiliary_loss_mlp": 0.0104717, "balance_loss_clip": 1.02709508, "balance_loss_mlp": 1.05855834, "epoch": 0.12667969337141138, "flos": 24569614103040.0, "grad_norm": 1.913115365184285, "language_loss": 0.73248994, "learning_rate": 3.843940970064904e-06, "loss": 0.75501192, "num_input_tokens_seen": 45564210, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1015625, "step": 2107, "time_per_iteration": 2.6252198219299316 }, { "auxiliary_loss_clip": 0.01166396, "auxiliary_loss_mlp": 0.0104368, "balance_loss_clip": 1.02459455, "balance_loss_mlp": 1.05673933, "epoch": 0.12673981662407935, "flos": 22961677777920.0, "grad_norm": 2.4122539921183637, "language_loss": 0.7886008, "learning_rate": 3.843794621631314e-06, "loss": 0.81070161, "num_input_tokens_seen": 45583030, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.09375, "step": 2108, "time_per_iteration": 2.602623462677002 }, { "auxiliary_loss_clip": 0.01167178, "auxiliary_loss_mlp": 0.01042406, "balance_loss_clip": 1.02212834, "balance_loss_mlp": 1.05407238, "epoch": 0.12679993987674734, "flos": 17274864637440.0, "grad_norm": 1.881350598936374, "language_loss": 0.76025105, "learning_rate": 3.843648207397438e-06, "loss": 0.78234684, "num_input_tokens_seen": 45602265, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1328125, "step": 2109, "time_per_iteration": 2.5846147537231445 }, { "auxiliary_loss_clip": 0.0119255, "auxiliary_loss_mlp": 0.01047865, "balance_loss_clip": 1.02866101, "balance_loss_mlp": 1.05687761, "epoch": 0.1268600631294153, "flos": 17275080119040.0, "grad_norm": 1.6789919773851592, "language_loss": 0.82469964, "learning_rate": 3.843501727368498e-06, "loss": 0.84710371, "num_input_tokens_seen": 45620595, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.078125, "step": 2110, "time_per_iteration": 2.6118295192718506 }, { "auxiliary_loss_clip": 0.0118601, "auxiliary_loss_mlp": 0.01300863, "balance_loss_clip": 1.03159165, "balance_loss_mlp": 1.05730581, "epoch": 0.12692018638208327, "flos": 24061047390720.0, "grad_norm": 2.2367091828902126, "language_loss": 0.7841779, "learning_rate": 3.8433551815497255e-06, "loss": 0.80904663, "num_input_tokens_seen": 45641140, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1015625, "step": 2111, "time_per_iteration": 2.5852272510528564 }, { "auxiliary_loss_clip": 0.01214412, "auxiliary_loss_mlp": 0.01068722, "balance_loss_clip": 1.04647791, "balance_loss_mlp": 1.05916309, "epoch": 0.12698030963475124, "flos": 31831900652160.0, "grad_norm": 2.102590805042648, "language_loss": 0.76938063, "learning_rate": 3.843208569946347e-06, "loss": 0.79221201, "num_input_tokens_seen": 45662315, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.1875, "step": 2112, "time_per_iteration": 2.6516268253326416 }, { "auxiliary_loss_clip": 0.01195389, "auxiliary_loss_mlp": 0.01055315, "balance_loss_clip": 1.03551435, "balance_loss_mlp": 1.05749035, "epoch": 0.1270404328874192, "flos": 25187744275200.0, "grad_norm": 1.9953495293715398, "language_loss": 0.85431635, "learning_rate": 3.843061892563596e-06, "loss": 0.87682343, "num_input_tokens_seen": 45680335, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.109375, "step": 2113, "time_per_iteration": 2.633239269256592 }, { "auxiliary_loss_clip": 0.01194865, "auxiliary_loss_mlp": 0.0104912, "balance_loss_clip": 1.02861583, "balance_loss_mlp": 1.05661988, "epoch": 0.12710055614008717, "flos": 15997342544640.0, "grad_norm": 1.966560828238792, "language_loss": 0.74081367, "learning_rate": 3.842915149406707e-06, "loss": 0.76325351, "num_input_tokens_seen": 45696240, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.109375, "step": 2114, "time_per_iteration": 2.5955262184143066 }, { "auxiliary_loss_clip": 0.0117717, "auxiliary_loss_mlp": 0.01059213, "balance_loss_clip": 1.03845882, "balance_loss_mlp": 1.05788302, "epoch": 0.12716067939275516, "flos": 15085642515840.0, "grad_norm": 2.406075471396409, "language_loss": 0.82984692, "learning_rate": 3.842768340480917e-06, "loss": 0.85221076, "num_input_tokens_seen": 45713695, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1015625, "step": 2115, "time_per_iteration": 2.714318037033081 }, { "auxiliary_loss_clip": 0.01187788, "auxiliary_loss_mlp": 0.01056249, "balance_loss_clip": 1.03648424, "balance_loss_mlp": 1.05831981, "epoch": 0.12722080264542313, "flos": 28366736636160.0, "grad_norm": 1.7970069526331565, "language_loss": 0.86627674, "learning_rate": 3.8426214657914656e-06, "loss": 0.88871717, "num_input_tokens_seen": 45736655, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.1171875, "step": 2116, "time_per_iteration": 2.629650354385376 }, { "auxiliary_loss_clip": 0.0118213, "auxiliary_loss_mlp": 0.01294712, "balance_loss_clip": 1.02734816, "balance_loss_mlp": 1.05465555, "epoch": 0.1272809258980911, "flos": 32379897519360.0, "grad_norm": 1.7513888782853626, "language_loss": 0.70247716, "learning_rate": 3.842474525343594e-06, "loss": 0.72724563, "num_input_tokens_seen": 45758195, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.09375, "step": 2117, "time_per_iteration": 2.7413904666900635 }, { "auxiliary_loss_clip": 0.01185542, "auxiliary_loss_mlp": 0.01054278, "balance_loss_clip": 1.03323722, "balance_loss_mlp": 1.05698442, "epoch": 0.12734104915075906, "flos": 16034402401920.0, "grad_norm": 2.1335673007066047, "language_loss": 0.86549604, "learning_rate": 3.842327519142545e-06, "loss": 0.88789427, "num_input_tokens_seen": 45774280, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.1015625, "step": 2118, "time_per_iteration": 2.575439929962158 }, { "auxiliary_loss_clip": 0.01173816, "auxiliary_loss_mlp": 0.01052665, "balance_loss_clip": 1.03355598, "balance_loss_mlp": 1.05463088, "epoch": 0.12740117240342702, "flos": 18260325244800.0, "grad_norm": 1.8708515508055157, "language_loss": 0.87518334, "learning_rate": 3.842180447193566e-06, "loss": 0.89744818, "num_input_tokens_seen": 45792760, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.1015625, "step": 2119, "time_per_iteration": 2.6626904010772705 }, { "auxiliary_loss_clip": 0.01176855, "auxiliary_loss_mlp": 0.01295856, "balance_loss_clip": 1.02703202, "balance_loss_mlp": 1.05685735, "epoch": 0.127461295656095, "flos": 12121790664960.0, "grad_norm": 3.112337561905617, "language_loss": 0.8751592, "learning_rate": 3.842033309501905e-06, "loss": 0.89988625, "num_input_tokens_seen": 45804300, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.109375, "step": 2120, "time_per_iteration": 2.547501564025879 }, { "auxiliary_loss_clip": 0.01166807, "auxiliary_loss_mlp": 0.01043651, "balance_loss_clip": 1.02433872, "balance_loss_mlp": 1.05578828, "epoch": 0.12752141890876295, "flos": 23149095966720.0, "grad_norm": 2.1162573515430965, "language_loss": 0.75422317, "learning_rate": 3.841886106072815e-06, "loss": 0.77632773, "num_input_tokens_seen": 45823780, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.109375, "step": 2121, "time_per_iteration": 2.6760950088500977 }, { "auxiliary_loss_clip": 0.01201803, "auxiliary_loss_mlp": 0.01044627, "balance_loss_clip": 1.02531528, "balance_loss_mlp": 1.05532932, "epoch": 0.12758154216143094, "flos": 21615997628160.0, "grad_norm": 2.069662266534679, "language_loss": 0.83009756, "learning_rate": 3.841738836911547e-06, "loss": 0.85256183, "num_input_tokens_seen": 45840495, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.1015625, "step": 2122, "time_per_iteration": 2.78055477142334 }, { "auxiliary_loss_clip": 0.0118451, "auxiliary_loss_mlp": 0.01048436, "balance_loss_clip": 1.02905238, "balance_loss_mlp": 1.05600011, "epoch": 0.1276416654140989, "flos": 15924874855680.0, "grad_norm": 1.8401947276746389, "language_loss": 0.79219395, "learning_rate": 3.8415915020233574e-06, "loss": 0.8145234, "num_input_tokens_seen": 45857735, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.109375, "step": 2123, "time_per_iteration": 2.726046562194824 }, { "auxiliary_loss_clip": 0.01166688, "auxiliary_loss_mlp": 0.01048013, "balance_loss_clip": 1.02861738, "balance_loss_mlp": 1.05521977, "epoch": 0.12770178866676687, "flos": 22382690451840.0, "grad_norm": 1.6567123199422293, "language_loss": 0.79004848, "learning_rate": 3.8414441014135045e-06, "loss": 0.81219542, "num_input_tokens_seen": 45876485, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.109375, "step": 2124, "time_per_iteration": 2.561593770980835 }, { "auxiliary_loss_clip": 0.01176754, "auxiliary_loss_mlp": 0.01299166, "balance_loss_clip": 1.03172243, "balance_loss_mlp": 1.05499041, "epoch": 0.12776191191943484, "flos": 21652482867840.0, "grad_norm": 2.1176553222155423, "language_loss": 0.75504696, "learning_rate": 3.8412966350872475e-06, "loss": 0.7798062, "num_input_tokens_seen": 45894645, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.125, "step": 2125, "time_per_iteration": 2.6122779846191406 }, { "auxiliary_loss_clip": 0.01174227, "auxiliary_loss_mlp": 0.01292964, "balance_loss_clip": 1.0247252, "balance_loss_mlp": 1.05406213, "epoch": 0.1278220351721028, "flos": 25735561574400.0, "grad_norm": 3.8096743680002527, "language_loss": 0.77921736, "learning_rate": 3.841149103049851e-06, "loss": 0.80388921, "num_input_tokens_seen": 45913755, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.109375, "step": 2126, "time_per_iteration": 2.63948655128479 }, { "auxiliary_loss_clip": 0.01194913, "auxiliary_loss_mlp": 0.0105345, "balance_loss_clip": 1.03475797, "balance_loss_mlp": 1.05870509, "epoch": 0.12788215842477077, "flos": 41243224982400.0, "grad_norm": 1.5797299243402743, "language_loss": 0.68914545, "learning_rate": 3.8410015053065785e-06, "loss": 0.71162909, "num_input_tokens_seen": 45936095, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.09375, "step": 2127, "time_per_iteration": 2.8089048862457275 }, { "auxiliary_loss_clip": 0.01096471, "auxiliary_loss_mlp": 0.01254079, "balance_loss_clip": 1.00256562, "balance_loss_mlp": 1.0208056, "epoch": 0.12794228167743876, "flos": 70877430881280.0, "grad_norm": 0.8551092701664493, "language_loss": 0.62820405, "learning_rate": 3.8408538418626985e-06, "loss": 0.65170962, "num_input_tokens_seen": 46004655, "router_z_loss_clip": 0.0324707, "router_z_loss_mlp": 0.39453125, "step": 2128, "time_per_iteration": 3.223708152770996 }, { "auxiliary_loss_clip": 0.01176235, "auxiliary_loss_mlp": 0.01042379, "balance_loss_clip": 1.02120686, "balance_loss_mlp": 1.05434477, "epoch": 0.12800240493010673, "flos": 16289727252480.0, "grad_norm": 2.2392386359720673, "language_loss": 0.77117252, "learning_rate": 3.840706112723479e-06, "loss": 0.79335868, "num_input_tokens_seen": 46023610, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.125, "step": 2129, "time_per_iteration": 2.540557384490967 }, { "auxiliary_loss_clip": 0.01213775, "auxiliary_loss_mlp": 0.01054774, "balance_loss_clip": 1.03289866, "balance_loss_mlp": 1.0597533, "epoch": 0.1280625281827747, "flos": 20631542601600.0, "grad_norm": 2.1451320223522097, "language_loss": 0.78826219, "learning_rate": 3.840558317894194e-06, "loss": 0.81094766, "num_input_tokens_seen": 46041725, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.171875, "step": 2130, "time_per_iteration": 2.6614255905151367 }, { "auxiliary_loss_clip": 0.0118531, "auxiliary_loss_mlp": 0.01044197, "balance_loss_clip": 1.0249213, "balance_loss_mlp": 1.0544678, "epoch": 0.12812265143544266, "flos": 22638230784000.0, "grad_norm": 1.9566594730938136, "language_loss": 0.70736647, "learning_rate": 3.840410457380117e-06, "loss": 0.72966152, "num_input_tokens_seen": 46061095, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.125, "step": 2131, "time_per_iteration": 2.571033000946045 }, { "auxiliary_loss_clip": 0.01210957, "auxiliary_loss_mlp": 0.01051987, "balance_loss_clip": 1.03224564, "balance_loss_mlp": 1.05568182, "epoch": 0.12818277468811062, "flos": 34714701463680.0, "grad_norm": 2.1468029179357315, "language_loss": 0.7239446, "learning_rate": 3.840262531186525e-06, "loss": 0.74657398, "num_input_tokens_seen": 46082670, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.1015625, "step": 2132, "time_per_iteration": 2.75357985496521 }, { "auxiliary_loss_clip": 0.0119788, "auxiliary_loss_mlp": 0.01052947, "balance_loss_clip": 1.0326221, "balance_loss_mlp": 1.05405807, "epoch": 0.1282428979407786, "flos": 23112107936640.0, "grad_norm": 2.632717581050812, "language_loss": 0.81794113, "learning_rate": 3.840114539318697e-06, "loss": 0.84044939, "num_input_tokens_seen": 46102410, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0703125, "step": 2133, "time_per_iteration": 2.6135952472686768 }, { "auxiliary_loss_clip": 0.01182409, "auxiliary_loss_mlp": 0.01055329, "balance_loss_clip": 1.03362119, "balance_loss_mlp": 1.0554235, "epoch": 0.12830302119344655, "flos": 20886508316160.0, "grad_norm": 2.4709657570295813, "language_loss": 0.7971189, "learning_rate": 3.839966481781914e-06, "loss": 0.81949627, "num_input_tokens_seen": 46121145, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1796875, "step": 2134, "time_per_iteration": 2.5900723934173584 }, { "auxiliary_loss_clip": 0.01178314, "auxiliary_loss_mlp": 0.01051158, "balance_loss_clip": 1.03057027, "balance_loss_mlp": 1.05644369, "epoch": 0.12836314444611455, "flos": 21397768548480.0, "grad_norm": 1.7730939708368927, "language_loss": 0.82172459, "learning_rate": 3.83981835858146e-06, "loss": 0.84401929, "num_input_tokens_seen": 46140740, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.125, "step": 2135, "time_per_iteration": 2.535521984100342 }, { "auxiliary_loss_clip": 0.0118182, "auxiliary_loss_mlp": 0.01056527, "balance_loss_clip": 1.03523612, "balance_loss_mlp": 1.05317211, "epoch": 0.1284232676987825, "flos": 13662466773120.0, "grad_norm": 3.356967207949192, "language_loss": 0.76984525, "learning_rate": 3.839670169722622e-06, "loss": 0.79222876, "num_input_tokens_seen": 46156805, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1015625, "step": 2136, "time_per_iteration": 4.063634872436523 }, { "auxiliary_loss_clip": 0.0109545, "auxiliary_loss_mlp": 0.01006088, "balance_loss_clip": 1.00311995, "balance_loss_mlp": 1.02002704, "epoch": 0.12848339095145048, "flos": 59994737735040.0, "grad_norm": 0.8973414542388322, "language_loss": 0.59237468, "learning_rate": 3.839521915210688e-06, "loss": 0.61339009, "num_input_tokens_seen": 46222085, "router_z_loss_clip": 0.02966309, "router_z_loss_mlp": 0.39453125, "step": 2137, "time_per_iteration": 6.286866664886475 }, { "auxiliary_loss_clip": 0.01172139, "auxiliary_loss_mlp": 0.01056151, "balance_loss_clip": 1.03631496, "balance_loss_mlp": 1.05119002, "epoch": 0.12854351420411844, "flos": 13881378211200.0, "grad_norm": 3.670194484416657, "language_loss": 0.81854272, "learning_rate": 3.839373595050948e-06, "loss": 0.84082556, "num_input_tokens_seen": 46239970, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.125, "step": 2138, "time_per_iteration": 2.5902392864227295 }, { "auxiliary_loss_clip": 0.0118003, "auxiliary_loss_mlp": 0.01056734, "balance_loss_clip": 1.03402436, "balance_loss_mlp": 1.05624175, "epoch": 0.1286036374567864, "flos": 22637943475200.0, "grad_norm": 2.5686773453577665, "language_loss": 0.78849781, "learning_rate": 3.839225209248696e-06, "loss": 0.81086546, "num_input_tokens_seen": 46257740, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.1484375, "step": 2139, "time_per_iteration": 2.5309886932373047 }, { "auxiliary_loss_clip": 0.01195656, "auxiliary_loss_mlp": 0.01044508, "balance_loss_clip": 1.02462387, "balance_loss_mlp": 1.0537374, "epoch": 0.12866376070945437, "flos": 16324775948160.0, "grad_norm": 2.1694852782723206, "language_loss": 0.84862816, "learning_rate": 3.839076757809228e-06, "loss": 0.87102979, "num_input_tokens_seen": 46275445, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1484375, "step": 2140, "time_per_iteration": 4.0213377475738525 }, { "auxiliary_loss_clip": 0.0118057, "auxiliary_loss_mlp": 0.0104613, "balance_loss_clip": 1.02667499, "balance_loss_mlp": 1.05257511, "epoch": 0.12872388396212234, "flos": 11874546374400.0, "grad_norm": 2.375974256076882, "language_loss": 0.85893589, "learning_rate": 3.83892824073784e-06, "loss": 0.88120294, "num_input_tokens_seen": 46291710, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.09375, "step": 2141, "time_per_iteration": 2.5868499279022217 }, { "auxiliary_loss_clip": 0.0118758, "auxiliary_loss_mlp": 0.01051585, "balance_loss_clip": 1.03071094, "balance_loss_mlp": 1.05390525, "epoch": 0.12878400721479033, "flos": 28366700722560.0, "grad_norm": 2.438538794114126, "language_loss": 0.67930776, "learning_rate": 3.838779658039834e-06, "loss": 0.70169938, "num_input_tokens_seen": 46311335, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.15625, "step": 2142, "time_per_iteration": 2.6133973598480225 }, { "auxiliary_loss_clip": 0.01181163, "auxiliary_loss_mlp": 0.01299731, "balance_loss_clip": 1.02938855, "balance_loss_mlp": 1.05688715, "epoch": 0.1288441304674583, "flos": 25885632597120.0, "grad_norm": 5.981570727271602, "language_loss": 0.83161485, "learning_rate": 3.838631009720513e-06, "loss": 0.8564238, "num_input_tokens_seen": 46330985, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.15625, "step": 2143, "time_per_iteration": 2.7863619327545166 }, { "auxiliary_loss_clip": 0.01171549, "auxiliary_loss_mlp": 0.01301924, "balance_loss_clip": 1.03309202, "balance_loss_mlp": 1.05860519, "epoch": 0.12890425372012626, "flos": 20813789232000.0, "grad_norm": 1.6779783147843461, "language_loss": 0.81760079, "learning_rate": 3.83848229578518e-06, "loss": 0.84233552, "num_input_tokens_seen": 46351295, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.125, "step": 2144, "time_per_iteration": 2.5432863235473633 }, { "auxiliary_loss_clip": 0.01182331, "auxiliary_loss_mlp": 0.01049509, "balance_loss_clip": 1.02753925, "balance_loss_mlp": 1.05159545, "epoch": 0.12896437697279423, "flos": 22565870835840.0, "grad_norm": 2.3079944643666113, "language_loss": 0.77753222, "learning_rate": 3.838333516239142e-06, "loss": 0.79985058, "num_input_tokens_seen": 46368600, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.125, "step": 2145, "time_per_iteration": 2.6257834434509277 }, { "auxiliary_loss_clip": 0.0116827, "auxiliary_loss_mlp": 0.01045637, "balance_loss_clip": 1.02419114, "balance_loss_mlp": 1.05403507, "epoch": 0.1290245002254622, "flos": 17493776075520.0, "grad_norm": 2.2721928798653965, "language_loss": 0.82626879, "learning_rate": 3.83818467108771e-06, "loss": 0.84840786, "num_input_tokens_seen": 46387370, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.140625, "step": 2146, "time_per_iteration": 2.532181978225708 }, { "auxiliary_loss_clip": 0.01185218, "auxiliary_loss_mlp": 0.01051275, "balance_loss_clip": 1.02967381, "balance_loss_mlp": 1.05536437, "epoch": 0.12908462347813016, "flos": 36315957859200.0, "grad_norm": 2.6483544153927125, "language_loss": 0.7050643, "learning_rate": 3.838035760336196e-06, "loss": 0.72742921, "num_input_tokens_seen": 46409570, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.109375, "step": 2147, "time_per_iteration": 2.881758689880371 }, { "auxiliary_loss_clip": 0.01183466, "auxiliary_loss_mlp": 0.0105436, "balance_loss_clip": 1.03449953, "balance_loss_mlp": 1.05318868, "epoch": 0.12914474673079815, "flos": 22528703237760.0, "grad_norm": 2.1954829867388717, "language_loss": 0.71142846, "learning_rate": 3.837886783989914e-06, "loss": 0.73380673, "num_input_tokens_seen": 46429320, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.125, "step": 2148, "time_per_iteration": 2.654575824737549 }, { "auxiliary_loss_clip": 0.0116637, "auxiliary_loss_mlp": 0.01046878, "balance_loss_clip": 1.0269109, "balance_loss_mlp": 1.05722582, "epoch": 0.12920486998346611, "flos": 21471888263040.0, "grad_norm": 1.8453071875243627, "language_loss": 0.79050982, "learning_rate": 3.837737742054179e-06, "loss": 0.81264228, "num_input_tokens_seen": 46450155, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.09375, "step": 2149, "time_per_iteration": 2.673626661300659 }, { "auxiliary_loss_clip": 0.01177384, "auxiliary_loss_mlp": 0.01045805, "balance_loss_clip": 1.02470541, "balance_loss_mlp": 1.05716085, "epoch": 0.12926499323613408, "flos": 27308556944640.0, "grad_norm": 2.0276262166689327, "language_loss": 0.76375258, "learning_rate": 3.837588634534312e-06, "loss": 0.78598452, "num_input_tokens_seen": 46470280, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.109375, "step": 2150, "time_per_iteration": 2.7765796184539795 }, { "auxiliary_loss_clip": 0.0117768, "auxiliary_loss_mlp": 0.01051256, "balance_loss_clip": 1.0304184, "balance_loss_mlp": 1.05587757, "epoch": 0.12932511648880204, "flos": 22091131756800.0, "grad_norm": 2.256488477068664, "language_loss": 0.70597339, "learning_rate": 3.837439461435634e-06, "loss": 0.72826278, "num_input_tokens_seen": 46487605, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.125, "step": 2151, "time_per_iteration": 2.6413867473602295 }, { "auxiliary_loss_clip": 0.01163453, "auxiliary_loss_mlp": 0.01045779, "balance_loss_clip": 1.02590632, "balance_loss_mlp": 1.05492973, "epoch": 0.12938523974147, "flos": 20302780394880.0, "grad_norm": 2.0782297329819284, "language_loss": 0.836725, "learning_rate": 3.837290222763467e-06, "loss": 0.85881734, "num_input_tokens_seen": 46505100, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0859375, "step": 2152, "time_per_iteration": 2.651362895965576 }, { "auxiliary_loss_clip": 0.01176498, "auxiliary_loss_mlp": 0.01054419, "balance_loss_clip": 1.03395069, "balance_loss_mlp": 1.05473673, "epoch": 0.12944536299413797, "flos": 19499961467520.0, "grad_norm": 1.9201251490713325, "language_loss": 0.78125614, "learning_rate": 3.837140918523139e-06, "loss": 0.80356538, "num_input_tokens_seen": 46524020, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.125, "step": 2153, "time_per_iteration": 2.634556293487549 }, { "auxiliary_loss_clip": 0.01192346, "auxiliary_loss_mlp": 0.01295953, "balance_loss_clip": 1.02735734, "balance_loss_mlp": 1.05421734, "epoch": 0.12950548624680594, "flos": 27707919333120.0, "grad_norm": 1.7410001692275854, "language_loss": 0.79801011, "learning_rate": 3.836991548719977e-06, "loss": 0.82289314, "num_input_tokens_seen": 46544640, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.109375, "step": 2154, "time_per_iteration": 2.7544174194335938 }, { "auxiliary_loss_clip": 0.01177729, "auxiliary_loss_mlp": 0.0105058, "balance_loss_clip": 1.03014779, "balance_loss_mlp": 1.05732679, "epoch": 0.12956560949947393, "flos": 17565740974080.0, "grad_norm": 2.505395819975409, "language_loss": 0.83063996, "learning_rate": 3.836842113359312e-06, "loss": 0.8529231, "num_input_tokens_seen": 46561395, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.1171875, "step": 2155, "time_per_iteration": 2.5921707153320312 }, { "auxiliary_loss_clip": 0.01176773, "auxiliary_loss_mlp": 0.0105247, "balance_loss_clip": 1.03271663, "balance_loss_mlp": 1.05420172, "epoch": 0.1296257327521419, "flos": 20740711011840.0, "grad_norm": 2.686881557287839, "language_loss": 0.7570619, "learning_rate": 3.836692612446477e-06, "loss": 0.77935433, "num_input_tokens_seen": 46579395, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.1328125, "step": 2156, "time_per_iteration": 2.6580615043640137 }, { "auxiliary_loss_clip": 0.01208244, "auxiliary_loss_mlp": 0.01051671, "balance_loss_clip": 1.03184652, "balance_loss_mlp": 1.05383015, "epoch": 0.12968585600480986, "flos": 16395735265920.0, "grad_norm": 1.7483013979810476, "language_loss": 0.86290252, "learning_rate": 3.836543045986806e-06, "loss": 0.88550168, "num_input_tokens_seen": 46597090, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0859375, "step": 2157, "time_per_iteration": 2.6758625507354736 }, { "auxiliary_loss_clip": 0.01166594, "auxiliary_loss_mlp": 0.01052713, "balance_loss_clip": 1.03189874, "balance_loss_mlp": 1.05203962, "epoch": 0.12974597925747783, "flos": 28329533124480.0, "grad_norm": 4.756372463240822, "language_loss": 0.80536342, "learning_rate": 3.836393413985639e-06, "loss": 0.82755643, "num_input_tokens_seen": 46617355, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.1484375, "step": 2158, "time_per_iteration": 2.751858711242676 }, { "auxiliary_loss_clip": 0.0117254, "auxiliary_loss_mlp": 0.0105464, "balance_loss_clip": 1.03497052, "balance_loss_mlp": 1.05812168, "epoch": 0.1298061025101458, "flos": 9683025782400.0, "grad_norm": 2.4803763470436464, "language_loss": 0.74807465, "learning_rate": 3.836243716448315e-06, "loss": 0.7703464, "num_input_tokens_seen": 46633130, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.140625, "step": 2159, "time_per_iteration": 2.55718731880188 }, { "auxiliary_loss_clip": 0.01188524, "auxiliary_loss_mlp": 0.01049384, "balance_loss_clip": 1.02856994, "balance_loss_mlp": 1.05179453, "epoch": 0.12986622576281376, "flos": 27709535445120.0, "grad_norm": 2.1377624516274945, "language_loss": 0.82429659, "learning_rate": 3.8360939533801755e-06, "loss": 0.84667563, "num_input_tokens_seen": 46650575, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.09375, "step": 2160, "time_per_iteration": 2.711596965789795 }, { "auxiliary_loss_clip": 0.01172065, "auxiliary_loss_mlp": 0.01040998, "balance_loss_clip": 1.02106595, "balance_loss_mlp": 1.05786598, "epoch": 0.12992634901548175, "flos": 18802719590400.0, "grad_norm": 1.8192520047591083, "language_loss": 0.82038409, "learning_rate": 3.835944124786566e-06, "loss": 0.84251475, "num_input_tokens_seen": 46668780, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.046875, "step": 2161, "time_per_iteration": 2.5892300605773926 }, { "auxiliary_loss_clip": 0.01181624, "auxiliary_loss_mlp": 0.01049282, "balance_loss_clip": 1.02903998, "balance_loss_mlp": 1.05394983, "epoch": 0.12998647226814972, "flos": 29127575543040.0, "grad_norm": 2.1761108191194336, "language_loss": 0.82737064, "learning_rate": 3.835794230672833e-06, "loss": 0.84967971, "num_input_tokens_seen": 46687550, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.09375, "step": 2162, "time_per_iteration": 2.7582809925079346 }, { "auxiliary_loss_clip": 0.01195635, "auxiliary_loss_mlp": 0.01049711, "balance_loss_clip": 1.03008914, "balance_loss_mlp": 1.05548191, "epoch": 0.13004659552081768, "flos": 19573686132480.0, "grad_norm": 2.262703053705858, "language_loss": 0.72698522, "learning_rate": 3.8356442710443264e-06, "loss": 0.74943864, "num_input_tokens_seen": 46706730, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.1328125, "step": 2163, "time_per_iteration": 2.5950534343719482 }, { "auxiliary_loss_clip": 0.01172143, "auxiliary_loss_mlp": 0.01301664, "balance_loss_clip": 1.03083861, "balance_loss_mlp": 1.05750096, "epoch": 0.13010671877348565, "flos": 22490709626880.0, "grad_norm": 3.1864751979423143, "language_loss": 0.81430328, "learning_rate": 3.835494245906398e-06, "loss": 0.83904135, "num_input_tokens_seen": 46724250, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.1484375, "step": 2164, "time_per_iteration": 2.6289477348327637 }, { "auxiliary_loss_clip": 0.01202236, "auxiliary_loss_mlp": 0.01044226, "balance_loss_clip": 1.02523637, "balance_loss_mlp": 1.05501032, "epoch": 0.1301668420261536, "flos": 23878226142720.0, "grad_norm": 2.1546435655311758, "language_loss": 0.71958733, "learning_rate": 3.835344155264401e-06, "loss": 0.74205196, "num_input_tokens_seen": 46744105, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.109375, "step": 2165, "time_per_iteration": 2.7324910163879395 }, { "auxiliary_loss_clip": 0.01214526, "auxiliary_loss_mlp": 0.01047456, "balance_loss_clip": 1.02627254, "balance_loss_mlp": 1.0557493, "epoch": 0.13022696527882158, "flos": 23150065633920.0, "grad_norm": 1.6363611429150002, "language_loss": 0.74548888, "learning_rate": 3.835193999123692e-06, "loss": 0.76810873, "num_input_tokens_seen": 46764250, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1328125, "step": 2166, "time_per_iteration": 2.729397773742676 }, { "auxiliary_loss_clip": 0.01168016, "auxiliary_loss_mlp": 0.010485, "balance_loss_clip": 1.02921212, "balance_loss_mlp": 1.05562472, "epoch": 0.13028708853148954, "flos": 26908548111360.0, "grad_norm": 1.5264512489863318, "language_loss": 0.82960379, "learning_rate": 3.83504377748963e-06, "loss": 0.85176897, "num_input_tokens_seen": 46786865, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.125, "step": 2167, "time_per_iteration": 2.6264126300811768 }, { "auxiliary_loss_clip": 0.01201888, "auxiliary_loss_mlp": 0.01055801, "balance_loss_clip": 1.03551185, "balance_loss_mlp": 1.05511522, "epoch": 0.13034721178415754, "flos": 21251468453760.0, "grad_norm": 1.629720886021688, "language_loss": 0.82872713, "learning_rate": 3.834893490367576e-06, "loss": 0.85130399, "num_input_tokens_seen": 46807030, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.1015625, "step": 2168, "time_per_iteration": 2.633441686630249 }, { "auxiliary_loss_clip": 0.01222282, "auxiliary_loss_mlp": 0.01049062, "balance_loss_clip": 1.02735424, "balance_loss_mlp": 1.05514884, "epoch": 0.1304073350368255, "flos": 18767239931520.0, "grad_norm": 2.835021732904577, "language_loss": 0.80139464, "learning_rate": 3.834743137762894e-06, "loss": 0.82410806, "num_input_tokens_seen": 46826280, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.125, "step": 2169, "time_per_iteration": 2.7042341232299805 }, { "auxiliary_loss_clip": 0.01165964, "auxiliary_loss_mlp": 0.0104094, "balance_loss_clip": 1.01955438, "balance_loss_mlp": 1.05530441, "epoch": 0.13046745828949347, "flos": 28364653647360.0, "grad_norm": 2.1840022386965843, "language_loss": 0.66655856, "learning_rate": 3.834592719680948e-06, "loss": 0.68862766, "num_input_tokens_seen": 46846505, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.109375, "step": 2170, "time_per_iteration": 2.643925189971924 }, { "auxiliary_loss_clip": 0.01182732, "auxiliary_loss_mlp": 0.01052214, "balance_loss_clip": 1.03191292, "balance_loss_mlp": 1.05452573, "epoch": 0.13052758154216143, "flos": 29605044055680.0, "grad_norm": 1.8058061786652277, "language_loss": 0.66761249, "learning_rate": 3.834442236127107e-06, "loss": 0.68996191, "num_input_tokens_seen": 46867380, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1015625, "step": 2171, "time_per_iteration": 2.6933000087738037 }, { "auxiliary_loss_clip": 0.01173515, "auxiliary_loss_mlp": 0.01046486, "balance_loss_clip": 1.02493334, "balance_loss_mlp": 1.05324268, "epoch": 0.1305877047948294, "flos": 19390864884480.0, "grad_norm": 1.9125305218226119, "language_loss": 0.71423161, "learning_rate": 3.834291687106741e-06, "loss": 0.7364316, "num_input_tokens_seen": 46886810, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.109375, "step": 2172, "time_per_iteration": 2.5727322101593018 }, { "auxiliary_loss_clip": 0.01188906, "auxiliary_loss_mlp": 0.01041223, "balance_loss_clip": 1.02327013, "balance_loss_mlp": 1.05602729, "epoch": 0.13064782804749736, "flos": 16873527000960.0, "grad_norm": 1.711087703781932, "language_loss": 0.75181109, "learning_rate": 3.834141072625224e-06, "loss": 0.77411234, "num_input_tokens_seen": 46905620, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0546875, "step": 2173, "time_per_iteration": 2.646801471710205 }, { "auxiliary_loss_clip": 0.01194294, "auxiliary_loss_mlp": 0.01056493, "balance_loss_clip": 1.03520226, "balance_loss_mlp": 1.05440569, "epoch": 0.13070795130016533, "flos": 24499085748480.0, "grad_norm": 2.160126700081544, "language_loss": 0.70436883, "learning_rate": 3.833990392687929e-06, "loss": 0.72687662, "num_input_tokens_seen": 46925120, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.125, "step": 2174, "time_per_iteration": 2.59702730178833 }, { "auxiliary_loss_clip": 0.01079404, "auxiliary_loss_mlp": 0.01010952, "balance_loss_clip": 1.00794816, "balance_loss_mlp": 1.02105498, "epoch": 0.13076807455283332, "flos": 71054505953280.0, "grad_norm": 0.8216478739228584, "language_loss": 0.5907433, "learning_rate": 3.833839647300235e-06, "loss": 0.61164689, "num_input_tokens_seen": 46988195, "router_z_loss_clip": 0.0300293, "router_z_loss_mlp": 0.40234375, "step": 2175, "time_per_iteration": 3.3329105377197266 }, { "auxiliary_loss_clip": 0.01172836, "auxiliary_loss_mlp": 0.01054037, "balance_loss_clip": 1.03428423, "balance_loss_mlp": 1.05393577, "epoch": 0.13082819780550128, "flos": 20264499475200.0, "grad_norm": 2.1991068778631027, "language_loss": 0.79769838, "learning_rate": 3.8336888364675215e-06, "loss": 0.81996715, "num_input_tokens_seen": 47004720, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.09375, "step": 2176, "time_per_iteration": 2.5862014293670654 }, { "auxiliary_loss_clip": 0.0116166, "auxiliary_loss_mlp": 0.01050658, "balance_loss_clip": 1.02880669, "balance_loss_mlp": 1.05213475, "epoch": 0.13088832105816925, "flos": 34203441231360.0, "grad_norm": 1.7513011115132993, "language_loss": 0.74346334, "learning_rate": 3.83353796019517e-06, "loss": 0.76558656, "num_input_tokens_seen": 47024255, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.09375, "step": 2177, "time_per_iteration": 2.6389288902282715 }, { "auxiliary_loss_clip": 0.01188749, "auxiliary_loss_mlp": 0.0104863, "balance_loss_clip": 1.02707672, "balance_loss_mlp": 1.05169702, "epoch": 0.13094844431083721, "flos": 17894970057600.0, "grad_norm": 2.075603992158306, "language_loss": 0.82003152, "learning_rate": 3.833387018488565e-06, "loss": 0.84240532, "num_input_tokens_seen": 47042465, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.1015625, "step": 2178, "time_per_iteration": 4.048434495925903 }, { "auxiliary_loss_clip": 0.01162847, "auxiliary_loss_mlp": 0.0104423, "balance_loss_clip": 1.02384543, "balance_loss_mlp": 1.05353034, "epoch": 0.13100856756350518, "flos": 17311313963520.0, "grad_norm": 2.4656138717683125, "language_loss": 0.7464174, "learning_rate": 3.833236011353094e-06, "loss": 0.76848817, "num_input_tokens_seen": 47060370, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.09375, "step": 2179, "time_per_iteration": 5.625402212142944 }, { "auxiliary_loss_clip": 0.01186118, "auxiliary_loss_mlp": 0.01048582, "balance_loss_clip": 1.02872157, "balance_loss_mlp": 1.05209661, "epoch": 0.13106869081617314, "flos": 22200551562240.0, "grad_norm": 2.3509706417801306, "language_loss": 0.84761143, "learning_rate": 3.833084938794144e-06, "loss": 0.8699584, "num_input_tokens_seen": 47081415, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0703125, "step": 2180, "time_per_iteration": 2.706075668334961 }, { "auxiliary_loss_clip": 0.01173826, "auxiliary_loss_mlp": 0.01055281, "balance_loss_clip": 1.03600454, "balance_loss_mlp": 1.05618572, "epoch": 0.13112881406884114, "flos": 21763123735680.0, "grad_norm": 2.0839672996770067, "language_loss": 0.89924669, "learning_rate": 3.832933800817109e-06, "loss": 0.92153776, "num_input_tokens_seen": 47099860, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0859375, "step": 2181, "time_per_iteration": 4.1382129192352295 }, { "auxiliary_loss_clip": 0.01171767, "auxiliary_loss_mlp": 0.01051979, "balance_loss_clip": 1.03233337, "balance_loss_mlp": 1.05430031, "epoch": 0.1311889373215091, "flos": 23331091201920.0, "grad_norm": 1.9889143925708135, "language_loss": 0.68296432, "learning_rate": 3.832782597427381e-06, "loss": 0.70520175, "num_input_tokens_seen": 47118540, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0859375, "step": 2182, "time_per_iteration": 2.5892512798309326 }, { "auxiliary_loss_clip": 0.01191039, "auxiliary_loss_mlp": 0.0105102, "balance_loss_clip": 1.03033733, "balance_loss_mlp": 1.05310702, "epoch": 0.13124906057417707, "flos": 21467363149440.0, "grad_norm": 2.3260029899240426, "language_loss": 0.78628099, "learning_rate": 3.832631328630357e-06, "loss": 0.80870152, "num_input_tokens_seen": 47136710, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.109375, "step": 2183, "time_per_iteration": 2.579984188079834 }, { "auxiliary_loss_clip": 0.01162443, "auxiliary_loss_mlp": 0.01050008, "balance_loss_clip": 1.02955174, "balance_loss_mlp": 1.05306625, "epoch": 0.13130918382684503, "flos": 23255319461760.0, "grad_norm": 1.7059679227144084, "language_loss": 0.85378659, "learning_rate": 3.832479994431435e-06, "loss": 0.87591112, "num_input_tokens_seen": 47157155, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.09375, "step": 2184, "time_per_iteration": 2.5750553607940674 }, { "auxiliary_loss_clip": 0.0119199, "auxiliary_loss_mlp": 0.01051066, "balance_loss_clip": 1.03045464, "balance_loss_mlp": 1.05306423, "epoch": 0.131369307079513, "flos": 20850274471680.0, "grad_norm": 1.8663332906617767, "language_loss": 0.82176262, "learning_rate": 3.8323285948360155e-06, "loss": 0.84419322, "num_input_tokens_seen": 47176820, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.1171875, "step": 2185, "time_per_iteration": 2.5908868312835693 }, { "auxiliary_loss_clip": 0.01182431, "auxiliary_loss_mlp": 0.0105275, "balance_loss_clip": 1.03161418, "balance_loss_mlp": 1.0511328, "epoch": 0.13142943033218096, "flos": 17858341163520.0, "grad_norm": 2.0878122256996563, "language_loss": 0.73232138, "learning_rate": 3.832177129849501e-06, "loss": 0.75467318, "num_input_tokens_seen": 47195855, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.125, "step": 2186, "time_per_iteration": 2.5937697887420654 }, { "auxiliary_loss_clip": 0.01171655, "auxiliary_loss_mlp": 0.01047903, "balance_loss_clip": 1.02608752, "balance_loss_mlp": 1.05319834, "epoch": 0.13148955358484893, "flos": 20996035862400.0, "grad_norm": 2.34724378503082, "language_loss": 0.80321908, "learning_rate": 3.832025599477299e-06, "loss": 0.82541466, "num_input_tokens_seen": 47214535, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.09375, "step": 2187, "time_per_iteration": 2.609727382659912 }, { "auxiliary_loss_clip": 0.0109648, "auxiliary_loss_mlp": 0.01016123, "balance_loss_clip": 1.0135361, "balance_loss_mlp": 1.02000165, "epoch": 0.13154967683751692, "flos": 70172467580160.0, "grad_norm": 0.840655603251272, "language_loss": 0.59021837, "learning_rate": 3.831874003724815e-06, "loss": 0.61134434, "num_input_tokens_seen": 47270300, "router_z_loss_clip": 0.02587891, "router_z_loss_mlp": 0.40234375, "step": 2188, "time_per_iteration": 3.259007453918457 }, { "auxiliary_loss_clip": 0.01221138, "auxiliary_loss_mlp": 0.01054877, "balance_loss_clip": 1.03417087, "balance_loss_mlp": 1.05665612, "epoch": 0.1316098000901849, "flos": 20376145923840.0, "grad_norm": 1.7597936714649691, "language_loss": 0.73835194, "learning_rate": 3.83172234259746e-06, "loss": 0.76111209, "num_input_tokens_seen": 47290720, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1015625, "step": 2189, "time_per_iteration": 2.7051219940185547 }, { "auxiliary_loss_clip": 0.01170095, "auxiliary_loss_mlp": 0.01042636, "balance_loss_clip": 1.02176297, "balance_loss_mlp": 1.05237854, "epoch": 0.13166992334285285, "flos": 23221132692480.0, "grad_norm": 1.813153093206859, "language_loss": 0.72908169, "learning_rate": 3.831570616100646e-06, "loss": 0.75120902, "num_input_tokens_seen": 47311820, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.0859375, "step": 2190, "time_per_iteration": 2.633241653442383 }, { "auxiliary_loss_clip": 0.01175582, "auxiliary_loss_mlp": 0.01050963, "balance_loss_clip": 1.0311501, "balance_loss_mlp": 1.05693936, "epoch": 0.13173004659552082, "flos": 23330947547520.0, "grad_norm": 1.789277596132385, "language_loss": 0.74501443, "learning_rate": 3.831418824239789e-06, "loss": 0.76727986, "num_input_tokens_seen": 47331605, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.1015625, "step": 2191, "time_per_iteration": 2.795607805252075 }, { "auxiliary_loss_clip": 0.0119045, "auxiliary_loss_mlp": 0.01047947, "balance_loss_clip": 1.02659678, "balance_loss_mlp": 1.05359387, "epoch": 0.13179016984818878, "flos": 21251504367360.0, "grad_norm": 1.8199639137945458, "language_loss": 0.79161006, "learning_rate": 3.831266967020304e-06, "loss": 0.81399405, "num_input_tokens_seen": 47350455, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.09375, "step": 2192, "time_per_iteration": 2.585305690765381 }, { "auxiliary_loss_clip": 0.01201816, "auxiliary_loss_mlp": 0.01046684, "balance_loss_clip": 1.02690697, "balance_loss_mlp": 1.0547148, "epoch": 0.13185029310085675, "flos": 17778690754560.0, "grad_norm": 1.9587825090268385, "language_loss": 0.85054505, "learning_rate": 3.831115044447613e-06, "loss": 0.87303007, "num_input_tokens_seen": 47368225, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.109375, "step": 2193, "time_per_iteration": 2.7031142711639404 }, { "auxiliary_loss_clip": 0.01078569, "auxiliary_loss_mlp": 0.00999071, "balance_loss_clip": 0.99649602, "balance_loss_mlp": 1.01954031, "epoch": 0.1319104163535247, "flos": 69851785933440.0, "grad_norm": 0.748120832316465, "language_loss": 0.54028654, "learning_rate": 3.830963056527136e-06, "loss": 0.56106299, "num_input_tokens_seen": 47427125, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.40820312, "step": 2194, "time_per_iteration": 3.140761613845825 }, { "auxiliary_loss_clip": 0.01160962, "auxiliary_loss_mlp": 0.01043118, "balance_loss_clip": 1.02282858, "balance_loss_mlp": 1.05198932, "epoch": 0.1319705396061927, "flos": 25193095401600.0, "grad_norm": 1.7814533872994434, "language_loss": 0.72347248, "learning_rate": 3.830811003264296e-06, "loss": 0.74551326, "num_input_tokens_seen": 47450275, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.09375, "step": 2195, "time_per_iteration": 2.653381109237671 }, { "auxiliary_loss_clip": 0.01193006, "auxiliary_loss_mlp": 0.0129418, "balance_loss_clip": 1.02399254, "balance_loss_mlp": 1.0523355, "epoch": 0.13203066285886067, "flos": 20740459616640.0, "grad_norm": 1.9763091191155737, "language_loss": 0.7814796, "learning_rate": 3.830658884664522e-06, "loss": 0.80635148, "num_input_tokens_seen": 47469155, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1328125, "step": 2196, "time_per_iteration": 2.6605641841888428 }, { "auxiliary_loss_clip": 0.01174242, "auxiliary_loss_mlp": 0.01049089, "balance_loss_clip": 1.0293951, "balance_loss_mlp": 1.05302858, "epoch": 0.13209078611152864, "flos": 22054395121920.0, "grad_norm": 2.004014816466797, "language_loss": 0.7498436, "learning_rate": 3.830506700733241e-06, "loss": 0.77207696, "num_input_tokens_seen": 47488405, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.1171875, "step": 2197, "time_per_iteration": 2.606419324874878 }, { "auxiliary_loss_clip": 0.01164573, "auxiliary_loss_mlp": 0.01046161, "balance_loss_clip": 1.0264082, "balance_loss_mlp": 1.05021286, "epoch": 0.1321509093641966, "flos": 16284950743680.0, "grad_norm": 1.9759025202982838, "language_loss": 0.79200339, "learning_rate": 3.830354451475884e-06, "loss": 0.81411076, "num_input_tokens_seen": 47505650, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.140625, "step": 2198, "time_per_iteration": 2.5414600372314453 }, { "auxiliary_loss_clip": 0.01162861, "auxiliary_loss_mlp": 0.01053215, "balance_loss_clip": 1.03389096, "balance_loss_mlp": 1.05367088, "epoch": 0.13221103261686457, "flos": 16983018633600.0, "grad_norm": 2.05647309754596, "language_loss": 0.82807136, "learning_rate": 3.830202136897886e-06, "loss": 0.85023212, "num_input_tokens_seen": 47521540, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0859375, "step": 2199, "time_per_iteration": 2.5700831413269043 }, { "auxiliary_loss_clip": 0.01198817, "auxiliary_loss_mlp": 0.0105102, "balance_loss_clip": 1.03055215, "balance_loss_mlp": 1.05164218, "epoch": 0.13227115586953253, "flos": 34233605677440.0, "grad_norm": 1.6273613670333522, "language_loss": 0.69544566, "learning_rate": 3.8300497570046804e-06, "loss": 0.71794403, "num_input_tokens_seen": 47543625, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.109375, "step": 2200, "time_per_iteration": 2.7616138458251953 }, { "auxiliary_loss_clip": 0.01161314, "auxiliary_loss_mlp": 0.01052357, "balance_loss_clip": 1.03217447, "balance_loss_mlp": 1.0507108, "epoch": 0.13233127912220052, "flos": 20704656735360.0, "grad_norm": 1.9119085399440077, "language_loss": 0.84391522, "learning_rate": 3.829897311801707e-06, "loss": 0.86605203, "num_input_tokens_seen": 47563740, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.109375, "step": 2201, "time_per_iteration": 2.627281427383423 }, { "auxiliary_loss_clip": 0.01163441, "auxiliary_loss_mlp": 0.01051411, "balance_loss_clip": 1.03110993, "balance_loss_mlp": 1.05112147, "epoch": 0.1323914023748685, "flos": 25805048434560.0, "grad_norm": 1.946878951727351, "language_loss": 0.86739838, "learning_rate": 3.829744801294406e-06, "loss": 0.88954687, "num_input_tokens_seen": 47582655, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.125, "step": 2202, "time_per_iteration": 2.6668882369995117 }, { "auxiliary_loss_clip": 0.01179408, "auxiliary_loss_mlp": 0.01045036, "balance_loss_clip": 1.02536607, "balance_loss_mlp": 1.05093825, "epoch": 0.13245152562753645, "flos": 21251540280960.0, "grad_norm": 2.5610177204879006, "language_loss": 0.7213921, "learning_rate": 3.8295922254882186e-06, "loss": 0.74363649, "num_input_tokens_seen": 47600875, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.1015625, "step": 2203, "time_per_iteration": 2.7127487659454346 }, { "auxiliary_loss_clip": 0.01199904, "auxiliary_loss_mlp": 0.01051036, "balance_loss_clip": 1.0317477, "balance_loss_mlp": 1.0509584, "epoch": 0.13251164888020442, "flos": 26610955931520.0, "grad_norm": 2.1395261951794575, "language_loss": 0.73244798, "learning_rate": 3.829439584388591e-06, "loss": 0.75495732, "num_input_tokens_seen": 47619250, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.125, "step": 2204, "time_per_iteration": 2.7183518409729004 }, { "auxiliary_loss_clip": 0.01173595, "auxiliary_loss_mlp": 0.01052899, "balance_loss_clip": 1.03173947, "balance_loss_mlp": 1.05251861, "epoch": 0.13257177213287238, "flos": 29826541272960.0, "grad_norm": 1.5769011125977932, "language_loss": 0.78149259, "learning_rate": 3.8292868780009715e-06, "loss": 0.80375755, "num_input_tokens_seen": 47639445, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.1171875, "step": 2205, "time_per_iteration": 2.6704559326171875 }, { "auxiliary_loss_clip": 0.01189175, "auxiliary_loss_mlp": 0.0104909, "balance_loss_clip": 1.02839506, "balance_loss_mlp": 1.05298519, "epoch": 0.13263189538554035, "flos": 21288456483840.0, "grad_norm": 2.056916543575881, "language_loss": 0.78924555, "learning_rate": 3.829134106330809e-06, "loss": 0.81162822, "num_input_tokens_seen": 47658740, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0859375, "step": 2206, "time_per_iteration": 2.75152325630188 }, { "auxiliary_loss_clip": 0.01191132, "auxiliary_loss_mlp": 0.01045993, "balance_loss_clip": 1.02726495, "balance_loss_mlp": 1.05247498, "epoch": 0.13269201863820831, "flos": 16874101618560.0, "grad_norm": 1.9593092490208168, "language_loss": 0.74283099, "learning_rate": 3.828981269383554e-06, "loss": 0.76520222, "num_input_tokens_seen": 47676880, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.1171875, "step": 2207, "time_per_iteration": 2.6127254962921143 }, { "auxiliary_loss_clip": 0.01173837, "auxiliary_loss_mlp": 0.01046379, "balance_loss_clip": 1.02558947, "balance_loss_mlp": 1.04905891, "epoch": 0.1327521418908763, "flos": 23768914078080.0, "grad_norm": 1.874630574021486, "language_loss": 0.83923942, "learning_rate": 3.828828367164663e-06, "loss": 0.86144161, "num_input_tokens_seen": 47696635, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0625, "step": 2208, "time_per_iteration": 2.7691140174865723 }, { "auxiliary_loss_clip": 0.01167372, "auxiliary_loss_mlp": 0.01055067, "balance_loss_clip": 1.03705478, "balance_loss_mlp": 1.05314445, "epoch": 0.13281226514354427, "flos": 26505594362880.0, "grad_norm": 2.028569552421068, "language_loss": 0.85110927, "learning_rate": 3.828675399679592e-06, "loss": 0.87333363, "num_input_tokens_seen": 47717760, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.046875, "step": 2209, "time_per_iteration": 2.654033899307251 }, { "auxiliary_loss_clip": 0.01180114, "auxiliary_loss_mlp": 0.01288647, "balance_loss_clip": 1.02173638, "balance_loss_mlp": 1.05128312, "epoch": 0.13287238839621224, "flos": 24498762526080.0, "grad_norm": 2.142438097010944, "language_loss": 0.8205868, "learning_rate": 3.8285223669337995e-06, "loss": 0.84527439, "num_input_tokens_seen": 47737685, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.1015625, "step": 2210, "time_per_iteration": 2.6780483722686768 }, { "auxiliary_loss_clip": 0.01086498, "auxiliary_loss_mlp": 0.01262772, "balance_loss_clip": 1.01217842, "balance_loss_mlp": 1.01829731, "epoch": 0.1329325116488802, "flos": 67694344369920.0, "grad_norm": 0.7505926368700289, "language_loss": 0.58036822, "learning_rate": 3.828369268932747e-06, "loss": 0.60386091, "num_input_tokens_seen": 47802415, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.41015625, "step": 2211, "time_per_iteration": 3.314880847930908 }, { "auxiliary_loss_clip": 0.01076968, "auxiliary_loss_mlp": 0.01008354, "balance_loss_clip": 1.00604153, "balance_loss_mlp": 1.01780915, "epoch": 0.13299263490154817, "flos": 72261894741120.0, "grad_norm": 0.7963405534203327, "language_loss": 0.55281407, "learning_rate": 3.828216105681899e-06, "loss": 0.57366729, "num_input_tokens_seen": 47871485, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.41015625, "step": 2212, "time_per_iteration": 3.2866404056549072 }, { "auxiliary_loss_clip": 0.01191484, "auxiliary_loss_mlp": 0.01051799, "balance_loss_clip": 1.03153384, "balance_loss_mlp": 1.05256963, "epoch": 0.13305275815421613, "flos": 17931275729280.0, "grad_norm": 2.37209994310709, "language_loss": 0.73946565, "learning_rate": 3.8280628771867205e-06, "loss": 0.76189852, "num_input_tokens_seen": 47888315, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1171875, "step": 2213, "time_per_iteration": 2.6882119178771973 }, { "auxiliary_loss_clip": 0.01183762, "auxiliary_loss_mlp": 0.01048029, "balance_loss_clip": 1.02932549, "balance_loss_mlp": 1.05035067, "epoch": 0.13311288140688413, "flos": 22340889999360.0, "grad_norm": 1.7090258461285388, "language_loss": 0.78781307, "learning_rate": 3.8279095834526815e-06, "loss": 0.81013101, "num_input_tokens_seen": 47906600, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0625, "step": 2214, "time_per_iteration": 2.694434642791748 }, { "auxiliary_loss_clip": 0.01180021, "auxiliary_loss_mlp": 0.01046903, "balance_loss_clip": 1.02708995, "balance_loss_mlp": 1.05156732, "epoch": 0.1331730046595521, "flos": 31868888682240.0, "grad_norm": 1.8704155862917398, "language_loss": 0.68834233, "learning_rate": 3.8277562244852495e-06, "loss": 0.71061158, "num_input_tokens_seen": 47927630, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.1015625, "step": 2215, "time_per_iteration": 2.7600061893463135 }, { "auxiliary_loss_clip": 0.01172114, "auxiliary_loss_mlp": 0.01048035, "balance_loss_clip": 1.02819848, "balance_loss_mlp": 1.05161047, "epoch": 0.13323312791222006, "flos": 22566589107840.0, "grad_norm": 1.7646680784356201, "language_loss": 0.81248319, "learning_rate": 3.827602800289901e-06, "loss": 0.83468461, "num_input_tokens_seen": 47947935, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.109375, "step": 2216, "time_per_iteration": 2.6395628452301025 }, { "auxiliary_loss_clip": 0.01169888, "auxiliary_loss_mlp": 0.0105389, "balance_loss_clip": 1.03405404, "balance_loss_mlp": 1.05090594, "epoch": 0.13329325116488802, "flos": 15085319293440.0, "grad_norm": 3.130571977101411, "language_loss": 0.8740989, "learning_rate": 3.827449310872109e-06, "loss": 0.89633662, "num_input_tokens_seen": 47965515, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.1015625, "step": 2217, "time_per_iteration": 2.575953960418701 }, { "auxiliary_loss_clip": 0.01177848, "auxiliary_loss_mlp": 0.01055547, "balance_loss_clip": 1.03392208, "balance_loss_mlp": 1.04973507, "epoch": 0.133353374417556, "flos": 27453671890560.0, "grad_norm": 2.0337984950217587, "language_loss": 0.72962058, "learning_rate": 3.827295756237351e-06, "loss": 0.7519545, "num_input_tokens_seen": 47985675, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1015625, "step": 2218, "time_per_iteration": 2.6577072143554688 }, { "auxiliary_loss_clip": 0.01194615, "auxiliary_loss_mlp": 0.01044721, "balance_loss_clip": 1.02432418, "balance_loss_mlp": 1.04855871, "epoch": 0.13341349767022395, "flos": 24094695456000.0, "grad_norm": 1.7134189599330678, "language_loss": 0.87258154, "learning_rate": 3.8271421363911095e-06, "loss": 0.89497483, "num_input_tokens_seen": 48004985, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.09375, "step": 2219, "time_per_iteration": 4.115412712097168 }, { "auxiliary_loss_clip": 0.01188048, "auxiliary_loss_mlp": 0.01047078, "balance_loss_clip": 1.02823138, "balance_loss_mlp": 1.0523268, "epoch": 0.13347362092289192, "flos": 24133335511680.0, "grad_norm": 2.0433346355197206, "language_loss": 0.77051216, "learning_rate": 3.826988451338864e-06, "loss": 0.79286343, "num_input_tokens_seen": 48024965, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0859375, "step": 2220, "time_per_iteration": 4.230267763137817 }, { "auxiliary_loss_clip": 0.01174088, "auxiliary_loss_mlp": 0.01046276, "balance_loss_clip": 1.02787054, "balance_loss_mlp": 1.04803813, "epoch": 0.1335337441755599, "flos": 18436538390400.0, "grad_norm": 2.3159629475570385, "language_loss": 0.78485906, "learning_rate": 3.826834701086101e-06, "loss": 0.80706263, "num_input_tokens_seen": 48040890, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.078125, "step": 2221, "time_per_iteration": 3.9947669506073 }, { "auxiliary_loss_clip": 0.01067939, "auxiliary_loss_mlp": 0.0103768, "balance_loss_clip": 1.03490281, "balance_loss_mlp": 1.0184207, "epoch": 0.13359386742822787, "flos": 50611997652480.0, "grad_norm": 0.9814843466375963, "language_loss": 0.69001079, "learning_rate": 3.826680885638306e-06, "loss": 0.71106696, "num_input_tokens_seen": 48091855, "router_z_loss_clip": 0.02783203, "router_z_loss_mlp": 0.40820312, "step": 2222, "time_per_iteration": 3.0320451259613037 }, { "auxiliary_loss_clip": 0.01189988, "auxiliary_loss_mlp": 0.01052198, "balance_loss_clip": 1.0337559, "balance_loss_mlp": 1.05401111, "epoch": 0.13365399068089584, "flos": 22778569221120.0, "grad_norm": 2.197260063190259, "language_loss": 0.6746195, "learning_rate": 3.826527005000969e-06, "loss": 0.69704139, "num_input_tokens_seen": 48111350, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0859375, "step": 2223, "time_per_iteration": 4.207049131393433 }, { "auxiliary_loss_clip": 0.01169809, "auxiliary_loss_mlp": 0.0105664, "balance_loss_clip": 1.03592157, "balance_loss_mlp": 1.04949713, "epoch": 0.1337141139335638, "flos": 12531603911040.0, "grad_norm": 2.1378902776689106, "language_loss": 0.81855452, "learning_rate": 3.826373059179582e-06, "loss": 0.840819, "num_input_tokens_seen": 48129840, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.109375, "step": 2224, "time_per_iteration": 2.646660566329956 }, { "auxiliary_loss_clip": 0.01164336, "auxiliary_loss_mlp": 0.01051021, "balance_loss_clip": 1.0303266, "balance_loss_mlp": 1.0514667, "epoch": 0.13377423718623177, "flos": 23038957889280.0, "grad_norm": 2.329245158234574, "language_loss": 0.65257597, "learning_rate": 3.826219048179639e-06, "loss": 0.67472959, "num_input_tokens_seen": 48149240, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.125, "step": 2225, "time_per_iteration": 2.6180126667022705 }, { "auxiliary_loss_clip": 0.01171152, "auxiliary_loss_mlp": 0.01055258, "balance_loss_clip": 1.03580308, "balance_loss_mlp": 1.05130434, "epoch": 0.13383436043889974, "flos": 16216397637120.0, "grad_norm": 2.2735459106615235, "language_loss": 0.8931005, "learning_rate": 3.826064972006635e-06, "loss": 0.91536462, "num_input_tokens_seen": 48166330, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.109375, "step": 2226, "time_per_iteration": 2.6250996589660645 }, { "auxiliary_loss_clip": 0.01180624, "auxiliary_loss_mlp": 0.01060302, "balance_loss_clip": 1.04051328, "balance_loss_mlp": 1.05144167, "epoch": 0.1338944836915677, "flos": 24279671520000.0, "grad_norm": 1.9753897713541486, "language_loss": 0.74005288, "learning_rate": 3.825910830666069e-06, "loss": 0.76246214, "num_input_tokens_seen": 48187600, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.109375, "step": 2227, "time_per_iteration": 2.7142226696014404 }, { "auxiliary_loss_clip": 0.01171184, "auxiliary_loss_mlp": 0.0105759, "balance_loss_clip": 1.03696644, "balance_loss_mlp": 1.05148447, "epoch": 0.1339546069442357, "flos": 17598742594560.0, "grad_norm": 1.9628416994544091, "language_loss": 0.85373867, "learning_rate": 3.825756624163443e-06, "loss": 0.87602639, "num_input_tokens_seen": 48204400, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.109375, "step": 2228, "time_per_iteration": 2.5847437381744385 }, { "auxiliary_loss_clip": 0.0117908, "auxiliary_loss_mlp": 0.01055974, "balance_loss_clip": 1.03616142, "balance_loss_mlp": 1.05226278, "epoch": 0.13401473019690366, "flos": 18990065952000.0, "grad_norm": 2.1312551798467605, "language_loss": 0.81147528, "learning_rate": 3.825602352504259e-06, "loss": 0.83382583, "num_input_tokens_seen": 48222180, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0859375, "step": 2229, "time_per_iteration": 2.617359161376953 }, { "auxiliary_loss_clip": 0.01201458, "auxiliary_loss_mlp": 0.01075533, "balance_loss_clip": 1.05535114, "balance_loss_mlp": 1.05406082, "epoch": 0.13407485344957162, "flos": 26943812288640.0, "grad_norm": 4.277062604115583, "language_loss": 0.73673594, "learning_rate": 3.825448015694023e-06, "loss": 0.75950587, "num_input_tokens_seen": 48243245, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.109375, "step": 2230, "time_per_iteration": 2.657676935195923 }, { "auxiliary_loss_clip": 0.01190382, "auxiliary_loss_mlp": 0.01070666, "balance_loss_clip": 1.05092454, "balance_loss_mlp": 1.05207253, "epoch": 0.1341349767022396, "flos": 20339373375360.0, "grad_norm": 1.774234465603792, "language_loss": 0.80140686, "learning_rate": 3.8252936137382435e-06, "loss": 0.82401735, "num_input_tokens_seen": 48262600, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.109375, "step": 2231, "time_per_iteration": 2.614931106567383 }, { "auxiliary_loss_clip": 0.01204954, "auxiliary_loss_mlp": 0.01061294, "balance_loss_clip": 1.04101658, "balance_loss_mlp": 1.05639172, "epoch": 0.13419509995490755, "flos": 29862020931840.0, "grad_norm": 1.5572923422451215, "language_loss": 0.72165155, "learning_rate": 3.82513914664243e-06, "loss": 0.74431407, "num_input_tokens_seen": 48285075, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1171875, "step": 2232, "time_per_iteration": 2.6655116081237793 }, { "auxiliary_loss_clip": 0.01196948, "auxiliary_loss_mlp": 0.01055615, "balance_loss_clip": 1.03432417, "balance_loss_mlp": 1.05600941, "epoch": 0.13425522320757552, "flos": 26942986275840.0, "grad_norm": 2.562407323980229, "language_loss": 0.65790325, "learning_rate": 3.824984614412095e-06, "loss": 0.68042886, "num_input_tokens_seen": 48301285, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.140625, "step": 2233, "time_per_iteration": 2.726331949234009 }, { "auxiliary_loss_clip": 0.01162759, "auxiliary_loss_mlp": 0.01051645, "balance_loss_clip": 1.03148687, "balance_loss_mlp": 1.04965615, "epoch": 0.1343153464602435, "flos": 15777281871360.0, "grad_norm": 2.3537300708967432, "language_loss": 0.81385928, "learning_rate": 3.824830017052753e-06, "loss": 0.8360033, "num_input_tokens_seen": 48317835, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.1328125, "step": 2234, "time_per_iteration": 2.5669751167297363 }, { "auxiliary_loss_clip": 0.01184866, "auxiliary_loss_mlp": 0.01064002, "balance_loss_clip": 1.04345024, "balance_loss_mlp": 1.05488658, "epoch": 0.13437546971291148, "flos": 24314756129280.0, "grad_norm": 2.232464189657562, "language_loss": 0.81876004, "learning_rate": 3.824675354569923e-06, "loss": 0.84124875, "num_input_tokens_seen": 48335670, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1171875, "step": 2235, "time_per_iteration": 2.803889513015747 }, { "auxiliary_loss_clip": 0.01171523, "auxiliary_loss_mlp": 0.01059968, "balance_loss_clip": 1.03790259, "balance_loss_mlp": 1.05119514, "epoch": 0.13443559296557944, "flos": 26650673395200.0, "grad_norm": 1.816981688821181, "language_loss": 0.86261988, "learning_rate": 3.824520626969122e-06, "loss": 0.88493478, "num_input_tokens_seen": 48357805, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.109375, "step": 2236, "time_per_iteration": 2.6184020042419434 }, { "auxiliary_loss_clip": 0.0116795, "auxiliary_loss_mlp": 0.01054192, "balance_loss_clip": 1.03273392, "balance_loss_mlp": 1.05561149, "epoch": 0.1344957162182474, "flos": 21796197183360.0, "grad_norm": 2.0369791378097033, "language_loss": 0.77308726, "learning_rate": 3.824365834255874e-06, "loss": 0.79530871, "num_input_tokens_seen": 48377845, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.125, "step": 2237, "time_per_iteration": 2.715334177017212 }, { "auxiliary_loss_clip": 0.01183474, "auxiliary_loss_mlp": 0.01057253, "balance_loss_clip": 1.03488898, "balance_loss_mlp": 1.05362868, "epoch": 0.13455583947091537, "flos": 19865568049920.0, "grad_norm": 1.9470593065698543, "language_loss": 0.78575861, "learning_rate": 3.824210976435702e-06, "loss": 0.80816591, "num_input_tokens_seen": 48394735, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.1171875, "step": 2238, "time_per_iteration": 2.589647054672241 }, { "auxiliary_loss_clip": 0.01181978, "auxiliary_loss_mlp": 0.01052949, "balance_loss_clip": 1.03023922, "balance_loss_mlp": 1.05386734, "epoch": 0.13461596272358334, "flos": 30846835094400.0, "grad_norm": 2.113863466071894, "language_loss": 0.68077683, "learning_rate": 3.824056053514132e-06, "loss": 0.70312613, "num_input_tokens_seen": 48414200, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.1015625, "step": 2239, "time_per_iteration": 2.6648976802825928 }, { "auxiliary_loss_clip": 0.01185571, "auxiliary_loss_mlp": 0.01052574, "balance_loss_clip": 1.03193879, "balance_loss_mlp": 1.05403399, "epoch": 0.1346760859762513, "flos": 12494436312960.0, "grad_norm": 2.692025542356757, "language_loss": 0.81208223, "learning_rate": 3.823901065496693e-06, "loss": 0.83446366, "num_input_tokens_seen": 48431065, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1328125, "step": 2240, "time_per_iteration": 2.543862819671631 }, { "auxiliary_loss_clip": 0.01204311, "auxiliary_loss_mlp": 0.01052535, "balance_loss_clip": 1.03217387, "balance_loss_mlp": 1.05425692, "epoch": 0.1347362092289193, "flos": 21836022387840.0, "grad_norm": 1.6622662933504486, "language_loss": 0.77796233, "learning_rate": 3.823746012388918e-06, "loss": 0.80053079, "num_input_tokens_seen": 48450335, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1328125, "step": 2241, "time_per_iteration": 2.671987533569336 }, { "auxiliary_loss_clip": 0.01162037, "auxiliary_loss_mlp": 0.01041226, "balance_loss_clip": 1.02295148, "balance_loss_mlp": 1.05534291, "epoch": 0.13479633248158726, "flos": 23509459163520.0, "grad_norm": 1.7124919859978849, "language_loss": 0.83127481, "learning_rate": 3.823590894196339e-06, "loss": 0.85330743, "num_input_tokens_seen": 48468555, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.0625, "step": 2242, "time_per_iteration": 2.6664206981658936 }, { "auxiliary_loss_clip": 0.01165997, "auxiliary_loss_mlp": 0.01055037, "balance_loss_clip": 1.03314996, "balance_loss_mlp": 1.0532707, "epoch": 0.13485645573425523, "flos": 29344332165120.0, "grad_norm": 2.1232597402050204, "language_loss": 0.64414477, "learning_rate": 3.823435710924491e-06, "loss": 0.66635513, "num_input_tokens_seen": 48488515, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.125, "step": 2243, "time_per_iteration": 2.6052119731903076 }, { "auxiliary_loss_clip": 0.01174075, "auxiliary_loss_mlp": 0.01046217, "balance_loss_clip": 1.02577233, "balance_loss_mlp": 1.05126429, "epoch": 0.1349165789869232, "flos": 28037112503040.0, "grad_norm": 1.4335322010640712, "language_loss": 0.72748756, "learning_rate": 3.823280462578913e-06, "loss": 0.74969047, "num_input_tokens_seen": 48510515, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.046875, "step": 2244, "time_per_iteration": 2.65386700630188 }, { "auxiliary_loss_clip": 0.01171564, "auxiliary_loss_mlp": 0.01047409, "balance_loss_clip": 1.02840686, "balance_loss_mlp": 1.05313778, "epoch": 0.13497670223959116, "flos": 22853730430080.0, "grad_norm": 1.515495044626558, "language_loss": 0.85910654, "learning_rate": 3.8231251491651455e-06, "loss": 0.8812964, "num_input_tokens_seen": 48529940, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.09375, "step": 2245, "time_per_iteration": 2.5919318199157715 }, { "auxiliary_loss_clip": 0.01168462, "auxiliary_loss_mlp": 0.01053587, "balance_loss_clip": 1.0330832, "balance_loss_mlp": 1.0532937, "epoch": 0.13503682549225912, "flos": 16504580453760.0, "grad_norm": 1.603389104090265, "language_loss": 0.78754854, "learning_rate": 3.822969770688732e-06, "loss": 0.80976903, "num_input_tokens_seen": 48548190, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0625, "step": 2246, "time_per_iteration": 2.5933666229248047 }, { "auxiliary_loss_clip": 0.01105893, "auxiliary_loss_mlp": 0.01018237, "balance_loss_clip": 1.01582861, "balance_loss_mlp": 1.0208621, "epoch": 0.1350969487449271, "flos": 70756303242240.0, "grad_norm": 0.7468490809077936, "language_loss": 0.60550261, "learning_rate": 3.8228143271552154e-06, "loss": 0.62674391, "num_input_tokens_seen": 48613165, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.39453125, "step": 2247, "time_per_iteration": 3.3401949405670166 }, { "auxiliary_loss_clip": 0.0117883, "auxiliary_loss_mlp": 0.01054889, "balance_loss_clip": 1.03338385, "balance_loss_mlp": 1.05505848, "epoch": 0.13515707199759508, "flos": 23075981832960.0, "grad_norm": 1.9203109812512724, "language_loss": 0.81001902, "learning_rate": 3.822658818570145e-06, "loss": 0.83235621, "num_input_tokens_seen": 48631705, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1484375, "step": 2248, "time_per_iteration": 2.688918113708496 }, { "auxiliary_loss_clip": 0.01174574, "auxiliary_loss_mlp": 0.01044638, "balance_loss_clip": 1.02527785, "balance_loss_mlp": 1.05173624, "epoch": 0.13521719525026304, "flos": 23186371305600.0, "grad_norm": 1.9847341714504587, "language_loss": 0.7720592, "learning_rate": 3.822503244939069e-06, "loss": 0.79425138, "num_input_tokens_seen": 48649740, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.046875, "step": 2249, "time_per_iteration": 2.6854088306427 }, { "auxiliary_loss_clip": 0.01190073, "auxiliary_loss_mlp": 0.01053863, "balance_loss_clip": 1.03439617, "balance_loss_mlp": 1.05512762, "epoch": 0.135277318502931, "flos": 24790931752320.0, "grad_norm": 1.4266693921561109, "language_loss": 0.84289658, "learning_rate": 3.822347606267541e-06, "loss": 0.865336, "num_input_tokens_seen": 48671565, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.078125, "step": 2250, "time_per_iteration": 2.769468069076538 }, { "auxiliary_loss_clip": 0.01174316, "auxiliary_loss_mlp": 0.01052358, "balance_loss_clip": 1.03131795, "balance_loss_mlp": 1.05471444, "epoch": 0.13533744175559898, "flos": 21908525990400.0, "grad_norm": 1.8448150899342264, "language_loss": 0.81851536, "learning_rate": 3.8221919025611145e-06, "loss": 0.84078217, "num_input_tokens_seen": 48690425, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.1015625, "step": 2251, "time_per_iteration": 2.686624526977539 }, { "auxiliary_loss_clip": 0.01163011, "auxiliary_loss_mlp": 0.01047843, "balance_loss_clip": 1.02627826, "balance_loss_mlp": 1.05261946, "epoch": 0.13539756500826694, "flos": 21211643249280.0, "grad_norm": 1.6803713340186222, "language_loss": 0.8575241, "learning_rate": 3.822036133825346e-06, "loss": 0.87963259, "num_input_tokens_seen": 48707505, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.109375, "step": 2252, "time_per_iteration": 2.660574197769165 }, { "auxiliary_loss_clip": 0.01067108, "auxiliary_loss_mlp": 0.01005415, "balance_loss_clip": 1.00305486, "balance_loss_mlp": 1.01824713, "epoch": 0.1354576882609349, "flos": 63242103634560.0, "grad_norm": 0.746213051921625, "language_loss": 0.61822593, "learning_rate": 3.821880300065794e-06, "loss": 0.63895118, "num_input_tokens_seen": 48775895, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.3984375, "step": 2253, "time_per_iteration": 3.268698215484619 }, { "auxiliary_loss_clip": 0.01172845, "auxiliary_loss_mlp": 0.01059488, "balance_loss_clip": 1.04003274, "balance_loss_mlp": 1.05429316, "epoch": 0.1355178115136029, "flos": 25483037984640.0, "grad_norm": 1.726678169622357, "language_loss": 0.89277506, "learning_rate": 3.821724401288022e-06, "loss": 0.91509837, "num_input_tokens_seen": 48798370, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.09375, "step": 2254, "time_per_iteration": 2.6256816387176514 }, { "auxiliary_loss_clip": 0.0118489, "auxiliary_loss_mlp": 0.01050083, "balance_loss_clip": 1.02788579, "balance_loss_mlp": 1.0524447, "epoch": 0.13557793476627086, "flos": 21616967295360.0, "grad_norm": 2.36635400043011, "language_loss": 0.84364402, "learning_rate": 3.821568437497592e-06, "loss": 0.86599374, "num_input_tokens_seen": 48817955, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.140625, "step": 2255, "time_per_iteration": 2.6427090167999268 }, { "auxiliary_loss_clip": 0.01159906, "auxiliary_loss_mlp": 0.0104675, "balance_loss_clip": 1.02693701, "balance_loss_mlp": 1.05050874, "epoch": 0.13563805801893883, "flos": 24928253447040.0, "grad_norm": 2.3332409496398188, "language_loss": 0.74619979, "learning_rate": 3.821412408700069e-06, "loss": 0.76826644, "num_input_tokens_seen": 48836330, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.09375, "step": 2256, "time_per_iteration": 2.5791196823120117 }, { "auxiliary_loss_clip": 0.01196243, "auxiliary_loss_mlp": 0.01053725, "balance_loss_clip": 1.03331673, "balance_loss_mlp": 1.05413961, "epoch": 0.1356981812716068, "flos": 14750272206720.0, "grad_norm": 1.9938805991322177, "language_loss": 0.83300757, "learning_rate": 3.821256314901023e-06, "loss": 0.85550725, "num_input_tokens_seen": 48851890, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.1484375, "step": 2257, "time_per_iteration": 2.5771830081939697 }, { "auxiliary_loss_clip": 0.01186477, "auxiliary_loss_mlp": 0.01299415, "balance_loss_clip": 1.02911925, "balance_loss_mlp": 1.05281091, "epoch": 0.13575830452427476, "flos": 11108571822720.0, "grad_norm": 2.4358764188054005, "language_loss": 0.81887448, "learning_rate": 3.821100156106024e-06, "loss": 0.84373337, "num_input_tokens_seen": 48865510, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.15625, "step": 2258, "time_per_iteration": 2.5282766819000244 }, { "auxiliary_loss_clip": 0.01172465, "auxiliary_loss_mlp": 0.01050799, "balance_loss_clip": 1.02822065, "balance_loss_mlp": 1.05285192, "epoch": 0.13581842777694272, "flos": 17960290940160.0, "grad_norm": 2.3706410668509568, "language_loss": 0.82410014, "learning_rate": 3.820943932320644e-06, "loss": 0.84633273, "num_input_tokens_seen": 48882360, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 1.1015625, "step": 2259, "time_per_iteration": 2.561452865600586 }, { "auxiliary_loss_clip": 0.01189849, "auxiliary_loss_mlp": 0.01052635, "balance_loss_clip": 1.03400278, "balance_loss_mlp": 1.05313897, "epoch": 0.1358785510296107, "flos": 22857142222080.0, "grad_norm": 1.9883607383717217, "language_loss": 0.73441476, "learning_rate": 3.82078764355046e-06, "loss": 0.75683963, "num_input_tokens_seen": 48902700, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.09375, "step": 2260, "time_per_iteration": 2.6061084270477295 }, { "auxiliary_loss_clip": 0.01181209, "auxiliary_loss_mlp": 0.01057282, "balance_loss_clip": 1.03819609, "balance_loss_mlp": 1.05264926, "epoch": 0.13593867428227868, "flos": 25739404329600.0, "grad_norm": 2.259680658397799, "language_loss": 0.74880463, "learning_rate": 3.820631289801048e-06, "loss": 0.77118957, "num_input_tokens_seen": 48922525, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.1015625, "step": 2261, "time_per_iteration": 4.174629211425781 }, { "auxiliary_loss_clip": 0.01162328, "auxiliary_loss_mlp": 0.0104738, "balance_loss_clip": 1.02825892, "balance_loss_mlp": 1.05020368, "epoch": 0.13599879753494665, "flos": 31249214225280.0, "grad_norm": 1.9420220353687372, "language_loss": 0.62833714, "learning_rate": 3.82047487107799e-06, "loss": 0.65043426, "num_input_tokens_seen": 48942510, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.1171875, "step": 2262, "time_per_iteration": 5.60773229598999 }, { "auxiliary_loss_clip": 0.01183885, "auxiliary_loss_mlp": 0.01050344, "balance_loss_clip": 1.03149688, "balance_loss_mlp": 1.05330741, "epoch": 0.1360589207876146, "flos": 23915034604800.0, "grad_norm": 2.946648837081815, "language_loss": 0.81906021, "learning_rate": 3.820318387386865e-06, "loss": 0.84140247, "num_input_tokens_seen": 48962625, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.125, "step": 2263, "time_per_iteration": 2.6585240364074707 }, { "auxiliary_loss_clip": 0.01212486, "auxiliary_loss_mlp": 0.01061254, "balance_loss_clip": 1.04004633, "balance_loss_mlp": 1.05494285, "epoch": 0.13611904404028258, "flos": 19974197756160.0, "grad_norm": 1.857015781813505, "language_loss": 0.87799788, "learning_rate": 3.8201618387332605e-06, "loss": 0.90073526, "num_input_tokens_seen": 48982525, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.125, "step": 2264, "time_per_iteration": 2.6217200756073 }, { "auxiliary_loss_clip": 0.01177954, "auxiliary_loss_mlp": 0.01054031, "balance_loss_clip": 1.03222775, "balance_loss_mlp": 1.05471194, "epoch": 0.13617916729295054, "flos": 15340644144000.0, "grad_norm": 2.792366791809436, "language_loss": 0.71093607, "learning_rate": 3.82000522512276e-06, "loss": 0.73325586, "num_input_tokens_seen": 48997605, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.140625, "step": 2265, "time_per_iteration": 4.792941570281982 }, { "auxiliary_loss_clip": 0.01199227, "auxiliary_loss_mlp": 0.01046457, "balance_loss_clip": 1.02729988, "balance_loss_mlp": 1.05428135, "epoch": 0.1362392905456185, "flos": 27451445247360.0, "grad_norm": 1.9880533322018084, "language_loss": 0.66589421, "learning_rate": 3.819848546560957e-06, "loss": 0.68835104, "num_input_tokens_seen": 49018535, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0859375, "step": 2266, "time_per_iteration": 2.713637590408325 }, { "auxiliary_loss_clip": 0.01198139, "auxiliary_loss_mlp": 0.01060765, "balance_loss_clip": 1.04225147, "balance_loss_mlp": 1.05402029, "epoch": 0.1362994137982865, "flos": 25009017177600.0, "grad_norm": 1.64988398724298, "language_loss": 0.76390958, "learning_rate": 3.819691803053439e-06, "loss": 0.78649867, "num_input_tokens_seen": 49038865, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.078125, "step": 2267, "time_per_iteration": 2.667942762374878 }, { "auxiliary_loss_clip": 0.0117893, "auxiliary_loss_mlp": 0.0104897, "balance_loss_clip": 1.02927637, "balance_loss_mlp": 1.05062127, "epoch": 0.13635953705095447, "flos": 20303031790080.0, "grad_norm": 3.663831520609217, "language_loss": 0.81571805, "learning_rate": 3.819534994605802e-06, "loss": 0.83799708, "num_input_tokens_seen": 49058010, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.1015625, "step": 2268, "time_per_iteration": 2.658733606338501 }, { "auxiliary_loss_clip": 0.01172876, "auxiliary_loss_mlp": 0.01044224, "balance_loss_clip": 1.02399421, "balance_loss_mlp": 1.0556339, "epoch": 0.13641966030362243, "flos": 31358418549120.0, "grad_norm": 1.8544004871045743, "language_loss": 0.75834465, "learning_rate": 3.819378121223641e-06, "loss": 0.78051567, "num_input_tokens_seen": 49080330, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.078125, "step": 2269, "time_per_iteration": 2.6411361694335938 }, { "auxiliary_loss_clip": 0.01196536, "auxiliary_loss_mlp": 0.01045977, "balance_loss_clip": 1.02620041, "balance_loss_mlp": 1.05723381, "epoch": 0.1364797835562904, "flos": 20478095700480.0, "grad_norm": 1.939983456402531, "language_loss": 0.80481446, "learning_rate": 3.819221182912555e-06, "loss": 0.82723963, "num_input_tokens_seen": 49097035, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.1171875, "step": 2270, "time_per_iteration": 2.6754815578460693 }, { "auxiliary_loss_clip": 0.01186722, "auxiliary_loss_mlp": 0.01055714, "balance_loss_clip": 1.03530514, "balance_loss_mlp": 1.05472267, "epoch": 0.13653990680895836, "flos": 13078343802240.0, "grad_norm": 2.615699238064539, "language_loss": 0.75914252, "learning_rate": 3.819064179678145e-06, "loss": 0.78156686, "num_input_tokens_seen": 49113945, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.140625, "step": 2271, "time_per_iteration": 2.5776071548461914 }, { "auxiliary_loss_clip": 0.01174417, "auxiliary_loss_mlp": 0.01056462, "balance_loss_clip": 1.03617287, "balance_loss_mlp": 1.05427027, "epoch": 0.13660003006162633, "flos": 16946712961920.0, "grad_norm": 2.0849709894452784, "language_loss": 0.8065685, "learning_rate": 3.8189071115260134e-06, "loss": 0.82887733, "num_input_tokens_seen": 49132855, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.109375, "step": 2272, "time_per_iteration": 2.668100357055664 }, { "auxiliary_loss_clip": 0.01072339, "auxiliary_loss_mlp": 0.01008159, "balance_loss_clip": 1.00564384, "balance_loss_mlp": 1.02393341, "epoch": 0.1366601533142943, "flos": 68682749892480.0, "grad_norm": 0.6897549366513411, "language_loss": 0.60673016, "learning_rate": 3.818749978461765e-06, "loss": 0.62753516, "num_input_tokens_seen": 49198310, "router_z_loss_clip": 0.02514648, "router_z_loss_mlp": 0.39453125, "step": 2273, "time_per_iteration": 3.311460256576538 }, { "auxiliary_loss_clip": 0.01163381, "auxiliary_loss_mlp": 0.0129549, "balance_loss_clip": 1.02704906, "balance_loss_mlp": 1.05454981, "epoch": 0.13672027656696228, "flos": 19244241567360.0, "grad_norm": 1.6596350511136926, "language_loss": 0.77439123, "learning_rate": 3.8185927804910096e-06, "loss": 0.79897988, "num_input_tokens_seen": 49217250, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0859375, "step": 2274, "time_per_iteration": 2.5951507091522217 }, { "auxiliary_loss_clip": 0.01175199, "auxiliary_loss_mlp": 0.0104965, "balance_loss_clip": 1.02866936, "balance_loss_mlp": 1.05533159, "epoch": 0.13678039981963025, "flos": 24534924543360.0, "grad_norm": 1.7961585719476987, "language_loss": 0.78148818, "learning_rate": 3.818435517619355e-06, "loss": 0.80373657, "num_input_tokens_seen": 49236615, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.109375, "step": 2275, "time_per_iteration": 2.6956982612609863 }, { "auxiliary_loss_clip": 0.01173562, "auxiliary_loss_mlp": 0.01044895, "balance_loss_clip": 1.02516639, "balance_loss_mlp": 1.05422711, "epoch": 0.13684052307229821, "flos": 15669334523520.0, "grad_norm": 2.6821048421348186, "language_loss": 0.80790657, "learning_rate": 3.818278189852415e-06, "loss": 0.83009112, "num_input_tokens_seen": 49253935, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.109375, "step": 2276, "time_per_iteration": 2.5810937881469727 }, { "auxiliary_loss_clip": 0.01178881, "auxiliary_loss_mlp": 0.01056366, "balance_loss_clip": 1.03322744, "balance_loss_mlp": 1.05318952, "epoch": 0.13690064632496618, "flos": 28364689560960.0, "grad_norm": 2.221927293917284, "language_loss": 0.68961275, "learning_rate": 3.8181207971958025e-06, "loss": 0.7119652, "num_input_tokens_seen": 49273605, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.171875, "step": 2277, "time_per_iteration": 2.728224515914917 }, { "auxiliary_loss_clip": 0.0117606, "auxiliary_loss_mlp": 0.01297397, "balance_loss_clip": 1.02851832, "balance_loss_mlp": 1.05470443, "epoch": 0.13696076957763414, "flos": 23404779953280.0, "grad_norm": 2.0559548059823456, "language_loss": 0.80247957, "learning_rate": 3.817963339655137e-06, "loss": 0.82721418, "num_input_tokens_seen": 49291785, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.125, "step": 2278, "time_per_iteration": 2.6292543411254883 }, { "auxiliary_loss_clip": 0.01177265, "auxiliary_loss_mlp": 0.01045818, "balance_loss_clip": 1.0259335, "balance_loss_mlp": 1.05773711, "epoch": 0.1370208928303021, "flos": 37196595601920.0, "grad_norm": 2.5482005438123645, "language_loss": 0.75439513, "learning_rate": 3.8178058172360346e-06, "loss": 0.77662599, "num_input_tokens_seen": 49311405, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1015625, "step": 2279, "time_per_iteration": 2.7479922771453857 }, { "auxiliary_loss_clip": 0.01212163, "auxiliary_loss_mlp": 0.01053453, "balance_loss_clip": 1.03317547, "balance_loss_mlp": 1.05250013, "epoch": 0.13708101608297008, "flos": 26976311118720.0, "grad_norm": 2.2868787761313656, "language_loss": 0.76092184, "learning_rate": 3.817648229944119e-06, "loss": 0.78357804, "num_input_tokens_seen": 49331835, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1484375, "step": 2280, "time_per_iteration": 2.629037618637085 }, { "auxiliary_loss_clip": 0.01195916, "auxiliary_loss_mlp": 0.01044629, "balance_loss_clip": 1.0253768, "balance_loss_mlp": 1.05119801, "epoch": 0.13714113933563807, "flos": 32556864850560.0, "grad_norm": 1.6783560310507526, "language_loss": 0.79981482, "learning_rate": 3.817490577785014e-06, "loss": 0.82222033, "num_input_tokens_seen": 49352290, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0859375, "step": 2281, "time_per_iteration": 2.714641571044922 }, { "auxiliary_loss_clip": 0.01168756, "auxiliary_loss_mlp": 0.01054413, "balance_loss_clip": 1.03410029, "balance_loss_mlp": 1.05457616, "epoch": 0.13720126258830603, "flos": 16101267569280.0, "grad_norm": 1.7276357163464107, "language_loss": 0.83642948, "learning_rate": 3.817332860764346e-06, "loss": 0.85866117, "num_input_tokens_seen": 49370285, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.140625, "step": 2282, "time_per_iteration": 2.560642719268799 }, { "auxiliary_loss_clip": 0.01182341, "auxiliary_loss_mlp": 0.01053804, "balance_loss_clip": 1.03452802, "balance_loss_mlp": 1.05468976, "epoch": 0.137261385840974, "flos": 18953544798720.0, "grad_norm": 2.442307961312467, "language_loss": 0.73641866, "learning_rate": 3.817175078887742e-06, "loss": 0.75878012, "num_input_tokens_seen": 49389610, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.09375, "step": 2283, "time_per_iteration": 2.652402400970459 }, { "auxiliary_loss_clip": 0.01175015, "auxiliary_loss_mlp": 0.01050344, "balance_loss_clip": 1.03181934, "balance_loss_mlp": 1.05739546, "epoch": 0.13732150909364196, "flos": 23295360147840.0, "grad_norm": 1.9654642005002303, "language_loss": 0.83916694, "learning_rate": 3.8170172321608345e-06, "loss": 0.86142051, "num_input_tokens_seen": 49408390, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0859375, "step": 2284, "time_per_iteration": 2.6719915866851807 }, { "auxiliary_loss_clip": 0.0118527, "auxiliary_loss_mlp": 0.01047848, "balance_loss_clip": 1.02653337, "balance_loss_mlp": 1.05209029, "epoch": 0.13738163234630993, "flos": 29351263489920.0, "grad_norm": 2.089167413592136, "language_loss": 0.74892718, "learning_rate": 3.816859320589255e-06, "loss": 0.77125835, "num_input_tokens_seen": 49427725, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1484375, "step": 2285, "time_per_iteration": 2.735952854156494 }, { "auxiliary_loss_clip": 0.01180364, "auxiliary_loss_mlp": 0.0104865, "balance_loss_clip": 1.02913547, "balance_loss_mlp": 1.05406117, "epoch": 0.1374417555989779, "flos": 26651319840000.0, "grad_norm": 1.8901104658897945, "language_loss": 0.7476002, "learning_rate": 3.81670134417864e-06, "loss": 0.76989037, "num_input_tokens_seen": 49449000, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0859375, "step": 2286, "time_per_iteration": 2.7167019844055176 }, { "auxiliary_loss_clip": 0.01203571, "auxiliary_loss_mlp": 0.01047855, "balance_loss_clip": 1.0267787, "balance_loss_mlp": 1.05476379, "epoch": 0.1375018788516459, "flos": 28403401443840.0, "grad_norm": 2.124384489450479, "language_loss": 0.86146152, "learning_rate": 3.8165433029346276e-06, "loss": 0.88397586, "num_input_tokens_seen": 49468360, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.125, "step": 2287, "time_per_iteration": 2.745077133178711 }, { "auxiliary_loss_clip": 0.01193126, "auxiliary_loss_mlp": 0.01048637, "balance_loss_clip": 1.02902734, "balance_loss_mlp": 1.05467153, "epoch": 0.13756200210431385, "flos": 37413783187200.0, "grad_norm": 2.2544874709184404, "language_loss": 0.68138969, "learning_rate": 3.816385196862858e-06, "loss": 0.70380729, "num_input_tokens_seen": 49493450, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.109375, "step": 2288, "time_per_iteration": 2.948192834854126 }, { "auxiliary_loss_clip": 0.0120126, "auxiliary_loss_mlp": 0.01051615, "balance_loss_clip": 1.03078902, "balance_loss_mlp": 1.05323958, "epoch": 0.13762212535698182, "flos": 22711021695360.0, "grad_norm": 2.7954121406886836, "language_loss": 0.86861938, "learning_rate": 3.816227025968972e-06, "loss": 0.89114809, "num_input_tokens_seen": 49511220, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.1171875, "step": 2289, "time_per_iteration": 2.5738861560821533 }, { "auxiliary_loss_clip": 0.0117521, "auxiliary_loss_mlp": 0.01294542, "balance_loss_clip": 1.02787125, "balance_loss_mlp": 1.0505116, "epoch": 0.13768224860964978, "flos": 23952130375680.0, "grad_norm": 1.919483925299843, "language_loss": 0.7430771, "learning_rate": 3.8160687902586155e-06, "loss": 0.76777458, "num_input_tokens_seen": 49529820, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0703125, "step": 2290, "time_per_iteration": 2.67655348777771 }, { "auxiliary_loss_clip": 0.01088151, "auxiliary_loss_mlp": 0.01004333, "balance_loss_clip": 1.00186539, "balance_loss_mlp": 1.02102041, "epoch": 0.13774237186231775, "flos": 63590438753280.0, "grad_norm": 0.6994125021319206, "language_loss": 0.5156917, "learning_rate": 3.815910489737436e-06, "loss": 0.5366165, "num_input_tokens_seen": 49595325, "router_z_loss_clip": 0.0246582, "router_z_loss_mlp": 0.40234375, "step": 2291, "time_per_iteration": 3.2152421474456787 }, { "auxiliary_loss_clip": 0.01172468, "auxiliary_loss_mlp": 0.0104743, "balance_loss_clip": 1.02736723, "balance_loss_mlp": 1.05215907, "epoch": 0.1378024951149857, "flos": 24279456038400.0, "grad_norm": 1.7635507683317195, "language_loss": 0.70795345, "learning_rate": 3.815752124411081e-06, "loss": 0.73015249, "num_input_tokens_seen": 49615850, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.109375, "step": 2292, "time_per_iteration": 2.704540491104126 }, { "auxiliary_loss_clip": 0.01172795, "auxiliary_loss_mlp": 0.01044432, "balance_loss_clip": 1.0255487, "balance_loss_mlp": 1.05415452, "epoch": 0.13786261836765368, "flos": 14021537080320.0, "grad_norm": 2.864504537432699, "language_loss": 0.80218923, "learning_rate": 3.815593694285204e-06, "loss": 0.82436144, "num_input_tokens_seen": 49631860, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.09375, "step": 2293, "time_per_iteration": 2.515890598297119 }, { "auxiliary_loss_clip": 0.01171573, "auxiliary_loss_mlp": 0.01045329, "balance_loss_clip": 1.02527761, "balance_loss_mlp": 1.05106151, "epoch": 0.13792274162032167, "flos": 28878679226880.0, "grad_norm": 2.1350279427451144, "language_loss": 0.78340834, "learning_rate": 3.815435199365459e-06, "loss": 0.8055774, "num_input_tokens_seen": 49652145, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.1171875, "step": 2294, "time_per_iteration": 2.6504123210906982 }, { "auxiliary_loss_clip": 0.01188843, "auxiliary_loss_mlp": 0.0104495, "balance_loss_clip": 1.0281893, "balance_loss_mlp": 1.05326605, "epoch": 0.13798286487298964, "flos": 21141150808320.0, "grad_norm": 2.9349925881176535, "language_loss": 0.79810584, "learning_rate": 3.815276639657501e-06, "loss": 0.82044381, "num_input_tokens_seen": 49669880, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 1.078125, "step": 2295, "time_per_iteration": 2.7041075229644775 }, { "auxiliary_loss_clip": 0.01168391, "auxiliary_loss_mlp": 0.01047415, "balance_loss_clip": 1.02801931, "balance_loss_mlp": 1.04994655, "epoch": 0.1380429881256576, "flos": 22487477402880.0, "grad_norm": 1.8898191748559021, "language_loss": 0.78387135, "learning_rate": 3.815118015166989e-06, "loss": 0.80602944, "num_input_tokens_seen": 49687255, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.09375, "step": 2296, "time_per_iteration": 2.6508748531341553 }, { "auxiliary_loss_clip": 0.01175004, "auxiliary_loss_mlp": 0.01060093, "balance_loss_clip": 1.03967297, "balance_loss_mlp": 1.05588007, "epoch": 0.13810311137832557, "flos": 21393674398080.0, "grad_norm": 2.160150502628085, "language_loss": 0.78536433, "learning_rate": 3.814959325899584e-06, "loss": 0.8077153, "num_input_tokens_seen": 49706650, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.1015625, "step": 2297, "time_per_iteration": 2.629232406616211 }, { "auxiliary_loss_clip": 0.01197905, "auxiliary_loss_mlp": 0.01047293, "balance_loss_clip": 1.02851748, "balance_loss_mlp": 1.05372262, "epoch": 0.13816323463099353, "flos": 25989844930560.0, "grad_norm": 2.0751426498113874, "language_loss": 0.69538671, "learning_rate": 3.81480057186095e-06, "loss": 0.7178387, "num_input_tokens_seen": 49725715, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.078125, "step": 2298, "time_per_iteration": 2.7050013542175293 }, { "auxiliary_loss_clip": 0.01183337, "auxiliary_loss_mlp": 0.01048304, "balance_loss_clip": 1.02936125, "balance_loss_mlp": 1.05335712, "epoch": 0.1382233578836615, "flos": 19244313394560.0, "grad_norm": 1.8622637276380296, "language_loss": 0.86646223, "learning_rate": 3.814641753056751e-06, "loss": 0.88877857, "num_input_tokens_seen": 49744710, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.125, "step": 2299, "time_per_iteration": 2.606727361679077 }, { "auxiliary_loss_clip": 0.01161213, "auxiliary_loss_mlp": 0.01051808, "balance_loss_clip": 1.03196013, "balance_loss_mlp": 1.05105472, "epoch": 0.1382834811363295, "flos": 25666290195840.0, "grad_norm": 2.7204825393154772, "language_loss": 0.75398338, "learning_rate": 3.8144828694926565e-06, "loss": 0.77611363, "num_input_tokens_seen": 49764300, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.1015625, "step": 2300, "time_per_iteration": 2.6301910877227783 }, { "auxiliary_loss_clip": 0.01189587, "auxiliary_loss_mlp": 0.01047232, "balance_loss_clip": 1.02837336, "balance_loss_mlp": 1.05421269, "epoch": 0.13834360438899745, "flos": 19784193788160.0, "grad_norm": 2.836270547786878, "language_loss": 0.8187238, "learning_rate": 3.814323921174335e-06, "loss": 0.84109193, "num_input_tokens_seen": 49778380, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.078125, "step": 2301, "time_per_iteration": 2.6780667304992676 }, { "auxiliary_loss_clip": 0.01175512, "auxiliary_loss_mlp": 0.01292703, "balance_loss_clip": 1.02460074, "balance_loss_mlp": 1.05058169, "epoch": 0.13840372764166542, "flos": 26651858544000.0, "grad_norm": 1.9319648842593014, "language_loss": 0.85972863, "learning_rate": 3.81416490810746e-06, "loss": 0.8844108, "num_input_tokens_seen": 49797460, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0625, "step": 2302, "time_per_iteration": 4.15455174446106 }, { "auxiliary_loss_clip": 0.01069568, "auxiliary_loss_mlp": 0.01004181, "balance_loss_clip": 1.00160587, "balance_loss_mlp": 1.02170825, "epoch": 0.13846385089433338, "flos": 70510998286080.0, "grad_norm": 0.7533626107182264, "language_loss": 0.65664709, "learning_rate": 3.814005830297706e-06, "loss": 0.67738456, "num_input_tokens_seen": 49868005, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.390625, "step": 2303, "time_per_iteration": 6.299836874008179 }, { "auxiliary_loss_clip": 0.01165314, "auxiliary_loss_mlp": 0.01047084, "balance_loss_clip": 1.02914333, "balance_loss_mlp": 1.05016398, "epoch": 0.13852397414700135, "flos": 17348732956800.0, "grad_norm": 1.9343479202848999, "language_loss": 0.78367686, "learning_rate": 3.81384668775075e-06, "loss": 0.80580091, "num_input_tokens_seen": 49885825, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0625, "step": 2304, "time_per_iteration": 2.594891309738159 }, { "auxiliary_loss_clip": 0.01173484, "auxiliary_loss_mlp": 0.01047861, "balance_loss_clip": 1.02901387, "balance_loss_mlp": 1.05367529, "epoch": 0.13858409739966931, "flos": 21543781334400.0, "grad_norm": 1.6568744745747441, "language_loss": 0.77820158, "learning_rate": 3.8136874804722724e-06, "loss": 0.80041504, "num_input_tokens_seen": 49905975, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.109375, "step": 2305, "time_per_iteration": 2.6944892406463623 }, { "auxiliary_loss_clip": 0.01209743, "auxiliary_loss_mlp": 0.01049643, "balance_loss_clip": 1.03141594, "balance_loss_mlp": 1.04917765, "epoch": 0.13864422065233728, "flos": 21579907438080.0, "grad_norm": 2.3405908785785066, "language_loss": 0.87845778, "learning_rate": 3.813528208467953e-06, "loss": 0.90105164, "num_input_tokens_seen": 49925800, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.0546875, "step": 2306, "time_per_iteration": 4.511924743652344 }, { "auxiliary_loss_clip": 0.01057727, "auxiliary_loss_mlp": 0.01001331, "balance_loss_clip": 0.99881566, "balance_loss_mlp": 1.01988149, "epoch": 0.13870434390500527, "flos": 53371156872960.0, "grad_norm": 0.8839214214793798, "language_loss": 0.58983701, "learning_rate": 3.813368871743477e-06, "loss": 0.61042756, "num_input_tokens_seen": 49977620, "router_z_loss_clip": 0.02514648, "router_z_loss_mlp": 0.37890625, "step": 2307, "time_per_iteration": 3.1861369609832764 }, { "auxiliary_loss_clip": 0.01165362, "auxiliary_loss_mlp": 0.01295092, "balance_loss_clip": 1.0268538, "balance_loss_mlp": 1.05246842, "epoch": 0.13876446715767324, "flos": 22565906749440.0, "grad_norm": 1.7390348848816117, "language_loss": 0.7940731, "learning_rate": 3.813209470304531e-06, "loss": 0.81867766, "num_input_tokens_seen": 49996650, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.125, "step": 2308, "time_per_iteration": 2.7434346675872803 }, { "auxiliary_loss_clip": 0.01187081, "auxiliary_loss_mlp": 0.01039068, "balance_loss_clip": 1.02010155, "balance_loss_mlp": 1.05262899, "epoch": 0.1388245904103412, "flos": 20705231352960.0, "grad_norm": 3.063461491110337, "language_loss": 0.77998608, "learning_rate": 3.813050004156802e-06, "loss": 0.80224752, "num_input_tokens_seen": 50015640, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0703125, "step": 2309, "time_per_iteration": 2.5903265476226807 }, { "auxiliary_loss_clip": 0.01192717, "auxiliary_loss_mlp": 0.01046891, "balance_loss_clip": 1.0274477, "balance_loss_mlp": 1.0530622, "epoch": 0.13888471366300917, "flos": 20554729367040.0, "grad_norm": 1.9654332278310358, "language_loss": 0.67932814, "learning_rate": 3.812890473305983e-06, "loss": 0.70172423, "num_input_tokens_seen": 50033500, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.125, "step": 2310, "time_per_iteration": 2.7983179092407227 }, { "auxiliary_loss_clip": 0.01161138, "auxiliary_loss_mlp": 0.01059026, "balance_loss_clip": 1.03841519, "balance_loss_mlp": 1.05181956, "epoch": 0.13894483691567713, "flos": 13838033473920.0, "grad_norm": 2.007813508339734, "language_loss": 0.83937204, "learning_rate": 3.812730877757766e-06, "loss": 0.8615737, "num_input_tokens_seen": 50050075, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.09375, "step": 2311, "time_per_iteration": 2.6287686824798584 }, { "auxiliary_loss_clip": 0.01167247, "auxiliary_loss_mlp": 0.01044067, "balance_loss_clip": 1.02519619, "balance_loss_mlp": 1.05380678, "epoch": 0.1390049601683451, "flos": 28031186759040.0, "grad_norm": 2.429919998296328, "language_loss": 0.81629628, "learning_rate": 3.812571217517847e-06, "loss": 0.83840942, "num_input_tokens_seen": 50070080, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.1328125, "step": 2312, "time_per_iteration": 2.8802073001861572 }, { "auxiliary_loss_clip": 0.01192779, "auxiliary_loss_mlp": 0.01294878, "balance_loss_clip": 1.02809644, "balance_loss_mlp": 1.05440259, "epoch": 0.13906508342101306, "flos": 26756860976640.0, "grad_norm": 2.1843145437728215, "language_loss": 0.86795902, "learning_rate": 3.8124114925919234e-06, "loss": 0.89283556, "num_input_tokens_seen": 50090040, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.109375, "step": 2313, "time_per_iteration": 2.8342466354370117 }, { "auxiliary_loss_clip": 0.01161191, "auxiliary_loss_mlp": 0.0105084, "balance_loss_clip": 1.03070557, "balance_loss_mlp": 1.05283976, "epoch": 0.13912520667368106, "flos": 24535104111360.0, "grad_norm": 2.044633459560367, "language_loss": 0.79646283, "learning_rate": 3.812251702985696e-06, "loss": 0.81858313, "num_input_tokens_seen": 50110595, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0859375, "step": 2314, "time_per_iteration": 2.6692774295806885 }, { "auxiliary_loss_clip": 0.01186509, "auxiliary_loss_mlp": 0.01052529, "balance_loss_clip": 1.03363431, "balance_loss_mlp": 1.05161381, "epoch": 0.13918532992634902, "flos": 19383215287680.0, "grad_norm": 3.452175594917151, "language_loss": 0.85026395, "learning_rate": 3.8120918487048673e-06, "loss": 0.87265432, "num_input_tokens_seen": 50125430, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.078125, "step": 2315, "time_per_iteration": 2.647001028060913 }, { "auxiliary_loss_clip": 0.01170768, "auxiliary_loss_mlp": 0.01053757, "balance_loss_clip": 1.03369403, "balance_loss_mlp": 1.05340362, "epoch": 0.139245453179017, "flos": 21323756574720.0, "grad_norm": 2.090532166716492, "language_loss": 0.77296001, "learning_rate": 3.8119319297551417e-06, "loss": 0.7952053, "num_input_tokens_seen": 50144120, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.078125, "step": 2316, "time_per_iteration": 2.5702524185180664 }, { "auxiliary_loss_clip": 0.01158035, "auxiliary_loss_mlp": 0.01305856, "balance_loss_clip": 1.03705096, "balance_loss_mlp": 1.0512948, "epoch": 0.13930557643168495, "flos": 19500607912320.0, "grad_norm": 1.6755915400318468, "language_loss": 0.76955903, "learning_rate": 3.811771946142226e-06, "loss": 0.79419792, "num_input_tokens_seen": 50162500, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0703125, "step": 2317, "time_per_iteration": 2.569352626800537 }, { "auxiliary_loss_clip": 0.01182751, "auxiliary_loss_mlp": 0.01059204, "balance_loss_clip": 1.03846169, "balance_loss_mlp": 1.0542742, "epoch": 0.13936569968435292, "flos": 25410821690880.0, "grad_norm": 1.953201660935717, "language_loss": 0.80202454, "learning_rate": 3.8116118978718298e-06, "loss": 0.82444406, "num_input_tokens_seen": 50182415, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.109375, "step": 2318, "time_per_iteration": 2.565631151199341 }, { "auxiliary_loss_clip": 0.01074868, "auxiliary_loss_mlp": 0.01031778, "balance_loss_clip": 1.02928662, "balance_loss_mlp": 1.01937866, "epoch": 0.13942582293702088, "flos": 70771063731840.0, "grad_norm": 0.8561882653619879, "language_loss": 0.58987474, "learning_rate": 3.811451784949665e-06, "loss": 0.61094129, "num_input_tokens_seen": 50245160, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.37109375, "step": 2319, "time_per_iteration": 3.2062184810638428 }, { "auxiliary_loss_clip": 0.01192487, "auxiliary_loss_mlp": 0.01053752, "balance_loss_clip": 1.03421366, "balance_loss_mlp": 1.05263305, "epoch": 0.13948594618968888, "flos": 35590885920000.0, "grad_norm": 2.762730767329365, "language_loss": 0.64998007, "learning_rate": 3.811291607381446e-06, "loss": 0.67244244, "num_input_tokens_seen": 50268215, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.125, "step": 2320, "time_per_iteration": 2.6803741455078125 }, { "auxiliary_loss_clip": 0.01194052, "auxiliary_loss_mlp": 0.01044767, "balance_loss_clip": 1.0254674, "balance_loss_mlp": 1.05167735, "epoch": 0.13954606944235684, "flos": 21105204272640.0, "grad_norm": 1.7398035565894732, "language_loss": 0.70556998, "learning_rate": 3.8111313651728887e-06, "loss": 0.7279582, "num_input_tokens_seen": 50288575, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0625, "step": 2321, "time_per_iteration": 2.6781771183013916 }, { "auxiliary_loss_clip": 0.01187096, "auxiliary_loss_mlp": 0.01052963, "balance_loss_clip": 1.03206563, "balance_loss_mlp": 1.05024242, "epoch": 0.1396061926950248, "flos": 25044425009280.0, "grad_norm": 1.9500927828673695, "language_loss": 0.85716009, "learning_rate": 3.810971058329712e-06, "loss": 0.87956071, "num_input_tokens_seen": 50308735, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.09375, "step": 2322, "time_per_iteration": 2.593452215194702 }, { "auxiliary_loss_clip": 0.01178704, "auxiliary_loss_mlp": 0.01050499, "balance_loss_clip": 1.03236771, "balance_loss_mlp": 1.04836714, "epoch": 0.13966631594769277, "flos": 37634023428480.0, "grad_norm": 1.6775205287911823, "language_loss": 0.66890788, "learning_rate": 3.810810686857636e-06, "loss": 0.6911999, "num_input_tokens_seen": 50331025, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.03125, "step": 2323, "time_per_iteration": 2.712106943130493 }, { "auxiliary_loss_clip": 0.01196543, "auxiliary_loss_mlp": 0.01047481, "balance_loss_clip": 1.02717936, "balance_loss_mlp": 1.05445313, "epoch": 0.13972643920036074, "flos": 16690993061760.0, "grad_norm": 1.86888999179416, "language_loss": 0.88681853, "learning_rate": 3.8106502507623847e-06, "loss": 0.90925878, "num_input_tokens_seen": 50349725, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1484375, "step": 2324, "time_per_iteration": 2.6261417865753174 }, { "auxiliary_loss_clip": 0.0117068, "auxiliary_loss_mlp": 0.01052407, "balance_loss_clip": 1.03210545, "balance_loss_mlp": 1.05101299, "epoch": 0.1397865624530287, "flos": 23331055288320.0, "grad_norm": 3.1295942222902053, "language_loss": 0.71417642, "learning_rate": 3.810489750049684e-06, "loss": 0.73640728, "num_input_tokens_seen": 50367965, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.109375, "step": 2325, "time_per_iteration": 2.5723471641540527 }, { "auxiliary_loss_clip": 0.01182051, "auxiliary_loss_mlp": 0.01299419, "balance_loss_clip": 1.03130364, "balance_loss_mlp": 1.05499268, "epoch": 0.13984668570569667, "flos": 22778317825920.0, "grad_norm": 2.0365408228045894, "language_loss": 0.81891781, "learning_rate": 3.810329184725261e-06, "loss": 0.84373254, "num_input_tokens_seen": 50385605, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0859375, "step": 2326, "time_per_iteration": 2.6797432899475098 }, { "auxiliary_loss_clip": 0.01165761, "auxiliary_loss_mlp": 0.01041994, "balance_loss_clip": 1.02417195, "balance_loss_mlp": 1.05319452, "epoch": 0.13990680895836466, "flos": 19464553635840.0, "grad_norm": 1.7975480383900204, "language_loss": 0.8881796, "learning_rate": 3.8101685547948456e-06, "loss": 0.9102571, "num_input_tokens_seen": 50403985, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.03125, "step": 2327, "time_per_iteration": 2.590061902999878 }, { "auxiliary_loss_clip": 0.01184292, "auxiliary_loss_mlp": 0.01050671, "balance_loss_clip": 1.03094196, "balance_loss_mlp": 1.05153823, "epoch": 0.13996693221103262, "flos": 20303283185280.0, "grad_norm": 2.0271892931556414, "language_loss": 0.84878135, "learning_rate": 3.8100078602641714e-06, "loss": 0.87113094, "num_input_tokens_seen": 50421590, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0625, "step": 2328, "time_per_iteration": 2.615811347961426 }, { "auxiliary_loss_clip": 0.01168803, "auxiliary_loss_mlp": 0.01046101, "balance_loss_clip": 1.02612162, "balance_loss_mlp": 1.05124998, "epoch": 0.1400270554637006, "flos": 26617707688320.0, "grad_norm": 1.5882318108414992, "language_loss": 0.73596328, "learning_rate": 3.8098471011389723e-06, "loss": 0.75811231, "num_input_tokens_seen": 50443945, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0859375, "step": 2329, "time_per_iteration": 2.6640753746032715 }, { "auxiliary_loss_clip": 0.01168677, "auxiliary_loss_mlp": 0.01047959, "balance_loss_clip": 1.02868295, "balance_loss_mlp": 1.04987347, "epoch": 0.14008717871636855, "flos": 19391475415680.0, "grad_norm": 2.170953140084605, "language_loss": 0.7843821, "learning_rate": 3.809686277424986e-06, "loss": 0.80654848, "num_input_tokens_seen": 50462065, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.09375, "step": 2330, "time_per_iteration": 2.6507039070129395 }, { "auxiliary_loss_clip": 0.01174325, "auxiliary_loss_mlp": 0.01046772, "balance_loss_clip": 1.02793694, "balance_loss_mlp": 1.04937744, "epoch": 0.14014730196903652, "flos": 15304266645120.0, "grad_norm": 2.3401082551116854, "language_loss": 0.71702933, "learning_rate": 3.809525389127951e-06, "loss": 0.73924029, "num_input_tokens_seen": 50479565, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0703125, "step": 2331, "time_per_iteration": 2.5268630981445312 }, { "auxiliary_loss_clip": 0.01155269, "auxiliary_loss_mlp": 0.01050258, "balance_loss_clip": 1.03108907, "balance_loss_mlp": 1.05370808, "epoch": 0.14020742522170448, "flos": 14939701557120.0, "grad_norm": 1.7445012810074405, "language_loss": 0.72553051, "learning_rate": 3.8093644362536094e-06, "loss": 0.74758577, "num_input_tokens_seen": 50497305, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.015625, "step": 2332, "time_per_iteration": 2.5559170246124268 }, { "auxiliary_loss_clip": 0.01069824, "auxiliary_loss_mlp": 0.01027874, "balance_loss_clip": 1.02549005, "balance_loss_mlp": 1.02335978, "epoch": 0.14026754847437245, "flos": 48824580044160.0, "grad_norm": 0.8327717265592288, "language_loss": 0.56111151, "learning_rate": 3.809203418807706e-06, "loss": 0.58208853, "num_input_tokens_seen": 50549735, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.375, "step": 2333, "time_per_iteration": 2.9955294132232666 }, { "auxiliary_loss_clip": 0.01184627, "auxiliary_loss_mlp": 0.01042224, "balance_loss_clip": 1.02323449, "balance_loss_mlp": 1.0513761, "epoch": 0.14032767172704044, "flos": 25773267876480.0, "grad_norm": 1.9515190593266545, "language_loss": 0.82625937, "learning_rate": 3.8090423367959862e-06, "loss": 0.84852785, "num_input_tokens_seen": 50570100, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0625, "step": 2334, "time_per_iteration": 2.6771252155303955 }, { "auxiliary_loss_clip": 0.01182938, "auxiliary_loss_mlp": 0.01047766, "balance_loss_clip": 1.02930045, "balance_loss_mlp": 1.04972541, "epoch": 0.1403877949797084, "flos": 21216312017280.0, "grad_norm": 1.7816364828438152, "language_loss": 0.80773813, "learning_rate": 3.8088811902241984e-06, "loss": 0.83004516, "num_input_tokens_seen": 50589185, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0625, "step": 2335, "time_per_iteration": 2.5538365840911865 }, { "auxiliary_loss_clip": 0.01200186, "auxiliary_loss_mlp": 0.01049066, "balance_loss_clip": 1.02690482, "balance_loss_mlp": 1.05309176, "epoch": 0.14044791823237637, "flos": 22747973811840.0, "grad_norm": 2.3761097907451045, "language_loss": 0.81913549, "learning_rate": 3.8087199790980943e-06, "loss": 0.84162802, "num_input_tokens_seen": 50609645, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.109375, "step": 2336, "time_per_iteration": 2.600363254547119 }, { "auxiliary_loss_clip": 0.01168697, "auxiliary_loss_mlp": 0.01048698, "balance_loss_clip": 1.02977967, "balance_loss_mlp": 1.05165243, "epoch": 0.14050804148504434, "flos": 22964443125120.0, "grad_norm": 1.9268983379288778, "language_loss": 0.80373263, "learning_rate": 3.8085587034234268e-06, "loss": 0.82590657, "num_input_tokens_seen": 50628385, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.078125, "step": 2337, "time_per_iteration": 2.520260810852051 }, { "auxiliary_loss_clip": 0.01165417, "auxiliary_loss_mlp": 0.01052649, "balance_loss_clip": 1.03413582, "balance_loss_mlp": 1.04870343, "epoch": 0.1405681647377123, "flos": 22200336080640.0, "grad_norm": 2.209543270145411, "language_loss": 0.79036748, "learning_rate": 3.8083973632059507e-06, "loss": 0.81254816, "num_input_tokens_seen": 50647260, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.078125, "step": 2338, "time_per_iteration": 2.6205894947052 }, { "auxiliary_loss_clip": 0.01174185, "auxiliary_loss_mlp": 0.01056529, "balance_loss_clip": 1.0364418, "balance_loss_mlp": 1.05517888, "epoch": 0.14062828799038027, "flos": 23732787974400.0, "grad_norm": 2.0551013771309665, "language_loss": 0.79961687, "learning_rate": 3.8082359584514254e-06, "loss": 0.82192403, "num_input_tokens_seen": 50666130, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1015625, "step": 2339, "time_per_iteration": 2.538625955581665 }, { "auxiliary_loss_clip": 0.01204824, "auxiliary_loss_mlp": 0.01064207, "balance_loss_clip": 1.04530036, "balance_loss_mlp": 1.0520699, "epoch": 0.14068841124304826, "flos": 39202493685120.0, "grad_norm": 1.8663529364858227, "language_loss": 0.65436339, "learning_rate": 3.8080744891656095e-06, "loss": 0.67705375, "num_input_tokens_seen": 50687440, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0703125, "step": 2340, "time_per_iteration": 2.73441743850708 }, { "auxiliary_loss_clip": 0.01165588, "auxiliary_loss_mlp": 0.01054995, "balance_loss_clip": 1.03648186, "balance_loss_mlp": 1.05211604, "epoch": 0.14074853449571623, "flos": 20192283181440.0, "grad_norm": 2.378726296596466, "language_loss": 0.78567642, "learning_rate": 3.807912955354266e-06, "loss": 0.80788231, "num_input_tokens_seen": 50704030, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.046875, "step": 2341, "time_per_iteration": 2.5692193508148193 }, { "auxiliary_loss_clip": 0.01155848, "auxiliary_loss_mlp": 0.01055599, "balance_loss_clip": 1.03684783, "balance_loss_mlp": 1.05083811, "epoch": 0.1408086577483842, "flos": 18405871153920.0, "grad_norm": 2.008866449155085, "language_loss": 0.8001802, "learning_rate": 3.80775135702316e-06, "loss": 0.82229471, "num_input_tokens_seen": 50723305, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.046875, "step": 2342, "time_per_iteration": 2.5310122966766357 }, { "auxiliary_loss_clip": 0.01165637, "auxiliary_loss_mlp": 0.01057371, "balance_loss_clip": 1.03973985, "balance_loss_mlp": 1.05398607, "epoch": 0.14086878100105216, "flos": 25264593423360.0, "grad_norm": 1.861000567468319, "language_loss": 0.77565265, "learning_rate": 3.8075896941780576e-06, "loss": 0.79788274, "num_input_tokens_seen": 50743270, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.0234375, "step": 2343, "time_per_iteration": 2.6372628211975098 }, { "auxiliary_loss_clip": 0.01074836, "auxiliary_loss_mlp": 0.01067221, "balance_loss_clip": 1.06503904, "balance_loss_mlp": 1.01920223, "epoch": 0.14092890425372012, "flos": 65978388869760.0, "grad_norm": 0.8455826939457468, "language_loss": 0.61518985, "learning_rate": 3.807427966824729e-06, "loss": 0.63661045, "num_input_tokens_seen": 50802710, "router_z_loss_clip": 0.02185059, "router_z_loss_mlp": 0.375, "step": 2344, "time_per_iteration": 4.56473445892334 }, { "auxiliary_loss_clip": 0.01161393, "auxiliary_loss_mlp": 0.01051073, "balance_loss_clip": 1.03378749, "balance_loss_mlp": 1.04889297, "epoch": 0.1409890275063881, "flos": 23694973931520.0, "grad_norm": 1.5792891945204965, "language_loss": 0.64776504, "learning_rate": 3.807266174968946e-06, "loss": 0.66988969, "num_input_tokens_seen": 50822625, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 1.0390625, "step": 2345, "time_per_iteration": 5.535008668899536 }, { "auxiliary_loss_clip": 0.0117088, "auxiliary_loss_mlp": 0.01045173, "balance_loss_clip": 1.0253489, "balance_loss_mlp": 1.04922664, "epoch": 0.14104915075905605, "flos": 23623152687360.0, "grad_norm": 2.3001287520321156, "language_loss": 0.7324785, "learning_rate": 3.8071043186164813e-06, "loss": 0.75463903, "num_input_tokens_seen": 50842330, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.125, "step": 2346, "time_per_iteration": 2.5660178661346436 }, { "auxiliary_loss_clip": 0.01169945, "auxiliary_loss_mlp": 0.01047473, "balance_loss_clip": 1.02801824, "balance_loss_mlp": 1.05190933, "epoch": 0.14110927401172405, "flos": 20595165102720.0, "grad_norm": 1.7048107406046165, "language_loss": 0.77517068, "learning_rate": 3.8069423977731123e-06, "loss": 0.7973448, "num_input_tokens_seen": 50861035, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0859375, "step": 2347, "time_per_iteration": 2.5755507946014404 }, { "auxiliary_loss_clip": 0.0115653, "auxiliary_loss_mlp": 0.01047485, "balance_loss_clip": 1.02961588, "balance_loss_mlp": 1.04943693, "epoch": 0.141169397264392, "flos": 28548049512960.0, "grad_norm": 1.7358308473647484, "language_loss": 0.76733804, "learning_rate": 3.8067804124446167e-06, "loss": 0.78937817, "num_input_tokens_seen": 50880105, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 1.078125, "step": 2348, "time_per_iteration": 3.95807147026062 }, { "auxiliary_loss_clip": 0.011672, "auxiliary_loss_mlp": 0.01042922, "balance_loss_clip": 1.02426553, "balance_loss_mlp": 1.05183077, "epoch": 0.14122952051705998, "flos": 17092258871040.0, "grad_norm": 1.7105044238928122, "language_loss": 0.86282802, "learning_rate": 3.806618362636776e-06, "loss": 0.88492918, "num_input_tokens_seen": 50897720, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0625, "step": 2349, "time_per_iteration": 2.647965669631958 }, { "auxiliary_loss_clip": 0.0116686, "auxiliary_loss_mlp": 0.01047386, "balance_loss_clip": 1.02834821, "balance_loss_mlp": 1.05367661, "epoch": 0.14128964376972794, "flos": 28946801370240.0, "grad_norm": 1.844065228945305, "language_loss": 0.88724679, "learning_rate": 3.806456248355373e-06, "loss": 0.90938926, "num_input_tokens_seen": 50918385, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.046875, "step": 2350, "time_per_iteration": 2.5913820266723633 }, { "auxiliary_loss_clip": 0.01177724, "auxiliary_loss_mlp": 0.01042361, "balance_loss_clip": 1.02204776, "balance_loss_mlp": 1.05253339, "epoch": 0.1413497670223959, "flos": 18989778643200.0, "grad_norm": 1.6307988255839154, "language_loss": 0.81162834, "learning_rate": 3.806294069606194e-06, "loss": 0.83382916, "num_input_tokens_seen": 50938270, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0703125, "step": 2351, "time_per_iteration": 2.6371679306030273 }, { "auxiliary_loss_clip": 0.0116626, "auxiliary_loss_mlp": 0.01036895, "balance_loss_clip": 1.01808381, "balance_loss_mlp": 1.04977822, "epoch": 0.14140989027506387, "flos": 29862236413440.0, "grad_norm": 3.6677565248810193, "language_loss": 0.83094096, "learning_rate": 3.806131826395025e-06, "loss": 0.85297245, "num_input_tokens_seen": 50958155, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0703125, "step": 2352, "time_per_iteration": 2.65848445892334 }, { "auxiliary_loss_clip": 0.01061989, "auxiliary_loss_mlp": 0.01007397, "balance_loss_clip": 1.00501239, "balance_loss_mlp": 1.01571441, "epoch": 0.14147001352773186, "flos": 62079532041600.0, "grad_norm": 0.903889404990372, "language_loss": 0.61951911, "learning_rate": 3.805969518727658e-06, "loss": 0.64021289, "num_input_tokens_seen": 51020705, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.37109375, "step": 2353, "time_per_iteration": 3.1193461418151855 }, { "auxiliary_loss_clip": 0.0117336, "auxiliary_loss_mlp": 0.01041275, "balance_loss_clip": 1.02285743, "balance_loss_mlp": 1.05069828, "epoch": 0.14153013678039983, "flos": 22017514832640.0, "grad_norm": 1.9266131870764829, "language_loss": 0.87103486, "learning_rate": 3.805807146609884e-06, "loss": 0.8931812, "num_input_tokens_seen": 51039995, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.046875, "step": 2354, "time_per_iteration": 2.6370675563812256 }, { "auxiliary_loss_clip": 0.0118511, "auxiliary_loss_mlp": 0.01045508, "balance_loss_clip": 1.02588582, "balance_loss_mlp": 1.05077457, "epoch": 0.1415902600330678, "flos": 19720093968000.0, "grad_norm": 2.4435030580265353, "language_loss": 0.74367821, "learning_rate": 3.8056447100474976e-06, "loss": 0.76598442, "num_input_tokens_seen": 51059075, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0703125, "step": 2355, "time_per_iteration": 2.6519546508789062 }, { "auxiliary_loss_clip": 0.01059556, "auxiliary_loss_mlp": 0.01001626, "balance_loss_clip": 0.99927807, "balance_loss_mlp": 1.01333547, "epoch": 0.14165038328573576, "flos": 65900929190400.0, "grad_norm": 0.6783639650944712, "language_loss": 0.51818389, "learning_rate": 3.8054822090462963e-06, "loss": 0.53879571, "num_input_tokens_seen": 51120380, "router_z_loss_clip": 0.0234375, "router_z_loss_mlp": 0.37109375, "step": 2356, "time_per_iteration": 3.083566904067993 }, { "auxiliary_loss_clip": 0.01156211, "auxiliary_loss_mlp": 0.01049787, "balance_loss_clip": 1.03128612, "balance_loss_mlp": 1.05023623, "epoch": 0.14171050653840372, "flos": 12130158533760.0, "grad_norm": 1.9326275031042224, "language_loss": 0.7029264, "learning_rate": 3.80531964361208e-06, "loss": 0.72498637, "num_input_tokens_seen": 51136950, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0625, "step": 2357, "time_per_iteration": 2.504070281982422 }, { "auxiliary_loss_clip": 0.01188506, "auxiliary_loss_mlp": 0.01054605, "balance_loss_clip": 1.03623533, "balance_loss_mlp": 1.05149603, "epoch": 0.1417706297910717, "flos": 20412487509120.0, "grad_norm": 1.7440707142804563, "language_loss": 0.81785154, "learning_rate": 3.8051570137506485e-06, "loss": 0.84028268, "num_input_tokens_seen": 51155175, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.09375, "step": 2358, "time_per_iteration": 2.6502931118011475 }, { "auxiliary_loss_clip": 0.01179114, "auxiliary_loss_mlp": 0.0105092, "balance_loss_clip": 1.03144145, "balance_loss_mlp": 1.05045414, "epoch": 0.14183075304373965, "flos": 22380607463040.0, "grad_norm": 2.4629213651757436, "language_loss": 0.71434176, "learning_rate": 3.804994319467807e-06, "loss": 0.73664218, "num_input_tokens_seen": 51174500, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.109375, "step": 2359, "time_per_iteration": 2.5485427379608154 }, { "auxiliary_loss_clip": 0.01192565, "auxiliary_loss_mlp": 0.01297288, "balance_loss_clip": 1.02970731, "balance_loss_mlp": 1.04957247, "epoch": 0.14189087629640765, "flos": 21580913018880.0, "grad_norm": 1.9172883457601961, "language_loss": 0.75525147, "learning_rate": 3.804831560769361e-06, "loss": 0.78015006, "num_input_tokens_seen": 51194270, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0625, "step": 2360, "time_per_iteration": 2.7225019931793213 }, { "auxiliary_loss_clip": 0.01193168, "auxiliary_loss_mlp": 0.01040868, "balance_loss_clip": 1.02193737, "balance_loss_mlp": 1.05194032, "epoch": 0.1419509995490756, "flos": 20008564093440.0, "grad_norm": 2.2599091552465196, "language_loss": 0.81226975, "learning_rate": 3.8046687376611196e-06, "loss": 0.83461016, "num_input_tokens_seen": 51211850, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.046875, "step": 2361, "time_per_iteration": 2.553478240966797 }, { "auxiliary_loss_clip": 0.01163047, "auxiliary_loss_mlp": 0.01052374, "balance_loss_clip": 1.03313398, "balance_loss_mlp": 1.05034542, "epoch": 0.14201112280174358, "flos": 31941464112000.0, "grad_norm": 2.19736010024534, "language_loss": 0.7465046, "learning_rate": 3.8045058501488927e-06, "loss": 0.76865882, "num_input_tokens_seen": 51233545, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.03125, "step": 2362, "time_per_iteration": 2.669811725616455 }, { "auxiliary_loss_clip": 0.01165512, "auxiliary_loss_mlp": 0.01045218, "balance_loss_clip": 1.02628732, "balance_loss_mlp": 1.05122566, "epoch": 0.14207124605441154, "flos": 41464147582080.0, "grad_norm": 4.5120148859715545, "language_loss": 0.73873675, "learning_rate": 3.804342898238494e-06, "loss": 0.76084399, "num_input_tokens_seen": 51257615, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0546875, "step": 2363, "time_per_iteration": 2.710906505584717 }, { "auxiliary_loss_clip": 0.01183689, "auxiliary_loss_mlp": 0.01047479, "balance_loss_clip": 1.02862048, "balance_loss_mlp": 1.04989612, "epoch": 0.1421313693070795, "flos": 31905086613120.0, "grad_norm": 1.6886344977936791, "language_loss": 0.72954607, "learning_rate": 3.8041798819357386e-06, "loss": 0.75185776, "num_input_tokens_seen": 51279645, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0625, "step": 2364, "time_per_iteration": 2.8093109130859375 }, { "auxiliary_loss_clip": 0.01169661, "auxiliary_loss_mlp": 0.01047342, "balance_loss_clip": 1.03035522, "balance_loss_mlp": 1.05014753, "epoch": 0.14219149255974747, "flos": 26871165031680.0, "grad_norm": 2.0515498651705055, "language_loss": 0.90234935, "learning_rate": 3.804016801246444e-06, "loss": 0.92451942, "num_input_tokens_seen": 51299775, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 1.015625, "step": 2365, "time_per_iteration": 2.744694948196411 }, { "auxiliary_loss_clip": 0.01174517, "auxiliary_loss_mlp": 0.01050062, "balance_loss_clip": 1.0312748, "balance_loss_mlp": 1.04911041, "epoch": 0.14225161581241544, "flos": 27454426076160.0, "grad_norm": 1.9904059172833668, "language_loss": 0.65204304, "learning_rate": 3.80385365617643e-06, "loss": 0.67428881, "num_input_tokens_seen": 51319430, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0703125, "step": 2366, "time_per_iteration": 2.6830480098724365 }, { "auxiliary_loss_clip": 0.01172093, "auxiliary_loss_mlp": 0.01055379, "balance_loss_clip": 1.03553069, "balance_loss_mlp": 1.04986453, "epoch": 0.14231173906508343, "flos": 10561436881920.0, "grad_norm": 2.6536180788774395, "language_loss": 0.80260623, "learning_rate": 3.8036904467315196e-06, "loss": 0.82488096, "num_input_tokens_seen": 51336045, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0390625, "step": 2367, "time_per_iteration": 2.533663749694824 }, { "auxiliary_loss_clip": 0.0117593, "auxiliary_loss_mlp": 0.01054463, "balance_loss_clip": 1.03417397, "balance_loss_mlp": 1.05015743, "epoch": 0.1423718623177514, "flos": 28360882719360.0, "grad_norm": 1.8562373339052083, "language_loss": 0.82387531, "learning_rate": 3.8035271729175366e-06, "loss": 0.84617925, "num_input_tokens_seen": 51357030, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.078125, "step": 2368, "time_per_iteration": 2.623361110687256 }, { "auxiliary_loss_clip": 0.01183365, "auxiliary_loss_mlp": 0.01050646, "balance_loss_clip": 1.03039265, "balance_loss_mlp": 1.05028963, "epoch": 0.14243198557041936, "flos": 19354235990400.0, "grad_norm": 23.404932461268235, "language_loss": 0.86896658, "learning_rate": 3.803363834740308e-06, "loss": 0.8913067, "num_input_tokens_seen": 51374890, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.0546875, "step": 2369, "time_per_iteration": 2.655379056930542 }, { "auxiliary_loss_clip": 0.01187041, "auxiliary_loss_mlp": 0.01047955, "balance_loss_clip": 1.02872646, "balance_loss_mlp": 1.04912019, "epoch": 0.14249210882308733, "flos": 28806857982720.0, "grad_norm": 1.5871920567689957, "language_loss": 0.76019162, "learning_rate": 3.8032004322056627e-06, "loss": 0.78254163, "num_input_tokens_seen": 51398100, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.1015625, "step": 2370, "time_per_iteration": 2.6447606086730957 }, { "auxiliary_loss_clip": 0.01164361, "auxiliary_loss_mlp": 0.01308607, "balance_loss_clip": 1.04114795, "balance_loss_mlp": 1.04834723, "epoch": 0.1425522320757553, "flos": 21835016807040.0, "grad_norm": 2.735315323051177, "language_loss": 0.83048022, "learning_rate": 3.8030369653194326e-06, "loss": 0.85520995, "num_input_tokens_seen": 51418745, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0625, "step": 2371, "time_per_iteration": 2.6291451454162598 }, { "auxiliary_loss_clip": 0.01159257, "auxiliary_loss_mlp": 0.0105369, "balance_loss_clip": 1.03459215, "balance_loss_mlp": 1.05163765, "epoch": 0.14261235532842326, "flos": 17311457617920.0, "grad_norm": 2.107920286124758, "language_loss": 0.82943392, "learning_rate": 3.802873434087451e-06, "loss": 0.85156333, "num_input_tokens_seen": 51437455, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.078125, "step": 2372, "time_per_iteration": 2.5028955936431885 }, { "auxiliary_loss_clip": 0.01157764, "auxiliary_loss_mlp": 0.0104938, "balance_loss_clip": 1.03007972, "balance_loss_mlp": 1.05124784, "epoch": 0.14267247858109125, "flos": 18806741913600.0, "grad_norm": 2.7218099610654827, "language_loss": 0.84146655, "learning_rate": 3.8027098385155546e-06, "loss": 0.86353797, "num_input_tokens_seen": 51455710, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0625, "step": 2373, "time_per_iteration": 2.587700843811035 }, { "auxiliary_loss_clip": 0.01171039, "auxiliary_loss_mlp": 0.01046257, "balance_loss_clip": 1.02860248, "balance_loss_mlp": 1.048388, "epoch": 0.14273260183375922, "flos": 11358904682880.0, "grad_norm": 2.4090885038268195, "language_loss": 0.85936409, "learning_rate": 3.802546178609581e-06, "loss": 0.88153702, "num_input_tokens_seen": 51471270, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 1.046875, "step": 2374, "time_per_iteration": 2.5124881267547607 }, { "auxiliary_loss_clip": 0.01168355, "auxiliary_loss_mlp": 0.0104937, "balance_loss_clip": 1.02966499, "balance_loss_mlp": 1.04940844, "epoch": 0.14279272508642718, "flos": 27567688636800.0, "grad_norm": 1.6887938503469448, "language_loss": 0.78976661, "learning_rate": 3.8023824543753706e-06, "loss": 0.81194389, "num_input_tokens_seen": 51492705, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.09375, "step": 2375, "time_per_iteration": 2.6710104942321777 }, { "auxiliary_loss_clip": 0.01165821, "auxiliary_loss_mlp": 0.01056889, "balance_loss_clip": 1.03783941, "balance_loss_mlp": 1.05106699, "epoch": 0.14285284833909515, "flos": 16252559654400.0, "grad_norm": 2.410721899145085, "language_loss": 0.76186925, "learning_rate": 3.802218665818767e-06, "loss": 0.78409636, "num_input_tokens_seen": 51510780, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0546875, "step": 2376, "time_per_iteration": 2.4986352920532227 }, { "auxiliary_loss_clip": 0.01165225, "auxiliary_loss_mlp": 0.01048119, "balance_loss_clip": 1.02912891, "balance_loss_mlp": 1.05036509, "epoch": 0.1429129715917631, "flos": 19755609540480.0, "grad_norm": 1.9272304478439943, "language_loss": 0.92973113, "learning_rate": 3.802054812945615e-06, "loss": 0.9518646, "num_input_tokens_seen": 51531400, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0625, "step": 2377, "time_per_iteration": 2.660736560821533 }, { "auxiliary_loss_clip": 0.01174854, "auxiliary_loss_mlp": 0.01046296, "balance_loss_clip": 1.02575684, "balance_loss_mlp": 1.04943323, "epoch": 0.14297309484443108, "flos": 21137092571520.0, "grad_norm": 3.476646095247673, "language_loss": 0.91590405, "learning_rate": 3.801890895761762e-06, "loss": 0.9381156, "num_input_tokens_seen": 51548215, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0703125, "step": 2378, "time_per_iteration": 2.557577610015869 }, { "auxiliary_loss_clip": 0.01168688, "auxiliary_loss_mlp": 0.01048174, "balance_loss_clip": 1.02855206, "balance_loss_mlp": 1.04970253, "epoch": 0.14303321809709904, "flos": 23586667447680.0, "grad_norm": 1.6761025953059947, "language_loss": 0.73509574, "learning_rate": 3.8017269142730584e-06, "loss": 0.75726438, "num_input_tokens_seen": 51566820, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.09375, "step": 2379, "time_per_iteration": 2.5854222774505615 }, { "auxiliary_loss_clip": 0.01181313, "auxiliary_loss_mlp": 0.01057965, "balance_loss_clip": 1.03875995, "balance_loss_mlp": 1.04797339, "epoch": 0.14309334134976703, "flos": 15888281875200.0, "grad_norm": 1.988882958842535, "language_loss": 0.78738308, "learning_rate": 3.801562868485355e-06, "loss": 0.80977583, "num_input_tokens_seen": 51585075, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0625, "step": 2380, "time_per_iteration": 2.5318422317504883 }, { "auxiliary_loss_clip": 0.01185851, "auxiliary_loss_mlp": 0.01053365, "balance_loss_clip": 1.03424406, "balance_loss_mlp": 1.05152464, "epoch": 0.143153464602435, "flos": 16325601960960.0, "grad_norm": 2.563137319456023, "language_loss": 0.8797096, "learning_rate": 3.801398758404508e-06, "loss": 0.90210176, "num_input_tokens_seen": 51603185, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0703125, "step": 2381, "time_per_iteration": 2.6291444301605225 }, { "auxiliary_loss_clip": 0.01173697, "auxiliary_loss_mlp": 0.01049557, "balance_loss_clip": 1.03045917, "balance_loss_mlp": 1.05034971, "epoch": 0.14321358785510296, "flos": 17092079303040.0, "grad_norm": 2.678712071828196, "language_loss": 0.76311851, "learning_rate": 3.801234584036372e-06, "loss": 0.78535104, "num_input_tokens_seen": 51620880, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.046875, "step": 2382, "time_per_iteration": 2.505507707595825 }, { "auxiliary_loss_clip": 0.01181043, "auxiliary_loss_mlp": 0.01055793, "balance_loss_clip": 1.03638589, "balance_loss_mlp": 1.04718757, "epoch": 0.14327371110777093, "flos": 26322916769280.0, "grad_norm": 2.570371056998489, "language_loss": 0.76434082, "learning_rate": 3.801070345386808e-06, "loss": 0.78670919, "num_input_tokens_seen": 51640170, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0703125, "step": 2383, "time_per_iteration": 2.744020938873291 }, { "auxiliary_loss_clip": 0.01186011, "auxiliary_loss_mlp": 0.01051827, "balance_loss_clip": 1.03146577, "balance_loss_mlp": 1.05030537, "epoch": 0.1433338343604389, "flos": 18076462502400.0, "grad_norm": 2.5249439422004967, "language_loss": 0.87374818, "learning_rate": 3.8009060424616757e-06, "loss": 0.89612663, "num_input_tokens_seen": 51656580, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.078125, "step": 2384, "time_per_iteration": 2.5272932052612305 }, { "auxiliary_loss_clip": 0.011722, "auxiliary_loss_mlp": 0.01049605, "balance_loss_clip": 1.02835023, "balance_loss_mlp": 1.05174136, "epoch": 0.14339395761310686, "flos": 15522783033600.0, "grad_norm": 2.5007304758113977, "language_loss": 0.79341686, "learning_rate": 3.800741675266839e-06, "loss": 0.81563491, "num_input_tokens_seen": 51674645, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.109375, "step": 2385, "time_per_iteration": 4.013274908065796 }, { "auxiliary_loss_clip": 0.0116648, "auxiliary_loss_mlp": 0.01045158, "balance_loss_clip": 1.02576292, "balance_loss_mlp": 1.04905045, "epoch": 0.14345408086577485, "flos": 28548767784960.0, "grad_norm": 1.9285026953544973, "language_loss": 0.74995261, "learning_rate": 3.8005772438081645e-06, "loss": 0.77206898, "num_input_tokens_seen": 51695770, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0859375, "step": 2386, "time_per_iteration": 4.163359642028809 }, { "auxiliary_loss_clip": 0.01157861, "auxiliary_loss_mlp": 0.01045201, "balance_loss_clip": 1.02600789, "balance_loss_mlp": 1.05087543, "epoch": 0.14351420411844282, "flos": 20230061310720.0, "grad_norm": 2.349037984185374, "language_loss": 0.78394008, "learning_rate": 3.80041274809152e-06, "loss": 0.80597067, "num_input_tokens_seen": 51714165, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0703125, "step": 2387, "time_per_iteration": 4.019424915313721 }, { "auxiliary_loss_clip": 0.01175354, "auxiliary_loss_mlp": 0.01050574, "balance_loss_clip": 1.03030837, "balance_loss_mlp": 1.05052495, "epoch": 0.14357432737111078, "flos": 19865029345920.0, "grad_norm": 2.1072385907461264, "language_loss": 0.81730175, "learning_rate": 3.8002481881227753e-06, "loss": 0.8395611, "num_input_tokens_seen": 51734440, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0625, "step": 2388, "time_per_iteration": 2.5985500812530518 }, { "auxiliary_loss_clip": 0.01169592, "auxiliary_loss_mlp": 0.01042781, "balance_loss_clip": 1.02394557, "balance_loss_mlp": 1.0521127, "epoch": 0.14363445062377875, "flos": 28256814040320.0, "grad_norm": 2.664462063692543, "language_loss": 0.82240033, "learning_rate": 3.8000835639078038e-06, "loss": 0.84452403, "num_input_tokens_seen": 51753730, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0859375, "step": 2389, "time_per_iteration": 2.628760814666748 }, { "auxiliary_loss_clip": 0.01179603, "auxiliary_loss_mlp": 0.0104882, "balance_loss_clip": 1.02897143, "balance_loss_mlp": 1.05036223, "epoch": 0.1436945738764467, "flos": 18186672407040.0, "grad_norm": 2.5702340941916537, "language_loss": 0.82431179, "learning_rate": 3.79991887545248e-06, "loss": 0.846596, "num_input_tokens_seen": 51771195, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.109375, "step": 2390, "time_per_iteration": 4.501373767852783 }, { "auxiliary_loss_clip": 0.01165248, "auxiliary_loss_mlp": 0.01047558, "balance_loss_clip": 1.02897382, "balance_loss_mlp": 1.0489862, "epoch": 0.14375469712911468, "flos": 27307910499840.0, "grad_norm": 2.0899571405232376, "language_loss": 0.74768484, "learning_rate": 3.799754122762682e-06, "loss": 0.76981288, "num_input_tokens_seen": 51792290, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0703125, "step": 2391, "time_per_iteration": 2.5724174976348877 }, { "auxiliary_loss_clip": 0.01080034, "auxiliary_loss_mlp": 0.01027788, "balance_loss_clip": 1.02559459, "balance_loss_mlp": 1.01543117, "epoch": 0.14381482038178264, "flos": 56891445287040.0, "grad_norm": 0.8734036007470528, "language_loss": 0.6183446, "learning_rate": 3.7995893058442886e-06, "loss": 0.63942283, "num_input_tokens_seen": 51843675, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.375, "step": 2392, "time_per_iteration": 3.048401117324829 }, { "auxiliary_loss_clip": 0.01170991, "auxiliary_loss_mlp": 0.01050053, "balance_loss_clip": 1.02912033, "balance_loss_mlp": 1.04945028, "epoch": 0.14387494363445064, "flos": 14282177143680.0, "grad_norm": 3.212100581448318, "language_loss": 0.76531601, "learning_rate": 3.7994244247031814e-06, "loss": 0.78752649, "num_input_tokens_seen": 51860285, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.125, "step": 2393, "time_per_iteration": 2.4671285152435303 }, { "auxiliary_loss_clip": 0.01159863, "auxiliary_loss_mlp": 0.01050223, "balance_loss_clip": 1.03123271, "balance_loss_mlp": 1.05165219, "epoch": 0.1439350668871186, "flos": 26761493831040.0, "grad_norm": 2.2453796392869347, "language_loss": 0.76193988, "learning_rate": 3.799259479345246e-06, "loss": 0.78404069, "num_input_tokens_seen": 51880105, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.078125, "step": 2394, "time_per_iteration": 2.5990824699401855 }, { "auxiliary_loss_clip": 0.0115839, "auxiliary_loss_mlp": 0.01048162, "balance_loss_clip": 1.02881444, "balance_loss_mlp": 1.05019021, "epoch": 0.14399519013978657, "flos": 40700040537600.0, "grad_norm": 1.5117337951064185, "language_loss": 0.86062968, "learning_rate": 3.799094469776367e-06, "loss": 0.8826952, "num_input_tokens_seen": 51905175, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0859375, "step": 2395, "time_per_iteration": 2.692499876022339 }, { "auxiliary_loss_clip": 0.01184432, "auxiliary_loss_mlp": 0.01047804, "balance_loss_clip": 1.02856374, "balance_loss_mlp": 1.05164039, "epoch": 0.14405531339245453, "flos": 20557530627840.0, "grad_norm": 1.6363923160504334, "language_loss": 0.834059, "learning_rate": 3.7989293960024353e-06, "loss": 0.85638136, "num_input_tokens_seen": 51924490, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0546875, "step": 2396, "time_per_iteration": 2.5981602668762207 }, { "auxiliary_loss_clip": 0.01170601, "auxiliary_loss_mlp": 0.01292156, "balance_loss_clip": 1.02521098, "balance_loss_mlp": 1.04801452, "epoch": 0.1441154366451225, "flos": 19572931946880.0, "grad_norm": 2.9820478194753766, "language_loss": 0.82498837, "learning_rate": 3.79876425802934e-06, "loss": 0.84961593, "num_input_tokens_seen": 51940490, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0390625, "step": 2397, "time_per_iteration": 2.556621789932251 }, { "auxiliary_loss_clip": 0.0115975, "auxiliary_loss_mlp": 0.01048828, "balance_loss_clip": 1.02927732, "balance_loss_mlp": 1.0500617, "epoch": 0.14417555989779046, "flos": 18515721922560.0, "grad_norm": 1.9176887512875636, "language_loss": 0.7992475, "learning_rate": 3.798599055862976e-06, "loss": 0.82133329, "num_input_tokens_seen": 51957910, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.09375, "step": 2398, "time_per_iteration": 2.6204636096954346 }, { "auxiliary_loss_clip": 0.01164602, "auxiliary_loss_mlp": 0.01051022, "balance_loss_clip": 1.03228199, "balance_loss_mlp": 1.04962587, "epoch": 0.14423568315045843, "flos": 26031681296640.0, "grad_norm": 2.6010437960341575, "language_loss": 0.64801741, "learning_rate": 3.798433789509238e-06, "loss": 0.67017365, "num_input_tokens_seen": 51978010, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0546875, "step": 2399, "time_per_iteration": 2.7101147174835205 }, { "auxiliary_loss_clip": 0.0117684, "auxiliary_loss_mlp": 0.01047989, "balance_loss_clip": 1.02849841, "balance_loss_mlp": 1.05203724, "epoch": 0.14429580640312642, "flos": 21288743792640.0, "grad_norm": 1.8065230659223002, "language_loss": 0.81977957, "learning_rate": 3.798268458974024e-06, "loss": 0.8420279, "num_input_tokens_seen": 51998515, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0625, "step": 2400, "time_per_iteration": 2.5918893814086914 }, { "auxiliary_loss_clip": 0.01179994, "auxiliary_loss_mlp": 0.01048203, "balance_loss_clip": 1.02792597, "balance_loss_mlp": 1.05159843, "epoch": 0.14435592965579438, "flos": 25627865621760.0, "grad_norm": 1.5264738465225127, "language_loss": 0.74070859, "learning_rate": 3.7981030642632348e-06, "loss": 0.76299059, "num_input_tokens_seen": 52019270, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1015625, "step": 2401, "time_per_iteration": 2.6172304153442383 }, { "auxiliary_loss_clip": 0.01174671, "auxiliary_loss_mlp": 0.01042519, "balance_loss_clip": 1.02377892, "balance_loss_mlp": 1.05069542, "epoch": 0.14441605290846235, "flos": 22965053656320.0, "grad_norm": 1.9812590812485473, "language_loss": 0.80634981, "learning_rate": 3.797937605382772e-06, "loss": 0.82852167, "num_input_tokens_seen": 52039315, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0546875, "step": 2402, "time_per_iteration": 2.5992724895477295 }, { "auxiliary_loss_clip": 0.01185995, "auxiliary_loss_mlp": 0.0104964, "balance_loss_clip": 1.03047109, "balance_loss_mlp": 1.05144107, "epoch": 0.14447617616113032, "flos": 17347655548800.0, "grad_norm": 2.426694105602848, "language_loss": 0.84423208, "learning_rate": 3.7977720823385413e-06, "loss": 0.86658841, "num_input_tokens_seen": 52056555, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.078125, "step": 2403, "time_per_iteration": 2.6145620346069336 }, { "auxiliary_loss_clip": 0.01183949, "auxiliary_loss_mlp": 0.01304858, "balance_loss_clip": 1.03561211, "balance_loss_mlp": 1.04920483, "epoch": 0.14453629941379828, "flos": 24060185464320.0, "grad_norm": 1.7550896602182418, "language_loss": 0.698138, "learning_rate": 3.797606495136449e-06, "loss": 0.72302616, "num_input_tokens_seen": 52075800, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.078125, "step": 2404, "time_per_iteration": 2.614544630050659 }, { "auxiliary_loss_clip": 0.01171093, "auxiliary_loss_mlp": 0.01046579, "balance_loss_clip": 1.02816081, "balance_loss_mlp": 1.04794908, "epoch": 0.14459642266646625, "flos": 14429554646400.0, "grad_norm": 2.773103265635537, "language_loss": 0.73329616, "learning_rate": 3.7974408437824055e-06, "loss": 0.7554729, "num_input_tokens_seen": 52092585, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0546875, "step": 2405, "time_per_iteration": 2.5853843688964844 }, { "auxiliary_loss_clip": 0.01190958, "auxiliary_loss_mlp": 0.0104458, "balance_loss_clip": 1.02606678, "balance_loss_mlp": 1.0498817, "epoch": 0.14465654591913424, "flos": 9867032179200.0, "grad_norm": 2.5824786093099905, "language_loss": 0.72704357, "learning_rate": 3.7972751282823216e-06, "loss": 0.74939895, "num_input_tokens_seen": 52108990, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.046875, "step": 2406, "time_per_iteration": 2.598599433898926 }, { "auxiliary_loss_clip": 0.01167116, "auxiliary_loss_mlp": 0.0105197, "balance_loss_clip": 1.03128743, "balance_loss_mlp": 1.04975414, "epoch": 0.1447166691718022, "flos": 24972926987520.0, "grad_norm": 2.7743141817577834, "language_loss": 0.75450206, "learning_rate": 3.797109348642111e-06, "loss": 0.77669299, "num_input_tokens_seen": 52125385, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0859375, "step": 2407, "time_per_iteration": 2.6874420642852783 }, { "auxiliary_loss_clip": 0.01154285, "auxiliary_loss_mlp": 0.0104343, "balance_loss_clip": 1.02508414, "balance_loss_mlp": 1.04813457, "epoch": 0.14477679242447017, "flos": 21908023200000.0, "grad_norm": 2.0365500227417237, "language_loss": 0.79177022, "learning_rate": 3.796943504867691e-06, "loss": 0.81374735, "num_input_tokens_seen": 52144985, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0625, "step": 2408, "time_per_iteration": 2.5763652324676514 }, { "auxiliary_loss_clip": 0.01168249, "auxiliary_loss_mlp": 0.01055864, "balance_loss_clip": 1.03540766, "balance_loss_mlp": 1.05167866, "epoch": 0.14483691567713813, "flos": 20740746925440.0, "grad_norm": 1.9274618021498537, "language_loss": 0.82223362, "learning_rate": 3.7967775969649796e-06, "loss": 0.84447479, "num_input_tokens_seen": 52163885, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.078125, "step": 2409, "time_per_iteration": 2.5797271728515625 }, { "auxiliary_loss_clip": 0.01194899, "auxiliary_loss_mlp": 0.01049108, "balance_loss_clip": 1.02995074, "balance_loss_mlp": 1.05088019, "epoch": 0.1448970389298061, "flos": 35407705536000.0, "grad_norm": 2.6997993679376657, "language_loss": 0.74800986, "learning_rate": 3.7966116249398974e-06, "loss": 0.77044988, "num_input_tokens_seen": 52184325, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.078125, "step": 2410, "time_per_iteration": 2.6736202239990234 }, { "auxiliary_loss_clip": 0.01156117, "auxiliary_loss_mlp": 0.01047088, "balance_loss_clip": 1.02912354, "balance_loss_mlp": 1.0497036, "epoch": 0.14495716218247406, "flos": 15414368808960.0, "grad_norm": 1.9997915649226807, "language_loss": 0.81270486, "learning_rate": 3.7964455887983675e-06, "loss": 0.83473688, "num_input_tokens_seen": 52202740, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0625, "step": 2411, "time_per_iteration": 2.626661539077759 }, { "auxiliary_loss_clip": 0.01175493, "auxiliary_loss_mlp": 0.01051074, "balance_loss_clip": 1.03209543, "balance_loss_mlp": 1.05041826, "epoch": 0.14501728543514203, "flos": 33693222493440.0, "grad_norm": 2.1105475717375572, "language_loss": 0.70122743, "learning_rate": 3.7962794885463165e-06, "loss": 0.7234931, "num_input_tokens_seen": 52223100, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0625, "step": 2412, "time_per_iteration": 2.7499775886535645 }, { "auxiliary_loss_clip": 0.01165728, "auxiliary_loss_mlp": 0.01041696, "balance_loss_clip": 1.02294421, "balance_loss_mlp": 1.05108237, "epoch": 0.14507740868781002, "flos": 15596112648960.0, "grad_norm": 1.9339899364163677, "language_loss": 0.76868737, "learning_rate": 3.7961133241896706e-06, "loss": 0.79076153, "num_input_tokens_seen": 52239690, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0546875, "step": 2413, "time_per_iteration": 2.648850679397583 }, { "auxiliary_loss_clip": 0.01068151, "auxiliary_loss_mlp": 0.0100851, "balance_loss_clip": 1.00615013, "balance_loss_mlp": 1.01969147, "epoch": 0.145137531940478, "flos": 66675343438080.0, "grad_norm": 0.8916349928198396, "language_loss": 0.58799911, "learning_rate": 3.79594709573436e-06, "loss": 0.60876572, "num_input_tokens_seen": 52296705, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.39453125, "step": 2414, "time_per_iteration": 3.1121222972869873 }, { "auxiliary_loss_clip": 0.01076826, "auxiliary_loss_mlp": 0.01005968, "balance_loss_clip": 1.00365567, "balance_loss_mlp": 1.01944637, "epoch": 0.14519765519314595, "flos": 67521578929920.0, "grad_norm": 0.8387400313102247, "language_loss": 0.62234092, "learning_rate": 3.7957808031863173e-06, "loss": 0.64316881, "num_input_tokens_seen": 52361830, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.39648438, "step": 2415, "time_per_iteration": 3.1850826740264893 }, { "auxiliary_loss_clip": 0.01164942, "auxiliary_loss_mlp": 0.01042393, "balance_loss_clip": 1.0235703, "balance_loss_mlp": 1.04916549, "epoch": 0.14525777844581392, "flos": 17198913329280.0, "grad_norm": 2.0521629918900257, "language_loss": 0.71524054, "learning_rate": 3.7956144465514775e-06, "loss": 0.73731387, "num_input_tokens_seen": 52379420, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0703125, "step": 2416, "time_per_iteration": 2.609467029571533 }, { "auxiliary_loss_clip": 0.01084629, "auxiliary_loss_mlp": 0.01003454, "balance_loss_clip": 1.00098598, "balance_loss_mlp": 1.01861072, "epoch": 0.14531790169848188, "flos": 65404609015680.0, "grad_norm": 0.7058622983180991, "language_loss": 0.60411167, "learning_rate": 3.7954480258357765e-06, "loss": 0.62499249, "num_input_tokens_seen": 52446290, "router_z_loss_clip": 0.0246582, "router_z_loss_mlp": 0.38671875, "step": 2417, "time_per_iteration": 3.2524092197418213 }, { "auxiliary_loss_clip": 0.01158924, "auxiliary_loss_mlp": 0.01050763, "balance_loss_clip": 1.03164172, "balance_loss_mlp": 1.04884171, "epoch": 0.14537802495114985, "flos": 32562467372160.0, "grad_norm": 9.977859779836452, "language_loss": 0.78965932, "learning_rate": 3.7952815410451542e-06, "loss": 0.81175619, "num_input_tokens_seen": 52467295, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.1015625, "step": 2418, "time_per_iteration": 2.7095894813537598 }, { "auxiliary_loss_clip": 0.01180459, "auxiliary_loss_mlp": 0.01046966, "balance_loss_clip": 1.02852392, "balance_loss_mlp": 1.04858601, "epoch": 0.1454381482038178, "flos": 20226685432320.0, "grad_norm": 1.7375909507978382, "language_loss": 0.71114695, "learning_rate": 3.7951149921855515e-06, "loss": 0.73342121, "num_input_tokens_seen": 52487295, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.046875, "step": 2419, "time_per_iteration": 2.5813848972320557 }, { "auxiliary_loss_clip": 0.01172253, "auxiliary_loss_mlp": 0.01041735, "balance_loss_clip": 1.02261388, "balance_loss_mlp": 1.0492295, "epoch": 0.1454982714564858, "flos": 22893124671360.0, "grad_norm": 2.46131321730724, "language_loss": 0.89465022, "learning_rate": 3.794948379262913e-06, "loss": 0.91679013, "num_input_tokens_seen": 52504220, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.046875, "step": 2420, "time_per_iteration": 2.625025749206543 }, { "auxiliary_loss_clip": 0.01163881, "auxiliary_loss_mlp": 0.01047879, "balance_loss_clip": 1.02967584, "balance_loss_mlp": 1.04891109, "epoch": 0.14555839470915377, "flos": 20229845829120.0, "grad_norm": 1.9539302777747414, "language_loss": 0.82459444, "learning_rate": 3.794781702283183e-06, "loss": 0.84671199, "num_input_tokens_seen": 52521900, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0625, "step": 2421, "time_per_iteration": 2.5889132022857666 }, { "auxiliary_loss_clip": 0.01192109, "auxiliary_loss_mlp": 0.01044774, "balance_loss_clip": 1.02567673, "balance_loss_mlp": 1.04774117, "epoch": 0.14561851796182174, "flos": 22236282616320.0, "grad_norm": 1.6408031460651746, "language_loss": 0.81360465, "learning_rate": 3.7946149612523116e-06, "loss": 0.8359735, "num_input_tokens_seen": 52540495, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.078125, "step": 2422, "time_per_iteration": 2.6969239711761475 }, { "auxiliary_loss_clip": 0.01082175, "auxiliary_loss_mlp": 0.01253914, "balance_loss_clip": 1.00329018, "balance_loss_mlp": 1.01655054, "epoch": 0.1456786412144897, "flos": 52636786289280.0, "grad_norm": 0.906992226877235, "language_loss": 0.63287425, "learning_rate": 3.794448156176248e-06, "loss": 0.65623516, "num_input_tokens_seen": 52603305, "router_z_loss_clip": 0.0255127, "router_z_loss_mlp": 0.3828125, "step": 2423, "time_per_iteration": 3.2651185989379883 }, { "auxiliary_loss_clip": 0.01193196, "auxiliary_loss_mlp": 0.01290788, "balance_loss_clip": 1.02533412, "balance_loss_mlp": 1.05123305, "epoch": 0.14573876446715767, "flos": 23221671396480.0, "grad_norm": 1.7734689290504284, "language_loss": 0.82270116, "learning_rate": 3.794281287060946e-06, "loss": 0.84754097, "num_input_tokens_seen": 52623435, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 1.0546875, "step": 2424, "time_per_iteration": 2.630375385284424 }, { "auxiliary_loss_clip": 0.01170995, "auxiliary_loss_mlp": 0.01042631, "balance_loss_clip": 1.02401018, "balance_loss_mlp": 1.04813981, "epoch": 0.14579888771982563, "flos": 18114384286080.0, "grad_norm": 3.1581782649389454, "language_loss": 0.78534496, "learning_rate": 3.7941143539123596e-06, "loss": 0.80748117, "num_input_tokens_seen": 52642255, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.046875, "step": 2425, "time_per_iteration": 2.6098225116729736 }, { "auxiliary_loss_clip": 0.01165068, "auxiliary_loss_mlp": 0.01047634, "balance_loss_clip": 1.02929938, "balance_loss_mlp": 1.04968238, "epoch": 0.14585901097249362, "flos": 23001107932800.0, "grad_norm": 2.4960065990709683, "language_loss": 0.83712435, "learning_rate": 3.7939473567364473e-06, "loss": 0.85925138, "num_input_tokens_seen": 52658700, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0625, "step": 2426, "time_per_iteration": 2.61405611038208 }, { "auxiliary_loss_clip": 0.01153653, "auxiliary_loss_mlp": 0.01042511, "balance_loss_clip": 1.02452242, "balance_loss_mlp": 1.04978108, "epoch": 0.1459191342251616, "flos": 21908669644800.0, "grad_norm": 2.1276659284546295, "language_loss": 0.87243897, "learning_rate": 3.793780295539169e-06, "loss": 0.89440066, "num_input_tokens_seen": 52678140, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0390625, "step": 2427, "time_per_iteration": 5.664028644561768 }, { "auxiliary_loss_clip": 0.01187096, "auxiliary_loss_mlp": 0.01043426, "balance_loss_clip": 1.02425683, "balance_loss_mlp": 1.05010509, "epoch": 0.14597925747782955, "flos": 14975504438400.0, "grad_norm": 3.019097876174485, "language_loss": 0.66522014, "learning_rate": 3.793613170326485e-06, "loss": 0.68752539, "num_input_tokens_seen": 52696825, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.09375, "step": 2428, "time_per_iteration": 3.9998157024383545 }, { "auxiliary_loss_clip": 0.01164598, "auxiliary_loss_mlp": 0.01048851, "balance_loss_clip": 1.03011143, "balance_loss_mlp": 1.04901612, "epoch": 0.14603938073049752, "flos": 21068898600960.0, "grad_norm": 1.9045003355920387, "language_loss": 0.83399218, "learning_rate": 3.793445981104362e-06, "loss": 0.85612679, "num_input_tokens_seen": 52715125, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0625, "step": 2429, "time_per_iteration": 2.6061060428619385 }, { "auxiliary_loss_clip": 0.01178591, "auxiliary_loss_mlp": 0.01044772, "balance_loss_clip": 1.02721262, "balance_loss_mlp": 1.04596853, "epoch": 0.14609950398316549, "flos": 19864777950720.0, "grad_norm": 1.7338949013010179, "language_loss": 0.7919609, "learning_rate": 3.7932787278787643e-06, "loss": 0.81419456, "num_input_tokens_seen": 52734015, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.0546875, "step": 2430, "time_per_iteration": 2.561155319213867 }, { "auxiliary_loss_clip": 0.01158534, "auxiliary_loss_mlp": 0.01046504, "balance_loss_clip": 1.02815795, "balance_loss_mlp": 1.05062461, "epoch": 0.14615962723583345, "flos": 22418852469120.0, "grad_norm": 2.3487680364185866, "language_loss": 0.82900429, "learning_rate": 3.7931114106556618e-06, "loss": 0.85105467, "num_input_tokens_seen": 52753025, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.078125, "step": 2431, "time_per_iteration": 2.6148343086242676 }, { "auxiliary_loss_clip": 0.01169972, "auxiliary_loss_mlp": 0.01051667, "balance_loss_clip": 1.03211713, "balance_loss_mlp": 1.0503881, "epoch": 0.14621975048850142, "flos": 22346241125760.0, "grad_norm": 1.9211729793775552, "language_loss": 0.78581476, "learning_rate": 3.7929440294410256e-06, "loss": 0.8080312, "num_input_tokens_seen": 52773420, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.109375, "step": 2432, "time_per_iteration": 4.145348072052002 }, { "auxiliary_loss_clip": 0.0115529, "auxiliary_loss_mlp": 0.01298877, "balance_loss_clip": 1.0306716, "balance_loss_mlp": 1.04861689, "epoch": 0.1462798737411694, "flos": 24389163152640.0, "grad_norm": 2.1575129069997825, "language_loss": 0.79705143, "learning_rate": 3.792776584240829e-06, "loss": 0.82159311, "num_input_tokens_seen": 52792870, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0703125, "step": 2433, "time_per_iteration": 2.6758079528808594 }, { "auxiliary_loss_clip": 0.01179462, "auxiliary_loss_mlp": 0.01053846, "balance_loss_clip": 1.03459322, "balance_loss_mlp": 1.0497427, "epoch": 0.14633999699383737, "flos": 19244672530560.0, "grad_norm": 1.9505249485344738, "language_loss": 0.78239191, "learning_rate": 3.7926090750610477e-06, "loss": 0.80472499, "num_input_tokens_seen": 52811615, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0234375, "step": 2434, "time_per_iteration": 2.63694429397583 }, { "auxiliary_loss_clip": 0.01081202, "auxiliary_loss_mlp": 0.01002417, "balance_loss_clip": 1.00010443, "balance_loss_mlp": 1.0160408, "epoch": 0.14640012024650534, "flos": 62660638270080.0, "grad_norm": 0.8421085223721825, "language_loss": 0.58377558, "learning_rate": 3.7924415019076593e-06, "loss": 0.60461181, "num_input_tokens_seen": 52873230, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.37890625, "step": 2435, "time_per_iteration": 3.351492404937744 }, { "auxiliary_loss_clip": 0.01148371, "auxiliary_loss_mlp": 0.01044489, "balance_loss_clip": 1.02645278, "balance_loss_mlp": 1.04581523, "epoch": 0.1464602434991733, "flos": 12276243146880.0, "grad_norm": 2.468311711514309, "language_loss": 0.883959, "learning_rate": 3.7922738647866447e-06, "loss": 0.9058876, "num_input_tokens_seen": 52889325, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0234375, "step": 2436, "time_per_iteration": 2.5990066528320312 }, { "auxiliary_loss_clip": 0.0116532, "auxiliary_loss_mlp": 0.01294756, "balance_loss_clip": 1.02797878, "balance_loss_mlp": 1.04838562, "epoch": 0.14652036675184127, "flos": 20922311197440.0, "grad_norm": 1.9982134525663349, "language_loss": 0.74464703, "learning_rate": 3.792106163703986e-06, "loss": 0.76924777, "num_input_tokens_seen": 52909705, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.078125, "step": 2437, "time_per_iteration": 2.593808889389038 }, { "auxiliary_loss_clip": 0.01156811, "auxiliary_loss_mlp": 0.01046408, "balance_loss_clip": 1.02555776, "balance_loss_mlp": 1.04884863, "epoch": 0.14658049000450923, "flos": 27703681528320.0, "grad_norm": 3.4021124725857708, "language_loss": 0.73094976, "learning_rate": 3.791938398665668e-06, "loss": 0.75298196, "num_input_tokens_seen": 52930300, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.078125, "step": 2438, "time_per_iteration": 2.7295968532562256 }, { "auxiliary_loss_clip": 0.01162508, "auxiliary_loss_mlp": 0.01038667, "balance_loss_clip": 1.0208931, "balance_loss_mlp": 1.0501585, "epoch": 0.14664061325717723, "flos": 24936513575040.0, "grad_norm": 2.680749459398635, "language_loss": 0.74036747, "learning_rate": 3.7917705696776786e-06, "loss": 0.76237923, "num_input_tokens_seen": 52949955, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.03125, "step": 2439, "time_per_iteration": 2.6206769943237305 }, { "auxiliary_loss_clip": 0.01162719, "auxiliary_loss_mlp": 0.01049092, "balance_loss_clip": 1.0301019, "balance_loss_mlp": 1.04918838, "epoch": 0.1467007365098452, "flos": 40297661406720.0, "grad_norm": 2.5677589742364906, "language_loss": 0.73959422, "learning_rate": 3.7916026767460067e-06, "loss": 0.76171231, "num_input_tokens_seen": 52972905, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.046875, "step": 2440, "time_per_iteration": 2.810427665710449 }, { "auxiliary_loss_clip": 0.01147766, "auxiliary_loss_mlp": 0.01052496, "balance_loss_clip": 1.03463876, "balance_loss_mlp": 1.04683471, "epoch": 0.14676085976251316, "flos": 26541074021760.0, "grad_norm": 1.6106721110339337, "language_loss": 0.82615495, "learning_rate": 3.791434719876643e-06, "loss": 0.84815764, "num_input_tokens_seen": 52994850, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 1.0078125, "step": 2441, "time_per_iteration": 2.6270759105682373 }, { "auxiliary_loss_clip": 0.0116882, "auxiliary_loss_mlp": 0.01046625, "balance_loss_clip": 1.02696705, "balance_loss_mlp": 1.04970169, "epoch": 0.14682098301518112, "flos": 23550110380800.0, "grad_norm": 1.977147219234409, "language_loss": 0.71308559, "learning_rate": 3.7912666990755825e-06, "loss": 0.73524004, "num_input_tokens_seen": 53014740, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.1015625, "step": 2442, "time_per_iteration": 2.673994302749634 }, { "auxiliary_loss_clip": 0.0118808, "auxiliary_loss_mlp": 0.0104311, "balance_loss_clip": 1.02375007, "balance_loss_mlp": 1.04984188, "epoch": 0.1468811062678491, "flos": 11651073909120.0, "grad_norm": 2.6594952859987986, "language_loss": 0.80714011, "learning_rate": 3.791098614348821e-06, "loss": 0.82945204, "num_input_tokens_seen": 53029780, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.109375, "step": 2443, "time_per_iteration": 2.7286136150360107 }, { "auxiliary_loss_clip": 0.01163404, "auxiliary_loss_mlp": 0.01045436, "balance_loss_clip": 1.02693486, "balance_loss_mlp": 1.04967248, "epoch": 0.14694122952051705, "flos": 23002616304000.0, "grad_norm": 2.2381270783549154, "language_loss": 0.83134794, "learning_rate": 3.790930465702358e-06, "loss": 0.85343629, "num_input_tokens_seen": 53048620, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.046875, "step": 2444, "time_per_iteration": 2.573669910430908 }, { "auxiliary_loss_clip": 0.01173663, "auxiliary_loss_mlp": 0.0104638, "balance_loss_clip": 1.02827215, "balance_loss_mlp": 1.04893494, "epoch": 0.14700135277318502, "flos": 26502972670080.0, "grad_norm": 2.3580091693226017, "language_loss": 0.71163201, "learning_rate": 3.790762253142193e-06, "loss": 0.73383248, "num_input_tokens_seen": 53070055, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0625, "step": 2445, "time_per_iteration": 2.782866954803467 }, { "auxiliary_loss_clip": 0.01050022, "auxiliary_loss_mlp": 0.01001613, "balance_loss_clip": 0.99927616, "balance_loss_mlp": 1.01373529, "epoch": 0.147061476025853, "flos": 59449434387840.0, "grad_norm": 0.8204672632766745, "language_loss": 0.63113368, "learning_rate": 3.7905939766743296e-06, "loss": 0.65165007, "num_input_tokens_seen": 53126945, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.36328125, "step": 2446, "time_per_iteration": 3.039475917816162 }, { "auxiliary_loss_clip": 0.01192886, "auxiliary_loss_mlp": 0.01041877, "balance_loss_clip": 1.02307808, "balance_loss_mlp": 1.05094218, "epoch": 0.14712159927852098, "flos": 28330897841280.0, "grad_norm": 1.7951618284343527, "language_loss": 0.74590731, "learning_rate": 3.790425636304773e-06, "loss": 0.76825494, "num_input_tokens_seen": 53149130, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0546875, "step": 2447, "time_per_iteration": 2.7250421047210693 }, { "auxiliary_loss_clip": 0.01153889, "auxiliary_loss_mlp": 0.01042125, "balance_loss_clip": 1.02400517, "balance_loss_mlp": 1.04939938, "epoch": 0.14718172253118894, "flos": 27089825074560.0, "grad_norm": 1.881459484295108, "language_loss": 0.85314012, "learning_rate": 3.7902572320395313e-06, "loss": 0.87510026, "num_input_tokens_seen": 53167120, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.046875, "step": 2448, "time_per_iteration": 2.578411340713501 }, { "auxiliary_loss_clip": 0.01050322, "auxiliary_loss_mlp": 0.01004956, "balance_loss_clip": 1.00278652, "balance_loss_mlp": 1.01406288, "epoch": 0.1472418457838569, "flos": 66706764860160.0, "grad_norm": 0.7692517760016483, "language_loss": 0.56875843, "learning_rate": 3.790088763884614e-06, "loss": 0.58931118, "num_input_tokens_seen": 53227945, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.36328125, "step": 2449, "time_per_iteration": 3.113156318664551 }, { "auxiliary_loss_clip": 0.01168306, "auxiliary_loss_mlp": 0.01045213, "balance_loss_clip": 1.02647376, "balance_loss_mlp": 1.04774237, "epoch": 0.14730196903652487, "flos": 19573578391680.0, "grad_norm": 1.8846688684176625, "language_loss": 0.84734911, "learning_rate": 3.789920231846033e-06, "loss": 0.86948431, "num_input_tokens_seen": 53244615, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0234375, "step": 2450, "time_per_iteration": 2.572768211364746 }, { "auxiliary_loss_clip": 0.011653, "auxiliary_loss_mlp": 0.01049455, "balance_loss_clip": 1.03053653, "balance_loss_mlp": 1.05015302, "epoch": 0.14736209228919284, "flos": 16071031296000.0, "grad_norm": 1.7991325800785334, "language_loss": 0.7495023, "learning_rate": 3.7897516359298034e-06, "loss": 0.77164984, "num_input_tokens_seen": 53262205, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0625, "step": 2451, "time_per_iteration": 2.6228222846984863 }, { "auxiliary_loss_clip": 0.01166171, "auxiliary_loss_mlp": 0.01040632, "balance_loss_clip": 1.02335894, "balance_loss_mlp": 1.04800701, "epoch": 0.1474222155418608, "flos": 23039460679680.0, "grad_norm": 1.6542204972887995, "language_loss": 0.82114953, "learning_rate": 3.7895829761419417e-06, "loss": 0.84321755, "num_input_tokens_seen": 53282445, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 1.0, "step": 2452, "time_per_iteration": 2.6446328163146973 }, { "auxiliary_loss_clip": 0.01207754, "auxiliary_loss_mlp": 0.01036574, "balance_loss_clip": 1.01943195, "balance_loss_mlp": 1.05045438, "epoch": 0.1474823387945288, "flos": 17018641946880.0, "grad_norm": 1.8373615508392327, "language_loss": 0.74291772, "learning_rate": 3.789414252488467e-06, "loss": 0.76536101, "num_input_tokens_seen": 53299060, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 1.03125, "step": 2453, "time_per_iteration": 2.6648290157318115 }, { "auxiliary_loss_clip": 0.01173832, "auxiliary_loss_mlp": 0.0104909, "balance_loss_clip": 1.03033793, "balance_loss_mlp": 1.049209, "epoch": 0.14754246204719676, "flos": 17895041884800.0, "grad_norm": 2.323213298514322, "language_loss": 0.75468302, "learning_rate": 3.7892454649754006e-06, "loss": 0.77691227, "num_input_tokens_seen": 53315970, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0625, "step": 2454, "time_per_iteration": 2.608333110809326 }, { "auxiliary_loss_clip": 0.01173192, "auxiliary_loss_mlp": 0.01041456, "balance_loss_clip": 1.02374125, "balance_loss_mlp": 1.04830229, "epoch": 0.14760258529986472, "flos": 13079097987840.0, "grad_norm": 1.9929010716177218, "language_loss": 0.82883179, "learning_rate": 3.789076613608766e-06, "loss": 0.85097826, "num_input_tokens_seen": 53332940, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.0703125, "step": 2455, "time_per_iteration": 2.550584077835083 }, { "auxiliary_loss_clip": 0.011942, "auxiliary_loss_mlp": 0.01044543, "balance_loss_clip": 1.02651858, "balance_loss_mlp": 1.04792655, "epoch": 0.1476627085525327, "flos": 30806399358720.0, "grad_norm": 2.0417113649020338, "language_loss": 0.83750951, "learning_rate": 3.788907698394589e-06, "loss": 0.8598969, "num_input_tokens_seen": 53353295, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.09375, "step": 2456, "time_per_iteration": 2.665705680847168 }, { "auxiliary_loss_clip": 0.01161738, "auxiliary_loss_mlp": 0.01040806, "balance_loss_clip": 1.02310348, "balance_loss_mlp": 1.04969323, "epoch": 0.14772283180520066, "flos": 21689434984320.0, "grad_norm": 2.00201737068883, "language_loss": 0.84267533, "learning_rate": 3.788738719338898e-06, "loss": 0.86470068, "num_input_tokens_seen": 53373410, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 1.03125, "step": 2457, "time_per_iteration": 2.5587546825408936 }, { "auxiliary_loss_clip": 0.01157515, "auxiliary_loss_mlp": 0.01042048, "balance_loss_clip": 1.0246439, "balance_loss_mlp": 1.04801738, "epoch": 0.14778295505786862, "flos": 18770400328320.0, "grad_norm": 1.9015994789321777, "language_loss": 0.75497019, "learning_rate": 3.788569676447723e-06, "loss": 0.77696586, "num_input_tokens_seen": 53391430, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.0, "step": 2458, "time_per_iteration": 2.5555672645568848 }, { "auxiliary_loss_clip": 0.01197282, "auxiliary_loss_mlp": 0.01047383, "balance_loss_clip": 1.02769017, "balance_loss_mlp": 1.05085516, "epoch": 0.1478430783105366, "flos": 22893555634560.0, "grad_norm": 2.570275766172785, "language_loss": 0.83173174, "learning_rate": 3.7884005697270976e-06, "loss": 0.85417831, "num_input_tokens_seen": 53409960, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.09375, "step": 2459, "time_per_iteration": 2.5905511379241943 }, { "auxiliary_loss_clip": 0.01167632, "auxiliary_loss_mlp": 0.01039664, "balance_loss_clip": 1.02272439, "balance_loss_mlp": 1.04791939, "epoch": 0.14790320156320458, "flos": 15085319293440.0, "grad_norm": 2.265485635773293, "language_loss": 0.75459933, "learning_rate": 3.7882313991830553e-06, "loss": 0.77667224, "num_input_tokens_seen": 53426160, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 1.015625, "step": 2460, "time_per_iteration": 2.6168458461761475 }, { "auxiliary_loss_clip": 0.0118673, "auxiliary_loss_mlp": 0.01043618, "balance_loss_clip": 1.0247829, "balance_loss_mlp": 1.05081916, "epoch": 0.14796332481587254, "flos": 26504768350080.0, "grad_norm": 1.8339165736967995, "language_loss": 0.81427884, "learning_rate": 3.788062164821635e-06, "loss": 0.83658236, "num_input_tokens_seen": 53448530, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.09375, "step": 2461, "time_per_iteration": 2.6295325756073 }, { "auxiliary_loss_clip": 0.01165895, "auxiliary_loss_mlp": 0.01047784, "balance_loss_clip": 1.02848446, "balance_loss_mlp": 1.0507015, "epoch": 0.1480234480685405, "flos": 17563191108480.0, "grad_norm": 2.25496492687851, "language_loss": 0.65531939, "learning_rate": 3.7878928666488755e-06, "loss": 0.6774562, "num_input_tokens_seen": 53465915, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0625, "step": 2462, "time_per_iteration": 2.5947728157043457 }, { "auxiliary_loss_clip": 0.01172792, "auxiliary_loss_mlp": 0.01048106, "balance_loss_clip": 1.02881837, "balance_loss_mlp": 1.04964185, "epoch": 0.14808357132120847, "flos": 53582203232640.0, "grad_norm": 1.9811156042534093, "language_loss": 0.67300022, "learning_rate": 3.787723504670818e-06, "loss": 0.69520921, "num_input_tokens_seen": 53496055, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.046875, "step": 2463, "time_per_iteration": 2.865058422088623 }, { "auxiliary_loss_clip": 0.01163039, "auxiliary_loss_mlp": 0.01045913, "balance_loss_clip": 1.02662444, "balance_loss_mlp": 1.04835248, "epoch": 0.14814369457387644, "flos": 19829190551040.0, "grad_norm": 2.9899017965627195, "language_loss": 0.765975, "learning_rate": 3.7875540788935076e-06, "loss": 0.78806454, "num_input_tokens_seen": 53513790, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0546875, "step": 2464, "time_per_iteration": 2.638273000717163 }, { "auxiliary_loss_clip": 0.01170661, "auxiliary_loss_mlp": 0.01046555, "balance_loss_clip": 1.02936459, "balance_loss_mlp": 1.04934561, "epoch": 0.1482038178265444, "flos": 23914962777600.0, "grad_norm": 1.9750985408735973, "language_loss": 0.79697007, "learning_rate": 3.7873845893229896e-06, "loss": 0.81914222, "num_input_tokens_seen": 53533410, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 1.03125, "step": 2465, "time_per_iteration": 2.5778510570526123 }, { "auxiliary_loss_clip": 0.011599, "auxiliary_loss_mlp": 0.0104347, "balance_loss_clip": 1.02427745, "balance_loss_mlp": 1.05119467, "epoch": 0.1482639410792124, "flos": 24170503109760.0, "grad_norm": 1.9166648483011177, "language_loss": 0.7568621, "learning_rate": 3.7872150359653143e-06, "loss": 0.7788958, "num_input_tokens_seen": 53554775, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0859375, "step": 2466, "time_per_iteration": 2.7401578426361084 }, { "auxiliary_loss_clip": 0.01096485, "auxiliary_loss_mlp": 0.01012381, "balance_loss_clip": 1.0101279, "balance_loss_mlp": 1.01479673, "epoch": 0.14832406433188036, "flos": 66191051341440.0, "grad_norm": 0.787332835509344, "language_loss": 0.60077661, "learning_rate": 3.787045418826531e-06, "loss": 0.62186527, "num_input_tokens_seen": 53609675, "router_z_loss_clip": 0.02258301, "router_z_loss_mlp": 0.36328125, "step": 2467, "time_per_iteration": 3.2523767948150635 }, { "auxiliary_loss_clip": 0.01173824, "auxiliary_loss_mlp": 0.01038946, "balance_loss_clip": 1.02019453, "balance_loss_mlp": 1.05044389, "epoch": 0.14838418758454833, "flos": 25411252654080.0, "grad_norm": 2.4125947107230497, "language_loss": 0.87583745, "learning_rate": 3.7868757379126938e-06, "loss": 0.89796519, "num_input_tokens_seen": 53626950, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.046875, "step": 2468, "time_per_iteration": 2.649707794189453 }, { "auxiliary_loss_clip": 0.01185549, "auxiliary_loss_mlp": 0.01043068, "balance_loss_clip": 1.02318406, "balance_loss_mlp": 1.0502547, "epoch": 0.1484443108372163, "flos": 23289901280640.0, "grad_norm": 2.1676064440300347, "language_loss": 0.76037347, "learning_rate": 3.7867059932298578e-06, "loss": 0.78265965, "num_input_tokens_seen": 53644200, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.078125, "step": 2469, "time_per_iteration": 5.541909456253052 }, { "auxiliary_loss_clip": 0.01180149, "auxiliary_loss_mlp": 0.01043669, "balance_loss_clip": 1.02538204, "balance_loss_mlp": 1.04886472, "epoch": 0.14850443408988426, "flos": 14647675985280.0, "grad_norm": 4.654939124872262, "language_loss": 0.75644732, "learning_rate": 3.786536184784081e-06, "loss": 0.77868545, "num_input_tokens_seen": 53659650, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.046875, "step": 2470, "time_per_iteration": 3.9267070293426514 }, { "auxiliary_loss_clip": 0.01150657, "auxiliary_loss_mlp": 0.01040986, "balance_loss_clip": 1.02173352, "balance_loss_mlp": 1.04808545, "epoch": 0.14856455734255222, "flos": 23548314700800.0, "grad_norm": 1.7490767149215671, "language_loss": 0.71955764, "learning_rate": 3.786366312581423e-06, "loss": 0.74147403, "num_input_tokens_seen": 53680275, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0234375, "step": 2471, "time_per_iteration": 2.5853681564331055 }, { "auxiliary_loss_clip": 0.01176323, "auxiliary_loss_mlp": 0.01045378, "balance_loss_clip": 1.02495706, "balance_loss_mlp": 1.04844999, "epoch": 0.1486246805952202, "flos": 18077288515200.0, "grad_norm": 2.651256119246228, "language_loss": 0.89378858, "learning_rate": 3.786196376627947e-06, "loss": 0.91600561, "num_input_tokens_seen": 53698270, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.09375, "step": 2472, "time_per_iteration": 2.613293170928955 }, { "auxiliary_loss_clip": 0.01181934, "auxiliary_loss_mlp": 0.01043047, "balance_loss_clip": 1.02402103, "balance_loss_mlp": 1.04825151, "epoch": 0.14868480384788818, "flos": 19353625459200.0, "grad_norm": 3.5193998393332513, "language_loss": 0.80307317, "learning_rate": 3.7860263769297163e-06, "loss": 0.82532299, "num_input_tokens_seen": 53716845, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0625, "step": 2473, "time_per_iteration": 4.05193829536438 }, { "auxiliary_loss_clip": 0.01167305, "auxiliary_loss_mlp": 0.01049378, "balance_loss_clip": 1.0304358, "balance_loss_mlp": 1.04948187, "epoch": 0.14874492710055615, "flos": 22200192426240.0, "grad_norm": 2.643319690061896, "language_loss": 0.77487791, "learning_rate": 3.7858563134927985e-06, "loss": 0.79704475, "num_input_tokens_seen": 53734970, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0859375, "step": 2474, "time_per_iteration": 2.6134819984436035 }, { "auxiliary_loss_clip": 0.01184216, "auxiliary_loss_mlp": 0.01049509, "balance_loss_clip": 1.03038824, "balance_loss_mlp": 1.04794049, "epoch": 0.1488050503532241, "flos": 21103444506240.0, "grad_norm": 1.9232040672701218, "language_loss": 0.81903088, "learning_rate": 3.785686186323263e-06, "loss": 0.84136814, "num_input_tokens_seen": 53753415, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0859375, "step": 2475, "time_per_iteration": 2.7300848960876465 }, { "auxiliary_loss_clip": 0.01192847, "auxiliary_loss_mlp": 0.01048237, "balance_loss_clip": 1.02980685, "balance_loss_mlp": 1.05265415, "epoch": 0.14886517360589208, "flos": 12786569625600.0, "grad_norm": 1.7127782994407845, "language_loss": 0.8020705, "learning_rate": 3.785515995427181e-06, "loss": 0.82448137, "num_input_tokens_seen": 53770305, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0390625, "step": 2476, "time_per_iteration": 2.5198187828063965 }, { "auxiliary_loss_clip": 0.01160947, "auxiliary_loss_mlp": 0.010469, "balance_loss_clip": 1.02879262, "balance_loss_mlp": 1.0513624, "epoch": 0.14892529685856004, "flos": 29022860419200.0, "grad_norm": 2.307059403369985, "language_loss": 0.77663219, "learning_rate": 3.7853457408106257e-06, "loss": 0.7987107, "num_input_tokens_seen": 53788895, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0078125, "step": 2477, "time_per_iteration": 2.623671054840088 }, { "auxiliary_loss_clip": 0.01073213, "auxiliary_loss_mlp": 0.0100704, "balance_loss_clip": 1.00472689, "balance_loss_mlp": 1.01734006, "epoch": 0.148985420111228, "flos": 61926121054080.0, "grad_norm": 0.8050436496722203, "language_loss": 0.60164279, "learning_rate": 3.785175422479673e-06, "loss": 0.62244529, "num_input_tokens_seen": 53850260, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.375, "step": 2478, "time_per_iteration": 3.1978092193603516 }, { "auxiliary_loss_clip": 0.01194055, "auxiliary_loss_mlp": 0.0103933, "balance_loss_clip": 1.02167535, "balance_loss_mlp": 1.05282569, "epoch": 0.149045543363896, "flos": 23915106432000.0, "grad_norm": 2.2726608709489216, "language_loss": 0.71085286, "learning_rate": 3.785005040440402e-06, "loss": 0.73318666, "num_input_tokens_seen": 53867520, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 1.046875, "step": 2479, "time_per_iteration": 2.6570584774017334 }, { "auxiliary_loss_clip": 0.01173296, "auxiliary_loss_mlp": 0.01044371, "balance_loss_clip": 1.02561891, "balance_loss_mlp": 1.0502106, "epoch": 0.14910566661656396, "flos": 23654394541440.0, "grad_norm": 1.8223291031641577, "language_loss": 0.80802506, "learning_rate": 3.784834594698892e-06, "loss": 0.83020175, "num_input_tokens_seen": 53886620, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.046875, "step": 2480, "time_per_iteration": 2.619014024734497 }, { "auxiliary_loss_clip": 0.01184843, "auxiliary_loss_mlp": 0.01043199, "balance_loss_clip": 1.02503204, "balance_loss_mlp": 1.05061293, "epoch": 0.14916578986923193, "flos": 20515299212160.0, "grad_norm": 2.1576740408430823, "language_loss": 0.84132165, "learning_rate": 3.7846640852612275e-06, "loss": 0.8636021, "num_input_tokens_seen": 53902230, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0703125, "step": 2481, "time_per_iteration": 2.597142457962036 }, { "auxiliary_loss_clip": 0.01164048, "auxiliary_loss_mlp": 0.0104975, "balance_loss_clip": 1.02973485, "balance_loss_mlp": 1.04962015, "epoch": 0.1492259131218999, "flos": 22491822948480.0, "grad_norm": 3.081203318610705, "language_loss": 0.77410752, "learning_rate": 3.7844935121334917e-06, "loss": 0.79624546, "num_input_tokens_seen": 53919475, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0546875, "step": 2482, "time_per_iteration": 2.617145299911499 }, { "auxiliary_loss_clip": 0.01190598, "auxiliary_loss_mlp": 0.01039101, "balance_loss_clip": 1.01926494, "balance_loss_mlp": 1.05205846, "epoch": 0.14928603637456786, "flos": 23185868515200.0, "grad_norm": 2.348991074437475, "language_loss": 0.78236717, "learning_rate": 3.7843228753217726e-06, "loss": 0.80466413, "num_input_tokens_seen": 53939150, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.109375, "step": 2483, "time_per_iteration": 2.599945545196533 }, { "auxiliary_loss_clip": 0.01161026, "auxiliary_loss_mlp": 0.01289674, "balance_loss_clip": 1.02469397, "balance_loss_mlp": 1.05038393, "epoch": 0.14934615962723582, "flos": 21653237053440.0, "grad_norm": 1.6268947266547444, "language_loss": 0.7009424, "learning_rate": 3.784152174832161e-06, "loss": 0.72544938, "num_input_tokens_seen": 53958735, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 1.015625, "step": 2484, "time_per_iteration": 2.5988411903381348 }, { "auxiliary_loss_clip": 0.01176248, "auxiliary_loss_mlp": 0.01290106, "balance_loss_clip": 1.02177334, "balance_loss_mlp": 1.04980338, "epoch": 0.1494062828799038, "flos": 27010066924800.0, "grad_norm": 1.9281811957808028, "language_loss": 0.84276569, "learning_rate": 3.783981410670747e-06, "loss": 0.86742926, "num_input_tokens_seen": 53975065, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0859375, "step": 2485, "time_per_iteration": 2.586200475692749 }, { "auxiliary_loss_clip": 0.01170371, "auxiliary_loss_mlp": 0.01046736, "balance_loss_clip": 1.02691102, "balance_loss_mlp": 1.05243564, "epoch": 0.14946640613257178, "flos": 21214947300480.0, "grad_norm": 2.7669601411293825, "language_loss": 0.84736162, "learning_rate": 3.7838105828436246e-06, "loss": 0.8695327, "num_input_tokens_seen": 53993330, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.09375, "step": 2486, "time_per_iteration": 2.629138946533203 }, { "auxiliary_loss_clip": 0.0116095, "auxiliary_loss_mlp": 0.01040925, "balance_loss_clip": 1.02476013, "balance_loss_mlp": 1.04840577, "epoch": 0.14952652938523975, "flos": 13370872164480.0, "grad_norm": 2.23120536971926, "language_loss": 0.74675786, "learning_rate": 3.7836396913568924e-06, "loss": 0.76877666, "num_input_tokens_seen": 54010515, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 1.03125, "step": 2487, "time_per_iteration": 2.5926647186279297 }, { "auxiliary_loss_clip": 0.01163743, "auxiliary_loss_mlp": 0.01043247, "balance_loss_clip": 1.02493668, "balance_loss_mlp": 1.04995537, "epoch": 0.1495866526379077, "flos": 35517699959040.0, "grad_norm": 2.8362022732110725, "language_loss": 0.71635133, "learning_rate": 3.783468736216647e-06, "loss": 0.7384212, "num_input_tokens_seen": 54031315, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.046875, "step": 2488, "time_per_iteration": 2.7583675384521484 }, { "auxiliary_loss_clip": 0.01176613, "auxiliary_loss_mlp": 0.01046593, "balance_loss_clip": 1.02836585, "balance_loss_mlp": 1.05024016, "epoch": 0.14964677589057568, "flos": 17632749795840.0, "grad_norm": 2.5415973288334737, "language_loss": 0.70313108, "learning_rate": 3.78329771742899e-06, "loss": 0.72536314, "num_input_tokens_seen": 54045965, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.078125, "step": 2489, "time_per_iteration": 2.4769458770751953 }, { "auxiliary_loss_clip": 0.01176518, "auxiliary_loss_mlp": 0.01048497, "balance_loss_clip": 1.03027046, "balance_loss_mlp": 1.05007517, "epoch": 0.14970689914324364, "flos": 20185280029440.0, "grad_norm": 3.871615523463072, "language_loss": 0.815516, "learning_rate": 3.7831266350000246e-06, "loss": 0.83776617, "num_input_tokens_seen": 54059960, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.078125, "step": 2490, "time_per_iteration": 2.5927417278289795 }, { "auxiliary_loss_clip": 0.01188524, "auxiliary_loss_mlp": 0.01049112, "balance_loss_clip": 1.03136206, "balance_loss_mlp": 1.05279112, "epoch": 0.1497670223959116, "flos": 37228699382400.0, "grad_norm": 1.8151842918063188, "language_loss": 0.79941237, "learning_rate": 3.7829554889358566e-06, "loss": 0.82178879, "num_input_tokens_seen": 54079330, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.0859375, "step": 2491, "time_per_iteration": 2.649303674697876 }, { "auxiliary_loss_clip": 0.01167991, "auxiliary_loss_mlp": 0.01052765, "balance_loss_clip": 1.03232002, "balance_loss_mlp": 1.04816985, "epoch": 0.1498271456485796, "flos": 24455848752000.0, "grad_norm": 1.9007004818650033, "language_loss": 0.90852988, "learning_rate": 3.782784279242593e-06, "loss": 0.93073744, "num_input_tokens_seen": 54097555, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.109375, "step": 2492, "time_per_iteration": 2.767467498779297 }, { "auxiliary_loss_clip": 0.01165745, "auxiliary_loss_mlp": 0.01058981, "balance_loss_clip": 1.03974056, "balance_loss_mlp": 1.05067074, "epoch": 0.14988726890124757, "flos": 16253601148800.0, "grad_norm": 4.386385093999351, "language_loss": 0.78756136, "learning_rate": 3.782613005926345e-06, "loss": 0.80980861, "num_input_tokens_seen": 54115600, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0625, "step": 2493, "time_per_iteration": 2.580536365509033 }, { "auxiliary_loss_clip": 0.01163895, "auxiliary_loss_mlp": 0.01043636, "balance_loss_clip": 1.02416921, "balance_loss_mlp": 1.04799163, "epoch": 0.14994739215391553, "flos": 20666555383680.0, "grad_norm": 1.8289064936406152, "language_loss": 0.80018193, "learning_rate": 3.7824416689932236e-06, "loss": 0.82225728, "num_input_tokens_seen": 54135220, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0703125, "step": 2494, "time_per_iteration": 2.7090485095977783 }, { "auxiliary_loss_clip": 0.01172437, "auxiliary_loss_mlp": 0.01047913, "balance_loss_clip": 1.02848196, "balance_loss_mlp": 1.04934621, "epoch": 0.1500075154065835, "flos": 70652375239680.0, "grad_norm": 1.851961816832031, "language_loss": 0.65646535, "learning_rate": 3.782270268449345e-06, "loss": 0.67866886, "num_input_tokens_seen": 54161065, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.046875, "step": 2495, "time_per_iteration": 3.0251150131225586 }, { "auxiliary_loss_clip": 0.01064176, "auxiliary_loss_mlp": 0.01018801, "balance_loss_clip": 1.01633298, "balance_loss_mlp": 1.01714182, "epoch": 0.15006763865925146, "flos": 68011937447040.0, "grad_norm": 0.8964087798236753, "language_loss": 0.59544623, "learning_rate": 3.7820988043008242e-06, "loss": 0.61627591, "num_input_tokens_seen": 54225095, "router_z_loss_clip": 0.0246582, "router_z_loss_mlp": 0.38085938, "step": 2496, "time_per_iteration": 3.2591147422790527 }, { "auxiliary_loss_clip": 0.0115483, "auxiliary_loss_mlp": 0.01056381, "balance_loss_clip": 1.03642547, "balance_loss_mlp": 1.04667175, "epoch": 0.15012776191191943, "flos": 18916269459840.0, "grad_norm": 1.710600361835461, "language_loss": 0.65142888, "learning_rate": 3.7819272765537817e-06, "loss": 0.67354095, "num_input_tokens_seen": 54243750, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.078125, "step": 2497, "time_per_iteration": 2.6580758094787598 }, { "auxiliary_loss_clip": 0.01179595, "auxiliary_loss_mlp": 0.01049598, "balance_loss_clip": 1.03071523, "balance_loss_mlp": 1.05322993, "epoch": 0.1501878851645874, "flos": 23701330638720.0, "grad_norm": 1.5231751644938571, "language_loss": 0.75079513, "learning_rate": 3.781755685214338e-06, "loss": 0.77308702, "num_input_tokens_seen": 54266185, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.078125, "step": 2498, "time_per_iteration": 2.6886789798736572 }, { "auxiliary_loss_clip": 0.01189611, "auxiliary_loss_mlp": 0.01045504, "balance_loss_clip": 1.0248332, "balance_loss_mlp": 1.05254459, "epoch": 0.15024800841725539, "flos": 20412523422720.0, "grad_norm": 2.1388982755219548, "language_loss": 0.72140014, "learning_rate": 3.7815840302886174e-06, "loss": 0.74375135, "num_input_tokens_seen": 54283940, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1015625, "step": 2499, "time_per_iteration": 2.747596025466919 }, { "auxiliary_loss_clip": 0.01175409, "auxiliary_loss_mlp": 0.01053808, "balance_loss_clip": 1.03428185, "balance_loss_mlp": 1.04936647, "epoch": 0.15030813166992335, "flos": 31831002812160.0, "grad_norm": 2.8577554011774438, "language_loss": 0.7184577, "learning_rate": 3.7814123117827446e-06, "loss": 0.74074996, "num_input_tokens_seen": 54304830, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.078125, "step": 2500, "time_per_iteration": 2.642331838607788 }, { "auxiliary_loss_clip": 0.01185249, "auxiliary_loss_mlp": 0.01058253, "balance_loss_clip": 1.03814209, "balance_loss_mlp": 1.05167294, "epoch": 0.15036825492259132, "flos": 35657822914560.0, "grad_norm": 1.777768202367705, "language_loss": 0.64992189, "learning_rate": 3.7812405297028496e-06, "loss": 0.6723569, "num_input_tokens_seen": 54325595, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0625, "step": 2501, "time_per_iteration": 2.743905544281006 }, { "auxiliary_loss_clip": 0.01173361, "auxiliary_loss_mlp": 0.01054912, "balance_loss_clip": 1.03385997, "balance_loss_mlp": 1.04777002, "epoch": 0.15042837817525928, "flos": 18838163335680.0, "grad_norm": 2.195904081827499, "language_loss": 0.83426768, "learning_rate": 3.7810686840550627e-06, "loss": 0.85655046, "num_input_tokens_seen": 54342180, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.078125, "step": 2502, "time_per_iteration": 2.5663392543792725 }, { "auxiliary_loss_clip": 0.01180142, "auxiliary_loss_mlp": 0.01050403, "balance_loss_clip": 1.03231931, "balance_loss_mlp": 1.0478096, "epoch": 0.15048850142792725, "flos": 19535548867200.0, "grad_norm": 1.9588379745847802, "language_loss": 0.77030492, "learning_rate": 3.780896774845515e-06, "loss": 0.79261035, "num_input_tokens_seen": 54360255, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.0546875, "step": 2503, "time_per_iteration": 2.6725029945373535 }, { "auxiliary_loss_clip": 0.01155756, "auxiliary_loss_mlp": 0.0104473, "balance_loss_clip": 1.02627683, "balance_loss_mlp": 1.05057621, "epoch": 0.1505486246805952, "flos": 22017550746240.0, "grad_norm": 2.126825389928523, "language_loss": 0.85167092, "learning_rate": 3.780724802080342e-06, "loss": 0.87367582, "num_input_tokens_seen": 54378260, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.046875, "step": 2504, "time_per_iteration": 2.5627217292785645 }, { "auxiliary_loss_clip": 0.01161933, "auxiliary_loss_mlp": 0.01044005, "balance_loss_clip": 1.02587342, "balance_loss_mlp": 1.04839265, "epoch": 0.15060874793326318, "flos": 20743153136640.0, "grad_norm": 1.6069439520851834, "language_loss": 0.82839644, "learning_rate": 3.780552765765682e-06, "loss": 0.85045588, "num_input_tokens_seen": 54399745, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.046875, "step": 2505, "time_per_iteration": 2.576629638671875 }, { "auxiliary_loss_clip": 0.01180645, "auxiliary_loss_mlp": 0.01048761, "balance_loss_clip": 1.02975869, "balance_loss_mlp": 1.04756093, "epoch": 0.15066887118593117, "flos": 16471902055680.0, "grad_norm": 5.098913387677473, "language_loss": 0.76057845, "learning_rate": 3.7803806659076736e-06, "loss": 0.78287244, "num_input_tokens_seen": 54417105, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0625, "step": 2506, "time_per_iteration": 2.542523145675659 }, { "auxiliary_loss_clip": 0.01159163, "auxiliary_loss_mlp": 0.01047608, "balance_loss_clip": 1.02857018, "balance_loss_mlp": 1.04976547, "epoch": 0.15072899443859913, "flos": 19859319083520.0, "grad_norm": 3.8862347394476844, "language_loss": 0.75802296, "learning_rate": 3.7802085025124596e-06, "loss": 0.78009069, "num_input_tokens_seen": 54433920, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.09375, "step": 2507, "time_per_iteration": 2.5672006607055664 }, { "auxiliary_loss_clip": 0.01157067, "auxiliary_loss_mlp": 0.01044255, "balance_loss_clip": 1.02555108, "balance_loss_mlp": 1.04479969, "epoch": 0.1507891176912671, "flos": 20776226584320.0, "grad_norm": 2.2361195530956746, "language_loss": 0.68845522, "learning_rate": 3.780036275586183e-06, "loss": 0.71046841, "num_input_tokens_seen": 54451540, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.03125, "step": 2508, "time_per_iteration": 2.5897727012634277 }, { "auxiliary_loss_clip": 0.0117635, "auxiliary_loss_mlp": 0.0104996, "balance_loss_clip": 1.03122056, "balance_loss_mlp": 1.05138481, "epoch": 0.15084924094393506, "flos": 23586631534080.0, "grad_norm": 3.2760249302678757, "language_loss": 0.77274579, "learning_rate": 3.77986398513499e-06, "loss": 0.7950089, "num_input_tokens_seen": 54470800, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0625, "step": 2509, "time_per_iteration": 2.64713716506958 }, { "auxiliary_loss_clip": 0.01180578, "auxiliary_loss_mlp": 0.01299548, "balance_loss_clip": 1.0302794, "balance_loss_mlp": 1.04957318, "epoch": 0.15090936419660303, "flos": 18911313383040.0, "grad_norm": 3.7363601574874377, "language_loss": 0.79767805, "learning_rate": 3.7796916311650306e-06, "loss": 0.82247937, "num_input_tokens_seen": 54486525, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.125, "step": 2510, "time_per_iteration": 5.761715650558472 }, { "auxiliary_loss_clip": 0.01166796, "auxiliary_loss_mlp": 0.01054546, "balance_loss_clip": 1.03408992, "balance_loss_mlp": 1.04956722, "epoch": 0.150969487449271, "flos": 17928223073280.0, "grad_norm": 2.0009870888398513, "language_loss": 0.74052572, "learning_rate": 3.779519213682454e-06, "loss": 0.76273918, "num_input_tokens_seen": 54503795, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0859375, "step": 2511, "time_per_iteration": 3.930220127105713 }, { "auxiliary_loss_clip": 0.01199547, "auxiliary_loss_mlp": 0.01044914, "balance_loss_clip": 1.02621055, "balance_loss_mlp": 1.04818904, "epoch": 0.151029610701939, "flos": 24243078539520.0, "grad_norm": 2.1327047816965856, "language_loss": 0.68567479, "learning_rate": 3.7793467326934147e-06, "loss": 0.70811939, "num_input_tokens_seen": 54523025, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0625, "step": 2512, "time_per_iteration": 2.6429479122161865 }, { "auxiliary_loss_clip": 0.01165398, "auxiliary_loss_mlp": 0.01048372, "balance_loss_clip": 1.02960825, "balance_loss_mlp": 1.05013108, "epoch": 0.15108973395460695, "flos": 30262496641920.0, "grad_norm": 2.7534125535288814, "language_loss": 0.73666471, "learning_rate": 3.7791741882040677e-06, "loss": 0.75880241, "num_input_tokens_seen": 54545025, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0625, "step": 2513, "time_per_iteration": 2.6333346366882324 }, { "auxiliary_loss_clip": 0.01104285, "auxiliary_loss_mlp": 0.0100229, "balance_loss_clip": 0.99976236, "balance_loss_mlp": 1.01974511, "epoch": 0.15114985720727492, "flos": 60437624428800.0, "grad_norm": 1.0221147899338099, "language_loss": 0.64781988, "learning_rate": 3.7790015802205703e-06, "loss": 0.66888559, "num_input_tokens_seen": 54604545, "router_z_loss_clip": 0.02526855, "router_z_loss_mlp": 0.39257812, "step": 2514, "time_per_iteration": 3.1711461544036865 }, { "auxiliary_loss_clip": 0.01160872, "auxiliary_loss_mlp": 0.01045501, "balance_loss_clip": 1.02683294, "balance_loss_mlp": 1.04734397, "epoch": 0.15120998045994288, "flos": 20521691832960.0, "grad_norm": 2.1210961512072966, "language_loss": 0.7295084, "learning_rate": 3.778828908749082e-06, "loss": 0.75157207, "num_input_tokens_seen": 54620590, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.046875, "step": 2515, "time_per_iteration": 4.776270389556885 }, { "auxiliary_loss_clip": 0.01171823, "auxiliary_loss_mlp": 0.01042639, "balance_loss_clip": 1.02313626, "balance_loss_mlp": 1.04986632, "epoch": 0.15127010371261085, "flos": 21178893024000.0, "grad_norm": 1.9202587165809175, "language_loss": 0.7692076, "learning_rate": 3.7786561737957664e-06, "loss": 0.79135227, "num_input_tokens_seen": 54640410, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0390625, "step": 2516, "time_per_iteration": 2.6220977306365967 }, { "auxiliary_loss_clip": 0.01068502, "auxiliary_loss_mlp": 0.01006258, "balance_loss_clip": 1.00380278, "balance_loss_mlp": 1.01945412, "epoch": 0.1513302269652788, "flos": 65320648974720.0, "grad_norm": 0.7233120574274171, "language_loss": 0.54676867, "learning_rate": 3.7784833753667867e-06, "loss": 0.56751633, "num_input_tokens_seen": 54701430, "router_z_loss_clip": 0.02453613, "router_z_loss_mlp": 0.3984375, "step": 2517, "time_per_iteration": 3.2091779708862305 }, { "auxiliary_loss_clip": 0.01181605, "auxiliary_loss_mlp": 0.01049828, "balance_loss_clip": 1.02981257, "balance_loss_mlp": 1.0475378, "epoch": 0.15139035021794678, "flos": 19135827342720.0, "grad_norm": 2.3366491729660064, "language_loss": 0.78043866, "learning_rate": 3.7783105134683108e-06, "loss": 0.80275297, "num_input_tokens_seen": 54720845, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0625, "step": 2518, "time_per_iteration": 2.627784490585327 }, { "auxiliary_loss_clip": 0.01169822, "auxiliary_loss_mlp": 0.0105135, "balance_loss_clip": 1.03153729, "balance_loss_mlp": 1.05071092, "epoch": 0.15145047347061477, "flos": 26578564842240.0, "grad_norm": 2.07460376251734, "language_loss": 0.70428681, "learning_rate": 3.7781375881065066e-06, "loss": 0.7264986, "num_input_tokens_seen": 54740495, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.09375, "step": 2519, "time_per_iteration": 2.690089702606201 }, { "auxiliary_loss_clip": 0.01164844, "auxiliary_loss_mlp": 0.01048673, "balance_loss_clip": 1.02983832, "balance_loss_mlp": 1.04900599, "epoch": 0.15151059672328274, "flos": 20302959962880.0, "grad_norm": 1.931035734032231, "language_loss": 0.78311682, "learning_rate": 3.7779645992875453e-06, "loss": 0.80525196, "num_input_tokens_seen": 54758415, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0703125, "step": 2520, "time_per_iteration": 2.597982168197632 }, { "auxiliary_loss_clip": 0.01175039, "auxiliary_loss_mlp": 0.0105124, "balance_loss_clip": 1.03118885, "balance_loss_mlp": 1.04849505, "epoch": 0.1515707199759507, "flos": 27228367831680.0, "grad_norm": 1.8309781326313495, "language_loss": 0.74332875, "learning_rate": 3.7777915470176013e-06, "loss": 0.7655915, "num_input_tokens_seen": 54779355, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0859375, "step": 2521, "time_per_iteration": 2.6938483715057373 }, { "auxiliary_loss_clip": 0.01176761, "auxiliary_loss_mlp": 0.01047382, "balance_loss_clip": 1.0274384, "balance_loss_mlp": 1.04843235, "epoch": 0.15163084322861867, "flos": 23587349806080.0, "grad_norm": 2.1949381644197024, "language_loss": 0.81829429, "learning_rate": 3.7776184313028504e-06, "loss": 0.84053576, "num_input_tokens_seen": 54799465, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1015625, "step": 2522, "time_per_iteration": 2.6119284629821777 }, { "auxiliary_loss_clip": 0.01097177, "auxiliary_loss_mlp": 0.01007692, "balance_loss_clip": 1.00516438, "balance_loss_mlp": 1.02010918, "epoch": 0.15169096648128663, "flos": 66889622021760.0, "grad_norm": 0.8199453180799785, "language_loss": 0.57871985, "learning_rate": 3.7774452521494703e-06, "loss": 0.59976852, "num_input_tokens_seen": 54857665, "router_z_loss_clip": 0.02526855, "router_z_loss_mlp": 0.40625, "step": 2523, "time_per_iteration": 3.190681219100952 }, { "auxiliary_loss_clip": 0.01163327, "auxiliary_loss_mlp": 0.01050247, "balance_loss_clip": 1.0297786, "balance_loss_mlp": 1.04819536, "epoch": 0.1517510897339546, "flos": 29095435848960.0, "grad_norm": 1.9941816277315727, "language_loss": 0.74252158, "learning_rate": 3.777272009563641e-06, "loss": 0.76465732, "num_input_tokens_seen": 54879895, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0625, "step": 2524, "time_per_iteration": 2.6521127223968506 }, { "auxiliary_loss_clip": 0.01172088, "auxiliary_loss_mlp": 0.01045124, "balance_loss_clip": 1.02532363, "balance_loss_mlp": 1.04783297, "epoch": 0.1518112129866226, "flos": 18406553512320.0, "grad_norm": 1.8852663281935529, "language_loss": 0.74621469, "learning_rate": 3.7770987035515454e-06, "loss": 0.76838678, "num_input_tokens_seen": 54898245, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0546875, "step": 2525, "time_per_iteration": 2.550144672393799 }, { "auxiliary_loss_clip": 0.01167548, "auxiliary_loss_mlp": 0.01042844, "balance_loss_clip": 1.02233994, "balance_loss_mlp": 1.05000687, "epoch": 0.15187133623929056, "flos": 19425410789760.0, "grad_norm": 2.344283329601817, "language_loss": 0.79606152, "learning_rate": 3.7769253341193677e-06, "loss": 0.81816542, "num_input_tokens_seen": 54917060, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0859375, "step": 2526, "time_per_iteration": 2.595541477203369 }, { "auxiliary_loss_clip": 0.01169573, "auxiliary_loss_mlp": 0.01045343, "balance_loss_clip": 1.02682996, "balance_loss_mlp": 1.04915607, "epoch": 0.15193145949195852, "flos": 17566207850880.0, "grad_norm": 2.195558897708494, "language_loss": 0.84493035, "learning_rate": 3.7767519012732968e-06, "loss": 0.8670795, "num_input_tokens_seen": 54936365, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0234375, "step": 2527, "time_per_iteration": 2.5478880405426025 }, { "auxiliary_loss_clip": 0.0117419, "auxiliary_loss_mlp": 0.01042026, "balance_loss_clip": 1.02365565, "balance_loss_mlp": 1.04923606, "epoch": 0.15199158274462649, "flos": 36176265866880.0, "grad_norm": 2.9536925453971317, "language_loss": 0.69045138, "learning_rate": 3.77657840501952e-06, "loss": 0.71261358, "num_input_tokens_seen": 54961365, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0703125, "step": 2528, "time_per_iteration": 2.798473834991455 }, { "auxiliary_loss_clip": 0.01173328, "auxiliary_loss_mlp": 0.0105225, "balance_loss_clip": 1.03293824, "balance_loss_mlp": 1.04949737, "epoch": 0.15205170599729445, "flos": 23074042498560.0, "grad_norm": 3.9148559332308017, "language_loss": 0.86849076, "learning_rate": 3.77640484536423e-06, "loss": 0.89074653, "num_input_tokens_seen": 54980750, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0546875, "step": 2529, "time_per_iteration": 2.527435302734375 }, { "auxiliary_loss_clip": 0.01173046, "auxiliary_loss_mlp": 0.01040169, "balance_loss_clip": 1.02023697, "balance_loss_mlp": 1.05024052, "epoch": 0.15211182924996242, "flos": 21908382336000.0, "grad_norm": 2.1487767676434166, "language_loss": 0.83320332, "learning_rate": 3.7762312223136206e-06, "loss": 0.85533541, "num_input_tokens_seen": 54999675, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.046875, "step": 2530, "time_per_iteration": 2.657050848007202 }, { "auxiliary_loss_clip": 0.01175009, "auxiliary_loss_mlp": 0.01049044, "balance_loss_clip": 1.02917171, "balance_loss_mlp": 1.05044603, "epoch": 0.15217195250263038, "flos": 13881521865600.0, "grad_norm": 2.202037448822957, "language_loss": 0.80076575, "learning_rate": 3.7760575358738885e-06, "loss": 0.82300627, "num_input_tokens_seen": 55018295, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0625, "step": 2531, "time_per_iteration": 2.5140695571899414 }, { "auxiliary_loss_clip": 0.01164021, "auxiliary_loss_mlp": 0.01053853, "balance_loss_clip": 1.03485084, "balance_loss_mlp": 1.04938948, "epoch": 0.15223207575529837, "flos": 24535319592960.0, "grad_norm": 1.825093367195786, "language_loss": 0.78579956, "learning_rate": 3.7758837860512306e-06, "loss": 0.80797827, "num_input_tokens_seen": 55037975, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0546875, "step": 2532, "time_per_iteration": 2.6486129760742188 }, { "auxiliary_loss_clip": 0.01163301, "auxiliary_loss_mlp": 0.01052755, "balance_loss_clip": 1.03269207, "balance_loss_mlp": 1.05077147, "epoch": 0.15229219900796634, "flos": 25556798563200.0, "grad_norm": 1.7607424498376254, "language_loss": 0.8714056, "learning_rate": 3.775709972851849e-06, "loss": 0.89356613, "num_input_tokens_seen": 55057135, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.03125, "step": 2533, "time_per_iteration": 2.611016273498535 }, { "auxiliary_loss_clip": 0.01191631, "auxiliary_loss_mlp": 0.01054819, "balance_loss_clip": 1.03511417, "balance_loss_mlp": 1.0481025, "epoch": 0.1523523222606343, "flos": 18217806520320.0, "grad_norm": 2.6715941277036555, "language_loss": 0.7889322, "learning_rate": 3.775536096281946e-06, "loss": 0.81139672, "num_input_tokens_seen": 55075525, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0703125, "step": 2534, "time_per_iteration": 2.5729386806488037 }, { "auxiliary_loss_clip": 0.01161817, "auxiliary_loss_mlp": 0.01049432, "balance_loss_clip": 1.02787948, "balance_loss_mlp": 1.05008233, "epoch": 0.15241244551330227, "flos": 13260087642240.0, "grad_norm": 3.4459158766704308, "language_loss": 0.7689724, "learning_rate": 3.7753621563477268e-06, "loss": 0.79108489, "num_input_tokens_seen": 55090845, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.1171875, "step": 2535, "time_per_iteration": 2.504210948944092 }, { "auxiliary_loss_clip": 0.01178387, "auxiliary_loss_mlp": 0.01045617, "balance_loss_clip": 1.02598286, "balance_loss_mlp": 1.04843128, "epoch": 0.15247256876597023, "flos": 19715568854400.0, "grad_norm": 2.3773005401374636, "language_loss": 0.77909744, "learning_rate": 3.7751881530553993e-06, "loss": 0.80133748, "num_input_tokens_seen": 55108750, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.109375, "step": 2536, "time_per_iteration": 2.5591073036193848 }, { "auxiliary_loss_clip": 0.01171944, "auxiliary_loss_mlp": 0.01057252, "balance_loss_clip": 1.0385716, "balance_loss_mlp": 1.04987335, "epoch": 0.1525326920186382, "flos": 20375858615040.0, "grad_norm": 2.6700243967637793, "language_loss": 0.75631809, "learning_rate": 3.775014086411173e-06, "loss": 0.77860999, "num_input_tokens_seen": 55126750, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0390625, "step": 2537, "time_per_iteration": 2.5585572719573975 }, { "auxiliary_loss_clip": 0.01166212, "auxiliary_loss_mlp": 0.01053838, "balance_loss_clip": 1.03397799, "balance_loss_mlp": 1.05000401, "epoch": 0.15259281527130616, "flos": 13589963170560.0, "grad_norm": 2.5620477120627645, "language_loss": 0.77266115, "learning_rate": 3.7748399564212595e-06, "loss": 0.79486161, "num_input_tokens_seen": 55144690, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0703125, "step": 2538, "time_per_iteration": 2.6030781269073486 }, { "auxiliary_loss_clip": 0.01189635, "auxiliary_loss_mlp": 0.0104498, "balance_loss_clip": 1.02722955, "balance_loss_mlp": 1.04971695, "epoch": 0.15265293852397416, "flos": 22860374446080.0, "grad_norm": 2.0049698855694835, "language_loss": 0.89496058, "learning_rate": 3.7746657630918735e-06, "loss": 0.91730678, "num_input_tokens_seen": 55166055, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.03125, "step": 2539, "time_per_iteration": 2.654236316680908 }, { "auxiliary_loss_clip": 0.01164486, "auxiliary_loss_mlp": 0.01053387, "balance_loss_clip": 1.03334785, "balance_loss_mlp": 1.04928184, "epoch": 0.15271306177664212, "flos": 29238108670080.0, "grad_norm": 1.8080570756711145, "language_loss": 0.93017769, "learning_rate": 3.7744915064292313e-06, "loss": 0.95235634, "num_input_tokens_seen": 55186285, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0625, "step": 2540, "time_per_iteration": 2.702314853668213 }, { "auxiliary_loss_clip": 0.01167681, "auxiliary_loss_mlp": 0.01046752, "balance_loss_clip": 1.02865624, "balance_loss_mlp": 1.04663265, "epoch": 0.1527731850293101, "flos": 31246269310080.0, "grad_norm": 1.865287509014421, "language_loss": 0.75101733, "learning_rate": 3.7743171864395524e-06, "loss": 0.77316165, "num_input_tokens_seen": 55207915, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.03125, "step": 2541, "time_per_iteration": 2.6642191410064697 }, { "auxiliary_loss_clip": 0.01176871, "auxiliary_loss_mlp": 0.01053103, "balance_loss_clip": 1.03343379, "balance_loss_mlp": 1.04637241, "epoch": 0.15283330828197805, "flos": 22382079920640.0, "grad_norm": 1.652486945309316, "language_loss": 0.80997038, "learning_rate": 3.774142803129057e-06, "loss": 0.83227009, "num_input_tokens_seen": 55227860, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.03125, "step": 2542, "time_per_iteration": 2.6450676918029785 }, { "auxiliary_loss_clip": 0.01154014, "auxiliary_loss_mlp": 0.0105377, "balance_loss_clip": 1.03406429, "balance_loss_mlp": 1.04742932, "epoch": 0.15289343153464602, "flos": 25520133755520.0, "grad_norm": 2.0177115617694557, "language_loss": 0.76837867, "learning_rate": 3.7739683565039674e-06, "loss": 0.79045653, "num_input_tokens_seen": 55247330, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0625, "step": 2543, "time_per_iteration": 2.6326916217803955 }, { "auxiliary_loss_clip": 0.01162147, "auxiliary_loss_mlp": 0.0104783, "balance_loss_clip": 1.02909017, "balance_loss_mlp": 1.04758883, "epoch": 0.15295355478731398, "flos": 22710016114560.0, "grad_norm": 2.1562896148392623, "language_loss": 0.8592754, "learning_rate": 3.7737938465705115e-06, "loss": 0.88137519, "num_input_tokens_seen": 55266195, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0625, "step": 2544, "time_per_iteration": 2.5566296577453613 }, { "auxiliary_loss_clip": 0.0116682, "auxiliary_loss_mlp": 0.01057875, "balance_loss_clip": 1.03718042, "balance_loss_mlp": 1.04785252, "epoch": 0.15301367803998198, "flos": 23251907669760.0, "grad_norm": 2.1114192545031556, "language_loss": 0.8225385, "learning_rate": 3.773619273334916e-06, "loss": 0.84478545, "num_input_tokens_seen": 55283305, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.09375, "step": 2545, "time_per_iteration": 2.5952467918395996 }, { "auxiliary_loss_clip": 0.0115523, "auxiliary_loss_mlp": 0.01047163, "balance_loss_clip": 1.02762473, "balance_loss_mlp": 1.04944122, "epoch": 0.15307380129264994, "flos": 25886279041920.0, "grad_norm": 2.546537265278793, "language_loss": 0.71043396, "learning_rate": 3.77344463680341e-06, "loss": 0.73245794, "num_input_tokens_seen": 55303035, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0625, "step": 2546, "time_per_iteration": 2.6357593536376953 }, { "auxiliary_loss_clip": 0.0117399, "auxiliary_loss_mlp": 0.0104862, "balance_loss_clip": 1.02846181, "balance_loss_mlp": 1.04873919, "epoch": 0.1531339245453179, "flos": 46973239205760.0, "grad_norm": 3.571948383378242, "language_loss": 0.7726686, "learning_rate": 3.7732699369822276e-06, "loss": 0.7948947, "num_input_tokens_seen": 55327570, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0703125, "step": 2547, "time_per_iteration": 2.8365705013275146 }, { "auxiliary_loss_clip": 0.01192793, "auxiliary_loss_mlp": 0.010478, "balance_loss_clip": 1.02785587, "balance_loss_mlp": 1.04914057, "epoch": 0.15319404779798587, "flos": 35882049565440.0, "grad_norm": 2.4003573729239123, "language_loss": 0.73791802, "learning_rate": 3.7730951738776025e-06, "loss": 0.760324, "num_input_tokens_seen": 55351090, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0703125, "step": 2548, "time_per_iteration": 2.7275640964508057 }, { "auxiliary_loss_clip": 0.01173118, "auxiliary_loss_mlp": 0.01052006, "balance_loss_clip": 1.03199089, "balance_loss_mlp": 1.04795218, "epoch": 0.15325417105065384, "flos": 25664638170240.0, "grad_norm": 1.4848055305600558, "language_loss": 0.80448693, "learning_rate": 3.7729203474957715e-06, "loss": 0.82673818, "num_input_tokens_seen": 55371050, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0703125, "step": 2549, "time_per_iteration": 2.633662700653076 }, { "auxiliary_loss_clip": 0.01162355, "auxiliary_loss_mlp": 0.01052428, "balance_loss_clip": 1.03240097, "balance_loss_mlp": 1.04688239, "epoch": 0.1533142943033218, "flos": 18770831291520.0, "grad_norm": 1.8502960655059753, "language_loss": 0.74786478, "learning_rate": 3.7727454578429735e-06, "loss": 0.77001262, "num_input_tokens_seen": 55390375, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0625, "step": 2550, "time_per_iteration": 2.534963369369507 }, { "auxiliary_loss_clip": 0.01164906, "auxiliary_loss_mlp": 0.01038988, "balance_loss_clip": 1.01944971, "balance_loss_mlp": 1.04717875, "epoch": 0.15337441755598977, "flos": 23107367341440.0, "grad_norm": 3.377885220327771, "language_loss": 0.76952755, "learning_rate": 3.7725705049254507e-06, "loss": 0.79156649, "num_input_tokens_seen": 55408890, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0859375, "step": 2551, "time_per_iteration": 2.6032516956329346 }, { "auxiliary_loss_clip": 0.0106817, "auxiliary_loss_mlp": 0.01002751, "balance_loss_clip": 1.00025928, "balance_loss_mlp": 1.01957786, "epoch": 0.15343454080865776, "flos": 59861079227520.0, "grad_norm": 0.931830504180044, "language_loss": 0.56691229, "learning_rate": 3.7723954887494457e-06, "loss": 0.58762145, "num_input_tokens_seen": 55463815, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.39453125, "step": 2552, "time_per_iteration": 5.926017999649048 }, { "auxiliary_loss_clip": 0.01193101, "auxiliary_loss_mlp": 0.01045347, "balance_loss_clip": 1.0259515, "balance_loss_mlp": 1.04849005, "epoch": 0.15349466406132573, "flos": 11910887959680.0, "grad_norm": 2.595673068233099, "language_loss": 0.88245702, "learning_rate": 3.772220409321205e-06, "loss": 0.90484154, "num_input_tokens_seen": 55481050, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0859375, "step": 2553, "time_per_iteration": 4.066634893417358 }, { "auxiliary_loss_clip": 0.0116707, "auxiliary_loss_mlp": 0.01046912, "balance_loss_clip": 1.02600265, "balance_loss_mlp": 1.04814231, "epoch": 0.1535547873139937, "flos": 24096922099200.0, "grad_norm": 1.840388181832891, "language_loss": 0.7784059, "learning_rate": 3.7720452666469766e-06, "loss": 0.80054569, "num_input_tokens_seen": 55500050, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1015625, "step": 2554, "time_per_iteration": 2.547150135040283 }, { "auxiliary_loss_clip": 0.01203476, "auxiliary_loss_mlp": 0.01051512, "balance_loss_clip": 1.03097236, "balance_loss_mlp": 1.05018115, "epoch": 0.15361491056666166, "flos": 17566459246080.0, "grad_norm": 2.447103108797657, "language_loss": 0.78226465, "learning_rate": 3.7718700607330114e-06, "loss": 0.80481452, "num_input_tokens_seen": 55518125, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.078125, "step": 2555, "time_per_iteration": 2.5814998149871826 }, { "auxiliary_loss_clip": 0.01178388, "auxiliary_loss_mlp": 0.01044825, "balance_loss_clip": 1.02711082, "balance_loss_mlp": 1.0475297, "epoch": 0.15367503381932962, "flos": 25046041121280.0, "grad_norm": 1.5600380384978203, "language_loss": 0.7694909, "learning_rate": 3.7716947915855607e-06, "loss": 0.79172307, "num_input_tokens_seen": 55540960, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.0390625, "step": 2556, "time_per_iteration": 4.263160705566406 }, { "auxiliary_loss_clip": 0.01157109, "auxiliary_loss_mlp": 0.01289997, "balance_loss_clip": 1.02242196, "balance_loss_mlp": 1.04610419, "epoch": 0.15373515707199759, "flos": 21507332008320.0, "grad_norm": 1.8540772677529425, "language_loss": 0.89634496, "learning_rate": 3.7715194592108805e-06, "loss": 0.920816, "num_input_tokens_seen": 55559210, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.015625, "step": 2557, "time_per_iteration": 2.5326311588287354 }, { "auxiliary_loss_clip": 0.01171874, "auxiliary_loss_mlp": 0.01043634, "balance_loss_clip": 1.02367806, "balance_loss_mlp": 1.04659688, "epoch": 0.15379528032466555, "flos": 25994729180160.0, "grad_norm": 2.8678787289802834, "language_loss": 0.70526659, "learning_rate": 3.7713440636152276e-06, "loss": 0.7274217, "num_input_tokens_seen": 55578925, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0703125, "step": 2558, "time_per_iteration": 2.6729209423065186 }, { "auxiliary_loss_clip": 0.01171847, "auxiliary_loss_mlp": 0.01045396, "balance_loss_clip": 1.02590561, "balance_loss_mlp": 1.04718196, "epoch": 0.15385540357733354, "flos": 19277315015040.0, "grad_norm": 2.6564598020698287, "language_loss": 0.91778117, "learning_rate": 3.7711686048048613e-06, "loss": 0.93995363, "num_input_tokens_seen": 55597255, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0625, "step": 2559, "time_per_iteration": 2.5624945163726807 }, { "auxiliary_loss_clip": 0.01173352, "auxiliary_loss_mlp": 0.01058246, "balance_loss_clip": 1.03727698, "balance_loss_mlp": 1.04786742, "epoch": 0.1539155268300015, "flos": 28549126920960.0, "grad_norm": 2.2131711788867365, "language_loss": 0.63602561, "learning_rate": 3.7709930827860445e-06, "loss": 0.65834159, "num_input_tokens_seen": 55619515, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.0703125, "step": 2560, "time_per_iteration": 2.706130266189575 }, { "auxiliary_loss_clip": 0.01170225, "auxiliary_loss_mlp": 0.01047252, "balance_loss_clip": 1.02703404, "balance_loss_mlp": 1.04417765, "epoch": 0.15397565008266947, "flos": 23547883737600.0, "grad_norm": 1.7698244337121611, "language_loss": 0.87947822, "learning_rate": 3.770817497565039e-06, "loss": 0.90165293, "num_input_tokens_seen": 55640050, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.078125, "step": 2561, "time_per_iteration": 2.5762720108032227 }, { "auxiliary_loss_clip": 0.01156761, "auxiliary_loss_mlp": 0.01043321, "balance_loss_clip": 1.02484393, "balance_loss_mlp": 1.04585898, "epoch": 0.15403577333533744, "flos": 17129821518720.0, "grad_norm": 2.6079380123267017, "language_loss": 0.82896787, "learning_rate": 3.770641849148113e-06, "loss": 0.85096872, "num_input_tokens_seen": 55658695, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.015625, "step": 2562, "time_per_iteration": 2.6320412158966064 }, { "auxiliary_loss_clip": 0.01179923, "auxiliary_loss_mlp": 0.01296369, "balance_loss_clip": 1.0261817, "balance_loss_mlp": 1.05025244, "epoch": 0.1540958965880054, "flos": 17894503180800.0, "grad_norm": 2.059137303084003, "language_loss": 0.74528742, "learning_rate": 3.7704661375415336e-06, "loss": 0.77005035, "num_input_tokens_seen": 55676340, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.1171875, "step": 2563, "time_per_iteration": 2.549576997756958 }, { "auxiliary_loss_clip": 0.01190223, "auxiliary_loss_mlp": 0.01042123, "balance_loss_clip": 1.02171469, "balance_loss_mlp": 1.04505682, "epoch": 0.15415601984067337, "flos": 32161057908480.0, "grad_norm": 2.1573348736468194, "language_loss": 0.75549716, "learning_rate": 3.770290362751572e-06, "loss": 0.77782059, "num_input_tokens_seen": 55698890, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.09375, "step": 2564, "time_per_iteration": 2.7079806327819824 }, { "auxiliary_loss_clip": 0.01169554, "auxiliary_loss_mlp": 0.01050528, "balance_loss_clip": 1.03171659, "balance_loss_mlp": 1.04574072, "epoch": 0.15421614309334136, "flos": 24024418496640.0, "grad_norm": 2.097194446798446, "language_loss": 0.70672631, "learning_rate": 3.7701145247845006e-06, "loss": 0.72892714, "num_input_tokens_seen": 55718535, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0546875, "step": 2565, "time_per_iteration": 2.588686943054199 }, { "auxiliary_loss_clip": 0.0116054, "auxiliary_loss_mlp": 0.0104805, "balance_loss_clip": 1.02844, "balance_loss_mlp": 1.04597545, "epoch": 0.15427626634600933, "flos": 24386290064640.0, "grad_norm": 2.1634924623810137, "language_loss": 0.7173537, "learning_rate": 3.7699386236465954e-06, "loss": 0.73943967, "num_input_tokens_seen": 55738970, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0546875, "step": 2566, "time_per_iteration": 2.61220121383667 }, { "auxiliary_loss_clip": 0.01148714, "auxiliary_loss_mlp": 0.01041412, "balance_loss_clip": 1.02252913, "balance_loss_mlp": 1.04421866, "epoch": 0.1543363895986773, "flos": 23331522165120.0, "grad_norm": 1.7087350896233584, "language_loss": 0.84847713, "learning_rate": 3.769762659344134e-06, "loss": 0.87037832, "num_input_tokens_seen": 55759585, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.046875, "step": 2567, "time_per_iteration": 2.58427095413208 }, { "auxiliary_loss_clip": 0.01178137, "auxiliary_loss_mlp": 0.01043531, "balance_loss_clip": 1.02479148, "balance_loss_mlp": 1.04611933, "epoch": 0.15439651285134526, "flos": 24274284480000.0, "grad_norm": 2.2041419741489436, "language_loss": 0.7797913, "learning_rate": 3.7695866318833946e-06, "loss": 0.80200791, "num_input_tokens_seen": 55779250, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.046875, "step": 2568, "time_per_iteration": 2.700418472290039 }, { "auxiliary_loss_clip": 0.01170712, "auxiliary_loss_mlp": 0.01040391, "balance_loss_clip": 1.02014959, "balance_loss_mlp": 1.0476234, "epoch": 0.15445663610401322, "flos": 22456163721600.0, "grad_norm": 1.8492089523554365, "language_loss": 0.70545292, "learning_rate": 3.769410541270661e-06, "loss": 0.72756398, "num_input_tokens_seen": 55800470, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.046875, "step": 2569, "time_per_iteration": 2.6550230979919434 }, { "auxiliary_loss_clip": 0.01165723, "auxiliary_loss_mlp": 0.01044873, "balance_loss_clip": 1.02612174, "balance_loss_mlp": 1.04548824, "epoch": 0.1545167593566812, "flos": 22049510872320.0, "grad_norm": 2.105823242995232, "language_loss": 0.76556194, "learning_rate": 3.7692343875122167e-06, "loss": 0.78766787, "num_input_tokens_seen": 55817795, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0234375, "step": 2570, "time_per_iteration": 2.6326777935028076 }, { "auxiliary_loss_clip": 0.01160095, "auxiliary_loss_mlp": 0.01040853, "balance_loss_clip": 1.02119541, "balance_loss_mlp": 1.04729867, "epoch": 0.15457688260934915, "flos": 19318253541120.0, "grad_norm": 2.406423689896063, "language_loss": 0.77458817, "learning_rate": 3.769058170614348e-06, "loss": 0.79659766, "num_input_tokens_seen": 55836125, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0390625, "step": 2571, "time_per_iteration": 2.621126174926758 }, { "auxiliary_loss_clip": 0.01170322, "auxiliary_loss_mlp": 0.01047228, "balance_loss_clip": 1.02833343, "balance_loss_mlp": 1.04599071, "epoch": 0.15463700586201715, "flos": 24133981956480.0, "grad_norm": 3.3428663886366103, "language_loss": 0.8244226, "learning_rate": 3.768881890583344e-06, "loss": 0.84659809, "num_input_tokens_seen": 55855280, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0625, "step": 2572, "time_per_iteration": 2.6504228115081787 }, { "auxiliary_loss_clip": 0.01154909, "auxiliary_loss_mlp": 0.01047131, "balance_loss_clip": 1.02810502, "balance_loss_mlp": 1.04727006, "epoch": 0.1546971291146851, "flos": 22420935457920.0, "grad_norm": 1.5258974306172794, "language_loss": 0.90296543, "learning_rate": 3.7687055474254946e-06, "loss": 0.92498583, "num_input_tokens_seen": 55875695, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.078125, "step": 2573, "time_per_iteration": 2.7174291610717773 }, { "auxiliary_loss_clip": 0.01192186, "auxiliary_loss_mlp": 0.01048383, "balance_loss_clip": 1.02895212, "balance_loss_mlp": 1.0478847, "epoch": 0.15475725236735308, "flos": 17530225401600.0, "grad_norm": 2.182159614956824, "language_loss": 0.70076543, "learning_rate": 3.7685291411470946e-06, "loss": 0.72317111, "num_input_tokens_seen": 55894575, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.078125, "step": 2574, "time_per_iteration": 2.685107707977295 }, { "auxiliary_loss_clip": 0.01169211, "auxiliary_loss_mlp": 0.01046615, "balance_loss_clip": 1.02603936, "balance_loss_mlp": 1.04378128, "epoch": 0.15481737562002104, "flos": 22561740771840.0, "grad_norm": 3.4993671200088534, "language_loss": 0.82424629, "learning_rate": 3.768352671754439e-06, "loss": 0.84640455, "num_input_tokens_seen": 55912855, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.078125, "step": 2575, "time_per_iteration": 2.635204315185547 }, { "auxiliary_loss_clip": 0.01170784, "auxiliary_loss_mlp": 0.01048048, "balance_loss_clip": 1.02960682, "balance_loss_mlp": 1.04463935, "epoch": 0.154877498872689, "flos": 24900567039360.0, "grad_norm": 1.8486890106425382, "language_loss": 0.84710997, "learning_rate": 3.7681761392538246e-06, "loss": 0.86929834, "num_input_tokens_seen": 55932375, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.078125, "step": 2576, "time_per_iteration": 2.678537130355835 }, { "auxiliary_loss_clip": 0.01150055, "auxiliary_loss_mlp": 0.01044825, "balance_loss_clip": 1.02576363, "balance_loss_mlp": 1.04283333, "epoch": 0.15493762212535697, "flos": 28147501975680.0, "grad_norm": 1.65302702254499, "language_loss": 0.81821144, "learning_rate": 3.7679995436515525e-06, "loss": 0.84016019, "num_input_tokens_seen": 55953970, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.078125, "step": 2577, "time_per_iteration": 2.7997236251831055 }, { "auxiliary_loss_clip": 0.01183833, "auxiliary_loss_mlp": 0.01046602, "balance_loss_clip": 1.02729034, "balance_loss_mlp": 1.04993093, "epoch": 0.15499774537802496, "flos": 25411073086080.0, "grad_norm": 2.1671366697295693, "language_loss": 0.76222056, "learning_rate": 3.7678228849539244e-06, "loss": 0.78452492, "num_input_tokens_seen": 55973120, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0703125, "step": 2578, "time_per_iteration": 2.6338603496551514 }, { "auxiliary_loss_clip": 0.01181213, "auxiliary_loss_mlp": 0.0104697, "balance_loss_clip": 1.02788484, "balance_loss_mlp": 1.04783916, "epoch": 0.15505786863069293, "flos": 22091562720000.0, "grad_norm": 1.7792957040407547, "language_loss": 0.82501316, "learning_rate": 3.767646163167245e-06, "loss": 0.84729505, "num_input_tokens_seen": 55993260, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0625, "step": 2579, "time_per_iteration": 2.71614146232605 }, { "auxiliary_loss_clip": 0.01171644, "auxiliary_loss_mlp": 0.01043184, "balance_loss_clip": 1.02468264, "balance_loss_mlp": 1.04856253, "epoch": 0.1551179918833609, "flos": 18917131386240.0, "grad_norm": 1.587716880915854, "language_loss": 0.80113101, "learning_rate": 3.7674693782978206e-06, "loss": 0.82327926, "num_input_tokens_seen": 56012130, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.046875, "step": 2580, "time_per_iteration": 2.5516724586486816 }, { "auxiliary_loss_clip": 0.01087772, "auxiliary_loss_mlp": 0.01016819, "balance_loss_clip": 1.01355314, "balance_loss_mlp": 1.02895188, "epoch": 0.15517811513602886, "flos": 66239172587520.0, "grad_norm": 0.8456782233856743, "language_loss": 0.58888125, "learning_rate": 3.7672925303519605e-06, "loss": 0.60992718, "num_input_tokens_seen": 56079045, "router_z_loss_clip": 0.03271484, "router_z_loss_mlp": 0.49609375, "step": 2581, "time_per_iteration": 3.324341297149658 }, { "auxiliary_loss_clip": 0.0118028, "auxiliary_loss_mlp": 0.01044518, "balance_loss_clip": 1.02553988, "balance_loss_mlp": 1.0457269, "epoch": 0.15523823838869683, "flos": 24021078531840.0, "grad_norm": 3.1215543648111868, "language_loss": 0.85462517, "learning_rate": 3.7671156193359764e-06, "loss": 0.87687314, "num_input_tokens_seen": 56098745, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0703125, "step": 2582, "time_per_iteration": 2.6926324367523193 }, { "auxiliary_loss_clip": 0.01163321, "auxiliary_loss_mlp": 0.0104918, "balance_loss_clip": 1.03014266, "balance_loss_mlp": 1.04822159, "epoch": 0.1552983616413648, "flos": 20485062938880.0, "grad_norm": 2.470216528055031, "language_loss": 0.78339255, "learning_rate": 3.766938645256182e-06, "loss": 0.80551755, "num_input_tokens_seen": 56117655, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0625, "step": 2583, "time_per_iteration": 2.5923116207122803 }, { "auxiliary_loss_clip": 0.01162746, "auxiliary_loss_mlp": 0.01051774, "balance_loss_clip": 1.03099561, "balance_loss_mlp": 1.04764676, "epoch": 0.15535848489403276, "flos": 32123710742400.0, "grad_norm": 1.6672630659319128, "language_loss": 0.75915903, "learning_rate": 3.766761608118892e-06, "loss": 0.78130424, "num_input_tokens_seen": 56141960, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0625, "step": 2584, "time_per_iteration": 2.6473052501678467 }, { "auxiliary_loss_clip": 0.01159703, "auxiliary_loss_mlp": 0.01041386, "balance_loss_clip": 1.02255058, "balance_loss_mlp": 1.04566562, "epoch": 0.15541860814670075, "flos": 19098444263040.0, "grad_norm": 2.4550114844131077, "language_loss": 0.75969291, "learning_rate": 3.766584507930424e-06, "loss": 0.78170371, "num_input_tokens_seen": 56161430, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.046875, "step": 2585, "time_per_iteration": 2.6837024688720703 }, { "auxiliary_loss_clip": 0.01176029, "auxiliary_loss_mlp": 0.01038797, "balance_loss_clip": 1.02022433, "balance_loss_mlp": 1.04577661, "epoch": 0.1554787313993687, "flos": 19172097100800.0, "grad_norm": 1.8823810685941131, "language_loss": 0.60612339, "learning_rate": 3.7664073446971e-06, "loss": 0.62827164, "num_input_tokens_seen": 56179390, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.03125, "step": 2586, "time_per_iteration": 2.65862774848938 }, { "auxiliary_loss_clip": 0.0117034, "auxiliary_loss_mlp": 0.0103992, "balance_loss_clip": 1.0215261, "balance_loss_mlp": 1.04571843, "epoch": 0.15553885465203668, "flos": 16143822207360.0, "grad_norm": 1.73460801416835, "language_loss": 0.81268895, "learning_rate": 3.7662301184252413e-06, "loss": 0.83479154, "num_input_tokens_seen": 56198020, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0625, "step": 2587, "time_per_iteration": 2.611509323120117 }, { "auxiliary_loss_clip": 0.01155997, "auxiliary_loss_mlp": 0.010524, "balance_loss_clip": 1.0302633, "balance_loss_mlp": 1.04694796, "epoch": 0.15559897790470464, "flos": 25337779384320.0, "grad_norm": 1.9319060370422076, "language_loss": 0.88453543, "learning_rate": 3.766052829121173e-06, "loss": 0.90661943, "num_input_tokens_seen": 56218165, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.09375, "step": 2588, "time_per_iteration": 2.711381196975708 }, { "auxiliary_loss_clip": 0.01188636, "auxiliary_loss_mlp": 0.01050354, "balance_loss_clip": 1.02968287, "balance_loss_mlp": 1.04774642, "epoch": 0.1556591011573726, "flos": 23148772744320.0, "grad_norm": 3.170774423160771, "language_loss": 0.64523566, "learning_rate": 3.765875476791222e-06, "loss": 0.66762561, "num_input_tokens_seen": 56237160, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.046875, "step": 2589, "time_per_iteration": 2.635732650756836 }, { "auxiliary_loss_clip": 0.0117971, "auxiliary_loss_mlp": 0.01042015, "balance_loss_clip": 1.02152264, "balance_loss_mlp": 1.04620171, "epoch": 0.15571922441004057, "flos": 25370888745600.0, "grad_norm": 2.3619891966732305, "language_loss": 0.82251132, "learning_rate": 3.765698061441718e-06, "loss": 0.84472853, "num_input_tokens_seen": 56257610, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0625, "step": 2590, "time_per_iteration": 2.7126543521881104 }, { "auxiliary_loss_clip": 0.01177672, "auxiliary_loss_mlp": 0.0104267, "balance_loss_clip": 1.02375102, "balance_loss_mlp": 1.043733, "epoch": 0.15577934766270854, "flos": 14501375890560.0, "grad_norm": 2.1809994415154943, "language_loss": 0.78981018, "learning_rate": 3.7655205830789918e-06, "loss": 0.81201363, "num_input_tokens_seen": 56275215, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0625, "step": 2591, "time_per_iteration": 2.571828603744507 }, { "auxiliary_loss_clip": 0.01158343, "auxiliary_loss_mlp": 0.01050246, "balance_loss_clip": 1.03224492, "balance_loss_mlp": 1.04531145, "epoch": 0.15583947091537653, "flos": 37414537372800.0, "grad_norm": 2.4447177794173034, "language_loss": 0.64787561, "learning_rate": 3.7653430417093777e-06, "loss": 0.66996145, "num_input_tokens_seen": 56297130, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0390625, "step": 2592, "time_per_iteration": 2.767271041870117 }, { "auxiliary_loss_clip": 0.01175648, "auxiliary_loss_mlp": 0.01046553, "balance_loss_clip": 1.02716994, "balance_loss_mlp": 1.05092156, "epoch": 0.1558995941680445, "flos": 21834729498240.0, "grad_norm": 2.428285861777953, "language_loss": 0.81902099, "learning_rate": 3.765165437339211e-06, "loss": 0.84124297, "num_input_tokens_seen": 56314995, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0625, "step": 2593, "time_per_iteration": 2.5942864418029785 }, { "auxiliary_loss_clip": 0.01158101, "auxiliary_loss_mlp": 0.01045124, "balance_loss_clip": 1.02486992, "balance_loss_mlp": 1.046664, "epoch": 0.15595971742071246, "flos": 19792633484160.0, "grad_norm": 2.3749062085802817, "language_loss": 0.73069024, "learning_rate": 3.764987769974831e-06, "loss": 0.7527225, "num_input_tokens_seen": 56334005, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.0234375, "step": 2594, "time_per_iteration": 5.544780969619751 }, { "auxiliary_loss_clip": 0.0116505, "auxiliary_loss_mlp": 0.01038209, "balance_loss_clip": 1.01994586, "balance_loss_mlp": 1.04573166, "epoch": 0.15601984067338043, "flos": 26722135503360.0, "grad_norm": 2.238236527329976, "language_loss": 0.81326103, "learning_rate": 3.764810039622577e-06, "loss": 0.83529359, "num_input_tokens_seen": 56353795, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.0078125, "step": 2595, "time_per_iteration": 4.07058048248291 }, { "auxiliary_loss_clip": 0.01148535, "auxiliary_loss_mlp": 0.01043471, "balance_loss_clip": 1.02489841, "balance_loss_mlp": 1.04415429, "epoch": 0.1560799639260484, "flos": 18369278173440.0, "grad_norm": 2.5572386804843914, "language_loss": 0.86294258, "learning_rate": 3.7646322462887927e-06, "loss": 0.88486266, "num_input_tokens_seen": 56373195, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.046875, "step": 2596, "time_per_iteration": 2.6376633644104004 }, { "auxiliary_loss_clip": 0.01157443, "auxiliary_loss_mlp": 0.01044716, "balance_loss_clip": 1.02638173, "balance_loss_mlp": 1.04748356, "epoch": 0.15614008717871636, "flos": 22598980197120.0, "grad_norm": 1.7323847689231728, "language_loss": 0.68412209, "learning_rate": 3.764454389979822e-06, "loss": 0.70614362, "num_input_tokens_seen": 56391525, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0078125, "step": 2597, "time_per_iteration": 2.577753782272339 }, { "auxiliary_loss_clip": 0.01164158, "auxiliary_loss_mlp": 0.01042015, "balance_loss_clip": 1.02408648, "balance_loss_mlp": 1.04631293, "epoch": 0.15620021043138435, "flos": 22746860490240.0, "grad_norm": 1.7660545751165289, "language_loss": 0.79425216, "learning_rate": 3.7642764707020134e-06, "loss": 0.81631386, "num_input_tokens_seen": 56410715, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.99609375, "step": 2598, "time_per_iteration": 4.861827373504639 }, { "auxiliary_loss_clip": 0.01161298, "auxiliary_loss_mlp": 0.01283398, "balance_loss_clip": 1.0169301, "balance_loss_mlp": 1.04322016, "epoch": 0.15626033368405232, "flos": 13114936782720.0, "grad_norm": 2.3532708613561413, "language_loss": 0.82398188, "learning_rate": 3.764098488461716e-06, "loss": 0.84842885, "num_input_tokens_seen": 56429170, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0, "step": 2599, "time_per_iteration": 2.690797805786133 }, { "auxiliary_loss_clip": 0.01158388, "auxiliary_loss_mlp": 0.01053938, "balance_loss_clip": 1.03318369, "balance_loss_mlp": 1.04837799, "epoch": 0.15632045693672028, "flos": 16472297105280.0, "grad_norm": 2.3037441367808906, "language_loss": 0.81827241, "learning_rate": 3.7639204432652808e-06, "loss": 0.84039569, "num_input_tokens_seen": 56445685, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1015625, "step": 2600, "time_per_iteration": 2.524959087371826 }, { "auxiliary_loss_clip": 0.01154031, "auxiliary_loss_mlp": 0.01293982, "balance_loss_clip": 1.02667642, "balance_loss_mlp": 1.04881811, "epoch": 0.15638058018938825, "flos": 20850346298880.0, "grad_norm": 2.158849324867031, "language_loss": 0.88511884, "learning_rate": 3.7637423351190628e-06, "loss": 0.90959901, "num_input_tokens_seen": 56465900, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0546875, "step": 2601, "time_per_iteration": 2.679452896118164 }, { "auxiliary_loss_clip": 0.01156787, "auxiliary_loss_mlp": 0.01063677, "balance_loss_clip": 1.04195678, "balance_loss_mlp": 1.04949868, "epoch": 0.1564407034420562, "flos": 21872220318720.0, "grad_norm": 1.8380507885417574, "language_loss": 0.78013772, "learning_rate": 3.7635641640294177e-06, "loss": 0.8023423, "num_input_tokens_seen": 56485020, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.078125, "step": 2602, "time_per_iteration": 2.6521120071411133 }, { "auxiliary_loss_clip": 0.01178722, "auxiliary_loss_mlp": 0.01041688, "balance_loss_clip": 1.02273357, "balance_loss_mlp": 1.04833484, "epoch": 0.15650082669472418, "flos": 21834549930240.0, "grad_norm": 1.8750956729561965, "language_loss": 0.73729205, "learning_rate": 3.7633859300027036e-06, "loss": 0.75949621, "num_input_tokens_seen": 56505205, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.03125, "step": 2603, "time_per_iteration": 2.6951372623443604 }, { "auxiliary_loss_clip": 0.01162513, "auxiliary_loss_mlp": 0.01047852, "balance_loss_clip": 1.02945781, "balance_loss_mlp": 1.04875398, "epoch": 0.15656094994739214, "flos": 13800542653440.0, "grad_norm": 6.061202918207396, "language_loss": 0.87180507, "learning_rate": 3.7632076330452823e-06, "loss": 0.8939088, "num_input_tokens_seen": 56521495, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.046875, "step": 2604, "time_per_iteration": 2.5670411586761475 }, { "auxiliary_loss_clip": 0.01162657, "auxiliary_loss_mlp": 0.01044259, "balance_loss_clip": 1.02486372, "balance_loss_mlp": 1.04698205, "epoch": 0.15662107320006013, "flos": 27308197808640.0, "grad_norm": 2.7828718642623285, "language_loss": 0.85194361, "learning_rate": 3.7630292731635155e-06, "loss": 0.87401277, "num_input_tokens_seen": 56540665, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0625, "step": 2605, "time_per_iteration": 2.6615302562713623 }, { "auxiliary_loss_clip": 0.0116461, "auxiliary_loss_mlp": 0.01048341, "balance_loss_clip": 1.02819443, "balance_loss_mlp": 1.04674029, "epoch": 0.1566811964527281, "flos": 26685075646080.0, "grad_norm": 2.2479081188422945, "language_loss": 0.72860283, "learning_rate": 3.762850850363769e-06, "loss": 0.7507323, "num_input_tokens_seen": 56560805, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0859375, "step": 2606, "time_per_iteration": 2.7426986694335938 }, { "auxiliary_loss_clip": 0.01155047, "auxiliary_loss_mlp": 0.01049788, "balance_loss_clip": 1.03226399, "balance_loss_mlp": 1.05087447, "epoch": 0.15674131970539606, "flos": 16103422385280.0, "grad_norm": 2.1245664376190816, "language_loss": 0.76239681, "learning_rate": 3.7626723646524107e-06, "loss": 0.78444523, "num_input_tokens_seen": 56576335, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 1.046875, "step": 2607, "time_per_iteration": 2.656337261199951 }, { "auxiliary_loss_clip": 0.01178879, "auxiliary_loss_mlp": 0.01040806, "balance_loss_clip": 1.02288842, "balance_loss_mlp": 1.0485059, "epoch": 0.15680144295806403, "flos": 19169690889600.0, "grad_norm": 1.917897924934716, "language_loss": 0.81908184, "learning_rate": 3.7624938160358096e-06, "loss": 0.84127867, "num_input_tokens_seen": 56595880, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.03125, "step": 2608, "time_per_iteration": 2.665705919265747 }, { "auxiliary_loss_clip": 0.01172648, "auxiliary_loss_mlp": 0.01047917, "balance_loss_clip": 1.02691197, "balance_loss_mlp": 1.04699659, "epoch": 0.156861566210732, "flos": 20813430096000.0, "grad_norm": 7.021329451750547, "language_loss": 0.72161663, "learning_rate": 3.762315204520338e-06, "loss": 0.74382234, "num_input_tokens_seen": 56615130, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.078125, "step": 2609, "time_per_iteration": 2.656132936477661 }, { "auxiliary_loss_clip": 0.01164746, "auxiliary_loss_mlp": 0.01040644, "balance_loss_clip": 1.02132022, "balance_loss_mlp": 1.0474658, "epoch": 0.15692168946339996, "flos": 20047922421120.0, "grad_norm": 1.9211486474330497, "language_loss": 0.72051114, "learning_rate": 3.7621365301123696e-06, "loss": 0.74256504, "num_input_tokens_seen": 56634005, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.078125, "step": 2610, "time_per_iteration": 2.7673134803771973 }, { "auxiliary_loss_clip": 0.01153691, "auxiliary_loss_mlp": 0.01050921, "balance_loss_clip": 1.03033304, "balance_loss_mlp": 1.04531479, "epoch": 0.15698181271606793, "flos": 21398019943680.0, "grad_norm": 1.9747715291294268, "language_loss": 0.72919077, "learning_rate": 3.7619577928182816e-06, "loss": 0.75123686, "num_input_tokens_seen": 56653480, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.0859375, "step": 2611, "time_per_iteration": 2.6116888523101807 }, { "auxiliary_loss_clip": 0.01163276, "auxiliary_loss_mlp": 0.0104556, "balance_loss_clip": 1.02693915, "balance_loss_mlp": 1.04817951, "epoch": 0.15704193596873592, "flos": 20845785271680.0, "grad_norm": 2.293343128700507, "language_loss": 0.70659256, "learning_rate": 3.7617789926444525e-06, "loss": 0.72868091, "num_input_tokens_seen": 56672270, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0625, "step": 2612, "time_per_iteration": 2.675074815750122 }, { "auxiliary_loss_clip": 0.0118218, "auxiliary_loss_mlp": 0.01054025, "balance_loss_clip": 1.03514218, "balance_loss_mlp": 1.04897594, "epoch": 0.15710205922140388, "flos": 21762908254080.0, "grad_norm": 4.04647301772428, "language_loss": 0.75406301, "learning_rate": 3.761600129597262e-06, "loss": 0.77642506, "num_input_tokens_seen": 56691510, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0546875, "step": 2613, "time_per_iteration": 2.592087984085083 }, { "auxiliary_loss_clip": 0.01178821, "auxiliary_loss_mlp": 0.01057865, "balance_loss_clip": 1.03837419, "balance_loss_mlp": 1.0460906, "epoch": 0.15716218247407185, "flos": 25007760201600.0, "grad_norm": 5.018575743236926, "language_loss": 0.65572792, "learning_rate": 3.761421203683095e-06, "loss": 0.67809474, "num_input_tokens_seen": 56712230, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0546875, "step": 2614, "time_per_iteration": 2.665670871734619 }, { "auxiliary_loss_clip": 0.01167622, "auxiliary_loss_mlp": 0.0104206, "balance_loss_clip": 1.02233124, "balance_loss_mlp": 1.04926109, "epoch": 0.1572223057267398, "flos": 20191780391040.0, "grad_norm": 3.1609606611988945, "language_loss": 0.7504524, "learning_rate": 3.7612422149083362e-06, "loss": 0.77254915, "num_input_tokens_seen": 56727490, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.09375, "step": 2615, "time_per_iteration": 2.6549270153045654 }, { "auxiliary_loss_clip": 0.01161411, "auxiliary_loss_mlp": 0.010484, "balance_loss_clip": 1.03047097, "balance_loss_mlp": 1.04965281, "epoch": 0.15728242897940778, "flos": 20959514709120.0, "grad_norm": 1.8606155058629517, "language_loss": 0.72794557, "learning_rate": 3.761063163279373e-06, "loss": 0.75004369, "num_input_tokens_seen": 56747385, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.03125, "step": 2616, "time_per_iteration": 2.7021429538726807 }, { "auxiliary_loss_clip": 0.0117283, "auxiliary_loss_mlp": 0.01056051, "balance_loss_clip": 1.03677535, "balance_loss_mlp": 1.04835427, "epoch": 0.15734255223207574, "flos": 23038275530880.0, "grad_norm": 1.9449390624618228, "language_loss": 0.72306162, "learning_rate": 3.7608840488025955e-06, "loss": 0.74535048, "num_input_tokens_seen": 56768055, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0625, "step": 2617, "time_per_iteration": 2.6143813133239746 }, { "auxiliary_loss_clip": 0.01159149, "auxiliary_loss_mlp": 0.01045035, "balance_loss_clip": 1.02636731, "balance_loss_mlp": 1.04686522, "epoch": 0.15740267548474374, "flos": 20551281661440.0, "grad_norm": 3.0990039984279454, "language_loss": 0.74497449, "learning_rate": 3.760704871484396e-06, "loss": 0.76701629, "num_input_tokens_seen": 56785110, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.03125, "step": 2618, "time_per_iteration": 2.625060796737671 }, { "auxiliary_loss_clip": 0.01157711, "auxiliary_loss_mlp": 0.01051478, "balance_loss_clip": 1.03086638, "balance_loss_mlp": 1.04863787, "epoch": 0.1574627987374117, "flos": 22666922772480.0, "grad_norm": 2.107574648100102, "language_loss": 0.7850731, "learning_rate": 3.7605256313311684e-06, "loss": 0.80716503, "num_input_tokens_seen": 56804975, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.09375, "step": 2619, "time_per_iteration": 2.692397356033325 }, { "auxiliary_loss_clip": 0.01170627, "auxiliary_loss_mlp": 0.01054653, "balance_loss_clip": 1.03598464, "balance_loss_mlp": 1.0493741, "epoch": 0.15752292199007967, "flos": 16800664262400.0, "grad_norm": 1.9465583347262343, "language_loss": 0.76728487, "learning_rate": 3.7603463283493093e-06, "loss": 0.78953761, "num_input_tokens_seen": 56822470, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.03125, "step": 2620, "time_per_iteration": 2.6792151927948 }, { "auxiliary_loss_clip": 0.01177743, "auxiliary_loss_mlp": 0.01052647, "balance_loss_clip": 1.03210723, "balance_loss_mlp": 1.05122912, "epoch": 0.15758304524274763, "flos": 29826002568960.0, "grad_norm": 1.9815195896249576, "language_loss": 0.70677906, "learning_rate": 3.760166962545219e-06, "loss": 0.72908294, "num_input_tokens_seen": 56842100, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0859375, "step": 2621, "time_per_iteration": 2.7206809520721436 }, { "auxiliary_loss_clip": 0.01194806, "auxiliary_loss_mlp": 0.01049999, "balance_loss_clip": 1.02948272, "balance_loss_mlp": 1.05070686, "epoch": 0.1576431684954156, "flos": 53577426723840.0, "grad_norm": 2.063171219372862, "language_loss": 0.71853513, "learning_rate": 3.7599875339252962e-06, "loss": 0.74098319, "num_input_tokens_seen": 56865920, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.078125, "step": 2622, "time_per_iteration": 2.8954594135284424 }, { "auxiliary_loss_clip": 0.01190144, "auxiliary_loss_mlp": 0.01048423, "balance_loss_clip": 1.03024364, "balance_loss_mlp": 1.04883718, "epoch": 0.15770329174808356, "flos": 20813609664000.0, "grad_norm": 1.6734896025314225, "language_loss": 0.87535864, "learning_rate": 3.759808042495947e-06, "loss": 0.8977443, "num_input_tokens_seen": 56885265, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.046875, "step": 2623, "time_per_iteration": 2.6730878353118896 }, { "auxiliary_loss_clip": 0.01153968, "auxiliary_loss_mlp": 0.01051088, "balance_loss_clip": 1.03278995, "balance_loss_mlp": 1.04928589, "epoch": 0.15776341500075153, "flos": 24974004395520.0, "grad_norm": 1.8886651378099206, "language_loss": 0.81620473, "learning_rate": 3.7596284882635746e-06, "loss": 0.83825529, "num_input_tokens_seen": 56906710, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.046875, "step": 2624, "time_per_iteration": 2.6161441802978516 }, { "auxiliary_loss_clip": 0.01181539, "auxiliary_loss_mlp": 0.01048235, "balance_loss_clip": 1.0280056, "balance_loss_mlp": 1.04741442, "epoch": 0.15782353825341952, "flos": 21907915459200.0, "grad_norm": 5.5399352534904445, "language_loss": 0.79494447, "learning_rate": 3.7594488712345878e-06, "loss": 0.81724226, "num_input_tokens_seen": 56924275, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.0703125, "step": 2625, "time_per_iteration": 2.7613770961761475 }, { "auxiliary_loss_clip": 0.01153635, "auxiliary_loss_mlp": 0.01045465, "balance_loss_clip": 1.02758408, "balance_loss_mlp": 1.04967952, "epoch": 0.15788366150608749, "flos": 26177191292160.0, "grad_norm": 2.231301334990447, "language_loss": 0.79966462, "learning_rate": 3.7592691914153967e-06, "loss": 0.82165557, "num_input_tokens_seen": 56941525, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 1.0390625, "step": 2626, "time_per_iteration": 2.6929731369018555 }, { "auxiliary_loss_clip": 0.01191953, "auxiliary_loss_mlp": 0.01039329, "balance_loss_clip": 1.01971865, "balance_loss_mlp": 1.05217862, "epoch": 0.15794378475875545, "flos": 27709822753920.0, "grad_norm": 1.7608814771790569, "language_loss": 0.73996115, "learning_rate": 3.7590894488124134e-06, "loss": 0.76227391, "num_input_tokens_seen": 56962145, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.03125, "step": 2627, "time_per_iteration": 2.7608232498168945 }, { "auxiliary_loss_clip": 0.01183528, "auxiliary_loss_mlp": 0.01047731, "balance_loss_clip": 1.02857399, "balance_loss_mlp": 1.05139947, "epoch": 0.15800390801142342, "flos": 12130158533760.0, "grad_norm": 9.880000876473918, "language_loss": 0.85156333, "learning_rate": 3.7589096434320534e-06, "loss": 0.87387586, "num_input_tokens_seen": 56977505, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.046875, "step": 2628, "time_per_iteration": 2.582740545272827 }, { "auxiliary_loss_clip": 0.01169944, "auxiliary_loss_mlp": 0.01038444, "balance_loss_clip": 1.02120602, "balance_loss_mlp": 1.04879773, "epoch": 0.15806403126409138, "flos": 20704728562560.0, "grad_norm": 1.8889554354274507, "language_loss": 0.76644653, "learning_rate": 3.7587297752807315e-06, "loss": 0.78853035, "num_input_tokens_seen": 56996770, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 1.03125, "step": 2629, "time_per_iteration": 2.6593706607818604 }, { "auxiliary_loss_clip": 0.01165735, "auxiliary_loss_mlp": 0.01046463, "balance_loss_clip": 1.02682948, "balance_loss_mlp": 1.05034471, "epoch": 0.15812415451675935, "flos": 17821712269440.0, "grad_norm": 2.5205054971116705, "language_loss": 0.73766661, "learning_rate": 3.758549844364869e-06, "loss": 0.75978857, "num_input_tokens_seen": 57014970, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0625, "step": 2630, "time_per_iteration": 2.729804754257202 }, { "auxiliary_loss_clip": 0.01167738, "auxiliary_loss_mlp": 0.0104389, "balance_loss_clip": 1.02432823, "balance_loss_mlp": 1.05025363, "epoch": 0.15818427776942734, "flos": 20084048524800.0, "grad_norm": 2.8910850631834593, "language_loss": 0.83612448, "learning_rate": 3.7583698506908854e-06, "loss": 0.85824078, "num_input_tokens_seen": 57034045, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0859375, "step": 2631, "time_per_iteration": 2.7280147075653076 }, { "auxiliary_loss_clip": 0.01154112, "auxiliary_loss_mlp": 0.01043236, "balance_loss_clip": 1.02485371, "balance_loss_mlp": 1.04817533, "epoch": 0.1582444010220953, "flos": 21214911386880.0, "grad_norm": 1.8219929161097461, "language_loss": 0.78511512, "learning_rate": 3.7581897942652046e-06, "loss": 0.80708861, "num_input_tokens_seen": 57053695, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0625, "step": 2632, "time_per_iteration": 2.8345768451690674 }, { "auxiliary_loss_clip": 0.01166537, "auxiliary_loss_mlp": 0.01054561, "balance_loss_clip": 1.03472447, "balance_loss_mlp": 1.05037975, "epoch": 0.15830452427476327, "flos": 17858341163520.0, "grad_norm": 2.7985954486490163, "language_loss": 0.83186865, "learning_rate": 3.7580096750942535e-06, "loss": 0.8540796, "num_input_tokens_seen": 57071290, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0703125, "step": 2633, "time_per_iteration": 2.6507115364074707 }, { "auxiliary_loss_clip": 0.01158319, "auxiliary_loss_mlp": 0.01045457, "balance_loss_clip": 1.02603769, "balance_loss_mlp": 1.05036688, "epoch": 0.15836464752743123, "flos": 24534960456960.0, "grad_norm": 1.617597317640904, "language_loss": 0.77544594, "learning_rate": 3.7578294931844584e-06, "loss": 0.79748368, "num_input_tokens_seen": 57091465, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.078125, "step": 2634, "time_per_iteration": 2.6431591510772705 }, { "auxiliary_loss_clip": 0.01186497, "auxiliary_loss_mlp": 0.01292515, "balance_loss_clip": 1.02463317, "balance_loss_mlp": 1.05025649, "epoch": 0.1584247707800992, "flos": 20120821073280.0, "grad_norm": 3.050854976056668, "language_loss": 0.88468885, "learning_rate": 3.757649248542251e-06, "loss": 0.90947896, "num_input_tokens_seen": 57110075, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.09375, "step": 2635, "time_per_iteration": 4.102481365203857 }, { "auxiliary_loss_clip": 0.01201582, "auxiliary_loss_mlp": 0.01052031, "balance_loss_clip": 1.03165781, "balance_loss_mlp": 1.0485177, "epoch": 0.15848489403276717, "flos": 20375966355840.0, "grad_norm": 2.79344197762799, "language_loss": 0.75736964, "learning_rate": 3.757468941174063e-06, "loss": 0.77990574, "num_input_tokens_seen": 57128945, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.078125, "step": 2636, "time_per_iteration": 5.534560918807983 }, { "auxiliary_loss_clip": 0.01189476, "auxiliary_loss_mlp": 0.01048132, "balance_loss_clip": 1.02747321, "balance_loss_mlp": 1.05273569, "epoch": 0.15854501728543513, "flos": 39346890359040.0, "grad_norm": 2.2144506980755687, "language_loss": 0.72290206, "learning_rate": 3.7572885710863293e-06, "loss": 0.74527824, "num_input_tokens_seen": 57152385, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.1015625, "step": 2637, "time_per_iteration": 2.7280919551849365 }, { "auxiliary_loss_clip": 0.01161486, "auxiliary_loss_mlp": 0.01043676, "balance_loss_clip": 1.0249604, "balance_loss_mlp": 1.04767847, "epoch": 0.15860514053810312, "flos": 24864225454080.0, "grad_norm": 2.094933971808578, "language_loss": 0.77231467, "learning_rate": 3.7571081382854866e-06, "loss": 0.79436636, "num_input_tokens_seen": 57172620, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.046875, "step": 2638, "time_per_iteration": 2.638923406600952 }, { "auxiliary_loss_clip": 0.01182222, "auxiliary_loss_mlp": 0.01056867, "balance_loss_clip": 1.03606486, "balance_loss_mlp": 1.04714084, "epoch": 0.1586652637907711, "flos": 26177694082560.0, "grad_norm": 2.4807300978516325, "language_loss": 0.75233287, "learning_rate": 3.756927642777974e-06, "loss": 0.77472377, "num_input_tokens_seen": 57194680, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.078125, "step": 2639, "time_per_iteration": 4.265498399734497 }, { "auxiliary_loss_clip": 0.01199401, "auxiliary_loss_mlp": 0.01052959, "balance_loss_clip": 1.03349185, "balance_loss_mlp": 1.04648185, "epoch": 0.15872538704343905, "flos": 19792058866560.0, "grad_norm": 2.013404625234407, "language_loss": 0.81061202, "learning_rate": 3.7567470845702337e-06, "loss": 0.8331356, "num_input_tokens_seen": 57214675, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.078125, "step": 2640, "time_per_iteration": 2.6670002937316895 }, { "auxiliary_loss_clip": 0.01168699, "auxiliary_loss_mlp": 0.01048478, "balance_loss_clip": 1.02911878, "balance_loss_mlp": 1.04658175, "epoch": 0.15878551029610702, "flos": 28475366342400.0, "grad_norm": 1.434496132025264, "language_loss": 0.6679976, "learning_rate": 3.756566463668709e-06, "loss": 0.69016939, "num_input_tokens_seen": 57235830, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0390625, "step": 2641, "time_per_iteration": 2.6350083351135254 }, { "auxiliary_loss_clip": 0.01178947, "auxiliary_loss_mlp": 0.01054746, "balance_loss_clip": 1.03513646, "balance_loss_mlp": 1.04889691, "epoch": 0.15884563354877498, "flos": 24206701040640.0, "grad_norm": 2.783491642969679, "language_loss": 0.75269842, "learning_rate": 3.756385780079845e-06, "loss": 0.77503532, "num_input_tokens_seen": 57255970, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.125, "step": 2642, "time_per_iteration": 2.5967774391174316 }, { "auxiliary_loss_clip": 0.0116146, "auxiliary_loss_mlp": 0.01047604, "balance_loss_clip": 1.02899551, "balance_loss_mlp": 1.0489769, "epoch": 0.15890575680144295, "flos": 23949795991680.0, "grad_norm": 2.0290619553398033, "language_loss": 0.70275962, "learning_rate": 3.756205033810091e-06, "loss": 0.72485024, "num_input_tokens_seen": 57274435, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.03125, "step": 2643, "time_per_iteration": 2.673362970352173 }, { "auxiliary_loss_clip": 0.01151343, "auxiliary_loss_mlp": 0.0103954, "balance_loss_clip": 1.02168226, "balance_loss_mlp": 1.04820335, "epoch": 0.15896588005411091, "flos": 21215019127680.0, "grad_norm": 2.6634131855528884, "language_loss": 0.77817452, "learning_rate": 3.7560242248658963e-06, "loss": 0.80008334, "num_input_tokens_seen": 57293115, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 1.03125, "step": 2644, "time_per_iteration": 2.602309465408325 }, { "auxiliary_loss_clip": 0.01161525, "auxiliary_loss_mlp": 0.01046694, "balance_loss_clip": 1.02765656, "balance_loss_mlp": 1.04718423, "epoch": 0.1590260033067789, "flos": 24352390604160.0, "grad_norm": 2.9324678906880357, "language_loss": 0.82046247, "learning_rate": 3.7558433532537145e-06, "loss": 0.84254462, "num_input_tokens_seen": 57312565, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0546875, "step": 2645, "time_per_iteration": 2.7260491847991943 }, { "auxiliary_loss_clip": 0.01153941, "auxiliary_loss_mlp": 0.01049724, "balance_loss_clip": 1.02885079, "balance_loss_mlp": 1.0458374, "epoch": 0.15908612655944687, "flos": 32048944583040.0, "grad_norm": 2.5767791068503154, "language_loss": 0.70081335, "learning_rate": 3.75566241898e-06, "loss": 0.72284997, "num_input_tokens_seen": 57333360, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.078125, "step": 2646, "time_per_iteration": 2.670933246612549 }, { "auxiliary_loss_clip": 0.0116716, "auxiliary_loss_mlp": 0.01042805, "balance_loss_clip": 1.02419603, "balance_loss_mlp": 1.04556942, "epoch": 0.15914624981211484, "flos": 17785370684160.0, "grad_norm": 2.1546411232883385, "language_loss": 0.62220573, "learning_rate": 3.7554814220512095e-06, "loss": 0.64430535, "num_input_tokens_seen": 57350575, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0390625, "step": 2647, "time_per_iteration": 2.6747312545776367 }, { "auxiliary_loss_clip": 0.01153704, "auxiliary_loss_mlp": 0.0104717, "balance_loss_clip": 1.02736962, "balance_loss_mlp": 1.04897714, "epoch": 0.1592063730647828, "flos": 17712507945600.0, "grad_norm": 2.1506566688768327, "language_loss": 0.89437419, "learning_rate": 3.755300362473803e-06, "loss": 0.91638291, "num_input_tokens_seen": 57367570, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.046875, "step": 2648, "time_per_iteration": 2.6036477088928223 }, { "auxiliary_loss_clip": 0.01150258, "auxiliary_loss_mlp": 0.01044823, "balance_loss_clip": 1.02704847, "balance_loss_mlp": 1.04808807, "epoch": 0.15926649631745077, "flos": 18803545603200.0, "grad_norm": 1.8705088122880815, "language_loss": 0.90898728, "learning_rate": 3.7551192402542418e-06, "loss": 0.93093807, "num_input_tokens_seen": 57383980, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.015625, "step": 2649, "time_per_iteration": 2.5745623111724854 }, { "auxiliary_loss_clip": 0.01216623, "auxiliary_loss_mlp": 0.01043339, "balance_loss_clip": 1.02316856, "balance_loss_mlp": 1.04817367, "epoch": 0.15932661957011873, "flos": 17566243764480.0, "grad_norm": 2.3879637513424194, "language_loss": 0.71002316, "learning_rate": 3.7549380553989893e-06, "loss": 0.73262274, "num_input_tokens_seen": 57400840, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.140625, "step": 2650, "time_per_iteration": 2.6614978313446045 }, { "auxiliary_loss_clip": 0.01158633, "auxiliary_loss_mlp": 0.01035868, "balance_loss_clip": 1.01922607, "balance_loss_mlp": 1.04699183, "epoch": 0.15938674282278673, "flos": 13334351011200.0, "grad_norm": 1.9902983274119648, "language_loss": 0.71018231, "learning_rate": 3.7547568079145116e-06, "loss": 0.73212725, "num_input_tokens_seen": 57419230, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 1.0234375, "step": 2651, "time_per_iteration": 2.75766658782959 }, { "auxiliary_loss_clip": 0.01183167, "auxiliary_loss_mlp": 0.012908, "balance_loss_clip": 1.02388859, "balance_loss_mlp": 1.04882896, "epoch": 0.1594468660754547, "flos": 22488842119680.0, "grad_norm": 2.2311539025660454, "language_loss": 0.8029449, "learning_rate": 3.754575497807278e-06, "loss": 0.82768452, "num_input_tokens_seen": 57439315, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.078125, "step": 2652, "time_per_iteration": 2.6478536128997803 }, { "auxiliary_loss_clip": 0.01169037, "auxiliary_loss_mlp": 0.0128966, "balance_loss_clip": 1.02277493, "balance_loss_mlp": 1.04868221, "epoch": 0.15950698932812266, "flos": 15007320910080.0, "grad_norm": 3.0644677134506977, "language_loss": 0.69999194, "learning_rate": 3.7543941250837578e-06, "loss": 0.72457892, "num_input_tokens_seen": 57454635, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0234375, "step": 2653, "time_per_iteration": 2.5811610221862793 }, { "auxiliary_loss_clip": 0.0116322, "auxiliary_loss_mlp": 0.01035587, "balance_loss_clip": 1.01654887, "balance_loss_mlp": 1.04871249, "epoch": 0.15956711258079062, "flos": 30155052084480.0, "grad_norm": 4.140239894383424, "language_loss": 0.76152682, "learning_rate": 3.7542126897504235e-06, "loss": 0.78351486, "num_input_tokens_seen": 57476805, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0546875, "step": 2654, "time_per_iteration": 2.5959999561309814 }, { "auxiliary_loss_clip": 0.01178957, "auxiliary_loss_mlp": 0.01042057, "balance_loss_clip": 1.02346063, "balance_loss_mlp": 1.04677534, "epoch": 0.1596272358334586, "flos": 21032700670080.0, "grad_norm": 2.205415330921658, "language_loss": 0.81877279, "learning_rate": 3.754031191813752e-06, "loss": 0.84098297, "num_input_tokens_seen": 57496400, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0546875, "step": 2655, "time_per_iteration": 2.559413433074951 }, { "auxiliary_loss_clip": 0.01178438, "auxiliary_loss_mlp": 0.01045153, "balance_loss_clip": 1.02717662, "balance_loss_mlp": 1.04602432, "epoch": 0.15968735908612655, "flos": 15268032800640.0, "grad_norm": 1.9775022097966433, "language_loss": 0.73200989, "learning_rate": 3.753849631280218e-06, "loss": 0.75424576, "num_input_tokens_seen": 57513700, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0546875, "step": 2656, "time_per_iteration": 2.535698652267456 }, { "auxiliary_loss_clip": 0.01174496, "auxiliary_loss_mlp": 0.01040412, "balance_loss_clip": 1.02214944, "balance_loss_mlp": 1.04582143, "epoch": 0.15974748233879452, "flos": 52665726695040.0, "grad_norm": 2.195427882848419, "language_loss": 0.77918792, "learning_rate": 3.7536680081563023e-06, "loss": 0.801337, "num_input_tokens_seen": 57536180, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.015625, "step": 2657, "time_per_iteration": 2.8664095401763916 }, { "auxiliary_loss_clip": 0.01161176, "auxiliary_loss_mlp": 0.01046124, "balance_loss_clip": 1.02769423, "balance_loss_mlp": 1.04867315, "epoch": 0.1598076055914625, "flos": 18733232730240.0, "grad_norm": 1.9688406917052523, "language_loss": 0.74139065, "learning_rate": 3.753486322448487e-06, "loss": 0.76346362, "num_input_tokens_seen": 57555025, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.03125, "step": 2658, "time_per_iteration": 2.573810338973999 }, { "auxiliary_loss_clip": 0.01179851, "auxiliary_loss_mlp": 0.01043653, "balance_loss_clip": 1.0227561, "balance_loss_mlp": 1.04682374, "epoch": 0.15986772884413047, "flos": 34349238535680.0, "grad_norm": 2.648178126875374, "language_loss": 0.74918848, "learning_rate": 3.753304574163255e-06, "loss": 0.77142352, "num_input_tokens_seen": 57577660, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.0625, "step": 2659, "time_per_iteration": 2.6535375118255615 }, { "auxiliary_loss_clip": 0.01183184, "auxiliary_loss_mlp": 0.01049635, "balance_loss_clip": 1.03006041, "balance_loss_mlp": 1.04933774, "epoch": 0.15992785209679844, "flos": 22054969739520.0, "grad_norm": 2.3487469309614752, "language_loss": 0.9071101, "learning_rate": 3.7531227633070924e-06, "loss": 0.92943823, "num_input_tokens_seen": 57596335, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0625, "step": 2660, "time_per_iteration": 2.587710380554199 }, { "auxiliary_loss_clip": 0.0119363, "auxiliary_loss_mlp": 0.01044762, "balance_loss_clip": 1.02626085, "balance_loss_mlp": 1.05098283, "epoch": 0.1599879753494664, "flos": 33066652625280.0, "grad_norm": 1.7704577138160107, "language_loss": 0.78058314, "learning_rate": 3.7529408898864887e-06, "loss": 0.80296701, "num_input_tokens_seen": 57616830, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0625, "step": 2661, "time_per_iteration": 2.755845308303833 }, { "auxiliary_loss_clip": 0.0117458, "auxiliary_loss_mlp": 0.01291147, "balance_loss_clip": 1.02377081, "balance_loss_mlp": 1.04866016, "epoch": 0.16004809860213437, "flos": 28038010343040.0, "grad_norm": 1.9474869509221455, "language_loss": 0.74190682, "learning_rate": 3.752758953907933e-06, "loss": 0.76656407, "num_input_tokens_seen": 57635515, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.078125, "step": 2662, "time_per_iteration": 2.7889034748077393 }, { "auxiliary_loss_clip": 0.01173401, "auxiliary_loss_mlp": 0.01297661, "balance_loss_clip": 1.02988589, "balance_loss_mlp": 1.04793215, "epoch": 0.16010822185480234, "flos": 22780113505920.0, "grad_norm": 1.8747624709559625, "language_loss": 0.81731266, "learning_rate": 3.7525769553779192e-06, "loss": 0.84202331, "num_input_tokens_seen": 57654250, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.078125, "step": 2663, "time_per_iteration": 2.5663704872131348 }, { "auxiliary_loss_clip": 0.01194879, "auxiliary_loss_mlp": 0.0104285, "balance_loss_clip": 1.02406216, "balance_loss_mlp": 1.05111289, "epoch": 0.16016834510747033, "flos": 20084012611200.0, "grad_norm": 2.7166126490979723, "language_loss": 0.79377925, "learning_rate": 3.7523948943029424e-06, "loss": 0.81615651, "num_input_tokens_seen": 57672645, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0703125, "step": 2664, "time_per_iteration": 2.6621670722961426 }, { "auxiliary_loss_clip": 0.01160843, "auxiliary_loss_mlp": 0.01046918, "balance_loss_clip": 1.02792835, "balance_loss_mlp": 1.04607701, "epoch": 0.1602284683601383, "flos": 21173829206400.0, "grad_norm": 1.919972503089957, "language_loss": 0.93804067, "learning_rate": 3.752212770689499e-06, "loss": 0.96011829, "num_input_tokens_seen": 57691055, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0546875, "step": 2665, "time_per_iteration": 2.5361509323120117 }, { "auxiliary_loss_clip": 0.01179162, "auxiliary_loss_mlp": 0.01042959, "balance_loss_clip": 1.02388501, "balance_loss_mlp": 1.04642153, "epoch": 0.16028859161280626, "flos": 14647568244480.0, "grad_norm": 2.5304867985429813, "language_loss": 0.8483212, "learning_rate": 3.752030584544089e-06, "loss": 0.87054241, "num_input_tokens_seen": 57707235, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0546875, "step": 2666, "time_per_iteration": 2.579314708709717 }, { "auxiliary_loss_clip": 0.01170881, "auxiliary_loss_mlp": 0.01288883, "balance_loss_clip": 1.02294898, "balance_loss_mlp": 1.04828763, "epoch": 0.16034871486547422, "flos": 20990325600000.0, "grad_norm": 2.144694847043041, "language_loss": 0.81001234, "learning_rate": 3.7518483358732142e-06, "loss": 0.83460999, "num_input_tokens_seen": 57724190, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.046875, "step": 2667, "time_per_iteration": 2.655534029006958 }, { "auxiliary_loss_clip": 0.01153402, "auxiliary_loss_mlp": 0.01043119, "balance_loss_clip": 1.02291322, "balance_loss_mlp": 1.0502255, "epoch": 0.1604088381181422, "flos": 21397732634880.0, "grad_norm": 2.2193447783484443, "language_loss": 0.73338461, "learning_rate": 3.751666024683379e-06, "loss": 0.75534987, "num_input_tokens_seen": 57743620, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.03125, "step": 2668, "time_per_iteration": 2.728907823562622 }, { "auxiliary_loss_clip": 0.01162939, "auxiliary_loss_mlp": 0.01047349, "balance_loss_clip": 1.02828717, "balance_loss_mlp": 1.04852986, "epoch": 0.16046896137081015, "flos": 23877040993920.0, "grad_norm": 2.145521034279159, "language_loss": 0.77479464, "learning_rate": 3.751483650981089e-06, "loss": 0.79689753, "num_input_tokens_seen": 57764810, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0546875, "step": 2669, "time_per_iteration": 2.5534281730651855 }, { "auxiliary_loss_clip": 0.01086838, "auxiliary_loss_mlp": 0.01007424, "balance_loss_clip": 1.00502765, "balance_loss_mlp": 1.03569341, "epoch": 0.16052908462347812, "flos": 59806709015040.0, "grad_norm": 0.7990854658424568, "language_loss": 0.55538619, "learning_rate": 3.7513012147728527e-06, "loss": 0.57632881, "num_input_tokens_seen": 57824390, "router_z_loss_clip": 0.02392578, "router_z_loss_mlp": 0.421875, "step": 2670, "time_per_iteration": 3.0813822746276855 }, { "auxiliary_loss_clip": 0.01162061, "auxiliary_loss_mlp": 0.01041705, "balance_loss_clip": 1.02335858, "balance_loss_mlp": 1.04675615, "epoch": 0.1605892078761461, "flos": 18296559089280.0, "grad_norm": 2.2357244854896963, "language_loss": 0.77635568, "learning_rate": 3.751118716065181e-06, "loss": 0.79839337, "num_input_tokens_seen": 57843665, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0625, "step": 2671, "time_per_iteration": 2.534454107284546 }, { "auxiliary_loss_clip": 0.01165116, "auxiliary_loss_mlp": 0.01040256, "balance_loss_clip": 1.0221839, "balance_loss_mlp": 1.0508002, "epoch": 0.16064933112881408, "flos": 32160734686080.0, "grad_norm": 2.18935887760651, "language_loss": 0.644445, "learning_rate": 3.750936154864587e-06, "loss": 0.66649878, "num_input_tokens_seen": 57863305, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.046875, "step": 2672, "time_per_iteration": 2.668253183364868 }, { "auxiliary_loss_clip": 0.01164101, "auxiliary_loss_mlp": 0.01041602, "balance_loss_clip": 1.02088368, "balance_loss_mlp": 1.04764795, "epoch": 0.16070945438148204, "flos": 19828795501440.0, "grad_norm": 2.040108131983194, "language_loss": 0.85344666, "learning_rate": 3.750753531177586e-06, "loss": 0.87550372, "num_input_tokens_seen": 57883025, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0703125, "step": 2673, "time_per_iteration": 2.535233974456787 }, { "auxiliary_loss_clip": 0.01181496, "auxiliary_loss_mlp": 0.01047403, "balance_loss_clip": 1.02897358, "balance_loss_mlp": 1.04984856, "epoch": 0.16076957763415, "flos": 18913144976640.0, "grad_norm": 2.515841919721811, "language_loss": 0.72379196, "learning_rate": 3.750570845010694e-06, "loss": 0.74608094, "num_input_tokens_seen": 57901430, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.046875, "step": 2674, "time_per_iteration": 2.5479536056518555 }, { "auxiliary_loss_clip": 0.01159425, "auxiliary_loss_mlp": 0.01045628, "balance_loss_clip": 1.0255177, "balance_loss_mlp": 1.04565036, "epoch": 0.16082970088681797, "flos": 16764358590720.0, "grad_norm": 1.9134639321932552, "language_loss": 0.8433364, "learning_rate": 3.7503880963704314e-06, "loss": 0.8653869, "num_input_tokens_seen": 57919550, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.046875, "step": 2675, "time_per_iteration": 2.5507259368896484 }, { "auxiliary_loss_clip": 0.01183907, "auxiliary_loss_mlp": 0.01044166, "balance_loss_clip": 1.02436495, "balance_loss_mlp": 1.05199885, "epoch": 0.16088982413948594, "flos": 35150261783040.0, "grad_norm": 2.0910986392822775, "language_loss": 0.82350034, "learning_rate": 3.7502052852633206e-06, "loss": 0.84578109, "num_input_tokens_seen": 57939890, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.046875, "step": 2676, "time_per_iteration": 2.6692333221435547 }, { "auxiliary_loss_clip": 0.01161721, "auxiliary_loss_mlp": 0.0104611, "balance_loss_clip": 1.02923024, "balance_loss_mlp": 1.05059755, "epoch": 0.1609499473921539, "flos": 18625105814400.0, "grad_norm": 1.9060922614409594, "language_loss": 0.73227692, "learning_rate": 3.7500224116958856e-06, "loss": 0.75435519, "num_input_tokens_seen": 57957410, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 1.015625, "step": 2677, "time_per_iteration": 6.908281087875366 }, { "auxiliary_loss_clip": 0.01170139, "auxiliary_loss_mlp": 0.01044455, "balance_loss_clip": 1.02608478, "balance_loss_mlp": 1.04893339, "epoch": 0.1610100706448219, "flos": 33145728416640.0, "grad_norm": 1.8781946946742878, "language_loss": 0.76295471, "learning_rate": 3.7498394756746522e-06, "loss": 0.78510058, "num_input_tokens_seen": 57977900, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.03125, "step": 2678, "time_per_iteration": 2.703416347503662 }, { "auxiliary_loss_clip": 0.01162795, "auxiliary_loss_mlp": 0.01041912, "balance_loss_clip": 1.02250481, "balance_loss_mlp": 1.04811943, "epoch": 0.16107019389748986, "flos": 34676707852800.0, "grad_norm": 2.025328633762839, "language_loss": 0.7054612, "learning_rate": 3.749656477206149e-06, "loss": 0.72750831, "num_input_tokens_seen": 57998210, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0546875, "step": 2679, "time_per_iteration": 2.62248158454895 }, { "auxiliary_loss_clip": 0.01101774, "auxiliary_loss_mlp": 0.01256244, "balance_loss_clip": 1.0058068, "balance_loss_mlp": 1.03527021, "epoch": 0.16113031715015783, "flos": 65713403260800.0, "grad_norm": 0.7796633116974574, "language_loss": 0.51780248, "learning_rate": 3.749473416296906e-06, "loss": 0.54138267, "num_input_tokens_seen": 58059420, "router_z_loss_clip": 0.02526855, "router_z_loss_mlp": 0.39453125, "step": 2680, "time_per_iteration": 3.2321949005126953 }, { "auxiliary_loss_clip": 0.01172367, "auxiliary_loss_mlp": 0.01048191, "balance_loss_clip": 1.02814054, "balance_loss_mlp": 1.04768014, "epoch": 0.1611904404028258, "flos": 20810413353600.0, "grad_norm": 2.145948996270436, "language_loss": 0.8004117, "learning_rate": 3.749290292953458e-06, "loss": 0.82261729, "num_input_tokens_seen": 58078370, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0625, "step": 2681, "time_per_iteration": 3.927126884460449 }, { "auxiliary_loss_clip": 0.01170694, "auxiliary_loss_mlp": 0.01056454, "balance_loss_clip": 1.03846538, "balance_loss_mlp": 1.05051529, "epoch": 0.16125056365549376, "flos": 27013335062400.0, "grad_norm": 2.09987011524544, "language_loss": 0.68925643, "learning_rate": 3.749107107182339e-06, "loss": 0.71152794, "num_input_tokens_seen": 58097395, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.015625, "step": 2682, "time_per_iteration": 2.545877695083618 }, { "auxiliary_loss_clip": 0.01175961, "auxiliary_loss_mlp": 0.01051387, "balance_loss_clip": 1.03157413, "balance_loss_mlp": 1.05293322, "epoch": 0.16131068690816172, "flos": 19276524915840.0, "grad_norm": 2.332258291713377, "language_loss": 0.87236404, "learning_rate": 3.7489238589900855e-06, "loss": 0.89463753, "num_input_tokens_seen": 58115630, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.046875, "step": 2683, "time_per_iteration": 2.5215659141540527 }, { "auxiliary_loss_clip": 0.0115792, "auxiliary_loss_mlp": 0.01061721, "balance_loss_clip": 1.04201531, "balance_loss_mlp": 1.05210209, "epoch": 0.16137081016082971, "flos": 35337931367040.0, "grad_norm": 1.8869758983494749, "language_loss": 0.73834258, "learning_rate": 3.7487405483832395e-06, "loss": 0.76053905, "num_input_tokens_seen": 58138655, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0546875, "step": 2684, "time_per_iteration": 2.6617512702941895 }, { "auxiliary_loss_clip": 0.01178036, "auxiliary_loss_mlp": 0.0130014, "balance_loss_clip": 1.03199899, "balance_loss_mlp": 1.05333591, "epoch": 0.16143093341349768, "flos": 34235257703040.0, "grad_norm": 1.953396868055617, "language_loss": 0.70421386, "learning_rate": 3.748557175368341e-06, "loss": 0.72899562, "num_input_tokens_seen": 58157440, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0625, "step": 2685, "time_per_iteration": 2.612389087677002 }, { "auxiliary_loss_clip": 0.01179948, "auxiliary_loss_mlp": 0.01053801, "balance_loss_clip": 1.03464389, "balance_loss_mlp": 1.05074251, "epoch": 0.16149105666616564, "flos": 27999262546560.0, "grad_norm": 1.8562672888785707, "language_loss": 0.72083002, "learning_rate": 3.748373739951935e-06, "loss": 0.74316752, "num_input_tokens_seen": 58176660, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0234375, "step": 2686, "time_per_iteration": 2.6077775955200195 }, { "auxiliary_loss_clip": 0.01165868, "auxiliary_loss_mlp": 0.0105561, "balance_loss_clip": 1.03601241, "balance_loss_mlp": 1.05151999, "epoch": 0.1615511799188336, "flos": 19422214479360.0, "grad_norm": 4.046327516649197, "language_loss": 0.81437588, "learning_rate": 3.7481902421405676e-06, "loss": 0.83659071, "num_input_tokens_seen": 58195085, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0546875, "step": 2687, "time_per_iteration": 2.5462214946746826 }, { "auxiliary_loss_clip": 0.01166774, "auxiliary_loss_mlp": 0.01053951, "balance_loss_clip": 1.03397107, "balance_loss_mlp": 1.04799581, "epoch": 0.16161130317150157, "flos": 22854915578880.0, "grad_norm": 2.124101447942268, "language_loss": 0.71508014, "learning_rate": 3.7480066819407876e-06, "loss": 0.7372874, "num_input_tokens_seen": 58213540, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1015625, "step": 2688, "time_per_iteration": 2.637274980545044 }, { "auxiliary_loss_clip": 0.01180819, "auxiliary_loss_mlp": 0.01048718, "balance_loss_clip": 1.03052664, "balance_loss_mlp": 1.04928374, "epoch": 0.16167142642416954, "flos": 26110577520000.0, "grad_norm": 4.0996724701064196, "language_loss": 0.76032412, "learning_rate": 3.7478230593591448e-06, "loss": 0.78261948, "num_input_tokens_seen": 58236995, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.046875, "step": 2689, "time_per_iteration": 2.625087261199951 }, { "auxiliary_loss_clip": 0.01156972, "auxiliary_loss_mlp": 0.01049259, "balance_loss_clip": 1.02974451, "balance_loss_mlp": 1.0530982, "epoch": 0.1617315496768375, "flos": 22779646629120.0, "grad_norm": 2.1566278168802624, "language_loss": 0.87451047, "learning_rate": 3.747639374402193e-06, "loss": 0.89657277, "num_input_tokens_seen": 58257230, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0390625, "step": 2690, "time_per_iteration": 2.583301305770874 }, { "auxiliary_loss_clip": 0.0116384, "auxiliary_loss_mlp": 0.01051538, "balance_loss_clip": 1.03377604, "balance_loss_mlp": 1.0511291, "epoch": 0.1617916729295055, "flos": 22017299351040.0, "grad_norm": 1.9478076709289731, "language_loss": 0.88228649, "learning_rate": 3.7474556270764877e-06, "loss": 0.90444028, "num_input_tokens_seen": 58277080, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.03125, "step": 2691, "time_per_iteration": 2.6108345985412598 }, { "auxiliary_loss_clip": 0.01169173, "auxiliary_loss_mlp": 0.01051547, "balance_loss_clip": 1.03067374, "balance_loss_mlp": 1.05003476, "epoch": 0.16185179618217346, "flos": 23438248450560.0, "grad_norm": 2.55997492907445, "language_loss": 0.81700718, "learning_rate": 3.7472718173885864e-06, "loss": 0.83921444, "num_input_tokens_seen": 58294815, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1015625, "step": 2692, "time_per_iteration": 2.591705083847046 }, { "auxiliary_loss_clip": 0.01160361, "auxiliary_loss_mlp": 0.01049922, "balance_loss_clip": 1.0292629, "balance_loss_mlp": 1.05097055, "epoch": 0.16191191943484143, "flos": 25666110627840.0, "grad_norm": 2.425819540297306, "language_loss": 0.80892652, "learning_rate": 3.747087945345048e-06, "loss": 0.8310293, "num_input_tokens_seen": 58313215, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.09375, "step": 2693, "time_per_iteration": 2.5269370079040527 }, { "auxiliary_loss_clip": 0.01152795, "auxiliary_loss_mlp": 0.01053069, "balance_loss_clip": 1.03451991, "balance_loss_mlp": 1.05115569, "epoch": 0.1619720426875094, "flos": 23477355383040.0, "grad_norm": 1.715687348266621, "language_loss": 0.83725625, "learning_rate": 3.746904010952435e-06, "loss": 0.85931492, "num_input_tokens_seen": 58333215, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.015625, "step": 2694, "time_per_iteration": 2.581895112991333 }, { "auxiliary_loss_clip": 0.01187401, "auxiliary_loss_mlp": 0.01052421, "balance_loss_clip": 1.03195298, "balance_loss_mlp": 1.05110884, "epoch": 0.16203216594017736, "flos": 24133658734080.0, "grad_norm": 2.475615241685549, "language_loss": 0.69159806, "learning_rate": 3.7467200142173114e-06, "loss": 0.71399629, "num_input_tokens_seen": 58351160, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.09375, "step": 2695, "time_per_iteration": 2.558290719985962 }, { "auxiliary_loss_clip": 0.01173123, "auxiliary_loss_mlp": 0.01056687, "balance_loss_clip": 1.03574157, "balance_loss_mlp": 1.05625176, "epoch": 0.16209228919284532, "flos": 22340889999360.0, "grad_norm": 2.1092500915121, "language_loss": 0.82653403, "learning_rate": 3.7465359551462438e-06, "loss": 0.84883213, "num_input_tokens_seen": 58368505, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.078125, "step": 2696, "time_per_iteration": 2.7061774730682373 }, { "auxiliary_loss_clip": 0.01164362, "auxiliary_loss_mlp": 0.01058214, "balance_loss_clip": 1.03651786, "balance_loss_mlp": 1.05211627, "epoch": 0.1621524124455133, "flos": 15815131827840.0, "grad_norm": 2.062515869935404, "language_loss": 0.88322926, "learning_rate": 3.7463518337458006e-06, "loss": 0.90545499, "num_input_tokens_seen": 58385085, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.125, "step": 2697, "time_per_iteration": 2.521648645401001 }, { "auxiliary_loss_clip": 0.01149137, "auxiliary_loss_mlp": 0.01043964, "balance_loss_clip": 1.02551055, "balance_loss_mlp": 1.0485239, "epoch": 0.16221253569818128, "flos": 30186688988160.0, "grad_norm": 1.5313785932823227, "language_loss": 0.80355227, "learning_rate": 3.7461676500225522e-06, "loss": 0.8254832, "num_input_tokens_seen": 58406985, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0078125, "step": 2698, "time_per_iteration": 2.666050910949707 }, { "auxiliary_loss_clip": 0.01160031, "auxiliary_loss_mlp": 0.01050349, "balance_loss_clip": 1.03084612, "balance_loss_mlp": 1.04994416, "epoch": 0.16227265895084925, "flos": 24605991601920.0, "grad_norm": 1.7138781211829865, "language_loss": 0.77837765, "learning_rate": 3.7459834039830726e-06, "loss": 0.80048144, "num_input_tokens_seen": 58426205, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0078125, "step": 2699, "time_per_iteration": 2.5599465370178223 }, { "auxiliary_loss_clip": 0.01171348, "auxiliary_loss_mlp": 0.01042501, "balance_loss_clip": 1.0246079, "balance_loss_mlp": 1.0481956, "epoch": 0.1623327822035172, "flos": 19573326996480.0, "grad_norm": 2.181628646595251, "language_loss": 0.86125326, "learning_rate": 3.745799095633936e-06, "loss": 0.88339174, "num_input_tokens_seen": 58443830, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 1.046875, "step": 2700, "time_per_iteration": 2.6512842178344727 }, { "auxiliary_loss_clip": 0.0115878, "auxiliary_loss_mlp": 0.0129457, "balance_loss_clip": 1.02568221, "balance_loss_mlp": 1.04737401, "epoch": 0.16239290545618518, "flos": 26468462678400.0, "grad_norm": 1.6750964239538844, "language_loss": 0.80848968, "learning_rate": 3.7456147249817203e-06, "loss": 0.83302307, "num_input_tokens_seen": 58464405, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0234375, "step": 2701, "time_per_iteration": 2.6338412761688232 }, { "auxiliary_loss_clip": 0.01173284, "auxiliary_loss_mlp": 0.01043955, "balance_loss_clip": 1.02460742, "balance_loss_mlp": 1.05154872, "epoch": 0.16245302870885314, "flos": 15851940289920.0, "grad_norm": 1.877862900261413, "language_loss": 0.729343, "learning_rate": 3.745430292033006e-06, "loss": 0.75151539, "num_input_tokens_seen": 58483295, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0390625, "step": 2702, "time_per_iteration": 2.6160147190093994 }, { "auxiliary_loss_clip": 0.01164182, "auxiliary_loss_mlp": 0.01048343, "balance_loss_clip": 1.02754116, "balance_loss_mlp": 1.05072796, "epoch": 0.1625131519615211, "flos": 14756521173120.0, "grad_norm": 1.906832171851725, "language_loss": 0.72611481, "learning_rate": 3.745245796794374e-06, "loss": 0.74824011, "num_input_tokens_seen": 58501205, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.046875, "step": 2703, "time_per_iteration": 2.5098915100097656 }, { "auxiliary_loss_clip": 0.01152697, "auxiliary_loss_mlp": 0.01045356, "balance_loss_clip": 1.02438712, "balance_loss_mlp": 1.04693282, "epoch": 0.1625732752141891, "flos": 28220508368640.0, "grad_norm": 2.071919885638983, "language_loss": 0.70307302, "learning_rate": 3.7450612392724084e-06, "loss": 0.72505355, "num_input_tokens_seen": 58522315, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.0546875, "step": 2704, "time_per_iteration": 2.664778709411621 }, { "auxiliary_loss_clip": 0.01098282, "auxiliary_loss_mlp": 0.01006734, "balance_loss_clip": 1.00377798, "balance_loss_mlp": 1.03165197, "epoch": 0.16263339846685707, "flos": 67327947688320.0, "grad_norm": 0.7742172011620371, "language_loss": 0.53316545, "learning_rate": 3.7448766194736967e-06, "loss": 0.55421567, "num_input_tokens_seen": 58586695, "router_z_loss_clip": 0.02954102, "router_z_loss_mlp": 0.3984375, "step": 2705, "time_per_iteration": 3.1632888317108154 }, { "auxiliary_loss_clip": 0.0117781, "auxiliary_loss_mlp": 0.01297218, "balance_loss_clip": 1.02920008, "balance_loss_mlp": 1.0521673, "epoch": 0.16269352171952503, "flos": 14319165173760.0, "grad_norm": 2.953863920581612, "language_loss": 0.75859487, "learning_rate": 3.7446919374048265e-06, "loss": 0.7833451, "num_input_tokens_seen": 58602435, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.078125, "step": 2706, "time_per_iteration": 2.532954454421997 }, { "auxiliary_loss_clip": 0.01183536, "auxiliary_loss_mlp": 0.01044248, "balance_loss_clip": 1.02619982, "balance_loss_mlp": 1.05149007, "epoch": 0.162753644972193, "flos": 28361205941760.0, "grad_norm": 1.9824745042420944, "language_loss": 0.72224343, "learning_rate": 3.7445071930723888e-06, "loss": 0.74452126, "num_input_tokens_seen": 58621275, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.046875, "step": 2707, "time_per_iteration": 2.658345937728882 }, { "auxiliary_loss_clip": 0.01173672, "auxiliary_loss_mlp": 0.01045484, "balance_loss_clip": 1.02688718, "balance_loss_mlp": 1.05144727, "epoch": 0.16281376822486096, "flos": 19937856170880.0, "grad_norm": 4.020391850017446, "language_loss": 0.83219838, "learning_rate": 3.7443223864829773e-06, "loss": 0.85438991, "num_input_tokens_seen": 58637550, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0390625, "step": 2708, "time_per_iteration": 2.5716826915740967 }, { "auxiliary_loss_clip": 0.01205721, "auxiliary_loss_mlp": 0.0129142, "balance_loss_clip": 1.02165937, "balance_loss_mlp": 1.05157757, "epoch": 0.16287389147752893, "flos": 21251719848960.0, "grad_norm": 2.0653733400579033, "language_loss": 0.85997635, "learning_rate": 3.7441375176431863e-06, "loss": 0.88494778, "num_input_tokens_seen": 58654135, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.09375, "step": 2709, "time_per_iteration": 2.629124402999878 }, { "auxiliary_loss_clip": 0.01173689, "auxiliary_loss_mlp": 0.01295991, "balance_loss_clip": 1.02931058, "balance_loss_mlp": 1.05003381, "epoch": 0.1629340147301969, "flos": 19244672530560.0, "grad_norm": 1.9461422298745281, "language_loss": 0.91799033, "learning_rate": 3.7439525865596137e-06, "loss": 0.94268715, "num_input_tokens_seen": 58674320, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0546875, "step": 2710, "time_per_iteration": 2.5944907665252686 }, { "auxiliary_loss_clip": 0.01183563, "auxiliary_loss_mlp": 0.01052774, "balance_loss_clip": 1.03312886, "balance_loss_mlp": 1.05345941, "epoch": 0.16299413798286488, "flos": 21249816428160.0, "grad_norm": 2.9907808495585493, "language_loss": 0.81571609, "learning_rate": 3.7437675932388596e-06, "loss": 0.83807951, "num_input_tokens_seen": 58691000, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.03125, "step": 2711, "time_per_iteration": 2.63132381439209 }, { "auxiliary_loss_clip": 0.01174925, "auxiliary_loss_mlp": 0.01040449, "balance_loss_clip": 1.02076781, "balance_loss_mlp": 1.04827535, "epoch": 0.16305426123553285, "flos": 18770579896320.0, "grad_norm": 2.476395748889553, "language_loss": 0.80714381, "learning_rate": 3.7435825376875253e-06, "loss": 0.82929748, "num_input_tokens_seen": 58710230, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0859375, "step": 2712, "time_per_iteration": 2.534418821334839 }, { "auxiliary_loss_clip": 0.01157376, "auxiliary_loss_mlp": 0.01053539, "balance_loss_clip": 1.03371406, "balance_loss_mlp": 1.05032289, "epoch": 0.16311438448820081, "flos": 22087648137600.0, "grad_norm": 1.8987737564827172, "language_loss": 0.76846063, "learning_rate": 3.743397419912215e-06, "loss": 0.79056984, "num_input_tokens_seen": 58728610, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0703125, "step": 2713, "time_per_iteration": 2.623340606689453 }, { "auxiliary_loss_clip": 0.01164346, "auxiliary_loss_mlp": 0.01050218, "balance_loss_clip": 1.03131127, "balance_loss_mlp": 1.05443513, "epoch": 0.16317450774086878, "flos": 16467700164480.0, "grad_norm": 2.205383307457891, "language_loss": 0.79056513, "learning_rate": 3.7432122399195365e-06, "loss": 0.81271076, "num_input_tokens_seen": 58744385, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0078125, "step": 2714, "time_per_iteration": 2.5364925861358643 }, { "auxiliary_loss_clip": 0.01165478, "auxiliary_loss_mlp": 0.01055405, "balance_loss_clip": 1.03729761, "balance_loss_mlp": 1.05218077, "epoch": 0.16323463099353674, "flos": 24352929308160.0, "grad_norm": 1.9605895388724326, "language_loss": 0.77612966, "learning_rate": 3.7430269977160956e-06, "loss": 0.79833853, "num_input_tokens_seen": 58763905, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0390625, "step": 2715, "time_per_iteration": 2.5505881309509277 }, { "auxiliary_loss_clip": 0.01170507, "auxiliary_loss_mlp": 0.01042978, "balance_loss_clip": 1.02539444, "balance_loss_mlp": 1.04862428, "epoch": 0.1632947542462047, "flos": 24900782520960.0, "grad_norm": 2.2222093430758854, "language_loss": 0.81301421, "learning_rate": 3.742841693308506e-06, "loss": 0.83514905, "num_input_tokens_seen": 58785580, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.03125, "step": 2716, "time_per_iteration": 2.543656349182129 }, { "auxiliary_loss_clip": 0.01178032, "auxiliary_loss_mlp": 0.01051774, "balance_loss_clip": 1.0334518, "balance_loss_mlp": 1.05544591, "epoch": 0.1633548774988727, "flos": 24900279730560.0, "grad_norm": 2.054226450119262, "language_loss": 0.86305004, "learning_rate": 3.742656326703379e-06, "loss": 0.88534808, "num_input_tokens_seen": 58806075, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.046875, "step": 2717, "time_per_iteration": 2.6381452083587646 }, { "auxiliary_loss_clip": 0.01169659, "auxiliary_loss_mlp": 0.01046797, "balance_loss_clip": 1.02893889, "balance_loss_mlp": 1.04932225, "epoch": 0.16341500075154067, "flos": 30441798357120.0, "grad_norm": 2.0222465656691533, "language_loss": 0.7658571, "learning_rate": 3.7424708979073306e-06, "loss": 0.78802168, "num_input_tokens_seen": 58827405, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 1.0234375, "step": 2718, "time_per_iteration": 4.112726211547852 }, { "auxiliary_loss_clip": 0.01171592, "auxiliary_loss_mlp": 0.01043365, "balance_loss_clip": 1.02549517, "balance_loss_mlp": 1.04755354, "epoch": 0.16347512400420863, "flos": 22784530878720.0, "grad_norm": 2.3077771480452776, "language_loss": 0.7326616, "learning_rate": 3.742285406926978e-06, "loss": 0.75481117, "num_input_tokens_seen": 58847205, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 1.0625, "step": 2719, "time_per_iteration": 5.293447732925415 }, { "auxiliary_loss_clip": 0.01182587, "auxiliary_loss_mlp": 0.0104345, "balance_loss_clip": 1.02482963, "balance_loss_mlp": 1.04986787, "epoch": 0.1635352472568766, "flos": 22633274707200.0, "grad_norm": 2.204764978598505, "language_loss": 0.71691179, "learning_rate": 3.7420998537689402e-06, "loss": 0.7391721, "num_input_tokens_seen": 58866865, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.046875, "step": 2720, "time_per_iteration": 2.566941499710083 }, { "auxiliary_loss_clip": 0.01178454, "auxiliary_loss_mlp": 0.01045943, "balance_loss_clip": 1.02609456, "balance_loss_mlp": 1.04955065, "epoch": 0.16359537050954456, "flos": 15522998515200.0, "grad_norm": 2.3383977246378294, "language_loss": 0.74794519, "learning_rate": 3.7419142384398404e-06, "loss": 0.77018917, "num_input_tokens_seen": 58885200, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.015625, "step": 2721, "time_per_iteration": 2.6146888732910156 }, { "auxiliary_loss_clip": 0.01193828, "auxiliary_loss_mlp": 0.01047473, "balance_loss_clip": 1.02744603, "balance_loss_mlp": 1.04981577, "epoch": 0.16365549376221253, "flos": 22090162089600.0, "grad_norm": 1.8931222098879503, "language_loss": 0.79055041, "learning_rate": 3.7417285609463026e-06, "loss": 0.81296349, "num_input_tokens_seen": 58906385, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.078125, "step": 2722, "time_per_iteration": 2.553190231323242 }, { "auxiliary_loss_clip": 0.01168903, "auxiliary_loss_mlp": 0.01298684, "balance_loss_clip": 1.02898073, "balance_loss_mlp": 1.05137813, "epoch": 0.1637156170148805, "flos": 24060400945920.0, "grad_norm": 2.497082852756544, "language_loss": 0.84461904, "learning_rate": 3.7415428212949524e-06, "loss": 0.86929488, "num_input_tokens_seen": 58925040, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.0859375, "step": 2723, "time_per_iteration": 3.9814674854278564 }, { "auxiliary_loss_clip": 0.01184786, "auxiliary_loss_mlp": 0.01045934, "balance_loss_clip": 1.02731311, "balance_loss_mlp": 1.04897249, "epoch": 0.1637757402675485, "flos": 26685362954880.0, "grad_norm": 1.9613922018044008, "language_loss": 0.70129216, "learning_rate": 3.7413570194924183e-06, "loss": 0.72359943, "num_input_tokens_seen": 58944790, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0, "step": 2724, "time_per_iteration": 2.6009299755096436 }, { "auxiliary_loss_clip": 0.01177672, "auxiliary_loss_mlp": 0.01045745, "balance_loss_clip": 1.02848375, "balance_loss_mlp": 1.05016398, "epoch": 0.16383586352021645, "flos": 16106941918080.0, "grad_norm": 2.0685876929039666, "language_loss": 0.70575613, "learning_rate": 3.741171155545332e-06, "loss": 0.72799033, "num_input_tokens_seen": 58962500, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 1.0078125, "step": 2725, "time_per_iteration": 2.6239092350006104 }, { "auxiliary_loss_clip": 0.01178241, "auxiliary_loss_mlp": 0.0104208, "balance_loss_clip": 1.02465129, "balance_loss_mlp": 1.05156267, "epoch": 0.16389598677288442, "flos": 19165991788800.0, "grad_norm": 2.36529475695326, "language_loss": 0.8854177, "learning_rate": 3.7409852294603255e-06, "loss": 0.90762091, "num_input_tokens_seen": 58980355, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9921875, "step": 2726, "time_per_iteration": 2.6085524559020996 }, { "auxiliary_loss_clip": 0.01165651, "auxiliary_loss_mlp": 0.01044588, "balance_loss_clip": 1.02564526, "balance_loss_mlp": 1.05291569, "epoch": 0.16395611002555238, "flos": 21507008785920.0, "grad_norm": 1.9547859911351, "language_loss": 0.74021411, "learning_rate": 3.740799241244035e-06, "loss": 0.76231652, "num_input_tokens_seen": 58999505, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0390625, "step": 2727, "time_per_iteration": 2.561433792114258 }, { "auxiliary_loss_clip": 0.01160573, "auxiliary_loss_mlp": 0.01047489, "balance_loss_clip": 1.03079975, "balance_loss_mlp": 1.05136049, "epoch": 0.16401623327822035, "flos": 21470918595840.0, "grad_norm": 1.7662863698156386, "language_loss": 0.8266722, "learning_rate": 3.7406131909030972e-06, "loss": 0.84875286, "num_input_tokens_seen": 59017930, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 1.0, "step": 2728, "time_per_iteration": 2.634092330932617 }, { "auxiliary_loss_clip": 0.01165487, "auxiliary_loss_mlp": 0.01049329, "balance_loss_clip": 1.03093493, "balance_loss_mlp": 1.05044746, "epoch": 0.1640763565308883, "flos": 13626232928640.0, "grad_norm": 2.2555860848005556, "language_loss": 0.85086542, "learning_rate": 3.740427078444152e-06, "loss": 0.87301362, "num_input_tokens_seen": 59035130, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0625, "step": 2729, "time_per_iteration": 2.5116019248962402 }, { "auxiliary_loss_clip": 0.01153961, "auxiliary_loss_mlp": 0.01042642, "balance_loss_clip": 1.02530849, "balance_loss_mlp": 1.04975522, "epoch": 0.16413647978355628, "flos": 15451464579840.0, "grad_norm": 2.206866926396273, "language_loss": 0.72800756, "learning_rate": 3.7402409038738416e-06, "loss": 0.74997354, "num_input_tokens_seen": 59053080, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.046875, "step": 2730, "time_per_iteration": 2.508037567138672 }, { "auxiliary_loss_clip": 0.01172678, "auxiliary_loss_mlp": 0.0104705, "balance_loss_clip": 1.02704692, "balance_loss_mlp": 1.04776525, "epoch": 0.16419660303622427, "flos": 45878682015360.0, "grad_norm": 1.7057236068864168, "language_loss": 0.7429074, "learning_rate": 3.7400546671988096e-06, "loss": 0.76510465, "num_input_tokens_seen": 59075610, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0625, "step": 2731, "time_per_iteration": 2.877333641052246 }, { "auxiliary_loss_clip": 0.01184274, "auxiliary_loss_mlp": 0.01044393, "balance_loss_clip": 1.02466345, "balance_loss_mlp": 1.0528084, "epoch": 0.16425672628889224, "flos": 18952826526720.0, "grad_norm": 2.4003506580903533, "language_loss": 0.79037237, "learning_rate": 3.739868368425702e-06, "loss": 0.81265903, "num_input_tokens_seen": 59094555, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.046875, "step": 2732, "time_per_iteration": 2.610322952270508 }, { "auxiliary_loss_clip": 0.01172607, "auxiliary_loss_mlp": 0.01044194, "balance_loss_clip": 1.02535903, "balance_loss_mlp": 1.04984975, "epoch": 0.1643168495415602, "flos": 24312996362880.0, "grad_norm": 1.868839655948402, "language_loss": 0.6934014, "learning_rate": 3.7396820075611682e-06, "loss": 0.71556938, "num_input_tokens_seen": 59113515, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.046875, "step": 2733, "time_per_iteration": 2.5721194744110107 }, { "auxiliary_loss_clip": 0.01171231, "auxiliary_loss_mlp": 0.01049932, "balance_loss_clip": 1.02991652, "balance_loss_mlp": 1.05004549, "epoch": 0.16437697279422817, "flos": 26428421992320.0, "grad_norm": 2.6095795567135815, "language_loss": 0.80979371, "learning_rate": 3.7394955846118585e-06, "loss": 0.83200538, "num_input_tokens_seen": 59133275, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.03125, "step": 2734, "time_per_iteration": 2.721902370452881 }, { "auxiliary_loss_clip": 0.01171103, "auxiliary_loss_mlp": 0.0104805, "balance_loss_clip": 1.02937031, "balance_loss_mlp": 1.04973304, "epoch": 0.16443709604689613, "flos": 34532239351680.0, "grad_norm": 3.650773414788533, "language_loss": 0.81965709, "learning_rate": 3.739309099584426e-06, "loss": 0.84184861, "num_input_tokens_seen": 59154095, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.03125, "step": 2735, "time_per_iteration": 2.6784534454345703 }, { "auxiliary_loss_clip": 0.0120462, "auxiliary_loss_mlp": 0.01041684, "balance_loss_clip": 1.0242312, "balance_loss_mlp": 1.05010152, "epoch": 0.1644972192995641, "flos": 23258048895360.0, "grad_norm": 2.9512483701867724, "language_loss": 0.78456676, "learning_rate": 3.7391225524855256e-06, "loss": 0.80702972, "num_input_tokens_seen": 59173795, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 1.0, "step": 2736, "time_per_iteration": 2.6912522315979004 }, { "auxiliary_loss_clip": 0.01174983, "auxiliary_loss_mlp": 0.01049975, "balance_loss_clip": 1.03198647, "balance_loss_mlp": 1.05185509, "epoch": 0.1645573425522321, "flos": 26979543342720.0, "grad_norm": 2.032024420019518, "language_loss": 0.81238735, "learning_rate": 3.738935943321815e-06, "loss": 0.83463693, "num_input_tokens_seen": 59191610, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.046875, "step": 2737, "time_per_iteration": 2.610581874847412 }, { "auxiliary_loss_clip": 0.01162661, "auxiliary_loss_mlp": 0.01037686, "balance_loss_clip": 1.02015054, "balance_loss_mlp": 1.04804158, "epoch": 0.16461746580490005, "flos": 28731768600960.0, "grad_norm": 1.9026942931499589, "language_loss": 0.86887419, "learning_rate": 3.7387492720999536e-06, "loss": 0.89087766, "num_input_tokens_seen": 59213000, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 1.0546875, "step": 2738, "time_per_iteration": 2.6627752780914307 }, { "auxiliary_loss_clip": 0.01173036, "auxiliary_loss_mlp": 0.01052399, "balance_loss_clip": 1.03433847, "balance_loss_mlp": 1.05047894, "epoch": 0.16467758905756802, "flos": 24930156867840.0, "grad_norm": 5.750880865611997, "language_loss": 0.72024143, "learning_rate": 3.7385625388266037e-06, "loss": 0.74249578, "num_input_tokens_seen": 59232340, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.046875, "step": 2739, "time_per_iteration": 2.5643420219421387 }, { "auxiliary_loss_clip": 0.01158887, "auxiliary_loss_mlp": 0.01045655, "balance_loss_clip": 1.02635479, "balance_loss_mlp": 1.0472219, "epoch": 0.16473771231023598, "flos": 24826519152000.0, "grad_norm": 1.7416942711319272, "language_loss": 0.81469917, "learning_rate": 3.7383757435084284e-06, "loss": 0.83674461, "num_input_tokens_seen": 59253950, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.03125, "step": 2740, "time_per_iteration": 2.622445821762085 }, { "auxiliary_loss_clip": 0.01184345, "auxiliary_loss_mlp": 0.01049028, "balance_loss_clip": 1.02838063, "balance_loss_mlp": 1.05099654, "epoch": 0.16479783556290395, "flos": 39896072375040.0, "grad_norm": 2.26138457796855, "language_loss": 0.6886571, "learning_rate": 3.7381888861520943e-06, "loss": 0.71099085, "num_input_tokens_seen": 59275545, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.0625, "step": 2741, "time_per_iteration": 2.738832950592041 }, { "auxiliary_loss_clip": 0.01152013, "auxiliary_loss_mlp": 0.01040553, "balance_loss_clip": 1.02260005, "balance_loss_mlp": 1.0490818, "epoch": 0.16485795881557191, "flos": 19897061299200.0, "grad_norm": 2.118880516385119, "language_loss": 0.79610974, "learning_rate": 3.73800196676427e-06, "loss": 0.81803536, "num_input_tokens_seen": 59293480, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.03125, "step": 2742, "time_per_iteration": 2.4836630821228027 }, { "auxiliary_loss_clip": 0.01168663, "auxiliary_loss_mlp": 0.01049294, "balance_loss_clip": 1.02993441, "balance_loss_mlp": 1.04952192, "epoch": 0.16491808206823988, "flos": 20556129997440.0, "grad_norm": 2.808095601118459, "language_loss": 0.84216905, "learning_rate": 3.737814985351627e-06, "loss": 0.86434859, "num_input_tokens_seen": 59313435, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0078125, "step": 2743, "time_per_iteration": 2.5742926597595215 }, { "auxiliary_loss_clip": 0.01150254, "auxiliary_loss_mlp": 0.01048214, "balance_loss_clip": 1.02970052, "balance_loss_mlp": 1.04963923, "epoch": 0.16497820532090787, "flos": 23800802376960.0, "grad_norm": 1.8830077729801613, "language_loss": 0.85748124, "learning_rate": 3.7376279419208367e-06, "loss": 0.87946594, "num_input_tokens_seen": 59331535, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0078125, "step": 2744, "time_per_iteration": 2.556920289993286 }, { "auxiliary_loss_clip": 0.01165672, "auxiliary_loss_mlp": 0.01044107, "balance_loss_clip": 1.02690542, "balance_loss_mlp": 1.04877305, "epoch": 0.16503832857357584, "flos": 25482642935040.0, "grad_norm": 1.8522376304533605, "language_loss": 0.82764459, "learning_rate": 3.7374408364785744e-06, "loss": 0.84974235, "num_input_tokens_seen": 59350680, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.98828125, "step": 2745, "time_per_iteration": 2.6153790950775146 }, { "auxiliary_loss_clip": 0.01187223, "auxiliary_loss_mlp": 0.01052299, "balance_loss_clip": 1.03390503, "balance_loss_mlp": 1.05434, "epoch": 0.1650984518262438, "flos": 17676058619520.0, "grad_norm": 2.0934286116587173, "language_loss": 0.76219571, "learning_rate": 3.7372536690315187e-06, "loss": 0.78459084, "num_input_tokens_seen": 59367020, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0625, "step": 2746, "time_per_iteration": 2.5572097301483154 }, { "auxiliary_loss_clip": 0.01152368, "auxiliary_loss_mlp": 0.010499, "balance_loss_clip": 1.03123176, "balance_loss_mlp": 1.05036509, "epoch": 0.16515857507891177, "flos": 18698327688960.0, "grad_norm": 1.7060983230652773, "language_loss": 0.80472469, "learning_rate": 3.737066439586348e-06, "loss": 0.8267473, "num_input_tokens_seen": 59386075, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.015625, "step": 2747, "time_per_iteration": 2.651937246322632 }, { "auxiliary_loss_clip": 0.01167754, "auxiliary_loss_mlp": 0.01039973, "balance_loss_clip": 1.0215075, "balance_loss_mlp": 1.05476069, "epoch": 0.16521869833157973, "flos": 15010481306880.0, "grad_norm": 2.0111187771005428, "language_loss": 0.69374359, "learning_rate": 3.7368791481497448e-06, "loss": 0.71582085, "num_input_tokens_seen": 59402690, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0390625, "step": 2748, "time_per_iteration": 2.551467180252075 }, { "auxiliary_loss_clip": 0.01156046, "auxiliary_loss_mlp": 0.01294138, "balance_loss_clip": 1.0281589, "balance_loss_mlp": 1.05318308, "epoch": 0.1652788215842477, "flos": 22121152548480.0, "grad_norm": 2.149632549735422, "language_loss": 0.87919009, "learning_rate": 3.736691794728392e-06, "loss": 0.90369189, "num_input_tokens_seen": 59421130, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 1.03125, "step": 2749, "time_per_iteration": 2.5884790420532227 }, { "auxiliary_loss_clip": 0.0116147, "auxiliary_loss_mlp": 0.01041097, "balance_loss_clip": 1.02227402, "balance_loss_mlp": 1.04884624, "epoch": 0.16533894483691566, "flos": 18333080242560.0, "grad_norm": 2.290593419698256, "language_loss": 0.78856897, "learning_rate": 3.736504379328976e-06, "loss": 0.81059462, "num_input_tokens_seen": 59438970, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0390625, "step": 2750, "time_per_iteration": 2.522608518600464 }, { "auxiliary_loss_clip": 0.01161324, "auxiliary_loss_mlp": 0.0104014, "balance_loss_clip": 1.02175784, "balance_loss_mlp": 1.05072832, "epoch": 0.16539906808958366, "flos": 22382115834240.0, "grad_norm": 3.882192198350098, "language_loss": 0.95008194, "learning_rate": 3.7363169019581865e-06, "loss": 0.97209656, "num_input_tokens_seen": 59458510, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.015625, "step": 2751, "time_per_iteration": 2.634425401687622 }, { "auxiliary_loss_clip": 0.01207524, "auxiliary_loss_mlp": 0.01046276, "balance_loss_clip": 1.02722657, "balance_loss_mlp": 1.05345523, "epoch": 0.16545919134225162, "flos": 22711093522560.0, "grad_norm": 2.4288550911592655, "language_loss": 0.70849293, "learning_rate": 3.7361293626227125e-06, "loss": 0.73103094, "num_input_tokens_seen": 59477110, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.9921875, "step": 2752, "time_per_iteration": 2.6148757934570312 }, { "auxiliary_loss_clip": 0.010779, "auxiliary_loss_mlp": 0.01011711, "balance_loss_clip": 1.00899279, "balance_loss_mlp": 1.03103018, "epoch": 0.1655193145949196, "flos": 67802974076160.0, "grad_norm": 0.7656862626894786, "language_loss": 0.54074049, "learning_rate": 3.735941761329248e-06, "loss": 0.56163657, "num_input_tokens_seen": 59541155, "router_z_loss_clip": 0.02722168, "router_z_loss_mlp": 0.37890625, "step": 2753, "time_per_iteration": 3.3244833946228027 }, { "auxiliary_loss_clip": 0.01151127, "auxiliary_loss_mlp": 0.01285644, "balance_loss_clip": 1.01902175, "balance_loss_mlp": 1.0485816, "epoch": 0.16557943784758755, "flos": 24280389792000.0, "grad_norm": 1.8661274039960178, "language_loss": 0.75234455, "learning_rate": 3.735754098084487e-06, "loss": 0.77671224, "num_input_tokens_seen": 59561155, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0234375, "step": 2754, "time_per_iteration": 2.570822238922119 }, { "auxiliary_loss_clip": 0.01180703, "auxiliary_loss_mlp": 0.0105362, "balance_loss_clip": 1.0330801, "balance_loss_mlp": 1.05373716, "epoch": 0.16563956110025552, "flos": 20083617561600.0, "grad_norm": 2.3055358035173916, "language_loss": 0.86484659, "learning_rate": 3.7355663728951265e-06, "loss": 0.88718981, "num_input_tokens_seen": 59580460, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.09375, "step": 2755, "time_per_iteration": 2.6330177783966064 }, { "auxiliary_loss_clip": 0.01178538, "auxiliary_loss_mlp": 0.01046415, "balance_loss_clip": 1.02780628, "balance_loss_mlp": 1.04949498, "epoch": 0.16569968435292348, "flos": 28034454896640.0, "grad_norm": 1.8107655299438201, "language_loss": 0.73317921, "learning_rate": 3.7353785857678675e-06, "loss": 0.75542879, "num_input_tokens_seen": 59600025, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.015625, "step": 2756, "time_per_iteration": 2.623711347579956 }, { "auxiliary_loss_clip": 0.01176137, "auxiliary_loss_mlp": 0.0104691, "balance_loss_clip": 1.02871835, "balance_loss_mlp": 1.0518961, "epoch": 0.16575980760559147, "flos": 26250233598720.0, "grad_norm": 1.5971795969194693, "language_loss": 0.74359596, "learning_rate": 3.7351907367094105e-06, "loss": 0.76582646, "num_input_tokens_seen": 59620600, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.97265625, "step": 2757, "time_per_iteration": 2.6356866359710693 }, { "auxiliary_loss_clip": 0.01169421, "auxiliary_loss_mlp": 0.01043705, "balance_loss_clip": 1.0251317, "balance_loss_mlp": 1.05030382, "epoch": 0.16581993085825944, "flos": 26943955943040.0, "grad_norm": 1.666357786853228, "language_loss": 0.84434295, "learning_rate": 3.7350028257264593e-06, "loss": 0.86647421, "num_input_tokens_seen": 59641385, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0078125, "step": 2758, "time_per_iteration": 2.601101875305176 }, { "auxiliary_loss_clip": 0.01152599, "auxiliary_loss_mlp": 0.01050768, "balance_loss_clip": 1.03342295, "balance_loss_mlp": 1.05182767, "epoch": 0.1658800541109274, "flos": 21653632103040.0, "grad_norm": 2.1150508638730203, "language_loss": 0.78800356, "learning_rate": 3.7348148528257202e-06, "loss": 0.81003726, "num_input_tokens_seen": 59659865, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.0078125, "step": 2759, "time_per_iteration": 2.576338529586792 }, { "auxiliary_loss_clip": 0.01197081, "auxiliary_loss_mlp": 0.01048479, "balance_loss_clip": 1.02989459, "balance_loss_mlp": 1.05010819, "epoch": 0.16594017736359537, "flos": 16435488643200.0, "grad_norm": 2.4171315472814814, "language_loss": 0.74819696, "learning_rate": 3.734626818013902e-06, "loss": 0.77065253, "num_input_tokens_seen": 59678780, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.015625, "step": 2760, "time_per_iteration": 6.95620322227478 }, { "auxiliary_loss_clip": 0.01191115, "auxiliary_loss_mlp": 0.01038824, "balance_loss_clip": 1.02063298, "balance_loss_mlp": 1.04982138, "epoch": 0.16600030061626334, "flos": 22637297030400.0, "grad_norm": 1.7128688258655898, "language_loss": 0.72964352, "learning_rate": 3.734438721297714e-06, "loss": 0.75194293, "num_input_tokens_seen": 59698795, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0546875, "step": 2761, "time_per_iteration": 2.6458418369293213 }, { "auxiliary_loss_clip": 0.01168084, "auxiliary_loss_mlp": 0.01045004, "balance_loss_clip": 1.02689576, "balance_loss_mlp": 1.04986477, "epoch": 0.1660604238689313, "flos": 26396569607040.0, "grad_norm": 2.1826478092489574, "language_loss": 0.88906646, "learning_rate": 3.73425056268387e-06, "loss": 0.91119736, "num_input_tokens_seen": 59718795, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0, "step": 2762, "time_per_iteration": 2.551161050796509 }, { "auxiliary_loss_clip": 0.01188172, "auxiliary_loss_mlp": 0.01050041, "balance_loss_clip": 1.03180182, "balance_loss_mlp": 1.04993737, "epoch": 0.16612054712159927, "flos": 23039999383680.0, "grad_norm": 2.070777155266907, "language_loss": 0.87933034, "learning_rate": 3.7340623421790843e-06, "loss": 0.90171254, "num_input_tokens_seen": 59737555, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.015625, "step": 2763, "time_per_iteration": 2.629939317703247 }, { "auxiliary_loss_clip": 0.01060514, "auxiliary_loss_mlp": 0.01008348, "balance_loss_clip": 1.00603509, "balance_loss_mlp": 1.02442408, "epoch": 0.16618067037426726, "flos": 59241225202560.0, "grad_norm": 0.7791699190904577, "language_loss": 0.5981729, "learning_rate": 3.733874059790074e-06, "loss": 0.61886156, "num_input_tokens_seen": 59800915, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.36132812, "step": 2764, "time_per_iteration": 4.666817665100098 }, { "auxiliary_loss_clip": 0.01162417, "auxiliary_loss_mlp": 0.01048935, "balance_loss_clip": 1.02868092, "balance_loss_mlp": 1.05076373, "epoch": 0.16624079362693522, "flos": 27198813916800.0, "grad_norm": 1.94788903640741, "language_loss": 0.82056469, "learning_rate": 3.733685715523559e-06, "loss": 0.84267825, "num_input_tokens_seen": 59822910, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.03125, "step": 2765, "time_per_iteration": 2.590243339538574 }, { "auxiliary_loss_clip": 0.01157692, "auxiliary_loss_mlp": 0.01047243, "balance_loss_clip": 1.02670312, "balance_loss_mlp": 1.049999, "epoch": 0.1663009168796032, "flos": 10925068216320.0, "grad_norm": 4.547674025036093, "language_loss": 0.69889057, "learning_rate": 3.7334973093862595e-06, "loss": 0.72093987, "num_input_tokens_seen": 59838805, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.078125, "step": 2766, "time_per_iteration": 2.5777337551116943 }, { "auxiliary_loss_clip": 0.01160517, "auxiliary_loss_mlp": 0.01040992, "balance_loss_clip": 1.02381396, "balance_loss_mlp": 1.05314255, "epoch": 0.16636104013227115, "flos": 17894431353600.0, "grad_norm": 2.2847937786850143, "language_loss": 0.89039481, "learning_rate": 3.7333088413849008e-06, "loss": 0.9124099, "num_input_tokens_seen": 59855345, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.984375, "step": 2767, "time_per_iteration": 2.6159768104553223 }, { "auxiliary_loss_clip": 0.01067829, "auxiliary_loss_mlp": 0.01001458, "balance_loss_clip": 0.99900192, "balance_loss_mlp": 1.02302623, "epoch": 0.16642116338493912, "flos": 66726050463360.0, "grad_norm": 0.6453978387969368, "language_loss": 0.52874219, "learning_rate": 3.7331203115262078e-06, "loss": 0.54943502, "num_input_tokens_seen": 59917710, "router_z_loss_clip": 0.02453613, "router_z_loss_mlp": 0.359375, "step": 2768, "time_per_iteration": 3.2874698638916016 }, { "auxiliary_loss_clip": 0.0117275, "auxiliary_loss_mlp": 0.01050258, "balance_loss_clip": 1.03098178, "balance_loss_mlp": 1.04955363, "epoch": 0.16648128663760708, "flos": 19026048401280.0, "grad_norm": 2.748310810324695, "language_loss": 0.85033506, "learning_rate": 3.7329317198169098e-06, "loss": 0.87256515, "num_input_tokens_seen": 59935105, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.046875, "step": 2769, "time_per_iteration": 2.5284411907196045 }, { "auxiliary_loss_clip": 0.01076071, "auxiliary_loss_mlp": 0.01003753, "balance_loss_clip": 1.00133312, "balance_loss_mlp": 1.021608, "epoch": 0.16654140989027508, "flos": 70134976759680.0, "grad_norm": 0.8033353062485064, "language_loss": 0.57432532, "learning_rate": 3.732743066263736e-06, "loss": 0.59512359, "num_input_tokens_seen": 59984085, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.36328125, "step": 2770, "time_per_iteration": 2.9997189044952393 }, { "auxiliary_loss_clip": 0.01083273, "auxiliary_loss_mlp": 0.0100309, "balance_loss_clip": 1.000646, "balance_loss_mlp": 1.02034402, "epoch": 0.16660153314294304, "flos": 70272406195200.0, "grad_norm": 0.8677754866054311, "language_loss": 0.56157309, "learning_rate": 3.7325543508734187e-06, "loss": 0.58243674, "num_input_tokens_seen": 60043470, "router_z_loss_clip": 0.02441406, "router_z_loss_mlp": 0.359375, "step": 2771, "time_per_iteration": 3.0313308238983154 }, { "auxiliary_loss_clip": 0.01188665, "auxiliary_loss_mlp": 0.01054151, "balance_loss_clip": 1.03512549, "balance_loss_mlp": 1.05061531, "epoch": 0.166661656395611, "flos": 23075048079360.0, "grad_norm": 2.757224177455497, "language_loss": 0.70074266, "learning_rate": 3.732365573652694e-06, "loss": 0.72317082, "num_input_tokens_seen": 60063045, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.015625, "step": 2772, "time_per_iteration": 2.588412046432495 }, { "auxiliary_loss_clip": 0.01157453, "auxiliary_loss_mlp": 0.01041174, "balance_loss_clip": 1.02255321, "balance_loss_mlp": 1.0483222, "epoch": 0.16672177964827897, "flos": 28366341586560.0, "grad_norm": 1.9832862337984267, "language_loss": 0.85994112, "learning_rate": 3.7321767346082977e-06, "loss": 0.88192743, "num_input_tokens_seen": 60081945, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0, "step": 2773, "time_per_iteration": 2.65191912651062 }, { "auxiliary_loss_clip": 0.01187903, "auxiliary_loss_mlp": 0.0104245, "balance_loss_clip": 1.02562976, "balance_loss_mlp": 1.0506314, "epoch": 0.16678190290094694, "flos": 19091010147840.0, "grad_norm": 2.477354137570482, "language_loss": 0.81684446, "learning_rate": 3.7319878337469694e-06, "loss": 0.83914798, "num_input_tokens_seen": 60096820, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 1.015625, "step": 2774, "time_per_iteration": 2.5526111125946045 }, { "auxiliary_loss_clip": 0.01179046, "auxiliary_loss_mlp": 0.01038474, "balance_loss_clip": 1.0204854, "balance_loss_mlp": 1.04839039, "epoch": 0.1668420261536149, "flos": 21799106184960.0, "grad_norm": 2.305934118510026, "language_loss": 0.83100557, "learning_rate": 3.73179887107545e-06, "loss": 0.85318077, "num_input_tokens_seen": 60116140, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.03125, "step": 2775, "time_per_iteration": 2.5955512523651123 }, { "auxiliary_loss_clip": 0.0115825, "auxiliary_loss_mlp": 0.01048083, "balance_loss_clip": 1.0309881, "balance_loss_mlp": 1.04976737, "epoch": 0.16690214940628287, "flos": 19062533640960.0, "grad_norm": 2.518215073971663, "language_loss": 0.80650795, "learning_rate": 3.731609846600485e-06, "loss": 0.82857132, "num_input_tokens_seen": 60134235, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.99609375, "step": 2776, "time_per_iteration": 2.4840738773345947 }, { "auxiliary_loss_clip": 0.01154169, "auxiliary_loss_mlp": 0.01044251, "balance_loss_clip": 1.02626276, "balance_loss_mlp": 1.04963231, "epoch": 0.16696227265895086, "flos": 18588548747520.0, "grad_norm": 1.8639649221933992, "language_loss": 0.79862362, "learning_rate": 3.731420760328818e-06, "loss": 0.82060778, "num_input_tokens_seen": 60153275, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.953125, "step": 2777, "time_per_iteration": 2.574544668197632 }, { "auxiliary_loss_clip": 0.01158561, "auxiliary_loss_mlp": 0.01040856, "balance_loss_clip": 1.02342749, "balance_loss_mlp": 1.04903269, "epoch": 0.16702239591161883, "flos": 23294139085440.0, "grad_norm": 1.8319915672337157, "language_loss": 0.85152406, "learning_rate": 3.7312316122671977e-06, "loss": 0.87351823, "num_input_tokens_seen": 60173215, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.0, "step": 2778, "time_per_iteration": 2.588070869445801 }, { "auxiliary_loss_clip": 0.01171329, "auxiliary_loss_mlp": 0.01039319, "balance_loss_clip": 1.02122307, "balance_loss_mlp": 1.05001378, "epoch": 0.1670825191642868, "flos": 24425648392320.0, "grad_norm": 3.0411920004598136, "language_loss": 0.74529099, "learning_rate": 3.731042402422375e-06, "loss": 0.76739746, "num_input_tokens_seen": 60190515, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.03125, "step": 2779, "time_per_iteration": 2.6379451751708984 }, { "auxiliary_loss_clip": 0.01186303, "auxiliary_loss_mlp": 0.01291041, "balance_loss_clip": 1.02490306, "balance_loss_mlp": 1.04934871, "epoch": 0.16714264241695476, "flos": 26797512193920.0, "grad_norm": 2.999313330158569, "language_loss": 0.64832449, "learning_rate": 3.730853130801101e-06, "loss": 0.67309797, "num_input_tokens_seen": 60211655, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.0078125, "step": 2780, "time_per_iteration": 2.6967828273773193 }, { "auxiliary_loss_clip": 0.01167547, "auxiliary_loss_mlp": 0.01044448, "balance_loss_clip": 1.0262208, "balance_loss_mlp": 1.04905546, "epoch": 0.16720276566962272, "flos": 21835304115840.0, "grad_norm": 5.329347843707926, "language_loss": 0.78117645, "learning_rate": 3.7306637974101312e-06, "loss": 0.80329645, "num_input_tokens_seen": 60230860, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0, "step": 2781, "time_per_iteration": 2.5693140029907227 }, { "auxiliary_loss_clip": 0.01189618, "auxiliary_loss_mlp": 0.01044749, "balance_loss_clip": 1.0266645, "balance_loss_mlp": 1.04983616, "epoch": 0.1672628889222907, "flos": 21470415805440.0, "grad_norm": 1.598376875020323, "language_loss": 0.7526871, "learning_rate": 3.730474402256223e-06, "loss": 0.77503073, "num_input_tokens_seen": 60250535, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.03125, "step": 2782, "time_per_iteration": 2.57450270652771 }, { "auxiliary_loss_clip": 0.01186264, "auxiliary_loss_mlp": 0.01054345, "balance_loss_clip": 1.03469968, "balance_loss_mlp": 1.05197203, "epoch": 0.16732301217495865, "flos": 30774008269440.0, "grad_norm": 2.4487353676790455, "language_loss": 0.67843217, "learning_rate": 3.7302849453461337e-06, "loss": 0.70083821, "num_input_tokens_seen": 60269530, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0703125, "step": 2783, "time_per_iteration": 2.676886558532715 }, { "auxiliary_loss_clip": 0.01149542, "auxiliary_loss_mlp": 0.01050036, "balance_loss_clip": 1.03208268, "balance_loss_mlp": 1.04881382, "epoch": 0.16738313542762664, "flos": 23474625949440.0, "grad_norm": 1.6514848314477109, "language_loss": 0.69692659, "learning_rate": 3.730095426686626e-06, "loss": 0.71892238, "num_input_tokens_seen": 60289900, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0078125, "step": 2784, "time_per_iteration": 2.566678047180176 }, { "auxiliary_loss_clip": 0.01170831, "auxiliary_loss_mlp": 0.01052179, "balance_loss_clip": 1.03240263, "balance_loss_mlp": 1.04891121, "epoch": 0.1674432586802946, "flos": 29789086366080.0, "grad_norm": 2.172695221701718, "language_loss": 0.60270816, "learning_rate": 3.729905846284463e-06, "loss": 0.62493825, "num_input_tokens_seen": 60310025, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0390625, "step": 2785, "time_per_iteration": 2.6191365718841553 }, { "auxiliary_loss_clip": 0.01070123, "auxiliary_loss_mlp": 0.01023402, "balance_loss_clip": 1.02097023, "balance_loss_mlp": 1.01614451, "epoch": 0.16750338193296258, "flos": 66136073575680.0, "grad_norm": 0.7708070293405683, "language_loss": 0.58839309, "learning_rate": 3.72971620414641e-06, "loss": 0.60932833, "num_input_tokens_seen": 60377800, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.359375, "step": 2786, "time_per_iteration": 3.2471237182617188 }, { "auxiliary_loss_clip": 0.01160979, "auxiliary_loss_mlp": 0.01042717, "balance_loss_clip": 1.024418, "balance_loss_mlp": 1.04914737, "epoch": 0.16756350518563054, "flos": 25696777864320.0, "grad_norm": 3.218395054292844, "language_loss": 0.76051545, "learning_rate": 3.729526500279235e-06, "loss": 0.78255236, "num_input_tokens_seen": 60398215, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.03125, "step": 2787, "time_per_iteration": 2.56903076171875 }, { "auxiliary_loss_clip": 0.0116109, "auxiliary_loss_mlp": 0.01041024, "balance_loss_clip": 1.02326155, "balance_loss_mlp": 1.0497185, "epoch": 0.1676236284382985, "flos": 23836102467840.0, "grad_norm": 2.0233422714066585, "language_loss": 0.76947451, "learning_rate": 3.729336734689708e-06, "loss": 0.79149568, "num_input_tokens_seen": 60416910, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.0234375, "step": 2788, "time_per_iteration": 2.6943905353546143 }, { "auxiliary_loss_clip": 0.01072177, "auxiliary_loss_mlp": 0.01011089, "balance_loss_clip": 1.00878799, "balance_loss_mlp": 1.01762223, "epoch": 0.16768375169096647, "flos": 59874902985600.0, "grad_norm": 0.8505913037118029, "language_loss": 0.59344608, "learning_rate": 3.7291469073846017e-06, "loss": 0.61427873, "num_input_tokens_seen": 60468660, "router_z_loss_clip": 0.02294922, "router_z_loss_mlp": 0.36328125, "step": 2789, "time_per_iteration": 2.9755613803863525 }, { "auxiliary_loss_clip": 0.01160717, "auxiliary_loss_mlp": 0.01045745, "balance_loss_clip": 1.0265646, "balance_loss_mlp": 1.04812443, "epoch": 0.16774387494363446, "flos": 38435657207040.0, "grad_norm": 2.072199701410554, "language_loss": 0.70131135, "learning_rate": 3.72895701837069e-06, "loss": 0.72337598, "num_input_tokens_seen": 60492370, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0390625, "step": 2790, "time_per_iteration": 2.7735953330993652 }, { "auxiliary_loss_clip": 0.0117865, "auxiliary_loss_mlp": 0.01048603, "balance_loss_clip": 1.03107905, "balance_loss_mlp": 1.04946101, "epoch": 0.16780399819630243, "flos": 22637620252800.0, "grad_norm": 12.522920692382057, "language_loss": 0.79180634, "learning_rate": 3.7287670676547495e-06, "loss": 0.81407887, "num_input_tokens_seen": 60512655, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 1.0234375, "step": 2791, "time_per_iteration": 2.558109760284424 }, { "auxiliary_loss_clip": 0.01171732, "auxiliary_loss_mlp": 0.01043434, "balance_loss_clip": 1.02470601, "balance_loss_mlp": 1.04964852, "epoch": 0.1678641214489704, "flos": 32891516887680.0, "grad_norm": 3.480459455564356, "language_loss": 0.71394348, "learning_rate": 3.7285770552435593e-06, "loss": 0.73609513, "num_input_tokens_seen": 60533090, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0390625, "step": 2792, "time_per_iteration": 2.6592283248901367 }, { "auxiliary_loss_clip": 0.01178745, "auxiliary_loss_mlp": 0.01039292, "balance_loss_clip": 1.02111268, "balance_loss_mlp": 1.04869199, "epoch": 0.16792424470163836, "flos": 19974916028160.0, "grad_norm": 2.461440008100324, "language_loss": 0.71311414, "learning_rate": 3.7283869811439006e-06, "loss": 0.73529446, "num_input_tokens_seen": 60553190, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.03125, "step": 2793, "time_per_iteration": 2.5723817348480225 }, { "auxiliary_loss_clip": 0.01159901, "auxiliary_loss_mlp": 0.01046914, "balance_loss_clip": 1.02855611, "balance_loss_mlp": 1.04851079, "epoch": 0.16798436795430632, "flos": 19719878486400.0, "grad_norm": 1.8742596965820095, "language_loss": 0.76831436, "learning_rate": 3.728196845362557e-06, "loss": 0.79038244, "num_input_tokens_seen": 60571995, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0234375, "step": 2794, "time_per_iteration": 2.5306546688079834 }, { "auxiliary_loss_clip": 0.01172097, "auxiliary_loss_mlp": 0.01049169, "balance_loss_clip": 1.03072703, "balance_loss_mlp": 1.05117035, "epoch": 0.1680444912069743, "flos": 28104839596800.0, "grad_norm": 2.3457248624149107, "language_loss": 0.7154457, "learning_rate": 3.7280066479063128e-06, "loss": 0.73765838, "num_input_tokens_seen": 60591275, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0234375, "step": 2795, "time_per_iteration": 2.5762274265289307 }, { "auxiliary_loss_clip": 0.0116748, "auxiliary_loss_mlp": 0.01037954, "balance_loss_clip": 1.01970279, "balance_loss_mlp": 1.04752803, "epoch": 0.16810461445964225, "flos": 18075205526400.0, "grad_norm": 1.9035357294688156, "language_loss": 0.83819306, "learning_rate": 3.7278163887819565e-06, "loss": 0.86024737, "num_input_tokens_seen": 60609235, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.0234375, "step": 2796, "time_per_iteration": 2.5942718982696533 }, { "auxiliary_loss_clip": 0.01195938, "auxiliary_loss_mlp": 0.01046312, "balance_loss_clip": 1.02705932, "balance_loss_mlp": 1.047544, "epoch": 0.16816473771231025, "flos": 23878657105920.0, "grad_norm": 1.7721020391296143, "language_loss": 0.81210077, "learning_rate": 3.727626067996277e-06, "loss": 0.8345232, "num_input_tokens_seen": 60629880, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.03125, "step": 2797, "time_per_iteration": 2.7012529373168945 }, { "auxiliary_loss_clip": 0.01154817, "auxiliary_loss_mlp": 0.01044967, "balance_loss_clip": 1.02920735, "balance_loss_mlp": 1.04899502, "epoch": 0.1682248609649782, "flos": 22783597125120.0, "grad_norm": 2.3402074317574852, "language_loss": 0.74788064, "learning_rate": 3.727435685556068e-06, "loss": 0.76987851, "num_input_tokens_seen": 60651175, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.96875, "step": 2798, "time_per_iteration": 2.644543170928955 }, { "auxiliary_loss_clip": 0.01152206, "auxiliary_loss_mlp": 0.01046809, "balance_loss_clip": 1.02986979, "balance_loss_mlp": 1.05100286, "epoch": 0.16828498421764618, "flos": 20705123612160.0, "grad_norm": 1.8122026382446452, "language_loss": 0.79615486, "learning_rate": 3.7272452414681227e-06, "loss": 0.81814504, "num_input_tokens_seen": 60670210, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 1.0078125, "step": 2799, "time_per_iteration": 2.5492334365844727 }, { "auxiliary_loss_clip": 0.01206191, "auxiliary_loss_mlp": 0.01037907, "balance_loss_clip": 1.01951349, "balance_loss_mlp": 1.04784739, "epoch": 0.16834510747031414, "flos": 29420606695680.0, "grad_norm": 2.7336267739156694, "language_loss": 0.70435494, "learning_rate": 3.7270547357392375e-06, "loss": 0.72679591, "num_input_tokens_seen": 60690895, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0390625, "step": 2800, "time_per_iteration": 2.6992647647857666 }, { "auxiliary_loss_clip": 0.01175569, "auxiliary_loss_mlp": 0.01290761, "balance_loss_clip": 1.02311265, "balance_loss_mlp": 1.04733157, "epoch": 0.1684052307229821, "flos": 18145374744960.0, "grad_norm": 1.8920316783256332, "language_loss": 0.83765322, "learning_rate": 3.7268641683762113e-06, "loss": 0.86231649, "num_input_tokens_seen": 60708280, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0078125, "step": 2801, "time_per_iteration": 3.9363045692443848 }, { "auxiliary_loss_clip": 0.01167835, "auxiliary_loss_mlp": 0.01048825, "balance_loss_clip": 1.03128886, "balance_loss_mlp": 1.04780984, "epoch": 0.16846535397565007, "flos": 16574929240320.0, "grad_norm": 2.714958437002314, "language_loss": 0.82315439, "learning_rate": 3.7266735393858456e-06, "loss": 0.845321, "num_input_tokens_seen": 60724150, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.015625, "step": 2802, "time_per_iteration": 5.405985355377197 }, { "auxiliary_loss_clip": 0.01161459, "auxiliary_loss_mlp": 0.01045122, "balance_loss_clip": 1.02634704, "balance_loss_mlp": 1.04887962, "epoch": 0.16852547722831807, "flos": 30408868563840.0, "grad_norm": 1.7366028320382998, "language_loss": 0.81079799, "learning_rate": 3.7264828487749422e-06, "loss": 0.83286381, "num_input_tokens_seen": 60746485, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.03125, "step": 2803, "time_per_iteration": 2.6094863414764404 }, { "auxiliary_loss_clip": 0.01158207, "auxiliary_loss_mlp": 0.01044382, "balance_loss_clip": 1.02697754, "balance_loss_mlp": 1.04962635, "epoch": 0.16858560048098603, "flos": 33507420416640.0, "grad_norm": 5.656845206692518, "language_loss": 0.76066351, "learning_rate": 3.726292096550307e-06, "loss": 0.78268945, "num_input_tokens_seen": 60762875, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9921875, "step": 2804, "time_per_iteration": 2.6553523540496826 }, { "auxiliary_loss_clip": 0.01086332, "auxiliary_loss_mlp": 0.01002122, "balance_loss_clip": 0.99969006, "balance_loss_mlp": 1.02239132, "epoch": 0.168645723733654, "flos": 67370502326400.0, "grad_norm": 0.8465577294565306, "language_loss": 0.5544337, "learning_rate": 3.7261012827187477e-06, "loss": 0.57531822, "num_input_tokens_seen": 60825510, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.3671875, "step": 2805, "time_per_iteration": 3.119555950164795 }, { "auxiliary_loss_clip": 0.01154264, "auxiliary_loss_mlp": 0.01040278, "balance_loss_clip": 1.023458, "balance_loss_mlp": 1.04624891, "epoch": 0.16870584698632196, "flos": 21324618501120.0, "grad_norm": 1.9517459269304982, "language_loss": 0.72386289, "learning_rate": 3.725910407287074e-06, "loss": 0.74580836, "num_input_tokens_seen": 60844440, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.98828125, "step": 2806, "time_per_iteration": 4.113410234451294 }, { "auxiliary_loss_clip": 0.01174912, "auxiliary_loss_mlp": 0.01044441, "balance_loss_clip": 1.02720308, "balance_loss_mlp": 1.04858506, "epoch": 0.16876597023898993, "flos": 20740746925440.0, "grad_norm": 1.78586815693926, "language_loss": 0.69636202, "learning_rate": 3.7257194702620964e-06, "loss": 0.71855557, "num_input_tokens_seen": 60863210, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9921875, "step": 2807, "time_per_iteration": 2.6033225059509277 }, { "auxiliary_loss_clip": 0.01168353, "auxiliary_loss_mlp": 0.01052966, "balance_loss_clip": 1.03429759, "balance_loss_mlp": 1.04784763, "epoch": 0.1688260934916579, "flos": 20303498666880.0, "grad_norm": 2.4823498418035976, "language_loss": 0.69520009, "learning_rate": 3.725528471650631e-06, "loss": 0.71741325, "num_input_tokens_seen": 60882510, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0234375, "step": 2808, "time_per_iteration": 2.6336190700531006 }, { "auxiliary_loss_clip": 0.01178278, "auxiliary_loss_mlp": 0.0104429, "balance_loss_clip": 1.02550244, "balance_loss_mlp": 1.04721522, "epoch": 0.16888621674432586, "flos": 20340702178560.0, "grad_norm": 2.29547811243469, "language_loss": 0.80333436, "learning_rate": 3.7253374114594925e-06, "loss": 0.82556009, "num_input_tokens_seen": 60901105, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0390625, "step": 2809, "time_per_iteration": 2.5435516834259033 }, { "auxiliary_loss_clip": 0.01163581, "auxiliary_loss_mlp": 0.01049003, "balance_loss_clip": 1.03103805, "balance_loss_mlp": 1.04861784, "epoch": 0.16894633999699385, "flos": 16244802316800.0, "grad_norm": 2.0519719998990977, "language_loss": 0.88189459, "learning_rate": 3.7251462896955e-06, "loss": 0.90402043, "num_input_tokens_seen": 60915340, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0625, "step": 2810, "time_per_iteration": 2.5317599773406982 }, { "auxiliary_loss_clip": 0.01162143, "auxiliary_loss_mlp": 0.01052672, "balance_loss_clip": 1.03461218, "balance_loss_mlp": 1.04959369, "epoch": 0.16900646324966181, "flos": 19610171372160.0, "grad_norm": 2.0228193479989414, "language_loss": 0.9225992, "learning_rate": 3.724955106365474e-06, "loss": 0.94474739, "num_input_tokens_seen": 60933735, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.03125, "step": 2811, "time_per_iteration": 2.602912425994873 }, { "auxiliary_loss_clip": 0.01162898, "auxiliary_loss_mlp": 0.01047902, "balance_loss_clip": 1.03024685, "balance_loss_mlp": 1.05053699, "epoch": 0.16906658650232978, "flos": 22018089450240.0, "grad_norm": 1.9735843299288047, "language_loss": 0.78722537, "learning_rate": 3.724763861476237e-06, "loss": 0.80933332, "num_input_tokens_seen": 60953105, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 1.03125, "step": 2812, "time_per_iteration": 2.5763559341430664 }, { "auxiliary_loss_clip": 0.01159508, "auxiliary_loss_mlp": 0.01055141, "balance_loss_clip": 1.03823686, "balance_loss_mlp": 1.05061173, "epoch": 0.16912670975499774, "flos": 11763690024960.0, "grad_norm": 2.9949118001517707, "language_loss": 0.74845535, "learning_rate": 3.724572555034615e-06, "loss": 0.77060187, "num_input_tokens_seen": 60969150, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 1.0, "step": 2813, "time_per_iteration": 2.4842889308929443 }, { "auxiliary_loss_clip": 0.01172124, "auxiliary_loss_mlp": 0.01048267, "balance_loss_clip": 1.03001595, "balance_loss_mlp": 1.04849052, "epoch": 0.1691868330076657, "flos": 17161386595200.0, "grad_norm": 2.548389272352349, "language_loss": 0.68509078, "learning_rate": 3.7243811870474346e-06, "loss": 0.7072947, "num_input_tokens_seen": 60982825, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.0546875, "step": 2814, "time_per_iteration": 2.5201423168182373 }, { "auxiliary_loss_clip": 0.01153546, "auxiliary_loss_mlp": 0.01046917, "balance_loss_clip": 1.02847517, "balance_loss_mlp": 1.05086279, "epoch": 0.16924695626033368, "flos": 22416553998720.0, "grad_norm": 2.0557569634634296, "language_loss": 0.61532891, "learning_rate": 3.724189757521525e-06, "loss": 0.63733351, "num_input_tokens_seen": 61000875, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0234375, "step": 2815, "time_per_iteration": 2.516526937484741 }, { "auxiliary_loss_clip": 0.01168136, "auxiliary_loss_mlp": 0.0104262, "balance_loss_clip": 1.02508402, "balance_loss_mlp": 1.04915524, "epoch": 0.16930707951300164, "flos": 25739655724800.0, "grad_norm": 1.7119112360987643, "language_loss": 0.82371628, "learning_rate": 3.7239982664637185e-06, "loss": 0.84582388, "num_input_tokens_seen": 61021940, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.0078125, "step": 2816, "time_per_iteration": 2.5678513050079346 }, { "auxiliary_loss_clip": 0.01172828, "auxiliary_loss_mlp": 0.01050658, "balance_loss_clip": 1.03222823, "balance_loss_mlp": 1.05081761, "epoch": 0.16936720276566963, "flos": 22747040058240.0, "grad_norm": 4.517260202957396, "language_loss": 0.86868709, "learning_rate": 3.7238067138808477e-06, "loss": 0.89092195, "num_input_tokens_seen": 61040285, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0390625, "step": 2817, "time_per_iteration": 2.5465667247772217 }, { "auxiliary_loss_clip": 0.0117017, "auxiliary_loss_mlp": 0.01052845, "balance_loss_clip": 1.03428459, "balance_loss_mlp": 1.05164146, "epoch": 0.1694273260183376, "flos": 19573973441280.0, "grad_norm": 1.9249584275692684, "language_loss": 0.81162572, "learning_rate": 3.72361509977975e-06, "loss": 0.83385587, "num_input_tokens_seen": 61059020, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0, "step": 2818, "time_per_iteration": 2.6234939098358154 }, { "auxiliary_loss_clip": 0.01150915, "auxiliary_loss_mlp": 0.0104723, "balance_loss_clip": 1.02903891, "balance_loss_mlp": 1.04892743, "epoch": 0.16948744927100556, "flos": 12457843332480.0, "grad_norm": 2.4535636448297162, "language_loss": 0.80722415, "learning_rate": 3.7234234241672632e-06, "loss": 0.82920563, "num_input_tokens_seen": 61074245, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.015625, "step": 2819, "time_per_iteration": 2.542949914932251 }, { "auxiliary_loss_clip": 0.01076832, "auxiliary_loss_mlp": 0.01012128, "balance_loss_clip": 1.0098033, "balance_loss_mlp": 1.02317905, "epoch": 0.16954757252367353, "flos": 71291694435840.0, "grad_norm": 0.9308869325193673, "language_loss": 0.61088192, "learning_rate": 3.7232316870502274e-06, "loss": 0.6317715, "num_input_tokens_seen": 61127080, "router_z_loss_clip": 0.02319336, "router_z_loss_mlp": 0.35351562, "step": 2820, "time_per_iteration": 3.0731875896453857 }, { "auxiliary_loss_clip": 0.01178667, "auxiliary_loss_mlp": 0.01047902, "balance_loss_clip": 1.0301156, "balance_loss_mlp": 1.04901409, "epoch": 0.1696076957763415, "flos": 29606516513280.0, "grad_norm": 2.2683209432842295, "language_loss": 0.78559339, "learning_rate": 3.723039888435485e-06, "loss": 0.80785906, "num_input_tokens_seen": 61146955, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.0234375, "step": 2821, "time_per_iteration": 2.605527877807617 }, { "auxiliary_loss_clip": 0.01161533, "auxiliary_loss_mlp": 0.01051753, "balance_loss_clip": 1.03254867, "balance_loss_mlp": 1.05073822, "epoch": 0.16966781902900946, "flos": 24388588535040.0, "grad_norm": 2.119836305384795, "language_loss": 0.78519833, "learning_rate": 3.722848028329882e-06, "loss": 0.80733114, "num_input_tokens_seen": 61166605, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.015625, "step": 2822, "time_per_iteration": 2.5907082557678223 }, { "auxiliary_loss_clip": 0.01158375, "auxiliary_loss_mlp": 0.0129224, "balance_loss_clip": 1.02625144, "balance_loss_mlp": 1.05013347, "epoch": 0.16972794228167745, "flos": 23038814234880.0, "grad_norm": 1.7339674844261561, "language_loss": 0.7464242, "learning_rate": 3.7226561067402638e-06, "loss": 0.77093041, "num_input_tokens_seen": 61186535, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9921875, "step": 2823, "time_per_iteration": 2.559610366821289 }, { "auxiliary_loss_clip": 0.01171631, "auxiliary_loss_mlp": 0.01053218, "balance_loss_clip": 1.0342164, "balance_loss_mlp": 1.05193663, "epoch": 0.16978806553434542, "flos": 35228691129600.0, "grad_norm": 1.8679602846760965, "language_loss": 0.59997201, "learning_rate": 3.7224641236734805e-06, "loss": 0.62222058, "num_input_tokens_seen": 61208965, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.015625, "step": 2824, "time_per_iteration": 2.733454465866089 }, { "auxiliary_loss_clip": 0.01172201, "auxiliary_loss_mlp": 0.01042131, "balance_loss_clip": 1.02286708, "balance_loss_mlp": 1.05298829, "epoch": 0.16984818878701338, "flos": 32014290936960.0, "grad_norm": 2.691765392642672, "language_loss": 0.73033756, "learning_rate": 3.7222720791363837e-06, "loss": 0.75248086, "num_input_tokens_seen": 61230670, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0078125, "step": 2825, "time_per_iteration": 2.649901866912842 }, { "auxiliary_loss_clip": 0.01158569, "auxiliary_loss_mlp": 0.01050193, "balance_loss_clip": 1.02964103, "balance_loss_mlp": 1.05186749, "epoch": 0.16990831203968135, "flos": 22818609907200.0, "grad_norm": 2.2570757499852596, "language_loss": 0.85731721, "learning_rate": 3.7220799731358264e-06, "loss": 0.87940478, "num_input_tokens_seen": 61249510, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0625, "step": 2826, "time_per_iteration": 2.576293706893921 }, { "auxiliary_loss_clip": 0.01167866, "auxiliary_loss_mlp": 0.0104948, "balance_loss_clip": 1.03095484, "balance_loss_mlp": 1.05293369, "epoch": 0.1699684352923493, "flos": 23039604334080.0, "grad_norm": 1.7090873806151323, "language_loss": 0.82151771, "learning_rate": 3.721887805678665e-06, "loss": 0.84369123, "num_input_tokens_seen": 61269440, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0625, "step": 2827, "time_per_iteration": 2.6041197776794434 }, { "auxiliary_loss_clip": 0.01176084, "auxiliary_loss_mlp": 0.01041798, "balance_loss_clip": 1.02183044, "balance_loss_mlp": 1.05256248, "epoch": 0.17002855854501728, "flos": 21434110133760.0, "grad_norm": 2.00493194216747, "language_loss": 0.73776817, "learning_rate": 3.7216955767717558e-06, "loss": 0.759947, "num_input_tokens_seen": 61288195, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0546875, "step": 2828, "time_per_iteration": 2.590461015701294 }, { "auxiliary_loss_clip": 0.01088534, "auxiliary_loss_mlp": 0.01003515, "balance_loss_clip": 1.00105953, "balance_loss_mlp": 1.0257597, "epoch": 0.17008868179768524, "flos": 71453509205760.0, "grad_norm": 0.763589893931699, "language_loss": 0.56461316, "learning_rate": 3.721503286421961e-06, "loss": 0.58553362, "num_input_tokens_seen": 61350850, "router_z_loss_clip": 0.02453613, "router_z_loss_mlp": 0.35742188, "step": 2829, "time_per_iteration": 3.2458629608154297 }, { "auxiliary_loss_clip": 0.01178542, "auxiliary_loss_mlp": 0.01048924, "balance_loss_clip": 1.0305177, "balance_loss_mlp": 1.05001044, "epoch": 0.17014880505035324, "flos": 24900315644160.0, "grad_norm": 1.7991827845821486, "language_loss": 0.82997084, "learning_rate": 3.7213109346361424e-06, "loss": 0.85224551, "num_input_tokens_seen": 61370765, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0078125, "step": 2830, "time_per_iteration": 2.6783061027526855 }, { "auxiliary_loss_clip": 0.01181027, "auxiliary_loss_mlp": 0.01043449, "balance_loss_clip": 1.0243876, "balance_loss_mlp": 1.05111337, "epoch": 0.1702089283030212, "flos": 29862415981440.0, "grad_norm": 11.536171273397438, "language_loss": 0.78721535, "learning_rate": 3.721118521421164e-06, "loss": 0.80946016, "num_input_tokens_seen": 61388935, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.03125, "step": 2831, "time_per_iteration": 2.7395002841949463 }, { "auxiliary_loss_clip": 0.01182137, "auxiliary_loss_mlp": 0.0104913, "balance_loss_clip": 1.02977085, "balance_loss_mlp": 1.05139458, "epoch": 0.17026905155568917, "flos": 17744180762880.0, "grad_norm": 2.185760270622884, "language_loss": 0.7920475, "learning_rate": 3.7209260467838926e-06, "loss": 0.81436014, "num_input_tokens_seen": 61407350, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.03125, "step": 2832, "time_per_iteration": 2.5698330402374268 }, { "auxiliary_loss_clip": 0.01174862, "auxiliary_loss_mlp": 0.01047678, "balance_loss_clip": 1.02837813, "balance_loss_mlp": 1.05261564, "epoch": 0.17032917480835713, "flos": 23148665003520.0, "grad_norm": 1.9785260948833063, "language_loss": 0.88511169, "learning_rate": 3.720733510731198e-06, "loss": 0.90733707, "num_input_tokens_seen": 61429010, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.046875, "step": 2833, "time_per_iteration": 2.6299729347229004 }, { "auxiliary_loss_clip": 0.01154371, "auxiliary_loss_mlp": 0.01048197, "balance_loss_clip": 1.03014874, "balance_loss_mlp": 1.05181646, "epoch": 0.1703892980610251, "flos": 39202565512320.0, "grad_norm": 2.116535181165047, "language_loss": 0.72338331, "learning_rate": 3.72054091326995e-06, "loss": 0.74540901, "num_input_tokens_seen": 61450040, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0234375, "step": 2834, "time_per_iteration": 2.6459999084472656 }, { "auxiliary_loss_clip": 0.01185501, "auxiliary_loss_mlp": 0.01054724, "balance_loss_clip": 1.03708076, "balance_loss_mlp": 1.05497479, "epoch": 0.17044942131369306, "flos": 23039101543680.0, "grad_norm": 2.2968439600035073, "language_loss": 0.86252159, "learning_rate": 3.7203482544070227e-06, "loss": 0.88492393, "num_input_tokens_seen": 61468585, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 1.03125, "step": 2835, "time_per_iteration": 2.6759657859802246 }, { "auxiliary_loss_clip": 0.01166205, "auxiliary_loss_mlp": 0.01047993, "balance_loss_clip": 1.02710748, "balance_loss_mlp": 1.05104876, "epoch": 0.17050954456636103, "flos": 17054983532160.0, "grad_norm": 2.448183732021956, "language_loss": 0.73646784, "learning_rate": 3.720155534149292e-06, "loss": 0.75860983, "num_input_tokens_seen": 61486330, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.0625, "step": 2836, "time_per_iteration": 2.4909892082214355 }, { "auxiliary_loss_clip": 0.01195266, "auxiliary_loss_mlp": 0.01045168, "balance_loss_clip": 1.02466381, "balance_loss_mlp": 1.05164123, "epoch": 0.17056966781902902, "flos": 16836969934080.0, "grad_norm": 2.1607742239969587, "language_loss": 0.80312318, "learning_rate": 3.7199627525036343e-06, "loss": 0.82552749, "num_input_tokens_seen": 61503950, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.078125, "step": 2837, "time_per_iteration": 2.578704357147217 }, { "auxiliary_loss_clip": 0.01160955, "auxiliary_loss_mlp": 0.01045305, "balance_loss_clip": 1.0268631, "balance_loss_mlp": 1.05285525, "epoch": 0.17062979107169698, "flos": 17712543859200.0, "grad_norm": 1.9533134091130246, "language_loss": 0.82952207, "learning_rate": 3.7197699094769303e-06, "loss": 0.85158467, "num_input_tokens_seen": 61523550, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.9921875, "step": 2838, "time_per_iteration": 2.5008857250213623 }, { "auxiliary_loss_clip": 0.01186134, "auxiliary_loss_mlp": 0.01046947, "balance_loss_clip": 1.02944756, "balance_loss_mlp": 1.05047739, "epoch": 0.17068991432436495, "flos": 22525040050560.0, "grad_norm": 1.8235875193388529, "language_loss": 0.9298842, "learning_rate": 3.719577005076062e-06, "loss": 0.95221496, "num_input_tokens_seen": 61542720, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9921875, "step": 2839, "time_per_iteration": 2.6124801635742188 }, { "auxiliary_loss_clip": 0.0117116, "auxiliary_loss_mlp": 0.01045838, "balance_loss_clip": 1.026371, "balance_loss_mlp": 1.05008829, "epoch": 0.17075003757703291, "flos": 25882939077120.0, "grad_norm": 3.3096194913814605, "language_loss": 0.83455729, "learning_rate": 3.719384039307914e-06, "loss": 0.85672724, "num_input_tokens_seen": 61563040, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.03125, "step": 2840, "time_per_iteration": 2.6694674491882324 }, { "auxiliary_loss_clip": 0.0116333, "auxiliary_loss_mlp": 0.01044463, "balance_loss_clip": 1.02504349, "balance_loss_mlp": 1.04972053, "epoch": 0.17081016082970088, "flos": 20120713332480.0, "grad_norm": 1.7830090374218133, "language_loss": 0.75573266, "learning_rate": 3.7191910121793723e-06, "loss": 0.77781063, "num_input_tokens_seen": 61581890, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.046875, "step": 2841, "time_per_iteration": 2.6745667457580566 }, { "auxiliary_loss_clip": 0.0117059, "auxiliary_loss_mlp": 0.01048086, "balance_loss_clip": 1.02897692, "balance_loss_mlp": 1.04976344, "epoch": 0.17087028408236885, "flos": 24936477661440.0, "grad_norm": 1.7261715517640632, "language_loss": 0.76343596, "learning_rate": 3.718997923697326e-06, "loss": 0.78562272, "num_input_tokens_seen": 61602095, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.03125, "step": 2842, "time_per_iteration": 4.02075719833374 }, { "auxiliary_loss_clip": 0.0115053, "auxiliary_loss_mlp": 0.01046508, "balance_loss_clip": 1.02822113, "balance_loss_mlp": 1.05084944, "epoch": 0.17093040733503684, "flos": 19057864872960.0, "grad_norm": 1.8843328522772715, "language_loss": 0.85352588, "learning_rate": 3.7188047738686655e-06, "loss": 0.87549627, "num_input_tokens_seen": 61620400, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.0, "step": 2843, "time_per_iteration": 2.521742105484009 }, { "auxiliary_loss_clip": 0.01150641, "auxiliary_loss_mlp": 0.01045462, "balance_loss_clip": 1.02815247, "balance_loss_mlp": 1.05171955, "epoch": 0.1709905305877048, "flos": 13078954333440.0, "grad_norm": 2.0230444320571443, "language_loss": 0.68420309, "learning_rate": 3.7186115627002837e-06, "loss": 0.70616406, "num_input_tokens_seen": 61637680, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9921875, "step": 2844, "time_per_iteration": 5.357793569564819 }, { "auxiliary_loss_clip": 0.01172483, "auxiliary_loss_mlp": 0.0129553, "balance_loss_clip": 1.02882576, "balance_loss_mlp": 1.05252528, "epoch": 0.17105065384037277, "flos": 19209336526080.0, "grad_norm": 1.7673790467402264, "language_loss": 0.78524214, "learning_rate": 3.718418290199076e-06, "loss": 0.80992228, "num_input_tokens_seen": 61655630, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0234375, "step": 2845, "time_per_iteration": 2.631783962249756 }, { "auxiliary_loss_clip": 0.01162656, "auxiliary_loss_mlp": 0.01043368, "balance_loss_clip": 1.02612996, "balance_loss_mlp": 1.05197024, "epoch": 0.17111077709304073, "flos": 18515183218560.0, "grad_norm": 2.3407754450095264, "language_loss": 0.77905273, "learning_rate": 3.71822495637194e-06, "loss": 0.80111301, "num_input_tokens_seen": 61673475, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 1.015625, "step": 2846, "time_per_iteration": 2.4990103244781494 }, { "auxiliary_loss_clip": 0.01162612, "auxiliary_loss_mlp": 0.01042994, "balance_loss_clip": 1.02550578, "balance_loss_mlp": 1.05239463, "epoch": 0.1711709003457087, "flos": 25082670015360.0, "grad_norm": 1.9593626071440775, "language_loss": 0.79580832, "learning_rate": 3.7180315612257748e-06, "loss": 0.8178643, "num_input_tokens_seen": 61693370, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 1.015625, "step": 2847, "time_per_iteration": 4.021216630935669 }, { "auxiliary_loss_clip": 0.01161402, "auxiliary_loss_mlp": 0.01047312, "balance_loss_clip": 1.02829814, "balance_loss_mlp": 1.04850864, "epoch": 0.17123102359837666, "flos": 17566387418880.0, "grad_norm": 2.2489686086788603, "language_loss": 0.87141049, "learning_rate": 3.7178381047674825e-06, "loss": 0.89349771, "num_input_tokens_seen": 61710820, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0390625, "step": 2848, "time_per_iteration": 2.5305166244506836 }, { "auxiliary_loss_clip": 0.01183656, "auxiliary_loss_mlp": 0.01046627, "balance_loss_clip": 1.02863824, "balance_loss_mlp": 1.05450177, "epoch": 0.17129114685104463, "flos": 26173635845760.0, "grad_norm": 2.125739452055343, "language_loss": 0.75187564, "learning_rate": 3.717644587003967e-06, "loss": 0.7741785, "num_input_tokens_seen": 61729855, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0234375, "step": 2849, "time_per_iteration": 2.594006061553955 }, { "auxiliary_loss_clip": 0.01089522, "auxiliary_loss_mlp": 0.01028836, "balance_loss_clip": 1.02669048, "balance_loss_mlp": 1.02688289, "epoch": 0.17135127010371262, "flos": 69269710037760.0, "grad_norm": 0.7948085734747199, "language_loss": 0.57495892, "learning_rate": 3.7174510079421347e-06, "loss": 0.59614253, "num_input_tokens_seen": 61790290, "router_z_loss_clip": 0.02148438, "router_z_loss_mlp": 0.3515625, "step": 2850, "time_per_iteration": 3.1598575115203857 }, { "auxiliary_loss_clip": 0.01175846, "auxiliary_loss_mlp": 0.01043112, "balance_loss_clip": 1.02520692, "balance_loss_mlp": 1.05009294, "epoch": 0.1714113933563806, "flos": 23550110380800.0, "grad_norm": 2.5753566912843335, "language_loss": 0.80846167, "learning_rate": 3.7172573675888937e-06, "loss": 0.83065116, "num_input_tokens_seen": 61809265, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.98828125, "step": 2851, "time_per_iteration": 2.6094932556152344 }, { "auxiliary_loss_clip": 0.01179123, "auxiliary_loss_mlp": 0.01037641, "balance_loss_clip": 1.02095199, "balance_loss_mlp": 1.05237412, "epoch": 0.17147151660904855, "flos": 21142443697920.0, "grad_norm": 2.5349647144234955, "language_loss": 0.93129468, "learning_rate": 3.717063665951155e-06, "loss": 0.95346236, "num_input_tokens_seen": 61828980, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9921875, "step": 2852, "time_per_iteration": 2.6209304332733154 }, { "auxiliary_loss_clip": 0.01181122, "auxiliary_loss_mlp": 0.01047705, "balance_loss_clip": 1.02941847, "balance_loss_mlp": 1.05064821, "epoch": 0.17153163986171652, "flos": 18624890332800.0, "grad_norm": 1.9541971468289472, "language_loss": 0.6958822, "learning_rate": 3.7168699030358305e-06, "loss": 0.71817052, "num_input_tokens_seen": 61847915, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.03125, "step": 2853, "time_per_iteration": 2.5658185482025146 }, { "auxiliary_loss_clip": 0.01190367, "auxiliary_loss_mlp": 0.01044776, "balance_loss_clip": 1.02787232, "balance_loss_mlp": 1.05103707, "epoch": 0.17159176311438448, "flos": 18223265387520.0, "grad_norm": 2.0351311442657156, "language_loss": 0.66469532, "learning_rate": 3.7166760788498355e-06, "loss": 0.68704677, "num_input_tokens_seen": 61865570, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 1.03125, "step": 2854, "time_per_iteration": 2.6697463989257812 }, { "auxiliary_loss_clip": 0.01169656, "auxiliary_loss_mlp": 0.01035922, "balance_loss_clip": 1.01879144, "balance_loss_mlp": 1.05112934, "epoch": 0.17165188636705245, "flos": 20738987159040.0, "grad_norm": 1.7139326101825094, "language_loss": 0.8909995, "learning_rate": 3.716482193400087e-06, "loss": 0.9130553, "num_input_tokens_seen": 61883340, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 1.0078125, "step": 2855, "time_per_iteration": 2.576279401779175 }, { "auxiliary_loss_clip": 0.01154448, "auxiliary_loss_mlp": 0.01045644, "balance_loss_clip": 1.02903843, "balance_loss_mlp": 1.05221236, "epoch": 0.17171200961972044, "flos": 24899884680960.0, "grad_norm": 2.130767002131282, "language_loss": 0.83159447, "learning_rate": 3.7162882466935042e-06, "loss": 0.85359538, "num_input_tokens_seen": 61900610, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 1.0234375, "step": 2856, "time_per_iteration": 2.6135025024414062 }, { "auxiliary_loss_clip": 0.01162889, "auxiliary_loss_mlp": 0.01041576, "balance_loss_clip": 1.0245887, "balance_loss_mlp": 1.05164194, "epoch": 0.1717721328723884, "flos": 20157234485760.0, "grad_norm": 4.052112391621518, "language_loss": 0.8683579, "learning_rate": 3.716094238737009e-06, "loss": 0.89040256, "num_input_tokens_seen": 61916795, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 1.0234375, "step": 2857, "time_per_iteration": 2.6011383533477783 }, { "auxiliary_loss_clip": 0.01161321, "auxiliary_loss_mlp": 0.01047533, "balance_loss_clip": 1.0296874, "balance_loss_mlp": 1.05078769, "epoch": 0.17183225612505637, "flos": 23361650697600.0, "grad_norm": 2.090542666057262, "language_loss": 0.78079486, "learning_rate": 3.715900169537524e-06, "loss": 0.80288339, "num_input_tokens_seen": 61936665, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 1.015625, "step": 2858, "time_per_iteration": 2.636960983276367 }, { "auxiliary_loss_clip": 0.01178016, "auxiliary_loss_mlp": 0.01047927, "balance_loss_clip": 1.02694631, "balance_loss_mlp": 1.04921412, "epoch": 0.17189237937772434, "flos": 18114240631680.0, "grad_norm": 1.985673489490971, "language_loss": 0.77123082, "learning_rate": 3.7157060391019767e-06, "loss": 0.79349029, "num_input_tokens_seen": 61954415, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.109375, "step": 2859, "time_per_iteration": 2.53977632522583 }, { "auxiliary_loss_clip": 0.01188219, "auxiliary_loss_mlp": 0.01041399, "balance_loss_clip": 1.02304077, "balance_loss_mlp": 1.0509696, "epoch": 0.1719525026303923, "flos": 23258408031360.0, "grad_norm": 2.17262698533174, "language_loss": 0.76761287, "learning_rate": 3.7155118474372936e-06, "loss": 0.78990901, "num_input_tokens_seen": 61973940, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0078125, "step": 2860, "time_per_iteration": 2.6276092529296875 }, { "auxiliary_loss_clip": 0.01160379, "auxiliary_loss_mlp": 0.01040678, "balance_loss_clip": 1.02313066, "balance_loss_mlp": 1.04810631, "epoch": 0.17201262588306027, "flos": 20810413353600.0, "grad_norm": 2.6571393380291184, "language_loss": 0.81435323, "learning_rate": 3.7153175945504057e-06, "loss": 0.83636379, "num_input_tokens_seen": 61991845, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.03125, "step": 2861, "time_per_iteration": 2.61755633354187 }, { "auxiliary_loss_clip": 0.01169752, "auxiliary_loss_mlp": 0.01038199, "balance_loss_clip": 1.02115846, "balance_loss_mlp": 1.0491569, "epoch": 0.17207274913572823, "flos": 20375858615040.0, "grad_norm": 2.3756818913939672, "language_loss": 0.85398918, "learning_rate": 3.7151232804482456e-06, "loss": 0.87606871, "num_input_tokens_seen": 62009395, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 1.0234375, "step": 2862, "time_per_iteration": 2.5645580291748047 }, { "auxiliary_loss_clip": 0.01173765, "auxiliary_loss_mlp": 0.01035836, "balance_loss_clip": 1.01986229, "balance_loss_mlp": 1.04832733, "epoch": 0.17213287238839622, "flos": 26797727675520.0, "grad_norm": 1.9698653597840086, "language_loss": 0.77930939, "learning_rate": 3.7149289051377474e-06, "loss": 0.80140543, "num_input_tokens_seen": 62029005, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.98828125, "step": 2863, "time_per_iteration": 2.647861957550049 }, { "auxiliary_loss_clip": 0.01176836, "auxiliary_loss_mlp": 0.01291305, "balance_loss_clip": 1.02608752, "balance_loss_mlp": 1.04920816, "epoch": 0.1721929956410642, "flos": 26030819370240.0, "grad_norm": 1.9052674050952898, "language_loss": 0.72409439, "learning_rate": 3.714734468625847e-06, "loss": 0.74877584, "num_input_tokens_seen": 62048730, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 1.0078125, "step": 2864, "time_per_iteration": 2.580289840698242 }, { "auxiliary_loss_clip": 0.01179537, "auxiliary_loss_mlp": 0.01049003, "balance_loss_clip": 1.03170562, "balance_loss_mlp": 1.04878485, "epoch": 0.17225311889373215, "flos": 22273091078400.0, "grad_norm": 2.2248897707993627, "language_loss": 0.72268999, "learning_rate": 3.714539970919485e-06, "loss": 0.74497539, "num_input_tokens_seen": 62069000, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 1.0390625, "step": 2865, "time_per_iteration": 2.624281644821167 }, { "auxiliary_loss_clip": 0.01161428, "auxiliary_loss_mlp": 0.01294906, "balance_loss_clip": 1.02987361, "balance_loss_mlp": 1.05120015, "epoch": 0.17231324214640012, "flos": 21287774125440.0, "grad_norm": 3.670007827710049, "language_loss": 0.78417009, "learning_rate": 3.7143454120256017e-06, "loss": 0.80873334, "num_input_tokens_seen": 62086750, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 1.015625, "step": 2866, "time_per_iteration": 2.533529043197632 }, { "auxiliary_loss_clip": 0.01151635, "auxiliary_loss_mlp": 0.0104709, "balance_loss_clip": 1.02868366, "balance_loss_mlp": 1.04891849, "epoch": 0.17237336539906808, "flos": 19680735640320.0, "grad_norm": 3.506155512202708, "language_loss": 0.79561067, "learning_rate": 3.71415079195114e-06, "loss": 0.81759799, "num_input_tokens_seen": 62106240, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0234375, "step": 2867, "time_per_iteration": 2.6618471145629883 }, { "auxiliary_loss_clip": 0.01170497, "auxiliary_loss_mlp": 0.01041018, "balance_loss_clip": 1.02327943, "balance_loss_mlp": 1.05001402, "epoch": 0.17243348865173605, "flos": 17529650784000.0, "grad_norm": 1.789220878062468, "language_loss": 0.79815245, "learning_rate": 3.713956110703046e-06, "loss": 0.82026762, "num_input_tokens_seen": 62124895, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.0234375, "step": 2868, "time_per_iteration": 2.536773204803467 }, { "auxiliary_loss_clip": 0.01193951, "auxiliary_loss_mlp": 0.01044956, "balance_loss_clip": 1.02818298, "balance_loss_mlp": 1.05293083, "epoch": 0.17249361190440402, "flos": 18259858368000.0, "grad_norm": 2.245305689799921, "language_loss": 0.83405113, "learning_rate": 3.713761368288268e-06, "loss": 0.85644019, "num_input_tokens_seen": 62143510, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 1.046875, "step": 2869, "time_per_iteration": 2.6409013271331787 }, { "auxiliary_loss_clip": 0.01173173, "auxiliary_loss_mlp": 0.01049622, "balance_loss_clip": 1.02992845, "balance_loss_mlp": 1.05039966, "epoch": 0.172553735157072, "flos": 21174367910400.0, "grad_norm": 1.9490697091146492, "language_loss": 0.77018553, "learning_rate": 3.713566564713754e-06, "loss": 0.79241347, "num_input_tokens_seen": 62162285, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.046875, "step": 2870, "time_per_iteration": 2.55143141746521 }, { "auxiliary_loss_clip": 0.01147404, "auxiliary_loss_mlp": 0.01039936, "balance_loss_clip": 1.02340198, "balance_loss_mlp": 1.04996967, "epoch": 0.17261385840973997, "flos": 22273270646400.0, "grad_norm": 1.7347824280279278, "language_loss": 0.76973248, "learning_rate": 3.7133716999864574e-06, "loss": 0.79160583, "num_input_tokens_seen": 62180970, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9765625, "step": 2871, "time_per_iteration": 2.640549898147583 }, { "auxiliary_loss_clip": 0.01170411, "auxiliary_loss_mlp": 0.01043957, "balance_loss_clip": 1.02562296, "balance_loss_mlp": 1.04952204, "epoch": 0.17267398166240794, "flos": 27922233830400.0, "grad_norm": 2.3601812986617188, "language_loss": 0.74494267, "learning_rate": 3.7131767741133327e-06, "loss": 0.76708627, "num_input_tokens_seen": 62198965, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0234375, "step": 2872, "time_per_iteration": 2.5779716968536377 }, { "auxiliary_loss_clip": 0.01147538, "auxiliary_loss_mlp": 0.01038911, "balance_loss_clip": 1.0212208, "balance_loss_mlp": 1.04849482, "epoch": 0.1727341049150759, "flos": 21945118970880.0, "grad_norm": 1.8295462366019608, "language_loss": 0.81921828, "learning_rate": 3.712981787101335e-06, "loss": 0.84108281, "num_input_tokens_seen": 62219890, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9921875, "step": 2873, "time_per_iteration": 2.674375057220459 }, { "auxiliary_loss_clip": 0.0117678, "auxiliary_loss_mlp": 0.01041777, "balance_loss_clip": 1.02339482, "balance_loss_mlp": 1.04946184, "epoch": 0.17279422816774387, "flos": 18107883924480.0, "grad_norm": 2.023778642756291, "language_loss": 0.74734318, "learning_rate": 3.7127867389574244e-06, "loss": 0.76952875, "num_input_tokens_seen": 62237140, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0, "step": 2874, "time_per_iteration": 2.5504844188690186 }, { "auxiliary_loss_clip": 0.01151787, "auxiliary_loss_mlp": 0.01046581, "balance_loss_clip": 1.02765012, "balance_loss_mlp": 1.04951942, "epoch": 0.17285435142041183, "flos": 21835447770240.0, "grad_norm": 2.580111592313788, "language_loss": 0.80498743, "learning_rate": 3.7125916296885606e-06, "loss": 0.82697117, "num_input_tokens_seen": 62255405, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0234375, "step": 2875, "time_per_iteration": 2.6750097274780273 }, { "auxiliary_loss_clip": 0.01172404, "auxiliary_loss_mlp": 0.01044979, "balance_loss_clip": 1.02664423, "balance_loss_mlp": 1.04997206, "epoch": 0.17291447467307983, "flos": 18368452160640.0, "grad_norm": 2.5161437968851725, "language_loss": 0.88100761, "learning_rate": 3.7123964593017066e-06, "loss": 0.90318143, "num_input_tokens_seen": 62271280, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.046875, "step": 2876, "time_per_iteration": 2.5443053245544434 }, { "auxiliary_loss_clip": 0.01171099, "auxiliary_loss_mlp": 0.01038453, "balance_loss_clip": 1.02075052, "balance_loss_mlp": 1.0524869, "epoch": 0.1729745979257478, "flos": 18624638937600.0, "grad_norm": 2.0684179809317795, "language_loss": 0.84206545, "learning_rate": 3.7122012278038285e-06, "loss": 0.86416095, "num_input_tokens_seen": 62289140, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 1.0078125, "step": 2877, "time_per_iteration": 2.6790659427642822 }, { "auxiliary_loss_clip": 0.01179054, "auxiliary_loss_mlp": 0.01045838, "balance_loss_clip": 1.02765846, "balance_loss_mlp": 1.05169678, "epoch": 0.17303472117841576, "flos": 22998234844800.0, "grad_norm": 2.274715285131001, "language_loss": 0.79466647, "learning_rate": 3.7120059352018922e-06, "loss": 0.81691539, "num_input_tokens_seen": 62307490, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0, "step": 2878, "time_per_iteration": 2.6443614959716797 }, { "auxiliary_loss_clip": 0.0116826, "auxiliary_loss_mlp": 0.01042004, "balance_loss_clip": 1.02507687, "balance_loss_mlp": 1.05053329, "epoch": 0.17309484443108372, "flos": 25664386775040.0, "grad_norm": 1.922086256814665, "language_loss": 0.70086586, "learning_rate": 3.7118105815028677e-06, "loss": 0.72296858, "num_input_tokens_seen": 62328570, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.99609375, "step": 2879, "time_per_iteration": 2.707157850265503 }, { "auxiliary_loss_clip": 0.01168905, "auxiliary_loss_mlp": 0.01046053, "balance_loss_clip": 1.02868414, "balance_loss_mlp": 1.048944, "epoch": 0.1731549676837517, "flos": 13552903313280.0, "grad_norm": 1.9715801851582966, "language_loss": 0.82659811, "learning_rate": 3.7116151667137272e-06, "loss": 0.84874773, "num_input_tokens_seen": 62345735, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.015625, "step": 2880, "time_per_iteration": 2.572129011154175 }, { "auxiliary_loss_clip": 0.01194137, "auxiliary_loss_mlp": 0.0104457, "balance_loss_clip": 1.02610481, "balance_loss_mlp": 1.0535562, "epoch": 0.17321509093641965, "flos": 22857070394880.0, "grad_norm": 2.1171862179427006, "language_loss": 0.80985677, "learning_rate": 3.7114196908414444e-06, "loss": 0.8322438, "num_input_tokens_seen": 62365525, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.046875, "step": 2881, "time_per_iteration": 2.6688883304595947 }, { "auxiliary_loss_clip": 0.01170959, "auxiliary_loss_mlp": 0.01045266, "balance_loss_clip": 1.0283618, "balance_loss_mlp": 1.05179048, "epoch": 0.17327521418908762, "flos": 24352785653760.0, "grad_norm": 2.0691024797472637, "language_loss": 0.77242911, "learning_rate": 3.7112241538929946e-06, "loss": 0.79459131, "num_input_tokens_seen": 62385160, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 1.015625, "step": 2882, "time_per_iteration": 2.6900365352630615 }, { "auxiliary_loss_clip": 0.01187651, "auxiliary_loss_mlp": 0.01052243, "balance_loss_clip": 1.0345881, "balance_loss_mlp": 1.05184031, "epoch": 0.1733353374417556, "flos": 33105651816960.0, "grad_norm": 1.828010121348785, "language_loss": 0.75861126, "learning_rate": 3.711028555875357e-06, "loss": 0.78101015, "num_input_tokens_seen": 62405280, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.99609375, "step": 2883, "time_per_iteration": 2.7135274410247803 }, { "auxiliary_loss_clip": 0.0117698, "auxiliary_loss_mlp": 0.01047133, "balance_loss_clip": 1.02984786, "balance_loss_mlp": 1.05084789, "epoch": 0.17339546069442358, "flos": 24388947671040.0, "grad_norm": 2.177466548019497, "language_loss": 0.85308909, "learning_rate": 3.7108328967955113e-06, "loss": 0.87533021, "num_input_tokens_seen": 62423665, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.98828125, "step": 2884, "time_per_iteration": 4.039714574813843 }, { "auxiliary_loss_clip": 0.01171708, "auxiliary_loss_mlp": 0.01292706, "balance_loss_clip": 1.02751374, "balance_loss_mlp": 1.05238736, "epoch": 0.17345558394709154, "flos": 27454174680960.0, "grad_norm": 1.6439866195928305, "language_loss": 0.7401697, "learning_rate": 3.7106371766604408e-06, "loss": 0.76481384, "num_input_tokens_seen": 62445170, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 1.015625, "step": 2885, "time_per_iteration": 4.2043187618255615 }, { "auxiliary_loss_clip": 0.01166743, "auxiliary_loss_mlp": 0.01048138, "balance_loss_clip": 1.03188944, "balance_loss_mlp": 1.05298543, "epoch": 0.1735157071997595, "flos": 24682158391680.0, "grad_norm": 1.7964214313900373, "language_loss": 0.70532387, "learning_rate": 3.7104413954771294e-06, "loss": 0.72747266, "num_input_tokens_seen": 62466135, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.953125, "step": 2886, "time_per_iteration": 3.9887871742248535 }, { "auxiliary_loss_clip": 0.01160629, "auxiliary_loss_mlp": 0.01039842, "balance_loss_clip": 1.02204394, "balance_loss_mlp": 1.05014527, "epoch": 0.17357583045242747, "flos": 21688932193920.0, "grad_norm": 2.1503826009289964, "language_loss": 0.69079524, "learning_rate": 3.710245553252564e-06, "loss": 0.71279997, "num_input_tokens_seen": 62483910, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.015625, "step": 2887, "time_per_iteration": 2.5451810359954834 }, { "auxiliary_loss_clip": 0.01178704, "auxiliary_loss_mlp": 0.01046212, "balance_loss_clip": 1.02945173, "balance_loss_mlp": 1.05046821, "epoch": 0.17363595370509544, "flos": 15375728753280.0, "grad_norm": 1.664134120240517, "language_loss": 0.85069036, "learning_rate": 3.7100496499937345e-06, "loss": 0.87293953, "num_input_tokens_seen": 62501530, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 1.0078125, "step": 2888, "time_per_iteration": 2.577548027038574 }, { "auxiliary_loss_clip": 0.01197592, "auxiliary_loss_mlp": 0.0104615, "balance_loss_clip": 1.02767301, "balance_loss_mlp": 1.05164886, "epoch": 0.1736960769577634, "flos": 23440941970560.0, "grad_norm": 2.0429458906268185, "language_loss": 0.78138041, "learning_rate": 3.7098536857076315e-06, "loss": 0.80381787, "num_input_tokens_seen": 62521295, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0078125, "step": 2889, "time_per_iteration": 4.065525531768799 }, { "auxiliary_loss_clip": 0.01174946, "auxiliary_loss_mlp": 0.01291116, "balance_loss_clip": 1.02457714, "balance_loss_mlp": 1.05090678, "epoch": 0.1737562002104314, "flos": 18587830475520.0, "grad_norm": 2.1203021523383994, "language_loss": 0.84026313, "learning_rate": 3.7096576604012492e-06, "loss": 0.86492383, "num_input_tokens_seen": 62539615, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.97265625, "step": 2890, "time_per_iteration": 2.5884127616882324 }, { "auxiliary_loss_clip": 0.0116214, "auxiliary_loss_mlp": 0.01041595, "balance_loss_clip": 1.02424979, "balance_loss_mlp": 1.0514338, "epoch": 0.17381632346309936, "flos": 15998060816640.0, "grad_norm": 1.9565728829392166, "language_loss": 0.82234681, "learning_rate": 3.7094615740815824e-06, "loss": 0.84438413, "num_input_tokens_seen": 62556820, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.015625, "step": 2891, "time_per_iteration": 2.470818281173706 }, { "auxiliary_loss_clip": 0.01178605, "auxiliary_loss_mlp": 0.0104324, "balance_loss_clip": 1.02422571, "balance_loss_mlp": 1.04791641, "epoch": 0.17387644671576732, "flos": 13369830670080.0, "grad_norm": 2.010563315431654, "language_loss": 0.8127501, "learning_rate": 3.709265426755629e-06, "loss": 0.83496857, "num_input_tokens_seen": 62572450, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.03125, "step": 2892, "time_per_iteration": 2.541407585144043 }, { "auxiliary_loss_clip": 0.01163244, "auxiliary_loss_mlp": 0.01052493, "balance_loss_clip": 1.03361058, "balance_loss_mlp": 1.05212951, "epoch": 0.1739365699684353, "flos": 26615516958720.0, "grad_norm": 4.193828909957851, "language_loss": 0.74324816, "learning_rate": 3.7090692184303894e-06, "loss": 0.76540554, "num_input_tokens_seen": 62592580, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0234375, "step": 2893, "time_per_iteration": 2.605726480484009 }, { "auxiliary_loss_clip": 0.01170893, "auxiliary_loss_mlp": 0.01045933, "balance_loss_clip": 1.02768219, "balance_loss_mlp": 1.05076396, "epoch": 0.17399669322110325, "flos": 23367971491200.0, "grad_norm": 1.984185670246576, "language_loss": 0.82352602, "learning_rate": 3.7088729491128665e-06, "loss": 0.8456943, "num_input_tokens_seen": 62611220, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0234375, "step": 2894, "time_per_iteration": 2.6593596935272217 }, { "auxiliary_loss_clip": 0.01179557, "auxiliary_loss_mlp": 0.01043861, "balance_loss_clip": 1.02249825, "balance_loss_mlp": 1.05044174, "epoch": 0.17405681647377122, "flos": 22054107813120.0, "grad_norm": 1.9244686435057572, "language_loss": 0.7399292, "learning_rate": 3.708676618810063e-06, "loss": 0.76216334, "num_input_tokens_seen": 62629185, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.015625, "step": 2895, "time_per_iteration": 2.637868642807007 }, { "auxiliary_loss_clip": 0.01113465, "auxiliary_loss_mlp": 0.01015717, "balance_loss_clip": 1.01339269, "balance_loss_mlp": 1.03800392, "epoch": 0.1741169397264392, "flos": 61457559114240.0, "grad_norm": 0.8715103797755, "language_loss": 0.62752014, "learning_rate": 3.7084802275289866e-06, "loss": 0.64881194, "num_input_tokens_seen": 62691895, "router_z_loss_clip": 0.02319336, "router_z_loss_mlp": 0.39257812, "step": 2896, "time_per_iteration": 3.3131260871887207 }, { "auxiliary_loss_clip": 0.01166361, "auxiliary_loss_mlp": 0.01283333, "balance_loss_clip": 1.01794028, "balance_loss_mlp": 1.04681468, "epoch": 0.17417706297910718, "flos": 27017680608000.0, "grad_norm": 1.7958688989394334, "language_loss": 0.75711977, "learning_rate": 3.708283775276645e-06, "loss": 0.78161669, "num_input_tokens_seen": 62713790, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 1.015625, "step": 2897, "time_per_iteration": 2.673267126083374 }, { "auxiliary_loss_clip": 0.01158585, "auxiliary_loss_mlp": 0.01039721, "balance_loss_clip": 1.02170885, "balance_loss_mlp": 1.05049706, "epoch": 0.17423718623177514, "flos": 33508856960640.0, "grad_norm": 2.938901453026472, "language_loss": 0.68682754, "learning_rate": 3.70808726206005e-06, "loss": 0.70881063, "num_input_tokens_seen": 62736285, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9921875, "step": 2898, "time_per_iteration": 2.66863751411438 }, { "auxiliary_loss_clip": 0.01173466, "auxiliary_loss_mlp": 0.01044504, "balance_loss_clip": 1.02650332, "balance_loss_mlp": 1.05188203, "epoch": 0.1742973094844431, "flos": 27198634348800.0, "grad_norm": 2.3757566800372154, "language_loss": 0.75622022, "learning_rate": 3.7078906878862145e-06, "loss": 0.77839994, "num_input_tokens_seen": 62756240, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.03125, "step": 2899, "time_per_iteration": 2.6892876625061035 }, { "auxiliary_loss_clip": 0.01184995, "auxiliary_loss_mlp": 0.01042316, "balance_loss_clip": 1.02427995, "balance_loss_mlp": 1.05002737, "epoch": 0.17435743273711107, "flos": 22710734386560.0, "grad_norm": 1.9257361693141117, "language_loss": 0.72552001, "learning_rate": 3.7076940527621536e-06, "loss": 0.74779314, "num_input_tokens_seen": 62775910, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.98828125, "step": 2900, "time_per_iteration": 2.585599660873413 }, { "auxiliary_loss_clip": 0.01180852, "auxiliary_loss_mlp": 0.01293433, "balance_loss_clip": 1.02786422, "balance_loss_mlp": 1.0520153, "epoch": 0.17441755598977904, "flos": 41646466039680.0, "grad_norm": 1.8396426751079116, "language_loss": 0.70054734, "learning_rate": 3.707497356694884e-06, "loss": 0.72529018, "num_input_tokens_seen": 62799385, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.015625, "step": 2901, "time_per_iteration": 2.750847101211548 }, { "auxiliary_loss_clip": 0.01172964, "auxiliary_loss_mlp": 0.01048775, "balance_loss_clip": 1.03133512, "balance_loss_mlp": 1.05210996, "epoch": 0.174477679242447, "flos": 26287077974400.0, "grad_norm": 2.0968880568056463, "language_loss": 0.76054716, "learning_rate": 3.707300599691427e-06, "loss": 0.78276455, "num_input_tokens_seen": 62819380, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.03125, "step": 2902, "time_per_iteration": 2.6441757678985596 }, { "auxiliary_loss_clip": 0.01159329, "auxiliary_loss_mlp": 0.01051855, "balance_loss_clip": 1.03478408, "balance_loss_mlp": 1.04905403, "epoch": 0.174537802495115, "flos": 17858412990720.0, "grad_norm": 2.210354108516872, "language_loss": 0.81151277, "learning_rate": 3.7071037817588023e-06, "loss": 0.83362466, "num_input_tokens_seen": 62836205, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 1.015625, "step": 2903, "time_per_iteration": 2.5487539768218994 }, { "auxiliary_loss_clip": 0.0116976, "auxiliary_loss_mlp": 0.01041363, "balance_loss_clip": 1.02383912, "balance_loss_mlp": 1.05107331, "epoch": 0.17459792574778296, "flos": 16940715390720.0, "grad_norm": 1.7424441419920376, "language_loss": 0.73375642, "learning_rate": 3.706906902904036e-06, "loss": 0.75586766, "num_input_tokens_seen": 62854045, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.0078125, "step": 2904, "time_per_iteration": 2.5613291263580322 }, { "auxiliary_loss_clip": 0.01197406, "auxiliary_loss_mlp": 0.01042868, "balance_loss_clip": 1.02558303, "balance_loss_mlp": 1.05113649, "epoch": 0.17465804900045093, "flos": 25520026014720.0, "grad_norm": 2.0336066767549945, "language_loss": 0.64358318, "learning_rate": 3.7067099631341517e-06, "loss": 0.66598588, "num_input_tokens_seen": 62873075, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 1.0078125, "step": 2905, "time_per_iteration": 2.702043294906616 }, { "auxiliary_loss_clip": 0.01188752, "auxiliary_loss_mlp": 0.01046227, "balance_loss_clip": 1.0273447, "balance_loss_mlp": 1.05441165, "epoch": 0.1747181722531189, "flos": 24129708238080.0, "grad_norm": 2.3696300236660295, "language_loss": 0.79813981, "learning_rate": 3.70651296245618e-06, "loss": 0.82048965, "num_input_tokens_seen": 62892675, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0703125, "step": 2906, "time_per_iteration": 2.613236904144287 }, { "auxiliary_loss_clip": 0.01162997, "auxiliary_loss_mlp": 0.01053002, "balance_loss_clip": 1.03547859, "balance_loss_mlp": 1.05357838, "epoch": 0.17477829550578686, "flos": 17748813617280.0, "grad_norm": 1.8061429339933956, "language_loss": 0.81042492, "learning_rate": 3.70631590087715e-06, "loss": 0.83258486, "num_input_tokens_seen": 62910675, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.0078125, "step": 2907, "time_per_iteration": 2.649500846862793 }, { "auxiliary_loss_clip": 0.01163823, "auxiliary_loss_mlp": 0.01044501, "balance_loss_clip": 1.02735877, "balance_loss_mlp": 1.05294812, "epoch": 0.17483841875845482, "flos": 15377344865280.0, "grad_norm": 2.12404961976927, "language_loss": 0.80578852, "learning_rate": 3.706118778404095e-06, "loss": 0.82787174, "num_input_tokens_seen": 62928130, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 1.0234375, "step": 2908, "time_per_iteration": 2.5538077354431152 }, { "auxiliary_loss_clip": 0.0117178, "auxiliary_loss_mlp": 0.0105126, "balance_loss_clip": 1.03201985, "balance_loss_mlp": 1.05226183, "epoch": 0.17489854201112282, "flos": 17163254102400.0, "grad_norm": 2.647643312270932, "language_loss": 0.7970615, "learning_rate": 3.7059215950440487e-06, "loss": 0.81929183, "num_input_tokens_seen": 62944290, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.015625, "step": 2909, "time_per_iteration": 2.5650644302368164 }, { "auxiliary_loss_clip": 0.0119229, "auxiliary_loss_mlp": 0.01045621, "balance_loss_clip": 1.02757263, "balance_loss_mlp": 1.05255556, "epoch": 0.17495866526379078, "flos": 19755286318080.0, "grad_norm": 1.8315326212947718, "language_loss": 0.76609129, "learning_rate": 3.7057243508040494e-06, "loss": 0.78847039, "num_input_tokens_seen": 62963505, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.03125, "step": 2910, "time_per_iteration": 2.611931085586548 }, { "auxiliary_loss_clip": 0.01153808, "auxiliary_loss_mlp": 0.01046197, "balance_loss_clip": 1.02715945, "balance_loss_mlp": 1.05169153, "epoch": 0.17501878851645875, "flos": 28511133310080.0, "grad_norm": 1.6882175309022807, "language_loss": 0.87089586, "learning_rate": 3.7055270456911354e-06, "loss": 0.89289594, "num_input_tokens_seen": 62985020, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0234375, "step": 2911, "time_per_iteration": 2.596503973007202 }, { "auxiliary_loss_clip": 0.01149883, "auxiliary_loss_mlp": 0.01297397, "balance_loss_clip": 1.03076315, "balance_loss_mlp": 1.0485146, "epoch": 0.1750789117691267, "flos": 17931203902080.0, "grad_norm": 2.2386408709251793, "language_loss": 0.89489412, "learning_rate": 3.7053296797123485e-06, "loss": 0.91936696, "num_input_tokens_seen": 63001745, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.015625, "step": 2912, "time_per_iteration": 2.5606794357299805 }, { "auxiliary_loss_clip": 0.0116366, "auxiliary_loss_mlp": 0.01044792, "balance_loss_clip": 1.02613592, "balance_loss_mlp": 1.05145252, "epoch": 0.17513903502179468, "flos": 18259427404800.0, "grad_norm": 2.396953417608618, "language_loss": 0.72232282, "learning_rate": 3.7051322528747313e-06, "loss": 0.7444073, "num_input_tokens_seen": 63019750, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.03125, "step": 2913, "time_per_iteration": 2.542771816253662 }, { "auxiliary_loss_clip": 0.01171326, "auxiliary_loss_mlp": 0.01042954, "balance_loss_clip": 1.02512026, "balance_loss_mlp": 1.05255294, "epoch": 0.17519915827446264, "flos": 20704728562560.0, "grad_norm": 1.8022758495731344, "language_loss": 0.6908766, "learning_rate": 3.704934765185331e-06, "loss": 0.71301943, "num_input_tokens_seen": 63039500, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.0078125, "step": 2914, "time_per_iteration": 2.6676881313323975 }, { "auxiliary_loss_clip": 0.01160897, "auxiliary_loss_mlp": 0.01042831, "balance_loss_clip": 1.02514076, "balance_loss_mlp": 1.05073476, "epoch": 0.1752592815271306, "flos": 20523415685760.0, "grad_norm": 1.6346161076393426, "language_loss": 0.93241042, "learning_rate": 3.7047372166511945e-06, "loss": 0.95444763, "num_input_tokens_seen": 63059785, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 1.015625, "step": 2915, "time_per_iteration": 2.5671801567077637 }, { "auxiliary_loss_clip": 0.01173708, "auxiliary_loss_mlp": 0.01043728, "balance_loss_clip": 1.02596545, "balance_loss_mlp": 1.04945397, "epoch": 0.1753194047797986, "flos": 21799178012160.0, "grad_norm": 2.9357656059740873, "language_loss": 0.80682516, "learning_rate": 3.704539607279371e-06, "loss": 0.82899952, "num_input_tokens_seen": 63079385, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.97265625, "step": 2916, "time_per_iteration": 2.6311581134796143 }, { "auxiliary_loss_clip": 0.01159549, "auxiliary_loss_mlp": 0.01050522, "balance_loss_clip": 1.03235495, "balance_loss_mlp": 1.04869318, "epoch": 0.17537952803246656, "flos": 20668351063680.0, "grad_norm": 1.8092653405880972, "language_loss": 0.73600388, "learning_rate": 3.704341937076914e-06, "loss": 0.75810456, "num_input_tokens_seen": 63098970, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.015625, "step": 2917, "time_per_iteration": 2.5701026916503906 }, { "auxiliary_loss_clip": 0.01153942, "auxiliary_loss_mlp": 0.01046093, "balance_loss_clip": 1.02732992, "balance_loss_mlp": 1.04802239, "epoch": 0.17543965128513453, "flos": 23295072839040.0, "grad_norm": 1.994604142982471, "language_loss": 0.77025133, "learning_rate": 3.7041442060508778e-06, "loss": 0.79225171, "num_input_tokens_seen": 63118750, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.96875, "step": 2918, "time_per_iteration": 2.5906002521514893 }, { "auxiliary_loss_clip": 0.0119945, "auxiliary_loss_mlp": 0.01044358, "balance_loss_clip": 1.02596378, "balance_loss_mlp": 1.05030847, "epoch": 0.1754997745378025, "flos": 29095615416960.0, "grad_norm": 2.0453621736433645, "language_loss": 0.74158567, "learning_rate": 3.7039464142083183e-06, "loss": 0.76402372, "num_input_tokens_seen": 63136865, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.046875, "step": 2919, "time_per_iteration": 2.6814537048339844 }, { "auxiliary_loss_clip": 0.01174247, "auxiliary_loss_mlp": 0.01042768, "balance_loss_clip": 1.0252558, "balance_loss_mlp": 1.05125678, "epoch": 0.17555989779047046, "flos": 30371844620160.0, "grad_norm": 1.9709585390144488, "language_loss": 0.7449137, "learning_rate": 3.7037485615562936e-06, "loss": 0.76708388, "num_input_tokens_seen": 63158325, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 1.046875, "step": 2920, "time_per_iteration": 2.6710970401763916 }, { "auxiliary_loss_clip": 0.01167434, "auxiliary_loss_mlp": 0.01039667, "balance_loss_clip": 1.02257252, "balance_loss_mlp": 1.05009019, "epoch": 0.17562002104313842, "flos": 23287746464640.0, "grad_norm": 2.2350837461332023, "language_loss": 0.79150939, "learning_rate": 3.703550648101866e-06, "loss": 0.81358039, "num_input_tokens_seen": 63173115, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9921875, "step": 2921, "time_per_iteration": 2.560333251953125 }, { "auxiliary_loss_clip": 0.01182243, "auxiliary_loss_mlp": 0.01043681, "balance_loss_clip": 1.02578783, "balance_loss_mlp": 1.05128169, "epoch": 0.1756801442958064, "flos": 24790500789120.0, "grad_norm": 1.9274895278171467, "language_loss": 0.8784321, "learning_rate": 3.7033526738520983e-06, "loss": 0.90069133, "num_input_tokens_seen": 63192880, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0390625, "step": 2922, "time_per_iteration": 2.683689832687378 }, { "auxiliary_loss_clip": 0.01160721, "auxiliary_loss_mlp": 0.01042238, "balance_loss_clip": 1.02401078, "balance_loss_mlp": 1.0493995, "epoch": 0.17574026754847438, "flos": 25771651764480.0, "grad_norm": 2.1919120326152406, "language_loss": 0.62305999, "learning_rate": 3.7031546388140545e-06, "loss": 0.64508963, "num_input_tokens_seen": 63214395, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.0234375, "step": 2923, "time_per_iteration": 2.6317570209503174 }, { "auxiliary_loss_clip": 0.01183295, "auxiliary_loss_mlp": 0.01044818, "balance_loss_clip": 1.02567315, "balance_loss_mlp": 1.05260217, "epoch": 0.17580039080114235, "flos": 17456608477440.0, "grad_norm": 3.729863562413301, "language_loss": 0.80312729, "learning_rate": 3.702956542994802e-06, "loss": 0.82540834, "num_input_tokens_seen": 63231020, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.03125, "step": 2924, "time_per_iteration": 2.640544891357422 }, { "auxiliary_loss_clip": 0.01174342, "auxiliary_loss_mlp": 0.01053272, "balance_loss_clip": 1.03379321, "balance_loss_mlp": 1.05160952, "epoch": 0.1758605140538103, "flos": 14864648088960.0, "grad_norm": 2.1369654001208707, "language_loss": 0.70981151, "learning_rate": 3.7027583864014123e-06, "loss": 0.73208767, "num_input_tokens_seen": 63246245, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.046875, "step": 2925, "time_per_iteration": 3.9240405559539795 }, { "auxiliary_loss_clip": 0.01178565, "auxiliary_loss_mlp": 0.01039688, "balance_loss_clip": 1.02227139, "balance_loss_mlp": 1.05149674, "epoch": 0.17592063730647828, "flos": 23004268329600.0, "grad_norm": 1.8436306523559327, "language_loss": 0.71898383, "learning_rate": 3.7025601690409555e-06, "loss": 0.74116635, "num_input_tokens_seen": 63267790, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.0, "step": 2926, "time_per_iteration": 2.6579432487487793 }, { "auxiliary_loss_clip": 0.01180252, "auxiliary_loss_mlp": 0.0103802, "balance_loss_clip": 1.01850581, "balance_loss_mlp": 1.04969263, "epoch": 0.17598076055914624, "flos": 20741501111040.0, "grad_norm": 1.827144730213423, "language_loss": 0.84915876, "learning_rate": 3.702361890920505e-06, "loss": 0.87134147, "num_input_tokens_seen": 63286830, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.03125, "step": 2927, "time_per_iteration": 5.4435954093933105 }, { "auxiliary_loss_clip": 0.0118706, "auxiliary_loss_mlp": 0.01042273, "balance_loss_clip": 1.02538133, "balance_loss_mlp": 1.05174315, "epoch": 0.1760408838118142, "flos": 34092441227520.0, "grad_norm": 1.9078249467053268, "language_loss": 0.72103202, "learning_rate": 3.702163552047138e-06, "loss": 0.74332535, "num_input_tokens_seen": 63308870, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.98828125, "step": 2928, "time_per_iteration": 2.830928325653076 }, { "auxiliary_loss_clip": 0.01176807, "auxiliary_loss_mlp": 0.01043415, "balance_loss_clip": 1.02519941, "balance_loss_mlp": 1.05137563, "epoch": 0.1761010070644822, "flos": 24168384207360.0, "grad_norm": 2.0270208458273022, "language_loss": 0.83333099, "learning_rate": 3.7019651524279326e-06, "loss": 0.85553324, "num_input_tokens_seen": 63329005, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.98046875, "step": 2929, "time_per_iteration": 2.6230695247650146 }, { "auxiliary_loss_clip": 0.01169728, "auxiliary_loss_mlp": 0.01039328, "balance_loss_clip": 1.02193582, "balance_loss_mlp": 1.05062938, "epoch": 0.17616113031715017, "flos": 26576697335040.0, "grad_norm": 1.5808571750661766, "language_loss": 0.7928766, "learning_rate": 3.7017666920699693e-06, "loss": 0.81496716, "num_input_tokens_seen": 63349390, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.0078125, "step": 2930, "time_per_iteration": 4.910793304443359 }, { "auxiliary_loss_clip": 0.01154263, "auxiliary_loss_mlp": 0.01044936, "balance_loss_clip": 1.0264343, "balance_loss_mlp": 1.05362546, "epoch": 0.17622125356981813, "flos": 25666685245440.0, "grad_norm": 1.964420034300023, "language_loss": 0.76708257, "learning_rate": 3.701568170980329e-06, "loss": 0.7890746, "num_input_tokens_seen": 63368835, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0, "step": 2931, "time_per_iteration": 2.6627814769744873 }, { "auxiliary_loss_clip": 0.01208208, "auxiliary_loss_mlp": 0.01042966, "balance_loss_clip": 1.02553809, "balance_loss_mlp": 1.05209208, "epoch": 0.1762813768224861, "flos": 16508530949760.0, "grad_norm": 2.76598541743139, "language_loss": 0.74787378, "learning_rate": 3.7013695891660985e-06, "loss": 0.7703855, "num_input_tokens_seen": 63385220, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 1.015625, "step": 2932, "time_per_iteration": 2.619323968887329 }, { "auxiliary_loss_clip": 0.01168012, "auxiliary_loss_mlp": 0.01044546, "balance_loss_clip": 1.02540135, "balance_loss_mlp": 1.0519954, "epoch": 0.17634150007515406, "flos": 11211850402560.0, "grad_norm": 2.984912960089666, "language_loss": 0.89850259, "learning_rate": 3.701170946634364e-06, "loss": 0.92062813, "num_input_tokens_seen": 63400865, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0703125, "step": 2933, "time_per_iteration": 2.659160614013672 }, { "auxiliary_loss_clip": 0.01149229, "auxiliary_loss_mlp": 0.01047154, "balance_loss_clip": 1.03010678, "balance_loss_mlp": 1.05180001, "epoch": 0.17640162332782203, "flos": 23659925235840.0, "grad_norm": 1.7886232301668326, "language_loss": 0.88155437, "learning_rate": 3.700972243392214e-06, "loss": 0.9035182, "num_input_tokens_seen": 63421390, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9765625, "step": 2934, "time_per_iteration": 2.618105173110962 }, { "auxiliary_loss_clip": 0.01163713, "auxiliary_loss_mlp": 0.01042934, "balance_loss_clip": 1.02605438, "balance_loss_mlp": 1.04877138, "epoch": 0.17646174658049, "flos": 53796984606720.0, "grad_norm": 1.562233147768052, "language_loss": 0.70507467, "learning_rate": 3.70077347944674e-06, "loss": 0.72714114, "num_input_tokens_seen": 63444715, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.96875, "step": 2935, "time_per_iteration": 2.9505059719085693 }, { "auxiliary_loss_clip": 0.01182926, "auxiliary_loss_mlp": 0.01038486, "balance_loss_clip": 1.02066445, "balance_loss_mlp": 1.05191946, "epoch": 0.17652186983315798, "flos": 24243868638720.0, "grad_norm": 2.1197350589208757, "language_loss": 0.70206845, "learning_rate": 3.7005746548050353e-06, "loss": 0.72428262, "num_input_tokens_seen": 63465525, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.0390625, "step": 2936, "time_per_iteration": 2.682783842086792 }, { "auxiliary_loss_clip": 0.01191276, "auxiliary_loss_mlp": 0.01041736, "balance_loss_clip": 1.02517807, "balance_loss_mlp": 1.05532193, "epoch": 0.17658199308582595, "flos": 27454282421760.0, "grad_norm": 2.1919471407671605, "language_loss": 0.71120596, "learning_rate": 3.7003757694741956e-06, "loss": 0.73353606, "num_input_tokens_seen": 63485815, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 1.0, "step": 2937, "time_per_iteration": 2.6476314067840576 }, { "auxiliary_loss_clip": 0.01176871, "auxiliary_loss_mlp": 0.01043521, "balance_loss_clip": 1.02454305, "balance_loss_mlp": 1.05440986, "epoch": 0.17664211633849392, "flos": 22418672901120.0, "grad_norm": 2.277266431616182, "language_loss": 0.75517339, "learning_rate": 3.7001768234613188e-06, "loss": 0.77737731, "num_input_tokens_seen": 63503905, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.046875, "step": 2938, "time_per_iteration": 2.5823307037353516 }, { "auxiliary_loss_clip": 0.01161743, "auxiliary_loss_mlp": 0.0103583, "balance_loss_clip": 1.01883125, "balance_loss_mlp": 1.05192721, "epoch": 0.17670223959116188, "flos": 24715124098560.0, "grad_norm": 2.2242231686674168, "language_loss": 0.70785367, "learning_rate": 3.6999778167735043e-06, "loss": 0.72982943, "num_input_tokens_seen": 63521985, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 1.0078125, "step": 2939, "time_per_iteration": 2.5902647972106934 }, { "auxiliary_loss_clip": 0.01152275, "auxiliary_loss_mlp": 0.01039164, "balance_loss_clip": 1.0215925, "balance_loss_mlp": 1.05295801, "epoch": 0.17676236284382985, "flos": 22527051212160.0, "grad_norm": 2.2589642846015003, "language_loss": 0.72934377, "learning_rate": 3.699778749417855e-06, "loss": 0.75125813, "num_input_tokens_seen": 63539830, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9921875, "step": 2940, "time_per_iteration": 2.6035854816436768 }, { "auxiliary_loss_clip": 0.01183, "auxiliary_loss_mlp": 0.01043647, "balance_loss_clip": 1.02439475, "balance_loss_mlp": 1.05533803, "epoch": 0.1768224860964978, "flos": 12385160161920.0, "grad_norm": 2.320252644614458, "language_loss": 0.85182607, "learning_rate": 3.699579621401474e-06, "loss": 0.87409252, "num_input_tokens_seen": 63555495, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0078125, "step": 2941, "time_per_iteration": 2.573390483856201 }, { "auxiliary_loss_clip": 0.01151731, "auxiliary_loss_mlp": 0.01038243, "balance_loss_clip": 1.02076674, "balance_loss_mlp": 1.05106831, "epoch": 0.1768826093491658, "flos": 24353360271360.0, "grad_norm": 2.239531733641487, "language_loss": 0.76652861, "learning_rate": 3.699380432731468e-06, "loss": 0.78842837, "num_input_tokens_seen": 63575290, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 1.0078125, "step": 2942, "time_per_iteration": 2.617203712463379 }, { "auxiliary_loss_clip": 0.0118196, "auxiliary_loss_mlp": 0.0128942, "balance_loss_clip": 1.02319694, "balance_loss_mlp": 1.05261087, "epoch": 0.17694273260183377, "flos": 23587062497280.0, "grad_norm": 1.8925396202698068, "language_loss": 0.80230021, "learning_rate": 3.699181183414946e-06, "loss": 0.82701397, "num_input_tokens_seen": 63594670, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.015625, "step": 2943, "time_per_iteration": 2.5769717693328857 }, { "auxiliary_loss_clip": 0.01175532, "auxiliary_loss_mlp": 0.01045618, "balance_loss_clip": 1.02611494, "balance_loss_mlp": 1.05343699, "epoch": 0.17700285585450173, "flos": 26760991040640.0, "grad_norm": 2.3332556479864253, "language_loss": 0.80759728, "learning_rate": 3.698981873459018e-06, "loss": 0.82980883, "num_input_tokens_seen": 63614780, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0390625, "step": 2944, "time_per_iteration": 2.6618216037750244 }, { "auxiliary_loss_clip": 0.01155189, "auxiliary_loss_mlp": 0.01049467, "balance_loss_clip": 1.03187191, "balance_loss_mlp": 1.05350566, "epoch": 0.1770629791071697, "flos": 42776323320960.0, "grad_norm": 1.9560465781043466, "language_loss": 0.73376769, "learning_rate": 3.6987825028707976e-06, "loss": 0.75581425, "num_input_tokens_seen": 63637190, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.015625, "step": 2945, "time_per_iteration": 2.7111589908599854 }, { "auxiliary_loss_clip": 0.01165131, "auxiliary_loss_mlp": 0.01044596, "balance_loss_clip": 1.02673888, "balance_loss_mlp": 1.05627549, "epoch": 0.17712310235983766, "flos": 17345572560000.0, "grad_norm": 2.6905344885870113, "language_loss": 0.77833319, "learning_rate": 3.698583071657399e-06, "loss": 0.80043042, "num_input_tokens_seen": 63652140, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 1.0, "step": 2946, "time_per_iteration": 2.5939829349517822 }, { "auxiliary_loss_clip": 0.01183498, "auxiliary_loss_mlp": 0.01045115, "balance_loss_clip": 1.0272218, "balance_loss_mlp": 1.05599356, "epoch": 0.17718322561250563, "flos": 23878477537920.0, "grad_norm": 2.2232135716291657, "language_loss": 0.76084518, "learning_rate": 3.6983835798259404e-06, "loss": 0.78313136, "num_input_tokens_seen": 63671700, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 1.0078125, "step": 2947, "time_per_iteration": 2.632122755050659 }, { "auxiliary_loss_clip": 0.01164074, "auxiliary_loss_mlp": 0.01039247, "balance_loss_clip": 1.02104425, "balance_loss_mlp": 1.05508685, "epoch": 0.1772433488651736, "flos": 36466352104320.0, "grad_norm": 2.179197271242147, "language_loss": 0.72659802, "learning_rate": 3.6981840273835405e-06, "loss": 0.74863124, "num_input_tokens_seen": 63691685, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0, "step": 2948, "time_per_iteration": 2.763140916824341 }, { "auxiliary_loss_clip": 0.01168932, "auxiliary_loss_mlp": 0.01048058, "balance_loss_clip": 1.03034306, "balance_loss_mlp": 1.05878925, "epoch": 0.1773034721178416, "flos": 26684716510080.0, "grad_norm": 1.7493030322747427, "language_loss": 0.81557643, "learning_rate": 3.6979844143373207e-06, "loss": 0.83774626, "num_input_tokens_seen": 63711720, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.0078125, "step": 2949, "time_per_iteration": 2.711660623550415 }, { "auxiliary_loss_clip": 0.01080464, "auxiliary_loss_mlp": 0.01008901, "balance_loss_clip": 1.00586081, "balance_loss_mlp": 1.0328927, "epoch": 0.17736359537050955, "flos": 57117467617920.0, "grad_norm": 0.7727895956199575, "language_loss": 0.6503315, "learning_rate": 3.6977847406944053e-06, "loss": 0.67122507, "num_input_tokens_seen": 63776280, "router_z_loss_clip": 0.03039551, "router_z_loss_mlp": 0.38476562, "step": 2950, "time_per_iteration": 3.2627015113830566 }, { "auxiliary_loss_clip": 0.01182361, "auxiliary_loss_mlp": 0.01042628, "balance_loss_clip": 1.02461576, "balance_loss_mlp": 1.05694771, "epoch": 0.17742371862317752, "flos": 27198203385600.0, "grad_norm": 2.2630046032692466, "language_loss": 0.83791643, "learning_rate": 3.6975850064619193e-06, "loss": 0.86016631, "num_input_tokens_seen": 63797535, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.984375, "step": 2951, "time_per_iteration": 2.7104594707489014 }, { "auxiliary_loss_clip": 0.01176034, "auxiliary_loss_mlp": 0.01044647, "balance_loss_clip": 1.02658725, "balance_loss_mlp": 1.05460191, "epoch": 0.17748384187584548, "flos": 20959694277120.0, "grad_norm": 2.7421218323844045, "language_loss": 0.79827321, "learning_rate": 3.697385211646991e-06, "loss": 0.82047999, "num_input_tokens_seen": 63817045, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.03125, "step": 2952, "time_per_iteration": 2.662635564804077 }, { "auxiliary_loss_clip": 0.01161078, "auxiliary_loss_mlp": 0.01046793, "balance_loss_clip": 1.02701628, "balance_loss_mlp": 1.05942178, "epoch": 0.17754396512851345, "flos": 25009986844800.0, "grad_norm": 2.2927488168714563, "language_loss": 0.79562777, "learning_rate": 3.697185356256751e-06, "loss": 0.81770647, "num_input_tokens_seen": 63837665, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.015625, "step": 2953, "time_per_iteration": 2.653168201446533 }, { "auxiliary_loss_clip": 0.01187904, "auxiliary_loss_mlp": 0.01041155, "balance_loss_clip": 1.02353585, "balance_loss_mlp": 1.05907798, "epoch": 0.1776040883811814, "flos": 32051566275840.0, "grad_norm": 1.881702727190337, "language_loss": 0.87358236, "learning_rate": 3.6969854402983314e-06, "loss": 0.89587295, "num_input_tokens_seen": 63858455, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.015625, "step": 2954, "time_per_iteration": 2.672555446624756 }, { "auxiliary_loss_clip": 0.01180065, "auxiliary_loss_mlp": 0.01050936, "balance_loss_clip": 1.0318625, "balance_loss_mlp": 1.06038678, "epoch": 0.17766421163384938, "flos": 21574125348480.0, "grad_norm": 1.7831372333374256, "language_loss": 0.84717315, "learning_rate": 3.6967854637788665e-06, "loss": 0.86948311, "num_input_tokens_seen": 63876935, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.015625, "step": 2955, "time_per_iteration": 2.707944393157959 }, { "auxiliary_loss_clip": 0.01177653, "auxiliary_loss_mlp": 0.01044166, "balance_loss_clip": 1.02597511, "balance_loss_mlp": 1.05918109, "epoch": 0.17772433488651737, "flos": 22419319345920.0, "grad_norm": 1.8120729933561983, "language_loss": 0.70810533, "learning_rate": 3.696585426705493e-06, "loss": 0.73032355, "num_input_tokens_seen": 63896815, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0, "step": 2956, "time_per_iteration": 2.577702045440674 }, { "auxiliary_loss_clip": 0.0116838, "auxiliary_loss_mlp": 0.01048004, "balance_loss_clip": 1.0296104, "balance_loss_mlp": 1.05929863, "epoch": 0.17778445813918534, "flos": 25629445820160.0, "grad_norm": 1.9820825053849664, "language_loss": 0.82727396, "learning_rate": 3.6963853290853503e-06, "loss": 0.84943777, "num_input_tokens_seen": 63916140, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0, "step": 2957, "time_per_iteration": 2.624114751815796 }, { "auxiliary_loss_clip": 0.01192914, "auxiliary_loss_mlp": 0.01042998, "balance_loss_clip": 1.02536678, "balance_loss_mlp": 1.05674839, "epoch": 0.1778445813918533, "flos": 25628871202560.0, "grad_norm": 1.7259047033083437, "language_loss": 0.75156397, "learning_rate": 3.6961851709255784e-06, "loss": 0.7739231, "num_input_tokens_seen": 63935220, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.0, "step": 2958, "time_per_iteration": 2.6340816020965576 }, { "auxiliary_loss_clip": 0.01179497, "auxiliary_loss_mlp": 0.01042899, "balance_loss_clip": 1.02524459, "balance_loss_mlp": 1.05895185, "epoch": 0.17790470464452127, "flos": 22345522853760.0, "grad_norm": 3.1565189973785346, "language_loss": 0.80141461, "learning_rate": 3.6959849522333206e-06, "loss": 0.82363862, "num_input_tokens_seen": 63954550, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 1.0234375, "step": 2959, "time_per_iteration": 2.622138023376465 }, { "auxiliary_loss_clip": 0.01161326, "auxiliary_loss_mlp": 0.01051331, "balance_loss_clip": 1.03156638, "balance_loss_mlp": 1.05714679, "epoch": 0.17796482789718923, "flos": 18765875214720.0, "grad_norm": 1.943560474959149, "language_loss": 0.51814926, "learning_rate": 3.6957846730157222e-06, "loss": 0.54027587, "num_input_tokens_seen": 63972425, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0390625, "step": 2960, "time_per_iteration": 2.545741319656372 }, { "auxiliary_loss_clip": 0.01198217, "auxiliary_loss_mlp": 0.01054051, "balance_loss_clip": 1.03576434, "balance_loss_mlp": 1.05837524, "epoch": 0.1780249511498572, "flos": 23440941970560.0, "grad_norm": 2.0480960999774362, "language_loss": 0.88994884, "learning_rate": 3.6955843332799317e-06, "loss": 0.91247153, "num_input_tokens_seen": 63992165, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.0390625, "step": 2961, "time_per_iteration": 2.7755556106567383 }, { "auxiliary_loss_clip": 0.01187712, "auxiliary_loss_mlp": 0.01050572, "balance_loss_clip": 1.03093839, "balance_loss_mlp": 1.05582476, "epoch": 0.1780850744025252, "flos": 23367468700800.0, "grad_norm": 1.6283261096719424, "language_loss": 0.78869569, "learning_rate": 3.6953839330330972e-06, "loss": 0.81107855, "num_input_tokens_seen": 64013470, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.046875, "step": 2962, "time_per_iteration": 2.607989549636841 }, { "auxiliary_loss_clip": 0.01169941, "auxiliary_loss_mlp": 0.0104802, "balance_loss_clip": 1.02844584, "balance_loss_mlp": 1.0581913, "epoch": 0.17814519765519315, "flos": 13771994319360.0, "grad_norm": 1.5832350587864572, "language_loss": 0.7429055, "learning_rate": 3.6951834722823715e-06, "loss": 0.7650851, "num_input_tokens_seen": 64030975, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0234375, "step": 2963, "time_per_iteration": 2.674644947052002 }, { "auxiliary_loss_clip": 0.01180354, "auxiliary_loss_mlp": 0.01048397, "balance_loss_clip": 1.02848864, "balance_loss_mlp": 1.0567894, "epoch": 0.17820532090786112, "flos": 21976396738560.0, "grad_norm": 2.550172720003788, "language_loss": 0.78792918, "learning_rate": 3.6949829510349082e-06, "loss": 0.81021667, "num_input_tokens_seen": 64050075, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0625, "step": 2964, "time_per_iteration": 2.6084110736846924 }, { "auxiliary_loss_clip": 0.01167453, "auxiliary_loss_mlp": 0.01299, "balance_loss_clip": 1.03212559, "balance_loss_mlp": 1.0575664, "epoch": 0.17826544416052909, "flos": 24790752184320.0, "grad_norm": 2.077759687617816, "language_loss": 0.81077045, "learning_rate": 3.6947823692978634e-06, "loss": 0.83543497, "num_input_tokens_seen": 64071920, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0078125, "step": 2965, "time_per_iteration": 2.6706645488739014 }, { "auxiliary_loss_clip": 0.01162269, "auxiliary_loss_mlp": 0.01047611, "balance_loss_clip": 1.0301708, "balance_loss_mlp": 1.05271959, "epoch": 0.17832556741319705, "flos": 13879582531200.0, "grad_norm": 2.7360634500240515, "language_loss": 0.94965672, "learning_rate": 3.6945817270783955e-06, "loss": 0.97175556, "num_input_tokens_seen": 64086835, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 1.0078125, "step": 2966, "time_per_iteration": 2.5704586505889893 }, { "auxiliary_loss_clip": 0.01194354, "auxiliary_loss_mlp": 0.01294443, "balance_loss_clip": 1.02623773, "balance_loss_mlp": 1.05583143, "epoch": 0.17838569066586502, "flos": 36641703323520.0, "grad_norm": 2.091888682237519, "language_loss": 0.72648776, "learning_rate": 3.6943810243836648e-06, "loss": 0.75137568, "num_input_tokens_seen": 64107360, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0234375, "step": 2967, "time_per_iteration": 4.103399991989136 }, { "auxiliary_loss_clip": 0.01171098, "auxiliary_loss_mlp": 0.01045431, "balance_loss_clip": 1.02690601, "balance_loss_mlp": 1.05470514, "epoch": 0.17844581391853298, "flos": 18727271072640.0, "grad_norm": 1.8828224681632901, "language_loss": 0.77009821, "learning_rate": 3.6941802612208334e-06, "loss": 0.79226351, "num_input_tokens_seen": 64124690, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.98046875, "step": 2968, "time_per_iteration": 5.533022880554199 }, { "auxiliary_loss_clip": 0.01200241, "auxiliary_loss_mlp": 0.01046797, "balance_loss_clip": 1.02854562, "balance_loss_mlp": 1.05512738, "epoch": 0.17850593717120097, "flos": 27378259286400.0, "grad_norm": 2.008482081943266, "language_loss": 0.76012981, "learning_rate": 3.6939794375970667e-06, "loss": 0.78260016, "num_input_tokens_seen": 64146315, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.0, "step": 2969, "time_per_iteration": 2.7114756107330322 }, { "auxiliary_loss_clip": 0.01078682, "auxiliary_loss_mlp": 0.01002678, "balance_loss_clip": 0.99955505, "balance_loss_mlp": 1.02410984, "epoch": 0.17856606042386894, "flos": 66996025084800.0, "grad_norm": 0.8402397255065952, "language_loss": 0.68987083, "learning_rate": 3.693778553519531e-06, "loss": 0.71068448, "num_input_tokens_seen": 64210875, "router_z_loss_clip": 0.03125, "router_z_loss_mlp": 0.36328125, "step": 2970, "time_per_iteration": 3.2023558616638184 }, { "auxiliary_loss_clip": 0.01175248, "auxiliary_loss_mlp": 0.01040648, "balance_loss_clip": 1.02170551, "balance_loss_mlp": 1.05274487, "epoch": 0.1786261836765369, "flos": 36977001805440.0, "grad_norm": 2.4352773264735506, "language_loss": 0.67399848, "learning_rate": 3.6935776089953956e-06, "loss": 0.69615746, "num_input_tokens_seen": 64230740, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.046875, "step": 2971, "time_per_iteration": 2.695009231567383 }, { "auxiliary_loss_clip": 0.01190339, "auxiliary_loss_mlp": 0.01046456, "balance_loss_clip": 1.02690601, "balance_loss_mlp": 1.05148566, "epoch": 0.17868630692920487, "flos": 24825441744000.0, "grad_norm": 1.8654477225230792, "language_loss": 0.89510947, "learning_rate": 3.6933766040318323e-06, "loss": 0.91747737, "num_input_tokens_seen": 64252300, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0234375, "step": 2972, "time_per_iteration": 4.275820970535278 }, { "auxiliary_loss_clip": 0.01181586, "auxiliary_loss_mlp": 0.01055656, "balance_loss_clip": 1.03692818, "balance_loss_mlp": 1.05274415, "epoch": 0.17874643018187283, "flos": 16981977139200.0, "grad_norm": 2.303680120020041, "language_loss": 0.88084984, "learning_rate": 3.693175538636014e-06, "loss": 0.90322226, "num_input_tokens_seen": 64270105, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.015625, "step": 2973, "time_per_iteration": 2.666769504547119 }, { "auxiliary_loss_clip": 0.01153563, "auxiliary_loss_mlp": 0.0104702, "balance_loss_clip": 1.02764821, "balance_loss_mlp": 1.05258012, "epoch": 0.1788065534345408, "flos": 21032233793280.0, "grad_norm": 2.211727593302289, "language_loss": 0.76198238, "learning_rate": 3.692974412815116e-06, "loss": 0.78398824, "num_input_tokens_seen": 64287250, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0078125, "step": 2974, "time_per_iteration": 2.602708578109741 }, { "auxiliary_loss_clip": 0.01174086, "auxiliary_loss_mlp": 0.01049538, "balance_loss_clip": 1.02982104, "balance_loss_mlp": 1.05393386, "epoch": 0.17886667668720876, "flos": 23987717775360.0, "grad_norm": 3.4823220150446006, "language_loss": 0.74684441, "learning_rate": 3.692773226576315e-06, "loss": 0.76908064, "num_input_tokens_seen": 64307140, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0234375, "step": 2975, "time_per_iteration": 2.6414031982421875 }, { "auxiliary_loss_clip": 0.01161651, "auxiliary_loss_mlp": 0.01047819, "balance_loss_clip": 1.0291512, "balance_loss_mlp": 1.05266023, "epoch": 0.17892679993987676, "flos": 25739476156800.0, "grad_norm": 1.704171091953757, "language_loss": 0.73007011, "learning_rate": 3.692571979926793e-06, "loss": 0.75216484, "num_input_tokens_seen": 64328760, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.99609375, "step": 2976, "time_per_iteration": 2.6161041259765625 }, { "auxiliary_loss_clip": 0.01157475, "auxiliary_loss_mlp": 0.01044319, "balance_loss_clip": 1.02886367, "balance_loss_mlp": 1.05294549, "epoch": 0.17898692319254472, "flos": 25699686865920.0, "grad_norm": 1.647798104785232, "language_loss": 0.77112836, "learning_rate": 3.69237067287373e-06, "loss": 0.79314631, "num_input_tokens_seen": 64348800, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.953125, "step": 2977, "time_per_iteration": 2.856630563735962 }, { "auxiliary_loss_clip": 0.01182124, "auxiliary_loss_mlp": 0.01048369, "balance_loss_clip": 1.0308094, "balance_loss_mlp": 1.05579615, "epoch": 0.1790470464452127, "flos": 19317786664320.0, "grad_norm": 2.068430103153036, "language_loss": 0.79602033, "learning_rate": 3.6921693054243118e-06, "loss": 0.81832522, "num_input_tokens_seen": 64367955, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9921875, "step": 2978, "time_per_iteration": 2.5903825759887695 }, { "auxiliary_loss_clip": 0.01174064, "auxiliary_loss_mlp": 0.01044518, "balance_loss_clip": 1.02574253, "balance_loss_mlp": 1.05260897, "epoch": 0.17910716969788065, "flos": 30044267562240.0, "grad_norm": 1.4578332235466822, "language_loss": 0.76199979, "learning_rate": 3.6919678775857235e-06, "loss": 0.78418565, "num_input_tokens_seen": 64389805, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.03125, "step": 2979, "time_per_iteration": 2.747772455215454 }, { "auxiliary_loss_clip": 0.01178558, "auxiliary_loss_mlp": 0.01042715, "balance_loss_clip": 1.02477455, "balance_loss_mlp": 1.05232215, "epoch": 0.17916729295054862, "flos": 19427709260160.0, "grad_norm": 2.1122758897709852, "language_loss": 0.69138038, "learning_rate": 3.691766389365154e-06, "loss": 0.71359313, "num_input_tokens_seen": 64408220, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9921875, "step": 2980, "time_per_iteration": 2.5993762016296387 }, { "auxiliary_loss_clip": 0.011641, "auxiliary_loss_mlp": 0.01044885, "balance_loss_clip": 1.02446413, "balance_loss_mlp": 1.05441248, "epoch": 0.17922741620321658, "flos": 14611549881600.0, "grad_norm": 1.8420406354146452, "language_loss": 0.70603859, "learning_rate": 3.6915648407697936e-06, "loss": 0.72812843, "num_input_tokens_seen": 64426380, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.0078125, "step": 2981, "time_per_iteration": 2.712258815765381 }, { "auxiliary_loss_clip": 0.01175121, "auxiliary_loss_mlp": 0.01060448, "balance_loss_clip": 1.04101682, "balance_loss_mlp": 1.05404675, "epoch": 0.17928753945588458, "flos": 17165301177600.0, "grad_norm": 2.0937215679252943, "language_loss": 0.81728798, "learning_rate": 3.691363231806836e-06, "loss": 0.83964366, "num_input_tokens_seen": 64444355, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.03125, "step": 2982, "time_per_iteration": 2.577587127685547 }, { "auxiliary_loss_clip": 0.01152047, "auxiliary_loss_mlp": 0.01044587, "balance_loss_clip": 1.02630019, "balance_loss_mlp": 1.05194712, "epoch": 0.17934766270855254, "flos": 31395622060800.0, "grad_norm": 1.5833897324078376, "language_loss": 0.826491, "learning_rate": 3.691161562483474e-06, "loss": 0.84845734, "num_input_tokens_seen": 64467800, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.0, "step": 2983, "time_per_iteration": 2.6309683322906494 }, { "auxiliary_loss_clip": 0.01163784, "auxiliary_loss_mlp": 0.0105176, "balance_loss_clip": 1.03231704, "balance_loss_mlp": 1.04988503, "epoch": 0.1794077859612205, "flos": 20814184281600.0, "grad_norm": 1.6881667098724706, "language_loss": 0.85358667, "learning_rate": 3.690959832806907e-06, "loss": 0.87574208, "num_input_tokens_seen": 64487230, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.046875, "step": 2984, "time_per_iteration": 2.611625909805298 }, { "auxiliary_loss_clip": 0.01180782, "auxiliary_loss_mlp": 0.01046688, "balance_loss_clip": 1.02718508, "balance_loss_mlp": 1.05099511, "epoch": 0.17946790921388847, "flos": 28986447006720.0, "grad_norm": 1.3466724502011067, "language_loss": 0.8914479, "learning_rate": 3.690758042784333e-06, "loss": 0.91372263, "num_input_tokens_seen": 64509165, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.03125, "step": 2985, "time_per_iteration": 2.7144675254821777 }, { "auxiliary_loss_clip": 0.01149318, "auxiliary_loss_mlp": 0.01041635, "balance_loss_clip": 1.02491021, "balance_loss_mlp": 1.05179274, "epoch": 0.17952803246655644, "flos": 20737406960640.0, "grad_norm": 1.9243867248940223, "language_loss": 0.69713831, "learning_rate": 3.690556192422954e-06, "loss": 0.71904778, "num_input_tokens_seen": 64527940, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9765625, "step": 2986, "time_per_iteration": 2.5575921535491943 }, { "auxiliary_loss_clip": 0.01151599, "auxiliary_loss_mlp": 0.01293191, "balance_loss_clip": 1.026088, "balance_loss_mlp": 1.05179429, "epoch": 0.1795881557192244, "flos": 28255988027520.0, "grad_norm": 2.4048149700022075, "language_loss": 0.77677286, "learning_rate": 3.6903542817299725e-06, "loss": 0.80122077, "num_input_tokens_seen": 64545230, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.99609375, "step": 2987, "time_per_iteration": 2.692772388458252 }, { "auxiliary_loss_clip": 0.01154513, "auxiliary_loss_mlp": 0.01045285, "balance_loss_clip": 1.02534091, "balance_loss_mlp": 1.05101132, "epoch": 0.17964827897189237, "flos": 18552027594240.0, "grad_norm": 1.7592745949260697, "language_loss": 0.77523375, "learning_rate": 3.690152310712595e-06, "loss": 0.79723167, "num_input_tokens_seen": 64563820, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0390625, "step": 2988, "time_per_iteration": 2.5199484825134277 }, { "auxiliary_loss_clip": 0.01097095, "auxiliary_loss_mlp": 0.01254065, "balance_loss_clip": 1.00352001, "balance_loss_mlp": 1.02486598, "epoch": 0.17970840222456036, "flos": 58165088711040.0, "grad_norm": 0.766225012536995, "language_loss": 0.62777144, "learning_rate": 3.6899502793780295e-06, "loss": 0.65128303, "num_input_tokens_seen": 64621315, "router_z_loss_clip": 0.02709961, "router_z_loss_mlp": 0.359375, "step": 2989, "time_per_iteration": 3.120269536972046 }, { "auxiliary_loss_clip": 0.01173332, "auxiliary_loss_mlp": 0.01036532, "balance_loss_clip": 1.01831639, "balance_loss_mlp": 1.05375922, "epoch": 0.17976852547722832, "flos": 20300805146880.0, "grad_norm": 2.584293780346194, "language_loss": 0.70583171, "learning_rate": 3.689748187733485e-06, "loss": 0.72793031, "num_input_tokens_seen": 64639885, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.015625, "step": 2990, "time_per_iteration": 2.6434485912323 }, { "auxiliary_loss_clip": 0.01172519, "auxiliary_loss_mlp": 0.01043142, "balance_loss_clip": 1.02492714, "balance_loss_mlp": 1.05047071, "epoch": 0.1798286487298963, "flos": 39669367685760.0, "grad_norm": 2.042966656137986, "language_loss": 0.68512177, "learning_rate": 3.6895460357861743e-06, "loss": 0.70727837, "num_input_tokens_seen": 64661220, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0390625, "step": 2991, "time_per_iteration": 2.6769986152648926 }, { "auxiliary_loss_clip": 0.01189959, "auxiliary_loss_mlp": 0.01042442, "balance_loss_clip": 1.02375007, "balance_loss_mlp": 1.05239487, "epoch": 0.17988877198256426, "flos": 25520313323520.0, "grad_norm": 1.7762481095914628, "language_loss": 0.82896435, "learning_rate": 3.6893438235433117e-06, "loss": 0.85128832, "num_input_tokens_seen": 64682530, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.015625, "step": 2992, "time_per_iteration": 2.6148693561553955 }, { "auxiliary_loss_clip": 0.01155474, "auxiliary_loss_mlp": 0.01041374, "balance_loss_clip": 1.02389789, "balance_loss_mlp": 1.04892802, "epoch": 0.17994889523523222, "flos": 18807496099200.0, "grad_norm": 2.1402406879756444, "language_loss": 0.82184017, "learning_rate": 3.689141551012114e-06, "loss": 0.84380865, "num_input_tokens_seen": 64701025, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9765625, "step": 2993, "time_per_iteration": 2.521120548248291 }, { "auxiliary_loss_clip": 0.01158771, "auxiliary_loss_mlp": 0.01046128, "balance_loss_clip": 1.02769852, "balance_loss_mlp": 1.0488807, "epoch": 0.18000901848790019, "flos": 21104450087040.0, "grad_norm": 1.771878519297794, "language_loss": 0.78292024, "learning_rate": 3.688939218199799e-06, "loss": 0.80496919, "num_input_tokens_seen": 64719570, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0078125, "step": 2994, "time_per_iteration": 2.567861318588257 }, { "auxiliary_loss_clip": 0.01153987, "auxiliary_loss_mlp": 0.01046212, "balance_loss_clip": 1.02881932, "balance_loss_mlp": 1.05261135, "epoch": 0.18006914174056818, "flos": 19646441130240.0, "grad_norm": 2.3324658343260176, "language_loss": 0.80026805, "learning_rate": 3.6887368251135875e-06, "loss": 0.82227004, "num_input_tokens_seen": 64738110, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.015625, "step": 2995, "time_per_iteration": 2.5567257404327393 }, { "auxiliary_loss_clip": 0.01162461, "auxiliary_loss_mlp": 0.01045671, "balance_loss_clip": 1.02817154, "balance_loss_mlp": 1.05236912, "epoch": 0.18012926499323614, "flos": 19499889640320.0, "grad_norm": 1.9897964032176605, "language_loss": 0.84432817, "learning_rate": 3.688534371760703e-06, "loss": 0.86640948, "num_input_tokens_seen": 64756345, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 1.0078125, "step": 2996, "time_per_iteration": 2.69578218460083 }, { "auxiliary_loss_clip": 0.01173851, "auxiliary_loss_mlp": 0.01039548, "balance_loss_clip": 1.02213192, "balance_loss_mlp": 1.05018508, "epoch": 0.1801893882459041, "flos": 19464553635840.0, "grad_norm": 1.7636040902183563, "language_loss": 0.88421518, "learning_rate": 3.68833185814837e-06, "loss": 0.90634918, "num_input_tokens_seen": 64776375, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.96484375, "step": 2997, "time_per_iteration": 2.593439817428589 }, { "auxiliary_loss_clip": 0.0117216, "auxiliary_loss_mlp": 0.01048016, "balance_loss_clip": 1.02856135, "balance_loss_mlp": 1.04939234, "epoch": 0.18024951149857207, "flos": 26870590414080.0, "grad_norm": 1.9362966450235373, "language_loss": 0.85000437, "learning_rate": 3.688129284283816e-06, "loss": 0.87220615, "num_input_tokens_seen": 64796210, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0546875, "step": 2998, "time_per_iteration": 2.6309549808502197 }, { "auxiliary_loss_clip": 0.01153128, "auxiliary_loss_mlp": 0.01045006, "balance_loss_clip": 1.02723193, "balance_loss_mlp": 1.0544771, "epoch": 0.18030963475124004, "flos": 30226621933440.0, "grad_norm": 1.7578838846932823, "language_loss": 0.84088778, "learning_rate": 3.6879266501742705e-06, "loss": 0.86286914, "num_input_tokens_seen": 64818590, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.984375, "step": 2999, "time_per_iteration": 2.6370227336883545 }, { "auxiliary_loss_clip": 0.0116049, "auxiliary_loss_mlp": 0.01046118, "balance_loss_clip": 1.02690136, "balance_loss_mlp": 1.05085444, "epoch": 0.180369758003908, "flos": 22307493329280.0, "grad_norm": 1.759992543778092, "language_loss": 0.75050104, "learning_rate": 3.6877239558269642e-06, "loss": 0.77256715, "num_input_tokens_seen": 64838350, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0078125, "step": 3000, "time_per_iteration": 2.560246229171753 }, { "auxiliary_loss_clip": 0.01166921, "auxiliary_loss_mlp": 0.01304877, "balance_loss_clip": 1.03793716, "balance_loss_mlp": 1.05041742, "epoch": 0.18042988125657597, "flos": 23732033788800.0, "grad_norm": 1.7215321746940673, "language_loss": 0.70732939, "learning_rate": 3.687521201249132e-06, "loss": 0.73204732, "num_input_tokens_seen": 64858065, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.984375, "step": 3001, "time_per_iteration": 2.6131553649902344 }, { "auxiliary_loss_clip": 0.01162658, "auxiliary_loss_mlp": 0.01051044, "balance_loss_clip": 1.03210139, "balance_loss_mlp": 1.05076253, "epoch": 0.18049000450924396, "flos": 24093582134400.0, "grad_norm": 1.90649804982052, "language_loss": 0.88041288, "learning_rate": 3.687318386448008e-06, "loss": 0.90254986, "num_input_tokens_seen": 64877305, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.03125, "step": 3002, "time_per_iteration": 2.6077535152435303 }, { "auxiliary_loss_clip": 0.01150232, "auxiliary_loss_mlp": 0.01046468, "balance_loss_clip": 1.02869368, "balance_loss_mlp": 1.05012178, "epoch": 0.18055012776191193, "flos": 22163168482560.0, "grad_norm": 1.7070613557353123, "language_loss": 0.80260563, "learning_rate": 3.687115511430832e-06, "loss": 0.82457256, "num_input_tokens_seen": 64896955, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.0, "step": 3003, "time_per_iteration": 2.6499452590942383 }, { "auxiliary_loss_clip": 0.01152068, "auxiliary_loss_mlp": 0.01041286, "balance_loss_clip": 1.02333331, "balance_loss_mlp": 1.04900968, "epoch": 0.1806102510145799, "flos": 28913512440960.0, "grad_norm": 1.9304589744990013, "language_loss": 0.66967189, "learning_rate": 3.6869125762048423e-06, "loss": 0.69160539, "num_input_tokens_seen": 64917080, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.03125, "step": 3004, "time_per_iteration": 2.6992669105529785 }, { "auxiliary_loss_clip": 0.01162355, "auxiliary_loss_mlp": 0.01047871, "balance_loss_clip": 1.02905989, "balance_loss_mlp": 1.05223417, "epoch": 0.18067037426724786, "flos": 19025689265280.0, "grad_norm": 1.856586681287515, "language_loss": 0.85232669, "learning_rate": 3.6867095807772826e-06, "loss": 0.87442893, "num_input_tokens_seen": 64935215, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0078125, "step": 3005, "time_per_iteration": 2.6340255737304688 }, { "auxiliary_loss_clip": 0.0115612, "auxiliary_loss_mlp": 0.01043484, "balance_loss_clip": 1.02702081, "balance_loss_mlp": 1.04931819, "epoch": 0.18073049751991582, "flos": 27453635976960.0, "grad_norm": 1.4815267893934367, "language_loss": 0.82938361, "learning_rate": 3.6865065251553967e-06, "loss": 0.85137963, "num_input_tokens_seen": 64956275, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9765625, "step": 3006, "time_per_iteration": 2.7580032348632812 }, { "auxiliary_loss_clip": 0.01166679, "auxiliary_loss_mlp": 0.01052534, "balance_loss_clip": 1.03398466, "balance_loss_mlp": 1.04863548, "epoch": 0.1807906207725838, "flos": 28729039167360.0, "grad_norm": 1.576744854440682, "language_loss": 0.77290457, "learning_rate": 3.6863034093464307e-06, "loss": 0.79509676, "num_input_tokens_seen": 64979390, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.99609375, "step": 3007, "time_per_iteration": 2.7953896522521973 }, { "auxiliary_loss_clip": 0.01084048, "auxiliary_loss_mlp": 0.01021398, "balance_loss_clip": 1.01873994, "balance_loss_mlp": 1.02997255, "epoch": 0.18085074402525175, "flos": 64466515468800.0, "grad_norm": 0.8026682709900244, "language_loss": 0.56963933, "learning_rate": 3.686100233357634e-06, "loss": 0.59069377, "num_input_tokens_seen": 65043135, "router_z_loss_clip": 0.02661133, "router_z_loss_mlp": 0.36132812, "step": 3008, "time_per_iteration": 3.406949281692505 }, { "auxiliary_loss_clip": 0.01170352, "auxiliary_loss_mlp": 0.01052423, "balance_loss_clip": 1.03389788, "balance_loss_mlp": 1.05220425, "epoch": 0.18091086727791975, "flos": 23476960333440.0, "grad_norm": 1.9170326801986777, "language_loss": 0.67188644, "learning_rate": 3.6858969971962573e-06, "loss": 0.69411421, "num_input_tokens_seen": 65062845, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0, "step": 3009, "time_per_iteration": 7.000864267349243 }, { "auxiliary_loss_clip": 0.01154158, "auxiliary_loss_mlp": 0.01044009, "balance_loss_clip": 1.02662802, "balance_loss_mlp": 1.05220652, "epoch": 0.1809709905305877, "flos": 24170467196160.0, "grad_norm": 2.336187326999141, "language_loss": 0.75836027, "learning_rate": 3.685693700869553e-06, "loss": 0.78034198, "num_input_tokens_seen": 65082110, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.015625, "step": 3010, "time_per_iteration": 2.5829899311065674 }, { "auxiliary_loss_clip": 0.01162885, "auxiliary_loss_mlp": 0.01040057, "balance_loss_clip": 1.02357006, "balance_loss_mlp": 1.04934263, "epoch": 0.18103111378325568, "flos": 21650902669440.0, "grad_norm": 1.7109325724284532, "language_loss": 0.67154193, "learning_rate": 3.6854903443847772e-06, "loss": 0.69357133, "num_input_tokens_seen": 65101985, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.95703125, "step": 3011, "time_per_iteration": 2.701667070388794 }, { "auxiliary_loss_clip": 0.01171701, "auxiliary_loss_mlp": 0.01292505, "balance_loss_clip": 1.02762246, "balance_loss_mlp": 1.04816341, "epoch": 0.18109123703592364, "flos": 53686918356480.0, "grad_norm": 1.5593563096423457, "language_loss": 0.71431327, "learning_rate": 3.6852869277491865e-06, "loss": 0.73895532, "num_input_tokens_seen": 65129295, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.96484375, "step": 3012, "time_per_iteration": 2.889547109603882 }, { "auxiliary_loss_clip": 0.01166061, "auxiliary_loss_mlp": 0.01047609, "balance_loss_clip": 1.03010881, "balance_loss_mlp": 1.05180514, "epoch": 0.1811513602885916, "flos": 35845564325760.0, "grad_norm": 5.290697406675095, "language_loss": 0.62180662, "learning_rate": 3.68508345097004e-06, "loss": 0.64394331, "num_input_tokens_seen": 65150625, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9609375, "step": 3013, "time_per_iteration": 2.6667895317077637 }, { "auxiliary_loss_clip": 0.01169723, "auxiliary_loss_mlp": 0.0105359, "balance_loss_clip": 1.03704393, "balance_loss_mlp": 1.05096388, "epoch": 0.18121148354125957, "flos": 23732572492800.0, "grad_norm": 1.6323552641006291, "language_loss": 0.76002675, "learning_rate": 3.6848799140546e-06, "loss": 0.78225988, "num_input_tokens_seen": 65170880, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 1.0078125, "step": 3014, "time_per_iteration": 4.0034966468811035 }, { "auxiliary_loss_clip": 0.01155625, "auxiliary_loss_mlp": 0.01043279, "balance_loss_clip": 1.02360892, "balance_loss_mlp": 1.05265796, "epoch": 0.18127160679392756, "flos": 28728320895360.0, "grad_norm": 1.5977464665171406, "language_loss": 0.66052115, "learning_rate": 3.6846763170101297e-06, "loss": 0.68251014, "num_input_tokens_seen": 65192530, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.03125, "step": 3015, "time_per_iteration": 2.60439133644104 }, { "auxiliary_loss_clip": 0.01167037, "auxiliary_loss_mlp": 0.01288515, "balance_loss_clip": 1.02300882, "balance_loss_mlp": 1.05155003, "epoch": 0.18133173004659553, "flos": 20485062938880.0, "grad_norm": 2.662616853845424, "language_loss": 0.77942067, "learning_rate": 3.684472659843895e-06, "loss": 0.80397618, "num_input_tokens_seen": 65211675, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.97265625, "step": 3016, "time_per_iteration": 2.6349470615386963 }, { "auxiliary_loss_clip": 0.01165934, "auxiliary_loss_mlp": 0.0104471, "balance_loss_clip": 1.02629244, "balance_loss_mlp": 1.05462837, "epoch": 0.1813918532992635, "flos": 22852078404480.0, "grad_norm": 1.7842169144447164, "language_loss": 0.83679795, "learning_rate": 3.6842689425631645e-06, "loss": 0.85890448, "num_input_tokens_seen": 65231185, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0234375, "step": 3017, "time_per_iteration": 2.593505620956421 }, { "auxiliary_loss_clip": 0.01156901, "auxiliary_loss_mlp": 0.01035052, "balance_loss_clip": 1.01817155, "balance_loss_mlp": 1.05104613, "epoch": 0.18145197655193146, "flos": 36065122208640.0, "grad_norm": 2.0541067614515227, "language_loss": 0.67453647, "learning_rate": 3.684065165175208e-06, "loss": 0.69645602, "num_input_tokens_seen": 65251645, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.96875, "step": 3018, "time_per_iteration": 2.7361114025115967 }, { "auxiliary_loss_clip": 0.01160259, "auxiliary_loss_mlp": 0.01284027, "balance_loss_clip": 1.01840615, "balance_loss_mlp": 1.05090404, "epoch": 0.18151209980459942, "flos": 24023951619840.0, "grad_norm": 2.1720393245579337, "language_loss": 0.75618422, "learning_rate": 3.683861327687297e-06, "loss": 0.78062713, "num_input_tokens_seen": 65271125, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.0078125, "step": 3019, "time_per_iteration": 2.5680294036865234 }, { "auxiliary_loss_clip": 0.01179983, "auxiliary_loss_mlp": 0.01041259, "balance_loss_clip": 1.02257884, "balance_loss_mlp": 1.05210721, "epoch": 0.1815722230572674, "flos": 23951627585280.0, "grad_norm": 2.140052290075239, "language_loss": 0.81742823, "learning_rate": 3.683657430106707e-06, "loss": 0.83964062, "num_input_tokens_seen": 65290600, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0, "step": 3020, "time_per_iteration": 2.6699867248535156 }, { "auxiliary_loss_clip": 0.01170858, "auxiliary_loss_mlp": 0.01041322, "balance_loss_clip": 1.02254617, "balance_loss_mlp": 1.05290031, "epoch": 0.18163234630993536, "flos": 24386469632640.0, "grad_norm": 1.9432179741261002, "language_loss": 0.77327132, "learning_rate": 3.683453472440714e-06, "loss": 0.79539311, "num_input_tokens_seen": 65311040, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0, "step": 3021, "time_per_iteration": 2.514082908630371 }, { "auxiliary_loss_clip": 0.01154047, "auxiliary_loss_mlp": 0.01040621, "balance_loss_clip": 1.02300167, "balance_loss_mlp": 1.04628706, "epoch": 0.18169246956260335, "flos": 24681332378880.0, "grad_norm": 1.9446395952621376, "language_loss": 0.84609646, "learning_rate": 3.6832494546965975e-06, "loss": 0.86804312, "num_input_tokens_seen": 65332115, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9921875, "step": 3022, "time_per_iteration": 2.5672428607940674 }, { "auxiliary_loss_clip": 0.01179888, "auxiliary_loss_mlp": 0.01044546, "balance_loss_clip": 1.02471006, "balance_loss_mlp": 1.04999518, "epoch": 0.1817525928152713, "flos": 24243294021120.0, "grad_norm": 1.9580842674227987, "language_loss": 0.68772501, "learning_rate": 3.6830453768816376e-06, "loss": 0.70996934, "num_input_tokens_seen": 65352210, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.03125, "step": 3023, "time_per_iteration": 2.5621120929718018 }, { "auxiliary_loss_clip": 0.0116635, "auxiliary_loss_mlp": 0.01041064, "balance_loss_clip": 1.02365971, "balance_loss_mlp": 1.0497508, "epoch": 0.18181271606793928, "flos": 16472081623680.0, "grad_norm": 1.7661195103067644, "language_loss": 0.73908305, "learning_rate": 3.6828412390031174e-06, "loss": 0.76115716, "num_input_tokens_seen": 65370600, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.984375, "step": 3024, "time_per_iteration": 2.579228401184082 }, { "auxiliary_loss_clip": 0.0115871, "auxiliary_loss_mlp": 0.01044127, "balance_loss_clip": 1.02550614, "balance_loss_mlp": 1.04960632, "epoch": 0.18187283932060724, "flos": 18581042805120.0, "grad_norm": 2.215274231809193, "language_loss": 0.88072836, "learning_rate": 3.682637041068322e-06, "loss": 0.90275681, "num_input_tokens_seen": 65387270, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0, "step": 3025, "time_per_iteration": 2.4987454414367676 }, { "auxiliary_loss_clip": 0.01166689, "auxiliary_loss_mlp": 0.01046856, "balance_loss_clip": 1.02835453, "balance_loss_mlp": 1.05061221, "epoch": 0.1819329625732752, "flos": 20266833859200.0, "grad_norm": 1.7826090517402324, "language_loss": 0.78741312, "learning_rate": 3.6824327830845387e-06, "loss": 0.80954862, "num_input_tokens_seen": 65406550, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.98046875, "step": 3026, "time_per_iteration": 2.6084768772125244 }, { "auxiliary_loss_clip": 0.01170034, "auxiliary_loss_mlp": 0.01052817, "balance_loss_clip": 1.03445864, "balance_loss_mlp": 1.05065084, "epoch": 0.18199308582594317, "flos": 25915186512000.0, "grad_norm": 2.084260310336886, "language_loss": 0.75786126, "learning_rate": 3.6822284650590576e-06, "loss": 0.7800898, "num_input_tokens_seen": 65425955, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.015625, "step": 3027, "time_per_iteration": 2.596889019012451 }, { "auxiliary_loss_clip": 0.0115337, "auxiliary_loss_mlp": 0.01048208, "balance_loss_clip": 1.02884829, "balance_loss_mlp": 1.0506134, "epoch": 0.18205320907861114, "flos": 15377524433280.0, "grad_norm": 1.868769619529375, "language_loss": 0.85716259, "learning_rate": 3.68202408699917e-06, "loss": 0.8791784, "num_input_tokens_seen": 65442820, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.03125, "step": 3028, "time_per_iteration": 2.5573174953460693 }, { "auxiliary_loss_clip": 0.01148222, "auxiliary_loss_mlp": 0.01043238, "balance_loss_clip": 1.02532089, "balance_loss_mlp": 1.05077481, "epoch": 0.18211333233127913, "flos": 25624310175360.0, "grad_norm": 2.31972230196858, "language_loss": 0.82799733, "learning_rate": 3.6818196489121683e-06, "loss": 0.84991193, "num_input_tokens_seen": 65461825, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9765625, "step": 3029, "time_per_iteration": 2.5927345752716064 }, { "auxiliary_loss_clip": 0.01178294, "auxiliary_loss_mlp": 0.0104341, "balance_loss_clip": 1.02462232, "balance_loss_mlp": 1.05072546, "epoch": 0.1821734555839471, "flos": 14976007228800.0, "grad_norm": 2.8055334496837, "language_loss": 0.77799046, "learning_rate": 3.68161515080535e-06, "loss": 0.8002075, "num_input_tokens_seen": 65479480, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0, "step": 3030, "time_per_iteration": 2.5946600437164307 }, { "auxiliary_loss_clip": 0.01161436, "auxiliary_loss_mlp": 0.01044642, "balance_loss_clip": 1.02592587, "balance_loss_mlp": 1.05033159, "epoch": 0.18223357883661506, "flos": 20194007034240.0, "grad_norm": 1.9401864601192476, "language_loss": 0.84926474, "learning_rate": 3.681410592686013e-06, "loss": 0.87132549, "num_input_tokens_seen": 65497775, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.015625, "step": 3031, "time_per_iteration": 2.57261061668396 }, { "auxiliary_loss_clip": 0.01177215, "auxiliary_loss_mlp": 0.01295715, "balance_loss_clip": 1.02965498, "balance_loss_mlp": 1.04997897, "epoch": 0.18229370208928303, "flos": 15231978524160.0, "grad_norm": 2.125284825792004, "language_loss": 0.79756725, "learning_rate": 3.681205974561457e-06, "loss": 0.8222965, "num_input_tokens_seen": 65516505, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0, "step": 3032, "time_per_iteration": 2.579852342605591 }, { "auxiliary_loss_clip": 0.0115654, "auxiliary_loss_mlp": 0.01057098, "balance_loss_clip": 1.03851318, "balance_loss_mlp": 1.05358326, "epoch": 0.182353825341951, "flos": 23840483927040.0, "grad_norm": 2.1380378665046393, "language_loss": 0.81227708, "learning_rate": 3.6810012964389846e-06, "loss": 0.83441341, "num_input_tokens_seen": 65536160, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.03125, "step": 3033, "time_per_iteration": 2.6122817993164062 }, { "auxiliary_loss_clip": 0.01103602, "auxiliary_loss_mlp": 0.01013474, "balance_loss_clip": 1.01079166, "balance_loss_mlp": 1.02386141, "epoch": 0.18241394859461896, "flos": 61190957393280.0, "grad_norm": 0.8913944625562246, "language_loss": 0.63532686, "learning_rate": 3.680796558325899e-06, "loss": 0.6564976, "num_input_tokens_seen": 65589375, "router_z_loss_clip": 0.02685547, "router_z_loss_mlp": 0.34375, "step": 3034, "time_per_iteration": 3.1476874351501465 }, { "auxiliary_loss_clip": 0.01177038, "auxiliary_loss_mlp": 0.0104575, "balance_loss_clip": 1.02801192, "balance_loss_mlp": 1.05060053, "epoch": 0.18247407184728695, "flos": 18471694826880.0, "grad_norm": 1.9494227491780445, "language_loss": 0.78997219, "learning_rate": 3.6805917602295084e-06, "loss": 0.81220007, "num_input_tokens_seen": 65606720, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9921875, "step": 3035, "time_per_iteration": 2.6824347972869873 }, { "auxiliary_loss_clip": 0.01164528, "auxiliary_loss_mlp": 0.01044218, "balance_loss_clip": 1.02689707, "balance_loss_mlp": 1.05034029, "epoch": 0.18253419509995492, "flos": 21795191602560.0, "grad_norm": 1.8556959856050672, "language_loss": 0.84286153, "learning_rate": 3.680386902157121e-06, "loss": 0.86494905, "num_input_tokens_seen": 65625495, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9609375, "step": 3036, "time_per_iteration": 2.6028857231140137 }, { "auxiliary_loss_clip": 0.0115529, "auxiliary_loss_mlp": 0.01041552, "balance_loss_clip": 1.02481496, "balance_loss_mlp": 1.05146599, "epoch": 0.18259431835262288, "flos": 20149764456960.0, "grad_norm": 2.057681164809509, "language_loss": 0.80269492, "learning_rate": 3.680181984116047e-06, "loss": 0.82466334, "num_input_tokens_seen": 65643515, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.94921875, "step": 3037, "time_per_iteration": 2.6094822883605957 }, { "auxiliary_loss_clip": 0.01168752, "auxiliary_loss_mlp": 0.012972, "balance_loss_clip": 1.02945566, "balance_loss_mlp": 1.05190146, "epoch": 0.18265444160529085, "flos": 16981653916800.0, "grad_norm": 3.9282963646409854, "language_loss": 0.79073358, "learning_rate": 3.6799770061136e-06, "loss": 0.81539315, "num_input_tokens_seen": 65658155, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.98828125, "step": 3038, "time_per_iteration": 2.5788280963897705 }, { "auxiliary_loss_clip": 0.01163461, "auxiliary_loss_mlp": 0.01044449, "balance_loss_clip": 1.02797401, "balance_loss_mlp": 1.04841483, "epoch": 0.1827145648579588, "flos": 34423250509440.0, "grad_norm": 1.8874085458074312, "language_loss": 0.67268002, "learning_rate": 3.6797719681570953e-06, "loss": 0.69475907, "num_input_tokens_seen": 65679310, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.96875, "step": 3039, "time_per_iteration": 2.852027177810669 }, { "auxiliary_loss_clip": 0.01168832, "auxiliary_loss_mlp": 0.01055097, "balance_loss_clip": 1.03694153, "balance_loss_mlp": 1.05151081, "epoch": 0.18277468811062678, "flos": 53287017264000.0, "grad_norm": 1.6513136588452495, "language_loss": 0.73306978, "learning_rate": 3.6795668702538505e-06, "loss": 0.75530905, "num_input_tokens_seen": 65705235, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9921875, "step": 3040, "time_per_iteration": 2.885021686553955 }, { "auxiliary_loss_clip": 0.01156787, "auxiliary_loss_mlp": 0.01046342, "balance_loss_clip": 1.02785265, "balance_loss_mlp": 1.04964828, "epoch": 0.18283481136329474, "flos": 31650659602560.0, "grad_norm": 1.8879640317380135, "language_loss": 0.60504353, "learning_rate": 3.6793617124111836e-06, "loss": 0.62707484, "num_input_tokens_seen": 65727575, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.98046875, "step": 3041, "time_per_iteration": 2.6817445755004883 }, { "auxiliary_loss_clip": 0.01174856, "auxiliary_loss_mlp": 0.01056734, "balance_loss_clip": 1.03875721, "balance_loss_mlp": 1.0495373, "epoch": 0.18289493461596273, "flos": 53137664513280.0, "grad_norm": 1.5518924438237347, "language_loss": 0.59841621, "learning_rate": 3.6791564946364176e-06, "loss": 0.62073207, "num_input_tokens_seen": 65751370, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.984375, "step": 3042, "time_per_iteration": 2.925534725189209 }, { "auxiliary_loss_clip": 0.01166581, "auxiliary_loss_mlp": 0.0104687, "balance_loss_clip": 1.02973962, "balance_loss_mlp": 1.05141914, "epoch": 0.1829550578686307, "flos": 25589369220480.0, "grad_norm": 1.7034753736065575, "language_loss": 0.87203419, "learning_rate": 3.678951216936875e-06, "loss": 0.89416867, "num_input_tokens_seen": 65771040, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.96875, "step": 3043, "time_per_iteration": 2.6220617294311523 }, { "auxiliary_loss_clip": 0.0121324, "auxiliary_loss_mlp": 0.01044948, "balance_loss_clip": 1.02644718, "balance_loss_mlp": 1.05042195, "epoch": 0.18301518112129866, "flos": 22601422321920.0, "grad_norm": 2.317199958216482, "language_loss": 0.70990163, "learning_rate": 3.6787458793198825e-06, "loss": 0.73248351, "num_input_tokens_seen": 65789345, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9921875, "step": 3044, "time_per_iteration": 2.685225486755371 }, { "auxiliary_loss_clip": 0.01188876, "auxiliary_loss_mlp": 0.01048203, "balance_loss_clip": 1.02920079, "balance_loss_mlp": 1.05038798, "epoch": 0.18307530437396663, "flos": 34020799551360.0, "grad_norm": 1.7461290091789632, "language_loss": 0.63754714, "learning_rate": 3.678540481792768e-06, "loss": 0.65991795, "num_input_tokens_seen": 65810990, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0234375, "step": 3045, "time_per_iteration": 2.7407116889953613 }, { "auxiliary_loss_clip": 0.01147732, "auxiliary_loss_mlp": 0.01051335, "balance_loss_clip": 1.03379893, "balance_loss_mlp": 1.05059123, "epoch": 0.1831354276266346, "flos": 21279765392640.0, "grad_norm": 2.3966547769252617, "language_loss": 0.78814054, "learning_rate": 3.6783350243628613e-06, "loss": 0.81013119, "num_input_tokens_seen": 65827230, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.96875, "step": 3046, "time_per_iteration": 2.5159642696380615 }, { "auxiliary_loss_clip": 0.0117526, "auxiliary_loss_mlp": 0.01041905, "balance_loss_clip": 1.02372527, "balance_loss_mlp": 1.04706264, "epoch": 0.18319555087930256, "flos": 21032952065280.0, "grad_norm": 2.201172658601568, "language_loss": 0.78559184, "learning_rate": 3.678129507037495e-06, "loss": 0.80776346, "num_input_tokens_seen": 65845900, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0078125, "step": 3047, "time_per_iteration": 2.5329036712646484 }, { "auxiliary_loss_clip": 0.0115532, "auxiliary_loss_mlp": 0.01039831, "balance_loss_clip": 1.0227598, "balance_loss_mlp": 1.05087662, "epoch": 0.18325567413197055, "flos": 34382958428160.0, "grad_norm": 1.5172304513271204, "language_loss": 0.80587077, "learning_rate": 3.6779239298240032e-06, "loss": 0.82782233, "num_input_tokens_seen": 65868730, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.953125, "step": 3048, "time_per_iteration": 2.6589467525482178 }, { "auxiliary_loss_clip": 0.01158001, "auxiliary_loss_mlp": 0.01037032, "balance_loss_clip": 1.01870918, "balance_loss_mlp": 1.04977751, "epoch": 0.18331579738463852, "flos": 20558464381440.0, "grad_norm": 1.914295340239438, "language_loss": 0.86325049, "learning_rate": 3.6777182927297225e-06, "loss": 0.8852008, "num_input_tokens_seen": 65888420, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9921875, "step": 3049, "time_per_iteration": 2.5967483520507812 }, { "auxiliary_loss_clip": 0.01163985, "auxiliary_loss_mlp": 0.0104058, "balance_loss_clip": 1.02173328, "balance_loss_mlp": 1.04988325, "epoch": 0.18337592063730648, "flos": 19607872901760.0, "grad_norm": 2.9765056713705196, "language_loss": 0.7659508, "learning_rate": 3.6775125957619913e-06, "loss": 0.78799641, "num_input_tokens_seen": 65905840, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.046875, "step": 3050, "time_per_iteration": 4.01301908493042 }, { "auxiliary_loss_clip": 0.01162185, "auxiliary_loss_mlp": 0.01039097, "balance_loss_clip": 1.02171659, "balance_loss_mlp": 1.04617774, "epoch": 0.18343604388997445, "flos": 20850885002880.0, "grad_norm": 2.20674740777576, "language_loss": 0.99180549, "learning_rate": 3.6773068389281507e-06, "loss": 1.01381826, "num_input_tokens_seen": 65922845, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.98046875, "step": 3051, "time_per_iteration": 5.444629430770874 }, { "auxiliary_loss_clip": 0.01163796, "auxiliary_loss_mlp": 0.01037678, "balance_loss_clip": 1.02028561, "balance_loss_mlp": 1.04932117, "epoch": 0.1834961671426424, "flos": 24394370624640.0, "grad_norm": 2.00380399827294, "language_loss": 0.86478639, "learning_rate": 3.6771010222355434e-06, "loss": 0.88680118, "num_input_tokens_seen": 65945555, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9609375, "step": 3052, "time_per_iteration": 2.616398334503174 }, { "auxiliary_loss_clip": 0.01144989, "auxiliary_loss_mlp": 0.01043686, "balance_loss_clip": 1.0262816, "balance_loss_mlp": 1.04813719, "epoch": 0.18355629039531038, "flos": 21251612108160.0, "grad_norm": 2.7063582539203503, "language_loss": 0.73227137, "learning_rate": 3.6768951456915147e-06, "loss": 0.75415814, "num_input_tokens_seen": 65963965, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.96875, "step": 3053, "time_per_iteration": 2.57140851020813 }, { "auxiliary_loss_clip": 0.01150728, "auxiliary_loss_mlp": 0.01044671, "balance_loss_clip": 1.02636051, "balance_loss_mlp": 1.05123174, "epoch": 0.18361641364797834, "flos": 28656499651200.0, "grad_norm": 1.8843828888365821, "language_loss": 0.61866993, "learning_rate": 3.6766892093034123e-06, "loss": 0.64062393, "num_input_tokens_seen": 65985965, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.99609375, "step": 3054, "time_per_iteration": 2.726935625076294 }, { "auxiliary_loss_clip": 0.01183607, "auxiliary_loss_mlp": 0.01042306, "balance_loss_clip": 1.02455568, "balance_loss_mlp": 1.04927135, "epoch": 0.18367653690064634, "flos": 20918827578240.0, "grad_norm": 2.0940446929595655, "language_loss": 0.78206182, "learning_rate": 3.6764832130785846e-06, "loss": 0.80432093, "num_input_tokens_seen": 66005645, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.984375, "step": 3055, "time_per_iteration": 2.5764784812927246 }, { "auxiliary_loss_clip": 0.01178585, "auxiliary_loss_mlp": 0.01048643, "balance_loss_clip": 1.03122616, "balance_loss_mlp": 1.04697514, "epoch": 0.1837366601533143, "flos": 28765596234240.0, "grad_norm": 1.877635233889541, "language_loss": 0.69678807, "learning_rate": 3.6762771570243834e-06, "loss": 0.7190603, "num_input_tokens_seen": 66025675, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.953125, "step": 3056, "time_per_iteration": 4.1508238315582275 }, { "auxiliary_loss_clip": 0.01156809, "auxiliary_loss_mlp": 0.01039659, "balance_loss_clip": 1.02232587, "balance_loss_mlp": 1.04947305, "epoch": 0.18379678340598227, "flos": 21251432540160.0, "grad_norm": 1.5899776889223045, "language_loss": 0.80405617, "learning_rate": 3.6760710411481623e-06, "loss": 0.82602078, "num_input_tokens_seen": 66046125, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.984375, "step": 3057, "time_per_iteration": 2.67799973487854 }, { "auxiliary_loss_clip": 0.01162762, "auxiliary_loss_mlp": 0.01043495, "balance_loss_clip": 1.02330065, "balance_loss_mlp": 1.04899526, "epoch": 0.18385690665865023, "flos": 20449619193600.0, "grad_norm": 2.0020373865493917, "language_loss": 0.82405484, "learning_rate": 3.675864865457277e-06, "loss": 0.84611738, "num_input_tokens_seen": 66064375, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.046875, "step": 3058, "time_per_iteration": 2.5556089878082275 }, { "auxiliary_loss_clip": 0.01173542, "auxiliary_loss_mlp": 0.01295853, "balance_loss_clip": 1.02968073, "balance_loss_mlp": 1.0467664, "epoch": 0.1839170299113182, "flos": 26140562398080.0, "grad_norm": 2.106798238154275, "language_loss": 0.85211658, "learning_rate": 3.675658629959086e-06, "loss": 0.87681049, "num_input_tokens_seen": 66084590, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.99609375, "step": 3059, "time_per_iteration": 2.617140769958496 }, { "auxiliary_loss_clip": 0.01143864, "auxiliary_loss_mlp": 0.01290754, "balance_loss_clip": 1.02527285, "balance_loss_mlp": 1.04650247, "epoch": 0.18397715316398616, "flos": 31758032332800.0, "grad_norm": 1.5606316097905284, "language_loss": 0.72938299, "learning_rate": 3.6754523346609486e-06, "loss": 0.75372916, "num_input_tokens_seen": 66107105, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.97265625, "step": 3060, "time_per_iteration": 2.64274263381958 }, { "auxiliary_loss_clip": 0.01158153, "auxiliary_loss_mlp": 0.01039497, "balance_loss_clip": 1.02094793, "balance_loss_mlp": 1.04987597, "epoch": 0.18403727641665413, "flos": 24611989173120.0, "grad_norm": 1.7846943559806214, "language_loss": 0.72905141, "learning_rate": 3.675245979570227e-06, "loss": 0.75102788, "num_input_tokens_seen": 66129295, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9921875, "step": 3061, "time_per_iteration": 2.669018030166626 }, { "auxiliary_loss_clip": 0.01155373, "auxiliary_loss_mlp": 0.01046876, "balance_loss_clip": 1.02897084, "balance_loss_mlp": 1.04948688, "epoch": 0.18409739966932212, "flos": 23439900476160.0, "grad_norm": 1.7545162631646947, "language_loss": 0.81602395, "learning_rate": 3.6750395646942857e-06, "loss": 0.83804643, "num_input_tokens_seen": 66146910, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.96875, "step": 3062, "time_per_iteration": 2.561282157897949 }, { "auxiliary_loss_clip": 0.0116037, "auxiliary_loss_mlp": 0.01042277, "balance_loss_clip": 1.02428877, "balance_loss_mlp": 1.04996502, "epoch": 0.18415752292199009, "flos": 21872112577920.0, "grad_norm": 6.140053681033206, "language_loss": 0.73066622, "learning_rate": 3.674833090040491e-06, "loss": 0.7526927, "num_input_tokens_seen": 66165370, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.015625, "step": 3063, "time_per_iteration": 2.571791172027588 }, { "auxiliary_loss_clip": 0.01151744, "auxiliary_loss_mlp": 0.01037476, "balance_loss_clip": 1.02178848, "balance_loss_mlp": 1.04551768, "epoch": 0.18421764617465805, "flos": 25410678036480.0, "grad_norm": 1.7484813455475086, "language_loss": 0.65715003, "learning_rate": 3.6746265556162116e-06, "loss": 0.67904222, "num_input_tokens_seen": 66186210, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.96875, "step": 3064, "time_per_iteration": 2.5603713989257812 }, { "auxiliary_loss_clip": 0.01154701, "auxiliary_loss_mlp": 0.01044437, "balance_loss_clip": 1.02673435, "balance_loss_mlp": 1.04908586, "epoch": 0.18427776942732602, "flos": 27198131558400.0, "grad_norm": 24.255908479091012, "language_loss": 0.68656194, "learning_rate": 3.6744199614288174e-06, "loss": 0.70855331, "num_input_tokens_seen": 66204800, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.96484375, "step": 3065, "time_per_iteration": 2.6371235847473145 }, { "auxiliary_loss_clip": 0.01168159, "auxiliary_loss_mlp": 0.01039746, "balance_loss_clip": 1.02117276, "balance_loss_mlp": 1.04966259, "epoch": 0.18433789267999398, "flos": 27852351920640.0, "grad_norm": 3.6315296534734287, "language_loss": 0.72680032, "learning_rate": 3.6742133074856828e-06, "loss": 0.74887931, "num_input_tokens_seen": 66222195, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0078125, "step": 3066, "time_per_iteration": 2.5484113693237305 }, { "auxiliary_loss_clip": 0.01163753, "auxiliary_loss_mlp": 0.01038571, "balance_loss_clip": 1.02177477, "balance_loss_mlp": 1.04674661, "epoch": 0.18439801593266195, "flos": 17856940533120.0, "grad_norm": 2.536915209958528, "language_loss": 0.81736642, "learning_rate": 3.6740065937941815e-06, "loss": 0.83938968, "num_input_tokens_seen": 66239505, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.98828125, "step": 3067, "time_per_iteration": 2.524221181869507 }, { "auxiliary_loss_clip": 0.01061657, "auxiliary_loss_mlp": 0.01252096, "balance_loss_clip": 1.00204027, "balance_loss_mlp": 1.02696609, "epoch": 0.18445813918532994, "flos": 56389522590720.0, "grad_norm": 0.9121863146174154, "language_loss": 0.59706438, "learning_rate": 3.673799820361691e-06, "loss": 0.62020195, "num_input_tokens_seen": 66295695, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.34765625, "step": 3068, "time_per_iteration": 2.9645907878875732 }, { "auxiliary_loss_clip": 0.01165103, "auxiliary_loss_mlp": 0.01040203, "balance_loss_clip": 1.02276254, "balance_loss_mlp": 1.05091667, "epoch": 0.1845182624379979, "flos": 20957180325120.0, "grad_norm": 1.6911301371547602, "language_loss": 0.76019716, "learning_rate": 3.67359298719559e-06, "loss": 0.78225023, "num_input_tokens_seen": 66315315, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9609375, "step": 3069, "time_per_iteration": 2.5988306999206543 }, { "auxiliary_loss_clip": 0.01155196, "auxiliary_loss_mlp": 0.01040596, "balance_loss_clip": 1.02279806, "balance_loss_mlp": 1.04731905, "epoch": 0.18457838569066587, "flos": 20485170679680.0, "grad_norm": 1.8489376133876878, "language_loss": 0.84988391, "learning_rate": 3.6733860943032607e-06, "loss": 0.87184179, "num_input_tokens_seen": 66333675, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9921875, "step": 3070, "time_per_iteration": 2.4912500381469727 }, { "auxiliary_loss_clip": 0.01174656, "auxiliary_loss_mlp": 0.01041576, "balance_loss_clip": 1.02291965, "balance_loss_mlp": 1.04829371, "epoch": 0.18463850894333383, "flos": 25010022758400.0, "grad_norm": 2.2188046851698426, "language_loss": 0.77479041, "learning_rate": 3.6731791416920863e-06, "loss": 0.79695272, "num_input_tokens_seen": 66354075, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.9921875, "step": 3071, "time_per_iteration": 2.6100151538848877 }, { "auxiliary_loss_clip": 0.01177436, "auxiliary_loss_mlp": 0.01046355, "balance_loss_clip": 1.02834249, "balance_loss_mlp": 1.05034494, "epoch": 0.1846986321960018, "flos": 16800628348800.0, "grad_norm": 2.2249578660381926, "language_loss": 0.77320093, "learning_rate": 3.6729721293694523e-06, "loss": 0.79543889, "num_input_tokens_seen": 66372520, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.99609375, "step": 3072, "time_per_iteration": 2.6550161838531494 }, { "auxiliary_loss_clip": 0.01156045, "auxiliary_loss_mlp": 0.01045009, "balance_loss_clip": 1.0268054, "balance_loss_mlp": 1.04651904, "epoch": 0.18475875544866976, "flos": 20814327936000.0, "grad_norm": 2.2092519964147646, "language_loss": 0.86123669, "learning_rate": 3.6727650573427464e-06, "loss": 0.8832472, "num_input_tokens_seen": 66390745, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0078125, "step": 3073, "time_per_iteration": 2.6900718212127686 }, { "auxiliary_loss_clip": 0.01149766, "auxiliary_loss_mlp": 0.01044711, "balance_loss_clip": 1.0286479, "balance_loss_mlp": 1.05235553, "epoch": 0.18481887870133773, "flos": 22601422321920.0, "grad_norm": 2.232591569897819, "language_loss": 0.91394877, "learning_rate": 3.672557925619358e-06, "loss": 0.93589354, "num_input_tokens_seen": 66410525, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.97265625, "step": 3074, "time_per_iteration": 2.592752695083618 }, { "auxiliary_loss_clip": 0.01169351, "auxiliary_loss_mlp": 0.01042336, "balance_loss_clip": 1.02407289, "balance_loss_mlp": 1.04579139, "epoch": 0.18487900195400572, "flos": 29458815788160.0, "grad_norm": 1.71501446337036, "language_loss": 0.65329325, "learning_rate": 3.67235073420668e-06, "loss": 0.67541015, "num_input_tokens_seen": 66432535, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.9609375, "step": 3075, "time_per_iteration": 2.626560926437378 }, { "auxiliary_loss_clip": 0.01163907, "auxiliary_loss_mlp": 0.01042603, "balance_loss_clip": 1.02579427, "balance_loss_mlp": 1.04941368, "epoch": 0.1849391252066737, "flos": 20628777254400.0, "grad_norm": 2.2057238294459824, "language_loss": 0.7280792, "learning_rate": 3.672143483112106e-06, "loss": 0.7501443, "num_input_tokens_seen": 66450620, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.96484375, "step": 3076, "time_per_iteration": 2.655122995376587 }, { "auxiliary_loss_clip": 0.01174909, "auxiliary_loss_mlp": 0.01041118, "balance_loss_clip": 1.02427375, "balance_loss_mlp": 1.04783869, "epoch": 0.18499924845934165, "flos": 14428549065600.0, "grad_norm": 2.141308873633505, "language_loss": 0.80203211, "learning_rate": 3.6719361723430325e-06, "loss": 0.8241924, "num_input_tokens_seen": 66467865, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 1.0, "step": 3077, "time_per_iteration": 2.5360872745513916 }, { "auxiliary_loss_clip": 0.01160968, "auxiliary_loss_mlp": 0.01042985, "balance_loss_clip": 1.02594984, "balance_loss_mlp": 1.0477066, "epoch": 0.18505937171200962, "flos": 23727652329600.0, "grad_norm": 1.7628608162075632, "language_loss": 0.78900814, "learning_rate": 3.671728801906857e-06, "loss": 0.81104773, "num_input_tokens_seen": 66486245, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.94921875, "step": 3078, "time_per_iteration": 2.587453603744507 }, { "auxiliary_loss_clip": 0.01146206, "auxiliary_loss_mlp": 0.01043683, "balance_loss_clip": 1.02761316, "balance_loss_mlp": 1.04868102, "epoch": 0.18511949496467758, "flos": 25957489754880.0, "grad_norm": 1.801560681715595, "language_loss": 0.77847624, "learning_rate": 3.6715213718109816e-06, "loss": 0.80037516, "num_input_tokens_seen": 66506510, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.9765625, "step": 3079, "time_per_iteration": 2.559767961502075 }, { "auxiliary_loss_clip": 0.01160385, "auxiliary_loss_mlp": 0.01040659, "balance_loss_clip": 1.02359974, "balance_loss_mlp": 1.04527104, "epoch": 0.18517961821734555, "flos": 42413553912960.0, "grad_norm": 1.85759014717422, "language_loss": 0.81743056, "learning_rate": 3.671313882062808e-06, "loss": 0.839441, "num_input_tokens_seen": 66530960, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.96875, "step": 3080, "time_per_iteration": 2.7573494911193848 }, { "auxiliary_loss_clip": 0.01184173, "auxiliary_loss_mlp": 0.01046124, "balance_loss_clip": 1.02848101, "balance_loss_mlp": 1.04822195, "epoch": 0.18523974147001354, "flos": 24097568544000.0, "grad_norm": 1.8374481795621513, "language_loss": 0.74319977, "learning_rate": 3.6711063326697405e-06, "loss": 0.76550275, "num_input_tokens_seen": 66550275, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 1.0, "step": 3081, "time_per_iteration": 2.5784666538238525 }, { "auxiliary_loss_clip": 0.01164886, "auxiliary_loss_mlp": 0.01049562, "balance_loss_clip": 1.03077424, "balance_loss_mlp": 1.04955852, "epoch": 0.1852998647226815, "flos": 27375278457600.0, "grad_norm": 2.3123914924865687, "language_loss": 0.71554065, "learning_rate": 3.6708987236391867e-06, "loss": 0.73768514, "num_input_tokens_seen": 66569040, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.97265625, "step": 3082, "time_per_iteration": 2.579603433609009 }, { "auxiliary_loss_clip": 0.01174203, "auxiliary_loss_mlp": 0.0104788, "balance_loss_clip": 1.02921212, "balance_loss_mlp": 1.04917443, "epoch": 0.18535998797534947, "flos": 18332757020160.0, "grad_norm": 1.9837258860650722, "language_loss": 0.69372559, "learning_rate": 3.6706910549785562e-06, "loss": 0.71594638, "num_input_tokens_seen": 66587775, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.9765625, "step": 3083, "time_per_iteration": 2.5136990547180176 }, { "auxiliary_loss_clip": 0.01160716, "auxiliary_loss_mlp": 0.01046523, "balance_loss_clip": 1.02990484, "balance_loss_mlp": 1.0476737, "epoch": 0.18542011122801744, "flos": 37845859887360.0, "grad_norm": 1.9487542362437438, "language_loss": 0.6820004, "learning_rate": 3.670483326695259e-06, "loss": 0.70407277, "num_input_tokens_seen": 66610800, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9453125, "step": 3084, "time_per_iteration": 2.753831148147583 }, { "auxiliary_loss_clip": 0.01143686, "auxiliary_loss_mlp": 0.01038493, "balance_loss_clip": 1.02200603, "balance_loss_mlp": 1.04770422, "epoch": 0.1854802344806854, "flos": 25186128163200.0, "grad_norm": 1.4644776632357044, "language_loss": 0.77519822, "learning_rate": 3.6702755387967097e-06, "loss": 0.79702008, "num_input_tokens_seen": 66630960, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9609375, "step": 3085, "time_per_iteration": 2.6505072116851807 }, { "auxiliary_loss_clip": 0.01141842, "auxiliary_loss_mlp": 0.01048035, "balance_loss_clip": 1.03085709, "balance_loss_mlp": 1.04562247, "epoch": 0.18554035773335337, "flos": 26684788337280.0, "grad_norm": 1.7712394753022604, "language_loss": 0.73010564, "learning_rate": 3.6700676912903214e-06, "loss": 0.75200438, "num_input_tokens_seen": 66650585, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9609375, "step": 3086, "time_per_iteration": 2.634370803833008 }, { "auxiliary_loss_clip": 0.01153605, "auxiliary_loss_mlp": 0.01047612, "balance_loss_clip": 1.02965879, "balance_loss_mlp": 1.04974568, "epoch": 0.18560048098602133, "flos": 22346887570560.0, "grad_norm": 2.422423300045613, "language_loss": 0.69467431, "learning_rate": 3.6698597841835144e-06, "loss": 0.71668649, "num_input_tokens_seen": 66670045, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.94921875, "step": 3087, "time_per_iteration": 2.5684750080108643 }, { "auxiliary_loss_clip": 0.01175699, "auxiliary_loss_mlp": 0.0105203, "balance_loss_clip": 1.03363585, "balance_loss_mlp": 1.04999042, "epoch": 0.18566060423868933, "flos": 17748526308480.0, "grad_norm": 2.0892909012972813, "language_loss": 0.72241664, "learning_rate": 3.6696518174837064e-06, "loss": 0.74469399, "num_input_tokens_seen": 66688790, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.984375, "step": 3088, "time_per_iteration": 2.656770706176758 }, { "auxiliary_loss_clip": 0.01161613, "auxiliary_loss_mlp": 0.01037961, "balance_loss_clip": 1.02092576, "balance_loss_mlp": 1.04783916, "epoch": 0.1857207274913573, "flos": 24677274142080.0, "grad_norm": 2.7250598248372615, "language_loss": 0.9199475, "learning_rate": 3.6694437911983197e-06, "loss": 0.94194329, "num_input_tokens_seen": 66708090, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.953125, "step": 3089, "time_per_iteration": 2.5798330307006836 }, { "auxiliary_loss_clip": 0.01181933, "auxiliary_loss_mlp": 0.0104265, "balance_loss_clip": 1.02574563, "balance_loss_mlp": 1.05057383, "epoch": 0.18578085074402526, "flos": 28147825198080.0, "grad_norm": 3.156243697631719, "language_loss": 0.572604, "learning_rate": 3.669235705334779e-06, "loss": 0.59484982, "num_input_tokens_seen": 66727320, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.953125, "step": 3090, "time_per_iteration": 2.703909158706665 }, { "auxiliary_loss_clip": 0.01163652, "auxiliary_loss_mlp": 0.01045215, "balance_loss_clip": 1.02750063, "balance_loss_mlp": 1.04930472, "epoch": 0.18584097399669322, "flos": 23951878980480.0, "grad_norm": 1.8188608796636836, "language_loss": 0.81663787, "learning_rate": 3.669027559900509e-06, "loss": 0.83872652, "num_input_tokens_seen": 66747505, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9609375, "step": 3091, "time_per_iteration": 4.01737117767334 }, { "auxiliary_loss_clip": 0.0119654, "auxiliary_loss_mlp": 0.01049155, "balance_loss_clip": 1.03123748, "balance_loss_mlp": 1.05243647, "epoch": 0.18590109724936119, "flos": 17201678676480.0, "grad_norm": 2.215419138342846, "language_loss": 0.84105778, "learning_rate": 3.6688193549029397e-06, "loss": 0.86351466, "num_input_tokens_seen": 66766425, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.98828125, "step": 3092, "time_per_iteration": 5.590075731277466 }, { "auxiliary_loss_clip": 0.01169644, "auxiliary_loss_mlp": 0.01043394, "balance_loss_clip": 1.02564383, "balance_loss_mlp": 1.05069828, "epoch": 0.18596122050202915, "flos": 17234644383360.0, "grad_norm": 2.4476237175839772, "language_loss": 0.9345758, "learning_rate": 3.6686110903494995e-06, "loss": 0.95670617, "num_input_tokens_seen": 66781130, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.015625, "step": 3093, "time_per_iteration": 2.474297046661377 }, { "auxiliary_loss_clip": 0.01169904, "auxiliary_loss_mlp": 0.0104248, "balance_loss_clip": 1.02514684, "balance_loss_mlp": 1.05443215, "epoch": 0.18602134375469712, "flos": 19020733188480.0, "grad_norm": 1.911723505785269, "language_loss": 0.77160561, "learning_rate": 3.668402766247622e-06, "loss": 0.79372942, "num_input_tokens_seen": 66797535, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9765625, "step": 3094, "time_per_iteration": 2.5679266452789307 }, { "auxiliary_loss_clip": 0.01195761, "auxiliary_loss_mlp": 0.0104661, "balance_loss_clip": 1.03025436, "balance_loss_mlp": 1.05275083, "epoch": 0.1860814670073651, "flos": 50950094417280.0, "grad_norm": 1.8880373200571463, "language_loss": 0.69441426, "learning_rate": 3.6681943826047413e-06, "loss": 0.716838, "num_input_tokens_seen": 66821720, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9765625, "step": 3095, "time_per_iteration": 2.886065721511841 }, { "auxiliary_loss_clip": 0.01179965, "auxiliary_loss_mlp": 0.01045809, "balance_loss_clip": 1.02724862, "balance_loss_mlp": 1.05317831, "epoch": 0.18614159026003307, "flos": 19390972625280.0, "grad_norm": 2.039342260979788, "language_loss": 0.80627859, "learning_rate": 3.6679859394282944e-06, "loss": 0.82853627, "num_input_tokens_seen": 66839060, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9921875, "step": 3096, "time_per_iteration": 2.5670247077941895 }, { "auxiliary_loss_clip": 0.01159866, "auxiliary_loss_mlp": 0.01048201, "balance_loss_clip": 1.02956843, "balance_loss_mlp": 1.05178916, "epoch": 0.18620171351270104, "flos": 21798782962560.0, "grad_norm": 1.9622866711835039, "language_loss": 0.74969494, "learning_rate": 3.6677774367257194e-06, "loss": 0.7717756, "num_input_tokens_seen": 66857760, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9921875, "step": 3097, "time_per_iteration": 2.527492046356201 }, { "auxiliary_loss_clip": 0.01168317, "auxiliary_loss_mlp": 0.01047121, "balance_loss_clip": 1.02962136, "balance_loss_mlp": 1.05412197, "epoch": 0.186261836765369, "flos": 16362877299840.0, "grad_norm": 2.3818815063596417, "language_loss": 0.66190726, "learning_rate": 3.6675688745044583e-06, "loss": 0.68406165, "num_input_tokens_seen": 66876460, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9609375, "step": 3098, "time_per_iteration": 3.933319330215454 }, { "auxiliary_loss_clip": 0.01151538, "auxiliary_loss_mlp": 0.01046566, "balance_loss_clip": 1.02672911, "balance_loss_mlp": 1.05148625, "epoch": 0.18632196001803697, "flos": 23370054480000.0, "grad_norm": 1.565825850586775, "language_loss": 0.69634771, "learning_rate": 3.6673602527719533e-06, "loss": 0.71832883, "num_input_tokens_seen": 66897960, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0, "step": 3099, "time_per_iteration": 2.6302874088287354 }, { "auxiliary_loss_clip": 0.01161374, "auxiliary_loss_mlp": 0.0105157, "balance_loss_clip": 1.03298545, "balance_loss_mlp": 1.05263448, "epoch": 0.18638208327070493, "flos": 22492002516480.0, "grad_norm": 1.6171098238827415, "language_loss": 0.71723294, "learning_rate": 3.66715157153565e-06, "loss": 0.73936242, "num_input_tokens_seen": 66917675, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.99609375, "step": 3100, "time_per_iteration": 2.586442232131958 }, { "auxiliary_loss_clip": 0.0116965, "auxiliary_loss_mlp": 0.01050156, "balance_loss_clip": 1.03172648, "balance_loss_mlp": 1.05198836, "epoch": 0.18644220652337293, "flos": 29165245931520.0, "grad_norm": 2.2559560743981413, "language_loss": 0.80115825, "learning_rate": 3.666942830802996e-06, "loss": 0.82335627, "num_input_tokens_seen": 66936000, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.9921875, "step": 3101, "time_per_iteration": 2.7462501525878906 }, { "auxiliary_loss_clip": 0.01171885, "auxiliary_loss_mlp": 0.01043692, "balance_loss_clip": 1.0274322, "balance_loss_mlp": 1.05044675, "epoch": 0.1865023297760409, "flos": 24243796811520.0, "grad_norm": 1.6733045589022013, "language_loss": 0.76882184, "learning_rate": 3.6667340305814394e-06, "loss": 0.7909776, "num_input_tokens_seen": 66955700, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.9453125, "step": 3102, "time_per_iteration": 2.5758275985717773 }, { "auxiliary_loss_clip": 0.01160359, "auxiliary_loss_mlp": 0.01039939, "balance_loss_clip": 1.02230775, "balance_loss_mlp": 1.05253029, "epoch": 0.18656245302870886, "flos": 19128716449920.0, "grad_norm": 2.1579842311231974, "language_loss": 0.76670796, "learning_rate": 3.6665251708784325e-06, "loss": 0.78871095, "num_input_tokens_seen": 66972815, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.98828125, "step": 3103, "time_per_iteration": 2.6138699054718018 }, { "auxiliary_loss_clip": 0.01168537, "auxiliary_loss_mlp": 0.01045749, "balance_loss_clip": 1.0279392, "balance_loss_mlp": 1.05269563, "epoch": 0.18662257628137682, "flos": 17786088956160.0, "grad_norm": 1.9211918252060238, "language_loss": 0.78807384, "learning_rate": 3.6663162517014294e-06, "loss": 0.81021672, "num_input_tokens_seen": 66992280, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9765625, "step": 3104, "time_per_iteration": 2.640272617340088 }, { "auxiliary_loss_clip": 0.01177123, "auxiliary_loss_mlp": 0.01041666, "balance_loss_clip": 1.02453542, "balance_loss_mlp": 1.05377841, "epoch": 0.1866826995340448, "flos": 24024382583040.0, "grad_norm": 1.9045175560112684, "language_loss": 0.85318089, "learning_rate": 3.6661072730578858e-06, "loss": 0.87536877, "num_input_tokens_seen": 67012220, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9609375, "step": 3105, "time_per_iteration": 2.783714532852173 }, { "auxiliary_loss_clip": 0.01182812, "auxiliary_loss_mlp": 0.01044804, "balance_loss_clip": 1.02564692, "balance_loss_mlp": 1.05148482, "epoch": 0.18674282278671275, "flos": 26141244756480.0, "grad_norm": 1.887254267715637, "language_loss": 0.86310506, "learning_rate": 3.665898234955259e-06, "loss": 0.88538122, "num_input_tokens_seen": 67032030, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0390625, "step": 3106, "time_per_iteration": 2.6872377395629883 }, { "auxiliary_loss_clip": 0.01164468, "auxiliary_loss_mlp": 0.01040115, "balance_loss_clip": 1.02244854, "balance_loss_mlp": 1.05339146, "epoch": 0.18680294603938072, "flos": 19201938324480.0, "grad_norm": 2.2068324273273974, "language_loss": 0.78254747, "learning_rate": 3.6656891374010097e-06, "loss": 0.80459332, "num_input_tokens_seen": 67048920, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 1.015625, "step": 3107, "time_per_iteration": 2.565471887588501 }, { "auxiliary_loss_clip": 0.01161529, "auxiliary_loss_mlp": 0.01045733, "balance_loss_clip": 1.02648103, "balance_loss_mlp": 1.05250764, "epoch": 0.1868630692920487, "flos": 28544889116160.0, "grad_norm": 2.2088738631669926, "language_loss": 0.74123394, "learning_rate": 3.665479980402599e-06, "loss": 0.76330662, "num_input_tokens_seen": 67068645, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0, "step": 3108, "time_per_iteration": 2.596418619155884 }, { "auxiliary_loss_clip": 0.01186733, "auxiliary_loss_mlp": 0.01041858, "balance_loss_clip": 1.02373779, "balance_loss_mlp": 1.05256236, "epoch": 0.18692319254471668, "flos": 17238020261760.0, "grad_norm": 1.5465236818614816, "language_loss": 0.74098921, "learning_rate": 3.665270763967493e-06, "loss": 0.76327515, "num_input_tokens_seen": 67087075, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.984375, "step": 3109, "time_per_iteration": 2.6114895343780518 }, { "auxiliary_loss_clip": 0.0115772, "auxiliary_loss_mlp": 0.01043731, "balance_loss_clip": 1.02567065, "balance_loss_mlp": 1.05114794, "epoch": 0.18698331579738464, "flos": 23185186156800.0, "grad_norm": 1.4059609011673186, "language_loss": 0.84054506, "learning_rate": 3.6650614881031567e-06, "loss": 0.86255956, "num_input_tokens_seen": 67108040, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.9765625, "step": 3110, "time_per_iteration": 2.5725650787353516 }, { "auxiliary_loss_clip": 0.01160754, "auxiliary_loss_mlp": 0.01042092, "balance_loss_clip": 1.02367401, "balance_loss_mlp": 1.05380642, "epoch": 0.1870434390500526, "flos": 25516721963520.0, "grad_norm": 1.5962998048436674, "language_loss": 0.84143353, "learning_rate": 3.664852152817059e-06, "loss": 0.86346197, "num_input_tokens_seen": 67127605, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.98046875, "step": 3111, "time_per_iteration": 2.601320743560791 }, { "auxiliary_loss_clip": 0.01164642, "auxiliary_loss_mlp": 0.01038497, "balance_loss_clip": 1.02116394, "balance_loss_mlp": 1.04954219, "epoch": 0.18710356230272057, "flos": 19500823393920.0, "grad_norm": 5.364572556405311, "language_loss": 0.76454568, "learning_rate": 3.6646427581166702e-06, "loss": 0.78657705, "num_input_tokens_seen": 67145785, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.96875, "step": 3112, "time_per_iteration": 2.639287233352661 }, { "auxiliary_loss_clip": 0.01158495, "auxiliary_loss_mlp": 0.01048076, "balance_loss_clip": 1.02995658, "balance_loss_mlp": 1.05042851, "epoch": 0.18716368555538854, "flos": 26760847386240.0, "grad_norm": 2.0891085713423605, "language_loss": 0.64939415, "learning_rate": 3.6644333040094636e-06, "loss": 0.67145991, "num_input_tokens_seen": 67165930, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.98828125, "step": 3113, "time_per_iteration": 2.58825421333313 }, { "auxiliary_loss_clip": 0.01166249, "auxiliary_loss_mlp": 0.01045061, "balance_loss_clip": 1.02636874, "balance_loss_mlp": 1.05404329, "epoch": 0.1872238088080565, "flos": 25189827264000.0, "grad_norm": 2.0007035595034286, "language_loss": 0.81548953, "learning_rate": 3.6642237905029132e-06, "loss": 0.83760262, "num_input_tokens_seen": 67185830, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.03125, "step": 3114, "time_per_iteration": 2.6241378784179688 }, { "auxiliary_loss_clip": 0.01188228, "auxiliary_loss_mlp": 0.01052348, "balance_loss_clip": 1.03347731, "balance_loss_mlp": 1.05189025, "epoch": 0.1872839320607245, "flos": 24134305178880.0, "grad_norm": 1.9912167129951406, "language_loss": 0.57959342, "learning_rate": 3.664014217604497e-06, "loss": 0.60199916, "num_input_tokens_seen": 67206930, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0, "step": 3115, "time_per_iteration": 2.6611456871032715 }, { "auxiliary_loss_clip": 0.01175833, "auxiliary_loss_mlp": 0.01051542, "balance_loss_clip": 1.03389907, "balance_loss_mlp": 1.05217314, "epoch": 0.18734405531339246, "flos": 21173793292800.0, "grad_norm": 2.2600311084729166, "language_loss": 0.70823652, "learning_rate": 3.6638045853216938e-06, "loss": 0.73051029, "num_input_tokens_seen": 67226290, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.96484375, "step": 3116, "time_per_iteration": 2.655391216278076 }, { "auxiliary_loss_clip": 0.01143146, "auxiliary_loss_mlp": 0.01033474, "balance_loss_clip": 1.01617658, "balance_loss_mlp": 1.04803109, "epoch": 0.18740417856606043, "flos": 17237697039360.0, "grad_norm": 2.194975342368501, "language_loss": 0.7907474, "learning_rate": 3.663594893661985e-06, "loss": 0.81251359, "num_input_tokens_seen": 67244410, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.953125, "step": 3117, "time_per_iteration": 2.4851183891296387 }, { "auxiliary_loss_clip": 0.01164991, "auxiliary_loss_mlp": 0.01048063, "balance_loss_clip": 1.02994347, "balance_loss_mlp": 1.05139375, "epoch": 0.1874643018187284, "flos": 32558049999360.0, "grad_norm": 1.7349265912409557, "language_loss": 0.84103328, "learning_rate": 3.663385142632853e-06, "loss": 0.86316383, "num_input_tokens_seen": 67264470, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.953125, "step": 3118, "time_per_iteration": 2.7560505867004395 }, { "auxiliary_loss_clip": 0.0115847, "auxiliary_loss_mlp": 0.01045979, "balance_loss_clip": 1.02954054, "balance_loss_mlp": 1.05154943, "epoch": 0.18752442507139636, "flos": 23258156636160.0, "grad_norm": 1.7906400304540793, "language_loss": 0.76621103, "learning_rate": 3.663175332241785e-06, "loss": 0.78825545, "num_input_tokens_seen": 67284315, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.98046875, "step": 3119, "time_per_iteration": 2.6213748455047607 }, { "auxiliary_loss_clip": 0.01149386, "auxiliary_loss_mlp": 0.01048759, "balance_loss_clip": 1.03090191, "balance_loss_mlp": 1.05069792, "epoch": 0.18758454832406432, "flos": 21760933006080.0, "grad_norm": 1.636850189244784, "language_loss": 0.81959873, "learning_rate": 3.6629654624962666e-06, "loss": 0.84158015, "num_input_tokens_seen": 67302780, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.98828125, "step": 3120, "time_per_iteration": 2.6334142684936523 }, { "auxiliary_loss_clip": 0.01162429, "auxiliary_loss_mlp": 0.01036772, "balance_loss_clip": 1.02044022, "balance_loss_mlp": 1.04809785, "epoch": 0.1876446715767323, "flos": 29570210841600.0, "grad_norm": 1.7901063780670472, "language_loss": 0.84913015, "learning_rate": 3.6627555334037893e-06, "loss": 0.87112218, "num_input_tokens_seen": 67323405, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.9609375, "step": 3121, "time_per_iteration": 2.6512398719787598 }, { "auxiliary_loss_clip": 0.01184813, "auxiliary_loss_mlp": 0.0104594, "balance_loss_clip": 1.02876186, "balance_loss_mlp": 1.05167842, "epoch": 0.18770479482940028, "flos": 30339992234880.0, "grad_norm": 1.707328202503088, "language_loss": 0.70112765, "learning_rate": 3.662545544971844e-06, "loss": 0.72343522, "num_input_tokens_seen": 67345800, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.96875, "step": 3122, "time_per_iteration": 2.7197299003601074 }, { "auxiliary_loss_clip": 0.01144748, "auxiliary_loss_mlp": 0.0104246, "balance_loss_clip": 1.02363646, "balance_loss_mlp": 1.04857743, "epoch": 0.18776491808206824, "flos": 14465357527680.0, "grad_norm": 2.102550872712054, "language_loss": 0.7050631, "learning_rate": 3.662335497207924e-06, "loss": 0.72693515, "num_input_tokens_seen": 67363575, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.9609375, "step": 3123, "time_per_iteration": 2.4953091144561768 }, { "auxiliary_loss_clip": 0.01155875, "auxiliary_loss_mlp": 0.01044634, "balance_loss_clip": 1.02774227, "balance_loss_mlp": 1.05022359, "epoch": 0.1878250413347362, "flos": 24498547044480.0, "grad_norm": 2.2688563896637772, "language_loss": 0.73755854, "learning_rate": 3.662125390119527e-06, "loss": 0.75956357, "num_input_tokens_seen": 67381765, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.96484375, "step": 3124, "time_per_iteration": 2.652468204498291 }, { "auxiliary_loss_clip": 0.0114876, "auxiliary_loss_mlp": 0.01047254, "balance_loss_clip": 1.02791858, "balance_loss_mlp": 1.04996753, "epoch": 0.18788516458740417, "flos": 39786185692800.0, "grad_norm": 1.8519605184775327, "language_loss": 0.8050493, "learning_rate": 3.66191522371415e-06, "loss": 0.8270095, "num_input_tokens_seen": 67405000, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.98828125, "step": 3125, "time_per_iteration": 2.721350908279419 }, { "auxiliary_loss_clip": 0.01102694, "auxiliary_loss_mlp": 0.01003749, "balance_loss_clip": 1.00087595, "balance_loss_mlp": 1.03143513, "epoch": 0.18794528784007214, "flos": 64699250664960.0, "grad_norm": 0.9357915029345827, "language_loss": 0.63514662, "learning_rate": 3.6617049979992937e-06, "loss": 0.65621102, "num_input_tokens_seen": 67467140, "router_z_loss_clip": 0.02868652, "router_z_loss_mlp": 0.34960938, "step": 3126, "time_per_iteration": 3.235762357711792 }, { "auxiliary_loss_clip": 0.01164582, "auxiliary_loss_mlp": 0.01041563, "balance_loss_clip": 1.02366948, "balance_loss_mlp": 1.05176699, "epoch": 0.1880054110927401, "flos": 28622061486720.0, "grad_norm": 1.670729607180854, "language_loss": 0.80974966, "learning_rate": 3.6614947129824603e-06, "loss": 0.83181113, "num_input_tokens_seen": 67487980, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.94921875, "step": 3127, "time_per_iteration": 2.7334840297698975 }, { "auxiliary_loss_clip": 0.01066721, "auxiliary_loss_mlp": 0.01001112, "balance_loss_clip": 0.99840623, "balance_loss_mlp": 1.03202713, "epoch": 0.1880655343454081, "flos": 64488958490880.0, "grad_norm": 0.7628967034731109, "language_loss": 0.61888909, "learning_rate": 3.6612843686711542e-06, "loss": 0.63956743, "num_input_tokens_seen": 67552500, "router_z_loss_clip": 0.02709961, "router_z_loss_mlp": 0.34765625, "step": 3128, "time_per_iteration": 3.224971055984497 }, { "auxiliary_loss_clip": 0.01187083, "auxiliary_loss_mlp": 0.01039314, "balance_loss_clip": 1.02112877, "balance_loss_mlp": 1.04885721, "epoch": 0.18812565759807606, "flos": 32124464928000.0, "grad_norm": 2.1276783488310014, "language_loss": 0.71228182, "learning_rate": 3.661073965072883e-06, "loss": 0.73454583, "num_input_tokens_seen": 67573295, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0234375, "step": 3129, "time_per_iteration": 2.747626304626465 }, { "auxiliary_loss_clip": 0.01147233, "auxiliary_loss_mlp": 0.01050257, "balance_loss_clip": 1.0317322, "balance_loss_mlp": 1.04883218, "epoch": 0.18818578085074403, "flos": 20624539449600.0, "grad_norm": 1.8151142512348386, "language_loss": 0.84750915, "learning_rate": 3.6608635021951546e-06, "loss": 0.86948401, "num_input_tokens_seen": 67590010, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.98046875, "step": 3130, "time_per_iteration": 2.5548622608184814 }, { "auxiliary_loss_clip": 0.01173992, "auxiliary_loss_mlp": 0.01044127, "balance_loss_clip": 1.02527976, "balance_loss_mlp": 1.04721165, "epoch": 0.188245904103412, "flos": 28840506048000.0, "grad_norm": 1.9585038440096583, "language_loss": 0.76511002, "learning_rate": 3.6606529800454794e-06, "loss": 0.78729117, "num_input_tokens_seen": 67611110, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0, "step": 3131, "time_per_iteration": 2.677065849304199 }, { "auxiliary_loss_clip": 0.01174057, "auxiliary_loss_mlp": 0.01047691, "balance_loss_clip": 1.0285219, "balance_loss_mlp": 1.05055523, "epoch": 0.18830602735607996, "flos": 29420319386880.0, "grad_norm": 1.7266028904952124, "language_loss": 0.81238455, "learning_rate": 3.660442398631372e-06, "loss": 0.834602, "num_input_tokens_seen": 67631990, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.96484375, "step": 3132, "time_per_iteration": 2.666759967803955 }, { "auxiliary_loss_clip": 0.01188869, "auxiliary_loss_mlp": 0.01291959, "balance_loss_clip": 1.02520776, "balance_loss_mlp": 1.05216455, "epoch": 0.18836615060874792, "flos": 28872933050880.0, "grad_norm": 1.9212509936409394, "language_loss": 0.79492402, "learning_rate": 3.660231757960346e-06, "loss": 0.81973231, "num_input_tokens_seen": 67650490, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0, "step": 3133, "time_per_iteration": 4.018431663513184 }, { "auxiliary_loss_clip": 0.01177477, "auxiliary_loss_mlp": 0.01046688, "balance_loss_clip": 1.02920032, "balance_loss_mlp": 1.05090594, "epoch": 0.18842627386141592, "flos": 22601673717120.0, "grad_norm": 2.158517742367877, "language_loss": 0.82779598, "learning_rate": 3.660021058039919e-06, "loss": 0.85003769, "num_input_tokens_seen": 67668860, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.99609375, "step": 3134, "time_per_iteration": 5.491886138916016 }, { "auxiliary_loss_clip": 0.01163831, "auxiliary_loss_mlp": 0.01046807, "balance_loss_clip": 1.02797198, "balance_loss_mlp": 1.04925132, "epoch": 0.18848639711408388, "flos": 24573600512640.0, "grad_norm": 1.341585699984577, "language_loss": 0.82628524, "learning_rate": 3.659810298877611e-06, "loss": 0.84839159, "num_input_tokens_seen": 67690220, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.96484375, "step": 3135, "time_per_iteration": 2.6321983337402344 }, { "auxiliary_loss_clip": 0.01156127, "auxiliary_loss_mlp": 0.01044384, "balance_loss_clip": 1.02658641, "balance_loss_mlp": 1.04690742, "epoch": 0.18854652036675185, "flos": 34166920078080.0, "grad_norm": 1.8676257479723304, "language_loss": 0.78592706, "learning_rate": 3.659599480480943e-06, "loss": 0.8079322, "num_input_tokens_seen": 67709820, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.0, "step": 3136, "time_per_iteration": 2.676731824874878 }, { "auxiliary_loss_clip": 0.01167596, "auxiliary_loss_mlp": 0.01044208, "balance_loss_clip": 1.02519381, "balance_loss_mlp": 1.05083108, "epoch": 0.1886066436194198, "flos": 24200236592640.0, "grad_norm": 2.2665025848253744, "language_loss": 0.81139398, "learning_rate": 3.659388602857438e-06, "loss": 0.83351207, "num_input_tokens_seen": 67729490, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.98046875, "step": 3137, "time_per_iteration": 2.635526657104492 }, { "auxiliary_loss_clip": 0.01150728, "auxiliary_loss_mlp": 0.01044866, "balance_loss_clip": 1.02802205, "balance_loss_mlp": 1.05264747, "epoch": 0.18866676687208778, "flos": 21251109317760.0, "grad_norm": 1.8502386310629484, "language_loss": 0.80590284, "learning_rate": 3.6591776660146225e-06, "loss": 0.82785881, "num_input_tokens_seen": 67749665, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.98046875, "step": 3138, "time_per_iteration": 2.5855329036712646 }, { "auxiliary_loss_clip": 0.01160749, "auxiliary_loss_mlp": 0.01050196, "balance_loss_clip": 1.03170657, "balance_loss_mlp": 1.05083609, "epoch": 0.18872689012475574, "flos": 37308673013760.0, "grad_norm": 2.171433121085652, "language_loss": 0.63630545, "learning_rate": 3.6589666699600247e-06, "loss": 0.65841484, "num_input_tokens_seen": 67776230, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0078125, "step": 3139, "time_per_iteration": 4.207665920257568 }, { "auxiliary_loss_clip": 0.01147038, "auxiliary_loss_mlp": 0.0104435, "balance_loss_clip": 1.0247643, "balance_loss_mlp": 1.04828286, "epoch": 0.1887870133774237, "flos": 21652303299840.0, "grad_norm": 3.430129070752127, "language_loss": 0.71408224, "learning_rate": 3.6587556147011728e-06, "loss": 0.73599607, "num_input_tokens_seen": 67795080, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.98828125, "step": 3140, "time_per_iteration": 2.6611459255218506 }, { "auxiliary_loss_clip": 0.01148192, "auxiliary_loss_mlp": 0.01047947, "balance_loss_clip": 1.02867067, "balance_loss_mlp": 1.04862714, "epoch": 0.1888471366300917, "flos": 15924659374080.0, "grad_norm": 2.179897542479791, "language_loss": 0.8735736, "learning_rate": 3.6585445002456004e-06, "loss": 0.89553499, "num_input_tokens_seen": 67813110, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.9921875, "step": 3141, "time_per_iteration": 2.5885026454925537 }, { "auxiliary_loss_clip": 0.01172605, "auxiliary_loss_mlp": 0.01044386, "balance_loss_clip": 1.0251931, "balance_loss_mlp": 1.05198407, "epoch": 0.18890725988275966, "flos": 18551955767040.0, "grad_norm": 2.006025752760523, "language_loss": 0.7666291, "learning_rate": 3.6583333266008404e-06, "loss": 0.78879899, "num_input_tokens_seen": 67831070, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0234375, "step": 3142, "time_per_iteration": 2.609961748123169 }, { "auxiliary_loss_clip": 0.01165982, "auxiliary_loss_mlp": 0.01040721, "balance_loss_clip": 1.02444887, "balance_loss_mlp": 1.05130911, "epoch": 0.18896738313542763, "flos": 28840865184000.0, "grad_norm": 1.8896163488541173, "language_loss": 0.78453237, "learning_rate": 3.6581220937744305e-06, "loss": 0.80659944, "num_input_tokens_seen": 67852170, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.96875, "step": 3143, "time_per_iteration": 2.6101012229919434 }, { "auxiliary_loss_clip": 0.01175842, "auxiliary_loss_mlp": 0.01047534, "balance_loss_clip": 1.02899671, "balance_loss_mlp": 1.04864502, "epoch": 0.1890275063880956, "flos": 22412747157120.0, "grad_norm": 2.0092643802510324, "language_loss": 0.7120167, "learning_rate": 3.6579108017739076e-06, "loss": 0.73425055, "num_input_tokens_seen": 67869945, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0, "step": 3144, "time_per_iteration": 2.5978567600250244 }, { "auxiliary_loss_clip": 0.01170099, "auxiliary_loss_mlp": 0.010453, "balance_loss_clip": 1.02601242, "balance_loss_mlp": 1.04952884, "epoch": 0.18908762964076356, "flos": 24243904552320.0, "grad_norm": 2.4284898046855323, "language_loss": 0.73181045, "learning_rate": 3.6576994506068136e-06, "loss": 0.75396448, "num_input_tokens_seen": 67890240, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0234375, "step": 3145, "time_per_iteration": 2.600642442703247 }, { "auxiliary_loss_clip": 0.01171983, "auxiliary_loss_mlp": 0.01040751, "balance_loss_clip": 1.02356064, "balance_loss_mlp": 1.04815125, "epoch": 0.18914775289343153, "flos": 16982910892800.0, "grad_norm": 2.52273885698154, "language_loss": 0.75814641, "learning_rate": 3.6574880402806897e-06, "loss": 0.78027368, "num_input_tokens_seen": 67907825, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.96875, "step": 3146, "time_per_iteration": 2.5844790935516357 }, { "auxiliary_loss_clip": 0.01184191, "auxiliary_loss_mlp": 0.01048212, "balance_loss_clip": 1.0299964, "balance_loss_mlp": 1.0494647, "epoch": 0.1892078761460995, "flos": 21543781334400.0, "grad_norm": 1.9955116890090472, "language_loss": 0.78689337, "learning_rate": 3.6572765708030813e-06, "loss": 0.80921739, "num_input_tokens_seen": 67926670, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.984375, "step": 3147, "time_per_iteration": 2.5990207195281982 }, { "auxiliary_loss_clip": 0.01158069, "auxiliary_loss_mlp": 0.01045241, "balance_loss_clip": 1.026752, "balance_loss_mlp": 1.05061972, "epoch": 0.18926799939876748, "flos": 23001538896000.0, "grad_norm": 2.622704501137582, "language_loss": 0.66759098, "learning_rate": 3.657065042181536e-06, "loss": 0.68962407, "num_input_tokens_seen": 67943645, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.984375, "step": 3148, "time_per_iteration": 2.7296648025512695 }, { "auxiliary_loss_clip": 0.0117563, "auxiliary_loss_mlp": 0.01036848, "balance_loss_clip": 1.01905012, "balance_loss_mlp": 1.05071938, "epoch": 0.18932812265143545, "flos": 22273019251200.0, "grad_norm": 2.186953628732284, "language_loss": 0.75687593, "learning_rate": 3.6568534544236008e-06, "loss": 0.7790007, "num_input_tokens_seen": 67962345, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9765625, "step": 3149, "time_per_iteration": 2.6695430278778076 }, { "auxiliary_loss_clip": 0.01165601, "auxiliary_loss_mlp": 0.01045473, "balance_loss_clip": 1.02809191, "balance_loss_mlp": 1.0526886, "epoch": 0.1893882459041034, "flos": 18624423456000.0, "grad_norm": 2.274605808396526, "language_loss": 0.80040592, "learning_rate": 3.656641807536828e-06, "loss": 0.82251668, "num_input_tokens_seen": 67979760, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9453125, "step": 3150, "time_per_iteration": 2.6483945846557617 }, { "auxiliary_loss_clip": 0.0116322, "auxiliary_loss_mlp": 0.01048627, "balance_loss_clip": 1.03080559, "balance_loss_mlp": 1.05416179, "epoch": 0.18944836915677138, "flos": 22892981016960.0, "grad_norm": 2.6188089632532736, "language_loss": 0.85133135, "learning_rate": 3.6564301015287706e-06, "loss": 0.8734498, "num_input_tokens_seen": 67996895, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.0, "step": 3151, "time_per_iteration": 2.5947024822235107 }, { "auxiliary_loss_clip": 0.01178521, "auxiliary_loss_mlp": 0.01047271, "balance_loss_clip": 1.02966356, "balance_loss_mlp": 1.05211127, "epoch": 0.18950849240943934, "flos": 26796542526720.0, "grad_norm": 1.7185469092870238, "language_loss": 0.7422747, "learning_rate": 3.6562183364069835e-06, "loss": 0.76453263, "num_input_tokens_seen": 68018365, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.99609375, "step": 3152, "time_per_iteration": 2.6171679496765137 }, { "auxiliary_loss_clip": 0.0117761, "auxiliary_loss_mlp": 0.01044604, "balance_loss_clip": 1.0253036, "balance_loss_mlp": 1.05176711, "epoch": 0.1895686156621073, "flos": 24971239048320.0, "grad_norm": 1.9487593415936688, "language_loss": 0.75814438, "learning_rate": 3.6560065121790244e-06, "loss": 0.78036654, "num_input_tokens_seen": 68037985, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.984375, "step": 3153, "time_per_iteration": 2.6567540168762207 }, { "auxiliary_loss_clip": 0.01150935, "auxiliary_loss_mlp": 0.01047001, "balance_loss_clip": 1.02935791, "balance_loss_mlp": 1.0511533, "epoch": 0.1896287389147753, "flos": 21944544353280.0, "grad_norm": 2.247331791756839, "language_loss": 0.79167163, "learning_rate": 3.655794628852453e-06, "loss": 0.81365097, "num_input_tokens_seen": 68057975, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.0, "step": 3154, "time_per_iteration": 2.6043879985809326 }, { "auxiliary_loss_clip": 0.01180284, "auxiliary_loss_mlp": 0.01047177, "balance_loss_clip": 1.02878308, "balance_loss_mlp": 1.05038249, "epoch": 0.18968886216744327, "flos": 18179058723840.0, "grad_norm": 3.3213889872472886, "language_loss": 0.72808605, "learning_rate": 3.6555826864348297e-06, "loss": 0.75036067, "num_input_tokens_seen": 68074175, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.03125, "step": 3155, "time_per_iteration": 2.618879795074463 }, { "auxiliary_loss_clip": 0.01164889, "auxiliary_loss_mlp": 0.01046136, "balance_loss_clip": 1.02769423, "balance_loss_mlp": 1.05035257, "epoch": 0.18974898542011123, "flos": 20412487509120.0, "grad_norm": 2.4891275792900576, "language_loss": 0.74083805, "learning_rate": 3.6553706849337197e-06, "loss": 0.76294833, "num_input_tokens_seen": 68095230, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.96875, "step": 3156, "time_per_iteration": 2.5908710956573486 }, { "auxiliary_loss_clip": 0.01149402, "auxiliary_loss_mlp": 0.01044594, "balance_loss_clip": 1.02677202, "balance_loss_mlp": 1.04994524, "epoch": 0.1898091086727792, "flos": 23985024255360.0, "grad_norm": 1.9135488383570103, "language_loss": 0.68205535, "learning_rate": 3.6551586243566877e-06, "loss": 0.70399523, "num_input_tokens_seen": 68113805, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.99609375, "step": 3157, "time_per_iteration": 2.67081618309021 }, { "auxiliary_loss_clip": 0.01175414, "auxiliary_loss_mlp": 0.01040905, "balance_loss_clip": 1.02320218, "balance_loss_mlp": 1.04984808, "epoch": 0.18986923192544716, "flos": 27637067756160.0, "grad_norm": 2.2815074696902626, "language_loss": 0.80063266, "learning_rate": 3.654946504711302e-06, "loss": 0.82279587, "num_input_tokens_seen": 68133190, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.984375, "step": 3158, "time_per_iteration": 2.592567205429077 }, { "auxiliary_loss_clip": 0.01153725, "auxiliary_loss_mlp": 0.01047677, "balance_loss_clip": 1.02800727, "balance_loss_mlp": 1.0515883, "epoch": 0.18992935517811513, "flos": 25484151306240.0, "grad_norm": 1.9205319764281337, "language_loss": 0.72032952, "learning_rate": 3.6547343260051323e-06, "loss": 0.74234354, "num_input_tokens_seen": 68152330, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0234375, "step": 3159, "time_per_iteration": 2.698848247528076 }, { "auxiliary_loss_clip": 0.01162068, "auxiliary_loss_mlp": 0.01052247, "balance_loss_clip": 1.03269672, "balance_loss_mlp": 1.05360544, "epoch": 0.1899894784307831, "flos": 17420805596160.0, "grad_norm": 1.8747153195021558, "language_loss": 0.84966111, "learning_rate": 3.6545220882457518e-06, "loss": 0.87180424, "num_input_tokens_seen": 68170185, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.9921875, "step": 3160, "time_per_iteration": 2.5306403636932373 }, { "auxiliary_loss_clip": 0.01161121, "auxiliary_loss_mlp": 0.01050929, "balance_loss_clip": 1.03372741, "balance_loss_mlp": 1.04867172, "epoch": 0.19004960168345109, "flos": 27492240119040.0, "grad_norm": 1.644012854875828, "language_loss": 0.73101211, "learning_rate": 3.6543097914407336e-06, "loss": 0.75313264, "num_input_tokens_seen": 68191665, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.94140625, "step": 3161, "time_per_iteration": 2.746002197265625 }, { "auxiliary_loss_clip": 0.0115562, "auxiliary_loss_mlp": 0.01047843, "balance_loss_clip": 1.02900815, "balance_loss_mlp": 1.04957604, "epoch": 0.19010972493611905, "flos": 38654676385920.0, "grad_norm": 1.7155143649286881, "language_loss": 0.80466151, "learning_rate": 3.6540974355976537e-06, "loss": 0.8266961, "num_input_tokens_seen": 68214635, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.96875, "step": 3162, "time_per_iteration": 2.8139190673828125 }, { "auxiliary_loss_clip": 0.01149033, "auxiliary_loss_mlp": 0.01045799, "balance_loss_clip": 1.02763104, "balance_loss_mlp": 1.0508647, "epoch": 0.19016984818878702, "flos": 19244744357760.0, "grad_norm": 2.7393320054231873, "language_loss": 0.75685215, "learning_rate": 3.653885020724092e-06, "loss": 0.77880049, "num_input_tokens_seen": 68232150, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9765625, "step": 3163, "time_per_iteration": 2.6112911701202393 }, { "auxiliary_loss_clip": 0.01182956, "auxiliary_loss_mlp": 0.01041576, "balance_loss_clip": 1.02349186, "balance_loss_mlp": 1.05135524, "epoch": 0.19022997144145498, "flos": 37596891744000.0, "grad_norm": 1.7726674234680724, "language_loss": 0.74200732, "learning_rate": 3.653672546827628e-06, "loss": 0.7642526, "num_input_tokens_seen": 68253370, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.953125, "step": 3164, "time_per_iteration": 2.7681024074554443 }, { "auxiliary_loss_clip": 0.01155543, "auxiliary_loss_mlp": 0.01040273, "balance_loss_clip": 1.02198601, "balance_loss_mlp": 1.04990005, "epoch": 0.19029009469412295, "flos": 61530921665280.0, "grad_norm": 2.89079582281775, "language_loss": 0.66769755, "learning_rate": 3.653460013915844e-06, "loss": 0.68965572, "num_input_tokens_seen": 68278895, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.96875, "step": 3165, "time_per_iteration": 3.0200843811035156 }, { "auxiliary_loss_clip": 0.01158896, "auxiliary_loss_mlp": 0.01050959, "balance_loss_clip": 1.03120565, "balance_loss_mlp": 1.05197549, "epoch": 0.1903502179467909, "flos": 13954851480960.0, "grad_norm": 2.496818955693319, "language_loss": 0.74023658, "learning_rate": 3.653247421996326e-06, "loss": 0.76233506, "num_input_tokens_seen": 68294880, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.9765625, "step": 3166, "time_per_iteration": 2.6148858070373535 }, { "auxiliary_loss_clip": 0.01092852, "auxiliary_loss_mlp": 0.01010616, "balance_loss_clip": 1.0082078, "balance_loss_mlp": 1.03131616, "epoch": 0.1904103411994589, "flos": 66899641916160.0, "grad_norm": 0.7829207657243548, "language_loss": 0.50393879, "learning_rate": 3.65303477107666e-06, "loss": 0.52497351, "num_input_tokens_seen": 68359665, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.34375, "step": 3167, "time_per_iteration": 3.223233222961426 }, { "auxiliary_loss_clip": 0.0115506, "auxiliary_loss_mlp": 0.01049152, "balance_loss_clip": 1.03153312, "balance_loss_mlp": 1.05001736, "epoch": 0.19047046445212687, "flos": 21908741472000.0, "grad_norm": 2.0071435837206346, "language_loss": 0.74594337, "learning_rate": 3.6528220611644356e-06, "loss": 0.76798546, "num_input_tokens_seen": 68378950, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9609375, "step": 3168, "time_per_iteration": 2.6288645267486572 }, { "auxiliary_loss_clip": 0.01100889, "auxiliary_loss_mlp": 0.01002756, "balance_loss_clip": 1.00032449, "balance_loss_mlp": 1.03020096, "epoch": 0.19053058770479483, "flos": 59255156701440.0, "grad_norm": 0.8651594007441705, "language_loss": 0.60010755, "learning_rate": 3.652609292267242e-06, "loss": 0.62114394, "num_input_tokens_seen": 68434235, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.34375, "step": 3169, "time_per_iteration": 3.098311424255371 }, { "auxiliary_loss_clip": 0.01196515, "auxiliary_loss_mlp": 0.01050692, "balance_loss_clip": 1.03190422, "balance_loss_mlp": 1.04995358, "epoch": 0.1905907109574628, "flos": 23951304362880.0, "grad_norm": 1.7447476105423998, "language_loss": 0.7804631, "learning_rate": 3.6523964643926754e-06, "loss": 0.80293512, "num_input_tokens_seen": 68453830, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.015625, "step": 3170, "time_per_iteration": 2.6752912998199463 }, { "auxiliary_loss_clip": 0.01153512, "auxiliary_loss_mlp": 0.01046891, "balance_loss_clip": 1.02868819, "balance_loss_mlp": 1.04857671, "epoch": 0.19065083421013077, "flos": 20812316774400.0, "grad_norm": 1.7684234354614425, "language_loss": 0.78333294, "learning_rate": 3.6521835775483285e-06, "loss": 0.80533701, "num_input_tokens_seen": 68473005, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9609375, "step": 3171, "time_per_iteration": 2.5556983947753906 }, { "auxiliary_loss_clip": 0.01150947, "auxiliary_loss_mlp": 0.01038742, "balance_loss_clip": 1.01981163, "balance_loss_mlp": 1.05041027, "epoch": 0.19071095746279873, "flos": 31284981192960.0, "grad_norm": 2.012079138272134, "language_loss": 0.78464371, "learning_rate": 3.6519706317417995e-06, "loss": 0.80654061, "num_input_tokens_seen": 68493470, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0078125, "step": 3172, "time_per_iteration": 2.6416711807250977 }, { "auxiliary_loss_clip": 0.01156438, "auxiliary_loss_mlp": 0.01050729, "balance_loss_clip": 1.03295469, "balance_loss_mlp": 1.04981947, "epoch": 0.1907710807154667, "flos": 14356117290240.0, "grad_norm": 2.1228540452934825, "language_loss": 0.80137891, "learning_rate": 3.6517576269806885e-06, "loss": 0.82345057, "num_input_tokens_seen": 68511290, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9765625, "step": 3173, "time_per_iteration": 2.642301321029663 }, { "auxiliary_loss_clip": 0.01176559, "auxiliary_loss_mlp": 0.01294351, "balance_loss_clip": 1.02901554, "balance_loss_mlp": 1.05028319, "epoch": 0.1908312039681347, "flos": 26907039740160.0, "grad_norm": 1.9382494502919794, "language_loss": 0.78556222, "learning_rate": 3.651544563272597e-06, "loss": 0.81027132, "num_input_tokens_seen": 68532575, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.98828125, "step": 3174, "time_per_iteration": 4.0581159591674805 }, { "auxiliary_loss_clip": 0.01187468, "auxiliary_loss_mlp": 0.01044531, "balance_loss_clip": 1.02613676, "balance_loss_mlp": 1.05269516, "epoch": 0.19089132722080265, "flos": 14494695960960.0, "grad_norm": 2.47616560911524, "language_loss": 0.80581319, "learning_rate": 3.651331440625127e-06, "loss": 0.82813317, "num_input_tokens_seen": 68548760, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9921875, "step": 3175, "time_per_iteration": 4.099333763122559 }, { "auxiliary_loss_clip": 0.01195948, "auxiliary_loss_mlp": 0.01049364, "balance_loss_clip": 1.03092217, "balance_loss_mlp": 1.0512197, "epoch": 0.19095145047347062, "flos": 13952876232960.0, "grad_norm": 6.38248240451953, "language_loss": 0.85507131, "learning_rate": 3.651118259045887e-06, "loss": 0.8775245, "num_input_tokens_seen": 68563100, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.99609375, "step": 3176, "time_per_iteration": 3.9647367000579834 }, { "auxiliary_loss_clip": 0.0117227, "auxiliary_loss_mlp": 0.01048648, "balance_loss_clip": 1.02908599, "balance_loss_mlp": 1.05234599, "epoch": 0.19101157372613858, "flos": 25301832848640.0, "grad_norm": 2.471495342928861, "language_loss": 0.81480718, "learning_rate": 3.650905018542483e-06, "loss": 0.8370164, "num_input_tokens_seen": 68581650, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.015625, "step": 3177, "time_per_iteration": 2.6025686264038086 }, { "auxiliary_loss_clip": 0.01164101, "auxiliary_loss_mlp": 0.01045077, "balance_loss_clip": 1.02681398, "balance_loss_mlp": 1.04943204, "epoch": 0.19107169697880655, "flos": 20558212986240.0, "grad_norm": 2.204379065655342, "language_loss": 0.74328351, "learning_rate": 3.650691719122525e-06, "loss": 0.76537532, "num_input_tokens_seen": 68600360, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.96484375, "step": 3178, "time_per_iteration": 2.5968453884124756 }, { "auxiliary_loss_clip": 0.01159479, "auxiliary_loss_mlp": 0.01035172, "balance_loss_clip": 1.01838684, "balance_loss_mlp": 1.05129373, "epoch": 0.19113182023147451, "flos": 22163204396160.0, "grad_norm": 1.6445017540635944, "language_loss": 0.81197083, "learning_rate": 3.6504783607936266e-06, "loss": 0.83391732, "num_input_tokens_seen": 68617885, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9921875, "step": 3179, "time_per_iteration": 2.6018266677856445 }, { "auxiliary_loss_clip": 0.01159016, "auxiliary_loss_mlp": 0.01039237, "balance_loss_clip": 1.02129638, "balance_loss_mlp": 1.05198312, "epoch": 0.19119194348414248, "flos": 18581796990720.0, "grad_norm": 2.6661839702734214, "language_loss": 0.79112989, "learning_rate": 3.6502649435634006e-06, "loss": 0.81311238, "num_input_tokens_seen": 68634550, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.98046875, "step": 3180, "time_per_iteration": 4.002766370773315 }, { "auxiliary_loss_clip": 0.01163393, "auxiliary_loss_mlp": 0.01041867, "balance_loss_clip": 1.0237112, "balance_loss_mlp": 1.04820287, "epoch": 0.19125206673681047, "flos": 19026623018880.0, "grad_norm": 2.1200966171929436, "language_loss": 0.78981483, "learning_rate": 3.6500514674394634e-06, "loss": 0.81186742, "num_input_tokens_seen": 68651895, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.96875, "step": 3181, "time_per_iteration": 2.6684730052948 }, { "auxiliary_loss_clip": 0.01156899, "auxiliary_loss_mlp": 0.01048583, "balance_loss_clip": 1.03084457, "balance_loss_mlp": 1.04895163, "epoch": 0.19131218998947844, "flos": 21690153256320.0, "grad_norm": 1.8204024071255305, "language_loss": 0.73464096, "learning_rate": 3.649837932429434e-06, "loss": 0.75669575, "num_input_tokens_seen": 68671500, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.98828125, "step": 3182, "time_per_iteration": 2.6824493408203125 }, { "auxiliary_loss_clip": 0.01176863, "auxiliary_loss_mlp": 0.01043963, "balance_loss_clip": 1.02617669, "balance_loss_mlp": 1.05202258, "epoch": 0.1913723132421464, "flos": 18442500048000.0, "grad_norm": 2.2398785656304265, "language_loss": 0.64601278, "learning_rate": 3.649624338540933e-06, "loss": 0.668221, "num_input_tokens_seen": 68690570, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9765625, "step": 3183, "time_per_iteration": 2.7628748416900635 }, { "auxiliary_loss_clip": 0.01165456, "auxiliary_loss_mlp": 0.01046363, "balance_loss_clip": 1.02768266, "balance_loss_mlp": 1.04944003, "epoch": 0.19143243649481437, "flos": 27160102033920.0, "grad_norm": 1.6534773290719003, "language_loss": 0.73046637, "learning_rate": 3.649410685781582e-06, "loss": 0.75258452, "num_input_tokens_seen": 68709735, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.98046875, "step": 3184, "time_per_iteration": 2.6434314250946045 }, { "auxiliary_loss_clip": 0.01168837, "auxiliary_loss_mlp": 0.0104642, "balance_loss_clip": 1.02654815, "balance_loss_mlp": 1.0510931, "epoch": 0.19149255974748233, "flos": 21718952985600.0, "grad_norm": 1.86579633473349, "language_loss": 0.8765918, "learning_rate": 3.6491969741590075e-06, "loss": 0.8987444, "num_input_tokens_seen": 68727565, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.99609375, "step": 3185, "time_per_iteration": 2.6096041202545166 }, { "auxiliary_loss_clip": 0.01145353, "auxiliary_loss_mlp": 0.01041801, "balance_loss_clip": 1.0236218, "balance_loss_mlp": 1.04924774, "epoch": 0.1915526830001503, "flos": 22963293889920.0, "grad_norm": 2.1882876693990587, "language_loss": 0.72706145, "learning_rate": 3.648983203680834e-06, "loss": 0.74893302, "num_input_tokens_seen": 68748110, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9609375, "step": 3186, "time_per_iteration": 2.550504446029663 }, { "auxiliary_loss_clip": 0.01171822, "auxiliary_loss_mlp": 0.01044288, "balance_loss_clip": 1.02392709, "balance_loss_mlp": 1.05330968, "epoch": 0.1916128062528183, "flos": 26140741966080.0, "grad_norm": 1.793228528260403, "language_loss": 0.83307242, "learning_rate": 3.6487693743546927e-06, "loss": 0.85523349, "num_input_tokens_seen": 68769765, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0, "step": 3187, "time_per_iteration": 2.6226065158843994 }, { "auxiliary_loss_clip": 0.01074694, "auxiliary_loss_mlp": 0.01003855, "balance_loss_clip": 1.00150681, "balance_loss_mlp": 1.03115153, "epoch": 0.19167292950548626, "flos": 54925767457920.0, "grad_norm": 0.8510824156127985, "language_loss": 0.55807793, "learning_rate": 3.648555486188213e-06, "loss": 0.57886338, "num_input_tokens_seen": 68826815, "router_z_loss_clip": 0.0234375, "router_z_loss_mlp": 0.34375, "step": 3188, "time_per_iteration": 3.1467669010162354 }, { "auxiliary_loss_clip": 0.01158459, "auxiliary_loss_mlp": 0.01054983, "balance_loss_clip": 1.03671968, "balance_loss_mlp": 1.05158019, "epoch": 0.19173305275815422, "flos": 29935601942400.0, "grad_norm": 1.666358285185559, "language_loss": 0.6990338, "learning_rate": 3.648341539189029e-06, "loss": 0.72116816, "num_input_tokens_seen": 68847585, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.9765625, "step": 3189, "time_per_iteration": 2.6221842765808105 }, { "auxiliary_loss_clip": 0.01159981, "auxiliary_loss_mlp": 0.01042293, "balance_loss_clip": 1.02535331, "balance_loss_mlp": 1.04772425, "epoch": 0.1917931760108222, "flos": 24752471264640.0, "grad_norm": 1.8370044307976985, "language_loss": 0.74235344, "learning_rate": 3.648127533364775e-06, "loss": 0.76437616, "num_input_tokens_seen": 68866620, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.94140625, "step": 3190, "time_per_iteration": 2.6211745738983154 }, { "auxiliary_loss_clip": 0.01172339, "auxiliary_loss_mlp": 0.01062542, "balance_loss_clip": 1.04474401, "balance_loss_mlp": 1.04960012, "epoch": 0.19185329926349015, "flos": 18843550375680.0, "grad_norm": 1.9615298067987752, "language_loss": 0.84038246, "learning_rate": 3.6479134687230887e-06, "loss": 0.86273128, "num_input_tokens_seen": 68885515, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.95703125, "step": 3191, "time_per_iteration": 2.5569167137145996 }, { "auxiliary_loss_clip": 0.01159842, "auxiliary_loss_mlp": 0.01046821, "balance_loss_clip": 1.03001285, "balance_loss_mlp": 1.04823399, "epoch": 0.19191342251615812, "flos": 22086858038400.0, "grad_norm": 1.9012193808751634, "language_loss": 0.8955313, "learning_rate": 3.64769934527161e-06, "loss": 0.91759789, "num_input_tokens_seen": 68903225, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9375, "step": 3192, "time_per_iteration": 2.6037113666534424 }, { "auxiliary_loss_clip": 0.01177376, "auxiliary_loss_mlp": 0.01055049, "balance_loss_clip": 1.03613019, "balance_loss_mlp": 1.0514617, "epoch": 0.19197354576882608, "flos": 22199115018240.0, "grad_norm": 1.9730448113330241, "language_loss": 0.74688959, "learning_rate": 3.64748516301798e-06, "loss": 0.76921386, "num_input_tokens_seen": 68922860, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.98828125, "step": 3193, "time_per_iteration": 2.672950506210327 }, { "auxiliary_loss_clip": 0.01166519, "auxiliary_loss_mlp": 0.01298059, "balance_loss_clip": 1.03117335, "balance_loss_mlp": 1.04928839, "epoch": 0.19203366902149407, "flos": 24896185580160.0, "grad_norm": 2.017337091957469, "language_loss": 0.7476216, "learning_rate": 3.6472709219698422e-06, "loss": 0.77226734, "num_input_tokens_seen": 68943000, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.98828125, "step": 3194, "time_per_iteration": 2.7188568115234375 }, { "auxiliary_loss_clip": 0.01069696, "auxiliary_loss_mlp": 0.01011634, "balance_loss_clip": 1.00933349, "balance_loss_mlp": 1.02728891, "epoch": 0.19209379227416204, "flos": 68416722789120.0, "grad_norm": 0.7894479443315754, "language_loss": 0.68446618, "learning_rate": 3.647056622134843e-06, "loss": 0.70527947, "num_input_tokens_seen": 69000255, "router_z_loss_clip": 0.02294922, "router_z_loss_mlp": 0.33984375, "step": 3195, "time_per_iteration": 3.1333067417144775 }, { "auxiliary_loss_clip": 0.01176606, "auxiliary_loss_mlp": 0.01051943, "balance_loss_clip": 1.03397799, "balance_loss_mlp": 1.05118299, "epoch": 0.19215391552683, "flos": 22055185221120.0, "grad_norm": 2.255395824272492, "language_loss": 0.72795743, "learning_rate": 3.6468422635206297e-06, "loss": 0.75024295, "num_input_tokens_seen": 69019665, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.98046875, "step": 3196, "time_per_iteration": 2.6575584411621094 }, { "auxiliary_loss_clip": 0.01167967, "auxiliary_loss_mlp": 0.01048006, "balance_loss_clip": 1.03017187, "balance_loss_mlp": 1.05394566, "epoch": 0.19221403877949797, "flos": 20302959962880.0, "grad_norm": 1.8194173654374992, "language_loss": 0.83485788, "learning_rate": 3.6466278461348514e-06, "loss": 0.85701764, "num_input_tokens_seen": 69039055, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9609375, "step": 3197, "time_per_iteration": 2.5321903228759766 }, { "auxiliary_loss_clip": 0.01193496, "auxiliary_loss_mlp": 0.01043825, "balance_loss_clip": 1.02539515, "balance_loss_mlp": 1.04888546, "epoch": 0.19227416203216594, "flos": 23185329811200.0, "grad_norm": 2.1138278717171124, "language_loss": 0.7980988, "learning_rate": 3.646413369985161e-06, "loss": 0.82047194, "num_input_tokens_seen": 69056370, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.9921875, "step": 3198, "time_per_iteration": 2.667381763458252 }, { "auxiliary_loss_clip": 0.01168006, "auxiliary_loss_mlp": 0.01055139, "balance_loss_clip": 1.03650641, "balance_loss_mlp": 1.05031705, "epoch": 0.1923342852848339, "flos": 25776607841280.0, "grad_norm": 1.8945850778596325, "language_loss": 0.78689456, "learning_rate": 3.6461988350792137e-06, "loss": 0.80912602, "num_input_tokens_seen": 69075915, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0, "step": 3199, "time_per_iteration": 2.574760675430298 }, { "auxiliary_loss_clip": 0.01175246, "auxiliary_loss_mlp": 0.01046076, "balance_loss_clip": 1.02913618, "balance_loss_mlp": 1.0509429, "epoch": 0.19239440853750187, "flos": 17128349061120.0, "grad_norm": 2.0909826611337525, "language_loss": 0.83933628, "learning_rate": 3.6459842414246636e-06, "loss": 0.86154956, "num_input_tokens_seen": 69094145, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.96875, "step": 3200, "time_per_iteration": 2.5521631240844727 }, { "auxiliary_loss_clip": 0.01146755, "auxiliary_loss_mlp": 0.01052132, "balance_loss_clip": 1.03466833, "balance_loss_mlp": 1.0505414, "epoch": 0.19245453179016986, "flos": 16435093593600.0, "grad_norm": 1.7964550508282051, "language_loss": 0.79017282, "learning_rate": 3.6457695890291697e-06, "loss": 0.81216168, "num_input_tokens_seen": 69111110, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9609375, "step": 3201, "time_per_iteration": 2.4968488216400146 }, { "auxiliary_loss_clip": 0.01183218, "auxiliary_loss_mlp": 0.01044014, "balance_loss_clip": 1.02678788, "balance_loss_mlp": 1.0483973, "epoch": 0.19251465504283782, "flos": 20230276792320.0, "grad_norm": 1.916578793359589, "language_loss": 0.68935555, "learning_rate": 3.645554877900393e-06, "loss": 0.71162784, "num_input_tokens_seen": 69130280, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.98828125, "step": 3202, "time_per_iteration": 2.562413215637207 }, { "auxiliary_loss_clip": 0.01161459, "auxiliary_loss_mlp": 0.01035235, "balance_loss_clip": 1.01757979, "balance_loss_mlp": 1.04844916, "epoch": 0.1925747782955058, "flos": 19464374067840.0, "grad_norm": 1.8484800601889209, "language_loss": 0.91165853, "learning_rate": 3.645340108045995e-06, "loss": 0.93362546, "num_input_tokens_seen": 69149570, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.953125, "step": 3203, "time_per_iteration": 2.53126859664917 }, { "auxiliary_loss_clip": 0.01157171, "auxiliary_loss_mlp": 0.01047103, "balance_loss_clip": 1.02830362, "balance_loss_mlp": 1.04938638, "epoch": 0.19263490154817375, "flos": 17785586165760.0, "grad_norm": 2.4272449893145485, "language_loss": 0.81797826, "learning_rate": 3.6451252794736417e-06, "loss": 0.84002107, "num_input_tokens_seen": 69168190, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.984375, "step": 3204, "time_per_iteration": 2.518137216567993 }, { "auxiliary_loss_clip": 0.01169362, "auxiliary_loss_mlp": 0.01045518, "balance_loss_clip": 1.02777934, "balance_loss_mlp": 1.04716086, "epoch": 0.19269502480084172, "flos": 17457075354240.0, "grad_norm": 2.03506740379181, "language_loss": 0.76129365, "learning_rate": 3.6449103921909983e-06, "loss": 0.78344238, "num_input_tokens_seen": 69186950, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.953125, "step": 3205, "time_per_iteration": 2.6311190128326416 }, { "auxiliary_loss_clip": 0.01166664, "auxiliary_loss_mlp": 0.01046609, "balance_loss_clip": 1.02887058, "balance_loss_mlp": 1.05015838, "epoch": 0.19275514805350968, "flos": 21506901045120.0, "grad_norm": 5.265545719768235, "language_loss": 0.83239174, "learning_rate": 3.644695446205735e-06, "loss": 0.85452449, "num_input_tokens_seen": 69204850, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.984375, "step": 3206, "time_per_iteration": 2.5938849449157715 }, { "auxiliary_loss_clip": 0.01073111, "auxiliary_loss_mlp": 0.01004995, "balance_loss_clip": 1.00239611, "balance_loss_mlp": 1.02104425, "epoch": 0.19281527130617768, "flos": 47695979738880.0, "grad_norm": 0.8315143657266363, "language_loss": 0.60592937, "learning_rate": 3.644480441525521e-06, "loss": 0.62671041, "num_input_tokens_seen": 69259200, "router_z_loss_clip": 0.02600098, "router_z_loss_mlp": 0.33789062, "step": 3207, "time_per_iteration": 3.019028425216675 }, { "auxiliary_loss_clip": 0.01173472, "auxiliary_loss_mlp": 0.01046604, "balance_loss_clip": 1.02884209, "balance_loss_mlp": 1.04909718, "epoch": 0.19287539455884564, "flos": 11801252672640.0, "grad_norm": 2.851416952809959, "language_loss": 0.75031501, "learning_rate": 3.6442653781580305e-06, "loss": 0.77251577, "num_input_tokens_seen": 69275835, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9765625, "step": 3208, "time_per_iteration": 2.599581241607666 }, { "auxiliary_loss_clip": 0.01156717, "auxiliary_loss_mlp": 0.01294753, "balance_loss_clip": 1.0273912, "balance_loss_mlp": 1.04817331, "epoch": 0.1929355178115136, "flos": 20631434860800.0, "grad_norm": 1.9676581080478452, "language_loss": 0.60328835, "learning_rate": 3.6440502561109384e-06, "loss": 0.62780309, "num_input_tokens_seen": 69294810, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.99609375, "step": 3209, "time_per_iteration": 2.6276609897613525 }, { "auxiliary_loss_clip": 0.01150524, "auxiliary_loss_mlp": 0.01050962, "balance_loss_clip": 1.03176975, "balance_loss_mlp": 1.05039883, "epoch": 0.19299564106418157, "flos": 40807916058240.0, "grad_norm": 5.314100584813105, "language_loss": 0.79791641, "learning_rate": 3.6438350753919213e-06, "loss": 0.81993127, "num_input_tokens_seen": 69316065, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0, "step": 3210, "time_per_iteration": 2.7543327808380127 }, { "auxiliary_loss_clip": 0.01167056, "auxiliary_loss_mlp": 0.0104173, "balance_loss_clip": 1.02465892, "balance_loss_mlp": 1.04591238, "epoch": 0.19305576431684954, "flos": 11361418634880.0, "grad_norm": 2.616287392067373, "language_loss": 0.82528877, "learning_rate": 3.643619836008659e-06, "loss": 0.8473767, "num_input_tokens_seen": 69332900, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9453125, "step": 3211, "time_per_iteration": 2.654395818710327 }, { "auxiliary_loss_clip": 0.01055659, "auxiliary_loss_mlp": 0.01000234, "balance_loss_clip": 0.99744433, "balance_loss_mlp": 1.0223254, "epoch": 0.1931158875695175, "flos": 54511895975040.0, "grad_norm": 0.9724656668238553, "language_loss": 0.6371482, "learning_rate": 3.6434045379688324e-06, "loss": 0.6577071, "num_input_tokens_seen": 69382535, "router_z_loss_clip": 0.0279541, "router_z_loss_mlp": 0.33398438, "step": 3212, "time_per_iteration": 3.0261542797088623 }, { "auxiliary_loss_clip": 0.01164403, "auxiliary_loss_mlp": 0.01044558, "balance_loss_clip": 1.02759445, "balance_loss_mlp": 1.0508759, "epoch": 0.19317601082218547, "flos": 19828436365440.0, "grad_norm": 1.6690719372206544, "language_loss": 0.7626996, "learning_rate": 3.6431891812801254e-06, "loss": 0.7847892, "num_input_tokens_seen": 69400600, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.953125, "step": 3213, "time_per_iteration": 2.64717698097229 }, { "auxiliary_loss_clip": 0.01175642, "auxiliary_loss_mlp": 0.01048303, "balance_loss_clip": 1.02955103, "balance_loss_mlp": 1.04960823, "epoch": 0.19323613407485346, "flos": 13152068467200.0, "grad_norm": 2.473890433887327, "language_loss": 0.7118178, "learning_rate": 3.6429737659502237e-06, "loss": 0.73405725, "num_input_tokens_seen": 69417350, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.9921875, "step": 3214, "time_per_iteration": 2.584515333175659 }, { "auxiliary_loss_clip": 0.01143293, "auxiliary_loss_mlp": 0.01047963, "balance_loss_clip": 1.02908075, "balance_loss_mlp": 1.04659581, "epoch": 0.19329625732752143, "flos": 14027247342720.0, "grad_norm": 2.2194911201954546, "language_loss": 0.75619143, "learning_rate": 3.642758291986814e-06, "loss": 0.77810395, "num_input_tokens_seen": 69431845, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.96875, "step": 3215, "time_per_iteration": 2.5865554809570312 }, { "auxiliary_loss_clip": 0.01168267, "auxiliary_loss_mlp": 0.01048599, "balance_loss_clip": 1.03204119, "balance_loss_mlp": 1.0464834, "epoch": 0.1933563805801894, "flos": 23441732069760.0, "grad_norm": 1.7292165199155345, "language_loss": 0.88439822, "learning_rate": 3.642542759397587e-06, "loss": 0.90656686, "num_input_tokens_seen": 69453275, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.953125, "step": 3216, "time_per_iteration": 4.021416902542114 }, { "auxiliary_loss_clip": 0.01142114, "auxiliary_loss_mlp": 0.01051289, "balance_loss_clip": 1.03463578, "balance_loss_mlp": 1.04720926, "epoch": 0.19341650383285736, "flos": 20485314334080.0, "grad_norm": 4.9062602562128825, "language_loss": 0.79697645, "learning_rate": 3.6423271681902336e-06, "loss": 0.81891048, "num_input_tokens_seen": 69471830, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.94921875, "step": 3217, "time_per_iteration": 3.993570327758789 }, { "auxiliary_loss_clip": 0.0116886, "auxiliary_loss_mlp": 0.01052193, "balance_loss_clip": 1.03243971, "balance_loss_mlp": 1.04996324, "epoch": 0.19347662708552532, "flos": 17858484817920.0, "grad_norm": 2.4528979828782536, "language_loss": 0.61224622, "learning_rate": 3.642111518372448e-06, "loss": 0.63445675, "num_input_tokens_seen": 69489320, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0078125, "step": 3218, "time_per_iteration": 4.084106206893921 }, { "auxiliary_loss_clip": 0.01146668, "auxiliary_loss_mlp": 0.01046284, "balance_loss_clip": 1.02898622, "balance_loss_mlp": 1.04975176, "epoch": 0.1935367503381933, "flos": 18187247024640.0, "grad_norm": 1.7342998609283882, "language_loss": 0.80145758, "learning_rate": 3.6418958099519267e-06, "loss": 0.82338715, "num_input_tokens_seen": 69506665, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.96875, "step": 3219, "time_per_iteration": 2.644292116165161 }, { "auxiliary_loss_clip": 0.01182278, "auxiliary_loss_mlp": 0.0104669, "balance_loss_clip": 1.02797437, "balance_loss_mlp": 1.04925871, "epoch": 0.19359687359086128, "flos": 15957122290560.0, "grad_norm": 2.1395306743947344, "language_loss": 0.85723042, "learning_rate": 3.6416800429363674e-06, "loss": 0.87952006, "num_input_tokens_seen": 69523835, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.96875, "step": 3220, "time_per_iteration": 2.621399164199829 }, { "auxiliary_loss_clip": 0.0114009, "auxiliary_loss_mlp": 0.01037781, "balance_loss_clip": 1.02163374, "balance_loss_mlp": 1.04883003, "epoch": 0.19365699684352924, "flos": 21215198695680.0, "grad_norm": 2.5361471395047315, "language_loss": 0.83872181, "learning_rate": 3.6414642173334704e-06, "loss": 0.86050051, "num_input_tokens_seen": 69542620, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.9140625, "step": 3221, "time_per_iteration": 2.5169029235839844 }, { "auxiliary_loss_clip": 0.01151592, "auxiliary_loss_mlp": 0.01047397, "balance_loss_clip": 1.03117299, "balance_loss_mlp": 1.0491817, "epoch": 0.1937171200961972, "flos": 17311098481920.0, "grad_norm": 2.0354733252467327, "language_loss": 0.86181098, "learning_rate": 3.6412483331509373e-06, "loss": 0.88380086, "num_input_tokens_seen": 69561130, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.93359375, "step": 3222, "time_per_iteration": 4.090735197067261 }, { "auxiliary_loss_clip": 0.01197298, "auxiliary_loss_mlp": 0.0104057, "balance_loss_clip": 1.02314186, "balance_loss_mlp": 1.04609394, "epoch": 0.19377724334886517, "flos": 22635968227200.0, "grad_norm": 1.6609709384724065, "language_loss": 0.78212237, "learning_rate": 3.641032390396473e-06, "loss": 0.80450112, "num_input_tokens_seen": 69580425, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.96875, "step": 3223, "time_per_iteration": 2.6279773712158203 }, { "auxiliary_loss_clip": 0.01151669, "auxiliary_loss_mlp": 0.01043242, "balance_loss_clip": 1.02593327, "balance_loss_mlp": 1.04848576, "epoch": 0.19383736660153314, "flos": 15077813351040.0, "grad_norm": 2.043256441185696, "language_loss": 0.74945021, "learning_rate": 3.6408163890777843e-06, "loss": 0.77139932, "num_input_tokens_seen": 69597085, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.94140625, "step": 3224, "time_per_iteration": 2.5730912685394287 }, { "auxiliary_loss_clip": 0.0116955, "auxiliary_loss_mlp": 0.01040759, "balance_loss_clip": 1.02324724, "balance_loss_mlp": 1.04811263, "epoch": 0.1938974898542011, "flos": 47119934350080.0, "grad_norm": 2.7051592605243093, "language_loss": 0.70293093, "learning_rate": 3.640600329202579e-06, "loss": 0.725034, "num_input_tokens_seen": 69618885, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.94140625, "step": 3225, "time_per_iteration": 2.8675739765167236 }, { "auxiliary_loss_clip": 0.01170437, "auxiliary_loss_mlp": 0.01049626, "balance_loss_clip": 1.03217375, "balance_loss_mlp": 1.0464077, "epoch": 0.19395761310686907, "flos": 25812554376960.0, "grad_norm": 2.2291467742767392, "language_loss": 0.69185203, "learning_rate": 3.6403842107785686e-06, "loss": 0.71405268, "num_input_tokens_seen": 69638200, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.96875, "step": 3226, "time_per_iteration": 2.699122190475464 }, { "auxiliary_loss_clip": 0.01170245, "auxiliary_loss_mlp": 0.01042874, "balance_loss_clip": 1.02455115, "balance_loss_mlp": 1.04993486, "epoch": 0.19401773635953706, "flos": 23039604334080.0, "grad_norm": 1.7731390320089666, "language_loss": 0.7574386, "learning_rate": 3.6401680338134653e-06, "loss": 0.7795698, "num_input_tokens_seen": 69657550, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.93359375, "step": 3227, "time_per_iteration": 2.645134449005127 }, { "auxiliary_loss_clip": 0.01171583, "auxiliary_loss_mlp": 0.01041924, "balance_loss_clip": 1.02485371, "balance_loss_mlp": 1.0480299, "epoch": 0.19407785961220503, "flos": 15920780705280.0, "grad_norm": 1.8003793000425243, "language_loss": 0.69319332, "learning_rate": 3.6399517983149838e-06, "loss": 0.71532845, "num_input_tokens_seen": 69675005, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9609375, "step": 3228, "time_per_iteration": 2.5550732612609863 }, { "auxiliary_loss_clip": 0.01153838, "auxiliary_loss_mlp": 0.01042238, "balance_loss_clip": 1.02492929, "balance_loss_mlp": 1.04999959, "epoch": 0.194137982864873, "flos": 25921722787200.0, "grad_norm": 2.265031741311624, "language_loss": 0.74428415, "learning_rate": 3.6397355042908407e-06, "loss": 0.76624495, "num_input_tokens_seen": 69696455, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.94921875, "step": 3229, "time_per_iteration": 2.6652560234069824 }, { "auxiliary_loss_clip": 0.01168676, "auxiliary_loss_mlp": 0.01043097, "balance_loss_clip": 1.02584743, "balance_loss_mlp": 1.04669034, "epoch": 0.19419810611754096, "flos": 13261344618240.0, "grad_norm": 1.7262354837520275, "language_loss": 0.6571846, "learning_rate": 3.6395191517487557e-06, "loss": 0.67930233, "num_input_tokens_seen": 69714245, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.94921875, "step": 3230, "time_per_iteration": 2.546743631362915 }, { "auxiliary_loss_clip": 0.01155265, "auxiliary_loss_mlp": 0.01289246, "balance_loss_clip": 1.02352524, "balance_loss_mlp": 1.04459691, "epoch": 0.19425822937020892, "flos": 15705568368000.0, "grad_norm": 2.0964654894076524, "language_loss": 0.82038116, "learning_rate": 3.6393027406964494e-06, "loss": 0.84482634, "num_input_tokens_seen": 69731515, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9296875, "step": 3231, "time_per_iteration": 2.558840751647949 }, { "auxiliary_loss_clip": 0.01164173, "auxiliary_loss_mlp": 0.01037586, "balance_loss_clip": 1.01838183, "balance_loss_mlp": 1.049348, "epoch": 0.1943183526228769, "flos": 23105392093440.0, "grad_norm": 2.8982020807987574, "language_loss": 0.86681652, "learning_rate": 3.639086271141645e-06, "loss": 0.88883412, "num_input_tokens_seen": 69748885, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.96484375, "step": 3232, "time_per_iteration": 2.557088851928711 }, { "auxiliary_loss_clip": 0.01180157, "auxiliary_loss_mlp": 0.01048554, "balance_loss_clip": 1.03048241, "balance_loss_mlp": 1.0493716, "epoch": 0.19437847587554485, "flos": 24712610146560.0, "grad_norm": 1.8106935835294324, "language_loss": 0.85221601, "learning_rate": 3.6388697430920674e-06, "loss": 0.87450308, "num_input_tokens_seen": 69767540, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.94140625, "step": 3233, "time_per_iteration": 2.7274718284606934 }, { "auxiliary_loss_clip": 0.01156524, "auxiliary_loss_mlp": 0.01047959, "balance_loss_clip": 1.03010178, "balance_loss_mlp": 1.04893351, "epoch": 0.19443859912821285, "flos": 23116130259840.0, "grad_norm": 1.7884279390801547, "language_loss": 0.89415258, "learning_rate": 3.638653156555445e-06, "loss": 0.91619742, "num_input_tokens_seen": 69789340, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.984375, "step": 3234, "time_per_iteration": 2.581028699874878 }, { "auxiliary_loss_clip": 0.01153471, "auxiliary_loss_mlp": 0.01038733, "balance_loss_clip": 1.02069628, "balance_loss_mlp": 1.04505301, "epoch": 0.1944987223808808, "flos": 15084385539840.0, "grad_norm": 3.2057086928418563, "language_loss": 0.78234035, "learning_rate": 3.638436511539507e-06, "loss": 0.8042624, "num_input_tokens_seen": 69806470, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9921875, "step": 3235, "time_per_iteration": 2.515841484069824 }, { "auxiliary_loss_clip": 0.0116243, "auxiliary_loss_mlp": 0.01041842, "balance_loss_clip": 1.02514124, "balance_loss_mlp": 1.05003858, "epoch": 0.19455884563354878, "flos": 17126876603520.0, "grad_norm": 2.1135625817489894, "language_loss": 0.79300648, "learning_rate": 3.6382198080519833e-06, "loss": 0.81504923, "num_input_tokens_seen": 69822655, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9453125, "step": 3236, "time_per_iteration": 2.5226314067840576 }, { "auxiliary_loss_clip": 0.01143986, "auxiliary_loss_mlp": 0.01041723, "balance_loss_clip": 1.02340102, "balance_loss_mlp": 1.04946375, "epoch": 0.19461896888621674, "flos": 20193396503040.0, "grad_norm": 1.7184295886655767, "language_loss": 0.75813007, "learning_rate": 3.6380030461006093e-06, "loss": 0.77998722, "num_input_tokens_seen": 69841895, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9453125, "step": 3237, "time_per_iteration": 2.585827350616455 }, { "auxiliary_loss_clip": 0.01148188, "auxiliary_loss_mlp": 0.01049216, "balance_loss_clip": 1.03194237, "balance_loss_mlp": 1.048352, "epoch": 0.1946790921388847, "flos": 25301365971840.0, "grad_norm": 1.8829218655636335, "language_loss": 0.7504108, "learning_rate": 3.6377862256931203e-06, "loss": 0.77238482, "num_input_tokens_seen": 69862220, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 1.0, "step": 3238, "time_per_iteration": 2.619037628173828 }, { "auxiliary_loss_clip": 0.01181404, "auxiliary_loss_mlp": 0.01040884, "balance_loss_clip": 1.02250242, "balance_loss_mlp": 1.04867792, "epoch": 0.19473921539155267, "flos": 20193396503040.0, "grad_norm": 1.8170986882309865, "language_loss": 0.73164487, "learning_rate": 3.637569346837253e-06, "loss": 0.75386775, "num_input_tokens_seen": 69881830, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.96875, "step": 3239, "time_per_iteration": 2.6946375370025635 }, { "auxiliary_loss_clip": 0.01160868, "auxiliary_loss_mlp": 0.01045923, "balance_loss_clip": 1.02841139, "balance_loss_mlp": 1.04768336, "epoch": 0.19479933864422067, "flos": 20887549810560.0, "grad_norm": 1.9400089037733446, "language_loss": 0.73529786, "learning_rate": 3.6373524095407485e-06, "loss": 0.75736582, "num_input_tokens_seen": 69900515, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.94921875, "step": 3240, "time_per_iteration": 2.657167434692383 }, { "auxiliary_loss_clip": 0.01161902, "auxiliary_loss_mlp": 0.0103821, "balance_loss_clip": 1.02102041, "balance_loss_mlp": 1.04933047, "epoch": 0.19485946189688863, "flos": 23295072839040.0, "grad_norm": 2.0230704546895697, "language_loss": 0.66379774, "learning_rate": 3.637135413811348e-06, "loss": 0.68579876, "num_input_tokens_seen": 69920060, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.94921875, "step": 3241, "time_per_iteration": 2.6417033672332764 }, { "auxiliary_loss_clip": 0.01161483, "auxiliary_loss_mlp": 0.01045535, "balance_loss_clip": 1.02863073, "balance_loss_mlp": 1.04907274, "epoch": 0.1949195851495566, "flos": 23295036925440.0, "grad_norm": 4.025950627475602, "language_loss": 0.82717884, "learning_rate": 3.636918359656796e-06, "loss": 0.84924901, "num_input_tokens_seen": 69939820, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.94140625, "step": 3242, "time_per_iteration": 2.677799940109253 }, { "auxiliary_loss_clip": 0.01063982, "auxiliary_loss_mlp": 0.01051791, "balance_loss_clip": 1.04886997, "balance_loss_mlp": 1.03054762, "epoch": 0.19497970840222456, "flos": 64962871557120.0, "grad_norm": 0.8243627259459744, "language_loss": 0.5745852, "learning_rate": 3.636701247084839e-06, "loss": 0.59574294, "num_input_tokens_seen": 70002145, "router_z_loss_clip": 0.0291748, "router_z_loss_mlp": 0.3359375, "step": 3243, "time_per_iteration": 3.1495981216430664 }, { "auxiliary_loss_clip": 0.01152434, "auxiliary_loss_mlp": 0.01039427, "balance_loss_clip": 1.02153373, "balance_loss_mlp": 1.04787314, "epoch": 0.19503983165489253, "flos": 19644717277440.0, "grad_norm": 1.7587859017303906, "language_loss": 0.83343494, "learning_rate": 3.6364840761032238e-06, "loss": 0.85535359, "num_input_tokens_seen": 70020510, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.953125, "step": 3244, "time_per_iteration": 2.5444719791412354 }, { "auxiliary_loss_clip": 0.01153379, "auxiliary_loss_mlp": 0.01044647, "balance_loss_clip": 1.02786219, "balance_loss_mlp": 1.05045152, "epoch": 0.1950999549075605, "flos": 21141976821120.0, "grad_norm": 1.5698986276908158, "language_loss": 0.77267092, "learning_rate": 3.6362668467197015e-06, "loss": 0.79465121, "num_input_tokens_seen": 70040760, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.94140625, "step": 3245, "time_per_iteration": 2.608743906021118 }, { "auxiliary_loss_clip": 0.01154322, "auxiliary_loss_mlp": 0.01040011, "balance_loss_clip": 1.02205777, "balance_loss_mlp": 1.04892933, "epoch": 0.19516007816022846, "flos": 20884820376960.0, "grad_norm": 2.051386269740221, "language_loss": 0.8388449, "learning_rate": 3.6360495589420247e-06, "loss": 0.86078817, "num_input_tokens_seen": 70058720, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.96484375, "step": 3246, "time_per_iteration": 2.5377023220062256 }, { "auxiliary_loss_clip": 0.01156426, "auxiliary_loss_mlp": 0.01294981, "balance_loss_clip": 1.02970493, "balance_loss_mlp": 1.04997373, "epoch": 0.19522020141289645, "flos": 16910515031040.0, "grad_norm": 2.127487281350298, "language_loss": 0.7617197, "learning_rate": 3.6358322127779476e-06, "loss": 0.78623378, "num_input_tokens_seen": 70076470, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9765625, "step": 3247, "time_per_iteration": 2.5337464809417725 }, { "auxiliary_loss_clip": 0.01174777, "auxiliary_loss_mlp": 0.01041501, "balance_loss_clip": 1.02471662, "balance_loss_mlp": 1.05086684, "epoch": 0.19528032466556441, "flos": 26724829023360.0, "grad_norm": 1.6125036353733435, "language_loss": 0.75176394, "learning_rate": 3.6356148082352265e-06, "loss": 0.77392673, "num_input_tokens_seen": 70096220, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.96875, "step": 3248, "time_per_iteration": 2.629897356033325 }, { "auxiliary_loss_clip": 0.01142844, "auxiliary_loss_mlp": 0.01050185, "balance_loss_clip": 1.03304231, "balance_loss_mlp": 1.04692435, "epoch": 0.19534044791823238, "flos": 21032808410880.0, "grad_norm": 2.3279734217058246, "language_loss": 0.78399694, "learning_rate": 3.63539734532162e-06, "loss": 0.80592728, "num_input_tokens_seen": 70114800, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9609375, "step": 3249, "time_per_iteration": 2.5627312660217285 }, { "auxiliary_loss_clip": 0.01155982, "auxiliary_loss_mlp": 0.01049959, "balance_loss_clip": 1.03231657, "balance_loss_mlp": 1.05024981, "epoch": 0.19540057117090034, "flos": 22344050396160.0, "grad_norm": 1.4812104930954815, "language_loss": 0.72888821, "learning_rate": 3.6351798240448894e-06, "loss": 0.75094759, "num_input_tokens_seen": 70134930, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.96875, "step": 3250, "time_per_iteration": 2.5653786659240723 }, { "auxiliary_loss_clip": 0.01177566, "auxiliary_loss_mlp": 0.01039978, "balance_loss_clip": 1.02403951, "balance_loss_mlp": 1.04817653, "epoch": 0.1954606944235683, "flos": 20301631159680.0, "grad_norm": 2.0964393669197188, "language_loss": 0.80054003, "learning_rate": 3.634962244412797e-06, "loss": 0.82271552, "num_input_tokens_seen": 70152045, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.9296875, "step": 3251, "time_per_iteration": 2.5847718715667725 }, { "auxiliary_loss_clip": 0.01146215, "auxiliary_loss_mlp": 0.01046159, "balance_loss_clip": 1.02827811, "balance_loss_mlp": 1.04922462, "epoch": 0.19552081767623627, "flos": 17346865449600.0, "grad_norm": 2.210701709671215, "language_loss": 0.83217943, "learning_rate": 3.6347446064331074e-06, "loss": 0.85410315, "num_input_tokens_seen": 70169240, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.97265625, "step": 3252, "time_per_iteration": 2.515340805053711 }, { "auxiliary_loss_clip": 0.01158536, "auxiliary_loss_mlp": 0.01043169, "balance_loss_clip": 1.02452517, "balance_loss_mlp": 1.04922032, "epoch": 0.19558094092890424, "flos": 31977626129280.0, "grad_norm": 1.667744565597729, "language_loss": 0.7384569, "learning_rate": 3.6345269101135885e-06, "loss": 0.76047397, "num_input_tokens_seen": 70192690, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0, "step": 3253, "time_per_iteration": 2.610570192337036 }, { "auxiliary_loss_clip": 0.01184038, "auxiliary_loss_mlp": 0.01044078, "balance_loss_clip": 1.02611327, "balance_loss_mlp": 1.04793978, "epoch": 0.19564106418157223, "flos": 22268889187200.0, "grad_norm": 2.0873344321996097, "language_loss": 0.76274276, "learning_rate": 3.634309155462008e-06, "loss": 0.78502393, "num_input_tokens_seen": 70209685, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0, "step": 3254, "time_per_iteration": 2.6054282188415527 }, { "auxiliary_loss_clip": 0.01097686, "auxiliary_loss_mlp": 0.01020395, "balance_loss_clip": 1.01740265, "balance_loss_mlp": 1.02834511, "epoch": 0.1957011874342402, "flos": 54364554385920.0, "grad_norm": 0.7639228680829951, "language_loss": 0.55323225, "learning_rate": 3.6340913424861383e-06, "loss": 0.57441306, "num_input_tokens_seen": 70265050, "router_z_loss_clip": 0.02990723, "router_z_loss_mlp": 0.33203125, "step": 3255, "time_per_iteration": 3.1571784019470215 }, { "auxiliary_loss_clip": 0.0116612, "auxiliary_loss_mlp": 0.01041763, "balance_loss_clip": 1.02421582, "balance_loss_mlp": 1.05102777, "epoch": 0.19576131068690816, "flos": 16506699356160.0, "grad_norm": 3.1832844856190126, "language_loss": 0.70442921, "learning_rate": 3.6338734711937512e-06, "loss": 0.72650802, "num_input_tokens_seen": 70281830, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.97265625, "step": 3256, "time_per_iteration": 2.670612096786499 }, { "auxiliary_loss_clip": 0.01142121, "auxiliary_loss_mlp": 0.0104002, "balance_loss_clip": 1.02364039, "balance_loss_mlp": 1.04741526, "epoch": 0.19582143393957613, "flos": 14719676797440.0, "grad_norm": 2.1274057657239367, "language_loss": 0.80167663, "learning_rate": 3.6336555415926232e-06, "loss": 0.82349801, "num_input_tokens_seen": 70297420, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9453125, "step": 3257, "time_per_iteration": 3.8946824073791504 }, { "auxiliary_loss_clip": 0.01181304, "auxiliary_loss_mlp": 0.01042932, "balance_loss_clip": 1.02603984, "balance_loss_mlp": 1.05022216, "epoch": 0.1958815571922441, "flos": 24425504737920.0, "grad_norm": 1.7414539794924537, "language_loss": 0.74910134, "learning_rate": 3.6334375536905313e-06, "loss": 0.77134371, "num_input_tokens_seen": 70319210, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.94921875, "step": 3258, "time_per_iteration": 2.6654446125030518 }, { "auxiliary_loss_clip": 0.01192915, "auxiliary_loss_mlp": 0.01041307, "balance_loss_clip": 1.02342534, "balance_loss_mlp": 1.04920387, "epoch": 0.19594168044491206, "flos": 24900279730560.0, "grad_norm": 2.0207851291082206, "language_loss": 0.73666489, "learning_rate": 3.633219507495255e-06, "loss": 0.7590071, "num_input_tokens_seen": 70339045, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.98046875, "step": 3259, "time_per_iteration": 4.273437261581421 }, { "auxiliary_loss_clip": 0.01167256, "auxiliary_loss_mlp": 0.01043213, "balance_loss_clip": 1.02495062, "balance_loss_mlp": 1.05059981, "epoch": 0.19600180369758005, "flos": 12057008486400.0, "grad_norm": 2.3336708691906667, "language_loss": 0.76786143, "learning_rate": 3.633001403014575e-06, "loss": 0.78996611, "num_input_tokens_seen": 70356505, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.984375, "step": 3260, "time_per_iteration": 2.5279388427734375 }, { "auxiliary_loss_clip": 0.0115452, "auxiliary_loss_mlp": 0.01049372, "balance_loss_clip": 1.03170514, "balance_loss_mlp": 1.04825616, "epoch": 0.19606192695024802, "flos": 20850202644480.0, "grad_norm": 2.3252168505459956, "language_loss": 0.82303077, "learning_rate": 3.632783240256276e-06, "loss": 0.84506965, "num_input_tokens_seen": 70375410, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.96875, "step": 3261, "time_per_iteration": 2.632995367050171 }, { "auxiliary_loss_clip": 0.01172502, "auxiliary_loss_mlp": 0.01045718, "balance_loss_clip": 1.02766991, "balance_loss_mlp": 1.05001056, "epoch": 0.19612205020291598, "flos": 28475509996800.0, "grad_norm": 5.686300481857154, "language_loss": 0.76194525, "learning_rate": 3.632565019228143e-06, "loss": 0.78412747, "num_input_tokens_seen": 70396315, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.953125, "step": 3262, "time_per_iteration": 2.6802055835723877 }, { "auxiliary_loss_clip": 0.01155096, "auxiliary_loss_mlp": 0.01059203, "balance_loss_clip": 1.04241824, "balance_loss_mlp": 1.05187798, "epoch": 0.19618217345558395, "flos": 25556618995200.0, "grad_norm": 1.6653351111989487, "language_loss": 0.86352682, "learning_rate": 3.6323467399379634e-06, "loss": 0.88566983, "num_input_tokens_seen": 70417945, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.94140625, "step": 3263, "time_per_iteration": 4.045724391937256 }, { "auxiliary_loss_clip": 0.01161556, "auxiliary_loss_mlp": 0.01051402, "balance_loss_clip": 1.03478456, "balance_loss_mlp": 1.04868126, "epoch": 0.1962422967082519, "flos": 25264413855360.0, "grad_norm": 1.7774376407230272, "language_loss": 0.74087286, "learning_rate": 3.6321284023935284e-06, "loss": 0.76300251, "num_input_tokens_seen": 70438690, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.953125, "step": 3264, "time_per_iteration": 2.6481146812438965 }, { "auxiliary_loss_clip": 0.01152541, "auxiliary_loss_mlp": 0.01056569, "balance_loss_clip": 1.04073763, "balance_loss_mlp": 1.05127525, "epoch": 0.19630241996091988, "flos": 18807352444800.0, "grad_norm": 1.8917916369402243, "language_loss": 0.78365439, "learning_rate": 3.6319100066026284e-06, "loss": 0.80574548, "num_input_tokens_seen": 70455385, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.921875, "step": 3265, "time_per_iteration": 2.629271984100342 }, { "auxiliary_loss_clip": 0.01102371, "auxiliary_loss_mlp": 0.01064948, "balance_loss_clip": 1.06246805, "balance_loss_mlp": 1.03040457, "epoch": 0.19636254321358784, "flos": 62321137896960.0, "grad_norm": 0.795921509918429, "language_loss": 0.53430462, "learning_rate": 3.6316915525730586e-06, "loss": 0.55597782, "num_input_tokens_seen": 70514280, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.36328125, "step": 3266, "time_per_iteration": 3.374239921569824 }, { "auxiliary_loss_clip": 0.01163878, "auxiliary_loss_mlp": 0.01293417, "balance_loss_clip": 1.02719343, "balance_loss_mlp": 1.04673672, "epoch": 0.19642266646625584, "flos": 21069329564160.0, "grad_norm": 2.38286961423323, "language_loss": 0.7966187, "learning_rate": 3.631473040312614e-06, "loss": 0.82119167, "num_input_tokens_seen": 70531800, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.98828125, "step": 3267, "time_per_iteration": 2.646862268447876 }, { "auxiliary_loss_clip": 0.01156561, "auxiliary_loss_mlp": 0.01041239, "balance_loss_clip": 1.02528918, "balance_loss_mlp": 1.0466311, "epoch": 0.1964827897189238, "flos": 14538651229440.0, "grad_norm": 2.0680351494212426, "language_loss": 0.86331546, "learning_rate": 3.631254469829094e-06, "loss": 0.88529348, "num_input_tokens_seen": 70550615, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.921875, "step": 3268, "time_per_iteration": 2.6302053928375244 }, { "auxiliary_loss_clip": 0.01179001, "auxiliary_loss_mlp": 0.01038455, "balance_loss_clip": 1.02201629, "balance_loss_mlp": 1.04965174, "epoch": 0.19654291297159177, "flos": 19244636616960.0, "grad_norm": 1.924197418590529, "language_loss": 0.68584454, "learning_rate": 3.631035841130297e-06, "loss": 0.70801914, "num_input_tokens_seen": 70568690, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9296875, "step": 3269, "time_per_iteration": 2.6737253665924072 }, { "auxiliary_loss_clip": 0.01174429, "auxiliary_loss_mlp": 0.01051103, "balance_loss_clip": 1.03320968, "balance_loss_mlp": 1.05110419, "epoch": 0.19660303622425973, "flos": 25775710001280.0, "grad_norm": 5.013508532131185, "language_loss": 0.80481148, "learning_rate": 3.6308171542240273e-06, "loss": 0.82706678, "num_input_tokens_seen": 70588665, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9609375, "step": 3270, "time_per_iteration": 2.683760643005371 }, { "auxiliary_loss_clip": 0.01148762, "auxiliary_loss_mlp": 0.01044051, "balance_loss_clip": 1.02780306, "balance_loss_mlp": 1.04583931, "epoch": 0.1966631594769277, "flos": 20595093275520.0, "grad_norm": 2.26028103272838, "language_loss": 0.84079564, "learning_rate": 3.6305984091180875e-06, "loss": 0.86272377, "num_input_tokens_seen": 70606900, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.9375, "step": 3271, "time_per_iteration": 2.549954891204834 }, { "auxiliary_loss_clip": 0.01146824, "auxiliary_loss_mlp": 0.01047678, "balance_loss_clip": 1.03114939, "balance_loss_mlp": 1.04741657, "epoch": 0.19672328272959566, "flos": 23623188600960.0, "grad_norm": 2.020116540745862, "language_loss": 0.80247897, "learning_rate": 3.630379605820286e-06, "loss": 0.82442403, "num_input_tokens_seen": 70625955, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.90625, "step": 3272, "time_per_iteration": 2.559497833251953 }, { "auxiliary_loss_clip": 0.01161002, "auxiliary_loss_mlp": 0.01043851, "balance_loss_clip": 1.02596974, "balance_loss_mlp": 1.04782152, "epoch": 0.19678340598226365, "flos": 23110922787840.0, "grad_norm": 1.8409090974851898, "language_loss": 0.80569458, "learning_rate": 3.630160744338429e-06, "loss": 0.82774305, "num_input_tokens_seen": 70646090, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.94921875, "step": 3273, "time_per_iteration": 2.6180782318115234 }, { "auxiliary_loss_clip": 0.01150857, "auxiliary_loss_mlp": 0.01048448, "balance_loss_clip": 1.03185415, "balance_loss_mlp": 1.04590547, "epoch": 0.19684352923493162, "flos": 24534852716160.0, "grad_norm": 2.1867385646607627, "language_loss": 0.77577406, "learning_rate": 3.6299418246803287e-06, "loss": 0.7977671, "num_input_tokens_seen": 70666065, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9609375, "step": 3274, "time_per_iteration": 2.55961275100708 }, { "auxiliary_loss_clip": 0.01152263, "auxiliary_loss_mlp": 0.01044377, "balance_loss_clip": 1.02688873, "balance_loss_mlp": 1.04724956, "epoch": 0.19690365248759958, "flos": 21796448578560.0, "grad_norm": 2.235947395848996, "language_loss": 0.80101919, "learning_rate": 3.6297228468537976e-06, "loss": 0.82298565, "num_input_tokens_seen": 70681580, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9609375, "step": 3275, "time_per_iteration": 2.5815508365631104 }, { "auxiliary_loss_clip": 0.01169251, "auxiliary_loss_mlp": 0.01039004, "balance_loss_clip": 1.02156317, "balance_loss_mlp": 1.04511786, "epoch": 0.19696377574026755, "flos": 19056643810560.0, "grad_norm": 2.025813318903843, "language_loss": 0.80683076, "learning_rate": 3.6295038108666504e-06, "loss": 0.82891339, "num_input_tokens_seen": 70697745, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.96875, "step": 3276, "time_per_iteration": 2.4951817989349365 }, { "auxiliary_loss_clip": 0.01150988, "auxiliary_loss_mlp": 0.01037923, "balance_loss_clip": 1.0208168, "balance_loss_mlp": 1.04712319, "epoch": 0.19702389899293551, "flos": 22820656982400.0, "grad_norm": 2.4366765637677124, "language_loss": 0.89342165, "learning_rate": 3.629284716726703e-06, "loss": 0.91531086, "num_input_tokens_seen": 70715110, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.94921875, "step": 3277, "time_per_iteration": 2.605787515640259 }, { "auxiliary_loss_clip": 0.01181072, "auxiliary_loss_mlp": 0.01044727, "balance_loss_clip": 1.02494979, "balance_loss_mlp": 1.0466373, "epoch": 0.19708402224560348, "flos": 22894237992960.0, "grad_norm": 1.986251567234807, "language_loss": 0.62048924, "learning_rate": 3.6290655644417757e-06, "loss": 0.64274722, "num_input_tokens_seen": 70734715, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.984375, "step": 3278, "time_per_iteration": 2.553285598754883 }, { "auxiliary_loss_clip": 0.01171489, "auxiliary_loss_mlp": 0.01048563, "balance_loss_clip": 1.03033638, "balance_loss_mlp": 1.04781055, "epoch": 0.19714414549827144, "flos": 25662519267840.0, "grad_norm": 2.335512582948415, "language_loss": 0.73964083, "learning_rate": 3.6288463540196894e-06, "loss": 0.76184142, "num_input_tokens_seen": 70752650, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.96484375, "step": 3279, "time_per_iteration": 2.6391916275024414 }, { "auxiliary_loss_clip": 0.01152801, "auxiliary_loss_mlp": 0.01043119, "balance_loss_clip": 1.02637029, "balance_loss_mlp": 1.04670727, "epoch": 0.19720426875093944, "flos": 23915824704000.0, "grad_norm": 1.7857767305867738, "language_loss": 0.82384115, "learning_rate": 3.6286270854682654e-06, "loss": 0.8458004, "num_input_tokens_seen": 70772365, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.96875, "step": 3280, "time_per_iteration": 2.5289101600646973 }, { "auxiliary_loss_clip": 0.01171412, "auxiliary_loss_mlp": 0.01040312, "balance_loss_clip": 1.02417088, "balance_loss_mlp": 1.04849792, "epoch": 0.1972643920036074, "flos": 13881952828800.0, "grad_norm": 1.8238320422355472, "language_loss": 0.7763778, "learning_rate": 3.6284077587953307e-06, "loss": 0.79849505, "num_input_tokens_seen": 70790340, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.95703125, "step": 3281, "time_per_iteration": 2.5734939575195312 }, { "auxiliary_loss_clip": 0.01159004, "auxiliary_loss_mlp": 0.01040435, "balance_loss_clip": 1.0246402, "balance_loss_mlp": 1.0486244, "epoch": 0.19732451525627537, "flos": 19863592801920.0, "grad_norm": 1.612078631256682, "language_loss": 0.79775691, "learning_rate": 3.628188374008712e-06, "loss": 0.81975126, "num_input_tokens_seen": 70809295, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.921875, "step": 3282, "time_per_iteration": 2.5833208560943604 }, { "auxiliary_loss_clip": 0.01183422, "auxiliary_loss_mlp": 0.01039794, "balance_loss_clip": 1.02273488, "balance_loss_mlp": 1.04981208, "epoch": 0.19738463850894333, "flos": 24973429777920.0, "grad_norm": 2.3846465754719453, "language_loss": 0.71902013, "learning_rate": 3.6279689311162382e-06, "loss": 0.74125224, "num_input_tokens_seen": 70828765, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.97265625, "step": 3283, "time_per_iteration": 2.6381826400756836 }, { "auxiliary_loss_clip": 0.01153312, "auxiliary_loss_mlp": 0.01041394, "balance_loss_clip": 1.02513337, "balance_loss_mlp": 1.04769635, "epoch": 0.1974447617616113, "flos": 18368883123840.0, "grad_norm": 1.8710907029611, "language_loss": 0.78759992, "learning_rate": 3.6277494301257407e-06, "loss": 0.80954695, "num_input_tokens_seen": 70846805, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.96484375, "step": 3284, "time_per_iteration": 2.548644542694092 }, { "auxiliary_loss_clip": 0.0116407, "auxiliary_loss_mlp": 0.01286366, "balance_loss_clip": 1.02045536, "balance_loss_mlp": 1.04796946, "epoch": 0.19750488501427926, "flos": 22892945103360.0, "grad_norm": 2.087288094632047, "language_loss": 0.86142564, "learning_rate": 3.6275298710450533e-06, "loss": 0.88593006, "num_input_tokens_seen": 70863805, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.98046875, "step": 3285, "time_per_iteration": 2.615673303604126 }, { "auxiliary_loss_clip": 0.01169915, "auxiliary_loss_mlp": 0.01040744, "balance_loss_clip": 1.02373302, "balance_loss_mlp": 1.04928207, "epoch": 0.19756500826694723, "flos": 21871502046720.0, "grad_norm": 3.5968962704517318, "language_loss": 0.87807012, "learning_rate": 3.627310253882012e-06, "loss": 0.90017676, "num_input_tokens_seen": 70882660, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9296875, "step": 3286, "time_per_iteration": 2.55593204498291 }, { "auxiliary_loss_clip": 0.01176007, "auxiliary_loss_mlp": 0.01047879, "balance_loss_clip": 1.02868629, "balance_loss_mlp": 1.05167758, "epoch": 0.19762513151961522, "flos": 15158972131200.0, "grad_norm": 2.1268793512632627, "language_loss": 0.78055882, "learning_rate": 3.627090578644452e-06, "loss": 0.80279762, "num_input_tokens_seen": 70898765, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.9765625, "step": 3287, "time_per_iteration": 2.589939594268799 }, { "auxiliary_loss_clip": 0.01162739, "auxiliary_loss_mlp": 0.01045699, "balance_loss_clip": 1.02734089, "balance_loss_mlp": 1.0475893, "epoch": 0.1976852547722832, "flos": 16979175878400.0, "grad_norm": 2.3841474467199433, "language_loss": 0.80997646, "learning_rate": 3.6268708453402163e-06, "loss": 0.83206081, "num_input_tokens_seen": 70916370, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.96875, "step": 3288, "time_per_iteration": 2.5246293544769287 }, { "auxiliary_loss_clip": 0.0114804, "auxiliary_loss_mlp": 0.01044097, "balance_loss_clip": 1.02786112, "balance_loss_mlp": 1.04646218, "epoch": 0.19774537802495115, "flos": 20302924049280.0, "grad_norm": 2.2934219806189593, "language_loss": 0.72540176, "learning_rate": 3.626651053977144e-06, "loss": 0.74732322, "num_input_tokens_seen": 70934870, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.92578125, "step": 3289, "time_per_iteration": 2.6281468868255615 }, { "auxiliary_loss_clip": 0.01160295, "auxiliary_loss_mlp": 0.01043546, "balance_loss_clip": 1.02628398, "balance_loss_mlp": 1.04716325, "epoch": 0.19780550127761912, "flos": 27235478724480.0, "grad_norm": 2.417681386890294, "language_loss": 0.79457676, "learning_rate": 3.6264312045630802e-06, "loss": 0.81661522, "num_input_tokens_seen": 70955140, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.953125, "step": 3290, "time_per_iteration": 2.807706117630005 }, { "auxiliary_loss_clip": 0.01105087, "auxiliary_loss_mlp": 0.01002753, "balance_loss_clip": 0.99980843, "balance_loss_mlp": 1.04548025, "epoch": 0.19786562453028708, "flos": 63550972684800.0, "grad_norm": 0.8985266323575967, "language_loss": 0.60391313, "learning_rate": 3.62621129710587e-06, "loss": 0.62499154, "num_input_tokens_seen": 71012005, "router_z_loss_clip": 0.02941895, "router_z_loss_mlp": 0.41796875, "step": 3291, "time_per_iteration": 3.2475035190582275 }, { "auxiliary_loss_clip": 0.01144977, "auxiliary_loss_mlp": 0.01045947, "balance_loss_clip": 1.02743387, "balance_loss_mlp": 1.04502976, "epoch": 0.19792574778295505, "flos": 26286647011200.0, "grad_norm": 1.8188189959642338, "language_loss": 0.81035078, "learning_rate": 3.6259913316133625e-06, "loss": 0.83226001, "num_input_tokens_seen": 71031140, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0, "step": 3292, "time_per_iteration": 2.639111280441284 }, { "auxiliary_loss_clip": 0.01155364, "auxiliary_loss_mlp": 0.01298952, "balance_loss_clip": 1.03391647, "balance_loss_mlp": 1.0457114, "epoch": 0.19798587103562304, "flos": 19938107566080.0, "grad_norm": 1.7731480770742556, "language_loss": 0.81676221, "learning_rate": 3.625771308093406e-06, "loss": 0.84130538, "num_input_tokens_seen": 71050250, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9140625, "step": 3293, "time_per_iteration": 2.5466082096099854 }, { "auxiliary_loss_clip": 0.01168444, "auxiliary_loss_mlp": 0.01050832, "balance_loss_clip": 1.03177071, "balance_loss_mlp": 1.05070329, "epoch": 0.198045994288291, "flos": 20120282369280.0, "grad_norm": 1.9076951685356436, "language_loss": 0.6086216, "learning_rate": 3.625551226553854e-06, "loss": 0.63081443, "num_input_tokens_seen": 71068665, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.99609375, "step": 3294, "time_per_iteration": 2.6601614952087402 }, { "auxiliary_loss_clip": 0.01160222, "auxiliary_loss_mlp": 0.01053456, "balance_loss_clip": 1.03668308, "balance_loss_mlp": 1.04719639, "epoch": 0.19810611754095897, "flos": 17967653228160.0, "grad_norm": 6.950094874364308, "language_loss": 0.87528163, "learning_rate": 3.6253310870025598e-06, "loss": 0.89741844, "num_input_tokens_seen": 71085320, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.94921875, "step": 3295, "time_per_iteration": 2.5631675720214844 }, { "auxiliary_loss_clip": 0.01168787, "auxiliary_loss_mlp": 0.01052374, "balance_loss_clip": 1.03650737, "balance_loss_mlp": 1.04708147, "epoch": 0.19816624079362694, "flos": 15084996071040.0, "grad_norm": 3.2106167266157892, "language_loss": 0.80234373, "learning_rate": 3.6251108894473806e-06, "loss": 0.8245554, "num_input_tokens_seen": 71102020, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.9453125, "step": 3296, "time_per_iteration": 2.6007182598114014 }, { "auxiliary_loss_clip": 0.01170998, "auxiliary_loss_mlp": 0.01044447, "balance_loss_clip": 1.02571917, "balance_loss_mlp": 1.04528534, "epoch": 0.1982263640462949, "flos": 24900315644160.0, "grad_norm": 1.729733029825456, "language_loss": 0.67852819, "learning_rate": 3.624890633896173e-06, "loss": 0.7006827, "num_input_tokens_seen": 71123390, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.984375, "step": 3297, "time_per_iteration": 2.5700249671936035 }, { "auxiliary_loss_clip": 0.01137198, "auxiliary_loss_mlp": 0.01040871, "balance_loss_clip": 1.02474165, "balance_loss_mlp": 1.04609895, "epoch": 0.19828648729896287, "flos": 20376181837440.0, "grad_norm": 1.9945683345077276, "language_loss": 0.81090301, "learning_rate": 3.6246703203567996e-06, "loss": 0.83268374, "num_input_tokens_seen": 71141800, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.91015625, "step": 3298, "time_per_iteration": 2.6518585681915283 }, { "auxiliary_loss_clip": 0.01165399, "auxiliary_loss_mlp": 0.0105029, "balance_loss_clip": 1.030406, "balance_loss_mlp": 1.0474236, "epoch": 0.19834661055163083, "flos": 18880035615360.0, "grad_norm": 2.5484222166991772, "language_loss": 0.84940773, "learning_rate": 3.624449948837121e-06, "loss": 0.87156463, "num_input_tokens_seen": 71159505, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.99609375, "step": 3299, "time_per_iteration": 3.8886144161224365 }, { "auxiliary_loss_clip": 0.01110628, "auxiliary_loss_mlp": 0.01007351, "balance_loss_clip": 1.00457323, "balance_loss_mlp": 1.04128706, "epoch": 0.19840673380429882, "flos": 60259184640000.0, "grad_norm": 0.9966355886137002, "language_loss": 0.53254938, "learning_rate": 3.6242295193450024e-06, "loss": 0.55372918, "num_input_tokens_seen": 71223265, "router_z_loss_clip": 0.02783203, "router_z_loss_mlp": 0.42578125, "step": 3300, "time_per_iteration": 3.1510696411132812 }, { "auxiliary_loss_clip": 0.01169564, "auxiliary_loss_mlp": 0.01059481, "balance_loss_clip": 1.04076493, "balance_loss_mlp": 1.04619884, "epoch": 0.1984668570569668, "flos": 19902017376000.0, "grad_norm": 1.587518604899164, "language_loss": 0.73319566, "learning_rate": 3.6240090318883103e-06, "loss": 0.75548613, "num_input_tokens_seen": 71242385, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.9609375, "step": 3301, "time_per_iteration": 5.503425121307373 }, { "auxiliary_loss_clip": 0.01168605, "auxiliary_loss_mlp": 0.01041912, "balance_loss_clip": 1.02566385, "balance_loss_mlp": 1.04622722, "epoch": 0.19852698030963475, "flos": 15630766295040.0, "grad_norm": 2.4439656093951814, "language_loss": 0.87966406, "learning_rate": 3.623788486474913e-06, "loss": 0.90176928, "num_input_tokens_seen": 71258990, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.953125, "step": 3302, "time_per_iteration": 2.576131582260132 }, { "auxiliary_loss_clip": 0.01169956, "auxiliary_loss_mlp": 0.01047905, "balance_loss_clip": 1.03072739, "balance_loss_mlp": 1.04574728, "epoch": 0.19858710356230272, "flos": 43143007311360.0, "grad_norm": 1.9021856086740359, "language_loss": 0.73840505, "learning_rate": 3.623567883112682e-06, "loss": 0.76058364, "num_input_tokens_seen": 71282770, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.96875, "step": 3303, "time_per_iteration": 2.7824575901031494 }, { "auxiliary_loss_clip": 0.01149184, "auxiliary_loss_mlp": 0.01042615, "balance_loss_clip": 1.02543736, "balance_loss_mlp": 1.04657876, "epoch": 0.19864722681497068, "flos": 35144084643840.0, "grad_norm": 1.7927211038884499, "language_loss": 0.74473006, "learning_rate": 3.6233472218094897e-06, "loss": 0.76664805, "num_input_tokens_seen": 71301410, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9375, "step": 3304, "time_per_iteration": 2.6922781467437744 }, { "auxiliary_loss_clip": 0.011574, "auxiliary_loss_mlp": 0.01039002, "balance_loss_clip": 1.0230279, "balance_loss_mlp": 1.04497433, "epoch": 0.19870735006763865, "flos": 19426200888960.0, "grad_norm": 1.787435495215507, "language_loss": 0.86243355, "learning_rate": 3.62312650257321e-06, "loss": 0.88439763, "num_input_tokens_seen": 71319670, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.94140625, "step": 3305, "time_per_iteration": 4.820317506790161 }, { "auxiliary_loss_clip": 0.01160252, "auxiliary_loss_mlp": 0.01040115, "balance_loss_clip": 1.02347398, "balance_loss_mlp": 1.044348, "epoch": 0.19876747332030664, "flos": 23547380947200.0, "grad_norm": 1.857193445165702, "language_loss": 0.68141574, "learning_rate": 3.622905725411721e-06, "loss": 0.70341945, "num_input_tokens_seen": 71339850, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.98046875, "step": 3306, "time_per_iteration": 2.55928111076355 }, { "auxiliary_loss_clip": 0.01157455, "auxiliary_loss_mlp": 0.01038342, "balance_loss_clip": 1.02241588, "balance_loss_mlp": 1.04411674, "epoch": 0.1988275965729746, "flos": 19829406032640.0, "grad_norm": 1.6302679897885606, "language_loss": 0.76115108, "learning_rate": 3.622684890332901e-06, "loss": 0.78310901, "num_input_tokens_seen": 71359795, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.953125, "step": 3307, "time_per_iteration": 2.5779573917388916 }, { "auxiliary_loss_clip": 0.01141601, "auxiliary_loss_mlp": 0.01042844, "balance_loss_clip": 1.02658439, "balance_loss_mlp": 1.0481137, "epoch": 0.19888771982564257, "flos": 23513625141120.0, "grad_norm": 1.8989918115304691, "language_loss": 0.75348651, "learning_rate": 3.622463997344632e-06, "loss": 0.77533096, "num_input_tokens_seen": 71378885, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.9375, "step": 3308, "time_per_iteration": 2.5679123401641846 }, { "auxiliary_loss_clip": 0.01178538, "auxiliary_loss_mlp": 0.01037453, "balance_loss_clip": 1.01998889, "balance_loss_mlp": 1.045573, "epoch": 0.19894784307831054, "flos": 18150510389760.0, "grad_norm": 1.9992170825716284, "language_loss": 0.75778878, "learning_rate": 3.622243046454796e-06, "loss": 0.77994871, "num_input_tokens_seen": 71397285, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.96484375, "step": 3309, "time_per_iteration": 2.635166883468628 }, { "auxiliary_loss_clip": 0.01147658, "auxiliary_loss_mlp": 0.01045078, "balance_loss_clip": 1.0271492, "balance_loss_mlp": 1.04480708, "epoch": 0.1990079663309785, "flos": 24276044246400.0, "grad_norm": 1.6623923967058292, "language_loss": 0.87689078, "learning_rate": 3.6220220376712787e-06, "loss": 0.89881819, "num_input_tokens_seen": 71415775, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9375, "step": 3310, "time_per_iteration": 2.5875496864318848 }, { "auxiliary_loss_clip": 0.01138504, "auxiliary_loss_mlp": 0.01037401, "balance_loss_clip": 1.02033067, "balance_loss_mlp": 1.0450747, "epoch": 0.19906808958364647, "flos": 34897666366080.0, "grad_norm": 2.63944566460506, "language_loss": 0.6402213, "learning_rate": 3.621800971001967e-06, "loss": 0.66198039, "num_input_tokens_seen": 71437315, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9375, "step": 3311, "time_per_iteration": 2.6636252403259277 }, { "auxiliary_loss_clip": 0.01169521, "auxiliary_loss_mlp": 0.01037522, "balance_loss_clip": 1.02066565, "balance_loss_mlp": 1.04505348, "epoch": 0.19912821283631443, "flos": 24024885373440.0, "grad_norm": 2.666853553025921, "language_loss": 0.73852175, "learning_rate": 3.6215798464547505e-06, "loss": 0.7605921, "num_input_tokens_seen": 71456320, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.97265625, "step": 3312, "time_per_iteration": 2.6614670753479004 }, { "auxiliary_loss_clip": 0.01150304, "auxiliary_loss_mlp": 0.01039382, "balance_loss_clip": 1.02282333, "balance_loss_mlp": 1.04664612, "epoch": 0.19918833608898243, "flos": 19859031774720.0, "grad_norm": 2.0145166021385013, "language_loss": 0.83661497, "learning_rate": 3.6213586640375207e-06, "loss": 0.85851181, "num_input_tokens_seen": 71475360, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.94921875, "step": 3313, "time_per_iteration": 2.5815675258636475 }, { "auxiliary_loss_clip": 0.0115267, "auxiliary_loss_mlp": 0.01039778, "balance_loss_clip": 1.02255177, "balance_loss_mlp": 1.04898512, "epoch": 0.1992484593416504, "flos": 29095794984960.0, "grad_norm": 1.9546769315103214, "language_loss": 0.80896354, "learning_rate": 3.6211374237581706e-06, "loss": 0.83088803, "num_input_tokens_seen": 71496155, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9453125, "step": 3314, "time_per_iteration": 2.632493734359741 }, { "auxiliary_loss_clip": 0.01155943, "auxiliary_loss_mlp": 0.01043109, "balance_loss_clip": 1.02684855, "balance_loss_mlp": 1.04530632, "epoch": 0.19930858259431836, "flos": 23295001011840.0, "grad_norm": 1.722492351893694, "language_loss": 0.87141544, "learning_rate": 3.620916125624596e-06, "loss": 0.89340591, "num_input_tokens_seen": 71517295, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.921875, "step": 3315, "time_per_iteration": 2.658647298812866 }, { "auxiliary_loss_clip": 0.01171811, "auxiliary_loss_mlp": 0.01287216, "balance_loss_clip": 1.02179265, "balance_loss_mlp": 1.04920578, "epoch": 0.19936870584698632, "flos": 25378825651200.0, "grad_norm": 1.5071024893344611, "language_loss": 0.71052378, "learning_rate": 3.620694769644694e-06, "loss": 0.73511404, "num_input_tokens_seen": 71540000, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.953125, "step": 3316, "time_per_iteration": 2.7048962116241455 }, { "auxiliary_loss_clip": 0.01132681, "auxiliary_loss_mlp": 0.01001782, "balance_loss_clip": 0.99900401, "balance_loss_mlp": 1.04538965, "epoch": 0.1994288290996543, "flos": 62168053109760.0, "grad_norm": 0.8247225163537855, "language_loss": 0.66338724, "learning_rate": 3.6204733558263653e-06, "loss": 0.68473184, "num_input_tokens_seen": 71607880, "router_z_loss_clip": 0.02783203, "router_z_loss_mlp": 0.421875, "step": 3317, "time_per_iteration": 3.3618361949920654 }, { "auxiliary_loss_clip": 0.0115415, "auxiliary_loss_mlp": 0.01041848, "balance_loss_clip": 1.02389526, "balance_loss_mlp": 1.04646134, "epoch": 0.19948895235232225, "flos": 19025832919680.0, "grad_norm": 2.0202773528085363, "language_loss": 0.74052978, "learning_rate": 3.6202518841775104e-06, "loss": 0.7624898, "num_input_tokens_seen": 71625695, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.984375, "step": 3318, "time_per_iteration": 2.5573689937591553 }, { "auxiliary_loss_clip": 0.01165604, "auxiliary_loss_mlp": 0.01037268, "balance_loss_clip": 1.0210557, "balance_loss_mlp": 1.04602671, "epoch": 0.19954907560499022, "flos": 37815803182080.0, "grad_norm": 2.145725929059151, "language_loss": 0.79034138, "learning_rate": 3.6200303547060336e-06, "loss": 0.81237012, "num_input_tokens_seen": 71648520, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.92578125, "step": 3319, "time_per_iteration": 2.716526508331299 }, { "auxiliary_loss_clip": 0.01161437, "auxiliary_loss_mlp": 0.01038327, "balance_loss_clip": 1.01938462, "balance_loss_mlp": 1.04733849, "epoch": 0.1996091988576582, "flos": 49565199594240.0, "grad_norm": 2.201641474731181, "language_loss": 0.75693309, "learning_rate": 3.61980876741984e-06, "loss": 0.77893078, "num_input_tokens_seen": 71672185, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.9609375, "step": 3320, "time_per_iteration": 2.854902744293213 }, { "auxiliary_loss_clip": 0.01169867, "auxiliary_loss_mlp": 0.01039281, "balance_loss_clip": 1.02206683, "balance_loss_mlp": 1.04744518, "epoch": 0.19966932211032618, "flos": 22635788659200.0, "grad_norm": 1.638451340304603, "language_loss": 0.80236936, "learning_rate": 3.6195871223268392e-06, "loss": 0.82446086, "num_input_tokens_seen": 71692890, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.953125, "step": 3321, "time_per_iteration": 2.639451742172241 }, { "auxiliary_loss_clip": 0.01112291, "auxiliary_loss_mlp": 0.00999293, "balance_loss_clip": 0.99681371, "balance_loss_mlp": 1.04336262, "epoch": 0.19972944536299414, "flos": 54082117745280.0, "grad_norm": 0.860168609358024, "language_loss": 0.65156049, "learning_rate": 3.61936541943494e-06, "loss": 0.67267638, "num_input_tokens_seen": 71745815, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.41601562, "step": 3322, "time_per_iteration": 2.9770591259002686 }, { "auxiliary_loss_clip": 0.01103519, "auxiliary_loss_mlp": 0.01005283, "balance_loss_clip": 1.00274372, "balance_loss_mlp": 1.04309082, "epoch": 0.1997895686156621, "flos": 69355031817600.0, "grad_norm": 0.7856603474409203, "language_loss": 0.56949472, "learning_rate": 3.619143658752054e-06, "loss": 0.59058273, "num_input_tokens_seen": 71806915, "router_z_loss_clip": 0.02539062, "router_z_loss_mlp": 0.42578125, "step": 3323, "time_per_iteration": 3.336801528930664 }, { "auxiliary_loss_clip": 0.01161288, "auxiliary_loss_mlp": 0.0128754, "balance_loss_clip": 1.0214932, "balance_loss_mlp": 1.04906952, "epoch": 0.19984969186833007, "flos": 18552063507840.0, "grad_norm": 2.37877252215622, "language_loss": 0.80466735, "learning_rate": 3.6189218402860958e-06, "loss": 0.82915562, "num_input_tokens_seen": 71824645, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.94140625, "step": 3324, "time_per_iteration": 2.559136152267456 }, { "auxiliary_loss_clip": 0.01148806, "auxiliary_loss_mlp": 0.01040639, "balance_loss_clip": 1.02217293, "balance_loss_mlp": 1.04610741, "epoch": 0.19990981512099804, "flos": 26429678968320.0, "grad_norm": 1.7794505999586712, "language_loss": 0.53558052, "learning_rate": 3.6186999640449817e-06, "loss": 0.55747503, "num_input_tokens_seen": 71845125, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.9375, "step": 3325, "time_per_iteration": 2.637937307357788 }, { "auxiliary_loss_clip": 0.01159884, "auxiliary_loss_mlp": 0.01043735, "balance_loss_clip": 1.02624667, "balance_loss_mlp": 1.04701173, "epoch": 0.19996993837366603, "flos": 16325997010560.0, "grad_norm": 2.4214753096451833, "language_loss": 0.85991943, "learning_rate": 3.6184780300366294e-06, "loss": 0.88195562, "num_input_tokens_seen": 71863500, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.94921875, "step": 3326, "time_per_iteration": 2.515895366668701 }, { "auxiliary_loss_clip": 0.01140986, "auxiliary_loss_mlp": 0.01042255, "balance_loss_clip": 1.02586341, "balance_loss_mlp": 1.04787719, "epoch": 0.200030061626334, "flos": 20844169159680.0, "grad_norm": 2.080144737767615, "language_loss": 0.71528906, "learning_rate": 3.6182560382689598e-06, "loss": 0.73712146, "num_input_tokens_seen": 71881845, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9296875, "step": 3327, "time_per_iteration": 2.569974660873413 }, { "auxiliary_loss_clip": 0.0116185, "auxiliary_loss_mlp": 0.01041834, "balance_loss_clip": 1.02296352, "balance_loss_mlp": 1.04644811, "epoch": 0.20009018487900196, "flos": 23762629198080.0, "grad_norm": 1.9019259557370263, "language_loss": 0.76631576, "learning_rate": 3.6180339887498948e-06, "loss": 0.78835261, "num_input_tokens_seen": 71900940, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.9765625, "step": 3328, "time_per_iteration": 2.558401346206665 }, { "auxiliary_loss_clip": 0.01154558, "auxiliary_loss_mlp": 0.01037952, "balance_loss_clip": 1.02214491, "balance_loss_mlp": 1.04520082, "epoch": 0.20015030813166992, "flos": 28111555440000.0, "grad_norm": 1.8392677390616226, "language_loss": 0.69408715, "learning_rate": 3.6178118814873587e-06, "loss": 0.71601224, "num_input_tokens_seen": 71921925, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.9140625, "step": 3329, "time_per_iteration": 2.7424871921539307 }, { "auxiliary_loss_clip": 0.01171371, "auxiliary_loss_mlp": 0.01048527, "balance_loss_clip": 1.02923846, "balance_loss_mlp": 1.04668808, "epoch": 0.2002104313843379, "flos": 26067160955520.0, "grad_norm": 1.7970318855375527, "language_loss": 0.81031531, "learning_rate": 3.6175897164892783e-06, "loss": 0.83251429, "num_input_tokens_seen": 71941855, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.97265625, "step": 3330, "time_per_iteration": 2.6524384021759033 }, { "auxiliary_loss_clip": 0.01150671, "auxiliary_loss_mlp": 0.01041777, "balance_loss_clip": 1.02402639, "balance_loss_mlp": 1.04544544, "epoch": 0.20027055463700585, "flos": 22966633854720.0, "grad_norm": 3.047758445263173, "language_loss": 0.76738548, "learning_rate": 3.617367493763581e-06, "loss": 0.78930998, "num_input_tokens_seen": 71960915, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9609375, "step": 3331, "time_per_iteration": 2.5984809398651123 }, { "auxiliary_loss_clip": 0.01179738, "auxiliary_loss_mlp": 0.01047156, "balance_loss_clip": 1.02953637, "balance_loss_mlp": 1.04644656, "epoch": 0.20033067788967382, "flos": 17165660313600.0, "grad_norm": 1.8108972230211644, "language_loss": 0.78913772, "learning_rate": 3.6171452133181994e-06, "loss": 0.81140661, "num_input_tokens_seen": 71979220, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.96875, "step": 3332, "time_per_iteration": 2.511061191558838 }, { "auxiliary_loss_clip": 0.01088128, "auxiliary_loss_mlp": 0.0101254, "balance_loss_clip": 1.01000047, "balance_loss_mlp": 1.03719032, "epoch": 0.2003908011423418, "flos": 60825566292480.0, "grad_norm": 0.932384225788138, "language_loss": 0.61991823, "learning_rate": 3.6169228751610643e-06, "loss": 0.64092493, "num_input_tokens_seen": 72033950, "router_z_loss_clip": 0.02539062, "router_z_loss_mlp": 0.421875, "step": 3333, "time_per_iteration": 3.0522091388702393 }, { "auxiliary_loss_clip": 0.01169271, "auxiliary_loss_mlp": 0.01298658, "balance_loss_clip": 1.03323507, "balance_loss_mlp": 1.04498017, "epoch": 0.20045092439500978, "flos": 24206234163840.0, "grad_norm": 2.7289350700842476, "language_loss": 0.80895674, "learning_rate": 3.6167004793001107e-06, "loss": 0.83363599, "num_input_tokens_seen": 72051395, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.97265625, "step": 3334, "time_per_iteration": 2.649829626083374 }, { "auxiliary_loss_clip": 0.01191764, "auxiliary_loss_mlp": 0.01048999, "balance_loss_clip": 1.02981877, "balance_loss_mlp": 1.04737663, "epoch": 0.20051104764767774, "flos": 29387605075200.0, "grad_norm": 2.0474151251216166, "language_loss": 0.73828375, "learning_rate": 3.616478025743276e-06, "loss": 0.7606914, "num_input_tokens_seen": 72071305, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.9921875, "step": 3335, "time_per_iteration": 2.715611219406128 }, { "auxiliary_loss_clip": 0.01159749, "auxiliary_loss_mlp": 0.01057462, "balance_loss_clip": 1.03971171, "balance_loss_mlp": 1.05300164, "epoch": 0.2005711709003457, "flos": 23513804709120.0, "grad_norm": 1.8978069406958609, "language_loss": 0.79465336, "learning_rate": 3.6162555144984986e-06, "loss": 0.81682551, "num_input_tokens_seen": 72090165, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9765625, "step": 3336, "time_per_iteration": 2.5708165168762207 }, { "auxiliary_loss_clip": 0.01175688, "auxiliary_loss_mlp": 0.01048389, "balance_loss_clip": 1.02930379, "balance_loss_mlp": 1.04759836, "epoch": 0.20063129415301367, "flos": 22523388024960.0, "grad_norm": 2.1795030543936105, "language_loss": 0.77685332, "learning_rate": 3.6160329455737193e-06, "loss": 0.79909408, "num_input_tokens_seen": 72107210, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0078125, "step": 3337, "time_per_iteration": 2.635725736618042 }, { "auxiliary_loss_clip": 0.01146714, "auxiliary_loss_mlp": 0.01046157, "balance_loss_clip": 1.02671361, "balance_loss_mlp": 1.049968, "epoch": 0.20069141740568164, "flos": 25958243940480.0, "grad_norm": 3.1381067781073098, "language_loss": 0.69053942, "learning_rate": 3.6158103189768815e-06, "loss": 0.71246815, "num_input_tokens_seen": 72126315, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.96875, "step": 3338, "time_per_iteration": 2.6447434425354004 }, { "auxiliary_loss_clip": 0.01175027, "auxiliary_loss_mlp": 0.01052089, "balance_loss_clip": 1.03504252, "balance_loss_mlp": 1.05170131, "epoch": 0.2007515406583496, "flos": 24790608529920.0, "grad_norm": 1.9168111055236219, "language_loss": 0.68766499, "learning_rate": 3.6155876347159296e-06, "loss": 0.7099362, "num_input_tokens_seen": 72146470, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9609375, "step": 3339, "time_per_iteration": 2.643580436706543 }, { "auxiliary_loss_clip": 0.01186188, "auxiliary_loss_mlp": 0.01037688, "balance_loss_clip": 1.02006924, "balance_loss_mlp": 1.0512166, "epoch": 0.2008116639110176, "flos": 37925582123520.0, "grad_norm": 2.4478913494547663, "language_loss": 0.65911663, "learning_rate": 3.6153648927988104e-06, "loss": 0.68135542, "num_input_tokens_seen": 72166600, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9921875, "step": 3340, "time_per_iteration": 4.056830644607544 }, { "auxiliary_loss_clip": 0.01156873, "auxiliary_loss_mlp": 0.01037103, "balance_loss_clip": 1.01780295, "balance_loss_mlp": 1.04979587, "epoch": 0.20087178716368556, "flos": 20740531443840.0, "grad_norm": 2.1894240669186713, "language_loss": 0.74110329, "learning_rate": 3.6151420932334737e-06, "loss": 0.76304305, "num_input_tokens_seen": 72185160, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.98046875, "step": 3341, "time_per_iteration": 2.542912721633911 }, { "auxiliary_loss_clip": 0.01170949, "auxiliary_loss_mlp": 0.0104352, "balance_loss_clip": 1.02582908, "balance_loss_mlp": 1.04966938, "epoch": 0.20093191041635353, "flos": 23842279607040.0, "grad_norm": 1.9924616042013457, "language_loss": 0.71594214, "learning_rate": 3.6149192360278706e-06, "loss": 0.73808682, "num_input_tokens_seen": 72205160, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.94140625, "step": 3342, "time_per_iteration": 5.552501440048218 }, { "auxiliary_loss_clip": 0.01180955, "auxiliary_loss_mlp": 0.01044625, "balance_loss_clip": 1.02682734, "balance_loss_mlp": 1.04808784, "epoch": 0.2009920336690215, "flos": 21792067119360.0, "grad_norm": 2.1659633488257266, "language_loss": 0.72441447, "learning_rate": 3.614696321189954e-06, "loss": 0.74667031, "num_input_tokens_seen": 72223555, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.96484375, "step": 3343, "time_per_iteration": 2.631610631942749 }, { "auxiliary_loss_clip": 0.01171697, "auxiliary_loss_mlp": 0.01046418, "balance_loss_clip": 1.02736878, "balance_loss_mlp": 1.04767036, "epoch": 0.20105215692168946, "flos": 26359222440960.0, "grad_norm": 2.054098841459536, "language_loss": 0.80916882, "learning_rate": 3.614473348727679e-06, "loss": 0.83134997, "num_input_tokens_seen": 72242465, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.96875, "step": 3344, "time_per_iteration": 2.6454954147338867 }, { "auxiliary_loss_clip": 0.01144699, "auxiliary_loss_mlp": 0.01039911, "balance_loss_clip": 1.02219605, "balance_loss_mlp": 1.04768324, "epoch": 0.20111228017435742, "flos": 18807280617600.0, "grad_norm": 2.102689394249402, "language_loss": 0.83331817, "learning_rate": 3.614250318649003e-06, "loss": 0.85516429, "num_input_tokens_seen": 72260655, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.96875, "step": 3345, "time_per_iteration": 2.5472617149353027 }, { "auxiliary_loss_clip": 0.01176478, "auxiliary_loss_mlp": 0.01037202, "balance_loss_clip": 1.02118063, "balance_loss_mlp": 1.04873598, "epoch": 0.20117240342702541, "flos": 19975059682560.0, "grad_norm": 2.1251749461099716, "language_loss": 0.67799056, "learning_rate": 3.614027230961885e-06, "loss": 0.70012736, "num_input_tokens_seen": 72279055, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.9140625, "step": 3346, "time_per_iteration": 4.880454063415527 }, { "auxiliary_loss_clip": 0.01170966, "auxiliary_loss_mlp": 0.01044753, "balance_loss_clip": 1.02746737, "balance_loss_mlp": 1.04799628, "epoch": 0.20123252667969338, "flos": 23142703345920.0, "grad_norm": 2.349761977319103, "language_loss": 0.73554528, "learning_rate": 3.613804085674288e-06, "loss": 0.75770247, "num_input_tokens_seen": 72297895, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.95703125, "step": 3347, "time_per_iteration": 2.6267635822296143 }, { "auxiliary_loss_clip": 0.01153991, "auxiliary_loss_mlp": 0.01046446, "balance_loss_clip": 1.0296967, "balance_loss_mlp": 1.04940283, "epoch": 0.20129264993236134, "flos": 23221671396480.0, "grad_norm": 1.6732561646178938, "language_loss": 0.8673594, "learning_rate": 3.6135808827941733e-06, "loss": 0.88936383, "num_input_tokens_seen": 72318385, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9609375, "step": 3348, "time_per_iteration": 2.6089015007019043 }, { "auxiliary_loss_clip": 0.01177467, "auxiliary_loss_mlp": 0.01039573, "balance_loss_clip": 1.0209527, "balance_loss_mlp": 1.0474031, "epoch": 0.2013527731850293, "flos": 21871466133120.0, "grad_norm": 1.9946979808072713, "language_loss": 0.70991367, "learning_rate": 3.6133576223295083e-06, "loss": 0.7320841, "num_input_tokens_seen": 72338235, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.9375, "step": 3349, "time_per_iteration": 2.682839870452881 }, { "auxiliary_loss_clip": 0.01174175, "auxiliary_loss_mlp": 0.01048765, "balance_loss_clip": 1.02989435, "balance_loss_mlp": 1.05043507, "epoch": 0.20141289643769728, "flos": 18040803275520.0, "grad_norm": 1.8106402658351863, "language_loss": 0.70693684, "learning_rate": 3.61313430428826e-06, "loss": 0.72916627, "num_input_tokens_seen": 72357825, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.96484375, "step": 3350, "time_per_iteration": 2.639698028564453 }, { "auxiliary_loss_clip": 0.01166443, "auxiliary_loss_mlp": 0.01044685, "balance_loss_clip": 1.02548015, "balance_loss_mlp": 1.05163312, "epoch": 0.20147301969036524, "flos": 23951412103680.0, "grad_norm": 2.2027225016261305, "language_loss": 0.75944793, "learning_rate": 3.612910928678397e-06, "loss": 0.78155923, "num_input_tokens_seen": 72376335, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.96875, "step": 3351, "time_per_iteration": 2.6300177574157715 }, { "auxiliary_loss_clip": 0.01168027, "auxiliary_loss_mlp": 0.01044778, "balance_loss_clip": 1.02529931, "balance_loss_mlp": 1.05025339, "epoch": 0.2015331429430332, "flos": 25588471380480.0, "grad_norm": 1.5949210446532018, "language_loss": 0.80532593, "learning_rate": 3.6126874955078926e-06, "loss": 0.82745397, "num_input_tokens_seen": 72395440, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.99609375, "step": 3352, "time_per_iteration": 2.7202062606811523 }, { "auxiliary_loss_clip": 0.01157908, "auxiliary_loss_mlp": 0.01048084, "balance_loss_clip": 1.02989221, "balance_loss_mlp": 1.05188251, "epoch": 0.2015932661957012, "flos": 26724972677760.0, "grad_norm": 3.916926381030757, "language_loss": 0.80187488, "learning_rate": 3.6124640047847193e-06, "loss": 0.82393479, "num_input_tokens_seen": 72414670, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.96875, "step": 3353, "time_per_iteration": 2.7319629192352295 }, { "auxiliary_loss_clip": 0.01160985, "auxiliary_loss_mlp": 0.01048011, "balance_loss_clip": 1.03004611, "balance_loss_mlp": 1.04960322, "epoch": 0.20165338944836916, "flos": 15633136592640.0, "grad_norm": 1.8752361805245743, "language_loss": 0.89805031, "learning_rate": 3.6122404565168533e-06, "loss": 0.92014033, "num_input_tokens_seen": 72432210, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9375, "step": 3354, "time_per_iteration": 2.490955114364624 }, { "auxiliary_loss_clip": 0.01088357, "auxiliary_loss_mlp": 0.01251276, "balance_loss_clip": 1.00062037, "balance_loss_mlp": 1.03339267, "epoch": 0.20171351270103713, "flos": 57912529207680.0, "grad_norm": 0.8301185811304527, "language_loss": 0.55914754, "learning_rate": 3.612016850712273e-06, "loss": 0.58254391, "num_input_tokens_seen": 72489225, "router_z_loss_clip": 0.02856445, "router_z_loss_mlp": 0.37109375, "step": 3355, "time_per_iteration": 3.1583023071289062 }, { "auxiliary_loss_clip": 0.01161749, "auxiliary_loss_mlp": 0.01295916, "balance_loss_clip": 1.03087628, "balance_loss_mlp": 1.05079055, "epoch": 0.2017736359537051, "flos": 20814363849600.0, "grad_norm": 1.6833108148113993, "language_loss": 0.83982581, "learning_rate": 3.611793187378958e-06, "loss": 0.86440253, "num_input_tokens_seen": 72508715, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9296875, "step": 3356, "time_per_iteration": 2.5703933238983154 }, { "auxiliary_loss_clip": 0.01174895, "auxiliary_loss_mlp": 0.01044989, "balance_loss_clip": 1.0238651, "balance_loss_mlp": 1.05406749, "epoch": 0.20183375920637306, "flos": 17092043389440.0, "grad_norm": 5.6626754962854084, "language_loss": 0.69902432, "learning_rate": 3.61156946652489e-06, "loss": 0.72122324, "num_input_tokens_seen": 72525135, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.0234375, "step": 3357, "time_per_iteration": 2.666163444519043 }, { "auxiliary_loss_clip": 0.01168781, "auxiliary_loss_mlp": 0.0104465, "balance_loss_clip": 1.02521873, "balance_loss_mlp": 1.05095625, "epoch": 0.20189388245904102, "flos": 18661339658880.0, "grad_norm": 1.7508339679047655, "language_loss": 0.71786982, "learning_rate": 3.611345688158053e-06, "loss": 0.74000418, "num_input_tokens_seen": 72543690, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.99609375, "step": 3358, "time_per_iteration": 2.596548080444336 }, { "auxiliary_loss_clip": 0.01151394, "auxiliary_loss_mlp": 0.01053598, "balance_loss_clip": 1.03661048, "balance_loss_mlp": 1.04868793, "epoch": 0.20195400571170902, "flos": 16797539779200.0, "grad_norm": 2.0226076454794946, "language_loss": 0.83189034, "learning_rate": 3.6111218522864336e-06, "loss": 0.85394025, "num_input_tokens_seen": 72560725, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9375, "step": 3359, "time_per_iteration": 2.617119073867798 }, { "auxiliary_loss_clip": 0.01076902, "auxiliary_loss_mlp": 0.01010857, "balance_loss_clip": 1.00828195, "balance_loss_mlp": 1.03051829, "epoch": 0.20201412896437698, "flos": 67174716268800.0, "grad_norm": 0.7656597210501968, "language_loss": 0.58997095, "learning_rate": 3.610897958918019e-06, "loss": 0.61084855, "num_input_tokens_seen": 72621940, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.37109375, "step": 3360, "time_per_iteration": 3.105351209640503 }, { "auxiliary_loss_clip": 0.01197407, "auxiliary_loss_mlp": 0.01057653, "balance_loss_clip": 1.03778052, "balance_loss_mlp": 1.05383372, "epoch": 0.20207425221704495, "flos": 21325013550720.0, "grad_norm": 2.4482982421939026, "language_loss": 0.62265992, "learning_rate": 3.6106740080608e-06, "loss": 0.6452105, "num_input_tokens_seen": 72639135, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.984375, "step": 3361, "time_per_iteration": 2.6650590896606445 }, { "auxiliary_loss_clip": 0.01166267, "auxiliary_loss_mlp": 0.01056954, "balance_loss_clip": 1.03915596, "balance_loss_mlp": 1.05252182, "epoch": 0.2021343754697129, "flos": 22527158952960.0, "grad_norm": 1.839971248908637, "language_loss": 0.75737077, "learning_rate": 3.61044999972277e-06, "loss": 0.779603, "num_input_tokens_seen": 72658525, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.953125, "step": 3362, "time_per_iteration": 2.6853139400482178 }, { "auxiliary_loss_clip": 0.01152262, "auxiliary_loss_mlp": 0.01297458, "balance_loss_clip": 1.03100848, "balance_loss_mlp": 1.0481931, "epoch": 0.20219449872238088, "flos": 19062785036160.0, "grad_norm": 1.6595531893512068, "language_loss": 0.76224935, "learning_rate": 3.610225933911921e-06, "loss": 0.78674662, "num_input_tokens_seen": 72678085, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.94921875, "step": 3363, "time_per_iteration": 2.5911381244659424 }, { "auxiliary_loss_clip": 0.01151663, "auxiliary_loss_mlp": 0.01046288, "balance_loss_clip": 1.02937198, "balance_loss_mlp": 1.04680252, "epoch": 0.20225462197504884, "flos": 24717027519360.0, "grad_norm": 1.5482596509431403, "language_loss": 0.74409348, "learning_rate": 3.6100018106362507e-06, "loss": 0.76607305, "num_input_tokens_seen": 72698695, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9609375, "step": 3364, "time_per_iteration": 2.6988813877105713 }, { "auxiliary_loss_clip": 0.01184, "auxiliary_loss_mlp": 0.01044273, "balance_loss_clip": 1.02508008, "balance_loss_mlp": 1.04946816, "epoch": 0.2023147452277168, "flos": 22018304931840.0, "grad_norm": 2.140989347905413, "language_loss": 0.71516407, "learning_rate": 3.6097776299037573e-06, "loss": 0.73744678, "num_input_tokens_seen": 72717880, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.984375, "step": 3365, "time_per_iteration": 2.5555617809295654 }, { "auxiliary_loss_clip": 0.01173073, "auxiliary_loss_mlp": 0.01049983, "balance_loss_clip": 1.03149343, "balance_loss_mlp": 1.04899168, "epoch": 0.2023748684803848, "flos": 17745365911680.0, "grad_norm": 2.955120163326226, "language_loss": 0.85762954, "learning_rate": 3.609553391722441e-06, "loss": 0.87986004, "num_input_tokens_seen": 72736410, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.96875, "step": 3366, "time_per_iteration": 2.632610559463501 }, { "auxiliary_loss_clip": 0.01170936, "auxiliary_loss_mlp": 0.01040778, "balance_loss_clip": 1.02257514, "balance_loss_mlp": 1.04834652, "epoch": 0.20243499173305277, "flos": 31138932493440.0, "grad_norm": 2.614981145226348, "language_loss": 0.69144279, "learning_rate": 3.6093290961003044e-06, "loss": 0.71355999, "num_input_tokens_seen": 72758295, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9609375, "step": 3367, "time_per_iteration": 2.61006236076355 }, { "auxiliary_loss_clip": 0.01159357, "auxiliary_loss_mlp": 0.01040051, "balance_loss_clip": 1.01928437, "balance_loss_mlp": 1.04809785, "epoch": 0.20249511498572073, "flos": 33839235279360.0, "grad_norm": 1.7485832858464276, "language_loss": 0.68379617, "learning_rate": 3.6091047430453517e-06, "loss": 0.70579028, "num_input_tokens_seen": 72782495, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0234375, "step": 3368, "time_per_iteration": 2.6895225048065186 }, { "auxiliary_loss_clip": 0.01171937, "auxiliary_loss_mlp": 0.01048359, "balance_loss_clip": 1.02889192, "balance_loss_mlp": 1.04923379, "epoch": 0.2025552382383887, "flos": 21215629658880.0, "grad_norm": 1.902563999813146, "language_loss": 0.76947641, "learning_rate": 3.6088803325655907e-06, "loss": 0.79167938, "num_input_tokens_seen": 72801885, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.95703125, "step": 3369, "time_per_iteration": 2.5564088821411133 }, { "auxiliary_loss_clip": 0.01167272, "auxiliary_loss_mlp": 0.01055157, "balance_loss_clip": 1.03600013, "balance_loss_mlp": 1.04918575, "epoch": 0.20261536149105666, "flos": 14647388676480.0, "grad_norm": 4.187802060347483, "language_loss": 0.64322197, "learning_rate": 3.6086558646690284e-06, "loss": 0.66544628, "num_input_tokens_seen": 72816990, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0, "step": 3370, "time_per_iteration": 2.5831425189971924 }, { "auxiliary_loss_clip": 0.01070446, "auxiliary_loss_mlp": 0.01004293, "balance_loss_clip": 1.00182521, "balance_loss_mlp": 1.0259279, "epoch": 0.20267548474372463, "flos": 66783649921920.0, "grad_norm": 0.6766802677928653, "language_loss": 0.58088827, "learning_rate": 3.608431339363677e-06, "loss": 0.60163558, "num_input_tokens_seen": 72879240, "router_z_loss_clip": 0.0246582, "router_z_loss_mlp": 0.359375, "step": 3371, "time_per_iteration": 3.228803873062134 }, { "auxiliary_loss_clip": 0.01170928, "auxiliary_loss_mlp": 0.01043864, "balance_loss_clip": 1.02537477, "balance_loss_mlp": 1.04618669, "epoch": 0.2027356079963926, "flos": 24680793674880.0, "grad_norm": 1.8626829486192964, "language_loss": 0.91633654, "learning_rate": 3.6082067566575474e-06, "loss": 0.93848449, "num_input_tokens_seen": 72899030, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9765625, "step": 3372, "time_per_iteration": 2.592708110809326 }, { "auxiliary_loss_clip": 0.01153862, "auxiliary_loss_mlp": 0.01046732, "balance_loss_clip": 1.02764618, "balance_loss_mlp": 1.04768348, "epoch": 0.20279573124906058, "flos": 26392762765440.0, "grad_norm": 1.6504501441085317, "language_loss": 0.78321683, "learning_rate": 3.6079821165586563e-06, "loss": 0.80522275, "num_input_tokens_seen": 72919190, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.96875, "step": 3373, "time_per_iteration": 2.619561195373535 }, { "auxiliary_loss_clip": 0.01174717, "auxiliary_loss_mlp": 0.01042463, "balance_loss_clip": 1.02433181, "balance_loss_mlp": 1.04541779, "epoch": 0.20285585450172855, "flos": 33799984692480.0, "grad_norm": 1.8173833533568402, "language_loss": 0.70928818, "learning_rate": 3.6077574190750194e-06, "loss": 0.73145998, "num_input_tokens_seen": 72939720, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9296875, "step": 3374, "time_per_iteration": 2.683009147644043 }, { "auxiliary_loss_clip": 0.01080523, "auxiliary_loss_mlp": 0.01002044, "balance_loss_clip": 0.99957639, "balance_loss_mlp": 1.02680361, "epoch": 0.20291597775439651, "flos": 71164823598720.0, "grad_norm": 0.9693241319717804, "language_loss": 0.62431419, "learning_rate": 3.607532664214656e-06, "loss": 0.64513981, "num_input_tokens_seen": 73000015, "router_z_loss_clip": 0.0246582, "router_z_loss_mlp": 0.35351562, "step": 3375, "time_per_iteration": 3.174917221069336 }, { "auxiliary_loss_clip": 0.01141581, "auxiliary_loss_mlp": 0.01045154, "balance_loss_clip": 1.02664089, "balance_loss_mlp": 1.04689407, "epoch": 0.20297610100706448, "flos": 19494287118720.0, "grad_norm": 1.735009034947717, "language_loss": 0.8246305, "learning_rate": 3.6073078519855863e-06, "loss": 0.84649789, "num_input_tokens_seen": 73017675, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9453125, "step": 3376, "time_per_iteration": 2.5506503582000732 }, { "auxiliary_loss_clip": 0.01154102, "auxiliary_loss_mlp": 0.01039476, "balance_loss_clip": 1.02005672, "balance_loss_mlp": 1.04676843, "epoch": 0.20303622425973245, "flos": 25044245441280.0, "grad_norm": 2.021504292147168, "language_loss": 0.8148883, "learning_rate": 3.607082982395835e-06, "loss": 0.83682406, "num_input_tokens_seen": 73036135, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.984375, "step": 3377, "time_per_iteration": 2.6300323009490967 }, { "auxiliary_loss_clip": 0.01158742, "auxiliary_loss_mlp": 0.01055297, "balance_loss_clip": 1.03602028, "balance_loss_mlp": 1.04675877, "epoch": 0.2030963475124004, "flos": 21979988098560.0, "grad_norm": 1.9063215218858496, "language_loss": 0.76696396, "learning_rate": 3.6068580554534245e-06, "loss": 0.78910434, "num_input_tokens_seen": 73054075, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.94140625, "step": 3378, "time_per_iteration": 2.5257129669189453 }, { "auxiliary_loss_clip": 0.01190606, "auxiliary_loss_mlp": 0.01043047, "balance_loss_clip": 1.02458179, "balance_loss_mlp": 1.04920888, "epoch": 0.2031564707650684, "flos": 19500392430720.0, "grad_norm": 1.854663702659808, "language_loss": 0.79864877, "learning_rate": 3.6066330711663845e-06, "loss": 0.82098532, "num_input_tokens_seen": 73073530, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.9609375, "step": 3379, "time_per_iteration": 2.616767168045044 }, { "auxiliary_loss_clip": 0.01150192, "auxiliary_loss_mlp": 0.01039911, "balance_loss_clip": 1.02189827, "balance_loss_mlp": 1.04876065, "epoch": 0.20321659401773637, "flos": 22747075971840.0, "grad_norm": 1.7942103259435136, "language_loss": 0.86811519, "learning_rate": 3.606408029542743e-06, "loss": 0.8900162, "num_input_tokens_seen": 73092820, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.92578125, "step": 3380, "time_per_iteration": 2.528874397277832 }, { "auxiliary_loss_clip": 0.01147724, "auxiliary_loss_mlp": 0.01049058, "balance_loss_clip": 1.03031814, "balance_loss_mlp": 1.05215585, "epoch": 0.20327671727040433, "flos": 22455840499200.0, "grad_norm": 2.438979679231437, "language_loss": 0.74815893, "learning_rate": 3.60618293059053e-06, "loss": 0.77012676, "num_input_tokens_seen": 73113385, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.953125, "step": 3381, "time_per_iteration": 3.9676766395568848 }, { "auxiliary_loss_clip": 0.01163411, "auxiliary_loss_mlp": 0.01047297, "balance_loss_clip": 1.02941573, "balance_loss_mlp": 1.04774106, "epoch": 0.2033368405230723, "flos": 19535010163200.0, "grad_norm": 1.6983285553197565, "language_loss": 0.78973055, "learning_rate": 3.6059577743177803e-06, "loss": 0.81183761, "num_input_tokens_seen": 73131195, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.97265625, "step": 3382, "time_per_iteration": 2.567668914794922 }, { "auxiliary_loss_clip": 0.01175513, "auxiliary_loss_mlp": 0.01288186, "balance_loss_clip": 1.01999009, "balance_loss_mlp": 1.04862189, "epoch": 0.20339696377574026, "flos": 13809233744640.0, "grad_norm": 1.8665366072827034, "language_loss": 0.79708433, "learning_rate": 3.6057325607325293e-06, "loss": 0.82172137, "num_input_tokens_seen": 73148850, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0, "step": 3383, "time_per_iteration": 2.516946315765381 }, { "auxiliary_loss_clip": 0.01170452, "auxiliary_loss_mlp": 0.01039314, "balance_loss_clip": 1.02113438, "balance_loss_mlp": 1.04774165, "epoch": 0.20345708702840823, "flos": 20339409288960.0, "grad_norm": 1.9356040208769725, "language_loss": 0.74362755, "learning_rate": 3.605507289842813e-06, "loss": 0.76572526, "num_input_tokens_seen": 73166775, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.953125, "step": 3384, "time_per_iteration": 5.5464558601379395 }, { "auxiliary_loss_clip": 0.0116049, "auxiliary_loss_mlp": 0.01047693, "balance_loss_clip": 1.02778518, "balance_loss_mlp": 1.05021191, "epoch": 0.2035172102810762, "flos": 20333950421760.0, "grad_norm": 2.128863013909788, "language_loss": 0.75992596, "learning_rate": 3.6052819616566717e-06, "loss": 0.78200781, "num_input_tokens_seen": 73183215, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.015625, "step": 3385, "time_per_iteration": 2.6246895790100098 }, { "auxiliary_loss_clip": 0.01178852, "auxiliary_loss_mlp": 0.01062694, "balance_loss_clip": 1.04304874, "balance_loss_mlp": 1.04713464, "epoch": 0.2035773335337442, "flos": 23330983461120.0, "grad_norm": 1.7771896794620976, "language_loss": 0.68025041, "learning_rate": 3.6050565761821464e-06, "loss": 0.70266593, "num_input_tokens_seen": 73203290, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.9609375, "step": 3386, "time_per_iteration": 2.5766565799713135 }, { "auxiliary_loss_clip": 0.01184242, "auxiliary_loss_mlp": 0.01055989, "balance_loss_clip": 1.03666544, "balance_loss_mlp": 1.05093145, "epoch": 0.20363745678641215, "flos": 28330287310080.0, "grad_norm": 1.3354224952772018, "language_loss": 0.80827439, "learning_rate": 3.6048311334272806e-06, "loss": 0.83067679, "num_input_tokens_seen": 73226185, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.9765625, "step": 3387, "time_per_iteration": 4.24541449546814 }, { "auxiliary_loss_clip": 0.01159553, "auxiliary_loss_mlp": 0.01045174, "balance_loss_clip": 1.02748907, "balance_loss_mlp": 1.04959524, "epoch": 0.20369758003908012, "flos": 18915658928640.0, "grad_norm": 2.3079363132875086, "language_loss": 0.79819608, "learning_rate": 3.6046056334001195e-06, "loss": 0.82024336, "num_input_tokens_seen": 73243300, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.91796875, "step": 3388, "time_per_iteration": 2.6269965171813965 }, { "auxiliary_loss_clip": 0.01146488, "auxiliary_loss_mlp": 0.01040588, "balance_loss_clip": 1.02196717, "balance_loss_mlp": 1.04949307, "epoch": 0.20375770329174808, "flos": 19206499351680.0, "grad_norm": 1.8531327182022663, "language_loss": 0.71249139, "learning_rate": 3.604380076108711e-06, "loss": 0.73436213, "num_input_tokens_seen": 73261490, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.96875, "step": 3389, "time_per_iteration": 2.543931007385254 }, { "auxiliary_loss_clip": 0.01154045, "auxiliary_loss_mlp": 0.01045519, "balance_loss_clip": 1.02741075, "balance_loss_mlp": 1.05097628, "epoch": 0.20381782654441605, "flos": 19391008538880.0, "grad_norm": 1.7765487668113566, "language_loss": 0.87114692, "learning_rate": 3.6041544615611047e-06, "loss": 0.89314264, "num_input_tokens_seen": 73280180, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9375, "step": 3390, "time_per_iteration": 2.5517256259918213 }, { "auxiliary_loss_clip": 0.01160289, "auxiliary_loss_mlp": 0.01041006, "balance_loss_clip": 1.02336311, "balance_loss_mlp": 1.04826772, "epoch": 0.203877949797084, "flos": 24827704300800.0, "grad_norm": 4.482769124911508, "language_loss": 0.70883548, "learning_rate": 3.6039287897653523e-06, "loss": 0.73084837, "num_input_tokens_seen": 73300680, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.94140625, "step": 3391, "time_per_iteration": 2.5932559967041016 }, { "auxiliary_loss_clip": 0.01167973, "auxiliary_loss_mlp": 0.01044664, "balance_loss_clip": 1.02659202, "balance_loss_mlp": 1.04763007, "epoch": 0.20393807304975198, "flos": 18003707504640.0, "grad_norm": 2.411142595992778, "language_loss": 0.86342192, "learning_rate": 3.6037030607295063e-06, "loss": 0.88554829, "num_input_tokens_seen": 73316760, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.9375, "step": 3392, "time_per_iteration": 2.5538082122802734 }, { "auxiliary_loss_clip": 0.01156877, "auxiliary_loss_mlp": 0.01047883, "balance_loss_clip": 1.03021598, "balance_loss_mlp": 1.04963756, "epoch": 0.20399819630241997, "flos": 24206988349440.0, "grad_norm": 1.532581433452501, "language_loss": 0.80418783, "learning_rate": 3.603477274461624e-06, "loss": 0.82623541, "num_input_tokens_seen": 73339385, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.98046875, "step": 3393, "time_per_iteration": 2.6084392070770264 }, { "auxiliary_loss_clip": 0.0115882, "auxiliary_loss_mlp": 0.01037922, "balance_loss_clip": 1.02129209, "balance_loss_mlp": 1.04719615, "epoch": 0.20405831955508794, "flos": 20777124424320.0, "grad_norm": 1.6872334580725254, "language_loss": 0.86354584, "learning_rate": 3.603251430969762e-06, "loss": 0.88551331, "num_input_tokens_seen": 73357235, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.93359375, "step": 3394, "time_per_iteration": 2.6371731758117676 }, { "auxiliary_loss_clip": 0.011539, "auxiliary_loss_mlp": 0.01039967, "balance_loss_clip": 1.02286041, "balance_loss_mlp": 1.05059171, "epoch": 0.2041184428077559, "flos": 15486908325120.0, "grad_norm": 2.441219328071048, "language_loss": 0.83902138, "learning_rate": 3.603025530261981e-06, "loss": 0.86096007, "num_input_tokens_seen": 73374435, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.94140625, "step": 3395, "time_per_iteration": 2.5160577297210693 }, { "auxiliary_loss_clip": 0.01156169, "auxiliary_loss_mlp": 0.01038963, "balance_loss_clip": 1.02027071, "balance_loss_mlp": 1.04858208, "epoch": 0.20417856606042387, "flos": 15588463052160.0, "grad_norm": 1.9126504979127854, "language_loss": 0.83146095, "learning_rate": 3.602799572346342e-06, "loss": 0.85341227, "num_input_tokens_seen": 73391025, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.984375, "step": 3396, "time_per_iteration": 2.5833029747009277 }, { "auxiliary_loss_clip": 0.01169053, "auxiliary_loss_mlp": 0.01039797, "balance_loss_clip": 1.02058005, "balance_loss_mlp": 1.04778981, "epoch": 0.20423868931309183, "flos": 24279348297600.0, "grad_norm": 1.978669660876637, "language_loss": 0.76842737, "learning_rate": 3.602573557230909e-06, "loss": 0.7905159, "num_input_tokens_seen": 73409270, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.94140625, "step": 3397, "time_per_iteration": 2.579258918762207 }, { "auxiliary_loss_clip": 0.01161879, "auxiliary_loss_mlp": 0.01044786, "balance_loss_clip": 1.02809644, "balance_loss_mlp": 1.05069995, "epoch": 0.2042988125657598, "flos": 18614870438400.0, "grad_norm": 2.8023730891834724, "language_loss": 0.8698982, "learning_rate": 3.602347484923748e-06, "loss": 0.89196485, "num_input_tokens_seen": 73425225, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9296875, "step": 3398, "time_per_iteration": 2.6373119354248047 }, { "auxiliary_loss_clip": 0.01166349, "auxiliary_loss_mlp": 0.01046507, "balance_loss_clip": 1.02907872, "balance_loss_mlp": 1.05289876, "epoch": 0.2043589358184278, "flos": 17851230270720.0, "grad_norm": 2.095193971058927, "language_loss": 0.77796185, "learning_rate": 3.6021213554329277e-06, "loss": 0.80009049, "num_input_tokens_seen": 73440940, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.94921875, "step": 3399, "time_per_iteration": 2.5583040714263916 }, { "auxiliary_loss_clip": 0.01153258, "auxiliary_loss_mlp": 0.0103764, "balance_loss_clip": 1.01938868, "balance_loss_mlp": 1.04849482, "epoch": 0.20441905907109575, "flos": 21435223455360.0, "grad_norm": 1.893990868123113, "language_loss": 0.76060069, "learning_rate": 3.601895168766517e-06, "loss": 0.78250968, "num_input_tokens_seen": 73458805, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.9609375, "step": 3400, "time_per_iteration": 2.659064292907715 }, { "auxiliary_loss_clip": 0.01171214, "auxiliary_loss_mlp": 0.0129187, "balance_loss_clip": 1.02744794, "balance_loss_mlp": 1.05095613, "epoch": 0.20447918232376372, "flos": 27707703851520.0, "grad_norm": 1.6399197986336336, "language_loss": 0.79487741, "learning_rate": 3.601668924932588e-06, "loss": 0.81950825, "num_input_tokens_seen": 73479380, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9296875, "step": 3401, "time_per_iteration": 2.5884339809417725 }, { "auxiliary_loss_clip": 0.01161098, "auxiliary_loss_mlp": 0.01043021, "balance_loss_clip": 1.0246985, "balance_loss_mlp": 1.04709768, "epoch": 0.20453930557643168, "flos": 30524214113280.0, "grad_norm": 1.7253727942425638, "language_loss": 0.69356954, "learning_rate": 3.601442623939215e-06, "loss": 0.71561074, "num_input_tokens_seen": 73505105, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9609375, "step": 3402, "time_per_iteration": 2.712120771408081 }, { "auxiliary_loss_clip": 0.01174543, "auxiliary_loss_mlp": 0.0104375, "balance_loss_clip": 1.02480769, "balance_loss_mlp": 1.05121076, "epoch": 0.20459942882909965, "flos": 18987767481600.0, "grad_norm": 1.797388747856014, "language_loss": 0.8080166, "learning_rate": 3.6012162657944745e-06, "loss": 0.83019954, "num_input_tokens_seen": 73523700, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.9609375, "step": 3403, "time_per_iteration": 2.511256694793701 }, { "auxiliary_loss_clip": 0.01155356, "auxiliary_loss_mlp": 0.01041139, "balance_loss_clip": 1.0236156, "balance_loss_mlp": 1.05060351, "epoch": 0.20465955208176762, "flos": 20339050152960.0, "grad_norm": 1.850869694844682, "language_loss": 0.8305465, "learning_rate": 3.600989850506444e-06, "loss": 0.85251153, "num_input_tokens_seen": 73542625, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.95703125, "step": 3404, "time_per_iteration": 2.576261281967163 }, { "auxiliary_loss_clip": 0.01151434, "auxiliary_loss_mlp": 0.01042628, "balance_loss_clip": 1.02490139, "balance_loss_mlp": 1.04706347, "epoch": 0.20471967533443558, "flos": 21251288885760.0, "grad_norm": 1.873723294923299, "language_loss": 0.86140931, "learning_rate": 3.6007633780832043e-06, "loss": 0.8833499, "num_input_tokens_seen": 73561450, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.953125, "step": 3405, "time_per_iteration": 2.5205297470092773 }, { "auxiliary_loss_clip": 0.01168618, "auxiliary_loss_mlp": 0.01038291, "balance_loss_clip": 1.021137, "balance_loss_mlp": 1.04816151, "epoch": 0.20477979858710357, "flos": 14501555458560.0, "grad_norm": 3.0720469703934556, "language_loss": 0.84713739, "learning_rate": 3.600536848532837e-06, "loss": 0.86920655, "num_input_tokens_seen": 73577155, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.93359375, "step": 3406, "time_per_iteration": 2.5889596939086914 }, { "auxiliary_loss_clip": 0.01142189, "auxiliary_loss_mlp": 0.01039711, "balance_loss_clip": 1.02279532, "balance_loss_mlp": 1.05027819, "epoch": 0.20483992183977154, "flos": 11400310085760.0, "grad_norm": 3.011596565831763, "language_loss": 0.67349648, "learning_rate": 3.600310261863427e-06, "loss": 0.69531554, "num_input_tokens_seen": 73594900, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.91796875, "step": 3407, "time_per_iteration": 2.498975992202759 }, { "auxiliary_loss_clip": 0.01139648, "auxiliary_loss_mlp": 0.01043667, "balance_loss_clip": 1.02610779, "balance_loss_mlp": 1.04752386, "epoch": 0.2049000450924395, "flos": 19060271084160.0, "grad_norm": 2.4094166812628446, "language_loss": 0.84278905, "learning_rate": 3.6000836180830598e-06, "loss": 0.86462218, "num_input_tokens_seen": 73613810, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.91796875, "step": 3408, "time_per_iteration": 2.6910300254821777 }, { "auxiliary_loss_clip": 0.01146875, "auxiliary_loss_mlp": 0.01039205, "balance_loss_clip": 1.02193177, "balance_loss_mlp": 1.05138624, "epoch": 0.20496016834510747, "flos": 14574561851520.0, "grad_norm": 2.479133709561841, "language_loss": 0.63974953, "learning_rate": 3.5998569171998247e-06, "loss": 0.66161031, "num_input_tokens_seen": 73631495, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.953125, "step": 3409, "time_per_iteration": 2.453258752822876 }, { "auxiliary_loss_clip": 0.01150069, "auxiliary_loss_mlp": 0.01041049, "balance_loss_clip": 1.02399588, "balance_loss_mlp": 1.04736745, "epoch": 0.20502029159777543, "flos": 22126647329280.0, "grad_norm": 1.7239105848504055, "language_loss": 0.80253726, "learning_rate": 3.599630159221811e-06, "loss": 0.82444841, "num_input_tokens_seen": 73652840, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9375, "step": 3410, "time_per_iteration": 2.5522687435150146 }, { "auxiliary_loss_clip": 0.01187936, "auxiliary_loss_mlp": 0.01040536, "balance_loss_clip": 1.02382326, "balance_loss_mlp": 1.05009961, "epoch": 0.2050804148504434, "flos": 25367907916800.0, "grad_norm": 2.1596852871991503, "language_loss": 0.75737739, "learning_rate": 3.599403344157112e-06, "loss": 0.77966213, "num_input_tokens_seen": 73672150, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9296875, "step": 3411, "time_per_iteration": 2.5899085998535156 }, { "auxiliary_loss_clip": 0.01156167, "auxiliary_loss_mlp": 0.01043223, "balance_loss_clip": 1.02627182, "balance_loss_mlp": 1.05221367, "epoch": 0.2051405381031114, "flos": 23620171858560.0, "grad_norm": 2.425903374930489, "language_loss": 0.73453075, "learning_rate": 3.5991764720138214e-06, "loss": 0.75652462, "num_input_tokens_seen": 73691940, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9453125, "step": 3412, "time_per_iteration": 2.5414791107177734 }, { "auxiliary_loss_clip": 0.01164522, "auxiliary_loss_mlp": 0.01046649, "balance_loss_clip": 1.02832699, "balance_loss_mlp": 1.04825604, "epoch": 0.20520066135577936, "flos": 19565533745280.0, "grad_norm": 2.333660658829232, "language_loss": 0.77446651, "learning_rate": 3.598949542800037e-06, "loss": 0.79657817, "num_input_tokens_seen": 73709080, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.984375, "step": 3413, "time_per_iteration": 2.5388951301574707 }, { "auxiliary_loss_clip": 0.01168913, "auxiliary_loss_mlp": 0.01045891, "balance_loss_clip": 1.02811646, "balance_loss_mlp": 1.05135107, "epoch": 0.20526078460844732, "flos": 17676345928320.0, "grad_norm": 1.948228317786302, "language_loss": 0.84928191, "learning_rate": 3.5987225565238556e-06, "loss": 0.87142986, "num_input_tokens_seen": 73727670, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.90625, "step": 3414, "time_per_iteration": 2.602919816970825 }, { "auxiliary_loss_clip": 0.0114991, "auxiliary_loss_mlp": 0.01040932, "balance_loss_clip": 1.02363491, "balance_loss_mlp": 1.04694843, "epoch": 0.2053209078611153, "flos": 21500328856320.0, "grad_norm": 2.252904971318745, "language_loss": 0.80800134, "learning_rate": 3.598495513193379e-06, "loss": 0.82990974, "num_input_tokens_seen": 73747170, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.94140625, "step": 3415, "time_per_iteration": 2.598881483078003 }, { "auxiliary_loss_clip": 0.01189979, "auxiliary_loss_mlp": 0.0104094, "balance_loss_clip": 1.02539515, "balance_loss_mlp": 1.05178082, "epoch": 0.20538103111378325, "flos": 25663524848640.0, "grad_norm": 1.762479944171688, "language_loss": 0.72991943, "learning_rate": 3.5982684128167093e-06, "loss": 0.75222862, "num_input_tokens_seen": 73767690, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.9296875, "step": 3416, "time_per_iteration": 2.646332263946533 }, { "auxiliary_loss_clip": 0.01155044, "auxiliary_loss_mlp": 0.01043244, "balance_loss_clip": 1.02552927, "balance_loss_mlp": 1.05038142, "epoch": 0.20544115436645122, "flos": 23148952312320.0, "grad_norm": 2.246264280541047, "language_loss": 0.78244996, "learning_rate": 3.598041255401951e-06, "loss": 0.80443287, "num_input_tokens_seen": 73786900, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.95703125, "step": 3417, "time_per_iteration": 2.5999014377593994 }, { "auxiliary_loss_clip": 0.01154216, "auxiliary_loss_mlp": 0.01047575, "balance_loss_clip": 1.0291934, "balance_loss_mlp": 1.05114019, "epoch": 0.20550127761911918, "flos": 19390433921280.0, "grad_norm": 3.2714228136719656, "language_loss": 0.87335634, "learning_rate": 3.5978140409572105e-06, "loss": 0.8953743, "num_input_tokens_seen": 73804515, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9375, "step": 3418, "time_per_iteration": 2.54162859916687 }, { "auxiliary_loss_clip": 0.01160959, "auxiliary_loss_mlp": 0.01042885, "balance_loss_clip": 1.0255518, "balance_loss_mlp": 1.04888415, "epoch": 0.20556140087178718, "flos": 22893124671360.0, "grad_norm": 2.050941424771897, "language_loss": 0.61961353, "learning_rate": 3.597586769490598e-06, "loss": 0.64165199, "num_input_tokens_seen": 73822910, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.94140625, "step": 3419, "time_per_iteration": 2.556114912033081 }, { "auxiliary_loss_clip": 0.01167747, "auxiliary_loss_mlp": 0.01055568, "balance_loss_clip": 1.03523135, "balance_loss_mlp": 1.05167723, "epoch": 0.20562152412445514, "flos": 19789652655360.0, "grad_norm": 1.9248281998759735, "language_loss": 0.8616479, "learning_rate": 3.5973594410102218e-06, "loss": 0.88388109, "num_input_tokens_seen": 73841160, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.98046875, "step": 3420, "time_per_iteration": 2.5531764030456543 }, { "auxiliary_loss_clip": 0.0117155, "auxiliary_loss_mlp": 0.01293569, "balance_loss_clip": 1.0284903, "balance_loss_mlp": 1.05042303, "epoch": 0.2056816473771231, "flos": 31501989210240.0, "grad_norm": 3.124941362444956, "language_loss": 0.7191987, "learning_rate": 3.5971320555241967e-06, "loss": 0.74384993, "num_input_tokens_seen": 73862795, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9375, "step": 3421, "time_per_iteration": 2.7449440956115723 }, { "auxiliary_loss_clip": 0.01143125, "auxiliary_loss_mlp": 0.01039936, "balance_loss_clip": 1.0216254, "balance_loss_mlp": 1.05045509, "epoch": 0.20574177062979107, "flos": 23258372117760.0, "grad_norm": 1.9829735940078888, "language_loss": 0.70563763, "learning_rate": 3.5969046130406376e-06, "loss": 0.72746825, "num_input_tokens_seen": 73881525, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.92578125, "step": 3422, "time_per_iteration": 2.6718454360961914 }, { "auxiliary_loss_clip": 0.01073363, "auxiliary_loss_mlp": 0.01030327, "balance_loss_clip": 1.02772844, "balance_loss_mlp": 1.02965212, "epoch": 0.20580189388245904, "flos": 70312518708480.0, "grad_norm": 0.7497466858676984, "language_loss": 0.55522484, "learning_rate": 3.5966771135676596e-06, "loss": 0.5762617, "num_input_tokens_seen": 73937775, "router_z_loss_clip": 0.02600098, "router_z_loss_mlp": 0.34375, "step": 3423, "time_per_iteration": 4.545994997024536 }, { "auxiliary_loss_clip": 0.01181679, "auxiliary_loss_mlp": 0.01039669, "balance_loss_clip": 1.02132225, "balance_loss_mlp": 1.050493, "epoch": 0.205862017135127, "flos": 30737846252160.0, "grad_norm": 1.6554963080321952, "language_loss": 0.71238267, "learning_rate": 3.5964495571133835e-06, "loss": 0.73459619, "num_input_tokens_seen": 73958250, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.953125, "step": 3424, "time_per_iteration": 2.835139751434326 }, { "auxiliary_loss_clip": 0.01173983, "auxiliary_loss_mlp": 0.01295474, "balance_loss_clip": 1.03025126, "balance_loss_mlp": 1.05444169, "epoch": 0.20592214038779497, "flos": 21324546673920.0, "grad_norm": 1.4674808435773405, "language_loss": 0.75386828, "learning_rate": 3.596221943685928e-06, "loss": 0.7785629, "num_input_tokens_seen": 73977775, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.921875, "step": 3425, "time_per_iteration": 4.145843744277954 }, { "auxiliary_loss_clip": 0.01175349, "auxiliary_loss_mlp": 0.01057341, "balance_loss_clip": 1.03980541, "balance_loss_mlp": 1.05480695, "epoch": 0.20598226364046296, "flos": 22891652213760.0, "grad_norm": 2.0886450711133837, "language_loss": 0.88129067, "learning_rate": 3.5959942732934184e-06, "loss": 0.9036175, "num_input_tokens_seen": 73996590, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.93359375, "step": 3426, "time_per_iteration": 2.569070816040039 }, { "auxiliary_loss_clip": 0.01153948, "auxiliary_loss_mlp": 0.01041281, "balance_loss_clip": 1.02355504, "balance_loss_mlp": 1.05291951, "epoch": 0.20604238689313092, "flos": 23878549365120.0, "grad_norm": 1.620963957547317, "language_loss": 0.76355362, "learning_rate": 3.595766545943978e-06, "loss": 0.78550589, "num_input_tokens_seen": 74015935, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.921875, "step": 3427, "time_per_iteration": 2.54268741607666 }, { "auxiliary_loss_clip": 0.01149796, "auxiliary_loss_mlp": 0.0104926, "balance_loss_clip": 1.03067505, "balance_loss_mlp": 1.05352294, "epoch": 0.2061025101457989, "flos": 22491535639680.0, "grad_norm": 1.6201188377346951, "language_loss": 0.73800695, "learning_rate": 3.5955387616457347e-06, "loss": 0.75999749, "num_input_tokens_seen": 74036575, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9609375, "step": 3428, "time_per_iteration": 2.5280888080596924 }, { "auxiliary_loss_clip": 0.01164937, "auxiliary_loss_mlp": 0.01046563, "balance_loss_clip": 1.02684546, "balance_loss_mlp": 1.05062664, "epoch": 0.20616263339846685, "flos": 22778928357120.0, "grad_norm": 1.8018788537263606, "language_loss": 0.7331019, "learning_rate": 3.5953109204068167e-06, "loss": 0.75521696, "num_input_tokens_seen": 74055365, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.96484375, "step": 3429, "time_per_iteration": 4.070425987243652 }, { "auxiliary_loss_clip": 0.01161602, "auxiliary_loss_mlp": 0.01042756, "balance_loss_clip": 1.02421927, "balance_loss_mlp": 1.05559397, "epoch": 0.20622275665113482, "flos": 20882198684160.0, "grad_norm": 1.8001903611841614, "language_loss": 0.85677826, "learning_rate": 3.5950830222353563e-06, "loss": 0.87882185, "num_input_tokens_seen": 74074875, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.96875, "step": 3430, "time_per_iteration": 2.519779682159424 }, { "auxiliary_loss_clip": 0.01195873, "auxiliary_loss_mlp": 0.01046959, "balance_loss_clip": 1.02805281, "balance_loss_mlp": 1.05343032, "epoch": 0.20628287990380278, "flos": 19354415558400.0, "grad_norm": 2.33004769151595, "language_loss": 0.68945158, "learning_rate": 3.594855067139486e-06, "loss": 0.71187997, "num_input_tokens_seen": 74094505, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.96875, "step": 3431, "time_per_iteration": 2.6285531520843506 }, { "auxiliary_loss_clip": 0.01097506, "auxiliary_loss_mlp": 0.0100213, "balance_loss_clip": 0.9995904, "balance_loss_mlp": 1.02791619, "epoch": 0.20634300315647078, "flos": 71517932248320.0, "grad_norm": 0.8074678045810105, "language_loss": 0.60234153, "learning_rate": 3.59462705512734e-06, "loss": 0.62333786, "num_input_tokens_seen": 74158500, "router_z_loss_clip": 0.02539062, "router_z_loss_mlp": 0.33984375, "step": 3432, "time_per_iteration": 3.329939603805542 }, { "auxiliary_loss_clip": 0.01159295, "auxiliary_loss_mlp": 0.01044056, "balance_loss_clip": 1.02628219, "balance_loss_mlp": 1.05555964, "epoch": 0.20640312640913874, "flos": 21723944976000.0, "grad_norm": 1.6028196775893797, "language_loss": 0.72462136, "learning_rate": 3.594398986207056e-06, "loss": 0.74665487, "num_input_tokens_seen": 74176685, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.94921875, "step": 3433, "time_per_iteration": 2.6594607830047607 }, { "auxiliary_loss_clip": 0.01188661, "auxiliary_loss_mlp": 0.01045195, "balance_loss_clip": 1.02636051, "balance_loss_mlp": 1.05392289, "epoch": 0.2064632496618067, "flos": 20554621626240.0, "grad_norm": 3.125157925859531, "language_loss": 0.86928654, "learning_rate": 3.5941708603867747e-06, "loss": 0.89162517, "num_input_tokens_seen": 74194935, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.98046875, "step": 3434, "time_per_iteration": 2.5815930366516113 }, { "auxiliary_loss_clip": 0.0116302, "auxiliary_loss_mlp": 0.01044154, "balance_loss_clip": 1.02604604, "balance_loss_mlp": 1.05471754, "epoch": 0.20652337291447467, "flos": 29823273135360.0, "grad_norm": 1.7954803818044467, "language_loss": 0.69223654, "learning_rate": 3.5939426776746356e-06, "loss": 0.71430832, "num_input_tokens_seen": 74215400, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.9921875, "step": 3435, "time_per_iteration": 2.658745765686035 }, { "auxiliary_loss_clip": 0.01166226, "auxiliary_loss_mlp": 0.01044255, "balance_loss_clip": 1.02564681, "balance_loss_mlp": 1.05471778, "epoch": 0.20658349616714264, "flos": 26213640618240.0, "grad_norm": 1.9595907787455096, "language_loss": 0.89176381, "learning_rate": 3.593714438078782e-06, "loss": 0.91386861, "num_input_tokens_seen": 74234090, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.93359375, "step": 3436, "time_per_iteration": 2.724447011947632 }, { "auxiliary_loss_clip": 0.01177019, "auxiliary_loss_mlp": 0.01037927, "balance_loss_clip": 1.0196048, "balance_loss_mlp": 1.05440569, "epoch": 0.2066436194198106, "flos": 25994370044160.0, "grad_norm": 1.6632957286573473, "language_loss": 0.7688309, "learning_rate": 3.59348614160736e-06, "loss": 0.79098034, "num_input_tokens_seen": 74253345, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.953125, "step": 3437, "time_per_iteration": 2.6334738731384277 }, { "auxiliary_loss_clip": 0.01159128, "auxiliary_loss_mlp": 0.01041487, "balance_loss_clip": 1.02364099, "balance_loss_mlp": 1.05316949, "epoch": 0.20670374267247857, "flos": 21361067827200.0, "grad_norm": 1.998146238546077, "language_loss": 0.77576941, "learning_rate": 3.5932577882685164e-06, "loss": 0.79777563, "num_input_tokens_seen": 74271615, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.96875, "step": 3438, "time_per_iteration": 2.6548144817352295 }, { "auxiliary_loss_clip": 0.01069442, "auxiliary_loss_mlp": 0.0100433, "balance_loss_clip": 1.00176704, "balance_loss_mlp": 1.02699316, "epoch": 0.20676386592514656, "flos": 66383281952640.0, "grad_norm": 0.8336126331917579, "language_loss": 0.67256594, "learning_rate": 3.593029378070401e-06, "loss": 0.6933037, "num_input_tokens_seen": 74331390, "router_z_loss_clip": 0.02563477, "router_z_loss_mlp": 0.3359375, "step": 3439, "time_per_iteration": 3.1484551429748535 }, { "auxiliary_loss_clip": 0.01159522, "auxiliary_loss_mlp": 0.01039665, "balance_loss_clip": 1.02130675, "balance_loss_mlp": 1.05250418, "epoch": 0.20682398917781453, "flos": 17274577328640.0, "grad_norm": 2.4783672163701347, "language_loss": 0.84244514, "learning_rate": 3.5928009110211646e-06, "loss": 0.86443698, "num_input_tokens_seen": 74347335, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.98046875, "step": 3440, "time_per_iteration": 2.564786672592163 }, { "auxiliary_loss_clip": 0.0115827, "auxiliary_loss_mlp": 0.01044953, "balance_loss_clip": 1.02810884, "balance_loss_mlp": 1.05422878, "epoch": 0.2068841124304825, "flos": 18077288515200.0, "grad_norm": 1.8988854437612614, "language_loss": 0.84655201, "learning_rate": 3.592572387128961e-06, "loss": 0.86858428, "num_input_tokens_seen": 74366310, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.94921875, "step": 3441, "time_per_iteration": 2.468595266342163 }, { "auxiliary_loss_clip": 0.01182933, "auxiliary_loss_mlp": 0.01048218, "balance_loss_clip": 1.03050351, "balance_loss_mlp": 1.05358195, "epoch": 0.20694423568315046, "flos": 27347017432320.0, "grad_norm": 2.1163956827479606, "language_loss": 0.85774922, "learning_rate": 3.5923438064019457e-06, "loss": 0.88006073, "num_input_tokens_seen": 74387100, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9296875, "step": 3442, "time_per_iteration": 2.737475872039795 }, { "auxiliary_loss_clip": 0.01204997, "auxiliary_loss_mlp": 0.01047272, "balance_loss_clip": 1.02841306, "balance_loss_mlp": 1.05383003, "epoch": 0.20700435893581842, "flos": 20229845829120.0, "grad_norm": 2.0564995858870954, "language_loss": 0.73085868, "learning_rate": 3.5921151688482754e-06, "loss": 0.75338137, "num_input_tokens_seen": 74404460, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.96484375, "step": 3443, "time_per_iteration": 2.6215929985046387 }, { "auxiliary_loss_clip": 0.01165813, "auxiliary_loss_mlp": 0.01290863, "balance_loss_clip": 1.02641797, "balance_loss_mlp": 1.05406094, "epoch": 0.2070644821884864, "flos": 20631111638400.0, "grad_norm": 1.9890425549972168, "language_loss": 0.84912848, "learning_rate": 3.5918864744761106e-06, "loss": 0.87369525, "num_input_tokens_seen": 74423790, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9375, "step": 3444, "time_per_iteration": 2.617342710494995 }, { "auxiliary_loss_clip": 0.01093614, "auxiliary_loss_mlp": 0.01009924, "balance_loss_clip": 1.00762355, "balance_loss_mlp": 1.02488112, "epoch": 0.20712460544115438, "flos": 65941077617280.0, "grad_norm": 0.6872683380334477, "language_loss": 0.5706079, "learning_rate": 3.5916577232936116e-06, "loss": 0.59164327, "num_input_tokens_seen": 74488130, "router_z_loss_clip": 0.02294922, "router_z_loss_mlp": 0.33203125, "step": 3445, "time_per_iteration": 3.122022867202759 }, { "auxiliary_loss_clip": 0.01146048, "auxiliary_loss_mlp": 0.01294174, "balance_loss_clip": 1.02843618, "balance_loss_mlp": 1.0527643, "epoch": 0.20718472869382235, "flos": 19425734012160.0, "grad_norm": 1.693104300591441, "language_loss": 0.78192538, "learning_rate": 3.591428915308944e-06, "loss": 0.80632764, "num_input_tokens_seen": 74506720, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9296875, "step": 3446, "time_per_iteration": 2.5191242694854736 }, { "auxiliary_loss_clip": 0.01161294, "auxiliary_loss_mlp": 0.01047426, "balance_loss_clip": 1.02669561, "balance_loss_mlp": 1.05249524, "epoch": 0.2072448519464903, "flos": 24499049834880.0, "grad_norm": 2.6124679051265844, "language_loss": 0.62683427, "learning_rate": 3.5912000505302706e-06, "loss": 0.64892149, "num_input_tokens_seen": 74525330, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0, "step": 3447, "time_per_iteration": 2.7152233123779297 }, { "auxiliary_loss_clip": 0.01183955, "auxiliary_loss_mlp": 0.01050329, "balance_loss_clip": 1.03217316, "balance_loss_mlp": 1.05160284, "epoch": 0.20730497519915828, "flos": 23075694524160.0, "grad_norm": 4.022867633471657, "language_loss": 0.86118174, "learning_rate": 3.590971128965761e-06, "loss": 0.88352454, "num_input_tokens_seen": 74544535, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.95703125, "step": 3448, "time_per_iteration": 2.5788333415985107 }, { "auxiliary_loss_clip": 0.01163946, "auxiliary_loss_mlp": 0.01049481, "balance_loss_clip": 1.02925086, "balance_loss_mlp": 1.04948378, "epoch": 0.20736509845182624, "flos": 21069042255360.0, "grad_norm": 7.12261946028078, "language_loss": 0.75140166, "learning_rate": 3.5907421506235844e-06, "loss": 0.77353597, "num_input_tokens_seen": 74562300, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.9609375, "step": 3449, "time_per_iteration": 2.6245076656341553 }, { "auxiliary_loss_clip": 0.01167641, "auxiliary_loss_mlp": 0.01052616, "balance_loss_clip": 1.03347087, "balance_loss_mlp": 1.05062425, "epoch": 0.2074252217044942, "flos": 17633288499840.0, "grad_norm": 1.9121309550047667, "language_loss": 0.76137859, "learning_rate": 3.5905131155119124e-06, "loss": 0.78358114, "num_input_tokens_seen": 74580080, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.9921875, "step": 3450, "time_per_iteration": 2.613050699234009 }, { "auxiliary_loss_clip": 0.011746, "auxiliary_loss_mlp": 0.01048385, "balance_loss_clip": 1.0298841, "balance_loss_mlp": 1.04943013, "epoch": 0.20748534495716217, "flos": 23546985897600.0, "grad_norm": 1.7732776778815897, "language_loss": 0.8201201, "learning_rate": 3.590284023638918e-06, "loss": 0.84234995, "num_input_tokens_seen": 74598980, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.984375, "step": 3451, "time_per_iteration": 2.800588607788086 }, { "auxiliary_loss_clip": 0.01057496, "auxiliary_loss_mlp": 0.01005535, "balance_loss_clip": 1.00312734, "balance_loss_mlp": 1.02416778, "epoch": 0.20754546820983016, "flos": 52252935598080.0, "grad_norm": 0.7852457690978402, "language_loss": 0.56586814, "learning_rate": 3.5900548750127784e-06, "loss": 0.58649838, "num_input_tokens_seen": 74655275, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.33203125, "step": 3452, "time_per_iteration": 3.0582494735717773 }, { "auxiliary_loss_clip": 0.01174847, "auxiliary_loss_mlp": 0.01297135, "balance_loss_clip": 1.03041768, "balance_loss_mlp": 1.05041003, "epoch": 0.20760559146249813, "flos": 20412379768320.0, "grad_norm": 2.326456555603181, "language_loss": 0.87710935, "learning_rate": 3.5898256696416704e-06, "loss": 0.90182912, "num_input_tokens_seen": 74674560, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.97265625, "step": 3453, "time_per_iteration": 2.7119197845458984 }, { "auxiliary_loss_clip": 0.01158109, "auxiliary_loss_mlp": 0.01047229, "balance_loss_clip": 1.02726173, "balance_loss_mlp": 1.05218303, "epoch": 0.2076657147151661, "flos": 23186012169600.0, "grad_norm": 1.5757160373674624, "language_loss": 0.79955745, "learning_rate": 3.589596407533773e-06, "loss": 0.82161081, "num_input_tokens_seen": 74694500, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.96875, "step": 3454, "time_per_iteration": 2.5776560306549072 }, { "auxiliary_loss_clip": 0.01172845, "auxiliary_loss_mlp": 0.01301739, "balance_loss_clip": 1.03543067, "balance_loss_mlp": 1.04976141, "epoch": 0.20772583796783406, "flos": 18293219124480.0, "grad_norm": 3.1665061486530957, "language_loss": 0.75925034, "learning_rate": 3.589367088697269e-06, "loss": 0.78399616, "num_input_tokens_seen": 74710485, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9609375, "step": 3455, "time_per_iteration": 2.5446698665618896 }, { "auxiliary_loss_clip": 0.01172653, "auxiliary_loss_mlp": 0.01046863, "balance_loss_clip": 1.02937543, "balance_loss_mlp": 1.0502491, "epoch": 0.20778596122050202, "flos": 17602800831360.0, "grad_norm": 1.788669418792094, "language_loss": 0.80372953, "learning_rate": 3.5891377131403423e-06, "loss": 0.82592475, "num_input_tokens_seen": 74727450, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.953125, "step": 3456, "time_per_iteration": 2.562757968902588 }, { "auxiliary_loss_clip": 0.01186308, "auxiliary_loss_mlp": 0.01291166, "balance_loss_clip": 1.02421153, "balance_loss_mlp": 1.0529331, "epoch": 0.20784608447317, "flos": 23805578885760.0, "grad_norm": 1.6562401526354555, "language_loss": 0.78113532, "learning_rate": 3.5889082808711776e-06, "loss": 0.80591011, "num_input_tokens_seen": 74746725, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.97265625, "step": 3457, "time_per_iteration": 2.6239240169525146 }, { "auxiliary_loss_clip": 0.01195633, "auxiliary_loss_mlp": 0.01057338, "balance_loss_clip": 1.03590453, "balance_loss_mlp": 1.05086029, "epoch": 0.20790620772583795, "flos": 17639286071040.0, "grad_norm": 1.9479346902621382, "language_loss": 0.83123362, "learning_rate": 3.5886787918979645e-06, "loss": 0.85376334, "num_input_tokens_seen": 74765255, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.99609375, "step": 3458, "time_per_iteration": 2.5835928916931152 }, { "auxiliary_loss_clip": 0.01152179, "auxiliary_loss_mlp": 0.01285292, "balance_loss_clip": 1.01949787, "balance_loss_mlp": 1.05031157, "epoch": 0.20796633097850595, "flos": 27673481168640.0, "grad_norm": 1.8805606964021904, "language_loss": 0.75871849, "learning_rate": 3.588449246228891e-06, "loss": 0.78309321, "num_input_tokens_seen": 74785710, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9296875, "step": 3459, "time_per_iteration": 2.5921709537506104 }, { "auxiliary_loss_clip": 0.01185226, "auxiliary_loss_mlp": 0.01037859, "balance_loss_clip": 1.02009726, "balance_loss_mlp": 1.04762113, "epoch": 0.2080264542311739, "flos": 19245606284160.0, "grad_norm": 2.413062445337088, "language_loss": 0.76911545, "learning_rate": 3.5882196438721504e-06, "loss": 0.79134637, "num_input_tokens_seen": 74804490, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.921875, "step": 3460, "time_per_iteration": 2.6085753440856934 }, { "auxiliary_loss_clip": 0.01190728, "auxiliary_loss_mlp": 0.01042035, "balance_loss_clip": 1.0239507, "balance_loss_mlp": 1.04928708, "epoch": 0.20808657748384188, "flos": 27525924097920.0, "grad_norm": 2.057257341635883, "language_loss": 0.75729084, "learning_rate": 3.5879899848359367e-06, "loss": 0.7796185, "num_input_tokens_seen": 74826340, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.9609375, "step": 3461, "time_per_iteration": 2.6620240211486816 }, { "auxiliary_loss_clip": 0.01162597, "auxiliary_loss_mlp": 0.0103896, "balance_loss_clip": 1.01958883, "balance_loss_mlp": 1.04515624, "epoch": 0.20814670073650984, "flos": 26906931999360.0, "grad_norm": 3.0267105051857883, "language_loss": 0.88210583, "learning_rate": 3.587760269128444e-06, "loss": 0.9041214, "num_input_tokens_seen": 74844960, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.9921875, "step": 3462, "time_per_iteration": 2.681034564971924 }, { "auxiliary_loss_clip": 0.01088655, "auxiliary_loss_mlp": 0.01248588, "balance_loss_clip": 0.99877805, "balance_loss_mlp": 1.01983047, "epoch": 0.2082068239891778, "flos": 70175735717760.0, "grad_norm": 0.7543530411087085, "language_loss": 0.59038693, "learning_rate": 3.587530496757872e-06, "loss": 0.61375934, "num_input_tokens_seen": 74909075, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.328125, "step": 3463, "time_per_iteration": 3.2880029678344727 }, { "auxiliary_loss_clip": 0.01162543, "auxiliary_loss_mlp": 0.01048455, "balance_loss_clip": 1.0292387, "balance_loss_mlp": 1.04623258, "epoch": 0.20826694724184577, "flos": 24608074590720.0, "grad_norm": 2.8567133098000075, "language_loss": 0.66258562, "learning_rate": 3.5873006677324204e-06, "loss": 0.6846956, "num_input_tokens_seen": 74928125, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.984375, "step": 3464, "time_per_iteration": 3.9942548274993896 }, { "auxiliary_loss_clip": 0.0117395, "auxiliary_loss_mlp": 0.01049037, "balance_loss_clip": 1.02898598, "balance_loss_mlp": 1.04805565, "epoch": 0.20832707049451377, "flos": 12892829034240.0, "grad_norm": 2.044921949853412, "language_loss": 0.84243292, "learning_rate": 3.587070782060291e-06, "loss": 0.86466277, "num_input_tokens_seen": 74945090, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.9921875, "step": 3465, "time_per_iteration": 2.5783443450927734 }, { "auxiliary_loss_clip": 0.01179494, "auxiliary_loss_mlp": 0.01045403, "balance_loss_clip": 1.02722383, "balance_loss_mlp": 1.04764128, "epoch": 0.20838719374718173, "flos": 22198827709440.0, "grad_norm": 3.124069769235545, "language_loss": 0.81987929, "learning_rate": 3.5868408397496874e-06, "loss": 0.84212828, "num_input_tokens_seen": 74963630, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.95703125, "step": 3466, "time_per_iteration": 2.6302273273468018 }, { "auxiliary_loss_clip": 0.0115145, "auxiliary_loss_mlp": 0.01043206, "balance_loss_clip": 1.02736354, "balance_loss_mlp": 1.04966009, "epoch": 0.2084473169998497, "flos": 15158648908800.0, "grad_norm": 1.8216564092419787, "language_loss": 0.82145727, "learning_rate": 3.5866108408088166e-06, "loss": 0.84340382, "num_input_tokens_seen": 74981875, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.9296875, "step": 3467, "time_per_iteration": 4.088305473327637 }, { "auxiliary_loss_clip": 0.01141011, "auxiliary_loss_mlp": 0.01039862, "balance_loss_clip": 1.02330399, "balance_loss_mlp": 1.05091131, "epoch": 0.20850744025251766, "flos": 17456788045440.0, "grad_norm": 1.9431135077402384, "language_loss": 0.81481248, "learning_rate": 3.5863807852458858e-06, "loss": 0.83662117, "num_input_tokens_seen": 74999155, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8984375, "step": 3468, "time_per_iteration": 2.484445333480835 }, { "auxiliary_loss_clip": 0.01157194, "auxiliary_loss_mlp": 0.01055092, "balance_loss_clip": 1.0345639, "balance_loss_mlp": 1.04955292, "epoch": 0.20856756350518563, "flos": 25698968593920.0, "grad_norm": 1.6988148356465973, "language_loss": 0.90012765, "learning_rate": 3.5861506730691054e-06, "loss": 0.92225051, "num_input_tokens_seen": 75017850, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.98828125, "step": 3469, "time_per_iteration": 2.570640802383423 }, { "auxiliary_loss_clip": 0.01164505, "auxiliary_loss_mlp": 0.01050713, "balance_loss_clip": 1.03194892, "balance_loss_mlp": 1.05067658, "epoch": 0.2086276867578536, "flos": 37889060970240.0, "grad_norm": 2.4689376467293433, "language_loss": 0.76534724, "learning_rate": 3.5859205042866877e-06, "loss": 0.78749943, "num_input_tokens_seen": 75039270, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.953125, "step": 3470, "time_per_iteration": 4.134554862976074 }, { "auxiliary_loss_clip": 0.01160913, "auxiliary_loss_mlp": 0.01048793, "balance_loss_clip": 1.03081679, "balance_loss_mlp": 1.05013919, "epoch": 0.20868781001052156, "flos": 25557049958400.0, "grad_norm": 1.6700570603727134, "language_loss": 0.75769341, "learning_rate": 3.5856902789068465e-06, "loss": 0.77979046, "num_input_tokens_seen": 75059350, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9296875, "step": 3471, "time_per_iteration": 2.566964626312256 }, { "auxiliary_loss_clip": 0.01172806, "auxiliary_loss_mlp": 0.01055283, "balance_loss_clip": 1.0358758, "balance_loss_mlp": 1.04730666, "epoch": 0.20874793326318955, "flos": 27529192235520.0, "grad_norm": 1.7859323047040105, "language_loss": 0.75705016, "learning_rate": 3.585459996937798e-06, "loss": 0.77933109, "num_input_tokens_seen": 75080150, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.98828125, "step": 3472, "time_per_iteration": 2.6475582122802734 }, { "auxiliary_loss_clip": 0.01152296, "auxiliary_loss_mlp": 0.01050003, "balance_loss_clip": 1.03247881, "balance_loss_mlp": 1.04882908, "epoch": 0.20880805651585752, "flos": 18548795370240.0, "grad_norm": 2.0188748003409427, "language_loss": 0.84283137, "learning_rate": 3.585229658387761e-06, "loss": 0.86485434, "num_input_tokens_seen": 75097920, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.94140625, "step": 3473, "time_per_iteration": 2.5109963417053223 }, { "auxiliary_loss_clip": 0.0107899, "auxiliary_loss_mlp": 0.01032687, "balance_loss_clip": 1.03076792, "balance_loss_mlp": 1.01907325, "epoch": 0.20886817976852548, "flos": 65946644225280.0, "grad_norm": 0.8974163530347975, "language_loss": 0.63692176, "learning_rate": 3.5849992632649552e-06, "loss": 0.65803856, "num_input_tokens_seen": 75152410, "router_z_loss_clip": 0.01916504, "router_z_loss_mlp": 0.328125, "step": 3474, "time_per_iteration": 3.108168601989746 }, { "auxiliary_loss_clip": 0.01166208, "auxiliary_loss_mlp": 0.0104572, "balance_loss_clip": 1.02786279, "balance_loss_mlp": 1.05121446, "epoch": 0.20892830302119345, "flos": 36539178929280.0, "grad_norm": 2.531289355945897, "language_loss": 0.69588721, "learning_rate": 3.5847688115776024e-06, "loss": 0.71800655, "num_input_tokens_seen": 75173265, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.96875, "step": 3475, "time_per_iteration": 2.6608572006225586 }, { "auxiliary_loss_clip": 0.01146399, "auxiliary_loss_mlp": 0.01046437, "balance_loss_clip": 1.02874684, "balance_loss_mlp": 1.05076432, "epoch": 0.2089884262738614, "flos": 20956749361920.0, "grad_norm": 2.065352631801958, "language_loss": 0.6975795, "learning_rate": 3.5845383033339274e-06, "loss": 0.71950781, "num_input_tokens_seen": 75193640, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.95703125, "step": 3476, "time_per_iteration": 2.5424282550811768 }, { "auxiliary_loss_clip": 0.01163702, "auxiliary_loss_mlp": 0.01048745, "balance_loss_clip": 1.03105462, "balance_loss_mlp": 1.05091667, "epoch": 0.20904854952652938, "flos": 22784028088320.0, "grad_norm": 1.9041670203803758, "language_loss": 0.89082998, "learning_rate": 3.584307738542156e-06, "loss": 0.91295445, "num_input_tokens_seen": 75212545, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.94921875, "step": 3477, "time_per_iteration": 2.5725982189178467 }, { "auxiliary_loss_clip": 0.01177647, "auxiliary_loss_mlp": 0.01046489, "balance_loss_clip": 1.0284524, "balance_loss_mlp": 1.04753447, "epoch": 0.20910867277919734, "flos": 27303277645440.0, "grad_norm": 2.3774072810087956, "language_loss": 0.67477185, "learning_rate": 3.5840771172105174e-06, "loss": 0.6970132, "num_input_tokens_seen": 75230865, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9375, "step": 3478, "time_per_iteration": 2.5800044536590576 }, { "auxiliary_loss_clip": 0.01170787, "auxiliary_loss_mlp": 0.01044884, "balance_loss_clip": 1.02654982, "balance_loss_mlp": 1.04849792, "epoch": 0.20916879603186533, "flos": 14319237000960.0, "grad_norm": 2.106680484539863, "language_loss": 0.84754622, "learning_rate": 3.5838464393472406e-06, "loss": 0.86970294, "num_input_tokens_seen": 75248285, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.94921875, "step": 3479, "time_per_iteration": 2.5326552391052246 }, { "auxiliary_loss_clip": 0.01172704, "auxiliary_loss_mlp": 0.01048377, "balance_loss_clip": 1.03003025, "balance_loss_mlp": 1.04986966, "epoch": 0.2092289192845333, "flos": 22273019251200.0, "grad_norm": 2.3924647560162553, "language_loss": 0.74170732, "learning_rate": 3.5836157049605587e-06, "loss": 0.76391816, "num_input_tokens_seen": 75266310, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.953125, "step": 3480, "time_per_iteration": 2.551243543624878 }, { "auxiliary_loss_clip": 0.01166608, "auxiliary_loss_mlp": 0.01045557, "balance_loss_clip": 1.02977395, "balance_loss_mlp": 1.04792047, "epoch": 0.20928904253720126, "flos": 14830712714880.0, "grad_norm": 4.0587809919246265, "language_loss": 0.75357282, "learning_rate": 3.5833849140587057e-06, "loss": 0.77569449, "num_input_tokens_seen": 75284175, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.921875, "step": 3481, "time_per_iteration": 2.5419375896453857 }, { "auxiliary_loss_clip": 0.01180107, "auxiliary_loss_mlp": 0.01040575, "balance_loss_clip": 1.02324235, "balance_loss_mlp": 1.04912615, "epoch": 0.20934916578986923, "flos": 23259162216960.0, "grad_norm": 2.1046056782181912, "language_loss": 0.84799993, "learning_rate": 3.583154066649918e-06, "loss": 0.87020683, "num_input_tokens_seen": 75303465, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9453125, "step": 3482, "time_per_iteration": 2.561084032058716 }, { "auxiliary_loss_clip": 0.01189431, "auxiliary_loss_mlp": 0.01045217, "balance_loss_clip": 1.02762127, "balance_loss_mlp": 1.05079949, "epoch": 0.2094092890425372, "flos": 32014398677760.0, "grad_norm": 1.8573324966192826, "language_loss": 0.70755672, "learning_rate": 3.5829231627424345e-06, "loss": 0.72990322, "num_input_tokens_seen": 75325290, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.94140625, "step": 3483, "time_per_iteration": 2.7106356620788574 }, { "auxiliary_loss_clip": 0.0116349, "auxiliary_loss_mlp": 0.01049081, "balance_loss_clip": 1.03100872, "balance_loss_mlp": 1.04852867, "epoch": 0.20946941229520516, "flos": 20010647082240.0, "grad_norm": 1.6386960955873415, "language_loss": 0.75062627, "learning_rate": 3.5826922023444945e-06, "loss": 0.77275199, "num_input_tokens_seen": 75343895, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.96875, "step": 3484, "time_per_iteration": 2.533182382583618 }, { "auxiliary_loss_clip": 0.01163014, "auxiliary_loss_mlp": 0.01040164, "balance_loss_clip": 1.02241373, "balance_loss_mlp": 1.05193222, "epoch": 0.20952953554787315, "flos": 30740072895360.0, "grad_norm": 1.7046496538094622, "language_loss": 0.70302117, "learning_rate": 3.582461185464342e-06, "loss": 0.72505295, "num_input_tokens_seen": 75367100, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9296875, "step": 3485, "time_per_iteration": 2.6193182468414307 }, { "auxiliary_loss_clip": 0.0118519, "auxiliary_loss_mlp": 0.01287588, "balance_loss_clip": 1.02114892, "balance_loss_mlp": 1.05233359, "epoch": 0.20958965880054112, "flos": 27049209770880.0, "grad_norm": 1.8451681673078393, "language_loss": 0.82496512, "learning_rate": 3.5822301121102195e-06, "loss": 0.84969294, "num_input_tokens_seen": 75389925, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9609375, "step": 3486, "time_per_iteration": 2.593055009841919 }, { "auxiliary_loss_clip": 0.01152211, "auxiliary_loss_mlp": 0.0105016, "balance_loss_clip": 1.03368545, "balance_loss_mlp": 1.04903507, "epoch": 0.20964978205320908, "flos": 34204123589760.0, "grad_norm": 1.6642488491760612, "language_loss": 0.86750543, "learning_rate": 3.5819989822903744e-06, "loss": 0.88952917, "num_input_tokens_seen": 75408575, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.94140625, "step": 3487, "time_per_iteration": 2.696930170059204 }, { "auxiliary_loss_clip": 0.01170357, "auxiliary_loss_mlp": 0.01041117, "balance_loss_clip": 1.02278292, "balance_loss_mlp": 1.04865086, "epoch": 0.20970990530587705, "flos": 23477391296640.0, "grad_norm": 2.516166687226968, "language_loss": 0.71746731, "learning_rate": 3.5817677960130547e-06, "loss": 0.73958206, "num_input_tokens_seen": 75427155, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9453125, "step": 3488, "time_per_iteration": 2.569045066833496 }, { "auxiliary_loss_clip": 0.01164821, "auxiliary_loss_mlp": 0.01038797, "balance_loss_clip": 1.02204776, "balance_loss_mlp": 1.05263352, "epoch": 0.209770028558545, "flos": 18551452976640.0, "grad_norm": 2.3563596312947155, "language_loss": 0.81068432, "learning_rate": 3.5815365532865113e-06, "loss": 0.83272058, "num_input_tokens_seen": 75444450, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9375, "step": 3489, "time_per_iteration": 2.5616278648376465 }, { "auxiliary_loss_clip": 0.01145492, "auxiliary_loss_mlp": 0.01044128, "balance_loss_clip": 1.02749872, "balance_loss_mlp": 1.05072045, "epoch": 0.20983015181121298, "flos": 21617003208960.0, "grad_norm": 1.8305575654646535, "language_loss": 0.72191644, "learning_rate": 3.5813052541189972e-06, "loss": 0.74381268, "num_input_tokens_seen": 75462625, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.94921875, "step": 3490, "time_per_iteration": 2.5218732357025146 }, { "auxiliary_loss_clip": 0.01160325, "auxiliary_loss_mlp": 0.01049135, "balance_loss_clip": 1.0328505, "balance_loss_mlp": 1.05251324, "epoch": 0.20989027506388094, "flos": 16614718531200.0, "grad_norm": 2.7133880226129943, "language_loss": 0.70572412, "learning_rate": 3.581073898518766e-06, "loss": 0.72781873, "num_input_tokens_seen": 75480640, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8984375, "step": 3491, "time_per_iteration": 2.6124837398529053 }, { "auxiliary_loss_clip": 0.01153279, "auxiliary_loss_mlp": 0.01289875, "balance_loss_clip": 1.02390718, "balance_loss_mlp": 1.04926658, "epoch": 0.20995039831654894, "flos": 23216823060480.0, "grad_norm": 2.6830762079024253, "language_loss": 0.79530025, "learning_rate": 3.5808424864940737e-06, "loss": 0.81973183, "num_input_tokens_seen": 75494900, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.953125, "step": 3492, "time_per_iteration": 2.483776569366455 }, { "auxiliary_loss_clip": 0.01151655, "auxiliary_loss_mlp": 0.01284244, "balance_loss_clip": 1.01947832, "balance_loss_mlp": 1.04868126, "epoch": 0.2100105215692169, "flos": 18147493647360.0, "grad_norm": 2.0977370071960406, "language_loss": 0.86319065, "learning_rate": 3.5806110180531797e-06, "loss": 0.88754964, "num_input_tokens_seen": 75513370, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.94140625, "step": 3493, "time_per_iteration": 2.5775341987609863 }, { "auxiliary_loss_clip": 0.01156734, "auxiliary_loss_mlp": 0.01037948, "balance_loss_clip": 1.02183115, "balance_loss_mlp": 1.04764056, "epoch": 0.21007064482188487, "flos": 15961611490560.0, "grad_norm": 1.817716264821867, "language_loss": 0.68452108, "learning_rate": 3.5803794932043447e-06, "loss": 0.70646799, "num_input_tokens_seen": 75532480, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.90625, "step": 3494, "time_per_iteration": 2.569254159927368 }, { "auxiliary_loss_clip": 0.01159829, "auxiliary_loss_mlp": 0.0103985, "balance_loss_clip": 1.02283907, "balance_loss_mlp": 1.05304444, "epoch": 0.21013076807455283, "flos": 32234315696640.0, "grad_norm": 2.1117659878102004, "language_loss": 0.78155017, "learning_rate": 3.58014791195583e-06, "loss": 0.80354697, "num_input_tokens_seen": 75552745, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.98046875, "step": 3495, "time_per_iteration": 2.7886195182800293 }, { "auxiliary_loss_clip": 0.01159758, "auxiliary_loss_mlp": 0.01041995, "balance_loss_clip": 1.02492416, "balance_loss_mlp": 1.04763877, "epoch": 0.2101908913272208, "flos": 23696625957120.0, "grad_norm": 1.711160201789661, "language_loss": 0.77299714, "learning_rate": 3.579916274315902e-06, "loss": 0.79501474, "num_input_tokens_seen": 75574355, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9453125, "step": 3496, "time_per_iteration": 2.624907970428467 }, { "auxiliary_loss_clip": 0.0116403, "auxiliary_loss_mlp": 0.01052791, "balance_loss_clip": 1.03448057, "balance_loss_mlp": 1.04896188, "epoch": 0.21025101457988876, "flos": 20375786787840.0, "grad_norm": 1.8990293223719268, "language_loss": 0.8171984, "learning_rate": 3.5796845802928254e-06, "loss": 0.83936661, "num_input_tokens_seen": 75592215, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.96875, "step": 3497, "time_per_iteration": 2.5857319831848145 }, { "auxiliary_loss_clip": 0.01156904, "auxiliary_loss_mlp": 0.01044706, "balance_loss_clip": 1.02727747, "balance_loss_mlp": 1.05186105, "epoch": 0.21031113783255675, "flos": 25775638174080.0, "grad_norm": 1.75496949907929, "language_loss": 0.66890073, "learning_rate": 3.5794528298948696e-06, "loss": 0.6909169, "num_input_tokens_seen": 75610740, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9609375, "step": 3498, "time_per_iteration": 2.5712833404541016 }, { "auxiliary_loss_clip": 0.01173817, "auxiliary_loss_mlp": 0.01042139, "balance_loss_clip": 1.02449572, "balance_loss_mlp": 1.04912007, "epoch": 0.21037126108522472, "flos": 22018197191040.0, "grad_norm": 2.2176876581787908, "language_loss": 0.80622411, "learning_rate": 3.579221023130306e-06, "loss": 0.82838368, "num_input_tokens_seen": 75631005, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9765625, "step": 3499, "time_per_iteration": 2.6572213172912598 }, { "auxiliary_loss_clip": 0.0115267, "auxiliary_loss_mlp": 0.01044226, "balance_loss_clip": 1.02748954, "balance_loss_mlp": 1.05068851, "epoch": 0.21043138433789269, "flos": 25334403505920.0, "grad_norm": 2.6497303432040926, "language_loss": 0.78091776, "learning_rate": 3.578989160007405e-06, "loss": 0.80288672, "num_input_tokens_seen": 75650655, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9296875, "step": 3500, "time_per_iteration": 2.6152029037475586 }, { "auxiliary_loss_clip": 0.01160435, "auxiliary_loss_mlp": 0.01043411, "balance_loss_clip": 1.02613759, "balance_loss_mlp": 1.04849601, "epoch": 0.21049150759056065, "flos": 25556654908800.0, "grad_norm": 1.6825882039792706, "language_loss": 0.7356571, "learning_rate": 3.5787572405344437e-06, "loss": 0.75769556, "num_input_tokens_seen": 75669895, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.94140625, "step": 3501, "time_per_iteration": 2.62373685836792 }, { "auxiliary_loss_clip": 0.01150163, "auxiliary_loss_mlp": 0.01039806, "balance_loss_clip": 1.02290189, "balance_loss_mlp": 1.04863715, "epoch": 0.21055163084322862, "flos": 24495602129280.0, "grad_norm": 1.707800356228631, "language_loss": 0.7521174, "learning_rate": 3.578525264719697e-06, "loss": 0.7740171, "num_input_tokens_seen": 75689535, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.92578125, "step": 3502, "time_per_iteration": 2.5481648445129395 }, { "auxiliary_loss_clip": 0.01179337, "auxiliary_loss_mlp": 0.01038847, "balance_loss_clip": 1.02133465, "balance_loss_mlp": 1.04901838, "epoch": 0.21061175409589658, "flos": 25739045193600.0, "grad_norm": 2.4723083002253756, "language_loss": 0.77116537, "learning_rate": 3.578293232571444e-06, "loss": 0.79334712, "num_input_tokens_seen": 75709265, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.94140625, "step": 3503, "time_per_iteration": 2.6575238704681396 }, { "auxiliary_loss_clip": 0.01192486, "auxiliary_loss_mlp": 0.01042433, "balance_loss_clip": 1.02308559, "balance_loss_mlp": 1.04889107, "epoch": 0.21067187734856455, "flos": 18989168112000.0, "grad_norm": 2.755922601486328, "language_loss": 0.77763188, "learning_rate": 3.5780611440979655e-06, "loss": 0.79998112, "num_input_tokens_seen": 75727050, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.98046875, "step": 3504, "time_per_iteration": 2.641819953918457 }, { "auxiliary_loss_clip": 0.01173032, "auxiliary_loss_mlp": 0.01050364, "balance_loss_clip": 1.03221989, "balance_loss_mlp": 1.04806268, "epoch": 0.21073200060123254, "flos": 24681368292480.0, "grad_norm": 2.8492003036130216, "language_loss": 0.76588106, "learning_rate": 3.5778289993075442e-06, "loss": 0.78811491, "num_input_tokens_seen": 75747175, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9765625, "step": 3505, "time_per_iteration": 2.6127774715423584 }, { "auxiliary_loss_clip": 0.01158557, "auxiliary_loss_mlp": 0.010485, "balance_loss_clip": 1.03101254, "balance_loss_mlp": 1.04877687, "epoch": 0.2107921238539005, "flos": 28549342402560.0, "grad_norm": 2.405875417874078, "language_loss": 0.63409054, "learning_rate": 3.5775967982084644e-06, "loss": 0.65616107, "num_input_tokens_seen": 75767690, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9140625, "step": 3506, "time_per_iteration": 3.9942030906677246 }, { "auxiliary_loss_clip": 0.01151544, "auxiliary_loss_mlp": 0.01047724, "balance_loss_clip": 1.03062928, "balance_loss_mlp": 1.04943621, "epoch": 0.21085224710656847, "flos": 25885848078720.0, "grad_norm": 1.7524112535360274, "language_loss": 0.81108391, "learning_rate": 3.5773645408090126e-06, "loss": 0.83307666, "num_input_tokens_seen": 75787255, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9296875, "step": 3507, "time_per_iteration": 2.6168010234832764 }, { "auxiliary_loss_clip": 0.01151553, "auxiliary_loss_mlp": 0.01046315, "balance_loss_clip": 1.02829075, "balance_loss_mlp": 1.04862189, "epoch": 0.21091237035923643, "flos": 14976294537600.0, "grad_norm": 2.0599346552823192, "language_loss": 0.75355816, "learning_rate": 3.577132227117478e-06, "loss": 0.77553684, "num_input_tokens_seen": 75805890, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.94140625, "step": 3508, "time_per_iteration": 2.6008002758026123 }, { "auxiliary_loss_clip": 0.01145208, "auxiliary_loss_mlp": 0.01042156, "balance_loss_clip": 1.02477527, "balance_loss_mlp": 1.04962599, "epoch": 0.2109724936119044, "flos": 16362518163840.0, "grad_norm": 2.5595722966665178, "language_loss": 0.84975386, "learning_rate": 3.576899857142152e-06, "loss": 0.87162751, "num_input_tokens_seen": 75821620, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.95703125, "step": 3509, "time_per_iteration": 4.048318862915039 }, { "auxiliary_loss_clip": 0.01175797, "auxiliary_loss_mlp": 0.01046856, "balance_loss_clip": 1.02848577, "balance_loss_mlp": 1.05095291, "epoch": 0.21103261686457236, "flos": 31502492000640.0, "grad_norm": 1.833045150863064, "language_loss": 0.68031502, "learning_rate": 3.5766674308913254e-06, "loss": 0.70254159, "num_input_tokens_seen": 75842490, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9765625, "step": 3510, "time_per_iteration": 2.6156115531921387 }, { "auxiliary_loss_clip": 0.01144587, "auxiliary_loss_mlp": 0.01041065, "balance_loss_clip": 1.0241251, "balance_loss_mlp": 1.04870462, "epoch": 0.21109274011724033, "flos": 27344072517120.0, "grad_norm": 1.6913636438243145, "language_loss": 0.71939164, "learning_rate": 3.5764349483732937e-06, "loss": 0.74124807, "num_input_tokens_seen": 75865985, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9609375, "step": 3511, "time_per_iteration": 4.076263427734375 }, { "auxiliary_loss_clip": 0.01211722, "auxiliary_loss_mlp": 0.01042505, "balance_loss_clip": 1.02320504, "balance_loss_mlp": 1.05072343, "epoch": 0.21115286336990832, "flos": 17820383466240.0, "grad_norm": 2.4006392381054202, "language_loss": 0.68755037, "learning_rate": 3.5762024095963543e-06, "loss": 0.7100926, "num_input_tokens_seen": 75882745, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.98046875, "step": 3512, "time_per_iteration": 2.6335015296936035 }, { "auxiliary_loss_clip": 0.01152176, "auxiliary_loss_mlp": 0.01047816, "balance_loss_clip": 1.02919555, "balance_loss_mlp": 1.04669797, "epoch": 0.2112129866225763, "flos": 27197987904000.0, "grad_norm": 1.942594363715605, "language_loss": 0.73281801, "learning_rate": 3.575969814568805e-06, "loss": 0.75481784, "num_input_tokens_seen": 75904305, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.96875, "step": 3513, "time_per_iteration": 2.596142530441284 }, { "auxiliary_loss_clip": 0.01158727, "auxiliary_loss_mlp": 0.01038161, "balance_loss_clip": 1.0217216, "balance_loss_mlp": 1.050017, "epoch": 0.21127310987524425, "flos": 23731279603200.0, "grad_norm": 2.026921151974278, "language_loss": 0.74370217, "learning_rate": 3.5757371632989477e-06, "loss": 0.76567101, "num_input_tokens_seen": 75923710, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.90625, "step": 3514, "time_per_iteration": 2.6693172454833984 }, { "auxiliary_loss_clip": 0.01166003, "auxiliary_loss_mlp": 0.01036734, "balance_loss_clip": 1.01931703, "balance_loss_mlp": 1.05152392, "epoch": 0.21133323312791222, "flos": 18332505624960.0, "grad_norm": 1.8310464304411986, "language_loss": 0.76906693, "learning_rate": 3.5755044557950832e-06, "loss": 0.79109424, "num_input_tokens_seen": 75942625, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.96484375, "step": 3515, "time_per_iteration": 2.607797145843506 }, { "auxiliary_loss_clip": 0.01163269, "auxiliary_loss_mlp": 0.01044502, "balance_loss_clip": 1.02752614, "balance_loss_mlp": 1.05203843, "epoch": 0.21139335638058018, "flos": 17931203902080.0, "grad_norm": 1.8854659091198631, "language_loss": 0.68445355, "learning_rate": 3.575271692065518e-06, "loss": 0.70653129, "num_input_tokens_seen": 75959930, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9296875, "step": 3516, "time_per_iteration": 2.642810344696045 }, { "auxiliary_loss_clip": 0.01174329, "auxiliary_loss_mlp": 0.01050157, "balance_loss_clip": 1.03222823, "balance_loss_mlp": 1.0509721, "epoch": 0.21145347963324815, "flos": 24572092141440.0, "grad_norm": 1.8540420822206756, "language_loss": 0.84784812, "learning_rate": 3.575038872118558e-06, "loss": 0.87009299, "num_input_tokens_seen": 75980335, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.96484375, "step": 3517, "time_per_iteration": 2.607964038848877 }, { "auxiliary_loss_clip": 0.01162454, "auxiliary_loss_mlp": 0.01041237, "balance_loss_clip": 1.02384472, "balance_loss_mlp": 1.05049503, "epoch": 0.21151360288591614, "flos": 35845959375360.0, "grad_norm": 2.027134510368529, "language_loss": 0.62559712, "learning_rate": 3.5748059959625122e-06, "loss": 0.64763409, "num_input_tokens_seen": 76002095, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.94140625, "step": 3518, "time_per_iteration": 2.8171746730804443 }, { "auxiliary_loss_clip": 0.01191141, "auxiliary_loss_mlp": 0.01053032, "balance_loss_clip": 1.03631938, "balance_loss_mlp": 1.05263352, "epoch": 0.2115737261385841, "flos": 24641579001600.0, "grad_norm": 1.8979242971687158, "language_loss": 0.89017087, "learning_rate": 3.574573063605691e-06, "loss": 0.91261262, "num_input_tokens_seen": 76020425, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.93359375, "step": 3519, "time_per_iteration": 2.6332790851593018 }, { "auxiliary_loss_clip": 0.01165129, "auxiliary_loss_mlp": 0.0104643, "balance_loss_clip": 1.02919257, "balance_loss_mlp": 1.05170166, "epoch": 0.21163384939125207, "flos": 25226887121280.0, "grad_norm": 3.9806227502184295, "language_loss": 0.80976182, "learning_rate": 3.574340075056408e-06, "loss": 0.83187735, "num_input_tokens_seen": 76041210, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.953125, "step": 3520, "time_per_iteration": 2.6520583629608154 }, { "auxiliary_loss_clip": 0.01158671, "auxiliary_loss_mlp": 0.01047415, "balance_loss_clip": 1.03059435, "balance_loss_mlp": 1.050843, "epoch": 0.21169397264392004, "flos": 26067520091520.0, "grad_norm": 1.6075967801073006, "language_loss": 0.75818586, "learning_rate": 3.5741070303229776e-06, "loss": 0.78024673, "num_input_tokens_seen": 76062685, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8984375, "step": 3521, "time_per_iteration": 2.662994146347046 }, { "auxiliary_loss_clip": 0.01189836, "auxiliary_loss_mlp": 0.01039291, "balance_loss_clip": 1.02344823, "balance_loss_mlp": 1.05120087, "epoch": 0.211754095896588, "flos": 23108265181440.0, "grad_norm": 1.9932925332498073, "language_loss": 0.7556603, "learning_rate": 3.5738739294137154e-06, "loss": 0.7779516, "num_input_tokens_seen": 76082300, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.9375, "step": 3522, "time_per_iteration": 2.651608943939209 }, { "auxiliary_loss_clip": 0.01142506, "auxiliary_loss_mlp": 0.01046824, "balance_loss_clip": 1.02925229, "balance_loss_mlp": 1.04908538, "epoch": 0.21181421914925597, "flos": 27922341571200.0, "grad_norm": 2.782044640052631, "language_loss": 0.70147133, "learning_rate": 3.573640772336942e-06, "loss": 0.72336459, "num_input_tokens_seen": 76101135, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.93359375, "step": 3523, "time_per_iteration": 2.551236867904663 }, { "auxiliary_loss_clip": 0.01153777, "auxiliary_loss_mlp": 0.01046167, "balance_loss_clip": 1.03007329, "balance_loss_mlp": 1.05228066, "epoch": 0.21187434240192393, "flos": 17128636369920.0, "grad_norm": 2.5180799198818575, "language_loss": 0.76991475, "learning_rate": 3.573407559100977e-06, "loss": 0.79191422, "num_input_tokens_seen": 76119320, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.92578125, "step": 3524, "time_per_iteration": 2.5525083541870117 }, { "auxiliary_loss_clip": 0.01161793, "auxiliary_loss_mlp": 0.0104245, "balance_loss_clip": 1.02589178, "balance_loss_mlp": 1.04776633, "epoch": 0.21193446565459192, "flos": 22347318533760.0, "grad_norm": 1.752108866537625, "language_loss": 0.81125051, "learning_rate": 3.573174289714143e-06, "loss": 0.8332929, "num_input_tokens_seen": 76137445, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9609375, "step": 3525, "time_per_iteration": 2.545161724090576 }, { "auxiliary_loss_clip": 0.01187341, "auxiliary_loss_mlp": 0.01037741, "balance_loss_clip": 1.0214448, "balance_loss_mlp": 1.05019212, "epoch": 0.2119945889072599, "flos": 27199316707200.0, "grad_norm": 1.8546186226782209, "language_loss": 0.75204325, "learning_rate": 3.572940964184766e-06, "loss": 0.77429408, "num_input_tokens_seen": 76159500, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.91796875, "step": 3526, "time_per_iteration": 2.689213514328003 }, { "auxiliary_loss_clip": 0.01146773, "auxiliary_loss_mlp": 0.01041473, "balance_loss_clip": 1.02447367, "balance_loss_mlp": 1.05121732, "epoch": 0.21205471215992786, "flos": 20991869884800.0, "grad_norm": 2.541462647972894, "language_loss": 0.76861447, "learning_rate": 3.572707582521172e-06, "loss": 0.79049695, "num_input_tokens_seen": 76177990, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9609375, "step": 3527, "time_per_iteration": 2.59883975982666 }, { "auxiliary_loss_clip": 0.01144505, "auxiliary_loss_mlp": 0.01045069, "balance_loss_clip": 1.02668667, "balance_loss_mlp": 1.05150306, "epoch": 0.21211483541259582, "flos": 20777663128320.0, "grad_norm": 2.5769985435780227, "language_loss": 0.7756443, "learning_rate": 3.5724741447316894e-06, "loss": 0.79754007, "num_input_tokens_seen": 76197125, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9296875, "step": 3528, "time_per_iteration": 2.576510190963745 }, { "auxiliary_loss_clip": 0.01151736, "auxiliary_loss_mlp": 0.01043998, "balance_loss_clip": 1.02736807, "balance_loss_mlp": 1.05014563, "epoch": 0.21217495866526379, "flos": 18989994124800.0, "grad_norm": 2.161287917888553, "language_loss": 0.81709599, "learning_rate": 3.57224065082465e-06, "loss": 0.83905339, "num_input_tokens_seen": 76216215, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.92578125, "step": 3529, "time_per_iteration": 2.6297953128814697 }, { "auxiliary_loss_clip": 0.01164887, "auxiliary_loss_mlp": 0.01296277, "balance_loss_clip": 1.03134394, "balance_loss_mlp": 1.05137873, "epoch": 0.21223508191793175, "flos": 20667309569280.0, "grad_norm": 1.957317493496767, "language_loss": 0.76672339, "learning_rate": 3.572007100808386e-06, "loss": 0.79133505, "num_input_tokens_seen": 76237010, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.953125, "step": 3530, "time_per_iteration": 2.6823301315307617 }, { "auxiliary_loss_clip": 0.01142855, "auxiliary_loss_mlp": 0.01041217, "balance_loss_clip": 1.02552891, "balance_loss_mlp": 1.051651, "epoch": 0.21229520517059972, "flos": 21616464504960.0, "grad_norm": 2.1557651081010554, "language_loss": 0.82843781, "learning_rate": 3.5717734946912323e-06, "loss": 0.85027856, "num_input_tokens_seen": 76255965, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.9140625, "step": 3531, "time_per_iteration": 2.5770697593688965 }, { "auxiliary_loss_clip": 0.01165302, "auxiliary_loss_mlp": 0.01040676, "balance_loss_clip": 1.02343869, "balance_loss_mlp": 1.05292988, "epoch": 0.2123553284232677, "flos": 13991049411840.0, "grad_norm": 2.5325956247482186, "language_loss": 0.73160869, "learning_rate": 3.5715398324815248e-06, "loss": 0.75366849, "num_input_tokens_seen": 76272150, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.94140625, "step": 3532, "time_per_iteration": 2.5099246501922607 }, { "auxiliary_loss_clip": 0.0117041, "auxiliary_loss_mlp": 0.01043688, "balance_loss_clip": 1.02592611, "balance_loss_mlp": 1.05103827, "epoch": 0.21241545167593567, "flos": 18296774570880.0, "grad_norm": 2.04890816654684, "language_loss": 0.73502553, "learning_rate": 3.5713061141876038e-06, "loss": 0.7571665, "num_input_tokens_seen": 76291425, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.92578125, "step": 3533, "time_per_iteration": 2.532270908355713 }, { "auxiliary_loss_clip": 0.0115673, "auxiliary_loss_mlp": 0.01044464, "balance_loss_clip": 1.02763164, "balance_loss_mlp": 1.05280101, "epoch": 0.21247557492860364, "flos": 34713121265280.0, "grad_norm": 2.0697972720496693, "language_loss": 0.70900035, "learning_rate": 3.57107233981781e-06, "loss": 0.73101234, "num_input_tokens_seen": 76313975, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.953125, "step": 3534, "time_per_iteration": 2.6467669010162354 }, { "auxiliary_loss_clip": 0.01166856, "auxiliary_loss_mlp": 0.01039262, "balance_loss_clip": 1.02246571, "balance_loss_mlp": 1.05483007, "epoch": 0.2125356981812716, "flos": 22053820504320.0, "grad_norm": 1.968529097277046, "language_loss": 0.71450168, "learning_rate": 3.570838509380485e-06, "loss": 0.73656285, "num_input_tokens_seen": 76330955, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9375, "step": 3535, "time_per_iteration": 2.56870436668396 }, { "auxiliary_loss_clip": 0.01151356, "auxiliary_loss_mlp": 0.01046205, "balance_loss_clip": 1.02980137, "balance_loss_mlp": 1.05175018, "epoch": 0.21259582143393957, "flos": 28548336821760.0, "grad_norm": 2.6381082455322677, "language_loss": 0.70441771, "learning_rate": 3.5706046228839744e-06, "loss": 0.72639328, "num_input_tokens_seen": 76352680, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.90625, "step": 3536, "time_per_iteration": 2.6005420684814453 }, { "auxiliary_loss_clip": 0.01145537, "auxiliary_loss_mlp": 0.0104677, "balance_loss_clip": 1.02936554, "balance_loss_mlp": 1.05076718, "epoch": 0.21265594468660753, "flos": 20120892900480.0, "grad_norm": 1.9357256310975108, "language_loss": 0.88005078, "learning_rate": 3.5703706803366245e-06, "loss": 0.90197384, "num_input_tokens_seen": 76370750, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9453125, "step": 3537, "time_per_iteration": 2.542285442352295 }, { "auxiliary_loss_clip": 0.01151045, "auxiliary_loss_mlp": 0.0129105, "balance_loss_clip": 1.0272212, "balance_loss_mlp": 1.05034709, "epoch": 0.21271606793927553, "flos": 23076161400960.0, "grad_norm": 1.780376912165828, "language_loss": 0.80295992, "learning_rate": 3.5701366817467852e-06, "loss": 0.8273809, "num_input_tokens_seen": 76390610, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.91796875, "step": 3538, "time_per_iteration": 2.570164203643799 }, { "auxiliary_loss_clip": 0.01186875, "auxiliary_loss_mlp": 0.0104337, "balance_loss_clip": 1.02759838, "balance_loss_mlp": 1.05094433, "epoch": 0.2127761911919435, "flos": 26388201738240.0, "grad_norm": 1.7090948694314774, "language_loss": 0.87002313, "learning_rate": 3.569902627122807e-06, "loss": 0.89232558, "num_input_tokens_seen": 76408860, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.91015625, "step": 3539, "time_per_iteration": 2.7488532066345215 }, { "auxiliary_loss_clip": 0.01166066, "auxiliary_loss_mlp": 0.01048533, "balance_loss_clip": 1.03192699, "balance_loss_mlp": 1.05458498, "epoch": 0.21283631444461146, "flos": 20228265630720.0, "grad_norm": 2.357481108755682, "language_loss": 0.58088374, "learning_rate": 3.5696685164730413e-06, "loss": 0.60302973, "num_input_tokens_seen": 76424980, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9375, "step": 3540, "time_per_iteration": 2.508659839630127 }, { "auxiliary_loss_clip": 0.01146094, "auxiliary_loss_mlp": 0.01042318, "balance_loss_clip": 1.02499723, "balance_loss_mlp": 1.05299461, "epoch": 0.21289643769727942, "flos": 13516992691200.0, "grad_norm": 1.8976019616067197, "language_loss": 0.75350314, "learning_rate": 3.569434349805844e-06, "loss": 0.77538729, "num_input_tokens_seen": 76443135, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9296875, "step": 3541, "time_per_iteration": 2.535142183303833 }, { "auxiliary_loss_clip": 0.01171849, "auxiliary_loss_mlp": 0.01046271, "balance_loss_clip": 1.02930772, "balance_loss_mlp": 1.05387866, "epoch": 0.2129565609499474, "flos": 24827021942400.0, "grad_norm": 1.8091733597048616, "language_loss": 0.69227302, "learning_rate": 3.569200127129572e-06, "loss": 0.71445417, "num_input_tokens_seen": 76462470, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.90625, "step": 3542, "time_per_iteration": 2.5985476970672607 }, { "auxiliary_loss_clip": 0.01141386, "auxiliary_loss_mlp": 0.01038866, "balance_loss_clip": 1.02358365, "balance_loss_mlp": 1.05149341, "epoch": 0.21301668420261535, "flos": 23659242877440.0, "grad_norm": 2.050293805236545, "language_loss": 0.75742531, "learning_rate": 3.568965848452584e-06, "loss": 0.77922785, "num_input_tokens_seen": 76481995, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8984375, "step": 3543, "time_per_iteration": 2.683893918991089 }, { "auxiliary_loss_clip": 0.01163859, "auxiliary_loss_mlp": 0.0103747, "balance_loss_clip": 1.02107882, "balance_loss_mlp": 1.05565214, "epoch": 0.21307680745528332, "flos": 16362805472640.0, "grad_norm": 1.8333863819930576, "language_loss": 0.66157794, "learning_rate": 3.568731513783241e-06, "loss": 0.68359125, "num_input_tokens_seen": 76500245, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.90234375, "step": 3544, "time_per_iteration": 2.527986526489258 }, { "auxiliary_loss_clip": 0.01163092, "auxiliary_loss_mlp": 0.01042379, "balance_loss_clip": 1.026739, "balance_loss_mlp": 1.05270183, "epoch": 0.2131369307079513, "flos": 19099054794240.0, "grad_norm": 1.785048441897546, "language_loss": 0.70793796, "learning_rate": 3.568497123129905e-06, "loss": 0.72999269, "num_input_tokens_seen": 76519535, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.921875, "step": 3545, "time_per_iteration": 2.601785659790039 }, { "auxiliary_loss_clip": 0.01167361, "auxiliary_loss_mlp": 0.01044538, "balance_loss_clip": 1.02844477, "balance_loss_mlp": 1.05464363, "epoch": 0.21319705396061928, "flos": 30372275583360.0, "grad_norm": 1.7186918490683605, "language_loss": 0.71798992, "learning_rate": 3.568262676500942e-06, "loss": 0.74010885, "num_input_tokens_seen": 76542065, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.9453125, "step": 3546, "time_per_iteration": 2.5797500610351562 }, { "auxiliary_loss_clip": 0.01184328, "auxiliary_loss_mlp": 0.0104503, "balance_loss_clip": 1.02867424, "balance_loss_mlp": 1.0562048, "epoch": 0.21325717721328724, "flos": 21756192410880.0, "grad_norm": 1.8747890966616225, "language_loss": 0.80519336, "learning_rate": 3.568028173904717e-06, "loss": 0.82748693, "num_input_tokens_seen": 76560540, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.921875, "step": 3547, "time_per_iteration": 3.972477674484253 }, { "auxiliary_loss_clip": 0.01151555, "auxiliary_loss_mlp": 0.01043678, "balance_loss_clip": 1.02622533, "balance_loss_mlp": 1.05483031, "epoch": 0.2133173004659552, "flos": 28730870760960.0, "grad_norm": 2.5744068426249695, "language_loss": 0.7434777, "learning_rate": 3.567793615349601e-06, "loss": 0.76543003, "num_input_tokens_seen": 76581760, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.96875, "step": 3548, "time_per_iteration": 2.662759304046631 }, { "auxiliary_loss_clip": 0.01178574, "auxiliary_loss_mlp": 0.0104953, "balance_loss_clip": 1.03127944, "balance_loss_mlp": 1.05510235, "epoch": 0.21337742371862317, "flos": 16837077674880.0, "grad_norm": 5.809505450259361, "language_loss": 0.74248743, "learning_rate": 3.567559000843963e-06, "loss": 0.76476848, "num_input_tokens_seen": 76599940, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9609375, "step": 3549, "time_per_iteration": 2.5101606845855713 }, { "auxiliary_loss_clip": 0.01177719, "auxiliary_loss_mlp": 0.01046355, "balance_loss_clip": 1.02998734, "balance_loss_mlp": 1.0575974, "epoch": 0.21343754697129114, "flos": 24424930120320.0, "grad_norm": 1.7266955496101886, "language_loss": 0.80769193, "learning_rate": 3.567324330396177e-06, "loss": 0.82993269, "num_input_tokens_seen": 76619580, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.93359375, "step": 3550, "time_per_iteration": 5.486439228057861 }, { "auxiliary_loss_clip": 0.01148451, "auxiliary_loss_mlp": 0.0104234, "balance_loss_clip": 1.02622247, "balance_loss_mlp": 1.05853724, "epoch": 0.21349767022395913, "flos": 19277817805440.0, "grad_norm": 1.7647343197035434, "language_loss": 0.88227797, "learning_rate": 3.5670896040146173e-06, "loss": 0.90418589, "num_input_tokens_seen": 76638195, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8984375, "step": 3551, "time_per_iteration": 2.5581514835357666 }, { "auxiliary_loss_clip": 0.01147974, "auxiliary_loss_mlp": 0.01046206, "balance_loss_clip": 1.02958786, "balance_loss_mlp": 1.05637717, "epoch": 0.2135577934766271, "flos": 17347547808000.0, "grad_norm": 2.138512343166947, "language_loss": 0.83760297, "learning_rate": 3.5668548217076605e-06, "loss": 0.85954475, "num_input_tokens_seen": 76656695, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9140625, "step": 3552, "time_per_iteration": 2.5839200019836426 }, { "auxiliary_loss_clip": 0.01153856, "auxiliary_loss_mlp": 0.01045129, "balance_loss_clip": 1.02822483, "balance_loss_mlp": 1.05404651, "epoch": 0.21361791672929506, "flos": 24057204635520.0, "grad_norm": 1.6924402741615348, "language_loss": 0.76192486, "learning_rate": 3.5666199834836855e-06, "loss": 0.78391469, "num_input_tokens_seen": 76677430, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90625, "step": 3553, "time_per_iteration": 4.091699838638306 }, { "auxiliary_loss_clip": 0.01157784, "auxiliary_loss_mlp": 0.01040263, "balance_loss_clip": 1.02515948, "balance_loss_mlp": 1.0585041, "epoch": 0.21367803998196302, "flos": 22162306556160.0, "grad_norm": 1.9026967278211773, "language_loss": 0.7255193, "learning_rate": 3.5663850893510734e-06, "loss": 0.74749976, "num_input_tokens_seen": 76697615, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.90234375, "step": 3554, "time_per_iteration": 2.6129140853881836 }, { "auxiliary_loss_clip": 0.01183092, "auxiliary_loss_mlp": 0.01284246, "balance_loss_clip": 1.02057314, "balance_loss_mlp": 1.05594611, "epoch": 0.213738163234631, "flos": 20886867452160.0, "grad_norm": 1.9629582238609316, "language_loss": 0.67767012, "learning_rate": 3.566150139318206e-06, "loss": 0.70234358, "num_input_tokens_seen": 76715685, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.9140625, "step": 3555, "time_per_iteration": 2.535804271697998 }, { "auxiliary_loss_clip": 0.01176772, "auxiliary_loss_mlp": 0.01036755, "balance_loss_clip": 1.01967239, "balance_loss_mlp": 1.05614305, "epoch": 0.21379828648729896, "flos": 28403114135040.0, "grad_norm": 2.912670789830239, "language_loss": 0.64714617, "learning_rate": 3.56591513339347e-06, "loss": 0.66928136, "num_input_tokens_seen": 76735405, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.93359375, "step": 3556, "time_per_iteration": 2.655137300491333 }, { "auxiliary_loss_clip": 0.01158766, "auxiliary_loss_mlp": 0.01042277, "balance_loss_clip": 1.02521825, "balance_loss_mlp": 1.05780649, "epoch": 0.21385840973996692, "flos": 25479662106240.0, "grad_norm": 1.6851516188787796, "language_loss": 0.72520173, "learning_rate": 3.56568007158525e-06, "loss": 0.74721217, "num_input_tokens_seen": 76754395, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.921875, "step": 3557, "time_per_iteration": 2.576693058013916 }, { "auxiliary_loss_clip": 0.01168822, "auxiliary_loss_mlp": 0.01290094, "balance_loss_clip": 1.02371573, "balance_loss_mlp": 1.05485237, "epoch": 0.2139185329926349, "flos": 28074280101120.0, "grad_norm": 1.8190303713579214, "language_loss": 0.67467713, "learning_rate": 3.565444953901935e-06, "loss": 0.69926625, "num_input_tokens_seen": 76777210, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.9609375, "step": 3558, "time_per_iteration": 2.7231733798980713 }, { "auxiliary_loss_clip": 0.01148686, "auxiliary_loss_mlp": 0.0103994, "balance_loss_clip": 1.02321482, "balance_loss_mlp": 1.05090475, "epoch": 0.21397865624530288, "flos": 19608698914560.0, "grad_norm": 1.7163310289009763, "language_loss": 0.80011904, "learning_rate": 3.5652097803519173e-06, "loss": 0.82200527, "num_input_tokens_seen": 76795830, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9765625, "step": 3559, "time_per_iteration": 2.542680025100708 }, { "auxiliary_loss_clip": 0.01146873, "auxiliary_loss_mlp": 0.01042361, "balance_loss_clip": 1.02632177, "balance_loss_mlp": 1.05511582, "epoch": 0.21403877949797084, "flos": 24681476033280.0, "grad_norm": 1.8177310646302625, "language_loss": 0.67469919, "learning_rate": 3.5649745509435887e-06, "loss": 0.69659156, "num_input_tokens_seen": 76814700, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.91796875, "step": 3560, "time_per_iteration": 2.706165075302124 }, { "auxiliary_loss_clip": 0.01160819, "auxiliary_loss_mlp": 0.01039211, "balance_loss_clip": 1.02206886, "balance_loss_mlp": 1.05747485, "epoch": 0.2140989027506388, "flos": 19861150677120.0, "grad_norm": 1.897789092034677, "language_loss": 0.73109323, "learning_rate": 3.564739265685344e-06, "loss": 0.75309354, "num_input_tokens_seen": 76833400, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.94140625, "step": 3561, "time_per_iteration": 2.528470754623413 }, { "auxiliary_loss_clip": 0.01170088, "auxiliary_loss_mlp": 0.01044622, "balance_loss_clip": 1.02758658, "balance_loss_mlp": 1.0581255, "epoch": 0.21415902600330677, "flos": 19135324552320.0, "grad_norm": 2.11330584467672, "language_loss": 0.77304041, "learning_rate": 3.56450392458558e-06, "loss": 0.79518747, "num_input_tokens_seen": 76850645, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9375, "step": 3562, "time_per_iteration": 2.619950771331787 }, { "auxiliary_loss_clip": 0.01147182, "auxiliary_loss_mlp": 0.01036423, "balance_loss_clip": 1.01992464, "balance_loss_mlp": 1.05685532, "epoch": 0.21421914925597474, "flos": 22272624201600.0, "grad_norm": 1.5946860152694866, "language_loss": 0.84209955, "learning_rate": 3.564268527652695e-06, "loss": 0.86393559, "num_input_tokens_seen": 76870135, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.90234375, "step": 3563, "time_per_iteration": 2.5780117511749268 }, { "auxiliary_loss_clip": 0.01149955, "auxiliary_loss_mlp": 0.01031991, "balance_loss_clip": 1.0163027, "balance_loss_mlp": 1.05780768, "epoch": 0.2142792725086427, "flos": 33875109987840.0, "grad_norm": 1.4426039171678737, "language_loss": 0.76565152, "learning_rate": 3.5640330748950902e-06, "loss": 0.78747094, "num_input_tokens_seen": 76893905, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.921875, "step": 3564, "time_per_iteration": 2.656932830810547 }, { "auxiliary_loss_clip": 0.01164307, "auxiliary_loss_mlp": 0.01042475, "balance_loss_clip": 1.02611983, "balance_loss_mlp": 1.05507505, "epoch": 0.2143393957613107, "flos": 19860216923520.0, "grad_norm": 1.9954423330159499, "language_loss": 0.88398689, "learning_rate": 3.5637975663211677e-06, "loss": 0.90605474, "num_input_tokens_seen": 76914205, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9140625, "step": 3565, "time_per_iteration": 2.54457950592041 }, { "auxiliary_loss_clip": 0.01099108, "auxiliary_loss_mlp": 0.01016668, "balance_loss_clip": 1.01456976, "balance_loss_mlp": 1.04919207, "epoch": 0.21439951901397866, "flos": 68530093090560.0, "grad_norm": 0.8379518476139898, "language_loss": 0.52238894, "learning_rate": 3.563562001939333e-06, "loss": 0.54354668, "num_input_tokens_seen": 76975650, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.31835938, "step": 3566, "time_per_iteration": 3.148101806640625 }, { "auxiliary_loss_clip": 0.01153855, "auxiliary_loss_mlp": 0.01037079, "balance_loss_clip": 1.02182031, "balance_loss_mlp": 1.05540216, "epoch": 0.21445964226664663, "flos": 19682998197120.0, "grad_norm": 1.9103968715202957, "language_loss": 0.67431152, "learning_rate": 3.563326381757993e-06, "loss": 0.69622082, "num_input_tokens_seen": 76992615, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.89453125, "step": 3567, "time_per_iteration": 2.5372672080993652 }, { "auxiliary_loss_clip": 0.01161796, "auxiliary_loss_mlp": 0.01036273, "balance_loss_clip": 1.02120543, "balance_loss_mlp": 1.05631006, "epoch": 0.2145197655193146, "flos": 31107259676160.0, "grad_norm": 1.5363669657324903, "language_loss": 0.74060655, "learning_rate": 3.563090705785555e-06, "loss": 0.76258725, "num_input_tokens_seen": 77017005, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.875, "step": 3568, "time_per_iteration": 2.6437134742736816 }, { "auxiliary_loss_clip": 0.01155429, "auxiliary_loss_mlp": 0.01049394, "balance_loss_clip": 1.03327727, "balance_loss_mlp": 1.05544019, "epoch": 0.21457988877198256, "flos": 20120785159680.0, "grad_norm": 1.6336187099599992, "language_loss": 0.77394444, "learning_rate": 3.5628549740304307e-06, "loss": 0.79599267, "num_input_tokens_seen": 77034990, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.91015625, "step": 3569, "time_per_iteration": 2.537336826324463 }, { "auxiliary_loss_clip": 0.01164608, "auxiliary_loss_mlp": 0.01042753, "balance_loss_clip": 1.02447808, "balance_loss_mlp": 1.05857944, "epoch": 0.21464001202465052, "flos": 18588045957120.0, "grad_norm": 2.279963507884132, "language_loss": 0.69784582, "learning_rate": 3.562619186501032e-06, "loss": 0.71991938, "num_input_tokens_seen": 77052610, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.96875, "step": 3570, "time_per_iteration": 2.5412111282348633 }, { "auxiliary_loss_clip": 0.01149263, "auxiliary_loss_mlp": 0.01049415, "balance_loss_clip": 1.03307736, "balance_loss_mlp": 1.05477977, "epoch": 0.21470013527731852, "flos": 21835160461440.0, "grad_norm": 2.265536762198093, "language_loss": 0.78249747, "learning_rate": 3.562383343205774e-06, "loss": 0.80448425, "num_input_tokens_seen": 77072475, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9453125, "step": 3571, "time_per_iteration": 2.639082908630371 }, { "auxiliary_loss_clip": 0.01157862, "auxiliary_loss_mlp": 0.01042422, "balance_loss_clip": 1.02461171, "balance_loss_mlp": 1.05618167, "epoch": 0.21476025852998648, "flos": 17603195880960.0, "grad_norm": 2.6586816572465333, "language_loss": 0.82191658, "learning_rate": 3.5621474441530744e-06, "loss": 0.8439194, "num_input_tokens_seen": 77089930, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.92578125, "step": 3572, "time_per_iteration": 2.529472589492798 }, { "auxiliary_loss_clip": 0.01167584, "auxiliary_loss_mlp": 0.01040983, "balance_loss_clip": 1.02361405, "balance_loss_mlp": 1.05337715, "epoch": 0.21482038178265445, "flos": 24828135264000.0, "grad_norm": 2.4844261598239634, "language_loss": 0.64936018, "learning_rate": 3.5619114893513508e-06, "loss": 0.67144585, "num_input_tokens_seen": 77108970, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9609375, "step": 3573, "time_per_iteration": 2.6501657962799072 }, { "auxiliary_loss_clip": 0.01139415, "auxiliary_loss_mlp": 0.01044149, "balance_loss_clip": 1.02861571, "balance_loss_mlp": 1.051139, "epoch": 0.2148805050353224, "flos": 23258228463360.0, "grad_norm": 2.421609653572642, "language_loss": 0.75187767, "learning_rate": 3.5616754788090235e-06, "loss": 0.77371335, "num_input_tokens_seen": 77126045, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8828125, "step": 3574, "time_per_iteration": 2.571732759475708 }, { "auxiliary_loss_clip": 0.01151473, "auxiliary_loss_mlp": 0.01037743, "balance_loss_clip": 1.02040994, "balance_loss_mlp": 1.05211759, "epoch": 0.21494062828799038, "flos": 21321098968320.0, "grad_norm": 2.4709786410515595, "language_loss": 0.72171605, "learning_rate": 3.561439412534515e-06, "loss": 0.74360824, "num_input_tokens_seen": 77144600, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.90625, "step": 3575, "time_per_iteration": 2.709324836730957 }, { "auxiliary_loss_clip": 0.01163723, "auxiliary_loss_mlp": 0.01037687, "balance_loss_clip": 1.02172494, "balance_loss_mlp": 1.05524361, "epoch": 0.21500075154065834, "flos": 18843334894080.0, "grad_norm": 1.8781476894566689, "language_loss": 0.676566, "learning_rate": 3.561203290536251e-06, "loss": 0.69858015, "num_input_tokens_seen": 77162965, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.90625, "step": 3576, "time_per_iteration": 2.5520224571228027 }, { "auxiliary_loss_clip": 0.01157915, "auxiliary_loss_mlp": 0.01042446, "balance_loss_clip": 1.02541137, "balance_loss_mlp": 1.05355024, "epoch": 0.2150608747933263, "flos": 18441997257600.0, "grad_norm": 2.2350155903124245, "language_loss": 0.887707, "learning_rate": 3.560967112822657e-06, "loss": 0.90971059, "num_input_tokens_seen": 77179960, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.953125, "step": 3577, "time_per_iteration": 2.6219019889831543 }, { "auxiliary_loss_clip": 0.01083756, "auxiliary_loss_mlp": 0.01020381, "balance_loss_clip": 1.01845026, "balance_loss_mlp": 1.03473663, "epoch": 0.2151209980459943, "flos": 66598242894720.0, "grad_norm": 0.8081844364346801, "language_loss": 0.56167328, "learning_rate": 3.5607308794021623e-06, "loss": 0.58271468, "num_input_tokens_seen": 77239500, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.30859375, "step": 3578, "time_per_iteration": 3.1691482067108154 }, { "auxiliary_loss_clip": 0.0115193, "auxiliary_loss_mlp": 0.01291498, "balance_loss_clip": 1.02815056, "balance_loss_mlp": 1.05238414, "epoch": 0.21518112129866226, "flos": 21575885114880.0, "grad_norm": 1.6729346505073148, "language_loss": 0.88440001, "learning_rate": 3.5604945902831975e-06, "loss": 0.90883428, "num_input_tokens_seen": 77254680, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.90625, "step": 3579, "time_per_iteration": 2.632749319076538 }, { "auxiliary_loss_clip": 0.01152416, "auxiliary_loss_mlp": 0.01047621, "balance_loss_clip": 1.03088391, "balance_loss_mlp": 1.05087948, "epoch": 0.21524124455133023, "flos": 20047635112320.0, "grad_norm": 2.326816525227587, "language_loss": 0.77710605, "learning_rate": 3.560258245474194e-06, "loss": 0.79910642, "num_input_tokens_seen": 77274060, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.92578125, "step": 3580, "time_per_iteration": 2.629179000854492 }, { "auxiliary_loss_clip": 0.01149542, "auxiliary_loss_mlp": 0.0104113, "balance_loss_clip": 1.02526343, "balance_loss_mlp": 1.05151904, "epoch": 0.2153013678039982, "flos": 23951807153280.0, "grad_norm": 1.84759202223899, "language_loss": 0.72707784, "learning_rate": 3.5600218449835876e-06, "loss": 0.74898458, "num_input_tokens_seen": 77293255, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.890625, "step": 3581, "time_per_iteration": 2.638810873031616 }, { "auxiliary_loss_clip": 0.01150539, "auxiliary_loss_mlp": 0.01044762, "balance_loss_clip": 1.02695203, "balance_loss_mlp": 1.05153096, "epoch": 0.21536149105666616, "flos": 20594841880320.0, "grad_norm": 2.9046091838671786, "language_loss": 0.71094608, "learning_rate": 3.559785388819815e-06, "loss": 0.73289907, "num_input_tokens_seen": 77312390, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.90234375, "step": 3582, "time_per_iteration": 2.722728729248047 }, { "auxiliary_loss_clip": 0.01140208, "auxiliary_loss_mlp": 0.01038592, "balance_loss_clip": 1.02152157, "balance_loss_mlp": 1.04870701, "epoch": 0.21542161430933413, "flos": 12860042895360.0, "grad_norm": 3.5374456129183613, "language_loss": 0.83942026, "learning_rate": 3.5595488769913134e-06, "loss": 0.86120826, "num_input_tokens_seen": 77330985, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9140625, "step": 3583, "time_per_iteration": 2.5456931591033936 }, { "auxiliary_loss_clip": 0.01162613, "auxiliary_loss_mlp": 0.01045391, "balance_loss_clip": 1.0294528, "balance_loss_mlp": 1.05111146, "epoch": 0.21548173756200212, "flos": 26103933504000.0, "grad_norm": 2.4724860707038743, "language_loss": 0.83050752, "learning_rate": 3.5593123095065245e-06, "loss": 0.85258758, "num_input_tokens_seen": 77350770, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.9375, "step": 3584, "time_per_iteration": 2.6799750328063965 }, { "auxiliary_loss_clip": 0.01166933, "auxiliary_loss_mlp": 0.01291454, "balance_loss_clip": 1.02654314, "balance_loss_mlp": 1.05087614, "epoch": 0.21554186081467008, "flos": 22163779013760.0, "grad_norm": 1.9565775850529559, "language_loss": 0.89979506, "learning_rate": 3.55907568637389e-06, "loss": 0.92437893, "num_input_tokens_seen": 77370510, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.89453125, "step": 3585, "time_per_iteration": 2.631756067276001 }, { "auxiliary_loss_clip": 0.01150026, "auxiliary_loss_mlp": 0.01044805, "balance_loss_clip": 1.02837753, "balance_loss_mlp": 1.05065179, "epoch": 0.21560198406733805, "flos": 22966741595520.0, "grad_norm": 1.854674218685763, "language_loss": 0.7458238, "learning_rate": 3.558839007601855e-06, "loss": 0.76777208, "num_input_tokens_seen": 77390645, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.90625, "step": 3586, "time_per_iteration": 2.658682107925415 }, { "auxiliary_loss_clip": 0.01178164, "auxiliary_loss_mlp": 0.01038676, "balance_loss_clip": 1.02334535, "balance_loss_mlp": 1.0492878, "epoch": 0.215662107320006, "flos": 22784064001920.0, "grad_norm": 2.399518302047891, "language_loss": 0.83263719, "learning_rate": 3.558602273198865e-06, "loss": 0.85480553, "num_input_tokens_seen": 77409655, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.9296875, "step": 3587, "time_per_iteration": 2.6432809829711914 }, { "auxiliary_loss_clip": 0.01176956, "auxiliary_loss_mlp": 0.01283363, "balance_loss_clip": 1.01873517, "balance_loss_mlp": 1.04986048, "epoch": 0.21572223057267398, "flos": 30883859038080.0, "grad_norm": 1.7590966534841532, "language_loss": 0.75621897, "learning_rate": 3.558365483173369e-06, "loss": 0.78082222, "num_input_tokens_seen": 77430560, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9140625, "step": 3588, "time_per_iteration": 2.7521719932556152 }, { "auxiliary_loss_clip": 0.01149458, "auxiliary_loss_mlp": 0.01037276, "balance_loss_clip": 1.02096784, "balance_loss_mlp": 1.04873991, "epoch": 0.21578235382534194, "flos": 26910487445760.0, "grad_norm": 1.7371970571635695, "language_loss": 0.80673313, "learning_rate": 3.5581286375338183e-06, "loss": 0.82860053, "num_input_tokens_seen": 77455000, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.9140625, "step": 3589, "time_per_iteration": 4.134890079498291 }, { "auxiliary_loss_clip": 0.01143155, "auxiliary_loss_mlp": 0.01038843, "balance_loss_clip": 1.02354884, "balance_loss_mlp": 1.04995561, "epoch": 0.2158424770780099, "flos": 24425720219520.0, "grad_norm": 1.902012545774328, "language_loss": 0.72632915, "learning_rate": 3.557891736288664e-06, "loss": 0.74814916, "num_input_tokens_seen": 77475075, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.9296875, "step": 3590, "time_per_iteration": 2.673510789871216 }, { "auxiliary_loss_clip": 0.01163581, "auxiliary_loss_mlp": 0.01044494, "balance_loss_clip": 1.0265528, "balance_loss_mlp": 1.04971266, "epoch": 0.2159026003306779, "flos": 23949975559680.0, "grad_norm": 2.0331690302449936, "language_loss": 0.84218866, "learning_rate": 3.5576547794463608e-06, "loss": 0.86426938, "num_input_tokens_seen": 77495945, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9609375, "step": 3591, "time_per_iteration": 2.735326051712036 }, { "auxiliary_loss_clip": 0.01174107, "auxiliary_loss_mlp": 0.01046057, "balance_loss_clip": 1.02668583, "balance_loss_mlp": 1.05071175, "epoch": 0.21596272358334587, "flos": 30040963511040.0, "grad_norm": 2.0267358012879404, "language_loss": 0.68874806, "learning_rate": 3.557417767015366e-06, "loss": 0.71094978, "num_input_tokens_seen": 77517140, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.96484375, "step": 3592, "time_per_iteration": 4.228362083435059 }, { "auxiliary_loss_clip": 0.01172983, "auxiliary_loss_mlp": 0.01044126, "balance_loss_clip": 1.02635169, "balance_loss_mlp": 1.05160403, "epoch": 0.21602284683601383, "flos": 20376217751040.0, "grad_norm": 4.3927934145426955, "language_loss": 0.83785057, "learning_rate": 3.557180699004137e-06, "loss": 0.86002159, "num_input_tokens_seen": 77536085, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9453125, "step": 3593, "time_per_iteration": 2.6581108570098877 }, { "auxiliary_loss_clip": 0.01169959, "auxiliary_loss_mlp": 0.01048762, "balance_loss_clip": 1.03138101, "balance_loss_mlp": 1.04823947, "epoch": 0.2160829700886818, "flos": 20777339905920.0, "grad_norm": 2.1978248630084503, "language_loss": 0.74629188, "learning_rate": 3.556943575421134e-06, "loss": 0.76847911, "num_input_tokens_seen": 77553675, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9453125, "step": 3594, "time_per_iteration": 5.120208978652954 }, { "auxiliary_loss_clip": 0.01149385, "auxiliary_loss_mlp": 0.01040907, "balance_loss_clip": 1.02356255, "balance_loss_mlp": 1.04914188, "epoch": 0.21614309334134976, "flos": 22309755886080.0, "grad_norm": 1.5674646649798196, "language_loss": 0.80506057, "learning_rate": 3.55670639627482e-06, "loss": 0.82696354, "num_input_tokens_seen": 77573360, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9140625, "step": 3595, "time_per_iteration": 2.653109312057495 }, { "auxiliary_loss_clip": 0.01173762, "auxiliary_loss_mlp": 0.01036241, "balance_loss_clip": 1.01880121, "balance_loss_mlp": 1.05028605, "epoch": 0.21620321659401773, "flos": 19609524927360.0, "grad_norm": 2.104092855472003, "language_loss": 0.78889322, "learning_rate": 3.556469161573659e-06, "loss": 0.81099331, "num_input_tokens_seen": 77591865, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.96875, "step": 3596, "time_per_iteration": 2.5990822315216064 }, { "auxiliary_loss_clip": 0.01186661, "auxiliary_loss_mlp": 0.01045064, "balance_loss_clip": 1.02920949, "balance_loss_mlp": 1.05167365, "epoch": 0.2162633398466857, "flos": 18844555956480.0, "grad_norm": 1.775978164427154, "language_loss": 0.83151257, "learning_rate": 3.556231871326118e-06, "loss": 0.85382986, "num_input_tokens_seen": 77611600, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.90234375, "step": 3597, "time_per_iteration": 2.6758620738983154 }, { "auxiliary_loss_clip": 0.01163063, "auxiliary_loss_mlp": 0.01292651, "balance_loss_clip": 1.0268836, "balance_loss_mlp": 1.05112195, "epoch": 0.21632346309935369, "flos": 18768820129920.0, "grad_norm": 1.6529506660284117, "language_loss": 0.80624735, "learning_rate": 3.5559945255406635e-06, "loss": 0.83080447, "num_input_tokens_seen": 77630665, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9375, "step": 3598, "time_per_iteration": 2.562300682067871 }, { "auxiliary_loss_clip": 0.01167305, "auxiliary_loss_mlp": 0.01048621, "balance_loss_clip": 1.02996492, "balance_loss_mlp": 1.05295944, "epoch": 0.21638358635202165, "flos": 26324173745280.0, "grad_norm": 1.6818936089241932, "language_loss": 0.81549346, "learning_rate": 3.555757124225767e-06, "loss": 0.8376528, "num_input_tokens_seen": 77650835, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.9609375, "step": 3599, "time_per_iteration": 2.679786205291748 }, { "auxiliary_loss_clip": 0.01169987, "auxiliary_loss_mlp": 0.01286738, "balance_loss_clip": 1.02302265, "balance_loss_mlp": 1.05183208, "epoch": 0.21644370960468962, "flos": 20740854666240.0, "grad_norm": 2.116046597906128, "language_loss": 0.76424277, "learning_rate": 3.5555196673899015e-06, "loss": 0.78881001, "num_input_tokens_seen": 77669000, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.9140625, "step": 3600, "time_per_iteration": 2.572113513946533 }, { "auxiliary_loss_clip": 0.01170816, "auxiliary_loss_mlp": 0.01043038, "balance_loss_clip": 1.02819061, "balance_loss_mlp": 1.05104768, "epoch": 0.21650383285735758, "flos": 23952238116480.0, "grad_norm": 2.0284547252596714, "language_loss": 0.80016845, "learning_rate": 3.5552821550415396e-06, "loss": 0.82230699, "num_input_tokens_seen": 77688745, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.92578125, "step": 3601, "time_per_iteration": 2.672311544418335 }, { "auxiliary_loss_clip": 0.01148167, "auxiliary_loss_mlp": 0.01047029, "balance_loss_clip": 1.03063798, "balance_loss_mlp": 1.05375218, "epoch": 0.21656395611002555, "flos": 23696087253120.0, "grad_norm": 2.029845499909796, "language_loss": 0.82993066, "learning_rate": 3.5550445871891585e-06, "loss": 0.85188258, "num_input_tokens_seen": 77708445, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9453125, "step": 3602, "time_per_iteration": 2.6040408611297607 }, { "auxiliary_loss_clip": 0.01142825, "auxiliary_loss_mlp": 0.01048381, "balance_loss_clip": 1.03094065, "balance_loss_mlp": 1.04984939, "epoch": 0.2166240793626935, "flos": 20666052593280.0, "grad_norm": 1.8837791651141418, "language_loss": 0.74454415, "learning_rate": 3.554806963841236e-06, "loss": 0.76645619, "num_input_tokens_seen": 77728465, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9296875, "step": 3603, "time_per_iteration": 2.798666000366211 }, { "auxiliary_loss_clip": 0.01160469, "auxiliary_loss_mlp": 0.01037935, "balance_loss_clip": 1.02221715, "balance_loss_mlp": 1.05137849, "epoch": 0.2166842026153615, "flos": 21580410228480.0, "grad_norm": 1.6868215910814224, "language_loss": 0.7386682, "learning_rate": 3.554569285006253e-06, "loss": 0.76065224, "num_input_tokens_seen": 77746735, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.91015625, "step": 3604, "time_per_iteration": 2.5957610607147217 }, { "auxiliary_loss_clip": 0.01145849, "auxiliary_loss_mlp": 0.01038459, "balance_loss_clip": 1.0225091, "balance_loss_mlp": 1.04797745, "epoch": 0.21674432586802947, "flos": 25629948610560.0, "grad_norm": 1.7287960453541695, "language_loss": 0.79665911, "learning_rate": 3.5543315506926903e-06, "loss": 0.81850219, "num_input_tokens_seen": 77768105, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.890625, "step": 3605, "time_per_iteration": 2.669607162475586 }, { "auxiliary_loss_clip": 0.01073007, "auxiliary_loss_mlp": 0.01016809, "balance_loss_clip": 1.01438951, "balance_loss_mlp": 1.04114854, "epoch": 0.21680444912069743, "flos": 56417783616000.0, "grad_norm": 0.7646175440824137, "language_loss": 0.58319271, "learning_rate": 3.5540937609090334e-06, "loss": 0.60409081, "num_input_tokens_seen": 77833750, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.31835938, "step": 3606, "time_per_iteration": 3.231679916381836 }, { "auxiliary_loss_clip": 0.01144551, "auxiliary_loss_mlp": 0.01039895, "balance_loss_clip": 1.02300298, "balance_loss_mlp": 1.05003858, "epoch": 0.2168645723733654, "flos": 23878944414720.0, "grad_norm": 2.2349076606581857, "language_loss": 0.72777808, "learning_rate": 3.5538559156637675e-06, "loss": 0.74962258, "num_input_tokens_seen": 77853780, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9453125, "step": 3607, "time_per_iteration": 2.595766305923462 }, { "auxiliary_loss_clip": 0.01166413, "auxiliary_loss_mlp": 0.01041529, "balance_loss_clip": 1.0242914, "balance_loss_mlp": 1.05246544, "epoch": 0.21692469562603336, "flos": 16946174257920.0, "grad_norm": 2.040449185224991, "language_loss": 0.7669369, "learning_rate": 3.5536180149653805e-06, "loss": 0.78901637, "num_input_tokens_seen": 77872575, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9609375, "step": 3608, "time_per_iteration": 2.5763394832611084 }, { "auxiliary_loss_clip": 0.01190051, "auxiliary_loss_mlp": 0.01043724, "balance_loss_clip": 1.02783346, "balance_loss_mlp": 1.05010772, "epoch": 0.21698481887870133, "flos": 25119047514240.0, "grad_norm": 1.9238720082234788, "language_loss": 0.74222028, "learning_rate": 3.5533800588223636e-06, "loss": 0.76455808, "num_input_tokens_seen": 77892700, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.94921875, "step": 3609, "time_per_iteration": 2.634897232055664 }, { "auxiliary_loss_clip": 0.01164867, "auxiliary_loss_mlp": 0.01048674, "balance_loss_clip": 1.03136528, "balance_loss_mlp": 1.05204248, "epoch": 0.2170449421313693, "flos": 17894682748800.0, "grad_norm": 1.8896247920260758, "language_loss": 0.88450497, "learning_rate": 3.553142047243208e-06, "loss": 0.90664041, "num_input_tokens_seen": 77911060, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9453125, "step": 3610, "time_per_iteration": 2.5700082778930664 }, { "auxiliary_loss_clip": 0.01146633, "auxiliary_loss_mlp": 0.01036558, "balance_loss_clip": 1.02041674, "balance_loss_mlp": 1.0536381, "epoch": 0.2171050653840373, "flos": 22638446265600.0, "grad_norm": 1.8028160899142138, "language_loss": 0.77415442, "learning_rate": 3.5529039802364077e-06, "loss": 0.79598635, "num_input_tokens_seen": 77929930, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.9296875, "step": 3611, "time_per_iteration": 2.518432855606079 }, { "auxiliary_loss_clip": 0.01185079, "auxiliary_loss_mlp": 0.01040596, "balance_loss_clip": 1.02454424, "balance_loss_mlp": 1.05032945, "epoch": 0.21716518863670525, "flos": 19499997381120.0, "grad_norm": 2.232078632622447, "language_loss": 0.63502318, "learning_rate": 3.552665857810459e-06, "loss": 0.65727991, "num_input_tokens_seen": 77949060, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8984375, "step": 3612, "time_per_iteration": 2.6334009170532227 }, { "auxiliary_loss_clip": 0.01179127, "auxiliary_loss_mlp": 0.01039312, "balance_loss_clip": 1.02292061, "balance_loss_mlp": 1.05037498, "epoch": 0.21722531188937322, "flos": 19792022952960.0, "grad_norm": 2.0978144501445617, "language_loss": 0.75940895, "learning_rate": 3.5524276799738594e-06, "loss": 0.78159332, "num_input_tokens_seen": 77967920, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.921875, "step": 3613, "time_per_iteration": 2.5312936305999756 }, { "auxiliary_loss_clip": 0.01166335, "auxiliary_loss_mlp": 0.0104779, "balance_loss_clip": 1.03073144, "balance_loss_mlp": 1.04834938, "epoch": 0.21728543514204118, "flos": 13334386924800.0, "grad_norm": 1.9594243798259787, "language_loss": 0.71172953, "learning_rate": 3.5521894467351095e-06, "loss": 0.73387074, "num_input_tokens_seen": 77985330, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.91015625, "step": 3614, "time_per_iteration": 2.564032793045044 }, { "auxiliary_loss_clip": 0.01150847, "auxiliary_loss_mlp": 0.01044432, "balance_loss_clip": 1.02755213, "balance_loss_mlp": 1.05050182, "epoch": 0.21734555839470915, "flos": 15231870783360.0, "grad_norm": 1.8702058433814301, "language_loss": 0.73079115, "learning_rate": 3.551951158102711e-06, "loss": 0.75274396, "num_input_tokens_seen": 78003105, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9140625, "step": 3615, "time_per_iteration": 2.4916656017303467 }, { "auxiliary_loss_clip": 0.01156529, "auxiliary_loss_mlp": 0.01044747, "balance_loss_clip": 1.02691293, "balance_loss_mlp": 1.05055857, "epoch": 0.2174056816473771, "flos": 19973982274560.0, "grad_norm": 3.1331304033756657, "language_loss": 0.89472294, "learning_rate": 3.5517128140851682e-06, "loss": 0.91673571, "num_input_tokens_seen": 78019655, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.96875, "step": 3616, "time_per_iteration": 2.601546049118042 }, { "auxiliary_loss_clip": 0.01181852, "auxiliary_loss_mlp": 0.01038527, "balance_loss_clip": 1.0219928, "balance_loss_mlp": 1.0521667, "epoch": 0.21746580490004508, "flos": 16687293960960.0, "grad_norm": 9.767831114862561, "language_loss": 0.80717528, "learning_rate": 3.551474414690986e-06, "loss": 0.82937914, "num_input_tokens_seen": 78036025, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.93359375, "step": 3617, "time_per_iteration": 2.5699918270111084 }, { "auxiliary_loss_clip": 0.01162596, "auxiliary_loss_mlp": 0.0104127, "balance_loss_clip": 1.02526021, "balance_loss_mlp": 1.05184102, "epoch": 0.21752592815271307, "flos": 25772298209280.0, "grad_norm": 1.9709958021292073, "language_loss": 0.75522882, "learning_rate": 3.551235959928673e-06, "loss": 0.77726746, "num_input_tokens_seen": 78055645, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.9296875, "step": 3618, "time_per_iteration": 2.6169893741607666 }, { "auxiliary_loss_clip": 0.01155414, "auxiliary_loss_mlp": 0.01046724, "balance_loss_clip": 1.02943885, "balance_loss_mlp": 1.05290639, "epoch": 0.21758605140538104, "flos": 11254692349440.0, "grad_norm": 1.9294097894221123, "language_loss": 0.69120586, "learning_rate": 3.550997449806739e-06, "loss": 0.71322727, "num_input_tokens_seen": 78071660, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.93359375, "step": 3619, "time_per_iteration": 2.5286502838134766 }, { "auxiliary_loss_clip": 0.01156951, "auxiliary_loss_mlp": 0.01039184, "balance_loss_clip": 1.02223289, "balance_loss_mlp": 1.05287766, "epoch": 0.217646174658049, "flos": 19242625455360.0, "grad_norm": 1.6241055832180498, "language_loss": 0.78123116, "learning_rate": 3.5507588843336953e-06, "loss": 0.80319256, "num_input_tokens_seen": 78091265, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.953125, "step": 3620, "time_per_iteration": 2.6587064266204834 }, { "auxiliary_loss_clip": 0.0114232, "auxiliary_loss_mlp": 0.0103771, "balance_loss_clip": 1.02159309, "balance_loss_mlp": 1.05237043, "epoch": 0.21770629791071697, "flos": 21945083057280.0, "grad_norm": 1.8333441411265188, "language_loss": 0.79562736, "learning_rate": 3.5505202635180556e-06, "loss": 0.81742764, "num_input_tokens_seen": 78110095, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8984375, "step": 3621, "time_per_iteration": 2.6279995441436768 }, { "auxiliary_loss_clip": 0.0115097, "auxiliary_loss_mlp": 0.01037282, "balance_loss_clip": 1.02139688, "balance_loss_mlp": 1.05094612, "epoch": 0.21776642116338493, "flos": 24936764970240.0, "grad_norm": 1.6706643827577037, "language_loss": 0.87651879, "learning_rate": 3.550281587368337e-06, "loss": 0.89840126, "num_input_tokens_seen": 78129475, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.90625, "step": 3622, "time_per_iteration": 2.6207776069641113 }, { "auxiliary_loss_clip": 0.01156469, "auxiliary_loss_mlp": 0.01035473, "balance_loss_clip": 1.01732302, "balance_loss_mlp": 1.05339241, "epoch": 0.2178265444160529, "flos": 17821317219840.0, "grad_norm": 1.828574872578019, "language_loss": 0.76793575, "learning_rate": 3.550042855893056e-06, "loss": 0.78985518, "num_input_tokens_seen": 78146880, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9375, "step": 3623, "time_per_iteration": 2.553936719894409 }, { "auxiliary_loss_clip": 0.01174308, "auxiliary_loss_mlp": 0.01047064, "balance_loss_clip": 1.02915835, "balance_loss_mlp": 1.05325699, "epoch": 0.2178866676687209, "flos": 17712902995200.0, "grad_norm": 1.9926317017049207, "language_loss": 0.84657836, "learning_rate": 3.549804069100733e-06, "loss": 0.86879206, "num_input_tokens_seen": 78165065, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.94140625, "step": 3624, "time_per_iteration": 2.6536831855773926 }, { "auxiliary_loss_clip": 0.01157178, "auxiliary_loss_mlp": 0.0104822, "balance_loss_clip": 1.03145957, "balance_loss_mlp": 1.05443096, "epoch": 0.21794679092138886, "flos": 16945851035520.0, "grad_norm": 5.227277065119703, "language_loss": 0.77244842, "learning_rate": 3.5495652269998887e-06, "loss": 0.79450244, "num_input_tokens_seen": 78180005, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9375, "step": 3625, "time_per_iteration": 2.556858539581299 }, { "auxiliary_loss_clip": 0.01094676, "auxiliary_loss_mlp": 0.01005965, "balance_loss_clip": 1.00385523, "balance_loss_mlp": 1.05118287, "epoch": 0.21800691417405682, "flos": 63718566566400.0, "grad_norm": 0.8294616348486874, "language_loss": 0.60725945, "learning_rate": 3.549326329599048e-06, "loss": 0.62826592, "num_input_tokens_seen": 78245350, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.34375, "step": 3626, "time_per_iteration": 3.2847890853881836 }, { "auxiliary_loss_clip": 0.01164696, "auxiliary_loss_mlp": 0.0128849, "balance_loss_clip": 1.02282012, "balance_loss_mlp": 1.05226409, "epoch": 0.21806703742672479, "flos": 21616392677760.0, "grad_norm": 2.136718738489577, "language_loss": 0.90613949, "learning_rate": 3.549087376906736e-06, "loss": 0.93067133, "num_input_tokens_seen": 78264165, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9453125, "step": 3627, "time_per_iteration": 2.6160178184509277 }, { "auxiliary_loss_clip": 0.01178847, "auxiliary_loss_mlp": 0.01035396, "balance_loss_clip": 1.01868296, "balance_loss_mlp": 1.05030727, "epoch": 0.21812716067939275, "flos": 19354882435200.0, "grad_norm": 1.7564234818136033, "language_loss": 0.7325868, "learning_rate": 3.5488483689314795e-06, "loss": 0.75472921, "num_input_tokens_seen": 78283745, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.91796875, "step": 3628, "time_per_iteration": 2.664426803588867 }, { "auxiliary_loss_clip": 0.01143198, "auxiliary_loss_mlp": 0.01037714, "balance_loss_clip": 1.02110863, "balance_loss_mlp": 1.05118454, "epoch": 0.21818728393206072, "flos": 23548063305600.0, "grad_norm": 1.97377198895444, "language_loss": 0.76595235, "learning_rate": 3.5486093056818094e-06, "loss": 0.78776145, "num_input_tokens_seen": 78302900, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.921875, "step": 3629, "time_per_iteration": 2.5985445976257324 }, { "auxiliary_loss_clip": 0.01153569, "auxiliary_loss_mlp": 0.01037686, "balance_loss_clip": 1.02187848, "balance_loss_mlp": 1.05258369, "epoch": 0.21824740718472868, "flos": 30225652266240.0, "grad_norm": 1.5968774979355806, "language_loss": 0.71151888, "learning_rate": 3.5483701871662566e-06, "loss": 0.7334314, "num_input_tokens_seen": 78326470, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.91796875, "step": 3630, "time_per_iteration": 2.641005039215088 }, { "auxiliary_loss_clip": 0.01185655, "auxiliary_loss_mlp": 0.01037204, "balance_loss_clip": 1.02212405, "balance_loss_mlp": 1.05230761, "epoch": 0.21830753043739667, "flos": 26134672567680.0, "grad_norm": 1.9482967231377564, "language_loss": 0.76169705, "learning_rate": 3.5481310133933546e-06, "loss": 0.78392565, "num_input_tokens_seen": 78345810, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8828125, "step": 3631, "time_per_iteration": 4.1350061893463135 }, { "auxiliary_loss_clip": 0.01143915, "auxiliary_loss_mlp": 0.01037088, "balance_loss_clip": 1.0211854, "balance_loss_mlp": 1.05139804, "epoch": 0.21836765369006464, "flos": 21720712752000.0, "grad_norm": 2.2510411261896364, "language_loss": 0.75369346, "learning_rate": 3.547891784371639e-06, "loss": 0.77550352, "num_input_tokens_seen": 78364085, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.92578125, "step": 3632, "time_per_iteration": 2.5074565410614014 }, { "auxiliary_loss_clip": 0.01143451, "auxiliary_loss_mlp": 0.0103704, "balance_loss_clip": 1.02178121, "balance_loss_mlp": 1.05193233, "epoch": 0.2184277769427326, "flos": 19937604775680.0, "grad_norm": 2.1495757373272446, "language_loss": 0.83731943, "learning_rate": 3.547652500109647e-06, "loss": 0.85912436, "num_input_tokens_seen": 78381385, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.9140625, "step": 3633, "time_per_iteration": 5.45142126083374 }, { "auxiliary_loss_clip": 0.01170666, "auxiliary_loss_mlp": 0.01049229, "balance_loss_clip": 1.0335176, "balance_loss_mlp": 1.05266619, "epoch": 0.21848790019540057, "flos": 20340235301760.0, "grad_norm": 5.364166178383746, "language_loss": 0.81707728, "learning_rate": 3.547413160615919e-06, "loss": 0.83927619, "num_input_tokens_seen": 78400500, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.90625, "step": 3634, "time_per_iteration": 2.5646770000457764 }, { "auxiliary_loss_clip": 0.01155941, "auxiliary_loss_mlp": 0.01041172, "balance_loss_clip": 1.02414918, "balance_loss_mlp": 1.05251527, "epoch": 0.21854802344806853, "flos": 15450818135040.0, "grad_norm": 1.950045671341726, "language_loss": 0.75405157, "learning_rate": 3.5471737658989956e-06, "loss": 0.77602267, "num_input_tokens_seen": 78418340, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9453125, "step": 3635, "time_per_iteration": 2.5090198516845703 }, { "auxiliary_loss_clip": 0.01159089, "auxiliary_loss_mlp": 0.01053187, "balance_loss_clip": 1.03734398, "balance_loss_mlp": 1.05139685, "epoch": 0.2186081467007365, "flos": 16320717711360.0, "grad_norm": 2.228849018835669, "language_loss": 0.87928611, "learning_rate": 3.54693431596742e-06, "loss": 0.90140879, "num_input_tokens_seen": 78434375, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8984375, "step": 3636, "time_per_iteration": 4.150970935821533 }, { "auxiliary_loss_clip": 0.01170818, "auxiliary_loss_mlp": 0.01300648, "balance_loss_clip": 1.03581786, "balance_loss_mlp": 1.05091524, "epoch": 0.2186682699534045, "flos": 21689255416320.0, "grad_norm": 2.220948390819255, "language_loss": 0.75543237, "learning_rate": 3.5466948108297377e-06, "loss": 0.78014708, "num_input_tokens_seen": 78451735, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9296875, "step": 3637, "time_per_iteration": 2.584892511367798 }, { "auxiliary_loss_clip": 0.01167341, "auxiliary_loss_mlp": 0.01045109, "balance_loss_clip": 1.02691782, "balance_loss_mlp": 1.05311894, "epoch": 0.21872839320607246, "flos": 17739260599680.0, "grad_norm": 2.782758140168699, "language_loss": 0.89968961, "learning_rate": 3.5464552504944965e-06, "loss": 0.92181408, "num_input_tokens_seen": 78462730, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.96484375, "step": 3638, "time_per_iteration": 2.4598450660705566 }, { "auxiliary_loss_clip": 0.01153465, "auxiliary_loss_mlp": 0.01048461, "balance_loss_clip": 1.03119946, "balance_loss_mlp": 1.0515492, "epoch": 0.21878851645874042, "flos": 18652289431680.0, "grad_norm": 2.486856413852162, "language_loss": 0.89456522, "learning_rate": 3.546215634970245e-06, "loss": 0.91658449, "num_input_tokens_seen": 78476300, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9296875, "step": 3639, "time_per_iteration": 2.5224063396453857 }, { "auxiliary_loss_clip": 0.01163661, "auxiliary_loss_mlp": 0.01294023, "balance_loss_clip": 1.02993488, "balance_loss_mlp": 1.05216503, "epoch": 0.2188486397114084, "flos": 25557301353600.0, "grad_norm": 1.7304569750683743, "language_loss": 0.79176491, "learning_rate": 3.545975964265535e-06, "loss": 0.81634176, "num_input_tokens_seen": 78496135, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.93359375, "step": 3640, "time_per_iteration": 2.555224657058716 }, { "auxiliary_loss_clip": 0.011587, "auxiliary_loss_mlp": 0.01053747, "balance_loss_clip": 1.0347805, "balance_loss_mlp": 1.056602, "epoch": 0.21890876296407635, "flos": 17892061056000.0, "grad_norm": 2.5078177146180067, "language_loss": 0.72118872, "learning_rate": 3.5457362383889196e-06, "loss": 0.74331319, "num_input_tokens_seen": 78513855, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.9296875, "step": 3641, "time_per_iteration": 2.5994229316711426 }, { "auxiliary_loss_clip": 0.01164899, "auxiliary_loss_mlp": 0.01044053, "balance_loss_clip": 1.027632, "balance_loss_mlp": 1.05551577, "epoch": 0.21896888621674432, "flos": 17749100926080.0, "grad_norm": 2.3336130148065313, "language_loss": 0.81051552, "learning_rate": 3.5454964573489542e-06, "loss": 0.83260512, "num_input_tokens_seen": 78531740, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9140625, "step": 3642, "time_per_iteration": 2.5030581951141357 }, { "auxiliary_loss_clip": 0.01159365, "auxiliary_loss_mlp": 0.01042355, "balance_loss_clip": 1.0238297, "balance_loss_mlp": 1.0548538, "epoch": 0.21902900946941228, "flos": 23076161400960.0, "grad_norm": 1.8508429752318232, "language_loss": 0.71597934, "learning_rate": 3.545256621154196e-06, "loss": 0.73799658, "num_input_tokens_seen": 78549600, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.95703125, "step": 3643, "time_per_iteration": 2.5452158451080322 }, { "auxiliary_loss_clip": 0.01146821, "auxiliary_loss_mlp": 0.01044476, "balance_loss_clip": 1.02616572, "balance_loss_mlp": 1.05269325, "epoch": 0.21908913272208028, "flos": 48178545004800.0, "grad_norm": 1.97540044449737, "language_loss": 0.68104851, "learning_rate": 3.545016729813203e-06, "loss": 0.7029615, "num_input_tokens_seen": 78573350, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.94140625, "step": 3644, "time_per_iteration": 2.7451090812683105 }, { "auxiliary_loss_clip": 0.01150947, "auxiliary_loss_mlp": 0.01044178, "balance_loss_clip": 1.02686882, "balance_loss_mlp": 1.05338979, "epoch": 0.21914925597474824, "flos": 22236749493120.0, "grad_norm": 3.7037137322179636, "language_loss": 0.77918863, "learning_rate": 3.544776783334538e-06, "loss": 0.80113989, "num_input_tokens_seen": 78591005, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.97265625, "step": 3645, "time_per_iteration": 2.5587337017059326 }, { "auxiliary_loss_clip": 0.01156029, "auxiliary_loss_mlp": 0.01052578, "balance_loss_clip": 1.03596067, "balance_loss_mlp": 1.05516255, "epoch": 0.2192093792274162, "flos": 22125605834880.0, "grad_norm": 1.7993773056965265, "language_loss": 0.82517779, "learning_rate": 3.5445367817267623e-06, "loss": 0.84726393, "num_input_tokens_seen": 78610645, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.91796875, "step": 3646, "time_per_iteration": 2.5573325157165527 }, { "auxiliary_loss_clip": 0.0115454, "auxiliary_loss_mlp": 0.01041038, "balance_loss_clip": 1.02439654, "balance_loss_mlp": 1.05370855, "epoch": 0.21926950248008417, "flos": 15669442264320.0, "grad_norm": 2.3374336222240952, "language_loss": 0.82982641, "learning_rate": 3.5442967249984427e-06, "loss": 0.8517822, "num_input_tokens_seen": 78628340, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.91796875, "step": 3647, "time_per_iteration": 2.5482723712921143 }, { "auxiliary_loss_clip": 0.01142355, "auxiliary_loss_mlp": 0.01050793, "balance_loss_clip": 1.03386509, "balance_loss_mlp": 1.05018044, "epoch": 0.21932962573275214, "flos": 30262496641920.0, "grad_norm": 1.6420643217267208, "language_loss": 0.72437412, "learning_rate": 3.544056613158145e-06, "loss": 0.74630558, "num_input_tokens_seen": 78649355, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.921875, "step": 3648, "time_per_iteration": 2.5862622261047363 }, { "auxiliary_loss_clip": 0.01174476, "auxiliary_loss_mlp": 0.01049325, "balance_loss_clip": 1.03060925, "balance_loss_mlp": 1.05036414, "epoch": 0.2193897489854201, "flos": 10780132838400.0, "grad_norm": 2.4966636354767546, "language_loss": 0.73843181, "learning_rate": 3.5438164462144383e-06, "loss": 0.76066983, "num_input_tokens_seen": 78664915, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.96875, "step": 3649, "time_per_iteration": 2.568157434463501 }, { "auxiliary_loss_clip": 0.01148824, "auxiliary_loss_mlp": 0.0104734, "balance_loss_clip": 1.03126502, "balance_loss_mlp": 1.05009675, "epoch": 0.21944987223808807, "flos": 19133313390720.0, "grad_norm": 2.4364250011292303, "language_loss": 0.86370534, "learning_rate": 3.5435762241758944e-06, "loss": 0.88566697, "num_input_tokens_seen": 78681475, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8984375, "step": 3650, "time_per_iteration": 2.5093259811401367 }, { "auxiliary_loss_clip": 0.01162679, "auxiliary_loss_mlp": 0.01047472, "balance_loss_clip": 1.02929235, "balance_loss_mlp": 1.05058682, "epoch": 0.21950999549075606, "flos": 22711093522560.0, "grad_norm": 2.2143528036947817, "language_loss": 0.83484495, "learning_rate": 3.5433359470510855e-06, "loss": 0.85694647, "num_input_tokens_seen": 78702300, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.94140625, "step": 3651, "time_per_iteration": 2.5924782752990723 }, { "auxiliary_loss_clip": 0.01150612, "auxiliary_loss_mlp": 0.01044821, "balance_loss_clip": 1.0278635, "balance_loss_mlp": 1.04891074, "epoch": 0.21957011874342403, "flos": 10561329141120.0, "grad_norm": 1.9413492019335117, "language_loss": 0.74262846, "learning_rate": 3.5430956148485864e-06, "loss": 0.76458275, "num_input_tokens_seen": 78720230, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.92578125, "step": 3652, "time_per_iteration": 2.5274064540863037 }, { "auxiliary_loss_clip": 0.01118162, "auxiliary_loss_mlp": 0.01013678, "balance_loss_clip": 1.0116396, "balance_loss_mlp": 1.04577243, "epoch": 0.219630241996092, "flos": 65747913252480.0, "grad_norm": 0.7333512613455689, "language_loss": 0.51589137, "learning_rate": 3.542855227576974e-06, "loss": 0.53720975, "num_input_tokens_seen": 78780200, "router_z_loss_clip": 0.02038574, "router_z_loss_mlp": 0.36328125, "step": 3653, "time_per_iteration": 3.195456027984619 }, { "auxiliary_loss_clip": 0.01164326, "auxiliary_loss_mlp": 0.01050052, "balance_loss_clip": 1.03257596, "balance_loss_mlp": 1.05337751, "epoch": 0.21969036524875996, "flos": 23696518216320.0, "grad_norm": 2.059381329202288, "language_loss": 0.74986577, "learning_rate": 3.5426147852448276e-06, "loss": 0.77200949, "num_input_tokens_seen": 78800575, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9296875, "step": 3654, "time_per_iteration": 2.6420397758483887 }, { "auxiliary_loss_clip": 0.01158596, "auxiliary_loss_mlp": 0.01044951, "balance_loss_clip": 1.02734375, "balance_loss_mlp": 1.05317402, "epoch": 0.21975048850142792, "flos": 19640910435840.0, "grad_norm": 2.3249898883900464, "language_loss": 0.72625935, "learning_rate": 3.542374287860727e-06, "loss": 0.74829483, "num_input_tokens_seen": 78819585, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9609375, "step": 3655, "time_per_iteration": 2.5339548587799072 }, { "auxiliary_loss_clip": 0.011648, "auxiliary_loss_mlp": 0.0104687, "balance_loss_clip": 1.02998984, "balance_loss_mlp": 1.05268598, "epoch": 0.21981061175409589, "flos": 22448550038400.0, "grad_norm": 1.8149484415868002, "language_loss": 0.80610675, "learning_rate": 3.542133735433256e-06, "loss": 0.82822347, "num_input_tokens_seen": 78837330, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.94140625, "step": 3656, "time_per_iteration": 2.6085996627807617 }, { "auxiliary_loss_clip": 0.01172847, "auxiliary_loss_mlp": 0.01286792, "balance_loss_clip": 1.02251005, "balance_loss_mlp": 1.05341232, "epoch": 0.21987073500676388, "flos": 18151049093760.0, "grad_norm": 2.1038548044295045, "language_loss": 0.84598064, "learning_rate": 3.541893127970999e-06, "loss": 0.87057704, "num_input_tokens_seen": 78854955, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.921875, "step": 3657, "time_per_iteration": 2.506131410598755 }, { "auxiliary_loss_clip": 0.01156811, "auxiliary_loss_mlp": 0.01035259, "balance_loss_clip": 1.01779485, "balance_loss_mlp": 1.05225921, "epoch": 0.21993085825943184, "flos": 25626177682560.0, "grad_norm": 1.7318018825843073, "language_loss": 0.80137086, "learning_rate": 3.541652465482542e-06, "loss": 0.82329154, "num_input_tokens_seen": 78874965, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.953125, "step": 3658, "time_per_iteration": 2.648733139038086 }, { "auxiliary_loss_clip": 0.01102364, "auxiliary_loss_mlp": 0.01002567, "balance_loss_clip": 1.00055265, "balance_loss_mlp": 1.04015136, "epoch": 0.2199909815120998, "flos": 70923217743360.0, "grad_norm": 0.7774020231757274, "language_loss": 0.58237737, "learning_rate": 3.5414117479764744e-06, "loss": 0.60342669, "num_input_tokens_seen": 78937740, "router_z_loss_clip": 0.0201416, "router_z_loss_mlp": 0.3515625, "step": 3659, "time_per_iteration": 3.275035858154297 }, { "auxiliary_loss_clip": 0.01171696, "auxiliary_loss_mlp": 0.01041056, "balance_loss_clip": 1.02384245, "balance_loss_mlp": 1.05192435, "epoch": 0.22005110476476777, "flos": 21543529939200.0, "grad_norm": 2.3265744187931565, "language_loss": 0.74333125, "learning_rate": 3.5411709754613864e-06, "loss": 0.76545876, "num_input_tokens_seen": 78955055, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.92578125, "step": 3660, "time_per_iteration": 2.672760248184204 }, { "auxiliary_loss_clip": 0.01155305, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.01939392, "balance_loss_mlp": 1.05190241, "epoch": 0.22011122801743574, "flos": 22054502862720.0, "grad_norm": 2.058171289617269, "language_loss": 0.80833429, "learning_rate": 3.5409301479458707e-06, "loss": 0.83025706, "num_input_tokens_seen": 78974895, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9453125, "step": 3661, "time_per_iteration": 2.5603890419006348 }, { "auxiliary_loss_clip": 0.01164223, "auxiliary_loss_mlp": 0.01043487, "balance_loss_clip": 1.02669048, "balance_loss_mlp": 1.05172002, "epoch": 0.2201713512701037, "flos": 26687589598080.0, "grad_norm": 1.8001624711870703, "language_loss": 0.73016834, "learning_rate": 3.5406892654385223e-06, "loss": 0.75224543, "num_input_tokens_seen": 78994990, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9453125, "step": 3662, "time_per_iteration": 2.581869125366211 }, { "auxiliary_loss_clip": 0.01164225, "auxiliary_loss_mlp": 0.01051878, "balance_loss_clip": 1.0363214, "balance_loss_mlp": 1.05515671, "epoch": 0.22023147452277167, "flos": 22162198815360.0, "grad_norm": 1.928968181797321, "language_loss": 0.78096926, "learning_rate": 3.540448327947936e-06, "loss": 0.80313027, "num_input_tokens_seen": 79014405, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.9140625, "step": 3663, "time_per_iteration": 2.5658793449401855 }, { "auxiliary_loss_clip": 0.01158598, "auxiliary_loss_mlp": 0.01049219, "balance_loss_clip": 1.031183, "balance_loss_mlp": 1.05482626, "epoch": 0.22029159777543966, "flos": 22523280284160.0, "grad_norm": 1.723365015929797, "language_loss": 0.80141979, "learning_rate": 3.5402073354827123e-06, "loss": 0.82349795, "num_input_tokens_seen": 79032375, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9453125, "step": 3664, "time_per_iteration": 2.5637900829315186 }, { "auxiliary_loss_clip": 0.01169769, "auxiliary_loss_mlp": 0.01042398, "balance_loss_clip": 1.02233481, "balance_loss_mlp": 1.05353332, "epoch": 0.22035172102810763, "flos": 13042469093760.0, "grad_norm": 2.3366733574049707, "language_loss": 0.76788783, "learning_rate": 3.5399662880514497e-06, "loss": 0.7900095, "num_input_tokens_seen": 79049635, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.984375, "step": 3665, "time_per_iteration": 2.5673160552978516 }, { "auxiliary_loss_clip": 0.01165094, "auxiliary_loss_mlp": 0.01050485, "balance_loss_clip": 1.03310394, "balance_loss_mlp": 1.05276811, "epoch": 0.2204118442807756, "flos": 12165817760640.0, "grad_norm": 3.181151439428966, "language_loss": 0.97921491, "learning_rate": 3.5397251856627524e-06, "loss": 1.00137067, "num_input_tokens_seen": 79062890, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.94140625, "step": 3666, "time_per_iteration": 2.5259850025177 }, { "auxiliary_loss_clip": 0.01151257, "auxiliary_loss_mlp": 0.01289638, "balance_loss_clip": 1.02466488, "balance_loss_mlp": 1.04917741, "epoch": 0.22047196753344356, "flos": 40108806673920.0, "grad_norm": 1.7681060713991603, "language_loss": 0.80368721, "learning_rate": 3.5394840283252236e-06, "loss": 0.82809621, "num_input_tokens_seen": 79085495, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.93359375, "step": 3667, "time_per_iteration": 2.7412831783294678 }, { "auxiliary_loss_clip": 0.01159418, "auxiliary_loss_mlp": 0.01050834, "balance_loss_clip": 1.03196335, "balance_loss_mlp": 1.05398226, "epoch": 0.22053209078611152, "flos": 20701137202560.0, "grad_norm": 2.071796906314293, "language_loss": 0.77032894, "learning_rate": 3.53924281604747e-06, "loss": 0.79243147, "num_input_tokens_seen": 79101820, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.9609375, "step": 3668, "time_per_iteration": 2.5363271236419678 }, { "auxiliary_loss_clip": 0.01155659, "auxiliary_loss_mlp": 0.01043643, "balance_loss_clip": 1.02577388, "balance_loss_mlp": 1.05283141, "epoch": 0.2205922140387795, "flos": 24716309247360.0, "grad_norm": 1.8795284608037288, "language_loss": 0.71137607, "learning_rate": 3.5390015488381e-06, "loss": 0.73336911, "num_input_tokens_seen": 79123320, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9375, "step": 3669, "time_per_iteration": 2.705652952194214 }, { "auxiliary_loss_clip": 0.01146052, "auxiliary_loss_mlp": 0.01040408, "balance_loss_clip": 1.0231818, "balance_loss_mlp": 1.05094039, "epoch": 0.22065233729144745, "flos": 23477247642240.0, "grad_norm": 1.6929322035847312, "language_loss": 0.85343856, "learning_rate": 3.5387602267057227e-06, "loss": 0.87530315, "num_input_tokens_seen": 79141615, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.94921875, "step": 3670, "time_per_iteration": 2.539198637008667 }, { "auxiliary_loss_clip": 0.01168948, "auxiliary_loss_mlp": 0.01038878, "balance_loss_clip": 1.02121162, "balance_loss_mlp": 1.05293357, "epoch": 0.22071246054411545, "flos": 35225566646400.0, "grad_norm": 1.960176887069179, "language_loss": 0.77053225, "learning_rate": 3.5385188496589516e-06, "loss": 0.79261059, "num_input_tokens_seen": 79164910, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9765625, "step": 3671, "time_per_iteration": 2.7840301990509033 }, { "auxiliary_loss_clip": 0.01162707, "auxiliary_loss_mlp": 0.01039896, "balance_loss_clip": 1.02242029, "balance_loss_mlp": 1.05081284, "epoch": 0.2207725837967834, "flos": 18150294908160.0, "grad_norm": 2.2729140046730745, "language_loss": 0.80831802, "learning_rate": 3.5382774177064007e-06, "loss": 0.83034408, "num_input_tokens_seen": 79179685, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9375, "step": 3672, "time_per_iteration": 3.9987828731536865 }, { "auxiliary_loss_clip": 0.011567, "auxiliary_loss_mlp": 0.01050123, "balance_loss_clip": 1.03293276, "balance_loss_mlp": 1.05268764, "epoch": 0.22083270704945138, "flos": 20479675898880.0, "grad_norm": 1.915834977635123, "language_loss": 0.73535043, "learning_rate": 3.538035930856685e-06, "loss": 0.75741869, "num_input_tokens_seen": 79196285, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.953125, "step": 3673, "time_per_iteration": 2.6259498596191406 }, { "auxiliary_loss_clip": 0.01178001, "auxiliary_loss_mlp": 0.0103724, "balance_loss_clip": 1.01908457, "balance_loss_mlp": 1.05524802, "epoch": 0.22089283030211934, "flos": 34125801984000.0, "grad_norm": 1.9109537288406715, "language_loss": 0.76328301, "learning_rate": 3.5377943891184234e-06, "loss": 0.78543544, "num_input_tokens_seen": 79216060, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.95703125, "step": 3674, "time_per_iteration": 2.7558634281158447 }, { "auxiliary_loss_clip": 0.01145837, "auxiliary_loss_mlp": 0.01045346, "balance_loss_clip": 1.02611756, "balance_loss_mlp": 1.05176544, "epoch": 0.2209529535547873, "flos": 18077216688000.0, "grad_norm": 2.0444205670863638, "language_loss": 0.73705214, "learning_rate": 3.5375527925002357e-06, "loss": 0.758964, "num_input_tokens_seen": 79235145, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.94140625, "step": 3675, "time_per_iteration": 5.47823166847229 }, { "auxiliary_loss_clip": 0.01147577, "auxiliary_loss_mlp": 0.01043932, "balance_loss_clip": 1.02571678, "balance_loss_mlp": 1.0515337, "epoch": 0.22101307680745527, "flos": 27235335070080.0, "grad_norm": 1.7743196608014506, "language_loss": 0.79921544, "learning_rate": 3.537311141010744e-06, "loss": 0.82113063, "num_input_tokens_seen": 79256960, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9609375, "step": 3676, "time_per_iteration": 2.6080822944641113 }, { "auxiliary_loss_clip": 0.0116084, "auxiliary_loss_mlp": 0.01050678, "balance_loss_clip": 1.03356004, "balance_loss_mlp": 1.04820919, "epoch": 0.22107320006012326, "flos": 16543256423040.0, "grad_norm": 2.098663372378355, "language_loss": 0.75305676, "learning_rate": 3.5370694346585718e-06, "loss": 0.775172, "num_input_tokens_seen": 79274860, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9453125, "step": 3677, "time_per_iteration": 2.56561017036438 }, { "auxiliary_loss_clip": 0.01168733, "auxiliary_loss_mlp": 0.01044966, "balance_loss_clip": 1.02744246, "balance_loss_mlp": 1.04861152, "epoch": 0.22113332331279123, "flos": 22054466949120.0, "grad_norm": 3.7315945927284275, "language_loss": 0.82941288, "learning_rate": 3.5368276734523457e-06, "loss": 0.85154986, "num_input_tokens_seen": 79294005, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.93359375, "step": 3678, "time_per_iteration": 4.16752028465271 }, { "auxiliary_loss_clip": 0.01180601, "auxiliary_loss_mlp": 0.01047296, "balance_loss_clip": 1.02921247, "balance_loss_mlp": 1.05106878, "epoch": 0.2211934465654592, "flos": 26612787525120.0, "grad_norm": 1.780270108062897, "language_loss": 0.88961208, "learning_rate": 3.536585857400693e-06, "loss": 0.91189098, "num_input_tokens_seen": 79314005, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.9296875, "step": 3679, "time_per_iteration": 2.656916618347168 }, { "auxiliary_loss_clip": 0.01156751, "auxiliary_loss_mlp": 0.01047236, "balance_loss_clip": 1.02853167, "balance_loss_mlp": 1.05149508, "epoch": 0.22125356981812716, "flos": 16360363347840.0, "grad_norm": 2.5389526069642128, "language_loss": 0.86069655, "learning_rate": 3.5363439865122436e-06, "loss": 0.88273644, "num_input_tokens_seen": 79331030, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.96484375, "step": 3680, "time_per_iteration": 2.5901741981506348 }, { "auxiliary_loss_clip": 0.01169867, "auxiliary_loss_mlp": 0.01045446, "balance_loss_clip": 1.02754116, "balance_loss_mlp": 1.05124617, "epoch": 0.22131369307079513, "flos": 21651118151040.0, "grad_norm": 1.9466906000540567, "language_loss": 0.81393248, "learning_rate": 3.5361020607956292e-06, "loss": 0.83608568, "num_input_tokens_seen": 79348560, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9140625, "step": 3681, "time_per_iteration": 2.6160104274749756 }, { "auxiliary_loss_clip": 0.01137187, "auxiliary_loss_mlp": 0.01285277, "balance_loss_clip": 1.02099299, "balance_loss_mlp": 1.04773641, "epoch": 0.2213738163234631, "flos": 19609524927360.0, "grad_norm": 2.3343579956789693, "language_loss": 0.79445815, "learning_rate": 3.535860080259484e-06, "loss": 0.81868279, "num_input_tokens_seen": 79367175, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.89453125, "step": 3682, "time_per_iteration": 2.619290590286255 }, { "auxiliary_loss_clip": 0.01161219, "auxiliary_loss_mlp": 0.01044798, "balance_loss_clip": 1.02591562, "balance_loss_mlp": 1.04761267, "epoch": 0.22143393957613106, "flos": 23623404082560.0, "grad_norm": 1.598335159268851, "language_loss": 0.77339762, "learning_rate": 3.5356180449124424e-06, "loss": 0.79545778, "num_input_tokens_seen": 79388435, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.953125, "step": 3683, "time_per_iteration": 2.6154847145080566 }, { "auxiliary_loss_clip": 0.0116247, "auxiliary_loss_mlp": 0.01047244, "balance_loss_clip": 1.02960062, "balance_loss_mlp": 1.04801679, "epoch": 0.22149406282879905, "flos": 26177801823360.0, "grad_norm": 2.448500417377511, "language_loss": 0.72440255, "learning_rate": 3.535375954763143e-06, "loss": 0.74649966, "num_input_tokens_seen": 79407910, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.96484375, "step": 3684, "time_per_iteration": 2.722423791885376 }, { "auxiliary_loss_clip": 0.01192374, "auxiliary_loss_mlp": 0.01045916, "balance_loss_clip": 1.02711654, "balance_loss_mlp": 1.05126238, "epoch": 0.221554186081467, "flos": 14538758970240.0, "grad_norm": 2.4336090920859603, "language_loss": 0.80515498, "learning_rate": 3.535133809820226e-06, "loss": 0.82753789, "num_input_tokens_seen": 79424020, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.9609375, "step": 3685, "time_per_iteration": 2.5760416984558105 }, { "auxiliary_loss_clip": 0.01137889, "auxiliary_loss_mlp": 0.01039387, "balance_loss_clip": 1.02328181, "balance_loss_mlp": 1.04748893, "epoch": 0.22161430933413498, "flos": 22238257864320.0, "grad_norm": 1.4148560283566978, "language_loss": 0.87135917, "learning_rate": 3.5348916100923318e-06, "loss": 0.89313197, "num_input_tokens_seen": 79445605, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.90234375, "step": 3686, "time_per_iteration": 2.6750073432922363 }, { "auxiliary_loss_clip": 0.01151258, "auxiliary_loss_mlp": 0.0104086, "balance_loss_clip": 1.02266884, "balance_loss_mlp": 1.0497694, "epoch": 0.22167443258680294, "flos": 23476529370240.0, "grad_norm": 2.3558947657138614, "language_loss": 0.78201699, "learning_rate": 3.534649355588104e-06, "loss": 0.80393815, "num_input_tokens_seen": 79463850, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.92578125, "step": 3687, "time_per_iteration": 2.5770103931427 }, { "auxiliary_loss_clip": 0.01166485, "auxiliary_loss_mlp": 0.01047977, "balance_loss_clip": 1.02933264, "balance_loss_mlp": 1.04963255, "epoch": 0.2217345558394709, "flos": 23221132692480.0, "grad_norm": 2.1390662053445197, "language_loss": 0.84954947, "learning_rate": 3.534407046316189e-06, "loss": 0.87169409, "num_input_tokens_seen": 79482845, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.98828125, "step": 3688, "time_per_iteration": 2.625852584838867 }, { "auxiliary_loss_clip": 0.01187215, "auxiliary_loss_mlp": 0.01046583, "balance_loss_clip": 1.0273186, "balance_loss_mlp": 1.0522815, "epoch": 0.22179467909213887, "flos": 20011078045440.0, "grad_norm": 2.340979037280112, "language_loss": 0.81003296, "learning_rate": 3.5341646822852324e-06, "loss": 0.83237094, "num_input_tokens_seen": 79501550, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.9921875, "step": 3689, "time_per_iteration": 2.5813233852386475 }, { "auxiliary_loss_clip": 0.01160172, "auxiliary_loss_mlp": 0.01046713, "balance_loss_clip": 1.02902234, "balance_loss_mlp": 1.04882574, "epoch": 0.22185480234480687, "flos": 19683034110720.0, "grad_norm": 2.0687256925493385, "language_loss": 0.70236588, "learning_rate": 3.5339222635038852e-06, "loss": 0.72443479, "num_input_tokens_seen": 79519680, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9375, "step": 3690, "time_per_iteration": 2.5983364582061768 }, { "auxiliary_loss_clip": 0.01155062, "auxiliary_loss_mlp": 0.01287359, "balance_loss_clip": 1.02037811, "balance_loss_mlp": 1.0481416, "epoch": 0.22191492559747483, "flos": 21981316901760.0, "grad_norm": 2.5322035677360386, "language_loss": 0.72530347, "learning_rate": 3.533679789980798e-06, "loss": 0.74972767, "num_input_tokens_seen": 79539000, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.98046875, "step": 3691, "time_per_iteration": 2.5032668113708496 }, { "auxiliary_loss_clip": 0.0115666, "auxiliary_loss_mlp": 0.01045884, "balance_loss_clip": 1.02679896, "balance_loss_mlp": 1.05395019, "epoch": 0.2219750488501428, "flos": 23222066446080.0, "grad_norm": 1.816621149323461, "language_loss": 0.71019077, "learning_rate": 3.5334372617246243e-06, "loss": 0.73221624, "num_input_tokens_seen": 79559695, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.9375, "step": 3692, "time_per_iteration": 2.581634044647217 }, { "auxiliary_loss_clip": 0.0115716, "auxiliary_loss_mlp": 0.01046328, "balance_loss_clip": 1.02838659, "balance_loss_mlp": 1.04903352, "epoch": 0.22203517210281076, "flos": 22453685683200.0, "grad_norm": 1.8329779314278711, "language_loss": 0.87437558, "learning_rate": 3.533194678744019e-06, "loss": 0.89641047, "num_input_tokens_seen": 79579095, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.98828125, "step": 3693, "time_per_iteration": 2.542437791824341 }, { "auxiliary_loss_clip": 0.01148907, "auxiliary_loss_mlp": 0.0104195, "balance_loss_clip": 1.02566624, "balance_loss_mlp": 1.04819274, "epoch": 0.22209529535547873, "flos": 17564555825280.0, "grad_norm": 2.0223307046291965, "language_loss": 0.85445726, "learning_rate": 3.53295204104764e-06, "loss": 0.87636584, "num_input_tokens_seen": 79596430, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.91796875, "step": 3694, "time_per_iteration": 2.6685731410980225 }, { "auxiliary_loss_clip": 0.01176082, "auxiliary_loss_mlp": 0.01043469, "balance_loss_clip": 1.02537298, "balance_loss_mlp": 1.04840207, "epoch": 0.2221554186081467, "flos": 21469015175040.0, "grad_norm": 2.0613945406952143, "language_loss": 0.68990612, "learning_rate": 3.532709348644146e-06, "loss": 0.71210164, "num_input_tokens_seen": 79615825, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.0078125, "step": 3695, "time_per_iteration": 2.581186294555664 }, { "auxiliary_loss_clip": 0.0116019, "auxiliary_loss_mlp": 0.01042053, "balance_loss_clip": 1.0258646, "balance_loss_mlp": 1.04997492, "epoch": 0.22221554186081466, "flos": 27673445255040.0, "grad_norm": 1.7217712424112914, "language_loss": 0.7132448, "learning_rate": 3.532466601542197e-06, "loss": 0.73526722, "num_input_tokens_seen": 79637875, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.91796875, "step": 3696, "time_per_iteration": 2.6796891689300537 }, { "auxiliary_loss_clip": 0.01152411, "auxiliary_loss_mlp": 0.01039544, "balance_loss_clip": 1.02126932, "balance_loss_mlp": 1.0480727, "epoch": 0.22227566511348265, "flos": 25958926298880.0, "grad_norm": 1.7242788734214798, "language_loss": 0.87388188, "learning_rate": 3.532223799750458e-06, "loss": 0.89580142, "num_input_tokens_seen": 79656970, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.95703125, "step": 3697, "time_per_iteration": 2.6192238330841064 }, { "auxiliary_loss_clip": 0.01155228, "auxiliary_loss_mlp": 0.01043022, "balance_loss_clip": 1.02678585, "balance_loss_mlp": 1.04729342, "epoch": 0.22233578836615062, "flos": 39203678833920.0, "grad_norm": 2.3975016007492584, "language_loss": 0.66261095, "learning_rate": 3.5319809432775916e-06, "loss": 0.68459344, "num_input_tokens_seen": 79680275, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8984375, "step": 3698, "time_per_iteration": 2.7846755981445312 }, { "auxiliary_loss_clip": 0.01154409, "auxiliary_loss_mlp": 0.01040586, "balance_loss_clip": 1.02195382, "balance_loss_mlp": 1.04882979, "epoch": 0.22239591161881858, "flos": 36283782251520.0, "grad_norm": 4.257776425779235, "language_loss": 0.82419717, "learning_rate": 3.531738032132267e-06, "loss": 0.84614706, "num_input_tokens_seen": 79701255, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.96484375, "step": 3699, "time_per_iteration": 2.7731049060821533 }, { "auxiliary_loss_clip": 0.01160716, "auxiliary_loss_mlp": 0.01045629, "balance_loss_clip": 1.02771235, "balance_loss_mlp": 1.04898953, "epoch": 0.22245603487148655, "flos": 19719591177600.0, "grad_norm": 2.8754701684416526, "language_loss": 0.79796493, "learning_rate": 3.531495066323152e-06, "loss": 0.82002842, "num_input_tokens_seen": 79721315, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9375, "step": 3700, "time_per_iteration": 2.5656113624572754 }, { "auxiliary_loss_clip": 0.01184713, "auxiliary_loss_mlp": 0.01047066, "balance_loss_clip": 1.02809954, "balance_loss_mlp": 1.0500772, "epoch": 0.2225161581241545, "flos": 46280450615040.0, "grad_norm": 1.93320603431786, "language_loss": 0.71891278, "learning_rate": 3.5312520458589176e-06, "loss": 0.74123055, "num_input_tokens_seen": 79742705, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.984375, "step": 3701, "time_per_iteration": 2.874403953552246 }, { "auxiliary_loss_clip": 0.01149952, "auxiliary_loss_mlp": 0.01041688, "balance_loss_clip": 1.02402091, "balance_loss_mlp": 1.04628325, "epoch": 0.22257628137682248, "flos": 23696194993920.0, "grad_norm": 1.7830169435345218, "language_loss": 0.79695392, "learning_rate": 3.5310089707482366e-06, "loss": 0.81887031, "num_input_tokens_seen": 79763000, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9453125, "step": 3702, "time_per_iteration": 2.5905325412750244 }, { "auxiliary_loss_clip": 0.01143791, "auxiliary_loss_mlp": 0.01042448, "balance_loss_clip": 1.02503133, "balance_loss_mlp": 1.04504704, "epoch": 0.22263640462949044, "flos": 19353984595200.0, "grad_norm": 2.091697357352682, "language_loss": 0.78074908, "learning_rate": 3.5307658409997834e-06, "loss": 0.80261147, "num_input_tokens_seen": 79781335, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8984375, "step": 3703, "time_per_iteration": 2.5838778018951416 }, { "auxiliary_loss_clip": 0.01149812, "auxiliary_loss_mlp": 0.01039295, "balance_loss_clip": 1.02061534, "balance_loss_mlp": 1.04528749, "epoch": 0.22269652788215843, "flos": 20776047016320.0, "grad_norm": 1.9628609424485048, "language_loss": 0.73785198, "learning_rate": 3.530522656622235e-06, "loss": 0.75974303, "num_input_tokens_seen": 79800150, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.953125, "step": 3704, "time_per_iteration": 2.6187450885772705 }, { "auxiliary_loss_clip": 0.01165969, "auxiliary_loss_mlp": 0.01039732, "balance_loss_clip": 1.02320933, "balance_loss_mlp": 1.04872215, "epoch": 0.2227566511348264, "flos": 47958843467520.0, "grad_norm": 1.8162050053766645, "language_loss": 0.64361328, "learning_rate": 3.53027941762427e-06, "loss": 0.66567028, "num_input_tokens_seen": 79822390, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8984375, "step": 3705, "time_per_iteration": 2.905763864517212 }, { "auxiliary_loss_clip": 0.0110431, "auxiliary_loss_mlp": 0.01010499, "balance_loss_clip": 1.00742388, "balance_loss_mlp": 1.03833389, "epoch": 0.22281677438749437, "flos": 66218953230720.0, "grad_norm": 1.40575775779792, "language_loss": 0.65284389, "learning_rate": 3.5300361240145692e-06, "loss": 0.67399198, "num_input_tokens_seen": 79873350, "router_z_loss_clip": 0.03063965, "router_z_loss_mlp": 0.39453125, "step": 3706, "time_per_iteration": 3.068387746810913 }, { "auxiliary_loss_clip": 0.01159051, "auxiliary_loss_mlp": 0.01039298, "balance_loss_clip": 1.02246594, "balance_loss_mlp": 1.04655421, "epoch": 0.22287689764016233, "flos": 21871609787520.0, "grad_norm": 1.9702680878231875, "language_loss": 0.80710077, "learning_rate": 3.5297927758018147e-06, "loss": 0.82908428, "num_input_tokens_seen": 79891715, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9453125, "step": 3707, "time_per_iteration": 2.5872886180877686 }, { "auxiliary_loss_clip": 0.01143871, "auxiliary_loss_mlp": 0.01039266, "balance_loss_clip": 1.02183807, "balance_loss_mlp": 1.04950631, "epoch": 0.2229370208928303, "flos": 27672475587840.0, "grad_norm": 1.8536939525108627, "language_loss": 0.78218883, "learning_rate": 3.5295493729946913e-06, "loss": 0.80402023, "num_input_tokens_seen": 79911175, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9453125, "step": 3708, "time_per_iteration": 2.543128490447998 }, { "auxiliary_loss_clip": 0.01161452, "auxiliary_loss_mlp": 0.01044899, "balance_loss_clip": 1.02728033, "balance_loss_mlp": 1.0475719, "epoch": 0.22299714414549826, "flos": 30154657034880.0, "grad_norm": 2.0466406170548526, "language_loss": 0.80937594, "learning_rate": 3.529305915601885e-06, "loss": 0.83143944, "num_input_tokens_seen": 79931875, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.95703125, "step": 3709, "time_per_iteration": 2.691770315170288 }, { "auxiliary_loss_clip": 0.01151168, "auxiliary_loss_mlp": 0.01293644, "balance_loss_clip": 1.02817345, "balance_loss_mlp": 1.04647541, "epoch": 0.22305726739816625, "flos": 23143134309120.0, "grad_norm": 1.971375702479551, "language_loss": 0.68367851, "learning_rate": 3.5290624036320843e-06, "loss": 0.7081266, "num_input_tokens_seen": 79952445, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.95703125, "step": 3710, "time_per_iteration": 2.591958999633789 }, { "auxiliary_loss_clip": 0.01174792, "auxiliary_loss_mlp": 0.01042435, "balance_loss_clip": 1.02412486, "balance_loss_mlp": 1.04914665, "epoch": 0.22311739065083422, "flos": 19172061187200.0, "grad_norm": 10.345445234440135, "language_loss": 0.8955512, "learning_rate": 3.5288188370939796e-06, "loss": 0.91772354, "num_input_tokens_seen": 79971030, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.984375, "step": 3711, "time_per_iteration": 2.6195685863494873 }, { "auxiliary_loss_clip": 0.01160514, "auxiliary_loss_mlp": 0.0105034, "balance_loss_clip": 1.03150535, "balance_loss_mlp": 1.0473994, "epoch": 0.22317751390350218, "flos": 13617757319040.0, "grad_norm": 2.686164043583734, "language_loss": 0.89510441, "learning_rate": 3.5285752159962636e-06, "loss": 0.91721296, "num_input_tokens_seen": 79982085, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.94921875, "step": 3712, "time_per_iteration": 2.6089420318603516 }, { "auxiliary_loss_clip": 0.01143934, "auxiliary_loss_mlp": 0.01046091, "balance_loss_clip": 1.0273633, "balance_loss_mlp": 1.05110669, "epoch": 0.22323763715617015, "flos": 11029065068160.0, "grad_norm": 2.989095783643465, "language_loss": 0.75087905, "learning_rate": 3.5283315403476293e-06, "loss": 0.77277935, "num_input_tokens_seen": 79997460, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.92578125, "step": 3713, "time_per_iteration": 3.886178493499756 }, { "auxiliary_loss_clip": 0.01190175, "auxiliary_loss_mlp": 0.0104152, "balance_loss_clip": 1.02429461, "balance_loss_mlp": 1.05134952, "epoch": 0.22329776040883811, "flos": 41351531466240.0, "grad_norm": 1.9831605367356424, "language_loss": 0.6224432, "learning_rate": 3.5280878101567746e-06, "loss": 0.64476019, "num_input_tokens_seen": 80022450, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9375, "step": 3714, "time_per_iteration": 2.808626174926758 }, { "auxiliary_loss_clip": 0.0116961, "auxiliary_loss_mlp": 0.01034891, "balance_loss_clip": 1.01783192, "balance_loss_mlp": 1.0497179, "epoch": 0.22335788366150608, "flos": 25119478477440.0, "grad_norm": 1.956427946047402, "language_loss": 0.78677607, "learning_rate": 3.527844025432396e-06, "loss": 0.80882102, "num_input_tokens_seen": 80042100, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9296875, "step": 3715, "time_per_iteration": 2.644899368286133 }, { "auxiliary_loss_clip": 0.01175807, "auxiliary_loss_mlp": 0.01049864, "balance_loss_clip": 1.03224444, "balance_loss_mlp": 1.05407703, "epoch": 0.22341800691417404, "flos": 16983377769600.0, "grad_norm": 1.9523346455988304, "language_loss": 0.76942921, "learning_rate": 3.5276001861831945e-06, "loss": 0.79168594, "num_input_tokens_seen": 80059690, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9453125, "step": 3716, "time_per_iteration": 4.10342001914978 }, { "auxiliary_loss_clip": 0.01164063, "auxiliary_loss_mlp": 0.01042105, "balance_loss_clip": 1.02443767, "balance_loss_mlp": 1.05044329, "epoch": 0.22347813016684204, "flos": 14136738975360.0, "grad_norm": 3.597101875748683, "language_loss": 0.7943809, "learning_rate": 3.527356292417872e-06, "loss": 0.81644261, "num_input_tokens_seen": 80076060, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.953125, "step": 3717, "time_per_iteration": 2.627009630203247 }, { "auxiliary_loss_clip": 0.01191112, "auxiliary_loss_mlp": 0.01042832, "balance_loss_clip": 1.02479601, "balance_loss_mlp": 1.04967022, "epoch": 0.22353825341951, "flos": 23583147914880.0, "grad_norm": 3.4399370141210985, "language_loss": 0.68079191, "learning_rate": 3.527112344145132e-06, "loss": 0.70313132, "num_input_tokens_seen": 80094760, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.96484375, "step": 3718, "time_per_iteration": 2.6072208881378174 }, { "auxiliary_loss_clip": 0.01164575, "auxiliary_loss_mlp": 0.01042398, "balance_loss_clip": 1.02339637, "balance_loss_mlp": 1.04890108, "epoch": 0.22359837667217797, "flos": 29824206888960.0, "grad_norm": 1.747410346681577, "language_loss": 0.80488467, "learning_rate": 3.5268683413736808e-06, "loss": 0.82695448, "num_input_tokens_seen": 80114475, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.9765625, "step": 3719, "time_per_iteration": 4.1778669357299805 }, { "auxiliary_loss_clip": 0.01168459, "auxiliary_loss_mlp": 0.01054106, "balance_loss_clip": 1.0345912, "balance_loss_mlp": 1.04885709, "epoch": 0.22365849992484593, "flos": 17603088140160.0, "grad_norm": 3.5751334399907315, "language_loss": 0.86860776, "learning_rate": 3.526624284112226e-06, "loss": 0.89083338, "num_input_tokens_seen": 80132920, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.015625, "step": 3720, "time_per_iteration": 2.6218791007995605 }, { "auxiliary_loss_clip": 0.01163063, "auxiliary_loss_mlp": 0.01035527, "balance_loss_clip": 1.01744282, "balance_loss_mlp": 1.05201268, "epoch": 0.2237186231775139, "flos": 22710949868160.0, "grad_norm": 2.063705762406682, "language_loss": 0.7429167, "learning_rate": 3.5263801723694774e-06, "loss": 0.76490259, "num_input_tokens_seen": 80152845, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.9296875, "step": 3721, "time_per_iteration": 2.557506799697876 }, { "auxiliary_loss_clip": 0.01148606, "auxiliary_loss_mlp": 0.01043118, "balance_loss_clip": 1.02475929, "balance_loss_mlp": 1.05091071, "epoch": 0.22377874643018186, "flos": 13371518609280.0, "grad_norm": 2.3707163059273215, "language_loss": 0.79430366, "learning_rate": 3.5261360061541464e-06, "loss": 0.81622088, "num_input_tokens_seen": 80170680, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9765625, "step": 3722, "time_per_iteration": 2.5594630241394043 }, { "auxiliary_loss_clip": 0.01171413, "auxiliary_loss_mlp": 0.01038576, "balance_loss_clip": 1.02169561, "balance_loss_mlp": 1.0515554, "epoch": 0.22383886968284986, "flos": 17894970057600.0, "grad_norm": 2.2390996149259035, "language_loss": 0.81970298, "learning_rate": 3.5258917854749476e-06, "loss": 0.84180284, "num_input_tokens_seen": 80189030, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9296875, "step": 3723, "time_per_iteration": 2.542520046234131 }, { "auxiliary_loss_clip": 0.01148103, "auxiliary_loss_mlp": 0.01045961, "balance_loss_clip": 1.02792442, "balance_loss_mlp": 1.05119848, "epoch": 0.22389899293551782, "flos": 23879123982720.0, "grad_norm": 1.824245103559682, "language_loss": 0.84337246, "learning_rate": 3.5256475103405957e-06, "loss": 0.86531317, "num_input_tokens_seen": 80208365, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.96875, "step": 3724, "time_per_iteration": 2.668179988861084 }, { "auxiliary_loss_clip": 0.01161253, "auxiliary_loss_mlp": 0.01040688, "balance_loss_clip": 1.02300954, "balance_loss_mlp": 1.04949093, "epoch": 0.2239591161881858, "flos": 27272430840960.0, "grad_norm": 3.243048562728407, "language_loss": 0.79144913, "learning_rate": 3.525403180759809e-06, "loss": 0.81346852, "num_input_tokens_seen": 80228685, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.93359375, "step": 3725, "time_per_iteration": 2.6202657222747803 }, { "auxiliary_loss_clip": 0.01189731, "auxiliary_loss_mlp": 0.0104034, "balance_loss_clip": 1.02261353, "balance_loss_mlp": 1.05127692, "epoch": 0.22401923944085375, "flos": 22236857233920.0, "grad_norm": 2.011744255662872, "language_loss": 0.77406883, "learning_rate": 3.5251587967413065e-06, "loss": 0.79636955, "num_input_tokens_seen": 80247635, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.94140625, "step": 3726, "time_per_iteration": 2.7179253101348877 }, { "auxiliary_loss_clip": 0.01163093, "auxiliary_loss_mlp": 0.01042632, "balance_loss_clip": 1.02231884, "balance_loss_mlp": 1.05176437, "epoch": 0.22407936269352172, "flos": 12053668521600.0, "grad_norm": 2.6679480212507074, "language_loss": 0.72225827, "learning_rate": 3.5249143582938096e-06, "loss": 0.74431551, "num_input_tokens_seen": 80260045, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0234375, "step": 3727, "time_per_iteration": 2.5282623767852783 }, { "auxiliary_loss_clip": 0.01147759, "auxiliary_loss_mlp": 0.01044968, "balance_loss_clip": 1.02469099, "balance_loss_mlp": 1.04856038, "epoch": 0.22413948594618968, "flos": 19353553632000.0, "grad_norm": 2.0358760866922148, "language_loss": 0.86833036, "learning_rate": 3.5246698654260416e-06, "loss": 0.89025766, "num_input_tokens_seen": 80277680, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.9921875, "step": 3728, "time_per_iteration": 2.6627511978149414 }, { "auxiliary_loss_clip": 0.01165026, "auxiliary_loss_mlp": 0.01051024, "balance_loss_clip": 1.03110385, "balance_loss_mlp": 1.05138636, "epoch": 0.22419960919885765, "flos": 24170000319360.0, "grad_norm": 4.447378076722231, "language_loss": 0.80487895, "learning_rate": 3.5244253181467284e-06, "loss": 0.82703948, "num_input_tokens_seen": 80294795, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.95703125, "step": 3729, "time_per_iteration": 2.5950498580932617 }, { "auxiliary_loss_clip": 0.01161897, "auxiliary_loss_mlp": 0.01050723, "balance_loss_clip": 1.03331816, "balance_loss_mlp": 1.05141854, "epoch": 0.22425973245152564, "flos": 27378977558400.0, "grad_norm": 1.7063547497380878, "language_loss": 0.86792338, "learning_rate": 3.5241807164645963e-06, "loss": 0.89004958, "num_input_tokens_seen": 80315425, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.92578125, "step": 3730, "time_per_iteration": 2.730339527130127 }, { "auxiliary_loss_clip": 0.01157687, "auxiliary_loss_mlp": 0.01287627, "balance_loss_clip": 1.02310443, "balance_loss_mlp": 1.04822159, "epoch": 0.2243198557041936, "flos": 13735652734080.0, "grad_norm": 1.8865014930141653, "language_loss": 0.73233038, "learning_rate": 3.5239360603883754e-06, "loss": 0.75678349, "num_input_tokens_seen": 80333905, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9140625, "step": 3731, "time_per_iteration": 2.68208646774292 }, { "auxiliary_loss_clip": 0.01171557, "auxiliary_loss_mlp": 0.01041678, "balance_loss_clip": 1.0240705, "balance_loss_mlp": 1.04930425, "epoch": 0.22437997895686157, "flos": 19530700531200.0, "grad_norm": 2.316176930110785, "language_loss": 0.7549988, "learning_rate": 3.523691349926797e-06, "loss": 0.77713114, "num_input_tokens_seen": 80352165, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.953125, "step": 3732, "time_per_iteration": 2.5967001914978027 }, { "auxiliary_loss_clip": 0.01144009, "auxiliary_loss_mlp": 0.01059413, "balance_loss_clip": 1.04103088, "balance_loss_mlp": 1.0493896, "epoch": 0.22444010220952954, "flos": 23696230907520.0, "grad_norm": 1.9046215303494578, "language_loss": 0.88027918, "learning_rate": 3.523446585088593e-06, "loss": 0.90231341, "num_input_tokens_seen": 80371305, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9453125, "step": 3733, "time_per_iteration": 2.558074712753296 }, { "auxiliary_loss_clip": 0.01178716, "auxiliary_loss_mlp": 0.01046974, "balance_loss_clip": 1.02855659, "balance_loss_mlp": 1.04822278, "epoch": 0.2245002254621975, "flos": 22382905933440.0, "grad_norm": 1.6515432671898724, "language_loss": 0.84507388, "learning_rate": 3.5232017658825e-06, "loss": 0.86733079, "num_input_tokens_seen": 80391020, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.94140625, "step": 3734, "time_per_iteration": 2.628268241882324 }, { "auxiliary_loss_clip": 0.0116242, "auxiliary_loss_mlp": 0.01050806, "balance_loss_clip": 1.03305578, "balance_loss_mlp": 1.05050611, "epoch": 0.22456034871486547, "flos": 26942303917440.0, "grad_norm": 2.1978434933742297, "language_loss": 0.76730692, "learning_rate": 3.522956892317253e-06, "loss": 0.78943914, "num_input_tokens_seen": 80411365, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9375, "step": 3735, "time_per_iteration": 2.63824200630188 }, { "auxiliary_loss_clip": 0.01147179, "auxiliary_loss_mlp": 0.01048015, "balance_loss_clip": 1.03164768, "balance_loss_mlp": 1.04859471, "epoch": 0.22462047196753343, "flos": 28983538005120.0, "grad_norm": 1.7994425887908987, "language_loss": 0.84023559, "learning_rate": 3.5227119644015922e-06, "loss": 0.8621875, "num_input_tokens_seen": 80431075, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.89453125, "step": 3736, "time_per_iteration": 2.5780093669891357 }, { "auxiliary_loss_clip": 0.01170487, "auxiliary_loss_mlp": 0.0104605, "balance_loss_clip": 1.02826357, "balance_loss_mlp": 1.04873514, "epoch": 0.22468059522020142, "flos": 20011329440640.0, "grad_norm": 1.6001229731370266, "language_loss": 0.86247671, "learning_rate": 3.5224669821442586e-06, "loss": 0.88464212, "num_input_tokens_seen": 80449240, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.94921875, "step": 3737, "time_per_iteration": 2.5517334938049316 }, { "auxiliary_loss_clip": 0.01145717, "auxiliary_loss_mlp": 0.01054091, "balance_loss_clip": 1.0345881, "balance_loss_mlp": 1.04933429, "epoch": 0.2247407184728694, "flos": 29314239546240.0, "grad_norm": 1.971707505184376, "language_loss": 0.79065192, "learning_rate": 3.522221945553995e-06, "loss": 0.81265002, "num_input_tokens_seen": 80467900, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.96484375, "step": 3738, "time_per_iteration": 2.540815591812134 }, { "auxiliary_loss_clip": 0.0116121, "auxiliary_loss_mlp": 0.01045113, "balance_loss_clip": 1.02795911, "balance_loss_mlp": 1.04932129, "epoch": 0.22480084172553735, "flos": 22310366417280.0, "grad_norm": 1.6086815183587404, "language_loss": 0.76263285, "learning_rate": 3.521976854639546e-06, "loss": 0.7846961, "num_input_tokens_seen": 80487100, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9375, "step": 3739, "time_per_iteration": 2.5740768909454346 }, { "auxiliary_loss_clip": 0.01169427, "auxiliary_loss_mlp": 0.0104681, "balance_loss_clip": 1.0286907, "balance_loss_mlp": 1.04817021, "epoch": 0.22486096497820532, "flos": 25591272641280.0, "grad_norm": 1.9084383460407166, "language_loss": 0.74608946, "learning_rate": 3.5217317094096576e-06, "loss": 0.76825178, "num_input_tokens_seen": 80508625, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.9453125, "step": 3740, "time_per_iteration": 2.608032703399658 }, { "auxiliary_loss_clip": 0.01141212, "auxiliary_loss_mlp": 0.01044057, "balance_loss_clip": 1.02605677, "balance_loss_mlp": 1.04769206, "epoch": 0.22492108823087328, "flos": 17639824775040.0, "grad_norm": 2.1725977314192075, "language_loss": 0.75749248, "learning_rate": 3.5214865098730785e-06, "loss": 0.77934515, "num_input_tokens_seen": 80527345, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9375, "step": 3741, "time_per_iteration": 2.5686707496643066 }, { "auxiliary_loss_clip": 0.01160568, "auxiliary_loss_mlp": 0.01036916, "balance_loss_clip": 1.02003551, "balance_loss_mlp": 1.04984856, "epoch": 0.22498121148354125, "flos": 16034653797120.0, "grad_norm": 2.171631504372134, "language_loss": 0.87226856, "learning_rate": 3.52124125603856e-06, "loss": 0.89424336, "num_input_tokens_seen": 80545545, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.921875, "step": 3742, "time_per_iteration": 2.5481364727020264 }, { "auxiliary_loss_clip": 0.01177132, "auxiliary_loss_mlp": 0.01047877, "balance_loss_clip": 1.02944708, "balance_loss_mlp": 1.04943311, "epoch": 0.22504133473620924, "flos": 24023772051840.0, "grad_norm": 1.9268090170503847, "language_loss": 0.81338835, "learning_rate": 3.520995947914854e-06, "loss": 0.8356384, "num_input_tokens_seen": 80565040, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.9140625, "step": 3743, "time_per_iteration": 2.7056987285614014 }, { "auxiliary_loss_clip": 0.01152343, "auxiliary_loss_mlp": 0.01037589, "balance_loss_clip": 1.02017295, "balance_loss_mlp": 1.04892182, "epoch": 0.2251014579888772, "flos": 16763963541120.0, "grad_norm": 3.2886234885343617, "language_loss": 0.63493943, "learning_rate": 3.520750585510715e-06, "loss": 0.65683877, "num_input_tokens_seen": 80582815, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9453125, "step": 3744, "time_per_iteration": 2.545988082885742 }, { "auxiliary_loss_clip": 0.01149924, "auxiliary_loss_mlp": 0.01042405, "balance_loss_clip": 1.02426147, "balance_loss_mlp": 1.04668474, "epoch": 0.22516158124154517, "flos": 13991013498240.0, "grad_norm": 2.7807835025930046, "language_loss": 0.75068641, "learning_rate": 3.5205051688348997e-06, "loss": 0.77260971, "num_input_tokens_seen": 80600865, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.94140625, "step": 3745, "time_per_iteration": 2.635113000869751 }, { "auxiliary_loss_clip": 0.01159242, "auxiliary_loss_mlp": 0.01038148, "balance_loss_clip": 1.0208869, "balance_loss_mlp": 1.04796004, "epoch": 0.22522170449421314, "flos": 14390016750720.0, "grad_norm": 1.7036831920324458, "language_loss": 0.80363446, "learning_rate": 3.520259697896166e-06, "loss": 0.82560837, "num_input_tokens_seen": 80617455, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9296875, "step": 3746, "time_per_iteration": 2.581608772277832 }, { "auxiliary_loss_clip": 0.01151074, "auxiliary_loss_mlp": 0.01045628, "balance_loss_clip": 1.02768695, "balance_loss_mlp": 1.04646683, "epoch": 0.2252818277468811, "flos": 23805542972160.0, "grad_norm": 3.0724545556354093, "language_loss": 0.86138773, "learning_rate": 3.5200141727032744e-06, "loss": 0.88335478, "num_input_tokens_seen": 80635125, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.95703125, "step": 3747, "time_per_iteration": 2.6584105491638184 }, { "auxiliary_loss_clip": 0.01150098, "auxiliary_loss_mlp": 0.01282684, "balance_loss_clip": 1.01770973, "balance_loss_mlp": 1.04655385, "epoch": 0.22534195099954907, "flos": 24718033100160.0, "grad_norm": 2.3286066521517044, "language_loss": 0.76291549, "learning_rate": 3.519768593264987e-06, "loss": 0.78724337, "num_input_tokens_seen": 80656370, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.94921875, "step": 3748, "time_per_iteration": 2.593914031982422 }, { "auxiliary_loss_clip": 0.01146984, "auxiliary_loss_mlp": 0.01041484, "balance_loss_clip": 1.02363825, "balance_loss_mlp": 1.05094838, "epoch": 0.22540207425221703, "flos": 21032341534080.0, "grad_norm": 1.8995865830028347, "language_loss": 0.79104006, "learning_rate": 3.519522959590068e-06, "loss": 0.81292474, "num_input_tokens_seen": 80676495, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.95703125, "step": 3749, "time_per_iteration": 2.623713970184326 }, { "auxiliary_loss_clip": 0.01154304, "auxiliary_loss_mlp": 0.0104241, "balance_loss_clip": 1.02561331, "balance_loss_mlp": 1.04600787, "epoch": 0.22546219750488503, "flos": 19390362094080.0, "grad_norm": 2.154713489896113, "language_loss": 0.79179823, "learning_rate": 3.5192772716872827e-06, "loss": 0.81376541, "num_input_tokens_seen": 80694755, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.90625, "step": 3750, "time_per_iteration": 2.497863292694092 }, { "auxiliary_loss_clip": 0.01179939, "auxiliary_loss_mlp": 0.01047645, "balance_loss_clip": 1.02934635, "balance_loss_mlp": 1.04950738, "epoch": 0.225522320757553, "flos": 25192628524800.0, "grad_norm": 1.9500222618395502, "language_loss": 0.81970692, "learning_rate": 3.5190315295653996e-06, "loss": 0.84198278, "num_input_tokens_seen": 80713670, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.94140625, "step": 3751, "time_per_iteration": 2.645711898803711 }, { "auxiliary_loss_clip": 0.011637, "auxiliary_loss_mlp": 0.01042424, "balance_loss_clip": 1.02478063, "balance_loss_mlp": 1.05065334, "epoch": 0.22558244401022096, "flos": 17163110448000.0, "grad_norm": 2.3424372073371833, "language_loss": 0.83409631, "learning_rate": 3.518785733233189e-06, "loss": 0.8561576, "num_input_tokens_seen": 80731450, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.94921875, "step": 3752, "time_per_iteration": 2.5218660831451416 }, { "auxiliary_loss_clip": 0.01140574, "auxiliary_loss_mlp": 0.0103824, "balance_loss_clip": 1.02147877, "balance_loss_mlp": 1.04805112, "epoch": 0.22564256726288892, "flos": 15231008856960.0, "grad_norm": 1.803800781602255, "language_loss": 0.78257549, "learning_rate": 3.518539882699422e-06, "loss": 0.80436361, "num_input_tokens_seen": 80748415, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.92578125, "step": 3753, "time_per_iteration": 2.5466361045837402 }, { "auxiliary_loss_clip": 0.01157454, "auxiliary_loss_mlp": 0.01040371, "balance_loss_clip": 1.02213204, "balance_loss_mlp": 1.04837942, "epoch": 0.2257026905155569, "flos": 34568652764160.0, "grad_norm": 3.9282859558229775, "language_loss": 0.78334916, "learning_rate": 3.518293977972873e-06, "loss": 0.80532742, "num_input_tokens_seen": 80770835, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9140625, "step": 3754, "time_per_iteration": 2.632126808166504 }, { "auxiliary_loss_clip": 0.01160296, "auxiliary_loss_mlp": 0.01048231, "balance_loss_clip": 1.03054059, "balance_loss_mlp": 1.05055249, "epoch": 0.22576281376822485, "flos": 19938430788480.0, "grad_norm": 1.686964144664365, "language_loss": 0.69928151, "learning_rate": 3.5180480190623173e-06, "loss": 0.72136676, "num_input_tokens_seen": 80787840, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.91796875, "step": 3755, "time_per_iteration": 3.989574670791626 }, { "auxiliary_loss_clip": 0.01176214, "auxiliary_loss_mlp": 0.01052894, "balance_loss_clip": 1.03452349, "balance_loss_mlp": 1.05238962, "epoch": 0.22582293702089282, "flos": 24602005192320.0, "grad_norm": 2.9154147651476228, "language_loss": 0.77243292, "learning_rate": 3.517802005976533e-06, "loss": 0.79472399, "num_input_tokens_seen": 80806335, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.96875, "step": 3756, "time_per_iteration": 2.682926654815674 }, { "auxiliary_loss_clip": 0.01167482, "auxiliary_loss_mlp": 0.01045276, "balance_loss_clip": 1.0267508, "balance_loss_mlp": 1.05316925, "epoch": 0.2258830602735608, "flos": 23035438356480.0, "grad_norm": 1.9184723866182343, "language_loss": 0.82679188, "learning_rate": 3.5175559387242988e-06, "loss": 0.84891951, "num_input_tokens_seen": 80825355, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9609375, "step": 3757, "time_per_iteration": 2.569403648376465 }, { "auxiliary_loss_clip": 0.01153929, "auxiliary_loss_mlp": 0.0104546, "balance_loss_clip": 1.02662468, "balance_loss_mlp": 1.05037844, "epoch": 0.22594318352622877, "flos": 22158427887360.0, "grad_norm": 1.9248621532809493, "language_loss": 0.7293967, "learning_rate": 3.517309817314397e-06, "loss": 0.75139058, "num_input_tokens_seen": 80842570, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.9453125, "step": 3758, "time_per_iteration": 5.451900482177734 }, { "auxiliary_loss_clip": 0.01164869, "auxiliary_loss_mlp": 0.01048931, "balance_loss_clip": 1.03073931, "balance_loss_mlp": 1.05204678, "epoch": 0.22600330677889674, "flos": 20594303176320.0, "grad_norm": 2.176873708702191, "language_loss": 0.77073789, "learning_rate": 3.5170636417556113e-06, "loss": 0.79287589, "num_input_tokens_seen": 80858745, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.94921875, "step": 3759, "time_per_iteration": 2.5796985626220703 }, { "auxiliary_loss_clip": 0.01180944, "auxiliary_loss_mlp": 0.01042642, "balance_loss_clip": 1.02448606, "balance_loss_mlp": 1.04938507, "epoch": 0.2260634300315647, "flos": 35659798162560.0, "grad_norm": 2.095338456203913, "language_loss": 0.782947, "learning_rate": 3.516817412056726e-06, "loss": 0.80518281, "num_input_tokens_seen": 80880085, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9609375, "step": 3760, "time_per_iteration": 2.6885263919830322 }, { "auxiliary_loss_clip": 0.01097332, "auxiliary_loss_mlp": 0.01011274, "balance_loss_clip": 1.00850797, "balance_loss_mlp": 1.03322995, "epoch": 0.22612355328423267, "flos": 72090455126400.0, "grad_norm": 0.9578156922833736, "language_loss": 0.60047311, "learning_rate": 3.516571128226529e-06, "loss": 0.6215592, "num_input_tokens_seen": 80937660, "router_z_loss_clip": 0.02770996, "router_z_loss_mlp": 0.36914062, "step": 3761, "time_per_iteration": 4.522829532623291 }, { "auxiliary_loss_clip": 0.01173771, "auxiliary_loss_mlp": 0.01046383, "balance_loss_clip": 1.02790511, "balance_loss_mlp": 1.04945779, "epoch": 0.22618367653690064, "flos": 22783776693120.0, "grad_norm": 2.0721353786743686, "language_loss": 0.76552653, "learning_rate": 3.51632479027381e-06, "loss": 0.78772807, "num_input_tokens_seen": 80956265, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.9765625, "step": 3762, "time_per_iteration": 2.6147751808166504 }, { "auxiliary_loss_clip": 0.01163517, "auxiliary_loss_mlp": 0.01041834, "balance_loss_clip": 1.02345169, "balance_loss_mlp": 1.04991901, "epoch": 0.22624379978956863, "flos": 20448254476800.0, "grad_norm": 3.003570516784203, "language_loss": 0.7901907, "learning_rate": 3.5160783982073595e-06, "loss": 0.81224418, "num_input_tokens_seen": 80975185, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.95703125, "step": 3763, "time_per_iteration": 2.5123798847198486 }, { "auxiliary_loss_clip": 0.01158836, "auxiliary_loss_mlp": 0.01050762, "balance_loss_clip": 1.03130722, "balance_loss_mlp": 1.05184269, "epoch": 0.2263039230422366, "flos": 17494314779520.0, "grad_norm": 1.8166695045641983, "language_loss": 0.91269732, "learning_rate": 3.5158319520359703e-06, "loss": 0.93479323, "num_input_tokens_seen": 80992830, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.9765625, "step": 3764, "time_per_iteration": 2.547084093093872 }, { "auxiliary_loss_clip": 0.01153013, "auxiliary_loss_mlp": 0.01052386, "balance_loss_clip": 1.03456414, "balance_loss_mlp": 1.05081713, "epoch": 0.22636404629490456, "flos": 28329748606080.0, "grad_norm": 1.7394983924586036, "language_loss": 0.7539463, "learning_rate": 3.515585451768438e-06, "loss": 0.77600032, "num_input_tokens_seen": 81013675, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9296875, "step": 3765, "time_per_iteration": 2.5873842239379883 }, { "auxiliary_loss_clip": 0.01152673, "auxiliary_loss_mlp": 0.01050865, "balance_loss_clip": 1.03344822, "balance_loss_mlp": 1.05076098, "epoch": 0.22642416954757252, "flos": 17489143221120.0, "grad_norm": 2.5324557906074823, "language_loss": 0.89478022, "learning_rate": 3.51533889741356e-06, "loss": 0.91681558, "num_input_tokens_seen": 81030345, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9296875, "step": 3766, "time_per_iteration": 2.546409845352173 }, { "auxiliary_loss_clip": 0.01161987, "auxiliary_loss_mlp": 0.01043058, "balance_loss_clip": 1.02551019, "balance_loss_mlp": 1.05284667, "epoch": 0.2264842928002405, "flos": 24384530298240.0, "grad_norm": 1.542968695595547, "language_loss": 0.74777412, "learning_rate": 3.515092288980135e-06, "loss": 0.76982462, "num_input_tokens_seen": 81051000, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.91015625, "step": 3767, "time_per_iteration": 2.6006357669830322 }, { "auxiliary_loss_clip": 0.01152834, "auxiliary_loss_mlp": 0.0104467, "balance_loss_clip": 1.0264194, "balance_loss_mlp": 1.0491004, "epoch": 0.22654441605290845, "flos": 19830519354240.0, "grad_norm": 1.4780709373995247, "language_loss": 0.71412575, "learning_rate": 3.5148456264769625e-06, "loss": 0.73610079, "num_input_tokens_seen": 81071205, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.94921875, "step": 3768, "time_per_iteration": 2.5963857173919678 }, { "auxiliary_loss_clip": 0.0115796, "auxiliary_loss_mlp": 0.0105908, "balance_loss_clip": 1.04056716, "balance_loss_mlp": 1.05458355, "epoch": 0.22660453930557642, "flos": 27454569730560.0, "grad_norm": 2.249537560176713, "language_loss": 0.77921629, "learning_rate": 3.5145989099128465e-06, "loss": 0.80138671, "num_input_tokens_seen": 81091880, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9453125, "step": 3769, "time_per_iteration": 2.6388802528381348 }, { "auxiliary_loss_clip": 0.0116353, "auxiliary_loss_mlp": 0.01042526, "balance_loss_clip": 1.02371478, "balance_loss_mlp": 1.04916167, "epoch": 0.2266646625582444, "flos": 23988148738560.0, "grad_norm": 2.0354779934981737, "language_loss": 0.67954636, "learning_rate": 3.5143521392965914e-06, "loss": 0.70160693, "num_input_tokens_seen": 81113290, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.9609375, "step": 3770, "time_per_iteration": 2.6499476432800293 }, { "auxiliary_loss_clip": 0.01145993, "auxiliary_loss_mlp": 0.01047067, "balance_loss_clip": 1.02944803, "balance_loss_mlp": 1.05006099, "epoch": 0.22672478581091238, "flos": 26028054023040.0, "grad_norm": 1.5618380812001762, "language_loss": 0.80103618, "learning_rate": 3.5141053146370047e-06, "loss": 0.82296675, "num_input_tokens_seen": 81133535, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9609375, "step": 3771, "time_per_iteration": 2.5705227851867676 }, { "auxiliary_loss_clip": 0.01164884, "auxiliary_loss_mlp": 0.01053332, "balance_loss_clip": 1.03511655, "balance_loss_mlp": 1.0472939, "epoch": 0.22678490906358034, "flos": 23841812730240.0, "grad_norm": 1.518697243148495, "language_loss": 0.78654921, "learning_rate": 3.513858435942893e-06, "loss": 0.80873138, "num_input_tokens_seen": 81154650, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.90625, "step": 3772, "time_per_iteration": 2.6447553634643555 }, { "auxiliary_loss_clip": 0.01110988, "auxiliary_loss_mlp": 0.01019715, "balance_loss_clip": 1.01687765, "balance_loss_mlp": 1.03581667, "epoch": 0.2268450323162483, "flos": 65048088574080.0, "grad_norm": 0.6590516799671498, "language_loss": 0.54366916, "learning_rate": 3.5136115032230683e-06, "loss": 0.56497622, "num_input_tokens_seen": 81221240, "router_z_loss_clip": 0.02832031, "router_z_loss_mlp": 0.390625, "step": 3773, "time_per_iteration": 3.3078136444091797 }, { "auxiliary_loss_clip": 0.01167123, "auxiliary_loss_mlp": 0.01042366, "balance_loss_clip": 1.02441359, "balance_loss_mlp": 1.04859567, "epoch": 0.22690515556891627, "flos": 22526081544960.0, "grad_norm": 1.9927995303387653, "language_loss": 0.70171714, "learning_rate": 3.5133645164863427e-06, "loss": 0.72381198, "num_input_tokens_seen": 81241520, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9140625, "step": 3774, "time_per_iteration": 2.57875919342041 }, { "auxiliary_loss_clip": 0.01158071, "auxiliary_loss_mlp": 0.01040099, "balance_loss_clip": 1.02225327, "balance_loss_mlp": 1.04844368, "epoch": 0.22696527882158424, "flos": 18223444955520.0, "grad_norm": 2.644098043504089, "language_loss": 0.74765074, "learning_rate": 3.5131174757415298e-06, "loss": 0.76963246, "num_input_tokens_seen": 81256825, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9140625, "step": 3775, "time_per_iteration": 2.536574125289917 }, { "auxiliary_loss_clip": 0.0114135, "auxiliary_loss_mlp": 0.01040097, "balance_loss_clip": 1.02336025, "balance_loss_mlp": 1.04813635, "epoch": 0.22702540207425223, "flos": 17019252478080.0, "grad_norm": 1.8677083773438987, "language_loss": 0.82166797, "learning_rate": 3.512870380997446e-06, "loss": 0.84348243, "num_input_tokens_seen": 81275695, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9296875, "step": 3776, "time_per_iteration": 2.5645077228546143 }, { "auxiliary_loss_clip": 0.01161093, "auxiliary_loss_mlp": 0.01039495, "balance_loss_clip": 1.0216018, "balance_loss_mlp": 1.04811692, "epoch": 0.2270855253269202, "flos": 21325731822720.0, "grad_norm": 1.8566263702910601, "language_loss": 0.82581103, "learning_rate": 3.5126232322629114e-06, "loss": 0.84781688, "num_input_tokens_seen": 81294920, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.94921875, "step": 3777, "time_per_iteration": 2.6179089546203613 }, { "auxiliary_loss_clip": 0.01162162, "auxiliary_loss_mlp": 0.01040859, "balance_loss_clip": 1.02327609, "balance_loss_mlp": 1.049932, "epoch": 0.22714564857958816, "flos": 23550469516800.0, "grad_norm": 3.2565082414755464, "language_loss": 0.72256076, "learning_rate": 3.5123760295467435e-06, "loss": 0.744591, "num_input_tokens_seen": 81314275, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.94140625, "step": 3778, "time_per_iteration": 2.5566117763519287 }, { "auxiliary_loss_clip": 0.01169543, "auxiliary_loss_mlp": 0.01038208, "balance_loss_clip": 1.02013588, "balance_loss_mlp": 1.04685307, "epoch": 0.22720577183225613, "flos": 25989880844160.0, "grad_norm": 2.3009351756278584, "language_loss": 0.64538044, "learning_rate": 3.5121287728577657e-06, "loss": 0.66745794, "num_input_tokens_seen": 81333890, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.953125, "step": 3779, "time_per_iteration": 2.6476964950561523 }, { "auxiliary_loss_clip": 0.01150152, "auxiliary_loss_mlp": 0.01042733, "balance_loss_clip": 1.02555537, "balance_loss_mlp": 1.0492208, "epoch": 0.2272658950849241, "flos": 20814076540800.0, "grad_norm": 1.796034355911587, "language_loss": 0.70199585, "learning_rate": 3.5118814622048012e-06, "loss": 0.72392476, "num_input_tokens_seen": 81353640, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.91796875, "step": 3780, "time_per_iteration": 2.572758913040161 }, { "auxiliary_loss_clip": 0.01151351, "auxiliary_loss_mlp": 0.01042295, "balance_loss_clip": 1.02365088, "balance_loss_mlp": 1.0502578, "epoch": 0.22732601833759206, "flos": 23909324342400.0, "grad_norm": 2.5088855232436473, "language_loss": 0.89809585, "learning_rate": 3.5116340975966766e-06, "loss": 0.92003226, "num_input_tokens_seen": 81371595, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.91796875, "step": 3781, "time_per_iteration": 2.598148822784424 }, { "auxiliary_loss_clip": 0.01142074, "auxiliary_loss_mlp": 0.01041641, "balance_loss_clip": 1.0240221, "balance_loss_mlp": 1.04819393, "epoch": 0.22738614159026002, "flos": 15924407978880.0, "grad_norm": 2.567695785907716, "language_loss": 0.7422688, "learning_rate": 3.5113866790422195e-06, "loss": 0.76410598, "num_input_tokens_seen": 81388435, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.94140625, "step": 3782, "time_per_iteration": 2.4886627197265625 }, { "auxiliary_loss_clip": 0.01155407, "auxiliary_loss_mlp": 0.01044257, "balance_loss_clip": 1.02749586, "balance_loss_mlp": 1.04574704, "epoch": 0.22744626484292801, "flos": 24205515891840.0, "grad_norm": 1.7060516556290617, "language_loss": 0.82357925, "learning_rate": 3.51113920655026e-06, "loss": 0.84557593, "num_input_tokens_seen": 81410195, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.91796875, "step": 3783, "time_per_iteration": 2.630849838256836 }, { "auxiliary_loss_clip": 0.01160493, "auxiliary_loss_mlp": 0.01045677, "balance_loss_clip": 1.02749777, "balance_loss_mlp": 1.04905868, "epoch": 0.22750638809559598, "flos": 24791614110720.0, "grad_norm": 1.7409448267655705, "language_loss": 0.7599169, "learning_rate": 3.510891680129629e-06, "loss": 0.78197867, "num_input_tokens_seen": 81430060, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9375, "step": 3784, "time_per_iteration": 2.6168975830078125 }, { "auxiliary_loss_clip": 0.01136085, "auxiliary_loss_mlp": 0.01040088, "balance_loss_clip": 1.02236176, "balance_loss_mlp": 1.0458374, "epoch": 0.22756651134826394, "flos": 22236498097920.0, "grad_norm": 1.73702349251988, "language_loss": 0.70871425, "learning_rate": 3.51064409978916e-06, "loss": 0.73047602, "num_input_tokens_seen": 81447375, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.90234375, "step": 3785, "time_per_iteration": 2.6179349422454834 }, { "auxiliary_loss_clip": 0.01112221, "auxiliary_loss_mlp": 0.01004433, "balance_loss_clip": 1.00085628, "balance_loss_mlp": 1.0455873, "epoch": 0.2276266346009319, "flos": 62707466626560.0, "grad_norm": 0.8430547909871972, "language_loss": 0.61915201, "learning_rate": 3.5103964655376894e-06, "loss": 0.64031857, "num_input_tokens_seen": 81505235, "router_z_loss_clip": 0.03564453, "router_z_loss_mlp": 0.39453125, "step": 3786, "time_per_iteration": 3.1558408737182617 }, { "auxiliary_loss_clip": 0.01164121, "auxiliary_loss_mlp": 0.01038815, "balance_loss_clip": 1.02121997, "balance_loss_mlp": 1.05089021, "epoch": 0.22768675785359987, "flos": 18613936684800.0, "grad_norm": 4.501409942432432, "language_loss": 0.86503971, "learning_rate": 3.510148777384054e-06, "loss": 0.88706911, "num_input_tokens_seen": 81518685, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.953125, "step": 3787, "time_per_iteration": 2.571073532104492 }, { "auxiliary_loss_clip": 0.01157437, "auxiliary_loss_mlp": 0.01036769, "balance_loss_clip": 1.02046132, "balance_loss_mlp": 1.04880178, "epoch": 0.22774688110626784, "flos": 26870195364480.0, "grad_norm": 1.2864585722349733, "language_loss": 0.72496653, "learning_rate": 3.5099010353370934e-06, "loss": 0.74690861, "num_input_tokens_seen": 81538940, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.90625, "step": 3788, "time_per_iteration": 2.6791605949401855 }, { "auxiliary_loss_clip": 0.0115825, "auxiliary_loss_mlp": 0.01034834, "balance_loss_clip": 1.01820445, "balance_loss_mlp": 1.04907823, "epoch": 0.2278070043589358, "flos": 15553593924480.0, "grad_norm": 3.9964192983513898, "language_loss": 0.6775192, "learning_rate": 3.5096532394056487e-06, "loss": 0.69945008, "num_input_tokens_seen": 81555525, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9140625, "step": 3789, "time_per_iteration": 2.573544502258301 }, { "auxiliary_loss_clip": 0.01158468, "auxiliary_loss_mlp": 0.01039942, "balance_loss_clip": 1.02197671, "balance_loss_mlp": 1.04847348, "epoch": 0.2278671276116038, "flos": 22416805393920.0, "grad_norm": 2.140394441489206, "language_loss": 0.7614491, "learning_rate": 3.5094053895985632e-06, "loss": 0.7834332, "num_input_tokens_seen": 81576305, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.91796875, "step": 3790, "time_per_iteration": 2.6571788787841797 }, { "auxiliary_loss_clip": 0.01173178, "auxiliary_loss_mlp": 0.0103636, "balance_loss_clip": 1.01942062, "balance_loss_mlp": 1.04572797, "epoch": 0.22792725086427176, "flos": 20631363033600.0, "grad_norm": 1.949571348472451, "language_loss": 0.90795857, "learning_rate": 3.5091574859246818e-06, "loss": 0.93005395, "num_input_tokens_seen": 81594115, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9140625, "step": 3791, "time_per_iteration": 2.6054868698120117 }, { "auxiliary_loss_clip": 0.01148051, "auxiliary_loss_mlp": 0.01038469, "balance_loss_clip": 1.02070665, "balance_loss_mlp": 1.04535294, "epoch": 0.22798737411693973, "flos": 31428946903680.0, "grad_norm": 1.755656248068586, "language_loss": 0.82386625, "learning_rate": 3.508909528392852e-06, "loss": 0.8457315, "num_input_tokens_seen": 81615355, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9375, "step": 3792, "time_per_iteration": 2.6243531703948975 }, { "auxiliary_loss_clip": 0.01080983, "auxiliary_loss_mlp": 0.01004453, "balance_loss_clip": 1.00161552, "balance_loss_mlp": 1.04256892, "epoch": 0.2280474973696077, "flos": 52396685827200.0, "grad_norm": 1.1347363543464593, "language_loss": 0.65599352, "learning_rate": 3.5086615170119224e-06, "loss": 0.67684788, "num_input_tokens_seen": 81662075, "router_z_loss_clip": 0.02832031, "router_z_loss_mlp": 0.38476562, "step": 3793, "time_per_iteration": 2.923306941986084 }, { "auxiliary_loss_clip": 0.01144331, "auxiliary_loss_mlp": 0.01047565, "balance_loss_clip": 1.02918327, "balance_loss_mlp": 1.0478425, "epoch": 0.22810762062227566, "flos": 26396066816640.0, "grad_norm": 2.698486940695643, "language_loss": 0.76816052, "learning_rate": 3.508413451790744e-06, "loss": 0.79007947, "num_input_tokens_seen": 81681625, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.96484375, "step": 3794, "time_per_iteration": 2.638622283935547 }, { "auxiliary_loss_clip": 0.01169105, "auxiliary_loss_mlp": 0.01038859, "balance_loss_clip": 1.02189541, "balance_loss_mlp": 1.0477438, "epoch": 0.22816774387494362, "flos": 25630271832960.0, "grad_norm": 1.8525156071163575, "language_loss": 0.80327666, "learning_rate": 3.50816533273817e-06, "loss": 0.8253563, "num_input_tokens_seen": 81701170, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.94140625, "step": 3795, "time_per_iteration": 2.633451461791992 }, { "auxiliary_loss_clip": 0.01148577, "auxiliary_loss_mlp": 0.01044499, "balance_loss_clip": 1.02759528, "balance_loss_mlp": 1.04741085, "epoch": 0.22822786712761162, "flos": 22451602694400.0, "grad_norm": 1.7566867024133317, "language_loss": 0.76644093, "learning_rate": 3.507917159863054e-06, "loss": 0.78837168, "num_input_tokens_seen": 81721265, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.921875, "step": 3796, "time_per_iteration": 2.676643133163452 }, { "auxiliary_loss_clip": 0.01166329, "auxiliary_loss_mlp": 0.01287688, "balance_loss_clip": 1.0238328, "balance_loss_mlp": 1.04719114, "epoch": 0.22828799038027958, "flos": 12202554395520.0, "grad_norm": 2.3690057550497627, "language_loss": 0.95595527, "learning_rate": 3.507668933174254e-06, "loss": 0.98049539, "num_input_tokens_seen": 81736565, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.921875, "step": 3797, "time_per_iteration": 3.895054578781128 }, { "auxiliary_loss_clip": 0.01149179, "auxiliary_loss_mlp": 0.01285928, "balance_loss_clip": 1.02132857, "balance_loss_mlp": 1.04846323, "epoch": 0.22834811363294755, "flos": 22085708803200.0, "grad_norm": 1.551849799460367, "language_loss": 0.81427896, "learning_rate": 3.5074206526806274e-06, "loss": 0.83863002, "num_input_tokens_seen": 81756240, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.91796875, "step": 3798, "time_per_iteration": 2.6077277660369873 }, { "auxiliary_loss_clip": 0.01170594, "auxiliary_loss_mlp": 0.0103777, "balance_loss_clip": 1.02031803, "balance_loss_mlp": 1.04888797, "epoch": 0.2284082368856155, "flos": 24860634094080.0, "grad_norm": 2.248995070604509, "language_loss": 0.79015839, "learning_rate": 3.507172318391036e-06, "loss": 0.81224203, "num_input_tokens_seen": 81775720, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.94921875, "step": 3799, "time_per_iteration": 4.109394550323486 }, { "auxiliary_loss_clip": 0.01159847, "auxiliary_loss_mlp": 0.01047551, "balance_loss_clip": 1.03039694, "balance_loss_mlp": 1.04954958, "epoch": 0.22846836013828348, "flos": 23292882109440.0, "grad_norm": 1.572006066876646, "language_loss": 0.75046521, "learning_rate": 3.506923930314341e-06, "loss": 0.77253914, "num_input_tokens_seen": 81795830, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.921875, "step": 3800, "time_per_iteration": 4.186857223510742 }, { "auxiliary_loss_clip": 0.01160443, "auxiliary_loss_mlp": 0.01042362, "balance_loss_clip": 1.02598274, "balance_loss_mlp": 1.050722, "epoch": 0.22852848339095144, "flos": 27416288810880.0, "grad_norm": 1.6503815997750109, "language_loss": 0.63554454, "learning_rate": 3.5066754884594072e-06, "loss": 0.65757263, "num_input_tokens_seen": 81815745, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.91796875, "step": 3801, "time_per_iteration": 2.652876853942871 }, { "auxiliary_loss_clip": 0.01157736, "auxiliary_loss_mlp": 0.0103615, "balance_loss_clip": 1.02030754, "balance_loss_mlp": 1.05047238, "epoch": 0.2285886066436194, "flos": 26321157002880.0, "grad_norm": 1.650780956553812, "language_loss": 0.81572449, "learning_rate": 3.5064269928351005e-06, "loss": 0.83766329, "num_input_tokens_seen": 81835155, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.890625, "step": 3802, "time_per_iteration": 2.663389205932617 }, { "auxiliary_loss_clip": 0.01157626, "auxiliary_loss_mlp": 0.01050073, "balance_loss_clip": 1.03252506, "balance_loss_mlp": 1.04727185, "epoch": 0.2286487298962874, "flos": 29716475022720.0, "grad_norm": 1.7585784073313566, "language_loss": 0.78964704, "learning_rate": 3.5061784434502897e-06, "loss": 0.81172407, "num_input_tokens_seen": 81855655, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.921875, "step": 3803, "time_per_iteration": 4.083507776260376 }, { "auxiliary_loss_clip": 0.01166438, "auxiliary_loss_mlp": 0.01043185, "balance_loss_clip": 1.02605426, "balance_loss_mlp": 1.04579616, "epoch": 0.22870885314895537, "flos": 21287199507840.0, "grad_norm": 1.807973749250123, "language_loss": 0.85251391, "learning_rate": 3.505929840313845e-06, "loss": 0.87461007, "num_input_tokens_seen": 81876385, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9375, "step": 3804, "time_per_iteration": 2.5912811756134033 }, { "auxiliary_loss_clip": 0.0115178, "auxiliary_loss_mlp": 0.01040228, "balance_loss_clip": 1.02314496, "balance_loss_mlp": 1.04693758, "epoch": 0.22876897640162333, "flos": 14939450161920.0, "grad_norm": 2.3686007314279722, "language_loss": 0.76288599, "learning_rate": 3.5056811834346382e-06, "loss": 0.78480607, "num_input_tokens_seen": 81893225, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.95703125, "step": 3805, "time_per_iteration": 2.5951480865478516 }, { "auxiliary_loss_clip": 0.01161455, "auxiliary_loss_mlp": 0.01294715, "balance_loss_clip": 1.02939129, "balance_loss_mlp": 1.04790258, "epoch": 0.2288290996542913, "flos": 18113917409280.0, "grad_norm": 2.306765055635217, "language_loss": 0.78617883, "learning_rate": 3.5054324728215423e-06, "loss": 0.81074047, "num_input_tokens_seen": 81911350, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9609375, "step": 3806, "time_per_iteration": 2.522247552871704 }, { "auxiliary_loss_clip": 0.01160131, "auxiliary_loss_mlp": 0.01048964, "balance_loss_clip": 1.03170276, "balance_loss_mlp": 1.04843402, "epoch": 0.22888922290695926, "flos": 39855457071360.0, "grad_norm": 2.3770386800596306, "language_loss": 0.69701552, "learning_rate": 3.505183708483434e-06, "loss": 0.71910644, "num_input_tokens_seen": 81935420, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9375, "step": 3807, "time_per_iteration": 2.774681568145752 }, { "auxiliary_loss_clip": 0.0115663, "auxiliary_loss_mlp": 0.01050573, "balance_loss_clip": 1.03233385, "balance_loss_mlp": 1.05162501, "epoch": 0.22894934615962723, "flos": 23403774372480.0, "grad_norm": 5.369199587110222, "language_loss": 0.65321434, "learning_rate": 3.504934890429191e-06, "loss": 0.67528641, "num_input_tokens_seen": 81953845, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.9609375, "step": 3808, "time_per_iteration": 2.5319759845733643 }, { "auxiliary_loss_clip": 0.0117576, "auxiliary_loss_mlp": 0.01051805, "balance_loss_clip": 1.03442454, "balance_loss_mlp": 1.04727244, "epoch": 0.22900946941229522, "flos": 18843011671680.0, "grad_norm": 2.979254189332079, "language_loss": 0.74329746, "learning_rate": 3.5046860186676936e-06, "loss": 0.76557308, "num_input_tokens_seen": 81972100, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.92578125, "step": 3809, "time_per_iteration": 2.692474603652954 }, { "auxiliary_loss_clip": 0.01157744, "auxiliary_loss_mlp": 0.01044532, "balance_loss_clip": 1.02817595, "balance_loss_mlp": 1.04918134, "epoch": 0.22906959266496318, "flos": 22929394429440.0, "grad_norm": 1.559581334691944, "language_loss": 0.81514549, "learning_rate": 3.504437093207822e-06, "loss": 0.83716828, "num_input_tokens_seen": 81992760, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.90625, "step": 3810, "time_per_iteration": 2.524547815322876 }, { "auxiliary_loss_clip": 0.01146009, "auxiliary_loss_mlp": 0.01039328, "balance_loss_clip": 1.02383065, "balance_loss_mlp": 1.04702556, "epoch": 0.22912971591763115, "flos": 19354523299200.0, "grad_norm": 2.832162714264442, "language_loss": 0.78089207, "learning_rate": 3.5041881140584602e-06, "loss": 0.80274546, "num_input_tokens_seen": 82009080, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8984375, "step": 3811, "time_per_iteration": 2.586078643798828 }, { "auxiliary_loss_clip": 0.01157726, "auxiliary_loss_mlp": 0.01292473, "balance_loss_clip": 1.02739573, "balance_loss_mlp": 1.04691911, "epoch": 0.22918983917029911, "flos": 19933546538880.0, "grad_norm": 1.7058193960448398, "language_loss": 0.82722557, "learning_rate": 3.5039390812284937e-06, "loss": 0.8517276, "num_input_tokens_seen": 82026705, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.92578125, "step": 3812, "time_per_iteration": 2.587629556655884 }, { "auxiliary_loss_clip": 0.01181863, "auxiliary_loss_mlp": 0.01048491, "balance_loss_clip": 1.0315156, "balance_loss_mlp": 1.050179, "epoch": 0.22924996242296708, "flos": 16690885320960.0, "grad_norm": 3.3825459023592757, "language_loss": 0.83056438, "learning_rate": 3.5036899947268105e-06, "loss": 0.8528679, "num_input_tokens_seen": 82043245, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.953125, "step": 3813, "time_per_iteration": 2.579122304916382 }, { "auxiliary_loss_clip": 0.01148263, "auxiliary_loss_mlp": 0.01043877, "balance_loss_clip": 1.02685356, "balance_loss_mlp": 1.04656541, "epoch": 0.22931008567563504, "flos": 33036164956800.0, "grad_norm": 1.6965945855449953, "language_loss": 0.69813484, "learning_rate": 3.5034408545623e-06, "loss": 0.72005618, "num_input_tokens_seen": 82066870, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.92578125, "step": 3814, "time_per_iteration": 2.6074509620666504 }, { "auxiliary_loss_clip": 0.01144908, "auxiliary_loss_mlp": 0.01041411, "balance_loss_clip": 1.02414989, "balance_loss_mlp": 1.04415345, "epoch": 0.229370208928303, "flos": 23330696152320.0, "grad_norm": 2.646301078049816, "language_loss": 0.66927207, "learning_rate": 3.5031916607438516e-06, "loss": 0.69113529, "num_input_tokens_seen": 82083180, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9140625, "step": 3815, "time_per_iteration": 2.640288829803467 }, { "auxiliary_loss_clip": 0.01148903, "auxiliary_loss_mlp": 0.01040491, "balance_loss_clip": 1.02382565, "balance_loss_mlp": 1.04765284, "epoch": 0.229430332180971, "flos": 28617213150720.0, "grad_norm": 1.7668939025447132, "language_loss": 0.83138442, "learning_rate": 3.50294241328036e-06, "loss": 0.8532784, "num_input_tokens_seen": 82102950, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.921875, "step": 3816, "time_per_iteration": 2.703294277191162 }, { "auxiliary_loss_clip": 0.01146726, "auxiliary_loss_mlp": 0.01036454, "balance_loss_clip": 1.01921606, "balance_loss_mlp": 1.04675686, "epoch": 0.22949045543363897, "flos": 17238199829760.0, "grad_norm": 2.478158513877142, "language_loss": 0.86751086, "learning_rate": 3.5026931121807195e-06, "loss": 0.88934267, "num_input_tokens_seen": 82119510, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9140625, "step": 3817, "time_per_iteration": 2.5298871994018555 }, { "auxiliary_loss_clip": 0.0115782, "auxiliary_loss_mlp": 0.01044047, "balance_loss_clip": 1.02745342, "balance_loss_mlp": 1.04724658, "epoch": 0.22955057868630693, "flos": 27489438858240.0, "grad_norm": 1.7625722581247336, "language_loss": 0.74769402, "learning_rate": 3.5024437574538275e-06, "loss": 0.76971269, "num_input_tokens_seen": 82140095, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.92578125, "step": 3818, "time_per_iteration": 2.612781524658203 }, { "auxiliary_loss_clip": 0.0114827, "auxiliary_loss_mlp": 0.01036038, "balance_loss_clip": 1.01874089, "balance_loss_mlp": 1.04741335, "epoch": 0.2296107019389749, "flos": 23476421629440.0, "grad_norm": 1.8961736703962189, "language_loss": 0.74598777, "learning_rate": 3.5021943491085823e-06, "loss": 0.76783085, "num_input_tokens_seen": 82159510, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.921875, "step": 3819, "time_per_iteration": 2.5933568477630615 }, { "auxiliary_loss_clip": 0.01147122, "auxiliary_loss_mlp": 0.01042627, "balance_loss_clip": 1.02511549, "balance_loss_mlp": 1.04730725, "epoch": 0.22967082519164286, "flos": 31285160760960.0, "grad_norm": 2.085303170591151, "language_loss": 0.80808365, "learning_rate": 3.5019448871538853e-06, "loss": 0.82998115, "num_input_tokens_seen": 82179580, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.91015625, "step": 3820, "time_per_iteration": 2.720966100692749 }, { "auxiliary_loss_clip": 0.01159289, "auxiliary_loss_mlp": 0.01041429, "balance_loss_clip": 1.0234046, "balance_loss_mlp": 1.04692101, "epoch": 0.22973094844431083, "flos": 14642935390080.0, "grad_norm": 2.448946000956506, "language_loss": 0.68847096, "learning_rate": 3.501695371598638e-06, "loss": 0.71047807, "num_input_tokens_seen": 82195585, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.9453125, "step": 3821, "time_per_iteration": 2.605245351791382 }, { "auxiliary_loss_clip": 0.01172523, "auxiliary_loss_mlp": 0.01035802, "balance_loss_clip": 1.01889837, "balance_loss_mlp": 1.04613209, "epoch": 0.2297910716969788, "flos": 22823853292800.0, "grad_norm": 2.4278588000674586, "language_loss": 0.82941288, "learning_rate": 3.501445802451746e-06, "loss": 0.8514961, "num_input_tokens_seen": 82217530, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.91015625, "step": 3822, "time_per_iteration": 2.66141414642334 }, { "auxiliary_loss_clip": 0.01135518, "auxiliary_loss_mlp": 0.01041373, "balance_loss_clip": 1.02376616, "balance_loss_mlp": 1.04495311, "epoch": 0.2298511949496468, "flos": 23039029716480.0, "grad_norm": 1.7180066109035466, "language_loss": 0.66242898, "learning_rate": 3.5011961797221158e-06, "loss": 0.6841979, "num_input_tokens_seen": 82237980, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.90625, "step": 3823, "time_per_iteration": 2.5773868560791016 }, { "auxiliary_loss_clip": 0.01072973, "auxiliary_loss_mlp": 0.01012599, "balance_loss_clip": 1.00994039, "balance_loss_mlp": 1.03000975, "epoch": 0.22991131820231475, "flos": 66890914911360.0, "grad_norm": 0.8006487336236724, "language_loss": 0.56767273, "learning_rate": 3.5009465034186554e-06, "loss": 0.58852851, "num_input_tokens_seen": 82301785, "router_z_loss_clip": 0.02661133, "router_z_loss_mlp": 0.33984375, "step": 3824, "time_per_iteration": 3.267531394958496 }, { "auxiliary_loss_clip": 0.01152286, "auxiliary_loss_mlp": 0.01035099, "balance_loss_clip": 1.01902986, "balance_loss_mlp": 1.0453608, "epoch": 0.22997144145498272, "flos": 17887248633600.0, "grad_norm": 2.2362944149206125, "language_loss": 0.73172867, "learning_rate": 3.500696773550275e-06, "loss": 0.7536025, "num_input_tokens_seen": 82317355, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.890625, "step": 3825, "time_per_iteration": 2.5282154083251953 }, { "auxiliary_loss_clip": 0.01150284, "auxiliary_loss_mlp": 0.01045943, "balance_loss_clip": 1.02802563, "balance_loss_mlp": 1.04954457, "epoch": 0.23003156470765068, "flos": 24676843178880.0, "grad_norm": 2.7703888407256976, "language_loss": 0.87706327, "learning_rate": 3.5004469901258873e-06, "loss": 0.89902556, "num_input_tokens_seen": 82336645, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.921875, "step": 3826, "time_per_iteration": 2.661444902420044 }, { "auxiliary_loss_clip": 0.01159729, "auxiliary_loss_mlp": 0.01040698, "balance_loss_clip": 1.02254295, "balance_loss_mlp": 1.04699397, "epoch": 0.23009168796031865, "flos": 15814126247040.0, "grad_norm": 2.81816295471606, "language_loss": 0.81441408, "learning_rate": 3.5001971531544053e-06, "loss": 0.83641839, "num_input_tokens_seen": 82354225, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9453125, "step": 3827, "time_per_iteration": 2.531568765640259 }, { "auxiliary_loss_clip": 0.0118089, "auxiliary_loss_mlp": 0.01037467, "balance_loss_clip": 1.02173185, "balance_loss_mlp": 1.04659081, "epoch": 0.2301518112129866, "flos": 16212842190720.0, "grad_norm": 2.0959694696809064, "language_loss": 0.86225927, "learning_rate": 3.499947262644747e-06, "loss": 0.88444287, "num_input_tokens_seen": 82370240, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.890625, "step": 3828, "time_per_iteration": 2.639055013656616 }, { "auxiliary_loss_clip": 0.01153326, "auxiliary_loss_mlp": 0.01042674, "balance_loss_clip": 1.02551985, "balance_loss_mlp": 1.04521608, "epoch": 0.2302119344656546, "flos": 20595452411520.0, "grad_norm": 2.0474556886031308, "language_loss": 0.70663452, "learning_rate": 3.4996973186058284e-06, "loss": 0.72859454, "num_input_tokens_seen": 82389145, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8984375, "step": 3829, "time_per_iteration": 2.6020913124084473 }, { "auxiliary_loss_clip": 0.01181286, "auxiliary_loss_mlp": 0.01035951, "balance_loss_clip": 1.02017975, "balance_loss_mlp": 1.04856813, "epoch": 0.23027205771832257, "flos": 26796901662720.0, "grad_norm": 1.5842460794836322, "language_loss": 0.8428452, "learning_rate": 3.4994473210465706e-06, "loss": 0.86501765, "num_input_tokens_seen": 82409185, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.87890625, "step": 3830, "time_per_iteration": 2.689401626586914 }, { "auxiliary_loss_clip": 0.0114471, "auxiliary_loss_mlp": 0.01043309, "balance_loss_clip": 1.02659547, "balance_loss_mlp": 1.04577184, "epoch": 0.23033218097099054, "flos": 43873143068160.0, "grad_norm": 2.3760771525686537, "language_loss": 0.66919434, "learning_rate": 3.499197269975895e-06, "loss": 0.69107449, "num_input_tokens_seen": 82432070, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8984375, "step": 3831, "time_per_iteration": 2.708273410797119 }, { "auxiliary_loss_clip": 0.01168575, "auxiliary_loss_mlp": 0.01044002, "balance_loss_clip": 1.02775407, "balance_loss_mlp": 1.04854035, "epoch": 0.2303923042236585, "flos": 26067663745920.0, "grad_norm": 2.607479308459801, "language_loss": 0.75359857, "learning_rate": 3.4989471654027247e-06, "loss": 0.77572441, "num_input_tokens_seen": 82450625, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.9296875, "step": 3832, "time_per_iteration": 2.6951069831848145 }, { "auxiliary_loss_clip": 0.01172274, "auxiliary_loss_mlp": 0.01040593, "balance_loss_clip": 1.02390397, "balance_loss_mlp": 1.04686511, "epoch": 0.23045242747632647, "flos": 18296379521280.0, "grad_norm": 1.789563803501262, "language_loss": 0.87234038, "learning_rate": 3.4986970073359865e-06, "loss": 0.89446902, "num_input_tokens_seen": 82468575, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8984375, "step": 3833, "time_per_iteration": 2.559051275253296 }, { "auxiliary_loss_clip": 0.01163675, "auxiliary_loss_mlp": 0.01039317, "balance_loss_clip": 1.02232981, "balance_loss_mlp": 1.04741228, "epoch": 0.23051255072899443, "flos": 25520528805120.0, "grad_norm": 1.9580567923909233, "language_loss": 0.74697691, "learning_rate": 3.498446795784607e-06, "loss": 0.76900685, "num_input_tokens_seen": 82488655, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.89453125, "step": 3834, "time_per_iteration": 2.6725172996520996 }, { "auxiliary_loss_clip": 0.01163495, "auxiliary_loss_mlp": 0.01041575, "balance_loss_clip": 1.02451658, "balance_loss_mlp": 1.04860687, "epoch": 0.2305726739816624, "flos": 21215198695680.0, "grad_norm": 2.029661563476863, "language_loss": 0.85666555, "learning_rate": 3.4981965307575153e-06, "loss": 0.87871623, "num_input_tokens_seen": 82507220, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.875, "step": 3835, "time_per_iteration": 2.639716625213623 }, { "auxiliary_loss_clip": 0.01172043, "auxiliary_loss_mlp": 0.01040527, "balance_loss_clip": 1.02334929, "balance_loss_mlp": 1.04995179, "epoch": 0.2306327972343304, "flos": 23331127115520.0, "grad_norm": 2.502076109721961, "language_loss": 0.8214525, "learning_rate": 3.4979462122636436e-06, "loss": 0.84357816, "num_input_tokens_seen": 82527920, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.94921875, "step": 3836, "time_per_iteration": 2.616081953048706 }, { "auxiliary_loss_clip": 0.01150786, "auxiliary_loss_mlp": 0.01037451, "balance_loss_clip": 1.02133441, "balance_loss_mlp": 1.05160069, "epoch": 0.23069292048699835, "flos": 20666734951680.0, "grad_norm": 1.8412905730890547, "language_loss": 0.79633451, "learning_rate": 3.497695840311925e-06, "loss": 0.81821686, "num_input_tokens_seen": 82549040, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8984375, "step": 3837, "time_per_iteration": 2.603022813796997 }, { "auxiliary_loss_clip": 0.01098902, "auxiliary_loss_mlp": 0.01014495, "balance_loss_clip": 1.01206291, "balance_loss_mlp": 1.03755856, "epoch": 0.23075304373966632, "flos": 70454832393600.0, "grad_norm": 0.8895801563772937, "language_loss": 0.6540044, "learning_rate": 3.4974454149112943e-06, "loss": 0.67513841, "num_input_tokens_seen": 82604070, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.34570312, "step": 3838, "time_per_iteration": 4.518194675445557 }, { "auxiliary_loss_clip": 0.01147735, "auxiliary_loss_mlp": 0.01041952, "balance_loss_clip": 1.02641904, "balance_loss_mlp": 1.04985952, "epoch": 0.23081316699233428, "flos": 16617986668800.0, "grad_norm": 2.015455624629476, "language_loss": 0.76125431, "learning_rate": 3.4971949360706887e-06, "loss": 0.78315115, "num_input_tokens_seen": 82619665, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.890625, "step": 3839, "time_per_iteration": 2.590872287750244 }, { "auxiliary_loss_clip": 0.0116062, "auxiliary_loss_mlp": 0.01037939, "balance_loss_clip": 1.02138114, "balance_loss_mlp": 1.05130792, "epoch": 0.23087329024500225, "flos": 13298081253120.0, "grad_norm": 1.874519625432217, "language_loss": 0.68611509, "learning_rate": 3.4969444037990466e-06, "loss": 0.70810068, "num_input_tokens_seen": 82637530, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.91015625, "step": 3840, "time_per_iteration": 2.5230979919433594 }, { "auxiliary_loss_clip": 0.01142158, "auxiliary_loss_mlp": 0.0103883, "balance_loss_clip": 1.02152121, "balance_loss_mlp": 1.05202258, "epoch": 0.23093341349767021, "flos": 17785729820160.0, "grad_norm": 3.322559332079898, "language_loss": 0.79385185, "learning_rate": 3.49669381810531e-06, "loss": 0.81566179, "num_input_tokens_seen": 82656130, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.90234375, "step": 3841, "time_per_iteration": 5.549793720245361 }, { "auxiliary_loss_clip": 0.01148684, "auxiliary_loss_mlp": 0.0103836, "balance_loss_clip": 1.02381682, "balance_loss_mlp": 1.05060613, "epoch": 0.23099353675033818, "flos": 23988076911360.0, "grad_norm": 2.2581782689707164, "language_loss": 0.82940084, "learning_rate": 3.4964431789984204e-06, "loss": 0.85127127, "num_input_tokens_seen": 82675295, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.890625, "step": 3842, "time_per_iteration": 2.573639392852783 }, { "auxiliary_loss_clip": 0.01147498, "auxiliary_loss_mlp": 0.01043994, "balance_loss_clip": 1.02674389, "balance_loss_mlp": 1.04908895, "epoch": 0.23105366000300617, "flos": 35995168471680.0, "grad_norm": 1.835545413447985, "language_loss": 0.66417074, "learning_rate": 3.496192486487323e-06, "loss": 0.6860857, "num_input_tokens_seen": 82703260, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.89453125, "step": 3843, "time_per_iteration": 2.7878246307373047 }, { "auxiliary_loss_clip": 0.01145671, "auxiliary_loss_mlp": 0.01035627, "balance_loss_clip": 1.01979649, "balance_loss_mlp": 1.04910529, "epoch": 0.23111378325567414, "flos": 31245335556480.0, "grad_norm": 1.7709883670530253, "language_loss": 0.76945275, "learning_rate": 3.495941740580965e-06, "loss": 0.79126573, "num_input_tokens_seen": 82725060, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.875, "step": 3844, "time_per_iteration": 4.0347900390625 }, { "auxiliary_loss_clip": 0.01150814, "auxiliary_loss_mlp": 0.01040497, "balance_loss_clip": 1.02366471, "balance_loss_mlp": 1.05103064, "epoch": 0.2311739065083421, "flos": 19208223204480.0, "grad_norm": 1.5878949702676013, "language_loss": 0.77997178, "learning_rate": 3.495690941288294e-06, "loss": 0.80188489, "num_input_tokens_seen": 82742960, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.90625, "step": 3845, "time_per_iteration": 2.512345790863037 }, { "auxiliary_loss_clip": 0.01159171, "auxiliary_loss_mlp": 0.01029392, "balance_loss_clip": 1.01489592, "balance_loss_mlp": 1.04781342, "epoch": 0.23123402976101007, "flos": 23360178240000.0, "grad_norm": 2.406978795426782, "language_loss": 0.75847501, "learning_rate": 3.495440088618261e-06, "loss": 0.78036058, "num_input_tokens_seen": 82760205, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.84375, "step": 3846, "time_per_iteration": 2.7949280738830566 }, { "auxiliary_loss_clip": 0.01156854, "auxiliary_loss_mlp": 0.01047342, "balance_loss_clip": 1.0314157, "balance_loss_mlp": 1.04865265, "epoch": 0.23129415301367803, "flos": 13735365425280.0, "grad_norm": 1.940266089519195, "language_loss": 0.69427812, "learning_rate": 3.4951891825798177e-06, "loss": 0.71632004, "num_input_tokens_seen": 82778590, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.90625, "step": 3847, "time_per_iteration": 2.5387513637542725 }, { "auxiliary_loss_clip": 0.01071073, "auxiliary_loss_mlp": 0.01003695, "balance_loss_clip": 1.00132298, "balance_loss_mlp": 1.02923238, "epoch": 0.231354276266346, "flos": 69737015001600.0, "grad_norm": 0.7833521806842378, "language_loss": 0.60984886, "learning_rate": 3.4949382231819186e-06, "loss": 0.63059652, "num_input_tokens_seen": 82833925, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.328125, "step": 3848, "time_per_iteration": 3.1001622676849365 }, { "auxiliary_loss_clip": 0.01144518, "auxiliary_loss_mlp": 0.01042979, "balance_loss_clip": 1.02675509, "balance_loss_mlp": 1.04709172, "epoch": 0.231414399519014, "flos": 18835900778880.0, "grad_norm": 2.0161117970913986, "language_loss": 0.77956015, "learning_rate": 3.4946872104335192e-06, "loss": 0.80143511, "num_input_tokens_seen": 82850625, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.88671875, "step": 3849, "time_per_iteration": 2.517625570297241 }, { "auxiliary_loss_clip": 0.01146614, "auxiliary_loss_mlp": 0.01037666, "balance_loss_clip": 1.02152479, "balance_loss_mlp": 1.04703641, "epoch": 0.23147452277168196, "flos": 36135470995200.0, "grad_norm": 1.8900056999318984, "language_loss": 0.70963407, "learning_rate": 3.4944361443435788e-06, "loss": 0.73147678, "num_input_tokens_seen": 82872105, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.90625, "step": 3850, "time_per_iteration": 2.7454452514648438 }, { "auxiliary_loss_clip": 0.01153117, "auxiliary_loss_mlp": 0.01281887, "balance_loss_clip": 1.018821, "balance_loss_mlp": 1.04572272, "epoch": 0.23153464602434992, "flos": 20812927305600.0, "grad_norm": 2.0570066159485596, "language_loss": 0.76233125, "learning_rate": 3.4941850249210562e-06, "loss": 0.78668129, "num_input_tokens_seen": 82890595, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.89453125, "step": 3851, "time_per_iteration": 2.526395082473755 }, { "auxiliary_loss_clip": 0.01136308, "auxiliary_loss_mlp": 0.01037532, "balance_loss_clip": 1.02137935, "balance_loss_mlp": 1.04849505, "epoch": 0.2315947692770179, "flos": 19939256801280.0, "grad_norm": 1.9351285420039068, "language_loss": 0.69740576, "learning_rate": 3.4939338521749137e-06, "loss": 0.71914417, "num_input_tokens_seen": 82908910, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87890625, "step": 3852, "time_per_iteration": 2.5724265575408936 }, { "auxiliary_loss_clip": 0.01167197, "auxiliary_loss_mlp": 0.01038615, "balance_loss_clip": 1.02264118, "balance_loss_mlp": 1.04908872, "epoch": 0.23165489252968585, "flos": 12855553695360.0, "grad_norm": 6.973617157993782, "language_loss": 0.67134881, "learning_rate": 3.493682626114115e-06, "loss": 0.69340694, "num_input_tokens_seen": 82925405, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.90625, "step": 3853, "time_per_iteration": 2.4984772205352783 }, { "auxiliary_loss_clip": 0.01147803, "auxiliary_loss_mlp": 0.01034266, "balance_loss_clip": 1.01725495, "balance_loss_mlp": 1.04780483, "epoch": 0.23171501578235382, "flos": 30628282792320.0, "grad_norm": 2.177862946088824, "language_loss": 0.79554045, "learning_rate": 3.4934313467476255e-06, "loss": 0.81736112, "num_input_tokens_seen": 82945615, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.91015625, "step": 3854, "time_per_iteration": 2.7083675861358643 }, { "auxiliary_loss_clip": 0.0116119, "auxiliary_loss_mlp": 0.01038117, "balance_loss_clip": 1.02061677, "balance_loss_mlp": 1.04725134, "epoch": 0.23177513903502178, "flos": 23842782397440.0, "grad_norm": 2.499638096573916, "language_loss": 0.65501654, "learning_rate": 3.4931800140844123e-06, "loss": 0.67700952, "num_input_tokens_seen": 82967570, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.953125, "step": 3855, "time_per_iteration": 2.5549795627593994 }, { "auxiliary_loss_clip": 0.01138688, "auxiliary_loss_mlp": 0.01045615, "balance_loss_clip": 1.02940214, "balance_loss_mlp": 1.0472641, "epoch": 0.23183526228768978, "flos": 29570282668800.0, "grad_norm": 1.8682125268956178, "language_loss": 0.70418799, "learning_rate": 3.4929286281334455e-06, "loss": 0.72603095, "num_input_tokens_seen": 82987435, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.9140625, "step": 3856, "time_per_iteration": 2.620685577392578 }, { "auxiliary_loss_clip": 0.01173198, "auxiliary_loss_mlp": 0.01038493, "balance_loss_clip": 1.02377081, "balance_loss_mlp": 1.04832006, "epoch": 0.23189538554035774, "flos": 34458694254720.0, "grad_norm": 1.965975461945441, "language_loss": 0.76992762, "learning_rate": 3.4926771889036964e-06, "loss": 0.79204452, "num_input_tokens_seen": 83010505, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.890625, "step": 3857, "time_per_iteration": 2.6419425010681152 }, { "auxiliary_loss_clip": 0.01161156, "auxiliary_loss_mlp": 0.01298364, "balance_loss_clip": 1.0329268, "balance_loss_mlp": 1.0492394, "epoch": 0.2319555087930257, "flos": 18003815245440.0, "grad_norm": 2.5951855256582252, "language_loss": 0.9126538, "learning_rate": 3.4924256964041387e-06, "loss": 0.93724906, "num_input_tokens_seen": 83026705, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9375, "step": 3858, "time_per_iteration": 2.5213611125946045 }, { "auxiliary_loss_clip": 0.01170017, "auxiliary_loss_mlp": 0.01035909, "balance_loss_clip": 1.01963651, "balance_loss_mlp": 1.0479331, "epoch": 0.23201563204569367, "flos": 23143852581120.0, "grad_norm": 2.259000252547913, "language_loss": 0.76149046, "learning_rate": 3.492174150643746e-06, "loss": 0.78354967, "num_input_tokens_seen": 83046500, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 3859, "time_per_iteration": 2.542897939682007 }, { "auxiliary_loss_clip": 0.01143813, "auxiliary_loss_mlp": 0.01033696, "balance_loss_clip": 1.01720905, "balance_loss_mlp": 1.0456084, "epoch": 0.23207575529836164, "flos": 20667991927680.0, "grad_norm": 1.697784309867959, "language_loss": 0.84136981, "learning_rate": 3.4919225516314967e-06, "loss": 0.86314487, "num_input_tokens_seen": 83065280, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.890625, "step": 3860, "time_per_iteration": 2.5672101974487305 }, { "auxiliary_loss_clip": 0.01171723, "auxiliary_loss_mlp": 0.01037646, "balance_loss_clip": 1.02102828, "balance_loss_mlp": 1.04739523, "epoch": 0.2321358785510296, "flos": 16472189364480.0, "grad_norm": 2.830482799187947, "language_loss": 0.82758629, "learning_rate": 3.491670899376369e-06, "loss": 0.84967995, "num_input_tokens_seen": 83082310, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8828125, "step": 3861, "time_per_iteration": 2.5252084732055664 }, { "auxiliary_loss_clip": 0.01148317, "auxiliary_loss_mlp": 0.01288383, "balance_loss_clip": 1.02471244, "balance_loss_mlp": 1.04917955, "epoch": 0.2321960018036976, "flos": 21616320850560.0, "grad_norm": 1.708874611520855, "language_loss": 0.85523403, "learning_rate": 3.491419193887344e-06, "loss": 0.879601, "num_input_tokens_seen": 83102065, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8984375, "step": 3862, "time_per_iteration": 2.583000421524048 }, { "auxiliary_loss_clip": 0.01162335, "auxiliary_loss_mlp": 0.01035743, "balance_loss_clip": 1.02077007, "balance_loss_mlp": 1.047701, "epoch": 0.23225612505636556, "flos": 22271474966400.0, "grad_norm": 1.436267189578782, "language_loss": 0.74572325, "learning_rate": 3.4911674351734036e-06, "loss": 0.76770401, "num_input_tokens_seen": 83121445, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.875, "step": 3863, "time_per_iteration": 2.6360785961151123 }, { "auxiliary_loss_clip": 0.01163733, "auxiliary_loss_mlp": 0.01044553, "balance_loss_clip": 1.02946091, "balance_loss_mlp": 1.05055165, "epoch": 0.23231624830903352, "flos": 17052325925760.0, "grad_norm": 1.8166783009921157, "language_loss": 0.74506491, "learning_rate": 3.490915623243534e-06, "loss": 0.76714778, "num_input_tokens_seen": 83138175, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.859375, "step": 3864, "time_per_iteration": 2.57869815826416 }, { "auxiliary_loss_clip": 0.01135266, "auxiliary_loss_mlp": 0.01038576, "balance_loss_clip": 1.02237582, "balance_loss_mlp": 1.04659939, "epoch": 0.2323763715617015, "flos": 34640043045120.0, "grad_norm": 1.6628435791864966, "language_loss": 0.70733947, "learning_rate": 3.490663758106721e-06, "loss": 0.72907794, "num_input_tokens_seen": 83161975, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.88671875, "step": 3865, "time_per_iteration": 2.7119665145874023 }, { "auxiliary_loss_clip": 0.01146974, "auxiliary_loss_mlp": 0.01047689, "balance_loss_clip": 1.0297724, "balance_loss_mlp": 1.05069375, "epoch": 0.23243649481436945, "flos": 25551698832000.0, "grad_norm": 1.7890852199602263, "language_loss": 0.94562793, "learning_rate": 3.4904118397719527e-06, "loss": 0.9675746, "num_input_tokens_seen": 83180905, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.96484375, "step": 3866, "time_per_iteration": 2.6113812923431396 }, { "auxiliary_loss_clip": 0.01147666, "auxiliary_loss_mlp": 0.01039957, "balance_loss_clip": 1.02383995, "balance_loss_mlp": 1.05076003, "epoch": 0.23249661806703742, "flos": 20483482740480.0, "grad_norm": 3.083958979137906, "language_loss": 0.72739601, "learning_rate": 3.4901598682482198e-06, "loss": 0.74927223, "num_input_tokens_seen": 83196390, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87890625, "step": 3867, "time_per_iteration": 2.53877592086792 }, { "auxiliary_loss_clip": 0.01155924, "auxiliary_loss_mlp": 0.01037535, "balance_loss_clip": 1.02065492, "balance_loss_mlp": 1.0485127, "epoch": 0.23255674131970538, "flos": 20376612800640.0, "grad_norm": 2.1480880018437127, "language_loss": 0.81834936, "learning_rate": 3.489907843544514e-06, "loss": 0.84028393, "num_input_tokens_seen": 83216165, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.89453125, "step": 3868, "time_per_iteration": 2.535025119781494 }, { "auxiliary_loss_clip": 0.01144649, "auxiliary_loss_mlp": 0.01038965, "balance_loss_clip": 1.02319384, "balance_loss_mlp": 1.04906714, "epoch": 0.23261686457237338, "flos": 17056096853760.0, "grad_norm": 1.8898490195842785, "language_loss": 0.72786069, "learning_rate": 3.48965576566983e-06, "loss": 0.74969673, "num_input_tokens_seen": 83233845, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.86328125, "step": 3869, "time_per_iteration": 2.5746841430664062 }, { "auxiliary_loss_clip": 0.01148496, "auxiliary_loss_mlp": 0.01043851, "balance_loss_clip": 1.02760291, "balance_loss_mlp": 1.05240083, "epoch": 0.23267698782504134, "flos": 29169878785920.0, "grad_norm": 1.7640396751188627, "language_loss": 0.79300207, "learning_rate": 3.4894036346331633e-06, "loss": 0.81492555, "num_input_tokens_seen": 83254930, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87109375, "step": 3870, "time_per_iteration": 2.74403715133667 }, { "auxiliary_loss_clip": 0.01142915, "auxiliary_loss_mlp": 0.01038815, "balance_loss_clip": 1.0218513, "balance_loss_mlp": 1.05095053, "epoch": 0.2327371110777093, "flos": 21174655219200.0, "grad_norm": 2.4490909710406816, "language_loss": 0.70266461, "learning_rate": 3.4891514504435122e-06, "loss": 0.72448194, "num_input_tokens_seen": 83272095, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.91796875, "step": 3871, "time_per_iteration": 2.610291004180908 }, { "auxiliary_loss_clip": 0.01154511, "auxiliary_loss_mlp": 0.01051936, "balance_loss_clip": 1.0336374, "balance_loss_mlp": 1.05303061, "epoch": 0.23279723433037727, "flos": 24863112132480.0, "grad_norm": 2.049904845268225, "language_loss": 0.68650925, "learning_rate": 3.488899213109877e-06, "loss": 0.7085737, "num_input_tokens_seen": 83290980, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.92578125, "step": 3872, "time_per_iteration": 2.6206605434417725 }, { "auxiliary_loss_clip": 0.01154298, "auxiliary_loss_mlp": 0.01040555, "balance_loss_clip": 1.0228647, "balance_loss_mlp": 1.05118716, "epoch": 0.23285735758304524, "flos": 38800617344640.0, "grad_norm": 1.7521203754776316, "language_loss": 0.77758718, "learning_rate": 3.4886469226412574e-06, "loss": 0.79953575, "num_input_tokens_seen": 83315175, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9375, "step": 3873, "time_per_iteration": 2.7391369342803955 }, { "auxiliary_loss_clip": 0.01083412, "auxiliary_loss_mlp": 0.01001476, "balance_loss_clip": 0.99927026, "balance_loss_mlp": 1.0343504, "epoch": 0.2329174808357132, "flos": 53944113692160.0, "grad_norm": 0.8683647324644127, "language_loss": 0.60431033, "learning_rate": 3.48839457904666e-06, "loss": 0.6251592, "num_input_tokens_seen": 83372060, "router_z_loss_clip": 0.02209473, "router_z_loss_mlp": 0.3125, "step": 3874, "time_per_iteration": 3.0759851932525635 }, { "auxiliary_loss_clip": 0.01158305, "auxiliary_loss_mlp": 0.01046768, "balance_loss_clip": 1.03003061, "balance_loss_mlp": 1.05114985, "epoch": 0.23297760408838117, "flos": 21216024708480.0, "grad_norm": 3.194659875122417, "language_loss": 0.80550843, "learning_rate": 3.488142182335088e-06, "loss": 0.82755923, "num_input_tokens_seen": 83389795, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.890625, "step": 3875, "time_per_iteration": 2.6159045696258545 }, { "auxiliary_loss_clip": 0.01139818, "auxiliary_loss_mlp": 0.01284311, "balance_loss_clip": 1.02183521, "balance_loss_mlp": 1.05174994, "epoch": 0.23303772734104916, "flos": 28403006394240.0, "grad_norm": 1.7560132269708257, "language_loss": 0.60995811, "learning_rate": 3.4878897325155493e-06, "loss": 0.63419938, "num_input_tokens_seen": 83410005, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.87890625, "step": 3876, "time_per_iteration": 2.5891449451446533 }, { "auxiliary_loss_clip": 0.01161044, "auxiliary_loss_mlp": 0.0104469, "balance_loss_clip": 1.02792919, "balance_loss_mlp": 1.05270481, "epoch": 0.23309785059371713, "flos": 24314720215680.0, "grad_norm": 2.7821722589417788, "language_loss": 0.70188594, "learning_rate": 3.4876372295970533e-06, "loss": 0.72394335, "num_input_tokens_seen": 83430250, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8984375, "step": 3877, "time_per_iteration": 2.6235134601593018 }, { "auxiliary_loss_clip": 0.01151506, "auxiliary_loss_mlp": 0.01045767, "balance_loss_clip": 1.02846932, "balance_loss_mlp": 1.05252898, "epoch": 0.2331579738463851, "flos": 15992925171840.0, "grad_norm": 2.3327703547435585, "language_loss": 0.80705559, "learning_rate": 3.4873846735886113e-06, "loss": 0.82902825, "num_input_tokens_seen": 83447950, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.90234375, "step": 3878, "time_per_iteration": 2.5230958461761475 }, { "auxiliary_loss_clip": 0.01189048, "auxiliary_loss_mlp": 0.01047381, "balance_loss_clip": 1.02948785, "balance_loss_mlp": 1.05037463, "epoch": 0.23321809709905306, "flos": 36426957863040.0, "grad_norm": 2.0780591855037986, "language_loss": 0.74971378, "learning_rate": 3.487132064499237e-06, "loss": 0.77207804, "num_input_tokens_seen": 83467785, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9375, "step": 3879, "time_per_iteration": 4.111132383346558 }, { "auxiliary_loss_clip": 0.01174354, "auxiliary_loss_mlp": 0.01040823, "balance_loss_clip": 1.02430069, "balance_loss_mlp": 1.04826128, "epoch": 0.23327822035172102, "flos": 21324762155520.0, "grad_norm": 2.6139186422175755, "language_loss": 0.89464152, "learning_rate": 3.4868794023379433e-06, "loss": 0.91679323, "num_input_tokens_seen": 83485390, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8984375, "step": 3880, "time_per_iteration": 2.698418378829956 }, { "auxiliary_loss_clip": 0.01161316, "auxiliary_loss_mlp": 0.01047937, "balance_loss_clip": 1.03147388, "balance_loss_mlp": 1.05212963, "epoch": 0.233338343604389, "flos": 19171881619200.0, "grad_norm": 1.7804200539716222, "language_loss": 0.7186197, "learning_rate": 3.4866266871137495e-06, "loss": 0.74071223, "num_input_tokens_seen": 83504890, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9140625, "step": 3881, "time_per_iteration": 2.613678455352783 }, { "auxiliary_loss_clip": 0.01140126, "auxiliary_loss_mlp": 0.01047675, "balance_loss_clip": 1.03123617, "balance_loss_mlp": 1.05101132, "epoch": 0.23339846685705698, "flos": 26908368543360.0, "grad_norm": 2.168675117427847, "language_loss": 0.68259025, "learning_rate": 3.486373918835673e-06, "loss": 0.70446825, "num_input_tokens_seen": 83526475, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.890625, "step": 3882, "time_per_iteration": 2.726325273513794 }, { "auxiliary_loss_clip": 0.01170401, "auxiliary_loss_mlp": 0.01050856, "balance_loss_clip": 1.03435731, "balance_loss_mlp": 1.05129838, "epoch": 0.23345859010972494, "flos": 32343160884480.0, "grad_norm": 1.9861843415060105, "language_loss": 0.76636338, "learning_rate": 3.486121097512735e-06, "loss": 0.78857601, "num_input_tokens_seen": 83546620, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.921875, "step": 3883, "time_per_iteration": 5.5396857261657715 }, { "auxiliary_loss_clip": 0.01085908, "auxiliary_loss_mlp": 0.01015711, "balance_loss_clip": 1.01368475, "balance_loss_mlp": 1.03628325, "epoch": 0.2335187133623929, "flos": 58484229050880.0, "grad_norm": 0.774997227430073, "language_loss": 0.59119773, "learning_rate": 3.4858682231539575e-06, "loss": 0.61221397, "num_input_tokens_seen": 83616160, "router_z_loss_clip": 0.02026367, "router_z_loss_mlp": 0.31640625, "step": 3884, "time_per_iteration": 3.3331923484802246 }, { "auxiliary_loss_clip": 0.01164499, "auxiliary_loss_mlp": 0.0103839, "balance_loss_clip": 1.02292895, "balance_loss_mlp": 1.04822898, "epoch": 0.23357883661506088, "flos": 24502317972480.0, "grad_norm": 1.7539410953606498, "language_loss": 0.80142653, "learning_rate": 3.4856152957683654e-06, "loss": 0.82345539, "num_input_tokens_seen": 83636795, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.890625, "step": 3885, "time_per_iteration": 4.019797325134277 }, { "auxiliary_loss_clip": 0.01147935, "auxiliary_loss_mlp": 0.01038365, "balance_loss_clip": 1.0217706, "balance_loss_mlp": 1.04943371, "epoch": 0.23363895986772884, "flos": 18948516894720.0, "grad_norm": 2.077960421127635, "language_loss": 0.88188386, "learning_rate": 3.4853623153649843e-06, "loss": 0.90374684, "num_input_tokens_seen": 83654050, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.890625, "step": 3886, "time_per_iteration": 2.5745127201080322 }, { "auxiliary_loss_clip": 0.01140631, "auxiliary_loss_mlp": 0.01037432, "balance_loss_clip": 1.02149975, "balance_loss_mlp": 1.04988897, "epoch": 0.2336990831203968, "flos": 31686821619840.0, "grad_norm": 2.144590645711635, "language_loss": 0.73235339, "learning_rate": 3.4851092819528434e-06, "loss": 0.75413406, "num_input_tokens_seen": 83673720, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.90625, "step": 3887, "time_per_iteration": 2.589780569076538 }, { "auxiliary_loss_clip": 0.01147064, "auxiliary_loss_mlp": 0.01041556, "balance_loss_clip": 1.02540958, "balance_loss_mlp": 1.04785204, "epoch": 0.23375920637306477, "flos": 27709750926720.0, "grad_norm": 2.114716799913819, "language_loss": 0.83923197, "learning_rate": 3.4848561955409723e-06, "loss": 0.8611182, "num_input_tokens_seen": 83693470, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.90234375, "step": 3888, "time_per_iteration": 2.612137794494629 }, { "auxiliary_loss_clip": 0.01155561, "auxiliary_loss_mlp": 0.01049716, "balance_loss_clip": 1.03382516, "balance_loss_mlp": 1.04867792, "epoch": 0.23381932962573276, "flos": 17675627656320.0, "grad_norm": 2.0825357405938694, "language_loss": 0.87143016, "learning_rate": 3.4846030561384036e-06, "loss": 0.89348292, "num_input_tokens_seen": 83711620, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.890625, "step": 3889, "time_per_iteration": 2.5157883167266846 }, { "auxiliary_loss_clip": 0.01148006, "auxiliary_loss_mlp": 0.01037003, "balance_loss_clip": 1.02126765, "balance_loss_mlp": 1.04751861, "epoch": 0.23387945287840073, "flos": 14390842763520.0, "grad_norm": 4.013788761939725, "language_loss": 0.77344918, "learning_rate": 3.48434986375417e-06, "loss": 0.79529929, "num_input_tokens_seen": 83727890, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.9140625, "step": 3890, "time_per_iteration": 2.5047202110290527 }, { "auxiliary_loss_clip": 0.01176381, "auxiliary_loss_mlp": 0.01287818, "balance_loss_clip": 1.02393603, "balance_loss_mlp": 1.05091214, "epoch": 0.2339395761310687, "flos": 46097988503040.0, "grad_norm": 2.072782067115361, "language_loss": 0.72465205, "learning_rate": 3.4840966183973085e-06, "loss": 0.74929398, "num_input_tokens_seen": 83749370, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.89453125, "step": 3891, "time_per_iteration": 2.7914938926696777 }, { "auxiliary_loss_clip": 0.01133757, "auxiliary_loss_mlp": 0.01035961, "balance_loss_clip": 1.01987338, "balance_loss_mlp": 1.04737806, "epoch": 0.23399969938373666, "flos": 22382044007040.0, "grad_norm": 1.563739596291093, "language_loss": 0.82376647, "learning_rate": 3.483843320076856e-06, "loss": 0.84546357, "num_input_tokens_seen": 83769560, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.86328125, "step": 3892, "time_per_iteration": 2.562310218811035 }, { "auxiliary_loss_clip": 0.0116638, "auxiliary_loss_mlp": 0.0103598, "balance_loss_clip": 1.01907635, "balance_loss_mlp": 1.04755437, "epoch": 0.23405982263640462, "flos": 43508542066560.0, "grad_norm": 1.514176085350528, "language_loss": 0.64391971, "learning_rate": 3.4835899688018522e-06, "loss": 0.66594332, "num_input_tokens_seen": 83795635, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9140625, "step": 3893, "time_per_iteration": 2.7865827083587646 }, { "auxiliary_loss_clip": 0.01162631, "auxiliary_loss_mlp": 0.01033723, "balance_loss_clip": 1.01664066, "balance_loss_mlp": 1.04912782, "epoch": 0.2341199458890726, "flos": 22564685687040.0, "grad_norm": 2.768386537001829, "language_loss": 0.78597605, "learning_rate": 3.4833365645813384e-06, "loss": 0.80793953, "num_input_tokens_seen": 83814090, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8671875, "step": 3894, "time_per_iteration": 2.555387020111084 }, { "auxiliary_loss_clip": 0.01143833, "auxiliary_loss_mlp": 0.01035474, "balance_loss_clip": 1.01976204, "balance_loss_mlp": 1.04635227, "epoch": 0.23418006914174055, "flos": 25633970933760.0, "grad_norm": 1.58326659990499, "language_loss": 0.82031119, "learning_rate": 3.483083107424359e-06, "loss": 0.84210426, "num_input_tokens_seen": 83836870, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8828125, "step": 3895, "time_per_iteration": 2.672084093093872 }, { "auxiliary_loss_clip": 0.01147304, "auxiliary_loss_mlp": 0.01036068, "balance_loss_clip": 1.02008247, "balance_loss_mlp": 1.0492897, "epoch": 0.23424019239440855, "flos": 13545936074880.0, "grad_norm": 2.1232548936710836, "language_loss": 0.80359602, "learning_rate": 3.4828295973399576e-06, "loss": 0.8254298, "num_input_tokens_seen": 83853275, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.890625, "step": 3896, "time_per_iteration": 2.505178213119507 }, { "auxiliary_loss_clip": 0.01158343, "auxiliary_loss_mlp": 0.01035814, "balance_loss_clip": 1.01865971, "balance_loss_mlp": 1.04821181, "epoch": 0.2343003156470765, "flos": 22419498913920.0, "grad_norm": 1.696503619113342, "language_loss": 0.8340497, "learning_rate": 3.4825760343371826e-06, "loss": 0.85599124, "num_input_tokens_seen": 83872340, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.921875, "step": 3897, "time_per_iteration": 2.589354991912842 }, { "auxiliary_loss_clip": 0.01149611, "auxiliary_loss_mlp": 0.01045742, "balance_loss_clip": 1.02892113, "balance_loss_mlp": 1.04778123, "epoch": 0.23436043889974448, "flos": 14790815683200.0, "grad_norm": 1.975957070606426, "language_loss": 0.78812969, "learning_rate": 3.482322418425083e-06, "loss": 0.81008315, "num_input_tokens_seen": 83888795, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.92578125, "step": 3898, "time_per_iteration": 2.527338981628418 }, { "auxiliary_loss_clip": 0.01162142, "auxiliary_loss_mlp": 0.01040014, "balance_loss_clip": 1.02409959, "balance_loss_mlp": 1.04988825, "epoch": 0.23442056215241244, "flos": 22965700101120.0, "grad_norm": 2.06289891154305, "language_loss": 0.73658478, "learning_rate": 3.4820687496127086e-06, "loss": 0.75860631, "num_input_tokens_seen": 83906820, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8515625, "step": 3899, "time_per_iteration": 2.6433353424072266 }, { "auxiliary_loss_clip": 0.01165332, "auxiliary_loss_mlp": 0.0104292, "balance_loss_clip": 1.02612376, "balance_loss_mlp": 1.04809761, "epoch": 0.2344806854050804, "flos": 23071887682560.0, "grad_norm": 2.4496297931056072, "language_loss": 0.74718738, "learning_rate": 3.481815027909113e-06, "loss": 0.76926982, "num_input_tokens_seen": 83926370, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8984375, "step": 3900, "time_per_iteration": 2.544888496398926 }, { "auxiliary_loss_clip": 0.01171439, "auxiliary_loss_mlp": 0.01044673, "balance_loss_clip": 1.02843714, "balance_loss_mlp": 1.04674363, "epoch": 0.23454080865774837, "flos": 16327074418560.0, "grad_norm": 2.808175172309117, "language_loss": 0.66898632, "learning_rate": 3.481561253323351e-06, "loss": 0.69114745, "num_input_tokens_seen": 83944600, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8828125, "step": 3901, "time_per_iteration": 2.561706304550171 }, { "auxiliary_loss_clip": 0.01096078, "auxiliary_loss_mlp": 0.01003012, "balance_loss_clip": 1.00073516, "balance_loss_mlp": 1.02872062, "epoch": 0.23460093191041637, "flos": 67760958142080.0, "grad_norm": 0.7564634889770495, "language_loss": 0.58241463, "learning_rate": 3.4813074258644786e-06, "loss": 0.60340548, "num_input_tokens_seen": 84005100, "router_z_loss_clip": 0.02282715, "router_z_loss_mlp": 0.3125, "step": 3902, "time_per_iteration": 3.129209518432617 }, { "auxiliary_loss_clip": 0.01156367, "auxiliary_loss_mlp": 0.01040105, "balance_loss_clip": 1.02326083, "balance_loss_mlp": 1.0495199, "epoch": 0.23466105516308433, "flos": 20077619990400.0, "grad_norm": 1.9710249202929102, "language_loss": 0.80422163, "learning_rate": 3.4810535455415547e-06, "loss": 0.82618636, "num_input_tokens_seen": 84023775, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.890625, "step": 3903, "time_per_iteration": 2.5807876586914062 }, { "auxiliary_loss_clip": 0.01174706, "auxiliary_loss_mlp": 0.01039955, "balance_loss_clip": 1.02299178, "balance_loss_mlp": 1.04764271, "epoch": 0.2347211784157523, "flos": 24535714642560.0, "grad_norm": 1.7441303180141685, "language_loss": 0.81587434, "learning_rate": 3.4807996123636394e-06, "loss": 0.83802092, "num_input_tokens_seen": 84042605, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90625, "step": 3904, "time_per_iteration": 2.6150283813476562 }, { "auxiliary_loss_clip": 0.01137891, "auxiliary_loss_mlp": 0.01039211, "balance_loss_clip": 1.02298701, "balance_loss_mlp": 1.04909337, "epoch": 0.23478130166842026, "flos": 23805040181760.0, "grad_norm": 2.1885198947236213, "language_loss": 0.71363407, "learning_rate": 3.4805456263397954e-06, "loss": 0.73540509, "num_input_tokens_seen": 84061520, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.88671875, "step": 3905, "time_per_iteration": 2.6711902618408203 }, { "auxiliary_loss_clip": 0.01134441, "auxiliary_loss_mlp": 0.01031858, "balance_loss_clip": 1.01589584, "balance_loss_mlp": 1.04821634, "epoch": 0.23484142492108823, "flos": 24093618048000.0, "grad_norm": 2.1640315089150066, "language_loss": 0.71077657, "learning_rate": 3.480291587479086e-06, "loss": 0.73243964, "num_input_tokens_seen": 84081800, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.86328125, "step": 3906, "time_per_iteration": 2.597196578979492 }, { "auxiliary_loss_clip": 0.0115166, "auxiliary_loss_mlp": 0.01034348, "balance_loss_clip": 1.01589394, "balance_loss_mlp": 1.04735518, "epoch": 0.2349015481737562, "flos": 29095830898560.0, "grad_norm": 2.009999828836984, "language_loss": 0.73132467, "learning_rate": 3.4800374957905777e-06, "loss": 0.75318468, "num_input_tokens_seen": 84102340, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.953125, "step": 3907, "time_per_iteration": 2.669652223587036 }, { "auxiliary_loss_clip": 0.01146308, "auxiliary_loss_mlp": 0.01040597, "balance_loss_clip": 1.0244199, "balance_loss_mlp": 1.04704022, "epoch": 0.23496167142642416, "flos": 18916305373440.0, "grad_norm": 1.6599781483848983, "language_loss": 0.72337198, "learning_rate": 3.4797833512833376e-06, "loss": 0.74524105, "num_input_tokens_seen": 84120370, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.90234375, "step": 3908, "time_per_iteration": 2.510148048400879 }, { "auxiliary_loss_clip": 0.01078296, "auxiliary_loss_mlp": 0.01006128, "balance_loss_clip": 1.00368464, "balance_loss_mlp": 1.02830553, "epoch": 0.23502179467909215, "flos": 55868062896000.0, "grad_norm": 1.0665318264350654, "language_loss": 0.73293215, "learning_rate": 3.479529153966437e-06, "loss": 0.75377637, "num_input_tokens_seen": 84165515, "router_z_loss_clip": 0.02441406, "router_z_loss_mlp": 0.3203125, "step": 3909, "time_per_iteration": 2.9461982250213623 }, { "auxiliary_loss_clip": 0.01147145, "auxiliary_loss_mlp": 0.01036983, "balance_loss_clip": 1.02084208, "balance_loss_mlp": 1.04784513, "epoch": 0.23508191793176011, "flos": 23401763210880.0, "grad_norm": 2.039651528747511, "language_loss": 0.87691367, "learning_rate": 3.479274903848947e-06, "loss": 0.89875489, "num_input_tokens_seen": 84184540, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.90625, "step": 3910, "time_per_iteration": 2.6177380084991455 }, { "auxiliary_loss_clip": 0.01149086, "auxiliary_loss_mlp": 0.01037975, "balance_loss_clip": 1.02252567, "balance_loss_mlp": 1.05056691, "epoch": 0.23514204118442808, "flos": 20047671025920.0, "grad_norm": 2.0902851464450527, "language_loss": 0.76420981, "learning_rate": 3.4790206009399396e-06, "loss": 0.78608042, "num_input_tokens_seen": 84202025, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8984375, "step": 3911, "time_per_iteration": 2.5052692890167236 }, { "auxiliary_loss_clip": 0.01140627, "auxiliary_loss_mlp": 0.01036117, "balance_loss_clip": 1.02150798, "balance_loss_mlp": 1.04691255, "epoch": 0.23520216443709605, "flos": 21580589796480.0, "grad_norm": 1.6604268819625754, "language_loss": 0.81438601, "learning_rate": 3.4787662452484923e-06, "loss": 0.83615345, "num_input_tokens_seen": 84221895, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8515625, "step": 3912, "time_per_iteration": 2.5688822269439697 }, { "auxiliary_loss_clip": 0.0115382, "auxiliary_loss_mlp": 0.01042399, "balance_loss_clip": 1.02566814, "balance_loss_mlp": 1.04804993, "epoch": 0.235262287689764, "flos": 23185796688000.0, "grad_norm": 2.266834536177365, "language_loss": 0.71467042, "learning_rate": 3.4785118367836816e-06, "loss": 0.73663265, "num_input_tokens_seen": 84240455, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.87890625, "step": 3913, "time_per_iteration": 2.5236761569976807 }, { "auxiliary_loss_clip": 0.01156287, "auxiliary_loss_mlp": 0.01286895, "balance_loss_clip": 1.02132416, "balance_loss_mlp": 1.05202699, "epoch": 0.23532241094243198, "flos": 23185222070400.0, "grad_norm": 1.8295931962727376, "language_loss": 0.76020187, "learning_rate": 3.4782573755545866e-06, "loss": 0.78463364, "num_input_tokens_seen": 84261605, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.953125, "step": 3914, "time_per_iteration": 2.5945539474487305 }, { "auxiliary_loss_clip": 0.01183376, "auxiliary_loss_mlp": 0.01039665, "balance_loss_clip": 1.02269006, "balance_loss_mlp": 1.04898572, "epoch": 0.23538253419509997, "flos": 17019324305280.0, "grad_norm": 2.57228734386786, "language_loss": 0.88949788, "learning_rate": 3.478002861570288e-06, "loss": 0.91172832, "num_input_tokens_seen": 84278675, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.890625, "step": 3915, "time_per_iteration": 2.561105728149414 }, { "auxiliary_loss_clip": 0.0107713, "auxiliary_loss_mlp": 0.01002841, "balance_loss_clip": 1.00017059, "balance_loss_mlp": 1.02761936, "epoch": 0.23544265744776793, "flos": 63448588967040.0, "grad_norm": 0.8090554022150118, "language_loss": 0.59407485, "learning_rate": 3.47774829483987e-06, "loss": 0.61487454, "num_input_tokens_seen": 84329765, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.3125, "step": 3916, "time_per_iteration": 3.1097187995910645 }, { "auxiliary_loss_clip": 0.01092844, "auxiliary_loss_mlp": 0.01002166, "balance_loss_clip": 0.99967426, "balance_loss_mlp": 1.0258739, "epoch": 0.2355027807004359, "flos": 70515343831680.0, "grad_norm": 0.7652718944982424, "language_loss": 0.49456188, "learning_rate": 3.4774936753724156e-06, "loss": 0.51551199, "num_input_tokens_seen": 84393680, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.3125, "step": 3917, "time_per_iteration": 3.1846437454223633 }, { "auxiliary_loss_clip": 0.01167131, "auxiliary_loss_mlp": 0.01046631, "balance_loss_clip": 1.03044856, "balance_loss_mlp": 1.04881918, "epoch": 0.23556290395310386, "flos": 21434289701760.0, "grad_norm": 2.1730469755465984, "language_loss": 0.76456094, "learning_rate": 3.4772390031770126e-06, "loss": 0.78669852, "num_input_tokens_seen": 84412640, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.9140625, "step": 3918, "time_per_iteration": 2.585732936859131 }, { "auxiliary_loss_clip": 0.01179471, "auxiliary_loss_mlp": 0.01038327, "balance_loss_clip": 1.02246022, "balance_loss_mlp": 1.04939151, "epoch": 0.23562302720577183, "flos": 18186421011840.0, "grad_norm": 2.426041758168824, "language_loss": 0.6852659, "learning_rate": 3.47698427826275e-06, "loss": 0.70744389, "num_input_tokens_seen": 84431605, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.9375, "step": 3919, "time_per_iteration": 2.5440518856048584 }, { "auxiliary_loss_clip": 0.01164228, "auxiliary_loss_mlp": 0.01036642, "balance_loss_clip": 1.02060866, "balance_loss_mlp": 1.04968858, "epoch": 0.2356831504584398, "flos": 33730497832320.0, "grad_norm": 1.798878913585388, "language_loss": 0.70386076, "learning_rate": 3.4767295006387174e-06, "loss": 0.72586942, "num_input_tokens_seen": 84454210, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.87109375, "step": 3920, "time_per_iteration": 2.7815191745758057 }, { "auxiliary_loss_clip": 0.01155252, "auxiliary_loss_mlp": 0.01041021, "balance_loss_clip": 1.02547574, "balance_loss_mlp": 1.04896486, "epoch": 0.23574327371110776, "flos": 24932778560640.0, "grad_norm": 1.7892809345528835, "language_loss": 0.77292746, "learning_rate": 3.4764746703140077e-06, "loss": 0.79489022, "num_input_tokens_seen": 84475540, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8828125, "step": 3921, "time_per_iteration": 4.035490036010742 }, { "auxiliary_loss_clip": 0.01166374, "auxiliary_loss_mlp": 0.01039576, "balance_loss_clip": 1.0229702, "balance_loss_mlp": 1.05006933, "epoch": 0.23580339696377575, "flos": 17822107319040.0, "grad_norm": 2.3489431514201495, "language_loss": 0.74982643, "learning_rate": 3.476219787297715e-06, "loss": 0.77188599, "num_input_tokens_seen": 84494580, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.89453125, "step": 3922, "time_per_iteration": 2.5194356441497803 }, { "auxiliary_loss_clip": 0.01155204, "auxiliary_loss_mlp": 0.01033782, "balance_loss_clip": 1.01824868, "balance_loss_mlp": 1.04838586, "epoch": 0.23586352021644372, "flos": 26286611097600.0, "grad_norm": 3.326961528088373, "language_loss": 0.80025482, "learning_rate": 3.4759648515989356e-06, "loss": 0.82214463, "num_input_tokens_seen": 84513850, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.890625, "step": 3923, "time_per_iteration": 2.618358850479126 }, { "auxiliary_loss_clip": 0.01174085, "auxiliary_loss_mlp": 0.01039177, "balance_loss_clip": 1.02351284, "balance_loss_mlp": 1.04926848, "epoch": 0.23592364346911168, "flos": 14246697484800.0, "grad_norm": 1.7570162024276403, "language_loss": 0.7449739, "learning_rate": 3.4757098632267663e-06, "loss": 0.76710653, "num_input_tokens_seen": 84532315, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.88671875, "step": 3924, "time_per_iteration": 3.9208056926727295 }, { "auxiliary_loss_clip": 0.01141106, "auxiliary_loss_mlp": 0.01040876, "balance_loss_clip": 1.02515268, "balance_loss_mlp": 1.05185413, "epoch": 0.23598376672177965, "flos": 18587938216320.0, "grad_norm": 1.9346343963853943, "language_loss": 0.82302433, "learning_rate": 3.4754548221903086e-06, "loss": 0.84484416, "num_input_tokens_seen": 84550970, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.89453125, "step": 3925, "time_per_iteration": 4.138741493225098 }, { "auxiliary_loss_clip": 0.0115604, "auxiliary_loss_mlp": 0.01047848, "balance_loss_clip": 1.03151596, "balance_loss_mlp": 1.04717684, "epoch": 0.2360438899744476, "flos": 22675542036480.0, "grad_norm": 1.937327535127494, "language_loss": 0.59791869, "learning_rate": 3.475199728498664e-06, "loss": 0.61995757, "num_input_tokens_seen": 84571655, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.90625, "step": 3926, "time_per_iteration": 2.6229822635650635 }, { "auxiliary_loss_clip": 0.01143306, "auxiliary_loss_mlp": 0.01043138, "balance_loss_clip": 1.02805758, "balance_loss_mlp": 1.04804528, "epoch": 0.23610401322711558, "flos": 29570139014400.0, "grad_norm": 2.0425683354506003, "language_loss": 0.71655059, "learning_rate": 3.474944582160935e-06, "loss": 0.738415, "num_input_tokens_seen": 84593130, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.859375, "step": 3927, "time_per_iteration": 4.094960451126099 }, { "auxiliary_loss_clip": 0.01155703, "auxiliary_loss_mlp": 0.010406, "balance_loss_clip": 1.02608061, "balance_loss_mlp": 1.04870212, "epoch": 0.23616413647978354, "flos": 17858520731520.0, "grad_norm": 2.565462590506999, "language_loss": 0.75011081, "learning_rate": 3.4746893831862287e-06, "loss": 0.77207386, "num_input_tokens_seen": 84612410, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.890625, "step": 3928, "time_per_iteration": 2.5321946144104004 }, { "auxiliary_loss_clip": 0.0115669, "auxiliary_loss_mlp": 0.01045792, "balance_loss_clip": 1.02850652, "balance_loss_mlp": 1.04834712, "epoch": 0.23622425973245154, "flos": 11034847157760.0, "grad_norm": 2.7210618334154684, "language_loss": 0.8181746, "learning_rate": 3.474434131583651e-06, "loss": 0.84019935, "num_input_tokens_seen": 84627610, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.90234375, "step": 3929, "time_per_iteration": 2.5901806354522705 }, { "auxiliary_loss_clip": 0.01147007, "auxiliary_loss_mlp": 0.01045989, "balance_loss_clip": 1.02857292, "balance_loss_mlp": 1.05142117, "epoch": 0.2362843829851195, "flos": 23404061681280.0, "grad_norm": 1.8003638279654748, "language_loss": 0.71863687, "learning_rate": 3.474178827362312e-06, "loss": 0.74056685, "num_input_tokens_seen": 84648415, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.953125, "step": 3930, "time_per_iteration": 2.5247347354888916 }, { "auxiliary_loss_clip": 0.01138733, "auxiliary_loss_mlp": 0.01036777, "balance_loss_clip": 1.0203197, "balance_loss_mlp": 1.04892683, "epoch": 0.23634450623778747, "flos": 39529855261440.0, "grad_norm": 2.579520345255288, "language_loss": 0.73782206, "learning_rate": 3.473923470531323e-06, "loss": 0.75957716, "num_input_tokens_seen": 84670080, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8984375, "step": 3931, "time_per_iteration": 2.7066545486450195 }, { "auxiliary_loss_clip": 0.01149302, "auxiliary_loss_mlp": 0.01038704, "balance_loss_clip": 1.02153826, "balance_loss_mlp": 1.05078673, "epoch": 0.23640462949045543, "flos": 24207167917440.0, "grad_norm": 2.1963790061305715, "language_loss": 0.80163026, "learning_rate": 3.4736680610997965e-06, "loss": 0.82351029, "num_input_tokens_seen": 84686465, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8984375, "step": 3932, "time_per_iteration": 2.5601792335510254 }, { "auxiliary_loss_clip": 0.01165668, "auxiliary_loss_mlp": 0.0103536, "balance_loss_clip": 1.02113831, "balance_loss_mlp": 1.05156279, "epoch": 0.2364647527431234, "flos": 26177622255360.0, "grad_norm": 1.954331355853936, "language_loss": 0.85609055, "learning_rate": 3.4734125990768476e-06, "loss": 0.87810081, "num_input_tokens_seen": 84708825, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8671875, "step": 3933, "time_per_iteration": 2.652120351791382 }, { "auxiliary_loss_clip": 0.01180213, "auxiliary_loss_mlp": 0.01036574, "balance_loss_clip": 1.01982474, "balance_loss_mlp": 1.05240524, "epoch": 0.23652487599579136, "flos": 22637009721600.0, "grad_norm": 2.189344808567551, "language_loss": 0.8283757, "learning_rate": 3.473157084471593e-06, "loss": 0.85054362, "num_input_tokens_seen": 84726165, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9140625, "step": 3934, "time_per_iteration": 2.5734989643096924 }, { "auxiliary_loss_clip": 0.01140726, "auxiliary_loss_mlp": 0.01037514, "balance_loss_clip": 1.02087259, "balance_loss_mlp": 1.0500803, "epoch": 0.23658499924845935, "flos": 21762261809280.0, "grad_norm": 1.9207292683458583, "language_loss": 0.78642011, "learning_rate": 3.472901517293152e-06, "loss": 0.80820251, "num_input_tokens_seen": 84745815, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.90625, "step": 3935, "time_per_iteration": 2.6292362213134766 }, { "auxiliary_loss_clip": 0.01148643, "auxiliary_loss_mlp": 0.01033657, "balance_loss_clip": 1.01801693, "balance_loss_mlp": 1.05115068, "epoch": 0.23664512250112732, "flos": 21798998444160.0, "grad_norm": 2.6206357869793595, "language_loss": 0.79360348, "learning_rate": 3.472645897550644e-06, "loss": 0.81542647, "num_input_tokens_seen": 84765415, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.88671875, "step": 3936, "time_per_iteration": 2.56252384185791 }, { "auxiliary_loss_clip": 0.01158241, "auxiliary_loss_mlp": 0.01034716, "balance_loss_clip": 1.0182054, "balance_loss_mlp": 1.05043244, "epoch": 0.23670524575379528, "flos": 22637871648000.0, "grad_norm": 2.5451843684108466, "language_loss": 0.79083425, "learning_rate": 3.4723902252531925e-06, "loss": 0.81276381, "num_input_tokens_seen": 84787080, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8984375, "step": 3937, "time_per_iteration": 2.634629487991333 }, { "auxiliary_loss_clip": 0.01147127, "auxiliary_loss_mlp": 0.0103751, "balance_loss_clip": 1.02268076, "balance_loss_mlp": 1.05214965, "epoch": 0.23676536900646325, "flos": 16725000263040.0, "grad_norm": 1.9828030049364056, "language_loss": 0.85106778, "learning_rate": 3.472134500409921e-06, "loss": 0.87291408, "num_input_tokens_seen": 84805395, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.859375, "step": 3938, "time_per_iteration": 2.5364582538604736 }, { "auxiliary_loss_clip": 0.01136522, "auxiliary_loss_mlp": 0.01045863, "balance_loss_clip": 1.02987719, "balance_loss_mlp": 1.04966521, "epoch": 0.23682549225913122, "flos": 11135611785600.0, "grad_norm": 2.905314960363506, "language_loss": 0.93904656, "learning_rate": 3.471878723029956e-06, "loss": 0.96087039, "num_input_tokens_seen": 84818090, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.87109375, "step": 3939, "time_per_iteration": 2.577742099761963 }, { "auxiliary_loss_clip": 0.01142201, "auxiliary_loss_mlp": 0.01043807, "balance_loss_clip": 1.02611661, "balance_loss_mlp": 1.04946613, "epoch": 0.23688561551179918, "flos": 22559226819840.0, "grad_norm": 2.4424058552473866, "language_loss": 0.82090759, "learning_rate": 3.4716228931224253e-06, "loss": 0.84276766, "num_input_tokens_seen": 84837695, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9296875, "step": 3940, "time_per_iteration": 2.5317986011505127 }, { "auxiliary_loss_clip": 0.01165336, "auxiliary_loss_mlp": 0.01290381, "balance_loss_clip": 1.02609921, "balance_loss_mlp": 1.05191422, "epoch": 0.23694573876446715, "flos": 18514895909760.0, "grad_norm": 2.33834220408801, "language_loss": 0.88753188, "learning_rate": 3.4713670106964596e-06, "loss": 0.91208905, "num_input_tokens_seen": 84854630, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.953125, "step": 3941, "time_per_iteration": 2.675276279449463 }, { "auxiliary_loss_clip": 0.01138966, "auxiliary_loss_mlp": 0.01039509, "balance_loss_clip": 1.0218544, "balance_loss_mlp": 1.04717112, "epoch": 0.23700586201713514, "flos": 15335723980800.0, "grad_norm": 1.9675478996269484, "language_loss": 0.84616488, "learning_rate": 3.4711110757611897e-06, "loss": 0.86794966, "num_input_tokens_seen": 84871805, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.91796875, "step": 3942, "time_per_iteration": 2.5519614219665527 }, { "auxiliary_loss_clip": 0.01170671, "auxiliary_loss_mlp": 0.01043528, "balance_loss_clip": 1.02532446, "balance_loss_mlp": 1.05105019, "epoch": 0.2370659852698031, "flos": 23947605262080.0, "grad_norm": 2.0359852144844925, "language_loss": 0.81123686, "learning_rate": 3.4708550883257496e-06, "loss": 0.83337879, "num_input_tokens_seen": 84889815, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.92578125, "step": 3943, "time_per_iteration": 2.695955991744995 }, { "auxiliary_loss_clip": 0.01165322, "auxiliary_loss_mlp": 0.01038617, "balance_loss_clip": 1.0202229, "balance_loss_mlp": 1.0501864, "epoch": 0.23712610852247107, "flos": 15332527670400.0, "grad_norm": 2.3141779301453536, "language_loss": 0.68142325, "learning_rate": 3.4705990483992746e-06, "loss": 0.70346266, "num_input_tokens_seen": 84904380, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.96875, "step": 3944, "time_per_iteration": 2.5828492641448975 }, { "auxiliary_loss_clip": 0.01162639, "auxiliary_loss_mlp": 0.01035272, "balance_loss_clip": 1.01782024, "balance_loss_mlp": 1.05218816, "epoch": 0.23718623177513903, "flos": 19682567233920.0, "grad_norm": 1.663437262065805, "language_loss": 0.75322497, "learning_rate": 3.470342955990903e-06, "loss": 0.77520406, "num_input_tokens_seen": 84922935, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.92578125, "step": 3945, "time_per_iteration": 2.5654282569885254 }, { "auxiliary_loss_clip": 0.01151376, "auxiliary_loss_mlp": 0.01034811, "balance_loss_clip": 1.01908767, "balance_loss_mlp": 1.05134475, "epoch": 0.237246355027807, "flos": 24973322037120.0, "grad_norm": 1.5881013321696917, "language_loss": 0.63500702, "learning_rate": 3.470086811109773e-06, "loss": 0.65686893, "num_input_tokens_seen": 84943685, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.90625, "step": 3946, "time_per_iteration": 2.778135061264038 }, { "auxiliary_loss_clip": 0.01152564, "auxiliary_loss_mlp": 0.01034479, "balance_loss_clip": 1.01685977, "balance_loss_mlp": 1.05196047, "epoch": 0.23730647828047496, "flos": 15377416692480.0, "grad_norm": 2.295316255731564, "language_loss": 0.76328588, "learning_rate": 3.469830613765026e-06, "loss": 0.78515631, "num_input_tokens_seen": 84959505, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9140625, "step": 3947, "time_per_iteration": 2.6275298595428467 }, { "auxiliary_loss_clip": 0.01156456, "auxiliary_loss_mlp": 0.01040663, "balance_loss_clip": 1.02332973, "balance_loss_mlp": 1.05643845, "epoch": 0.23736660153314296, "flos": 28150662372480.0, "grad_norm": 4.67475825563614, "language_loss": 0.80626959, "learning_rate": 3.4695743639658065e-06, "loss": 0.82824075, "num_input_tokens_seen": 84982130, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.91015625, "step": 3948, "time_per_iteration": 2.6795482635498047 }, { "auxiliary_loss_clip": 0.0116295, "auxiliary_loss_mlp": 0.01044051, "balance_loss_clip": 1.02647984, "balance_loss_mlp": 1.05249727, "epoch": 0.23742672478581092, "flos": 22086570729600.0, "grad_norm": 1.9269350586376286, "language_loss": 0.80071318, "learning_rate": 3.4693180617212568e-06, "loss": 0.82278317, "num_input_tokens_seen": 85000640, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.921875, "step": 3949, "time_per_iteration": 2.553903341293335 }, { "auxiliary_loss_clip": 0.01165391, "auxiliary_loss_mlp": 0.01038969, "balance_loss_clip": 1.02173126, "balance_loss_mlp": 1.05327892, "epoch": 0.2374868480384789, "flos": 19537093152000.0, "grad_norm": 1.760954836732276, "language_loss": 0.73318344, "learning_rate": 3.4690617070405255e-06, "loss": 0.75522697, "num_input_tokens_seen": 85018970, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.94140625, "step": 3950, "time_per_iteration": 2.7289295196533203 }, { "auxiliary_loss_clip": 0.0114446, "auxiliary_loss_mlp": 0.01037106, "balance_loss_clip": 1.02121544, "balance_loss_mlp": 1.05528045, "epoch": 0.23754697129114685, "flos": 19422501788160.0, "grad_norm": 5.719046401118737, "language_loss": 0.72817987, "learning_rate": 3.4688052999327607e-06, "loss": 0.74999547, "num_input_tokens_seen": 85035905, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.890625, "step": 3951, "time_per_iteration": 2.499516487121582 }, { "auxiliary_loss_clip": 0.01166498, "auxiliary_loss_mlp": 0.01039966, "balance_loss_clip": 1.02238226, "balance_loss_mlp": 1.05560184, "epoch": 0.23760709454381482, "flos": 19501002961920.0, "grad_norm": 1.888149057405279, "language_loss": 0.73614311, "learning_rate": 3.4685488404071133e-06, "loss": 0.7582078, "num_input_tokens_seen": 85054560, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9296875, "step": 3952, "time_per_iteration": 2.6040124893188477 }, { "auxiliary_loss_clip": 0.01174713, "auxiliary_loss_mlp": 0.01040412, "balance_loss_clip": 1.02425909, "balance_loss_mlp": 1.05505431, "epoch": 0.23766721779648278, "flos": 27636600879360.0, "grad_norm": 2.5991285461992293, "language_loss": 0.70813811, "learning_rate": 3.468292328472735e-06, "loss": 0.73028934, "num_input_tokens_seen": 85074425, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.92578125, "step": 3953, "time_per_iteration": 2.6380724906921387 }, { "auxiliary_loss_clip": 0.01150119, "auxiliary_loss_mlp": 0.01048168, "balance_loss_clip": 1.02919006, "balance_loss_mlp": 1.0561465, "epoch": 0.23772734104915075, "flos": 23404348990080.0, "grad_norm": 2.043568260831606, "language_loss": 0.81225729, "learning_rate": 3.468035764138781e-06, "loss": 0.8342402, "num_input_tokens_seen": 85092865, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.9375, "step": 3954, "time_per_iteration": 2.737776756286621 }, { "auxiliary_loss_clip": 0.01170334, "auxiliary_loss_mlp": 0.01045686, "balance_loss_clip": 1.0274117, "balance_loss_mlp": 1.05836821, "epoch": 0.23778746430181874, "flos": 15705496540800.0, "grad_norm": 2.135762253829886, "language_loss": 0.66094595, "learning_rate": 3.467779147414406e-06, "loss": 0.68310618, "num_input_tokens_seen": 85110175, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.94140625, "step": 3955, "time_per_iteration": 2.5638489723205566 }, { "auxiliary_loss_clip": 0.01148436, "auxiliary_loss_mlp": 0.01047931, "balance_loss_clip": 1.03102732, "balance_loss_mlp": 1.056813, "epoch": 0.2378475875544867, "flos": 19426452284160.0, "grad_norm": 1.4417659826654456, "language_loss": 0.83792418, "learning_rate": 3.467522478308769e-06, "loss": 0.85988784, "num_input_tokens_seen": 85129925, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.91796875, "step": 3956, "time_per_iteration": 2.6588542461395264 }, { "auxiliary_loss_clip": 0.01172631, "auxiliary_loss_mlp": 0.01042773, "balance_loss_clip": 1.02559459, "balance_loss_mlp": 1.0569253, "epoch": 0.23790771080715467, "flos": 22268565964800.0, "grad_norm": 2.195928550622249, "language_loss": 0.84537041, "learning_rate": 3.46726575683103e-06, "loss": 0.8675245, "num_input_tokens_seen": 85147755, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8828125, "step": 3957, "time_per_iteration": 2.613395929336548 }, { "auxiliary_loss_clip": 0.01193633, "auxiliary_loss_mlp": 0.01041201, "balance_loss_clip": 1.02522659, "balance_loss_mlp": 1.05728531, "epoch": 0.23796783405982264, "flos": 20047311889920.0, "grad_norm": 1.9979400284713293, "language_loss": 0.70173198, "learning_rate": 3.4670089829903503e-06, "loss": 0.72408032, "num_input_tokens_seen": 85165270, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.9140625, "step": 3958, "time_per_iteration": 2.6866025924682617 }, { "auxiliary_loss_clip": 0.01178661, "auxiliary_loss_mlp": 0.010447, "balance_loss_clip": 1.02568579, "balance_loss_mlp": 1.05897319, "epoch": 0.2380279573124906, "flos": 14245943299200.0, "grad_norm": 2.6243894313477494, "language_loss": 0.65735298, "learning_rate": 3.466752156795893e-06, "loss": 0.67958665, "num_input_tokens_seen": 85181555, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.92578125, "step": 3959, "time_per_iteration": 2.6810102462768555 }, { "auxiliary_loss_clip": 0.01185978, "auxiliary_loss_mlp": 0.01042865, "balance_loss_clip": 1.02596152, "balance_loss_mlp": 1.05789149, "epoch": 0.23808808056515857, "flos": 21179180332800.0, "grad_norm": 1.7571547236019667, "language_loss": 0.72512698, "learning_rate": 3.4664952782568253e-06, "loss": 0.74741542, "num_input_tokens_seen": 85199455, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.921875, "step": 3960, "time_per_iteration": 2.6727728843688965 }, { "auxiliary_loss_clip": 0.01212882, "auxiliary_loss_mlp": 0.01043406, "balance_loss_clip": 1.02542877, "balance_loss_mlp": 1.05733263, "epoch": 0.23814820381782653, "flos": 22528308188160.0, "grad_norm": 1.5854692512272395, "language_loss": 0.73542416, "learning_rate": 3.466238347382313e-06, "loss": 0.75798702, "num_input_tokens_seen": 85219170, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9296875, "step": 3961, "time_per_iteration": 2.7005207538604736 }, { "auxiliary_loss_clip": 0.01148928, "auxiliary_loss_mlp": 0.01045072, "balance_loss_clip": 1.02627301, "balance_loss_mlp": 1.0555563, "epoch": 0.23820832707049452, "flos": 22304332932480.0, "grad_norm": 1.9673286404920487, "language_loss": 0.65241247, "learning_rate": 3.465981364181525e-06, "loss": 0.67435247, "num_input_tokens_seen": 85238480, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.93359375, "step": 3962, "time_per_iteration": 3.941469192504883 }, { "auxiliary_loss_clip": 0.01166837, "auxiliary_loss_mlp": 0.0104509, "balance_loss_clip": 1.02794158, "balance_loss_mlp": 1.05608904, "epoch": 0.2382684503231625, "flos": 24864225454080.0, "grad_norm": 2.87373401592737, "language_loss": 0.74541092, "learning_rate": 3.4657243286636332e-06, "loss": 0.76753008, "num_input_tokens_seen": 85259180, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9296875, "step": 3963, "time_per_iteration": 2.6242642402648926 }, { "auxiliary_loss_clip": 0.01181362, "auxiliary_loss_mlp": 0.01040967, "balance_loss_clip": 1.02318132, "balance_loss_mlp": 1.05970025, "epoch": 0.23832857357583045, "flos": 21871609787520.0, "grad_norm": 2.106638675898512, "language_loss": 0.77014995, "learning_rate": 3.4654672408378107e-06, "loss": 0.79237324, "num_input_tokens_seen": 85278550, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9453125, "step": 3964, "time_per_iteration": 2.5968985557556152 }, { "auxiliary_loss_clip": 0.01172805, "auxiliary_loss_mlp": 0.01043856, "balance_loss_clip": 1.0268805, "balance_loss_mlp": 1.05406332, "epoch": 0.23838869682849842, "flos": 21288061434240.0, "grad_norm": 1.9764187488874783, "language_loss": 0.71398926, "learning_rate": 3.4652101007132323e-06, "loss": 0.73615587, "num_input_tokens_seen": 85297345, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9140625, "step": 3965, "time_per_iteration": 2.5831360816955566 }, { "auxiliary_loss_clip": 0.01163601, "auxiliary_loss_mlp": 0.0104146, "balance_loss_clip": 1.02425814, "balance_loss_mlp": 1.05584311, "epoch": 0.23844882008116638, "flos": 16180594755840.0, "grad_norm": 1.8680697623332618, "language_loss": 0.78456247, "learning_rate": 3.4649529082990743e-06, "loss": 0.80661309, "num_input_tokens_seen": 85315105, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.89453125, "step": 3966, "time_per_iteration": 4.010444402694702 }, { "auxiliary_loss_clip": 0.01162206, "auxiliary_loss_mlp": 0.01041865, "balance_loss_clip": 1.02528298, "balance_loss_mlp": 1.05450261, "epoch": 0.23850894333383435, "flos": 21069724613760.0, "grad_norm": 1.8377077752066169, "language_loss": 0.68707258, "learning_rate": 3.4646956636045152e-06, "loss": 0.7091133, "num_input_tokens_seen": 85334735, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8984375, "step": 3967, "time_per_iteration": 4.049858808517456 }, { "auxiliary_loss_clip": 0.01176278, "auxiliary_loss_mlp": 0.0105205, "balance_loss_clip": 1.0344429, "balance_loss_mlp": 1.05730176, "epoch": 0.23856906658650234, "flos": 17201606849280.0, "grad_norm": 2.058170360528669, "language_loss": 0.67777061, "learning_rate": 3.4644383666387347e-06, "loss": 0.70005393, "num_input_tokens_seen": 85352875, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.921875, "step": 3968, "time_per_iteration": 4.01007080078125 }, { "auxiliary_loss_clip": 0.01180541, "auxiliary_loss_mlp": 0.01045882, "balance_loss_clip": 1.02887082, "balance_loss_mlp": 1.05335259, "epoch": 0.2386291898391703, "flos": 29494223619840.0, "grad_norm": 2.8155391597700072, "language_loss": 0.76709795, "learning_rate": 3.464181017410917e-06, "loss": 0.78936219, "num_input_tokens_seen": 85372205, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9140625, "step": 3969, "time_per_iteration": 2.7738821506500244 }, { "auxiliary_loss_clip": 0.01158777, "auxiliary_loss_mlp": 0.01039699, "balance_loss_clip": 1.02331913, "balance_loss_mlp": 1.05233908, "epoch": 0.23868931309183827, "flos": 21142443697920.0, "grad_norm": 2.1600864601459184, "language_loss": 0.76382101, "learning_rate": 3.463923615930245e-06, "loss": 0.7858057, "num_input_tokens_seen": 85389705, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8828125, "step": 3970, "time_per_iteration": 2.6836793422698975 }, { "auxiliary_loss_clip": 0.01172125, "auxiliary_loss_mlp": 0.01042937, "balance_loss_clip": 1.0241611, "balance_loss_mlp": 1.05268168, "epoch": 0.23874943634450624, "flos": 25659394784640.0, "grad_norm": 2.0190106822393807, "language_loss": 0.85387868, "learning_rate": 3.4636661622059042e-06, "loss": 0.87602925, "num_input_tokens_seen": 85407855, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.92578125, "step": 3971, "time_per_iteration": 2.7598860263824463 }, { "auxiliary_loss_clip": 0.01083211, "auxiliary_loss_mlp": 0.01014317, "balance_loss_clip": 1.01161051, "balance_loss_mlp": 1.04397321, "epoch": 0.2388095595971742, "flos": 58986618624000.0, "grad_norm": 0.7424549765616467, "language_loss": 0.62792331, "learning_rate": 3.4634086562470835e-06, "loss": 0.6488986, "num_input_tokens_seen": 85470885, "router_z_loss_clip": 0.02709961, "router_z_loss_mlp": 0.30273438, "step": 3972, "time_per_iteration": 3.2522599697113037 }, { "auxiliary_loss_clip": 0.01163121, "auxiliary_loss_mlp": 0.01036417, "balance_loss_clip": 1.01958442, "balance_loss_mlp": 1.05402637, "epoch": 0.23886968284984217, "flos": 16800341040000.0, "grad_norm": 1.883224682093721, "language_loss": 0.81095529, "learning_rate": 3.463151098062972e-06, "loss": 0.83295071, "num_input_tokens_seen": 85488460, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.91015625, "step": 3973, "time_per_iteration": 2.537309169769287 }, { "auxiliary_loss_clip": 0.01163798, "auxiliary_loss_mlp": 0.01045905, "balance_loss_clip": 1.02863193, "balance_loss_mlp": 1.05566311, "epoch": 0.23892980610251013, "flos": 22382654538240.0, "grad_norm": 1.6491163058406197, "language_loss": 0.79399216, "learning_rate": 3.4628934876627615e-06, "loss": 0.81608915, "num_input_tokens_seen": 85508590, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8984375, "step": 3974, "time_per_iteration": 2.6677448749542236 }, { "auxiliary_loss_clip": 0.01155208, "auxiliary_loss_mlp": 0.01043405, "balance_loss_clip": 1.0254879, "balance_loss_mlp": 1.053725, "epoch": 0.23898992935517813, "flos": 12823198519680.0, "grad_norm": 2.176737063939263, "language_loss": 0.84647435, "learning_rate": 3.4626358250556458e-06, "loss": 0.86846048, "num_input_tokens_seen": 85525970, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.921875, "step": 3975, "time_per_iteration": 2.53684663772583 }, { "auxiliary_loss_clip": 0.01151287, "auxiliary_loss_mlp": 0.01041207, "balance_loss_clip": 1.02482736, "balance_loss_mlp": 1.05313683, "epoch": 0.2390500526078461, "flos": 22345666508160.0, "grad_norm": 2.418921069650061, "language_loss": 0.83147573, "learning_rate": 3.4623781102508193e-06, "loss": 0.85340077, "num_input_tokens_seen": 85543700, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.890625, "step": 3976, "time_per_iteration": 2.6938557624816895 }, { "auxiliary_loss_clip": 0.01183029, "auxiliary_loss_mlp": 0.01033386, "balance_loss_clip": 1.01818681, "balance_loss_mlp": 1.05007398, "epoch": 0.23911017586051406, "flos": 22635142214400.0, "grad_norm": 1.9346224168703374, "language_loss": 0.74253416, "learning_rate": 3.46212034325748e-06, "loss": 0.76469827, "num_input_tokens_seen": 85562765, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.87890625, "step": 3977, "time_per_iteration": 2.5837230682373047 }, { "auxiliary_loss_clip": 0.01144975, "auxiliary_loss_mlp": 0.01043602, "balance_loss_clip": 1.02587581, "balance_loss_mlp": 1.05367875, "epoch": 0.23917029911318202, "flos": 23653281219840.0, "grad_norm": 2.505788053438687, "language_loss": 0.71688789, "learning_rate": 3.4618625240848264e-06, "loss": 0.7387737, "num_input_tokens_seen": 85581755, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9140625, "step": 3978, "time_per_iteration": 2.6454596519470215 }, { "auxiliary_loss_clip": 0.01165859, "auxiliary_loss_mlp": 0.01046124, "balance_loss_clip": 1.0296849, "balance_loss_mlp": 1.05535865, "epoch": 0.23923042236585, "flos": 22783597125120.0, "grad_norm": 2.1644039922157794, "language_loss": 0.7867102, "learning_rate": 3.4616046527420597e-06, "loss": 0.80883002, "num_input_tokens_seen": 85599455, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.921875, "step": 3979, "time_per_iteration": 2.5671091079711914 }, { "auxiliary_loss_clip": 0.01160146, "auxiliary_loss_mlp": 0.01050166, "balance_loss_clip": 1.03252316, "balance_loss_mlp": 1.05156147, "epoch": 0.23929054561851795, "flos": 28147717457280.0, "grad_norm": 1.7404577589080228, "language_loss": 0.81723499, "learning_rate": 3.4613467292383832e-06, "loss": 0.83933812, "num_input_tokens_seen": 85619970, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.90625, "step": 3980, "time_per_iteration": 2.67453670501709 }, { "auxiliary_loss_clip": 0.01167496, "auxiliary_loss_mlp": 0.01038725, "balance_loss_clip": 1.02180898, "balance_loss_mlp": 1.05210638, "epoch": 0.23935066887118592, "flos": 21686525982720.0, "grad_norm": 1.7974184950330214, "language_loss": 0.84012949, "learning_rate": 3.4610887535830005e-06, "loss": 0.86219174, "num_input_tokens_seen": 85638850, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8828125, "step": 3981, "time_per_iteration": 2.607597589492798 }, { "auxiliary_loss_clip": 0.01165343, "auxiliary_loss_mlp": 0.0104743, "balance_loss_clip": 1.03043079, "balance_loss_mlp": 1.0539552, "epoch": 0.2394107921238539, "flos": 32122274198400.0, "grad_norm": 1.8038836069550594, "language_loss": 0.76560307, "learning_rate": 3.4608307257851186e-06, "loss": 0.78773081, "num_input_tokens_seen": 85656285, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.93359375, "step": 3982, "time_per_iteration": 2.7068963050842285 }, { "auxiliary_loss_clip": 0.01165597, "auxiliary_loss_mlp": 0.01043282, "balance_loss_clip": 1.02755785, "balance_loss_mlp": 1.05095541, "epoch": 0.23947091537652188, "flos": 17019180650880.0, "grad_norm": 1.7083512760326527, "language_loss": 0.7770201, "learning_rate": 3.460572645853946e-06, "loss": 0.79910892, "num_input_tokens_seen": 85673020, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.875, "step": 3983, "time_per_iteration": 2.6506540775299072 }, { "auxiliary_loss_clip": 0.01149967, "auxiliary_loss_mlp": 0.01293109, "balance_loss_clip": 1.02826095, "balance_loss_mlp": 1.05150986, "epoch": 0.23953103862918984, "flos": 20593584904320.0, "grad_norm": 1.8590220255620176, "language_loss": 0.72935736, "learning_rate": 3.4603145137986925e-06, "loss": 0.75378811, "num_input_tokens_seen": 85692565, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8984375, "step": 3984, "time_per_iteration": 2.616729497909546 }, { "auxiliary_loss_clip": 0.0116972, "auxiliary_loss_mlp": 0.01046734, "balance_loss_clip": 1.02962756, "balance_loss_mlp": 1.05056214, "epoch": 0.2395911618818578, "flos": 20704405340160.0, "grad_norm": 2.100465255077742, "language_loss": 0.78789335, "learning_rate": 3.4600563296285704e-06, "loss": 0.81005788, "num_input_tokens_seen": 85709730, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.921875, "step": 3985, "time_per_iteration": 2.5647788047790527 }, { "auxiliary_loss_clip": 0.01181089, "auxiliary_loss_mlp": 0.0104441, "balance_loss_clip": 1.02696919, "balance_loss_mlp": 1.05445218, "epoch": 0.23965128513452577, "flos": 27053519402880.0, "grad_norm": 2.02968394722085, "language_loss": 0.73461384, "learning_rate": 3.459798093352794e-06, "loss": 0.75686884, "num_input_tokens_seen": 85730045, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.90625, "step": 3986, "time_per_iteration": 2.5932681560516357 }, { "auxiliary_loss_clip": 0.01156304, "auxiliary_loss_mlp": 0.01052264, "balance_loss_clip": 1.03551555, "balance_loss_mlp": 1.05283296, "epoch": 0.23971140838719374, "flos": 23144319457920.0, "grad_norm": 1.9505472822058572, "language_loss": 0.87768805, "learning_rate": 3.4595398049805783e-06, "loss": 0.89977372, "num_input_tokens_seen": 85747590, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9453125, "step": 3987, "time_per_iteration": 2.5942044258117676 }, { "auxiliary_loss_clip": 0.01145357, "auxiliary_loss_mlp": 0.01039626, "balance_loss_clip": 1.02440321, "balance_loss_mlp": 1.05138612, "epoch": 0.23977153163986173, "flos": 18034554309120.0, "grad_norm": 2.1487064986018907, "language_loss": 0.83063269, "learning_rate": 3.459281464521142e-06, "loss": 0.85248256, "num_input_tokens_seen": 85763460, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8515625, "step": 3988, "time_per_iteration": 2.5095667839050293 }, { "auxiliary_loss_clip": 0.01148212, "auxiliary_loss_mlp": 0.01041542, "balance_loss_clip": 1.02476287, "balance_loss_mlp": 1.05035591, "epoch": 0.2398316548925297, "flos": 18113378705280.0, "grad_norm": 1.7111626813497638, "language_loss": 0.85086501, "learning_rate": 3.459023071983703e-06, "loss": 0.87276256, "num_input_tokens_seen": 85782050, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.890625, "step": 3989, "time_per_iteration": 2.5270509719848633 }, { "auxiliary_loss_clip": 0.01138616, "auxiliary_loss_mlp": 0.01039498, "balance_loss_clip": 1.02321362, "balance_loss_mlp": 1.04982173, "epoch": 0.23989177814519766, "flos": 12567730014720.0, "grad_norm": 2.075167347522099, "language_loss": 0.85033524, "learning_rate": 3.458764627377484e-06, "loss": 0.87211645, "num_input_tokens_seen": 85797400, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.890625, "step": 3990, "time_per_iteration": 2.5440356731414795 }, { "auxiliary_loss_clip": 0.01148345, "auxiliary_loss_mlp": 0.01046221, "balance_loss_clip": 1.02990091, "balance_loss_mlp": 1.05072737, "epoch": 0.23995190139786562, "flos": 25264593423360.0, "grad_norm": 1.3617143663976221, "language_loss": 0.75464261, "learning_rate": 3.458506130711708e-06, "loss": 0.77658826, "num_input_tokens_seen": 85818995, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.88671875, "step": 3991, "time_per_iteration": 2.561779499053955 }, { "auxiliary_loss_clip": 0.01079843, "auxiliary_loss_mlp": 0.01021694, "balance_loss_clip": 1.01971483, "balance_loss_mlp": 1.03224301, "epoch": 0.2400120246505336, "flos": 61960379650560.0, "grad_norm": 0.8915364794507948, "language_loss": 0.63684458, "learning_rate": 3.4582475819955995e-06, "loss": 0.65785998, "num_input_tokens_seen": 85876695, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.29492188, "step": 3992, "time_per_iteration": 3.060333013534546 }, { "auxiliary_loss_clip": 0.01071757, "auxiliary_loss_mlp": 0.01014359, "balance_loss_clip": 1.01226103, "balance_loss_mlp": 1.03298616, "epoch": 0.24007214790320155, "flos": 66708560540160.0, "grad_norm": 0.7520112085209784, "language_loss": 0.62918568, "learning_rate": 3.457988981238386e-06, "loss": 0.65004683, "num_input_tokens_seen": 85940990, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.29882812, "step": 3993, "time_per_iteration": 3.232160806655884 }, { "auxiliary_loss_clip": 0.01174519, "auxiliary_loss_mlp": 0.0104735, "balance_loss_clip": 1.03017211, "balance_loss_mlp": 1.056319, "epoch": 0.24013227115586952, "flos": 25809070757760.0, "grad_norm": 1.7060519025569074, "language_loss": 0.76703733, "learning_rate": 3.457730328449296e-06, "loss": 0.78925598, "num_input_tokens_seen": 85961165, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.91015625, "step": 3994, "time_per_iteration": 2.5518693923950195 }, { "auxiliary_loss_clip": 0.0116146, "auxiliary_loss_mlp": 0.01046415, "balance_loss_clip": 1.02812862, "balance_loss_mlp": 1.05245781, "epoch": 0.2401923944085375, "flos": 25557480921600.0, "grad_norm": 1.6107829094477946, "language_loss": 0.78330976, "learning_rate": 3.457471623637561e-06, "loss": 0.80538845, "num_input_tokens_seen": 85982710, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.91015625, "step": 3995, "time_per_iteration": 2.5925188064575195 }, { "auxiliary_loss_clip": 0.01088307, "auxiliary_loss_mlp": 0.01004819, "balance_loss_clip": 1.00275707, "balance_loss_mlp": 1.03162408, "epoch": 0.24025251766120548, "flos": 54941138478720.0, "grad_norm": 0.9135812733027315, "language_loss": 0.63529664, "learning_rate": 3.457212866812412e-06, "loss": 0.65622795, "num_input_tokens_seen": 86046935, "router_z_loss_clip": 0.02062988, "router_z_loss_mlp": 0.30078125, "step": 3996, "time_per_iteration": 3.2051961421966553 }, { "auxiliary_loss_clip": 0.01161832, "auxiliary_loss_mlp": 0.01038853, "balance_loss_clip": 1.02188945, "balance_loss_mlp": 1.05158687, "epoch": 0.24031264091387344, "flos": 20631075724800.0, "grad_norm": 2.475231639055125, "language_loss": 0.69722098, "learning_rate": 3.4569540579830853e-06, "loss": 0.71922779, "num_input_tokens_seen": 86064355, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.92578125, "step": 3997, "time_per_iteration": 2.5475635528564453 }, { "auxiliary_loss_clip": 0.01151314, "auxiliary_loss_mlp": 0.01046643, "balance_loss_clip": 1.02946544, "balance_loss_mlp": 1.0519309, "epoch": 0.2403727641665414, "flos": 20886256920960.0, "grad_norm": 1.9285184082567497, "language_loss": 0.86860847, "learning_rate": 3.456695197158815e-06, "loss": 0.89058799, "num_input_tokens_seen": 86081340, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.90234375, "step": 3998, "time_per_iteration": 2.5530261993408203 }, { "auxiliary_loss_clip": 0.01152246, "auxiliary_loss_mlp": 0.01036064, "balance_loss_clip": 1.01943386, "balance_loss_mlp": 1.04766715, "epoch": 0.24043288741920937, "flos": 22820046451200.0, "grad_norm": 1.8160640627289637, "language_loss": 0.75682151, "learning_rate": 3.4564362843488403e-06, "loss": 0.77870452, "num_input_tokens_seen": 86102260, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.953125, "step": 3999, "time_per_iteration": 2.5795345306396484 }, { "auxiliary_loss_clip": 0.01158636, "auxiliary_loss_mlp": 0.01041747, "balance_loss_clip": 1.02540338, "balance_loss_mlp": 1.05185366, "epoch": 0.24049301067187734, "flos": 27959652823680.0, "grad_norm": 2.0866219745639745, "language_loss": 0.7187798, "learning_rate": 3.4561773195624015e-06, "loss": 0.74078357, "num_input_tokens_seen": 86123400, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8828125, "step": 4000, "time_per_iteration": 2.620295763015747 }, { "auxiliary_loss_clip": 0.0115821, "auxiliary_loss_mlp": 0.01039904, "balance_loss_clip": 1.0221653, "balance_loss_mlp": 1.05344415, "epoch": 0.24055313392454533, "flos": 27451409333760.0, "grad_norm": 1.6951928855370526, "language_loss": 0.66633993, "learning_rate": 3.4559183028087394e-06, "loss": 0.68832111, "num_input_tokens_seen": 86144060, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.95703125, "step": 4001, "time_per_iteration": 2.615020275115967 }, { "auxiliary_loss_clip": 0.01143076, "auxiliary_loss_mlp": 0.01039112, "balance_loss_clip": 1.02129054, "balance_loss_mlp": 1.05134368, "epoch": 0.2406132571772133, "flos": 25556618995200.0, "grad_norm": 1.6879542123310562, "language_loss": 0.82691813, "learning_rate": 3.4556592340970983e-06, "loss": 0.84873998, "num_input_tokens_seen": 86163005, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.91796875, "step": 4002, "time_per_iteration": 2.6898462772369385 }, { "auxiliary_loss_clip": 0.01171119, "auxiliary_loss_mlp": 0.01040749, "balance_loss_clip": 1.02400041, "balance_loss_mlp": 1.05326092, "epoch": 0.24067338042988126, "flos": 24791398629120.0, "grad_norm": 2.134116420839305, "language_loss": 0.82633221, "learning_rate": 3.4554001134367237e-06, "loss": 0.8484509, "num_input_tokens_seen": 86182580, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.90625, "step": 4003, "time_per_iteration": 2.5625123977661133 }, { "auxiliary_loss_clip": 0.01159741, "auxiliary_loss_mlp": 0.01035268, "balance_loss_clip": 1.01833999, "balance_loss_mlp": 1.05090189, "epoch": 0.24073350368254923, "flos": 21177923356800.0, "grad_norm": 2.990339083306273, "language_loss": 0.87384415, "learning_rate": 3.4551409408368627e-06, "loss": 0.89579427, "num_input_tokens_seen": 86200665, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90625, "step": 4004, "time_per_iteration": 3.994446039199829 }, { "auxiliary_loss_clip": 0.01159977, "auxiliary_loss_mlp": 0.01050062, "balance_loss_clip": 1.03350365, "balance_loss_mlp": 1.04846275, "epoch": 0.2407936269352172, "flos": 22494300986880.0, "grad_norm": 2.115704050198582, "language_loss": 0.78056407, "learning_rate": 3.4548817163067643e-06, "loss": 0.80266446, "num_input_tokens_seen": 86221640, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.93359375, "step": 4005, "time_per_iteration": 2.600534200668335 }, { "auxiliary_loss_clip": 0.01176342, "auxiliary_loss_mlp": 0.01036612, "balance_loss_clip": 1.02034032, "balance_loss_mlp": 1.0512774, "epoch": 0.24085375018788516, "flos": 18551129754240.0, "grad_norm": 1.6701838003113132, "language_loss": 0.78950703, "learning_rate": 3.4546224398556804e-06, "loss": 0.81163657, "num_input_tokens_seen": 86240795, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.890625, "step": 4006, "time_per_iteration": 2.5902862548828125 }, { "auxiliary_loss_clip": 0.01164923, "auxiliary_loss_mlp": 0.01288844, "balance_loss_clip": 1.02287078, "balance_loss_mlp": 1.05068576, "epoch": 0.24091387344055312, "flos": 24170539023360.0, "grad_norm": 1.6442879001162947, "language_loss": 0.70838958, "learning_rate": 3.4543631114928627e-06, "loss": 0.7329272, "num_input_tokens_seen": 86262000, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.9609375, "step": 4007, "time_per_iteration": 3.984933376312256 }, { "auxiliary_loss_clip": 0.01167011, "auxiliary_loss_mlp": 0.01042421, "balance_loss_clip": 1.02661407, "balance_loss_mlp": 1.05145144, "epoch": 0.24097399669322112, "flos": 11036319615360.0, "grad_norm": 1.7520033587944552, "language_loss": 0.75817585, "learning_rate": 3.454103731227567e-06, "loss": 0.7802701, "num_input_tokens_seen": 86279680, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.88671875, "step": 4008, "time_per_iteration": 4.075334548950195 }, { "auxiliary_loss_clip": 0.01149839, "auxiliary_loss_mlp": 0.01036267, "balance_loss_clip": 1.01962507, "balance_loss_mlp": 1.05034137, "epoch": 0.24103411994588908, "flos": 17165085696000.0, "grad_norm": 2.535940872419007, "language_loss": 0.73935026, "learning_rate": 3.4538442990690494e-06, "loss": 0.76121128, "num_input_tokens_seen": 86297180, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.90625, "step": 4009, "time_per_iteration": 2.4919421672821045 }, { "auxiliary_loss_clip": 0.01152824, "auxiliary_loss_mlp": 0.01045176, "balance_loss_clip": 1.02926195, "balance_loss_mlp": 1.05452037, "epoch": 0.24109424319855705, "flos": 20667956014080.0, "grad_norm": 1.8461904511938145, "language_loss": 0.7977975, "learning_rate": 3.4535848150265684e-06, "loss": 0.81977749, "num_input_tokens_seen": 86317660, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.89453125, "step": 4010, "time_per_iteration": 3.988095760345459 }, { "auxiliary_loss_clip": 0.01159455, "auxiliary_loss_mlp": 0.01049646, "balance_loss_clip": 1.03078687, "balance_loss_mlp": 1.04872501, "epoch": 0.241154366451225, "flos": 28181796485760.0, "grad_norm": 2.011608939091887, "language_loss": 0.70680225, "learning_rate": 3.453325279109385e-06, "loss": 0.72889328, "num_input_tokens_seen": 86338325, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.921875, "step": 4011, "time_per_iteration": 2.5536577701568604 }, { "auxiliary_loss_clip": 0.01195016, "auxiliary_loss_mlp": 0.01041896, "balance_loss_clip": 1.02586198, "balance_loss_mlp": 1.05003476, "epoch": 0.24121448970389298, "flos": 21689722293120.0, "grad_norm": 1.666663475532905, "language_loss": 0.68823075, "learning_rate": 3.45306569132676e-06, "loss": 0.7105999, "num_input_tokens_seen": 86357615, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.90625, "step": 4012, "time_per_iteration": 2.6260452270507812 }, { "auxiliary_loss_clip": 0.01153932, "auxiliary_loss_mlp": 0.01043257, "balance_loss_clip": 1.0248754, "balance_loss_mlp": 1.05299306, "epoch": 0.24127461295656094, "flos": 39676191269760.0, "grad_norm": 1.7533056857874885, "language_loss": 0.73796457, "learning_rate": 3.4528060516879587e-06, "loss": 0.75993645, "num_input_tokens_seen": 86380355, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.91796875, "step": 4013, "time_per_iteration": 2.7148244380950928 }, { "auxiliary_loss_clip": 0.01144798, "auxiliary_loss_mlp": 0.01036917, "balance_loss_clip": 1.02066851, "balance_loss_mlp": 1.05326653, "epoch": 0.2413347362092289, "flos": 19135863256320.0, "grad_norm": 1.9178890259392651, "language_loss": 0.88191295, "learning_rate": 3.4525463602022465e-06, "loss": 0.90373003, "num_input_tokens_seen": 86399125, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.9140625, "step": 4014, "time_per_iteration": 2.565311908721924 }, { "auxiliary_loss_clip": 0.01166834, "auxiliary_loss_mlp": 0.01049367, "balance_loss_clip": 1.03147364, "balance_loss_mlp": 1.05300868, "epoch": 0.2413948594618969, "flos": 26939430829440.0, "grad_norm": 1.9833535298146954, "language_loss": 0.94796389, "learning_rate": 3.452286616878891e-06, "loss": 0.97012591, "num_input_tokens_seen": 86418625, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.953125, "step": 4015, "time_per_iteration": 2.5829381942749023 }, { "auxiliary_loss_clip": 0.01162645, "auxiliary_loss_mlp": 0.01042612, "balance_loss_clip": 1.02670908, "balance_loss_mlp": 1.05314183, "epoch": 0.24145498271456486, "flos": 25228108183680.0, "grad_norm": 2.043520930124663, "language_loss": 0.82539451, "learning_rate": 3.4520268217271616e-06, "loss": 0.84744704, "num_input_tokens_seen": 86438375, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.9140625, "step": 4016, "time_per_iteration": 2.670649766921997 }, { "auxiliary_loss_clip": 0.01166797, "auxiliary_loss_mlp": 0.01048034, "balance_loss_clip": 1.0317862, "balance_loss_mlp": 1.05145276, "epoch": 0.24151510596723283, "flos": 40661759617920.0, "grad_norm": 2.0173858682802885, "language_loss": 0.69000679, "learning_rate": 3.4517669747563305e-06, "loss": 0.7121551, "num_input_tokens_seen": 86463230, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8828125, "step": 4017, "time_per_iteration": 2.764394760131836 }, { "auxiliary_loss_clip": 0.01157207, "auxiliary_loss_mlp": 0.01054581, "balance_loss_clip": 1.03703308, "balance_loss_mlp": 1.0538156, "epoch": 0.2415752292199008, "flos": 18146667634560.0, "grad_norm": 1.8545068251662373, "language_loss": 0.84413624, "learning_rate": 3.4515070759756704e-06, "loss": 0.86625415, "num_input_tokens_seen": 86481230, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9453125, "step": 4018, "time_per_iteration": 2.555494785308838 }, { "auxiliary_loss_clip": 0.01069819, "auxiliary_loss_mlp": 0.01014392, "balance_loss_clip": 1.01223421, "balance_loss_mlp": 1.0313766, "epoch": 0.24163535247256876, "flos": 67288409792640.0, "grad_norm": 0.8771587976440166, "language_loss": 0.60700655, "learning_rate": 3.4512471253944563e-06, "loss": 0.62784868, "num_input_tokens_seen": 86541260, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.296875, "step": 4019, "time_per_iteration": 3.261369466781616 }, { "auxiliary_loss_clip": 0.01163947, "auxiliary_loss_mlp": 0.0104355, "balance_loss_clip": 1.02783251, "balance_loss_mlp": 1.0541203, "epoch": 0.24169547572523672, "flos": 24929941386240.0, "grad_norm": 2.2550105249044243, "language_loss": 0.73597634, "learning_rate": 3.4509871230219653e-06, "loss": 0.75805134, "num_input_tokens_seen": 86559580, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.91796875, "step": 4020, "time_per_iteration": 2.637791633605957 }, { "auxiliary_loss_clip": 0.01173454, "auxiliary_loss_mlp": 0.01041202, "balance_loss_clip": 1.02465594, "balance_loss_mlp": 1.05604231, "epoch": 0.24175559897790472, "flos": 18728312567040.0, "grad_norm": 1.7762611971022484, "language_loss": 0.81805408, "learning_rate": 3.4507270688674767e-06, "loss": 0.84020066, "num_input_tokens_seen": 86577560, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.90625, "step": 4021, "time_per_iteration": 2.657625436782837 }, { "auxiliary_loss_clip": 0.01153359, "auxiliary_loss_mlp": 0.01054013, "balance_loss_clip": 1.03521407, "balance_loss_mlp": 1.05797827, "epoch": 0.24181572223057268, "flos": 23039281111680.0, "grad_norm": 2.0320187393387887, "language_loss": 0.76357853, "learning_rate": 3.45046696294027e-06, "loss": 0.78565228, "num_input_tokens_seen": 86595350, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.953125, "step": 4022, "time_per_iteration": 2.5608506202697754 }, { "auxiliary_loss_clip": 0.01181673, "auxiliary_loss_mlp": 0.01045364, "balance_loss_clip": 1.02716112, "balance_loss_mlp": 1.05706477, "epoch": 0.24187584548324065, "flos": 20376145923840.0, "grad_norm": 2.4341006569995165, "language_loss": 0.75161338, "learning_rate": 3.4502068052496283e-06, "loss": 0.77388376, "num_input_tokens_seen": 86614805, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9765625, "step": 4023, "time_per_iteration": 2.644993305206299 }, { "auxiliary_loss_clip": 0.01165164, "auxiliary_loss_mlp": 0.01047734, "balance_loss_clip": 1.03057933, "balance_loss_mlp": 1.05617654, "epoch": 0.2419359687359086, "flos": 21397517153280.0, "grad_norm": 2.1997623635038446, "language_loss": 0.82417154, "learning_rate": 3.449946595804837e-06, "loss": 0.84630048, "num_input_tokens_seen": 86633700, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.91015625, "step": 4024, "time_per_iteration": 2.5776736736297607 }, { "auxiliary_loss_clip": 0.01158045, "auxiliary_loss_mlp": 0.01044769, "balance_loss_clip": 1.02734077, "balance_loss_mlp": 1.05842304, "epoch": 0.24199609198857658, "flos": 18369385914240.0, "grad_norm": 2.0596953028795224, "language_loss": 0.85879719, "learning_rate": 3.4496863346151805e-06, "loss": 0.88082528, "num_input_tokens_seen": 86650905, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.90625, "step": 4025, "time_per_iteration": 2.6747775077819824 }, { "auxiliary_loss_clip": 0.01176901, "auxiliary_loss_mlp": 0.01051276, "balance_loss_clip": 1.03395534, "balance_loss_mlp": 1.05372512, "epoch": 0.24205621524124454, "flos": 19463871277440.0, "grad_norm": 1.9421683358433688, "language_loss": 0.71430218, "learning_rate": 3.449426021689949e-06, "loss": 0.73658395, "num_input_tokens_seen": 86669185, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.95703125, "step": 4026, "time_per_iteration": 2.572981119155884 }, { "auxiliary_loss_clip": 0.01155821, "auxiliary_loss_mlp": 0.01044122, "balance_loss_clip": 1.02759922, "balance_loss_mlp": 1.05627215, "epoch": 0.2421163384939125, "flos": 14976330451200.0, "grad_norm": 1.8808677086604355, "language_loss": 0.64209509, "learning_rate": 3.44916565703843e-06, "loss": 0.66409445, "num_input_tokens_seen": 86686805, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.90234375, "step": 4027, "time_per_iteration": 2.603243827819824 }, { "auxiliary_loss_clip": 0.01160923, "auxiliary_loss_mlp": 0.01288686, "balance_loss_clip": 1.02470446, "balance_loss_mlp": 1.05729389, "epoch": 0.2421764617465805, "flos": 18662057930880.0, "grad_norm": 3.6924657608357676, "language_loss": 0.70431083, "learning_rate": 3.4489052406699167e-06, "loss": 0.72880697, "num_input_tokens_seen": 86705520, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.94921875, "step": 4028, "time_per_iteration": 2.536159038543701 }, { "auxiliary_loss_clip": 0.01074423, "auxiliary_loss_mlp": 0.01000949, "balance_loss_clip": 0.99873126, "balance_loss_mlp": 1.03542852, "epoch": 0.24223658499924847, "flos": 64347327164160.0, "grad_norm": 0.8500918689851068, "language_loss": 0.55273879, "learning_rate": 3.4486447725937024e-06, "loss": 0.57349241, "num_input_tokens_seen": 86767320, "router_z_loss_clip": 0.0222168, "router_z_loss_mlp": 0.30078125, "step": 4029, "time_per_iteration": 3.18526029586792 }, { "auxiliary_loss_clip": 0.01159923, "auxiliary_loss_mlp": 0.01046728, "balance_loss_clip": 1.02819109, "balance_loss_mlp": 1.05764341, "epoch": 0.24229670825191643, "flos": 25775243124480.0, "grad_norm": 1.624210784640827, "language_loss": 0.74072468, "learning_rate": 3.448384252819083e-06, "loss": 0.76279122, "num_input_tokens_seen": 86788110, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9296875, "step": 4030, "time_per_iteration": 2.6743369102478027 }, { "auxiliary_loss_clip": 0.01150792, "auxiliary_loss_mlp": 0.01055143, "balance_loss_clip": 1.03747582, "balance_loss_mlp": 1.05728364, "epoch": 0.2423568315045844, "flos": 20667094087680.0, "grad_norm": 2.1026323017628825, "language_loss": 0.76261044, "learning_rate": 3.4481236813553544e-06, "loss": 0.78466988, "num_input_tokens_seen": 86807640, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.93359375, "step": 4031, "time_per_iteration": 2.637270927429199 }, { "auxiliary_loss_clip": 0.01171516, "auxiliary_loss_mlp": 0.01294827, "balance_loss_clip": 1.02930069, "balance_loss_mlp": 1.05586183, "epoch": 0.24241695475725236, "flos": 22416805393920.0, "grad_norm": 3.938103689468676, "language_loss": 0.64952219, "learning_rate": 3.447863058211817e-06, "loss": 0.67418563, "num_input_tokens_seen": 86826795, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9765625, "step": 4032, "time_per_iteration": 2.728907823562622 }, { "auxiliary_loss_clip": 0.01154419, "auxiliary_loss_mlp": 0.01049904, "balance_loss_clip": 1.0323807, "balance_loss_mlp": 1.0533216, "epoch": 0.24247707800992033, "flos": 17128995505920.0, "grad_norm": 2.0489755653242963, "language_loss": 0.8177458, "learning_rate": 3.447602383397772e-06, "loss": 0.83978903, "num_input_tokens_seen": 86843175, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.921875, "step": 4033, "time_per_iteration": 2.6160025596618652 }, { "auxiliary_loss_clip": 0.01173641, "auxiliary_loss_mlp": 0.0104488, "balance_loss_clip": 1.02743983, "balance_loss_mlp": 1.05468321, "epoch": 0.2425372012625883, "flos": 31613743399680.0, "grad_norm": 2.28838045621385, "language_loss": 0.69490808, "learning_rate": 3.447341656922521e-06, "loss": 0.71709335, "num_input_tokens_seen": 86863185, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9140625, "step": 4034, "time_per_iteration": 2.747624635696411 }, { "auxiliary_loss_clip": 0.01165853, "auxiliary_loss_mlp": 0.0103967, "balance_loss_clip": 1.02067995, "balance_loss_mlp": 1.05416822, "epoch": 0.24259732451525629, "flos": 16326032924160.0, "grad_norm": 2.2147558722682725, "language_loss": 0.96765119, "learning_rate": 3.4470808787953693e-06, "loss": 0.9897064, "num_input_tokens_seen": 86880040, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.9375, "step": 4035, "time_per_iteration": 2.5380489826202393 }, { "auxiliary_loss_clip": 0.01161822, "auxiliary_loss_mlp": 0.01046704, "balance_loss_clip": 1.03107572, "balance_loss_mlp": 1.05507207, "epoch": 0.24265744776792425, "flos": 22856639431680.0, "grad_norm": 1.6320703297130952, "language_loss": 0.7800324, "learning_rate": 3.4468200490256236e-06, "loss": 0.80211759, "num_input_tokens_seen": 86900610, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.88671875, "step": 4036, "time_per_iteration": 2.6477131843566895 }, { "auxiliary_loss_clip": 0.01159952, "auxiliary_loss_mlp": 0.01047804, "balance_loss_clip": 1.03004169, "balance_loss_mlp": 1.05642819, "epoch": 0.24271757102059222, "flos": 21871573873920.0, "grad_norm": 1.781690057863802, "language_loss": 0.7409749, "learning_rate": 3.4465591676225916e-06, "loss": 0.7630524, "num_input_tokens_seen": 86919385, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9453125, "step": 4037, "time_per_iteration": 2.5755441188812256 }, { "auxiliary_loss_clip": 0.01176273, "auxiliary_loss_mlp": 0.01045504, "balance_loss_clip": 1.02770567, "balance_loss_mlp": 1.05450797, "epoch": 0.24277769427326018, "flos": 19208582340480.0, "grad_norm": 2.3484719180073506, "language_loss": 0.76273566, "learning_rate": 3.446298234595584e-06, "loss": 0.78495347, "num_input_tokens_seen": 86938885, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9453125, "step": 4038, "time_per_iteration": 2.6212494373321533 }, { "auxiliary_loss_clip": 0.01156846, "auxiliary_loss_mlp": 0.01046113, "balance_loss_clip": 1.02742088, "balance_loss_mlp": 1.05374694, "epoch": 0.24283781752592815, "flos": 19499889640320.0, "grad_norm": 1.9034025006740154, "language_loss": 0.72013384, "learning_rate": 3.4460372499539133e-06, "loss": 0.74216342, "num_input_tokens_seen": 86957705, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.9375, "step": 4039, "time_per_iteration": 2.5081207752227783 }, { "auxiliary_loss_clip": 0.01155076, "auxiliary_loss_mlp": 0.01043677, "balance_loss_clip": 1.02674925, "balance_loss_mlp": 1.05420756, "epoch": 0.2428979407785961, "flos": 19902196944000.0, "grad_norm": 1.7935811094400442, "language_loss": 0.78507137, "learning_rate": 3.4457762137068923e-06, "loss": 0.80705893, "num_input_tokens_seen": 86975845, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.91796875, "step": 4040, "time_per_iteration": 2.561220169067383 }, { "auxiliary_loss_clip": 0.01177594, "auxiliary_loss_mlp": 0.01035658, "balance_loss_clip": 1.02018499, "balance_loss_mlp": 1.0534904, "epoch": 0.2429580640312641, "flos": 24715878284160.0, "grad_norm": 1.7207664455773761, "language_loss": 0.80231559, "learning_rate": 3.4455151258638377e-06, "loss": 0.82444817, "num_input_tokens_seen": 86994800, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8828125, "step": 4041, "time_per_iteration": 2.5787434577941895 }, { "auxiliary_loss_clip": 0.01172604, "auxiliary_loss_mlp": 0.01049112, "balance_loss_clip": 1.03245878, "balance_loss_mlp": 1.05483103, "epoch": 0.24301818728393207, "flos": 25630343660160.0, "grad_norm": 2.4648056163908425, "language_loss": 0.76606178, "learning_rate": 3.445253986434066e-06, "loss": 0.78827894, "num_input_tokens_seen": 87016845, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.91015625, "step": 4042, "time_per_iteration": 2.6168837547302246 }, { "auxiliary_loss_clip": 0.01206895, "auxiliary_loss_mlp": 0.01032996, "balance_loss_clip": 1.0175699, "balance_loss_mlp": 1.05419302, "epoch": 0.24307831053660003, "flos": 26141388410880.0, "grad_norm": 2.363864205042994, "language_loss": 0.81610036, "learning_rate": 3.4449927954268977e-06, "loss": 0.83849931, "num_input_tokens_seen": 87036270, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.890625, "step": 4043, "time_per_iteration": 2.651256799697876 }, { "auxiliary_loss_clip": 0.01164549, "auxiliary_loss_mlp": 0.01038422, "balance_loss_clip": 1.01987326, "balance_loss_mlp": 1.05104494, "epoch": 0.243138433789268, "flos": 14972415868800.0, "grad_norm": 2.216396868894326, "language_loss": 0.72706014, "learning_rate": 3.444731552851653e-06, "loss": 0.74908984, "num_input_tokens_seen": 87049920, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.953125, "step": 4044, "time_per_iteration": 2.5546696186065674 }, { "auxiliary_loss_clip": 0.01158414, "auxiliary_loss_mlp": 0.01044148, "balance_loss_clip": 1.02698207, "balance_loss_mlp": 1.05822265, "epoch": 0.24319855704193596, "flos": 25191694771200.0, "grad_norm": 1.8454511484844236, "language_loss": 0.83419526, "learning_rate": 3.4444702587176556e-06, "loss": 0.85622084, "num_input_tokens_seen": 87068230, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9140625, "step": 4045, "time_per_iteration": 3.9556491374969482 }, { "auxiliary_loss_clip": 0.01202668, "auxiliary_loss_mlp": 0.0104755, "balance_loss_clip": 1.0308969, "balance_loss_mlp": 1.05684829, "epoch": 0.24325868029460393, "flos": 22127221946880.0, "grad_norm": 1.6557105692985201, "language_loss": 0.86371964, "learning_rate": 3.4442089130342303e-06, "loss": 0.88622183, "num_input_tokens_seen": 87086435, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.921875, "step": 4046, "time_per_iteration": 2.61226487159729 }, { "auxiliary_loss_clip": 0.01159175, "auxiliary_loss_mlp": 0.01039798, "balance_loss_clip": 1.02263141, "balance_loss_mlp": 1.05225778, "epoch": 0.2433188035472719, "flos": 23582106420480.0, "grad_norm": 1.7910296656342704, "language_loss": 0.72355503, "learning_rate": 3.443947515810704e-06, "loss": 0.74554479, "num_input_tokens_seen": 87105340, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.890625, "step": 4047, "time_per_iteration": 2.622714042663574 }, { "auxiliary_loss_clip": 0.01156132, "auxiliary_loss_mlp": 0.0104292, "balance_loss_clip": 1.02524161, "balance_loss_mlp": 1.0541048, "epoch": 0.2433789267999399, "flos": 24462815990400.0, "grad_norm": 2.2241679570544974, "language_loss": 0.7286917, "learning_rate": 3.4436860670564053e-06, "loss": 0.75068223, "num_input_tokens_seen": 87125780, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9296875, "step": 4048, "time_per_iteration": 2.587916135787964 }, { "auxiliary_loss_clip": 0.01171114, "auxiliary_loss_mlp": 0.01041932, "balance_loss_clip": 1.02565992, "balance_loss_mlp": 1.05476069, "epoch": 0.24343905005260785, "flos": 16727909264640.0, "grad_norm": 1.8113013093817223, "language_loss": 0.7294811, "learning_rate": 3.443424566780664e-06, "loss": 0.75161153, "num_input_tokens_seen": 87144470, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8984375, "step": 4049, "time_per_iteration": 3.9879040718078613 }, { "auxiliary_loss_clip": 0.01168411, "auxiliary_loss_mlp": 0.01044649, "balance_loss_clip": 1.02860296, "balance_loss_mlp": 1.05238366, "epoch": 0.24349917330527582, "flos": 20043756443520.0, "grad_norm": 1.6041284893712944, "language_loss": 0.73381782, "learning_rate": 3.4431630149928126e-06, "loss": 0.75594842, "num_input_tokens_seen": 87162830, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.890625, "step": 4050, "time_per_iteration": 4.050780296325684 }, { "auxiliary_loss_clip": 0.01157747, "auxiliary_loss_mlp": 0.01041209, "balance_loss_clip": 1.02529442, "balance_loss_mlp": 1.05123353, "epoch": 0.24355929655794378, "flos": 17420554200960.0, "grad_norm": 2.37425368249703, "language_loss": 0.74644744, "learning_rate": 3.442901411702186e-06, "loss": 0.76843703, "num_input_tokens_seen": 87180905, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.88671875, "step": 4051, "time_per_iteration": 2.4990763664245605 }, { "auxiliary_loss_clip": 0.01176442, "auxiliary_loss_mlp": 0.01040595, "balance_loss_clip": 1.02401304, "balance_loss_mlp": 1.05019927, "epoch": 0.24361941981061175, "flos": 25410929431680.0, "grad_norm": 1.985922270804705, "language_loss": 0.70761132, "learning_rate": 3.44263975691812e-06, "loss": 0.72978169, "num_input_tokens_seen": 87202290, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.90234375, "step": 4052, "time_per_iteration": 4.0366833209991455 }, { "auxiliary_loss_clip": 0.01161063, "auxiliary_loss_mlp": 0.01047155, "balance_loss_clip": 1.03019106, "balance_loss_mlp": 1.05224156, "epoch": 0.2436795430632797, "flos": 22820800636800.0, "grad_norm": 1.579309817897152, "language_loss": 0.80631828, "learning_rate": 3.4423780506499513e-06, "loss": 0.82840049, "num_input_tokens_seen": 87221650, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.91015625, "step": 4053, "time_per_iteration": 2.6106882095336914 }, { "auxiliary_loss_clip": 0.01147193, "auxiliary_loss_mlp": 0.01038693, "balance_loss_clip": 1.02201533, "balance_loss_mlp": 1.05387402, "epoch": 0.2437396663159477, "flos": 15157786982400.0, "grad_norm": 1.6690967212375616, "language_loss": 0.77915168, "learning_rate": 3.44211629290702e-06, "loss": 0.80101055, "num_input_tokens_seen": 87238515, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.93359375, "step": 4054, "time_per_iteration": 2.512863874435425 }, { "auxiliary_loss_clip": 0.01167148, "auxiliary_loss_mlp": 0.01054864, "balance_loss_clip": 1.0387466, "balance_loss_mlp": 1.04958129, "epoch": 0.24379978956861567, "flos": 22091131756800.0, "grad_norm": 1.6343652146796637, "language_loss": 0.83988321, "learning_rate": 3.441854483698668e-06, "loss": 0.86210334, "num_input_tokens_seen": 87256290, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.90625, "step": 4055, "time_per_iteration": 2.6679458618164062 }, { "auxiliary_loss_clip": 0.01172314, "auxiliary_loss_mlp": 0.01044356, "balance_loss_clip": 1.02695143, "balance_loss_mlp": 1.04975939, "epoch": 0.24385991282128364, "flos": 31467766527360.0, "grad_norm": 2.8637285572455977, "language_loss": 0.55124825, "learning_rate": 3.441592623034239e-06, "loss": 0.57341492, "num_input_tokens_seen": 87277085, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.95703125, "step": 4056, "time_per_iteration": 2.6709799766540527 }, { "auxiliary_loss_clip": 0.01155264, "auxiliary_loss_mlp": 0.01050511, "balance_loss_clip": 1.03297496, "balance_loss_mlp": 1.05319858, "epoch": 0.2439200360739516, "flos": 23838795987840.0, "grad_norm": 2.467782292725026, "language_loss": 0.79762387, "learning_rate": 3.4413307109230772e-06, "loss": 0.81968164, "num_input_tokens_seen": 87293020, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.93359375, "step": 4057, "time_per_iteration": 2.6648013591766357 }, { "auxiliary_loss_clip": 0.01147461, "auxiliary_loss_mlp": 0.01040507, "balance_loss_clip": 1.02430701, "balance_loss_mlp": 1.04972422, "epoch": 0.24398015932661957, "flos": 19169978198400.0, "grad_norm": 1.9214397652846604, "language_loss": 0.79596251, "learning_rate": 3.44106874737453e-06, "loss": 0.81784219, "num_input_tokens_seen": 87311445, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.88671875, "step": 4058, "time_per_iteration": 2.5537092685699463 }, { "auxiliary_loss_clip": 0.01145364, "auxiliary_loss_mlp": 0.01045986, "balance_loss_clip": 1.02992857, "balance_loss_mlp": 1.05038357, "epoch": 0.24404028257928753, "flos": 25262474520960.0, "grad_norm": 1.6360579075907704, "language_loss": 0.85306478, "learning_rate": 3.440806732397945e-06, "loss": 0.8749783, "num_input_tokens_seen": 87332055, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.94921875, "step": 4059, "time_per_iteration": 2.716830253601074 }, { "auxiliary_loss_clip": 0.01147949, "auxiliary_loss_mlp": 0.01035851, "balance_loss_clip": 1.02030587, "balance_loss_mlp": 1.05017793, "epoch": 0.2441004058319555, "flos": 26467600752000.0, "grad_norm": 1.6597146169823878, "language_loss": 0.74337494, "learning_rate": 3.4405446660026753e-06, "loss": 0.76521289, "num_input_tokens_seen": 87351295, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.88671875, "step": 4060, "time_per_iteration": 2.5822510719299316 }, { "auxiliary_loss_clip": 0.01175147, "auxiliary_loss_mlp": 0.01049601, "balance_loss_clip": 1.02977645, "balance_loss_mlp": 1.05204785, "epoch": 0.2441605290846235, "flos": 26760524163840.0, "grad_norm": 1.7407489002500356, "language_loss": 0.73615801, "learning_rate": 3.4402825481980707e-06, "loss": 0.75840545, "num_input_tokens_seen": 87370650, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.9609375, "step": 4061, "time_per_iteration": 2.6530346870422363 }, { "auxiliary_loss_clip": 0.01141337, "auxiliary_loss_mlp": 0.01042244, "balance_loss_clip": 1.02635336, "balance_loss_mlp": 1.05248511, "epoch": 0.24422065233729146, "flos": 21105850717440.0, "grad_norm": 2.0443030110692817, "language_loss": 0.76065218, "learning_rate": 3.4400203789934876e-06, "loss": 0.78248799, "num_input_tokens_seen": 87389020, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.88671875, "step": 4062, "time_per_iteration": 2.55312442779541 }, { "auxiliary_loss_clip": 0.01166372, "auxiliary_loss_mlp": 0.01041715, "balance_loss_clip": 1.02508521, "balance_loss_mlp": 1.05216217, "epoch": 0.24428077558995942, "flos": 25263156879360.0, "grad_norm": 1.644609247407683, "language_loss": 0.85262209, "learning_rate": 3.4397581583982814e-06, "loss": 0.87470293, "num_input_tokens_seen": 87409695, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.87109375, "step": 4063, "time_per_iteration": 2.7894768714904785 }, { "auxiliary_loss_clip": 0.01164775, "auxiliary_loss_mlp": 0.010414, "balance_loss_clip": 1.02457941, "balance_loss_mlp": 1.05240738, "epoch": 0.24434089884262739, "flos": 20485278420480.0, "grad_norm": 2.6998931376136728, "language_loss": 0.68445885, "learning_rate": 3.43949588642181e-06, "loss": 0.70652068, "num_input_tokens_seen": 87428250, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.94140625, "step": 4064, "time_per_iteration": 2.544173002243042 }, { "auxiliary_loss_clip": 0.01154049, "auxiliary_loss_mlp": 0.01040809, "balance_loss_clip": 1.0232141, "balance_loss_mlp": 1.05017066, "epoch": 0.24440102209529535, "flos": 23621895711360.0, "grad_norm": 1.6391995493284224, "language_loss": 0.70148098, "learning_rate": 3.439233563073433e-06, "loss": 0.7234295, "num_input_tokens_seen": 87449380, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.94921875, "step": 4065, "time_per_iteration": 2.6215696334838867 }, { "auxiliary_loss_clip": 0.01147503, "auxiliary_loss_mlp": 0.01054596, "balance_loss_clip": 1.03511715, "balance_loss_mlp": 1.05161476, "epoch": 0.24446114534796332, "flos": 20554729367040.0, "grad_norm": 1.880349916972079, "language_loss": 0.84377664, "learning_rate": 3.4389711883625124e-06, "loss": 0.86579764, "num_input_tokens_seen": 87465365, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.95703125, "step": 4066, "time_per_iteration": 2.5657289028167725 }, { "auxiliary_loss_clip": 0.01065456, "auxiliary_loss_mlp": 0.0101223, "balance_loss_clip": 1.00970328, "balance_loss_mlp": 1.03539622, "epoch": 0.24452126860063128, "flos": 60389575009920.0, "grad_norm": 0.7379917797199619, "language_loss": 0.52302969, "learning_rate": 3.4387087622984114e-06, "loss": 0.54380655, "num_input_tokens_seen": 87522525, "router_z_loss_clip": 0.02526855, "router_z_loss_mlp": 0.30078125, "step": 4067, "time_per_iteration": 3.032477617263794 }, { "auxiliary_loss_clip": 0.01163727, "auxiliary_loss_mlp": 0.01042482, "balance_loss_clip": 1.02474332, "balance_loss_mlp": 1.05290496, "epoch": 0.24458139185329927, "flos": 15121660878720.0, "grad_norm": 4.145279701984695, "language_loss": 0.71906507, "learning_rate": 3.4384462848904956e-06, "loss": 0.74112713, "num_input_tokens_seen": 87539170, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9296875, "step": 4068, "time_per_iteration": 2.5937464237213135 }, { "auxiliary_loss_clip": 0.01178554, "auxiliary_loss_mlp": 0.01041851, "balance_loss_clip": 1.02418447, "balance_loss_mlp": 1.05189681, "epoch": 0.24464151510596724, "flos": 27998723842560.0, "grad_norm": 1.43220590983524, "language_loss": 0.77858841, "learning_rate": 3.438183756148132e-06, "loss": 0.80079246, "num_input_tokens_seen": 87558875, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.90625, "step": 4069, "time_per_iteration": 2.654435634613037 }, { "auxiliary_loss_clip": 0.01159175, "auxiliary_loss_mlp": 0.0104748, "balance_loss_clip": 1.03002763, "balance_loss_mlp": 1.05163264, "epoch": 0.2447016383586352, "flos": 19792884879360.0, "grad_norm": 2.068385944010481, "language_loss": 0.80739021, "learning_rate": 3.4379211760806895e-06, "loss": 0.82945681, "num_input_tokens_seen": 87576485, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8984375, "step": 4070, "time_per_iteration": 2.630312204360962 }, { "auxiliary_loss_clip": 0.01149159, "auxiliary_loss_mlp": 0.01044826, "balance_loss_clip": 1.02845907, "balance_loss_mlp": 1.04930222, "epoch": 0.24476176161130317, "flos": 26067340523520.0, "grad_norm": 1.6962407356455842, "language_loss": 0.84037638, "learning_rate": 3.4376585446975394e-06, "loss": 0.86231625, "num_input_tokens_seen": 87598620, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.90625, "step": 4071, "time_per_iteration": 2.6020290851593018 }, { "auxiliary_loss_clip": 0.01165086, "auxiliary_loss_mlp": 0.01045876, "balance_loss_clip": 1.02767229, "balance_loss_mlp": 1.05101228, "epoch": 0.24482188486397113, "flos": 18843550375680.0, "grad_norm": 2.3630250294654185, "language_loss": 0.79864287, "learning_rate": 3.4373958620080535e-06, "loss": 0.8207525, "num_input_tokens_seen": 87616595, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9609375, "step": 4072, "time_per_iteration": 2.661322593688965 }, { "auxiliary_loss_clip": 0.01151194, "auxiliary_loss_mlp": 0.01044664, "balance_loss_clip": 1.02849925, "balance_loss_mlp": 1.05394292, "epoch": 0.2448820081166391, "flos": 21251791676160.0, "grad_norm": 1.489678327345966, "language_loss": 0.6977033, "learning_rate": 3.437133128021607e-06, "loss": 0.71966189, "num_input_tokens_seen": 87635755, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8828125, "step": 4073, "time_per_iteration": 2.5604519844055176 }, { "auxiliary_loss_clip": 0.01160216, "auxiliary_loss_mlp": 0.01041016, "balance_loss_clip": 1.02487493, "balance_loss_mlp": 1.05317116, "epoch": 0.2449421313693071, "flos": 23950586090880.0, "grad_norm": 2.862151917211902, "language_loss": 0.67563599, "learning_rate": 3.436870342747576e-06, "loss": 0.69764829, "num_input_tokens_seen": 87652885, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.890625, "step": 4074, "time_per_iteration": 2.631586790084839 }, { "auxiliary_loss_clip": 0.01056707, "auxiliary_loss_mlp": 0.01004768, "balance_loss_clip": 1.0023365, "balance_loss_mlp": 1.02702665, "epoch": 0.24500225462197506, "flos": 60687669980160.0, "grad_norm": 0.89135216994872, "language_loss": 0.68668532, "learning_rate": 3.4366075061953383e-06, "loss": 0.70730007, "num_input_tokens_seen": 87713220, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.296875, "step": 4075, "time_per_iteration": 3.2295193672180176 }, { "auxiliary_loss_clip": 0.01139521, "auxiliary_loss_mlp": 0.01042572, "balance_loss_clip": 1.02641869, "balance_loss_mlp": 1.05109906, "epoch": 0.24506237787464302, "flos": 26284204886400.0, "grad_norm": 3.3035090420952486, "language_loss": 0.7971313, "learning_rate": 3.4363446183742745e-06, "loss": 0.81895226, "num_input_tokens_seen": 87732680, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8828125, "step": 4076, "time_per_iteration": 2.641303062438965 }, { "auxiliary_loss_clip": 0.01165122, "auxiliary_loss_mlp": 0.01292505, "balance_loss_clip": 1.02628112, "balance_loss_mlp": 1.05218279, "epoch": 0.245122501127311, "flos": 20552287242240.0, "grad_norm": 3.857481419517009, "language_loss": 0.81934369, "learning_rate": 3.436081679293765e-06, "loss": 0.84391999, "num_input_tokens_seen": 87751880, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.94921875, "step": 4077, "time_per_iteration": 2.6222784519195557 }, { "auxiliary_loss_clip": 0.01144615, "auxiliary_loss_mlp": 0.01044429, "balance_loss_clip": 1.02570105, "balance_loss_mlp": 1.05225539, "epoch": 0.24518262437997895, "flos": 29132603447040.0, "grad_norm": 3.7309245989229196, "language_loss": 0.6222018, "learning_rate": 3.435818688963195e-06, "loss": 0.6440922, "num_input_tokens_seen": 87771795, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.921875, "step": 4078, "time_per_iteration": 2.618243455886841 }, { "auxiliary_loss_clip": 0.01158325, "auxiliary_loss_mlp": 0.01037489, "balance_loss_clip": 1.0206089, "balance_loss_mlp": 1.05195034, "epoch": 0.24524274763264692, "flos": 23476924419840.0, "grad_norm": 1.491736991288754, "language_loss": 0.75663656, "learning_rate": 3.4355556473919496e-06, "loss": 0.77859467, "num_input_tokens_seen": 87793640, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8828125, "step": 4079, "time_per_iteration": 2.701320171356201 }, { "auxiliary_loss_clip": 0.01139046, "auxiliary_loss_mlp": 0.01046232, "balance_loss_clip": 1.02787352, "balance_loss_mlp": 1.05003595, "epoch": 0.24530287088531488, "flos": 17201175886080.0, "grad_norm": 1.710751096413741, "language_loss": 0.75112522, "learning_rate": 3.4352925545894158e-06, "loss": 0.77297801, "num_input_tokens_seen": 87812390, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.890625, "step": 4080, "time_per_iteration": 2.5735466480255127 }, { "auxiliary_loss_clip": 0.01152049, "auxiliary_loss_mlp": 0.01046305, "balance_loss_clip": 1.02952027, "balance_loss_mlp": 1.04741144, "epoch": 0.24536299413798288, "flos": 14867449349760.0, "grad_norm": 1.8108774183671268, "language_loss": 0.82505906, "learning_rate": 3.4350294105649823e-06, "loss": 0.84704256, "num_input_tokens_seen": 87830640, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8671875, "step": 4081, "time_per_iteration": 2.5788238048553467 }, { "auxiliary_loss_clip": 0.01157197, "auxiliary_loss_mlp": 0.01037494, "balance_loss_clip": 1.02203202, "balance_loss_mlp": 1.05107498, "epoch": 0.24542311739065084, "flos": 35262051886080.0, "grad_norm": 2.0508244799694872, "language_loss": 0.7296232, "learning_rate": 3.4347662153280407e-06, "loss": 0.75157011, "num_input_tokens_seen": 87850450, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8828125, "step": 4082, "time_per_iteration": 2.6747217178344727 }, { "auxiliary_loss_clip": 0.01161041, "auxiliary_loss_mlp": 0.01046916, "balance_loss_clip": 1.03187132, "balance_loss_mlp": 1.04791844, "epoch": 0.2454832406433188, "flos": 21503130117120.0, "grad_norm": 1.6430440177969994, "language_loss": 0.71786398, "learning_rate": 3.4345029688879837e-06, "loss": 0.7399435, "num_input_tokens_seen": 87868810, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.859375, "step": 4083, "time_per_iteration": 2.594247579574585 }, { "auxiliary_loss_clip": 0.01160353, "auxiliary_loss_mlp": 0.01043842, "balance_loss_clip": 1.02535248, "balance_loss_mlp": 1.0503552, "epoch": 0.24554336389598677, "flos": 14756664827520.0, "grad_norm": 2.3569994091667628, "language_loss": 0.74474442, "learning_rate": 3.4342396712542057e-06, "loss": 0.7667864, "num_input_tokens_seen": 87885685, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.921875, "step": 4084, "time_per_iteration": 2.5589141845703125 }, { "auxiliary_loss_clip": 0.01143983, "auxiliary_loss_mlp": 0.01036874, "balance_loss_clip": 1.02059031, "balance_loss_mlp": 1.04778504, "epoch": 0.24560348714865474, "flos": 14976402278400.0, "grad_norm": 1.8082155860989788, "language_loss": 0.85224205, "learning_rate": 3.433976322436103e-06, "loss": 0.87405062, "num_input_tokens_seen": 87903715, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87109375, "step": 4085, "time_per_iteration": 2.6645898818969727 }, { "auxiliary_loss_clip": 0.01158177, "auxiliary_loss_mlp": 0.01045814, "balance_loss_clip": 1.02892244, "balance_loss_mlp": 1.05108321, "epoch": 0.2456636104013227, "flos": 22675326554880.0, "grad_norm": 1.828652266951658, "language_loss": 0.79087514, "learning_rate": 3.433712922443074e-06, "loss": 0.81291509, "num_input_tokens_seen": 87923375, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.890625, "step": 4086, "time_per_iteration": 2.5789194107055664 }, { "auxiliary_loss_clip": 0.01175174, "auxiliary_loss_mlp": 0.01040391, "balance_loss_clip": 1.02471519, "balance_loss_mlp": 1.05332851, "epoch": 0.2457237336539907, "flos": 27417869009280.0, "grad_norm": 1.6806327767258866, "language_loss": 0.74848843, "learning_rate": 3.433449471284519e-06, "loss": 0.77064407, "num_input_tokens_seen": 87943115, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.85546875, "step": 4087, "time_per_iteration": 4.002888917922974 }, { "auxiliary_loss_clip": 0.01181255, "auxiliary_loss_mlp": 0.01045126, "balance_loss_clip": 1.02782834, "balance_loss_mlp": 1.05552828, "epoch": 0.24578385690665866, "flos": 20412379768320.0, "grad_norm": 2.6006834943996293, "language_loss": 0.79920137, "learning_rate": 3.433185968969839e-06, "loss": 0.82146513, "num_input_tokens_seen": 87959505, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.890625, "step": 4088, "time_per_iteration": 2.694845199584961 }, { "auxiliary_loss_clip": 0.01162063, "auxiliary_loss_mlp": 0.0103388, "balance_loss_clip": 1.01909816, "balance_loss_mlp": 1.04971862, "epoch": 0.24584398015932662, "flos": 23915393740800.0, "grad_norm": 1.7089457776322559, "language_loss": 0.77175254, "learning_rate": 3.4329224155084386e-06, "loss": 0.79371202, "num_input_tokens_seen": 87979725, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8515625, "step": 4089, "time_per_iteration": 2.5700483322143555 }, { "auxiliary_loss_clip": 0.01145584, "auxiliary_loss_mlp": 0.01047717, "balance_loss_clip": 1.03132546, "balance_loss_mlp": 1.04759753, "epoch": 0.2459041034119946, "flos": 41496359103360.0, "grad_norm": 2.069196740383821, "language_loss": 0.81460512, "learning_rate": 3.4326588109097236e-06, "loss": 0.83653808, "num_input_tokens_seen": 87998270, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.88671875, "step": 4090, "time_per_iteration": 4.097517490386963 }, { "auxiliary_loss_clip": 0.01159447, "auxiliary_loss_mlp": 0.01046262, "balance_loss_clip": 1.02941751, "balance_loss_mlp": 1.05180585, "epoch": 0.24596422666466256, "flos": 19936814676480.0, "grad_norm": 2.0443633111938486, "language_loss": 0.73388708, "learning_rate": 3.4323951551831004e-06, "loss": 0.75594413, "num_input_tokens_seen": 88016760, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8984375, "step": 4091, "time_per_iteration": 4.200138092041016 }, { "auxiliary_loss_clip": 0.01167704, "auxiliary_loss_mlp": 0.01041697, "balance_loss_clip": 1.0265038, "balance_loss_mlp": 1.05403817, "epoch": 0.24602434991733052, "flos": 21544391865600.0, "grad_norm": 2.305003372662394, "language_loss": 0.7692098, "learning_rate": 3.432131448337979e-06, "loss": 0.79130381, "num_input_tokens_seen": 88036465, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8671875, "step": 4092, "time_per_iteration": 2.5902442932128906 }, { "auxiliary_loss_clip": 0.01162209, "auxiliary_loss_mlp": 0.01038721, "balance_loss_clip": 1.02281809, "balance_loss_mlp": 1.05147982, "epoch": 0.24608447316999849, "flos": 23185078416000.0, "grad_norm": 2.5419190464123522, "language_loss": 0.80865091, "learning_rate": 3.43186769038377e-06, "loss": 0.83066022, "num_input_tokens_seen": 88053270, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.921875, "step": 4093, "time_per_iteration": 4.0233659744262695 }, { "auxiliary_loss_clip": 0.01150033, "auxiliary_loss_mlp": 0.0104246, "balance_loss_clip": 1.02573466, "balance_loss_mlp": 1.04899502, "epoch": 0.24614459642266648, "flos": 19641951930240.0, "grad_norm": 1.8286264194031512, "language_loss": 0.86991775, "learning_rate": 3.431603881329886e-06, "loss": 0.89184272, "num_input_tokens_seen": 88072305, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.921875, "step": 4094, "time_per_iteration": 2.5485334396362305 }, { "auxiliary_loss_clip": 0.01100566, "auxiliary_loss_mlp": 0.01012646, "balance_loss_clip": 1.01045275, "balance_loss_mlp": 1.02555346, "epoch": 0.24620471967533444, "flos": 61739816186880.0, "grad_norm": 0.758121860744547, "language_loss": 0.57517558, "learning_rate": 3.4313400211857424e-06, "loss": 0.59630769, "num_input_tokens_seen": 88137995, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.296875, "step": 4095, "time_per_iteration": 3.2568013668060303 }, { "auxiliary_loss_clip": 0.01072911, "auxiliary_loss_mlp": 0.01016927, "balance_loss_clip": 1.01482916, "balance_loss_mlp": 1.02532542, "epoch": 0.2462648429280024, "flos": 69154436315520.0, "grad_norm": 0.6439728661186014, "language_loss": 0.56244206, "learning_rate": 3.431076109960755e-06, "loss": 0.58334047, "num_input_tokens_seen": 88208490, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.296875, "step": 4096, "time_per_iteration": 3.300773859024048 }, { "auxiliary_loss_clip": 0.01160972, "auxiliary_loss_mlp": 0.01041425, "balance_loss_clip": 1.02649987, "balance_loss_mlp": 1.05435228, "epoch": 0.24632496618067037, "flos": 29459605887360.0, "grad_norm": 1.6426855278603039, "language_loss": 0.77066457, "learning_rate": 3.4308121476643423e-06, "loss": 0.79268855, "num_input_tokens_seen": 88228050, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.88671875, "step": 4097, "time_per_iteration": 2.7378971576690674 }, { "auxiliary_loss_clip": 0.01169531, "auxiliary_loss_mlp": 0.01047815, "balance_loss_clip": 1.02925444, "balance_loss_mlp": 1.05091083, "epoch": 0.24638508943333834, "flos": 24316444068480.0, "grad_norm": 1.7739678566206525, "language_loss": 0.76081419, "learning_rate": 3.4305481343059254e-06, "loss": 0.78298771, "num_input_tokens_seen": 88248090, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9140625, "step": 4098, "time_per_iteration": 2.640303373336792 }, { "auxiliary_loss_clip": 0.01159004, "auxiliary_loss_mlp": 0.01040995, "balance_loss_clip": 1.02585554, "balance_loss_mlp": 1.05138993, "epoch": 0.2464452126860063, "flos": 26613254401920.0, "grad_norm": 1.9293999455356512, "language_loss": 0.67548376, "learning_rate": 3.4302840698949247e-06, "loss": 0.69748378, "num_input_tokens_seen": 88267545, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8984375, "step": 4099, "time_per_iteration": 2.637618064880371 }, { "auxiliary_loss_clip": 0.01143991, "auxiliary_loss_mlp": 0.0104027, "balance_loss_clip": 1.02499962, "balance_loss_mlp": 1.0500741, "epoch": 0.24650533593867427, "flos": 31212405763200.0, "grad_norm": 1.9575984599105356, "language_loss": 0.65984756, "learning_rate": 3.430019954440764e-06, "loss": 0.68169016, "num_input_tokens_seen": 88289785, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8515625, "step": 4100, "time_per_iteration": 2.622270345687866 }, { "auxiliary_loss_clip": 0.01072458, "auxiliary_loss_mlp": 0.01015735, "balance_loss_clip": 1.01332688, "balance_loss_mlp": 1.02509546, "epoch": 0.24656545919134226, "flos": 68494002900480.0, "grad_norm": 0.7182111146894703, "language_loss": 0.61437881, "learning_rate": 3.429755787952871e-06, "loss": 0.6352607, "num_input_tokens_seen": 88357320, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.29296875, "step": 4101, "time_per_iteration": 3.2364704608917236 }, { "auxiliary_loss_clip": 0.01136284, "auxiliary_loss_mlp": 0.01039495, "balance_loss_clip": 1.02355695, "balance_loss_mlp": 1.04934335, "epoch": 0.24662558244401023, "flos": 20084192179200.0, "grad_norm": 1.5966925788800428, "language_loss": 0.73544025, "learning_rate": 3.429491570440671e-06, "loss": 0.7571981, "num_input_tokens_seen": 88377040, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8671875, "step": 4102, "time_per_iteration": 2.554583787918091 }, { "auxiliary_loss_clip": 0.01159225, "auxiliary_loss_mlp": 0.01036863, "balance_loss_clip": 1.02127063, "balance_loss_mlp": 1.05094504, "epoch": 0.2466857056966782, "flos": 30701361012480.0, "grad_norm": 2.341346100539997, "language_loss": 0.75584495, "learning_rate": 3.4292273019135936e-06, "loss": 0.77780581, "num_input_tokens_seen": 88395085, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.90234375, "step": 4103, "time_per_iteration": 2.7106356620788574 }, { "auxiliary_loss_clip": 0.01149925, "auxiliary_loss_mlp": 0.01041634, "balance_loss_clip": 1.02430093, "balance_loss_mlp": 1.05146086, "epoch": 0.24674582894934616, "flos": 22528523669760.0, "grad_norm": 2.912839628514176, "language_loss": 0.78148478, "learning_rate": 3.4289629823810707e-06, "loss": 0.80340034, "num_input_tokens_seen": 88413205, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.89453125, "step": 4104, "time_per_iteration": 2.5480310916900635 }, { "auxiliary_loss_clip": 0.01148833, "auxiliary_loss_mlp": 0.01041093, "balance_loss_clip": 1.02325916, "balance_loss_mlp": 1.05245566, "epoch": 0.24680595220201412, "flos": 20704297599360.0, "grad_norm": 2.108661491495006, "language_loss": 0.83100212, "learning_rate": 3.4286986118525345e-06, "loss": 0.8529014, "num_input_tokens_seen": 88431525, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.875, "step": 4105, "time_per_iteration": 2.58792781829834 }, { "auxiliary_loss_clip": 0.01143417, "auxiliary_loss_mlp": 0.01042869, "balance_loss_clip": 1.02693057, "balance_loss_mlp": 1.05406582, "epoch": 0.2468660754546821, "flos": 21831174051840.0, "grad_norm": 1.7867384784924145, "language_loss": 0.76432019, "learning_rate": 3.4284341903374196e-06, "loss": 0.78618306, "num_input_tokens_seen": 88451210, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.89453125, "step": 4106, "time_per_iteration": 2.528686285018921 }, { "auxiliary_loss_clip": 0.01157258, "auxiliary_loss_mlp": 0.01041871, "balance_loss_clip": 1.02433574, "balance_loss_mlp": 1.05028105, "epoch": 0.24692619870735008, "flos": 15267709578240.0, "grad_norm": 2.1736998266552656, "language_loss": 0.72103953, "learning_rate": 3.4281697178451638e-06, "loss": 0.74303085, "num_input_tokens_seen": 88467790, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.88671875, "step": 4107, "time_per_iteration": 2.5896944999694824 }, { "auxiliary_loss_clip": 0.0114283, "auxiliary_loss_mlp": 0.01050478, "balance_loss_clip": 1.03389597, "balance_loss_mlp": 1.05365062, "epoch": 0.24698632196001805, "flos": 29680097523840.0, "grad_norm": 1.6854529010094437, "language_loss": 0.64853573, "learning_rate": 3.4279051943852037e-06, "loss": 0.67046881, "num_input_tokens_seen": 88490330, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.890625, "step": 4108, "time_per_iteration": 2.595940351486206 }, { "auxiliary_loss_clip": 0.01178569, "auxiliary_loss_mlp": 0.01045396, "balance_loss_clip": 1.02851582, "balance_loss_mlp": 1.05187166, "epoch": 0.247046445212686, "flos": 39165469741440.0, "grad_norm": 1.6155669171596412, "language_loss": 0.72625279, "learning_rate": 3.42764061996698e-06, "loss": 0.74849248, "num_input_tokens_seen": 88512435, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.90625, "step": 4109, "time_per_iteration": 2.72379994392395 }, { "auxiliary_loss_clip": 0.01167599, "auxiliary_loss_mlp": 0.01048659, "balance_loss_clip": 1.03183889, "balance_loss_mlp": 1.05017328, "epoch": 0.24710656846535398, "flos": 22998845376000.0, "grad_norm": 1.9347076899949458, "language_loss": 0.79099983, "learning_rate": 3.4273759945999356e-06, "loss": 0.81316245, "num_input_tokens_seen": 88529780, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.91015625, "step": 4110, "time_per_iteration": 2.551082134246826 }, { "auxiliary_loss_clip": 0.0116063, "auxiliary_loss_mlp": 0.01035729, "balance_loss_clip": 1.01917124, "balance_loss_mlp": 1.05254674, "epoch": 0.24716669171802194, "flos": 26432803451520.0, "grad_norm": 2.117070893237527, "language_loss": 0.80424476, "learning_rate": 3.4271113182935134e-06, "loss": 0.82620835, "num_input_tokens_seen": 88547200, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8984375, "step": 4111, "time_per_iteration": 2.6360576152801514 }, { "auxiliary_loss_clip": 0.01139036, "auxiliary_loss_mlp": 0.01040481, "balance_loss_clip": 1.02509141, "balance_loss_mlp": 1.04937005, "epoch": 0.2472268149706899, "flos": 23329870139520.0, "grad_norm": 3.1562652879043225, "language_loss": 0.75108773, "learning_rate": 3.4268465910571587e-06, "loss": 0.77288294, "num_input_tokens_seen": 88566415, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8984375, "step": 4112, "time_per_iteration": 2.5440971851348877 }, { "auxiliary_loss_clip": 0.01164307, "auxiliary_loss_mlp": 0.01296748, "balance_loss_clip": 1.03271949, "balance_loss_mlp": 1.04845619, "epoch": 0.24728693822335787, "flos": 23768734510080.0, "grad_norm": 1.9400102272003925, "language_loss": 0.82300222, "learning_rate": 3.42658181290032e-06, "loss": 0.8476128, "num_input_tokens_seen": 88585225, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.890625, "step": 4113, "time_per_iteration": 2.6225876808166504 }, { "auxiliary_loss_clip": 0.01137279, "auxiliary_loss_mlp": 0.01036185, "balance_loss_clip": 1.02028286, "balance_loss_mlp": 1.04824686, "epoch": 0.24734706147602586, "flos": 19317499355520.0, "grad_norm": 3.366489973356836, "language_loss": 0.87115979, "learning_rate": 3.4263169838324458e-06, "loss": 0.89289445, "num_input_tokens_seen": 88603280, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.890625, "step": 4114, "time_per_iteration": 2.4847168922424316 }, { "auxiliary_loss_clip": 0.01175271, "auxiliary_loss_mlp": 0.01039405, "balance_loss_clip": 1.02323985, "balance_loss_mlp": 1.04948497, "epoch": 0.24740718472869383, "flos": 28036932935040.0, "grad_norm": 1.6354935417973988, "language_loss": 0.75314778, "learning_rate": 3.4260521038629878e-06, "loss": 0.77529454, "num_input_tokens_seen": 88624925, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8984375, "step": 4115, "time_per_iteration": 2.650688648223877 }, { "auxiliary_loss_clip": 0.01097751, "auxiliary_loss_mlp": 0.01017322, "balance_loss_clip": 1.0151881, "balance_loss_mlp": 1.02345872, "epoch": 0.2474673079813618, "flos": 68107569408000.0, "grad_norm": 0.669748829017951, "language_loss": 0.58194661, "learning_rate": 3.4257871730013974e-06, "loss": 0.60309732, "num_input_tokens_seen": 88691475, "router_z_loss_clip": 0.0213623, "router_z_loss_mlp": 0.29492188, "step": 4116, "time_per_iteration": 3.31152606010437 }, { "auxiliary_loss_clip": 0.01154963, "auxiliary_loss_mlp": 0.01039016, "balance_loss_clip": 1.02341127, "balance_loss_mlp": 1.04892659, "epoch": 0.24752743123402976, "flos": 29462119839360.0, "grad_norm": 1.4679434653767116, "language_loss": 0.83372587, "learning_rate": 3.4255221912571315e-06, "loss": 0.85566568, "num_input_tokens_seen": 88713425, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8828125, "step": 4117, "time_per_iteration": 2.6915762424468994 }, { "auxiliary_loss_clip": 0.01070576, "auxiliary_loss_mlp": 0.01257902, "balance_loss_clip": 1.00838208, "balance_loss_mlp": 1.02387238, "epoch": 0.24758755448669773, "flos": 58350459824640.0, "grad_norm": 0.8997152249583432, "language_loss": 0.63491768, "learning_rate": 3.425257158639645e-06, "loss": 0.65820247, "num_input_tokens_seen": 88769995, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.29296875, "step": 4118, "time_per_iteration": 3.026304244995117 }, { "auxiliary_loss_clip": 0.01147195, "auxiliary_loss_mlp": 0.01041329, "balance_loss_clip": 1.02529502, "balance_loss_mlp": 1.04983473, "epoch": 0.2476476777393657, "flos": 20484416494080.0, "grad_norm": 1.8320235296604575, "language_loss": 0.79268539, "learning_rate": 3.424992075158397e-06, "loss": 0.81457061, "num_input_tokens_seen": 88789970, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8828125, "step": 4119, "time_per_iteration": 2.556807041168213 }, { "auxiliary_loss_clip": 0.01148343, "auxiliary_loss_mlp": 0.0103496, "balance_loss_clip": 1.01928425, "balance_loss_mlp": 1.05156243, "epoch": 0.24770780099203366, "flos": 20485853038080.0, "grad_norm": 1.4332007951480426, "language_loss": 0.74104023, "learning_rate": 3.4247269408228467e-06, "loss": 0.76287329, "num_input_tokens_seen": 88810000, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.87890625, "step": 4120, "time_per_iteration": 2.644834041595459 }, { "auxiliary_loss_clip": 0.01148682, "auxiliary_loss_mlp": 0.0104799, "balance_loss_clip": 1.03091931, "balance_loss_mlp": 1.04969537, "epoch": 0.24776792424470165, "flos": 15153405523200.0, "grad_norm": 2.090909020866388, "language_loss": 0.88653374, "learning_rate": 3.424461755642457e-06, "loss": 0.90850043, "num_input_tokens_seen": 88827515, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8984375, "step": 4121, "time_per_iteration": 2.623349666595459 }, { "auxiliary_loss_clip": 0.01139412, "auxiliary_loss_mlp": 0.01041633, "balance_loss_clip": 1.02462196, "balance_loss_mlp": 1.04852748, "epoch": 0.2478280474973696, "flos": 21725453347200.0, "grad_norm": 2.400237314716397, "language_loss": 0.68956661, "learning_rate": 3.4241965196266912e-06, "loss": 0.71137708, "num_input_tokens_seen": 88845025, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90625, "step": 4122, "time_per_iteration": 2.5848731994628906 }, { "auxiliary_loss_clip": 0.01141586, "auxiliary_loss_mlp": 0.01040819, "balance_loss_clip": 1.02248478, "balance_loss_mlp": 1.04999709, "epoch": 0.24788817075003758, "flos": 20412200200320.0, "grad_norm": 2.253153322001821, "language_loss": 0.80599332, "learning_rate": 3.4239312327850155e-06, "loss": 0.82781738, "num_input_tokens_seen": 88861740, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9140625, "step": 4123, "time_per_iteration": 2.551269292831421 }, { "auxiliary_loss_clip": 0.01177279, "auxiliary_loss_mlp": 0.01048898, "balance_loss_clip": 1.03280509, "balance_loss_mlp": 1.05213547, "epoch": 0.24794829400270554, "flos": 22594455083520.0, "grad_norm": 2.6610587322235926, "language_loss": 0.74824923, "learning_rate": 3.423665895126897e-06, "loss": 0.77051103, "num_input_tokens_seen": 88879740, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.890625, "step": 4124, "time_per_iteration": 2.684358596801758 }, { "auxiliary_loss_clip": 0.01146443, "auxiliary_loss_mlp": 0.01038382, "balance_loss_clip": 1.02278924, "balance_loss_mlp": 1.05116153, "epoch": 0.2480084172553735, "flos": 39676047615360.0, "grad_norm": 1.7441678796447673, "language_loss": 0.73716676, "learning_rate": 3.4234005066618047e-06, "loss": 0.75901502, "num_input_tokens_seen": 88904095, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.859375, "step": 4125, "time_per_iteration": 2.771825075149536 }, { "auxiliary_loss_clip": 0.01171301, "auxiliary_loss_mlp": 0.0129504, "balance_loss_clip": 1.03086829, "balance_loss_mlp": 1.05080295, "epoch": 0.24806854050804147, "flos": 22053712763520.0, "grad_norm": 1.8110326278317355, "language_loss": 0.68944514, "learning_rate": 3.4231350673992093e-06, "loss": 0.71410853, "num_input_tokens_seen": 88920740, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9375, "step": 4126, "time_per_iteration": 2.6190707683563232 }, { "auxiliary_loss_clip": 0.01149886, "auxiliary_loss_mlp": 0.01051558, "balance_loss_clip": 1.03445137, "balance_loss_mlp": 1.05121088, "epoch": 0.24812866376070947, "flos": 15486764670720.0, "grad_norm": 5.030860298708829, "language_loss": 0.81233859, "learning_rate": 3.422869577348584e-06, "loss": 0.83435303, "num_input_tokens_seen": 88938510, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8984375, "step": 4127, "time_per_iteration": 2.546765089035034 }, { "auxiliary_loss_clip": 0.01154051, "auxiliary_loss_mlp": 0.01297146, "balance_loss_clip": 1.03371358, "balance_loss_mlp": 1.05431867, "epoch": 0.24818878701337743, "flos": 14757419013120.0, "grad_norm": 3.000426689861702, "language_loss": 0.83476299, "learning_rate": 3.422604036519404e-06, "loss": 0.85927498, "num_input_tokens_seen": 88955235, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.90625, "step": 4128, "time_per_iteration": 2.585068464279175 }, { "auxiliary_loss_clip": 0.01160441, "auxiliary_loss_mlp": 0.01049783, "balance_loss_clip": 1.03367829, "balance_loss_mlp": 1.05198979, "epoch": 0.2482489102660454, "flos": 27089501852160.0, "grad_norm": 2.209775670152528, "language_loss": 0.65615237, "learning_rate": 3.4223384449211457e-06, "loss": 0.6782546, "num_input_tokens_seen": 88975210, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.90234375, "step": 4129, "time_per_iteration": 4.02574896812439 }, { "auxiliary_loss_clip": 0.01176472, "auxiliary_loss_mlp": 0.0104606, "balance_loss_clip": 1.0290246, "balance_loss_mlp": 1.04994059, "epoch": 0.24830903351871336, "flos": 26467528924800.0, "grad_norm": 1.6046645053401292, "language_loss": 0.75010359, "learning_rate": 3.4220728025632863e-06, "loss": 0.77232885, "num_input_tokens_seen": 88996120, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90234375, "step": 4130, "time_per_iteration": 2.6008377075195312 }, { "auxiliary_loss_clip": 0.01186127, "auxiliary_loss_mlp": 0.01054368, "balance_loss_clip": 1.03727341, "balance_loss_mlp": 1.05021644, "epoch": 0.24836915677138133, "flos": 10228436870400.0, "grad_norm": 2.3376730631015663, "language_loss": 0.76889598, "learning_rate": 3.421807109455307e-06, "loss": 0.79130095, "num_input_tokens_seen": 89008685, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.91015625, "step": 4131, "time_per_iteration": 2.6342644691467285 }, { "auxiliary_loss_clip": 0.01146739, "auxiliary_loss_mlp": 0.01041747, "balance_loss_clip": 1.0264051, "balance_loss_mlp": 1.05072117, "epoch": 0.2484292800240493, "flos": 30080429579520.0, "grad_norm": 1.6486631247322527, "language_loss": 0.83526647, "learning_rate": 3.4215413656066893e-06, "loss": 0.85715139, "num_input_tokens_seen": 89031160, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.87109375, "step": 4132, "time_per_iteration": 4.085277080535889 }, { "auxiliary_loss_clip": 0.01158056, "auxiliary_loss_mlp": 0.01041941, "balance_loss_clip": 1.02485847, "balance_loss_mlp": 1.05078912, "epoch": 0.24848940327671726, "flos": 13442944803840.0, "grad_norm": 1.532574994930905, "language_loss": 0.71060646, "learning_rate": 3.4212755710269163e-06, "loss": 0.73260641, "num_input_tokens_seen": 89047235, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.89453125, "step": 4133, "time_per_iteration": 4.13943338394165 }, { "auxiliary_loss_clip": 0.01146743, "auxiliary_loss_mlp": 0.01043492, "balance_loss_clip": 1.02421558, "balance_loss_mlp": 1.05091369, "epoch": 0.24854952652938525, "flos": 19970247260160.0, "grad_norm": 2.372435265453365, "language_loss": 0.61540025, "learning_rate": 3.4210097257254748e-06, "loss": 0.63730264, "num_input_tokens_seen": 89064790, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.95703125, "step": 4134, "time_per_iteration": 2.535914659500122 }, { "auxiliary_loss_clip": 0.01150095, "auxiliary_loss_mlp": 0.01038083, "balance_loss_clip": 1.02158403, "balance_loss_mlp": 1.0496639, "epoch": 0.24860964978205322, "flos": 18150187167360.0, "grad_norm": 2.186365511549995, "language_loss": 0.78010368, "learning_rate": 3.420743829711851e-06, "loss": 0.80198544, "num_input_tokens_seen": 89083250, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.91015625, "step": 4135, "time_per_iteration": 4.07783317565918 }, { "auxiliary_loss_clip": 0.01162676, "auxiliary_loss_mlp": 0.01039813, "balance_loss_clip": 1.02426803, "balance_loss_mlp": 1.05489337, "epoch": 0.24866977303472118, "flos": 11728641329280.0, "grad_norm": 2.4777426271643472, "language_loss": 0.83204389, "learning_rate": 3.420477882995535e-06, "loss": 0.85406876, "num_input_tokens_seen": 89100905, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8984375, "step": 4136, "time_per_iteration": 2.508939504623413 }, { "auxiliary_loss_clip": 0.01169986, "auxiliary_loss_mlp": 0.01048165, "balance_loss_clip": 1.03189313, "balance_loss_mlp": 1.05071557, "epoch": 0.24872989628738915, "flos": 34823582565120.0, "grad_norm": 1.9894052633047168, "language_loss": 0.70809847, "learning_rate": 3.420211885586017e-06, "loss": 0.73027998, "num_input_tokens_seen": 89122630, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.921875, "step": 4137, "time_per_iteration": 2.6961441040039062 }, { "auxiliary_loss_clip": 0.01151126, "auxiliary_loss_mlp": 0.01292848, "balance_loss_clip": 1.02943671, "balance_loss_mlp": 1.04968357, "epoch": 0.2487900195400571, "flos": 13699347062400.0, "grad_norm": 2.1502704523323586, "language_loss": 0.66984135, "learning_rate": 3.41994583749279e-06, "loss": 0.6942811, "num_input_tokens_seen": 89141050, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.92578125, "step": 4138, "time_per_iteration": 2.5676772594451904 }, { "auxiliary_loss_clip": 0.01162859, "auxiliary_loss_mlp": 0.01043506, "balance_loss_clip": 1.02806807, "balance_loss_mlp": 1.04964793, "epoch": 0.24885014279272508, "flos": 25337815297920.0, "grad_norm": 1.8455898781955102, "language_loss": 0.84047914, "learning_rate": 3.4196797387253482e-06, "loss": 0.86254287, "num_input_tokens_seen": 89160810, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.86328125, "step": 4139, "time_per_iteration": 2.6793031692504883 }, { "auxiliary_loss_clip": 0.01169181, "auxiliary_loss_mlp": 0.01044203, "balance_loss_clip": 1.02728736, "balance_loss_mlp": 1.04984832, "epoch": 0.24891026604539307, "flos": 20631434860800.0, "grad_norm": 1.8666273453260593, "language_loss": 0.79064286, "learning_rate": 3.419413589293189e-06, "loss": 0.81277668, "num_input_tokens_seen": 89180610, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.921875, "step": 4140, "time_per_iteration": 2.54899263381958 }, { "auxiliary_loss_clip": 0.01071496, "auxiliary_loss_mlp": 0.01024805, "balance_loss_clip": 1.02298152, "balance_loss_mlp": 1.03284669, "epoch": 0.24897038929806103, "flos": 66960294030720.0, "grad_norm": 0.8276157729386182, "language_loss": 0.61051381, "learning_rate": 3.4191473892058094e-06, "loss": 0.63147676, "num_input_tokens_seen": 89241880, "router_z_loss_clip": 0.01818848, "router_z_loss_mlp": 0.29492188, "step": 4141, "time_per_iteration": 3.2215535640716553 }, { "auxiliary_loss_clip": 0.01144128, "auxiliary_loss_mlp": 0.01052026, "balance_loss_clip": 1.03513408, "balance_loss_mlp": 1.05127084, "epoch": 0.249030512550729, "flos": 36392555612160.0, "grad_norm": 1.8136314223347867, "language_loss": 0.72703218, "learning_rate": 3.4188811384727104e-06, "loss": 0.74899375, "num_input_tokens_seen": 89263340, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9296875, "step": 4142, "time_per_iteration": 2.783832550048828 }, { "auxiliary_loss_clip": 0.01149597, "auxiliary_loss_mlp": 0.01047739, "balance_loss_clip": 1.03106117, "balance_loss_mlp": 1.05068851, "epoch": 0.24909063580339696, "flos": 20154576879360.0, "grad_norm": 1.6946373683627596, "language_loss": 0.80851436, "learning_rate": 3.418614837103393e-06, "loss": 0.83048773, "num_input_tokens_seen": 89282870, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8984375, "step": 4143, "time_per_iteration": 2.526620626449585 }, { "auxiliary_loss_clip": 0.01152665, "auxiliary_loss_mlp": 0.01041282, "balance_loss_clip": 1.02603531, "balance_loss_mlp": 1.04789448, "epoch": 0.24915075905606493, "flos": 26396569607040.0, "grad_norm": 1.6495555805156294, "language_loss": 0.58428335, "learning_rate": 3.418348485107362e-06, "loss": 0.60622281, "num_input_tokens_seen": 89303830, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.87109375, "step": 4144, "time_per_iteration": 2.623701572418213 }, { "auxiliary_loss_clip": 0.01147712, "auxiliary_loss_mlp": 0.01041866, "balance_loss_clip": 1.02590394, "balance_loss_mlp": 1.0491631, "epoch": 0.2492108823087329, "flos": 27527216987520.0, "grad_norm": 1.9714674432371548, "language_loss": 0.78643274, "learning_rate": 3.4180820824941213e-06, "loss": 0.80832851, "num_input_tokens_seen": 89324350, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.89453125, "step": 4145, "time_per_iteration": 2.576045036315918 }, { "auxiliary_loss_clip": 0.01157683, "auxiliary_loss_mlp": 0.01297721, "balance_loss_clip": 1.0306462, "balance_loss_mlp": 1.05105519, "epoch": 0.24927100556140086, "flos": 16691388111360.0, "grad_norm": 2.077601897719784, "language_loss": 0.64820778, "learning_rate": 3.4178156292731787e-06, "loss": 0.6727618, "num_input_tokens_seen": 89342875, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.9765625, "step": 4146, "time_per_iteration": 2.5927982330322266 }, { "auxiliary_loss_clip": 0.01071495, "auxiliary_loss_mlp": 0.01007994, "balance_loss_clip": 1.00627697, "balance_loss_mlp": 1.02436614, "epoch": 0.24933112881406885, "flos": 62772464286720.0, "grad_norm": 0.9417167049782285, "language_loss": 0.67292702, "learning_rate": 3.4175491254540436e-06, "loss": 0.69372189, "num_input_tokens_seen": 89404925, "router_z_loss_clip": 0.01721191, "router_z_loss_mlp": 0.29296875, "step": 4147, "time_per_iteration": 3.24880313873291 }, { "auxiliary_loss_clip": 0.01156934, "auxiliary_loss_mlp": 0.01048535, "balance_loss_clip": 1.03190565, "balance_loss_mlp": 1.04949999, "epoch": 0.24939125206673682, "flos": 26651894457600.0, "grad_norm": 1.790400104136538, "language_loss": 0.89240068, "learning_rate": 3.4172825710462267e-06, "loss": 0.91445541, "num_input_tokens_seen": 89425090, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.89453125, "step": 4148, "time_per_iteration": 2.613685131072998 }, { "auxiliary_loss_clip": 0.0118183, "auxiliary_loss_mlp": 0.01052506, "balance_loss_clip": 1.03268123, "balance_loss_mlp": 1.05066013, "epoch": 0.24945137531940478, "flos": 20704333512960.0, "grad_norm": 1.8399800645754898, "language_loss": 0.67802703, "learning_rate": 3.4170159660592404e-06, "loss": 0.70037043, "num_input_tokens_seen": 89442615, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.94921875, "step": 4149, "time_per_iteration": 2.577479124069214 }, { "auxiliary_loss_clip": 0.01147925, "auxiliary_loss_mlp": 0.01043754, "balance_loss_clip": 1.02749431, "balance_loss_mlp": 1.05048823, "epoch": 0.24951149857207275, "flos": 23768662682880.0, "grad_norm": 3.0588766976136896, "language_loss": 0.71435976, "learning_rate": 3.416749310502599e-06, "loss": 0.73627663, "num_input_tokens_seen": 89463025, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8828125, "step": 4150, "time_per_iteration": 2.6130409240722656 }, { "auxiliary_loss_clip": 0.01175073, "auxiliary_loss_mlp": 0.01049315, "balance_loss_clip": 1.03189278, "balance_loss_mlp": 1.04821992, "epoch": 0.2495716218247407, "flos": 15664881237120.0, "grad_norm": 2.744495503152574, "language_loss": 0.73336112, "learning_rate": 3.4164826043858195e-06, "loss": 0.75560498, "num_input_tokens_seen": 89480225, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.90625, "step": 4151, "time_per_iteration": 2.5428695678710938 }, { "auxiliary_loss_clip": 0.01156576, "auxiliary_loss_mlp": 0.01055276, "balance_loss_clip": 1.03816986, "balance_loss_mlp": 1.05070686, "epoch": 0.24963174507740868, "flos": 24052499953920.0, "grad_norm": 2.5529261734113047, "language_loss": 0.63322532, "learning_rate": 3.416215847718419e-06, "loss": 0.65534377, "num_input_tokens_seen": 89496985, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.96875, "step": 4152, "time_per_iteration": 2.570277452468872 }, { "auxiliary_loss_clip": 0.01146511, "auxiliary_loss_mlp": 0.01043398, "balance_loss_clip": 1.02660179, "balance_loss_mlp": 1.05029702, "epoch": 0.24969186833007664, "flos": 21799501234560.0, "grad_norm": 2.217304530020204, "language_loss": 0.77056861, "learning_rate": 3.4159490405099183e-06, "loss": 0.79246765, "num_input_tokens_seen": 89514420, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.87109375, "step": 4153, "time_per_iteration": 2.6963181495666504 }, { "auxiliary_loss_clip": 0.01146134, "auxiliary_loss_mlp": 0.01039094, "balance_loss_clip": 1.02233338, "balance_loss_mlp": 1.04936957, "epoch": 0.24975199158274464, "flos": 19938143479680.0, "grad_norm": 2.2125204809486423, "language_loss": 0.75893271, "learning_rate": 3.4156821827698387e-06, "loss": 0.78078502, "num_input_tokens_seen": 89532925, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.87890625, "step": 4154, "time_per_iteration": 2.619205951690674 }, { "auxiliary_loss_clip": 0.01151877, "auxiliary_loss_mlp": 0.01046383, "balance_loss_clip": 1.0278461, "balance_loss_mlp": 1.04786754, "epoch": 0.2498121148354126, "flos": 25338389915520.0, "grad_norm": 2.4439882986763153, "language_loss": 0.70683444, "learning_rate": 3.4154152745077027e-06, "loss": 0.72881711, "num_input_tokens_seen": 89552855, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.94921875, "step": 4155, "time_per_iteration": 2.631183624267578 }, { "auxiliary_loss_clip": 0.01159952, "auxiliary_loss_mlp": 0.01047455, "balance_loss_clip": 1.03027701, "balance_loss_mlp": 1.05006635, "epoch": 0.24987223808808057, "flos": 20558787603840.0, "grad_norm": 2.2933458001604117, "language_loss": 0.74647069, "learning_rate": 3.4151483157330373e-06, "loss": 0.76854473, "num_input_tokens_seen": 89572830, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.921875, "step": 4156, "time_per_iteration": 2.617642641067505 }, { "auxiliary_loss_clip": 0.01147869, "auxiliary_loss_mlp": 0.01042275, "balance_loss_clip": 1.02705204, "balance_loss_mlp": 1.0493083, "epoch": 0.24993236134074853, "flos": 19749037351680.0, "grad_norm": 1.9356278968342466, "language_loss": 0.765926, "learning_rate": 3.4148813064553686e-06, "loss": 0.78782749, "num_input_tokens_seen": 89590345, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.89453125, "step": 4157, "time_per_iteration": 2.558659315109253 }, { "auxiliary_loss_clip": 0.01185684, "auxiliary_loss_mlp": 0.01043547, "balance_loss_clip": 1.02630901, "balance_loss_mlp": 1.04898083, "epoch": 0.2499924845934165, "flos": 18770292587520.0, "grad_norm": 1.5314743277470366, "language_loss": 0.81515896, "learning_rate": 3.4146142466842253e-06, "loss": 0.83745134, "num_input_tokens_seen": 89610295, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.91796875, "step": 4158, "time_per_iteration": 2.589266061782837 }, { "auxiliary_loss_clip": 0.01158157, "auxiliary_loss_mlp": 0.01036033, "balance_loss_clip": 1.01985598, "balance_loss_mlp": 1.04924202, "epoch": 0.25005260784608446, "flos": 16872198197760.0, "grad_norm": 1.7806328233905113, "language_loss": 0.76050812, "learning_rate": 3.414347136429138e-06, "loss": 0.78245002, "num_input_tokens_seen": 89627795, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.90625, "step": 4159, "time_per_iteration": 2.4967660903930664 }, { "auxiliary_loss_clip": 0.01152779, "auxiliary_loss_mlp": 0.01036983, "balance_loss_clip": 1.01922059, "balance_loss_mlp": 1.05033958, "epoch": 0.2501127310987524, "flos": 22124923476480.0, "grad_norm": 2.2634410543095145, "language_loss": 0.7126435, "learning_rate": 3.4140799756996403e-06, "loss": 0.73454118, "num_input_tokens_seen": 89648090, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9296875, "step": 4160, "time_per_iteration": 2.5365822315216064 }, { "auxiliary_loss_clip": 0.01052987, "auxiliary_loss_mlp": 0.01002964, "balance_loss_clip": 1.00086617, "balance_loss_mlp": 1.02427661, "epoch": 0.2501728543514204, "flos": 69458061980160.0, "grad_norm": 0.7491755516461711, "language_loss": 0.56741154, "learning_rate": 3.4138127645052653e-06, "loss": 0.58797103, "num_input_tokens_seen": 89710345, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.28710938, "step": 4161, "time_per_iteration": 3.179098606109619 }, { "auxiliary_loss_clip": 0.01156294, "auxiliary_loss_mlp": 0.01045744, "balance_loss_clip": 1.0277319, "balance_loss_mlp": 1.05136669, "epoch": 0.25023297760408836, "flos": 16289978647680.0, "grad_norm": 1.8442263191427348, "language_loss": 0.80879259, "learning_rate": 3.41354550285555e-06, "loss": 0.83081305, "num_input_tokens_seen": 89729390, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9609375, "step": 4162, "time_per_iteration": 2.4883358478546143 }, { "auxiliary_loss_clip": 0.01160088, "auxiliary_loss_mlp": 0.01288843, "balance_loss_clip": 1.02401221, "balance_loss_mlp": 1.04775882, "epoch": 0.2502931008567563, "flos": 12237998140800.0, "grad_norm": 2.39741702414739, "language_loss": 0.87598687, "learning_rate": 3.413278190760031e-06, "loss": 0.90047616, "num_input_tokens_seen": 89742805, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9453125, "step": 4163, "time_per_iteration": 2.5647730827331543 }, { "auxiliary_loss_clip": 0.01151159, "auxiliary_loss_mlp": 0.01040789, "balance_loss_clip": 1.02425468, "balance_loss_mlp": 1.05017066, "epoch": 0.25035322410942434, "flos": 23181882105600.0, "grad_norm": 1.6383345883249136, "language_loss": 0.82991135, "learning_rate": 3.413010828228249e-06, "loss": 0.85183078, "num_input_tokens_seen": 89761145, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.91796875, "step": 4164, "time_per_iteration": 2.528712749481201 }, { "auxiliary_loss_clip": 0.01148097, "auxiliary_loss_mlp": 0.01047888, "balance_loss_clip": 1.0324924, "balance_loss_mlp": 1.05191636, "epoch": 0.2504133473620923, "flos": 20917534688640.0, "grad_norm": 2.4442153072127453, "language_loss": 0.73648846, "learning_rate": 3.4127434152697453e-06, "loss": 0.75844836, "num_input_tokens_seen": 89780905, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8671875, "step": 4165, "time_per_iteration": 2.644425392150879 }, { "auxiliary_loss_clip": 0.01170805, "auxiliary_loss_mlp": 0.01043608, "balance_loss_clip": 1.02585769, "balance_loss_mlp": 1.04980087, "epoch": 0.2504734706147603, "flos": 20776549806720.0, "grad_norm": 1.7803580497234086, "language_loss": 0.74170172, "learning_rate": 3.4124759518940637e-06, "loss": 0.7638458, "num_input_tokens_seen": 89799230, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.94140625, "step": 4166, "time_per_iteration": 2.563271999359131 }, { "auxiliary_loss_clip": 0.01165462, "auxiliary_loss_mlp": 0.01045004, "balance_loss_clip": 1.02891028, "balance_loss_mlp": 1.04959321, "epoch": 0.25053359386742824, "flos": 24349373861760.0, "grad_norm": 1.6349904544520897, "language_loss": 0.81908226, "learning_rate": 3.412208438110748e-06, "loss": 0.841187, "num_input_tokens_seen": 89818240, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.890625, "step": 4167, "time_per_iteration": 2.6657559871673584 }, { "auxiliary_loss_clip": 0.01149083, "auxiliary_loss_mlp": 0.01043538, "balance_loss_clip": 1.02698636, "balance_loss_mlp": 1.0500133, "epoch": 0.2505937171200962, "flos": 21214336769280.0, "grad_norm": 2.1044316759384953, "language_loss": 0.79360527, "learning_rate": 3.411940873929346e-06, "loss": 0.81553149, "num_input_tokens_seen": 89834485, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8984375, "step": 4168, "time_per_iteration": 2.5388736724853516 }, { "auxiliary_loss_clip": 0.01153579, "auxiliary_loss_mlp": 0.01040499, "balance_loss_clip": 1.02197433, "balance_loss_mlp": 1.04855418, "epoch": 0.25065384037276417, "flos": 41427231379200.0, "grad_norm": 2.6715873983100744, "language_loss": 0.69639021, "learning_rate": 3.411673259359406e-06, "loss": 0.71833098, "num_input_tokens_seen": 89855645, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9609375, "step": 4169, "time_per_iteration": 2.6956257820129395 }, { "auxiliary_loss_clip": 0.0116298, "auxiliary_loss_mlp": 0.01047314, "balance_loss_clip": 1.03119731, "balance_loss_mlp": 1.04793155, "epoch": 0.25071396362543213, "flos": 26102389219200.0, "grad_norm": 1.74378824596182, "language_loss": 0.77732849, "learning_rate": 3.411405594410479e-06, "loss": 0.79943138, "num_input_tokens_seen": 89874895, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.87890625, "step": 4170, "time_per_iteration": 4.008548736572266 }, { "auxiliary_loss_clip": 0.01158214, "auxiliary_loss_mlp": 0.01044755, "balance_loss_clip": 1.02900743, "balance_loss_mlp": 1.04929078, "epoch": 0.2507740868781001, "flos": 19098982967040.0, "grad_norm": 2.090985758514791, "language_loss": 0.76451218, "learning_rate": 3.4111378790921162e-06, "loss": 0.78654194, "num_input_tokens_seen": 89891700, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.9140625, "step": 4171, "time_per_iteration": 2.542725086212158 }, { "auxiliary_loss_clip": 0.01050495, "auxiliary_loss_mlp": 0.01008403, "balance_loss_clip": 1.00648332, "balance_loss_mlp": 1.0222044, "epoch": 0.25083421013076806, "flos": 64341868296960.0, "grad_norm": 0.8278328300491012, "language_loss": 0.60018408, "learning_rate": 3.4108701134138727e-06, "loss": 0.62077308, "num_input_tokens_seen": 89955775, "router_z_loss_clip": 0.01916504, "router_z_loss_mlp": 0.28320312, "step": 4172, "time_per_iteration": 3.204745054244995 }, { "auxiliary_loss_clip": 0.01158323, "auxiliary_loss_mlp": 0.01046893, "balance_loss_clip": 1.02972698, "balance_loss_mlp": 1.04894567, "epoch": 0.25089433338343603, "flos": 24279599692800.0, "grad_norm": 1.4970578139509887, "language_loss": 0.78804719, "learning_rate": 3.4106022973853045e-06, "loss": 0.81009936, "num_input_tokens_seen": 89977150, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9140625, "step": 4173, "time_per_iteration": 2.5603866577148438 }, { "auxiliary_loss_clip": 0.01140298, "auxiliary_loss_mlp": 0.01046723, "balance_loss_clip": 1.03018892, "balance_loss_mlp": 1.04989159, "epoch": 0.250954456636104, "flos": 14721472477440.0, "grad_norm": 1.974993007520739, "language_loss": 0.83636963, "learning_rate": 3.4103344310159685e-06, "loss": 0.85823989, "num_input_tokens_seen": 89994925, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.90234375, "step": 4174, "time_per_iteration": 5.418715000152588 }, { "auxiliary_loss_clip": 0.01179157, "auxiliary_loss_mlp": 0.01040954, "balance_loss_clip": 1.02437222, "balance_loss_mlp": 1.0521121, "epoch": 0.25101457988877196, "flos": 22273593868800.0, "grad_norm": 1.9489200618740097, "language_loss": 0.71315861, "learning_rate": 3.4100665143154245e-06, "loss": 0.73535979, "num_input_tokens_seen": 90013235, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.91015625, "step": 4175, "time_per_iteration": 2.652919292449951 }, { "auxiliary_loss_clip": 0.01168302, "auxiliary_loss_mlp": 0.01037851, "balance_loss_clip": 1.02089918, "balance_loss_mlp": 1.04693079, "epoch": 0.2510747031414399, "flos": 25188929424000.0, "grad_norm": 2.0853786371481364, "language_loss": 0.80492824, "learning_rate": 3.409798547293234e-06, "loss": 0.82698977, "num_input_tokens_seen": 90032150, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9453125, "step": 4176, "time_per_iteration": 2.7058470249176025 }, { "auxiliary_loss_clip": 0.01159761, "auxiliary_loss_mlp": 0.0104285, "balance_loss_clip": 1.02527833, "balance_loss_mlp": 1.05061984, "epoch": 0.25113482639410795, "flos": 20704189858560.0, "grad_norm": 2.2218050350349077, "language_loss": 0.82649791, "learning_rate": 3.4095305299589593e-06, "loss": 0.84852409, "num_input_tokens_seen": 90049085, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9140625, "step": 4177, "time_per_iteration": 4.02828311920166 }, { "auxiliary_loss_clip": 0.01167968, "auxiliary_loss_mlp": 0.01042003, "balance_loss_clip": 1.0253855, "balance_loss_mlp": 1.05146277, "epoch": 0.2511949496467759, "flos": 21506936958720.0, "grad_norm": 2.491526957412271, "language_loss": 0.82447577, "learning_rate": 3.409262462322166e-06, "loss": 0.84657544, "num_input_tokens_seen": 90067695, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.89453125, "step": 4178, "time_per_iteration": 2.720769166946411 }, { "auxiliary_loss_clip": 0.01134366, "auxiliary_loss_mlp": 0.01292116, "balance_loss_clip": 1.02918506, "balance_loss_mlp": 1.04721963, "epoch": 0.2512550728994439, "flos": 20701999128960.0, "grad_norm": 2.6967198949746973, "language_loss": 0.75519824, "learning_rate": 3.40899434439242e-06, "loss": 0.77946305, "num_input_tokens_seen": 90083890, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.87109375, "step": 4179, "time_per_iteration": 2.5212390422821045 }, { "auxiliary_loss_clip": 0.01169441, "auxiliary_loss_mlp": 0.01048931, "balance_loss_clip": 1.03253984, "balance_loss_mlp": 1.05067146, "epoch": 0.25131519615211184, "flos": 18478626151680.0, "grad_norm": 1.971531393817455, "language_loss": 0.7008785, "learning_rate": 3.4087261761792908e-06, "loss": 0.72306228, "num_input_tokens_seen": 90100995, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.921875, "step": 4180, "time_per_iteration": 2.616969347000122 }, { "auxiliary_loss_clip": 0.01137809, "auxiliary_loss_mlp": 0.01043853, "balance_loss_clip": 1.02780747, "balance_loss_mlp": 1.04945326, "epoch": 0.2513753194047798, "flos": 20484955198080.0, "grad_norm": 1.9374590762072212, "language_loss": 0.85867745, "learning_rate": 3.4084579576923477e-06, "loss": 0.880494, "num_input_tokens_seen": 90120365, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8828125, "step": 4181, "time_per_iteration": 2.5635673999786377 }, { "auxiliary_loss_clip": 0.01139242, "auxiliary_loss_mlp": 0.01043058, "balance_loss_clip": 1.02672648, "balance_loss_mlp": 1.04949927, "epoch": 0.25143544265744777, "flos": 37670077704960.0, "grad_norm": 2.20941232759551, "language_loss": 0.68046021, "learning_rate": 3.4081896889411634e-06, "loss": 0.7022832, "num_input_tokens_seen": 90142610, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8984375, "step": 4182, "time_per_iteration": 2.782367706298828 }, { "auxiliary_loss_clip": 0.01067712, "auxiliary_loss_mlp": 0.01006687, "balance_loss_clip": 1.00469613, "balance_loss_mlp": 1.02083814, "epoch": 0.25149556591011574, "flos": 69367457923200.0, "grad_norm": 0.8441309705157805, "language_loss": 0.70040977, "learning_rate": 3.407921369935311e-06, "loss": 0.72115374, "num_input_tokens_seen": 90200555, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.2890625, "step": 4183, "time_per_iteration": 3.2204337120056152 }, { "auxiliary_loss_clip": 0.01154629, "auxiliary_loss_mlp": 0.01041277, "balance_loss_clip": 1.0234071, "balance_loss_mlp": 1.04574203, "epoch": 0.2515556891627837, "flos": 13990402967040.0, "grad_norm": 1.7273962964313034, "language_loss": 0.7428571, "learning_rate": 3.407653000684367e-06, "loss": 0.76481611, "num_input_tokens_seen": 90218120, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.90625, "step": 4184, "time_per_iteration": 2.631040334701538 }, { "auxiliary_loss_clip": 0.01148746, "auxiliary_loss_mlp": 0.01050879, "balance_loss_clip": 1.03393912, "balance_loss_mlp": 1.04968226, "epoch": 0.25161581241545167, "flos": 22163527618560.0, "grad_norm": 2.4069360604540195, "language_loss": 0.8301363, "learning_rate": 3.407384581197908e-06, "loss": 0.85213256, "num_input_tokens_seen": 90236790, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.90234375, "step": 4185, "time_per_iteration": 2.6668524742126465 }, { "auxiliary_loss_clip": 0.01084811, "auxiliary_loss_mlp": 0.0100054, "balance_loss_clip": 0.99852526, "balance_loss_mlp": 1.02109313, "epoch": 0.25167593566811963, "flos": 69358407696000.0, "grad_norm": 0.7889766963475302, "language_loss": 0.61497647, "learning_rate": 3.4071161114855134e-06, "loss": 0.63582999, "num_input_tokens_seen": 90297070, "router_z_loss_clip": 0.0201416, "router_z_loss_mlp": 0.28515625, "step": 4186, "time_per_iteration": 3.1496658325195312 }, { "auxiliary_loss_clip": 0.01164092, "auxiliary_loss_mlp": 0.01040149, "balance_loss_clip": 1.02343631, "balance_loss_mlp": 1.04658914, "epoch": 0.2517360589207876, "flos": 13261452359040.0, "grad_norm": 2.020553115020272, "language_loss": 0.79090697, "learning_rate": 3.406847591556764e-06, "loss": 0.81294942, "num_input_tokens_seen": 90315255, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.90625, "step": 4187, "time_per_iteration": 2.593651533126831 }, { "auxiliary_loss_clip": 0.01183891, "auxiliary_loss_mlp": 0.01058014, "balance_loss_clip": 1.04064476, "balance_loss_mlp": 1.04998982, "epoch": 0.25179618217345556, "flos": 20376828282240.0, "grad_norm": 1.4994365786388586, "language_loss": 0.79759383, "learning_rate": 3.406579021421244e-06, "loss": 0.82001281, "num_input_tokens_seen": 90334990, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.89453125, "step": 4188, "time_per_iteration": 2.6460888385772705 }, { "auxiliary_loss_clip": 0.0114487, "auxiliary_loss_mlp": 0.01044811, "balance_loss_clip": 1.02878952, "balance_loss_mlp": 1.04651999, "epoch": 0.25185630542612353, "flos": 27664718250240.0, "grad_norm": 1.8556732811539396, "language_loss": 0.74241298, "learning_rate": 3.406310401088536e-06, "loss": 0.76430976, "num_input_tokens_seen": 90351825, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.89453125, "step": 4189, "time_per_iteration": 2.6367130279541016 }, { "auxiliary_loss_clip": 0.01153605, "auxiliary_loss_mlp": 0.01043777, "balance_loss_clip": 1.02808928, "balance_loss_mlp": 1.04868758, "epoch": 0.25191642867879155, "flos": 20996430912000.0, "grad_norm": 2.0206967467813928, "language_loss": 0.84759176, "learning_rate": 3.4060417305682274e-06, "loss": 0.86956561, "num_input_tokens_seen": 90369860, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8671875, "step": 4190, "time_per_iteration": 2.6090121269226074 }, { "auxiliary_loss_clip": 0.01156547, "auxiliary_loss_mlp": 0.01042493, "balance_loss_clip": 1.02432597, "balance_loss_mlp": 1.0489738, "epoch": 0.2519765519314595, "flos": 21105671149440.0, "grad_norm": 1.9723530949509156, "language_loss": 0.75327826, "learning_rate": 3.4057730098699065e-06, "loss": 0.77526867, "num_input_tokens_seen": 90389245, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.89453125, "step": 4191, "time_per_iteration": 2.5795974731445312 }, { "auxiliary_loss_clip": 0.01055587, "auxiliary_loss_mlp": 0.01001799, "balance_loss_clip": 0.99979591, "balance_loss_mlp": 1.01805544, "epoch": 0.2520366751841275, "flos": 62744993360640.0, "grad_norm": 0.7184625799269324, "language_loss": 0.57190526, "learning_rate": 3.405504239003163e-06, "loss": 0.59247905, "num_input_tokens_seen": 90456735, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.28710938, "step": 4192, "time_per_iteration": 3.2524917125701904 }, { "auxiliary_loss_clip": 0.01155113, "auxiliary_loss_mlp": 0.01039427, "balance_loss_clip": 1.02317882, "balance_loss_mlp": 1.0488658, "epoch": 0.25209679843679544, "flos": 22230716008320.0, "grad_norm": 1.869935084523277, "language_loss": 0.76061124, "learning_rate": 3.4052354179775883e-06, "loss": 0.78255665, "num_input_tokens_seen": 90474165, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87890625, "step": 4193, "time_per_iteration": 2.5980443954467773 }, { "auxiliary_loss_clip": 0.0115023, "auxiliary_loss_mlp": 0.01043612, "balance_loss_clip": 1.02681553, "balance_loss_mlp": 1.0486623, "epoch": 0.2521569216894634, "flos": 12166643773440.0, "grad_norm": 2.2468837243290225, "language_loss": 0.84167147, "learning_rate": 3.4049665468027763e-06, "loss": 0.86360991, "num_input_tokens_seen": 90491660, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.92578125, "step": 4194, "time_per_iteration": 2.557741165161133 }, { "auxiliary_loss_clip": 0.01142659, "auxiliary_loss_mlp": 0.01051477, "balance_loss_clip": 1.03617036, "balance_loss_mlp": 1.04978704, "epoch": 0.2522170449421314, "flos": 23699786353920.0, "grad_norm": 2.2234517479326015, "language_loss": 0.88294864, "learning_rate": 3.404697625488322e-06, "loss": 0.90489, "num_input_tokens_seen": 90514025, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.9296875, "step": 4195, "time_per_iteration": 2.6149613857269287 }, { "auxiliary_loss_clip": 0.01139952, "auxiliary_loss_mlp": 0.01043906, "balance_loss_clip": 1.02484405, "balance_loss_mlp": 1.04786575, "epoch": 0.25227716819479934, "flos": 20955456472320.0, "grad_norm": 2.1052952658895014, "language_loss": 0.85564792, "learning_rate": 3.4044286540438233e-06, "loss": 0.87748653, "num_input_tokens_seen": 90533530, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.921875, "step": 4196, "time_per_iteration": 2.683893918991089 }, { "auxiliary_loss_clip": 0.01149794, "auxiliary_loss_mlp": 0.01045523, "balance_loss_clip": 1.02886915, "balance_loss_mlp": 1.0493294, "epoch": 0.2523372914474673, "flos": 23331342597120.0, "grad_norm": 1.7659305170450252, "language_loss": 0.83261037, "learning_rate": 3.4041596324788778e-06, "loss": 0.85456359, "num_input_tokens_seen": 90554025, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9140625, "step": 4197, "time_per_iteration": 2.6072397232055664 }, { "auxiliary_loss_clip": 0.01151448, "auxiliary_loss_mlp": 0.01048748, "balance_loss_clip": 1.02985358, "balance_loss_mlp": 1.04959059, "epoch": 0.25239741470013527, "flos": 36970321875840.0, "grad_norm": 2.2897211406009603, "language_loss": 0.72726977, "learning_rate": 3.403890560803088e-06, "loss": 0.74927169, "num_input_tokens_seen": 90576930, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.9296875, "step": 4198, "time_per_iteration": 2.7390027046203613 }, { "auxiliary_loss_clip": 0.01144129, "auxiliary_loss_mlp": 0.01047613, "balance_loss_clip": 1.02979136, "balance_loss_mlp": 1.04913914, "epoch": 0.25245753795280323, "flos": 18515757836160.0, "grad_norm": 2.173605822439683, "language_loss": 0.7698198, "learning_rate": 3.4036214390260546e-06, "loss": 0.79173726, "num_input_tokens_seen": 90595710, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.953125, "step": 4199, "time_per_iteration": 2.6249377727508545 }, { "auxiliary_loss_clip": 0.01159308, "auxiliary_loss_mlp": 0.0128978, "balance_loss_clip": 1.02616858, "balance_loss_mlp": 1.04827809, "epoch": 0.2525176612054712, "flos": 32344884737280.0, "grad_norm": 1.7824594024013838, "language_loss": 0.72656077, "learning_rate": 3.403352267157383e-06, "loss": 0.7510516, "num_input_tokens_seen": 90617945, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.93359375, "step": 4200, "time_per_iteration": 2.781256914138794 }, { "auxiliary_loss_clip": 0.01149855, "auxiliary_loss_mlp": 0.0104725, "balance_loss_clip": 1.03155017, "balance_loss_mlp": 1.04807043, "epoch": 0.25257778445813917, "flos": 45258217459200.0, "grad_norm": 1.510268993999025, "language_loss": 0.82380122, "learning_rate": 3.4030830452066785e-06, "loss": 0.84577227, "num_input_tokens_seen": 90640855, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.9296875, "step": 4201, "time_per_iteration": 2.8657987117767334 }, { "auxiliary_loss_clip": 0.01183066, "auxiliary_loss_mlp": 0.01048336, "balance_loss_clip": 1.03116965, "balance_loss_mlp": 1.05235147, "epoch": 0.25263790771080713, "flos": 23367791923200.0, "grad_norm": 2.514918999001432, "language_loss": 0.75047207, "learning_rate": 3.4028137731835492e-06, "loss": 0.77278602, "num_input_tokens_seen": 90661350, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.94921875, "step": 4202, "time_per_iteration": 2.6637749671936035 }, { "auxiliary_loss_clip": 0.01146123, "auxiliary_loss_mlp": 0.01039465, "balance_loss_clip": 1.02249527, "balance_loss_mlp": 1.04738474, "epoch": 0.25269803096347515, "flos": 18515039564160.0, "grad_norm": 1.9593651650245223, "language_loss": 0.72984171, "learning_rate": 3.4025444510976045e-06, "loss": 0.7516976, "num_input_tokens_seen": 90680540, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8984375, "step": 4203, "time_per_iteration": 2.4869327545166016 }, { "auxiliary_loss_clip": 0.01157775, "auxiliary_loss_mlp": 0.01040992, "balance_loss_clip": 1.02360487, "balance_loss_mlp": 1.04824114, "epoch": 0.2527581542161431, "flos": 24610552629120.0, "grad_norm": 1.6888858395977684, "language_loss": 0.77469093, "learning_rate": 3.4022750789584568e-06, "loss": 0.7966786, "num_input_tokens_seen": 90703460, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9140625, "step": 4204, "time_per_iteration": 2.657794713973999 }, { "auxiliary_loss_clip": 0.01167625, "auxiliary_loss_mlp": 0.01050311, "balance_loss_clip": 1.03370464, "balance_loss_mlp": 1.04759669, "epoch": 0.2528182774688111, "flos": 12641275111680.0, "grad_norm": 2.0569251367341765, "language_loss": 0.71954536, "learning_rate": 3.4020056567757183e-06, "loss": 0.74172467, "num_input_tokens_seen": 90718815, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9296875, "step": 4205, "time_per_iteration": 2.5678868293762207 }, { "auxiliary_loss_clip": 0.01145847, "auxiliary_loss_mlp": 0.01036745, "balance_loss_clip": 1.02176082, "balance_loss_mlp": 1.05027676, "epoch": 0.25287840072147905, "flos": 46936789879680.0, "grad_norm": 1.3635289516917513, "language_loss": 0.75193739, "learning_rate": 3.401736184559005e-06, "loss": 0.7737633, "num_input_tokens_seen": 90742125, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.86328125, "step": 4206, "time_per_iteration": 2.8363587856292725 }, { "auxiliary_loss_clip": 0.01165948, "auxiliary_loss_mlp": 0.01039541, "balance_loss_clip": 1.0228399, "balance_loss_mlp": 1.04796672, "epoch": 0.252938523974147, "flos": 18879712392960.0, "grad_norm": 1.7855426791501519, "language_loss": 0.78521085, "learning_rate": 3.401466662317932e-06, "loss": 0.80726576, "num_input_tokens_seen": 90760785, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.91015625, "step": 4207, "time_per_iteration": 2.7575275897979736 }, { "auxiliary_loss_clip": 0.0114419, "auxiliary_loss_mlp": 0.0104821, "balance_loss_clip": 1.03168726, "balance_loss_mlp": 1.04774976, "epoch": 0.252998647226815, "flos": 21434720664960.0, "grad_norm": 1.5979512720626519, "language_loss": 0.76326609, "learning_rate": 3.4011970900621192e-06, "loss": 0.78519011, "num_input_tokens_seen": 90780045, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.875, "step": 4208, "time_per_iteration": 2.697840690612793 }, { "auxiliary_loss_clip": 0.01145921, "auxiliary_loss_mlp": 0.01035744, "balance_loss_clip": 1.01953197, "balance_loss_mlp": 1.04858541, "epoch": 0.25305877047948294, "flos": 25442171285760.0, "grad_norm": 2.58451358509668, "language_loss": 0.69607937, "learning_rate": 3.400927467801186e-06, "loss": 0.71789598, "num_input_tokens_seen": 90797980, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.88671875, "step": 4209, "time_per_iteration": 2.674769401550293 }, { "auxiliary_loss_clip": 0.01062213, "auxiliary_loss_mlp": 0.01010861, "balance_loss_clip": 1.0088706, "balance_loss_mlp": 1.0247674, "epoch": 0.2531188937321509, "flos": 60185603629440.0, "grad_norm": 0.7690136328533561, "language_loss": 0.55133462, "learning_rate": 3.400657795544756e-06, "loss": 0.57206535, "num_input_tokens_seen": 90864865, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.28515625, "step": 4210, "time_per_iteration": 3.165552854537964 }, { "auxiliary_loss_clip": 0.01166911, "auxiliary_loss_mlp": 0.01037422, "balance_loss_clip": 1.02107811, "balance_loss_mlp": 1.04909861, "epoch": 0.25317901698481887, "flos": 19682387665920.0, "grad_norm": 2.434042803616098, "language_loss": 0.79917407, "learning_rate": 3.400388073302452e-06, "loss": 0.82121736, "num_input_tokens_seen": 90882885, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.9140625, "step": 4211, "time_per_iteration": 4.001680850982666 }, { "auxiliary_loss_clip": 0.01170276, "auxiliary_loss_mlp": 0.01039146, "balance_loss_clip": 1.02335691, "balance_loss_mlp": 1.04769921, "epoch": 0.25323914023748684, "flos": 24424355502720.0, "grad_norm": 5.251713204241796, "language_loss": 0.7824778, "learning_rate": 3.4001183010838995e-06, "loss": 0.80457199, "num_input_tokens_seen": 90902985, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.87109375, "step": 4212, "time_per_iteration": 2.7790584564208984 }, { "auxiliary_loss_clip": 0.01148082, "auxiliary_loss_mlp": 0.01039747, "balance_loss_clip": 1.0222944, "balance_loss_mlp": 1.04832971, "epoch": 0.2532992634901548, "flos": 25447450584960.0, "grad_norm": 2.0257778579462107, "language_loss": 0.5308553, "learning_rate": 3.3998484788987264e-06, "loss": 0.55273354, "num_input_tokens_seen": 90923550, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.90625, "step": 4213, "time_per_iteration": 2.6140666007995605 }, { "auxiliary_loss_clip": 0.01158113, "auxiliary_loss_mlp": 0.01043804, "balance_loss_clip": 1.02600622, "balance_loss_mlp": 1.049595, "epoch": 0.25335938674282277, "flos": 18880538405760.0, "grad_norm": 2.2840662576370834, "language_loss": 0.64489228, "learning_rate": 3.3995786067565623e-06, "loss": 0.66691142, "num_input_tokens_seen": 90943260, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.90625, "step": 4214, "time_per_iteration": 2.5955300331115723 }, { "auxiliary_loss_clip": 0.0106277, "auxiliary_loss_mlp": 0.01001641, "balance_loss_clip": 0.99961454, "balance_loss_mlp": 1.02506256, "epoch": 0.25341950999549073, "flos": 53062649936640.0, "grad_norm": 0.8484002631063345, "language_loss": 0.58058345, "learning_rate": 3.3993086846670376e-06, "loss": 0.60122752, "num_input_tokens_seen": 90996295, "router_z_loss_clip": 0.02026367, "router_z_loss_mlp": 0.28515625, "step": 4215, "time_per_iteration": 4.4085962772369385 }, { "auxiliary_loss_clip": 0.01158877, "auxiliary_loss_mlp": 0.01044611, "balance_loss_clip": 1.02694428, "balance_loss_mlp": 1.0519402, "epoch": 0.2534796332481587, "flos": 39020247054720.0, "grad_norm": 1.6717359653913793, "language_loss": 0.83440948, "learning_rate": 3.3990387126397854e-06, "loss": 0.85644442, "num_input_tokens_seen": 91017545, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.890625, "step": 4216, "time_per_iteration": 4.1674933433532715 }, { "auxiliary_loss_clip": 0.01149976, "auxiliary_loss_mlp": 0.01041472, "balance_loss_clip": 1.02461576, "balance_loss_mlp": 1.0509268, "epoch": 0.2535397565008267, "flos": 23586990670080.0, "grad_norm": 1.755026771777927, "language_loss": 0.8011471, "learning_rate": 3.3987686906844404e-06, "loss": 0.82306159, "num_input_tokens_seen": 91037715, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8984375, "step": 4217, "time_per_iteration": 2.6848018169403076 }, { "auxiliary_loss_clip": 0.01155085, "auxiliary_loss_mlp": 0.01040852, "balance_loss_clip": 1.02466297, "balance_loss_mlp": 1.0479542, "epoch": 0.2535998797534947, "flos": 19281373251840.0, "grad_norm": 2.09723013037735, "language_loss": 0.74625039, "learning_rate": 3.398498618810639e-06, "loss": 0.7682097, "num_input_tokens_seen": 91055295, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.890625, "step": 4218, "time_per_iteration": 4.200652837753296 }, { "auxiliary_loss_clip": 0.01151012, "auxiliary_loss_mlp": 0.01043466, "balance_loss_clip": 1.02592981, "balance_loss_mlp": 1.0495491, "epoch": 0.25366000300616265, "flos": 24024382583040.0, "grad_norm": 1.7543453723459046, "language_loss": 0.74616218, "learning_rate": 3.398228497028019e-06, "loss": 0.76810694, "num_input_tokens_seen": 91075485, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.92578125, "step": 4219, "time_per_iteration": 2.6169373989105225 }, { "auxiliary_loss_clip": 0.01154703, "auxiliary_loss_mlp": 0.01050896, "balance_loss_clip": 1.03342032, "balance_loss_mlp": 1.0516181, "epoch": 0.2537201262588306, "flos": 16289368116480.0, "grad_norm": 1.9716291417736886, "language_loss": 0.81224364, "learning_rate": 3.397958325346221e-06, "loss": 0.83429962, "num_input_tokens_seen": 91093620, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9375, "step": 4220, "time_per_iteration": 2.7214322090148926 }, { "auxiliary_loss_clip": 0.01180157, "auxiliary_loss_mlp": 0.01051447, "balance_loss_clip": 1.03349376, "balance_loss_mlp": 1.05314231, "epoch": 0.2537802495114986, "flos": 23294677789440.0, "grad_norm": 1.84967668454979, "language_loss": 0.70891476, "learning_rate": 3.397688103774886e-06, "loss": 0.73123074, "num_input_tokens_seen": 91114110, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.91015625, "step": 4221, "time_per_iteration": 2.573561191558838 }, { "auxiliary_loss_clip": 0.01141132, "auxiliary_loss_mlp": 0.01040381, "balance_loss_clip": 1.02298808, "balance_loss_mlp": 1.04949212, "epoch": 0.25384037276416654, "flos": 17639142416640.0, "grad_norm": 1.7133472661994744, "language_loss": 0.61354572, "learning_rate": 3.397417832323658e-06, "loss": 0.63536084, "num_input_tokens_seen": 91133135, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9140625, "step": 4222, "time_per_iteration": 2.5566229820251465 }, { "auxiliary_loss_clip": 0.0115392, "auxiliary_loss_mlp": 0.01050469, "balance_loss_clip": 1.03343368, "balance_loss_mlp": 1.05131459, "epoch": 0.2539004960168345, "flos": 21507044699520.0, "grad_norm": 1.7089423336733345, "language_loss": 0.74516082, "learning_rate": 3.397147511002182e-06, "loss": 0.7672047, "num_input_tokens_seen": 91151805, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9375, "step": 4223, "time_per_iteration": 2.594196319580078 }, { "auxiliary_loss_clip": 0.01175943, "auxiliary_loss_mlp": 0.0103581, "balance_loss_clip": 1.01982427, "balance_loss_mlp": 1.05277348, "epoch": 0.2539606192695025, "flos": 23950909313280.0, "grad_norm": 1.5961218854493975, "language_loss": 0.79940367, "learning_rate": 3.3968771398201056e-06, "loss": 0.82152116, "num_input_tokens_seen": 91172270, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.875, "step": 4224, "time_per_iteration": 2.6358754634857178 }, { "auxiliary_loss_clip": 0.01147777, "auxiliary_loss_mlp": 0.01290744, "balance_loss_clip": 1.02616334, "balance_loss_mlp": 1.04944563, "epoch": 0.25402074252217044, "flos": 24169784837760.0, "grad_norm": 2.05290640573376, "language_loss": 0.77304554, "learning_rate": 3.396606718787077e-06, "loss": 0.79743081, "num_input_tokens_seen": 91192080, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.890625, "step": 4225, "time_per_iteration": 2.6474990844726562 }, { "auxiliary_loss_clip": 0.01150239, "auxiliary_loss_mlp": 0.01050934, "balance_loss_clip": 1.03350568, "balance_loss_mlp": 1.05017662, "epoch": 0.2540808657748384, "flos": 22303758314880.0, "grad_norm": 2.136736106565679, "language_loss": 0.84215677, "learning_rate": 3.396336247912747e-06, "loss": 0.86416852, "num_input_tokens_seen": 91211450, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9140625, "step": 4226, "time_per_iteration": 2.67997670173645 }, { "auxiliary_loss_clip": 0.01138173, "auxiliary_loss_mlp": 0.0104352, "balance_loss_clip": 1.02571023, "balance_loss_mlp": 1.04923296, "epoch": 0.25414098902750637, "flos": 27599541022080.0, "grad_norm": 1.7964408792010464, "language_loss": 0.71093911, "learning_rate": 3.396065727206768e-06, "loss": 0.73275602, "num_input_tokens_seen": 91231835, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.88671875, "step": 4227, "time_per_iteration": 2.677687883377075 }, { "auxiliary_loss_clip": 0.01050359, "auxiliary_loss_mlp": 0.01008036, "balance_loss_clip": 1.0062356, "balance_loss_mlp": 1.02241898, "epoch": 0.25420111228017434, "flos": 58170834887040.0, "grad_norm": 0.9811233250606337, "language_loss": 0.61862576, "learning_rate": 3.395795156678795e-06, "loss": 0.63920963, "num_input_tokens_seen": 91288755, "router_z_loss_clip": 0.01794434, "router_z_loss_mlp": 0.28125, "step": 4228, "time_per_iteration": 2.998377561569214 }, { "auxiliary_loss_clip": 0.01151409, "auxiliary_loss_mlp": 0.01041355, "balance_loss_clip": 1.02336586, "balance_loss_mlp": 1.05029678, "epoch": 0.2542612355328423, "flos": 11464409905920.0, "grad_norm": 2.9111869350506847, "language_loss": 0.86119598, "learning_rate": 3.395524536338483e-06, "loss": 0.88312358, "num_input_tokens_seen": 91302485, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.921875, "step": 4229, "time_per_iteration": 2.5519821643829346 }, { "auxiliary_loss_clip": 0.01168981, "auxiliary_loss_mlp": 0.01047682, "balance_loss_clip": 1.02997923, "balance_loss_mlp": 1.05115092, "epoch": 0.2543213587855103, "flos": 22965879669120.0, "grad_norm": 2.032424324354941, "language_loss": 0.77101231, "learning_rate": 3.3952538661954893e-06, "loss": 0.79317904, "num_input_tokens_seen": 91321120, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.90625, "step": 4230, "time_per_iteration": 2.659811496734619 }, { "auxiliary_loss_clip": 0.01148523, "auxiliary_loss_mlp": 0.01046036, "balance_loss_clip": 1.02829742, "balance_loss_mlp": 1.04920506, "epoch": 0.2543814820381783, "flos": 18253178438400.0, "grad_norm": 2.1010449817149426, "language_loss": 0.74808371, "learning_rate": 3.3949831462594743e-06, "loss": 0.77002931, "num_input_tokens_seen": 91338575, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.90234375, "step": 4231, "time_per_iteration": 2.532188653945923 }, { "auxiliary_loss_clip": 0.01146625, "auxiliary_loss_mlp": 0.01042656, "balance_loss_clip": 1.02599072, "balance_loss_mlp": 1.04870009, "epoch": 0.25444160529084625, "flos": 15632705629440.0, "grad_norm": 1.6812198902015616, "language_loss": 0.73879987, "learning_rate": 3.3947123765400994e-06, "loss": 0.76069272, "num_input_tokens_seen": 91357355, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.88671875, "step": 4232, "time_per_iteration": 2.5593326091766357 }, { "auxiliary_loss_clip": 0.01141595, "auxiliary_loss_mlp": 0.01043457, "balance_loss_clip": 1.02557528, "balance_loss_mlp": 1.05122125, "epoch": 0.2545017285435142, "flos": 24601610142720.0, "grad_norm": 1.9149715603628776, "language_loss": 0.86265004, "learning_rate": 3.394441557047028e-06, "loss": 0.88450056, "num_input_tokens_seen": 91376515, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.90234375, "step": 4233, "time_per_iteration": 2.710632562637329 }, { "auxiliary_loss_clip": 0.01175675, "auxiliary_loss_mlp": 0.01039352, "balance_loss_clip": 1.02424777, "balance_loss_mlp": 1.04551673, "epoch": 0.2545618517961822, "flos": 24679069822080.0, "grad_norm": 1.783664395034368, "language_loss": 0.7485615, "learning_rate": 3.3941706877899236e-06, "loss": 0.77071178, "num_input_tokens_seen": 91397595, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.84765625, "step": 4234, "time_per_iteration": 2.618708848953247 }, { "auxiliary_loss_clip": 0.01156651, "auxiliary_loss_mlp": 0.01044933, "balance_loss_clip": 1.02895927, "balance_loss_mlp": 1.04750061, "epoch": 0.25462197504885015, "flos": 23915106432000.0, "grad_norm": 1.452312438132265, "language_loss": 0.7473954, "learning_rate": 3.393899768778454e-06, "loss": 0.76941121, "num_input_tokens_seen": 91417775, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.91015625, "step": 4235, "time_per_iteration": 2.5747663974761963 }, { "auxiliary_loss_clip": 0.01173294, "auxiliary_loss_mlp": 0.01290094, "balance_loss_clip": 1.02367592, "balance_loss_mlp": 1.04932916, "epoch": 0.2546820983015181, "flos": 24789387467520.0, "grad_norm": 2.24652988309963, "language_loss": 0.64449716, "learning_rate": 3.393628800022287e-06, "loss": 0.66913098, "num_input_tokens_seen": 91437665, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.96875, "step": 4236, "time_per_iteration": 2.536750078201294 }, { "auxiliary_loss_clip": 0.01136053, "auxiliary_loss_mlp": 0.01035902, "balance_loss_clip": 1.02076244, "balance_loss_mlp": 1.04772055, "epoch": 0.2547422215541861, "flos": 18734130570240.0, "grad_norm": 1.5617120851855009, "language_loss": 0.66717625, "learning_rate": 3.393357781531093e-06, "loss": 0.68889576, "num_input_tokens_seen": 91456705, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8828125, "step": 4237, "time_per_iteration": 2.5572664737701416 }, { "auxiliary_loss_clip": 0.0113871, "auxiliary_loss_mlp": 0.01045608, "balance_loss_clip": 1.02879941, "balance_loss_mlp": 1.04826915, "epoch": 0.25480234480685404, "flos": 21032449274880.0, "grad_norm": 1.9833604180571183, "language_loss": 0.74777877, "learning_rate": 3.393086713314544e-06, "loss": 0.76962197, "num_input_tokens_seen": 91475535, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.90234375, "step": 4238, "time_per_iteration": 2.5126140117645264 }, { "auxiliary_loss_clip": 0.01149489, "auxiliary_loss_mlp": 0.01045088, "balance_loss_clip": 1.02726603, "balance_loss_mlp": 1.04947686, "epoch": 0.254862468059522, "flos": 25082167224960.0, "grad_norm": 1.9867258889663733, "language_loss": 0.8037287, "learning_rate": 3.3928155953823137e-06, "loss": 0.82567441, "num_input_tokens_seen": 91499140, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.91015625, "step": 4239, "time_per_iteration": 2.6264281272888184 }, { "auxiliary_loss_clip": 0.01146045, "auxiliary_loss_mlp": 0.01046227, "balance_loss_clip": 1.02901316, "balance_loss_mlp": 1.04814434, "epoch": 0.25492259131219, "flos": 20558392554240.0, "grad_norm": 1.58623841010892, "language_loss": 0.77165443, "learning_rate": 3.3925444277440774e-06, "loss": 0.79357713, "num_input_tokens_seen": 91518335, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.88671875, "step": 4240, "time_per_iteration": 2.5595595836639404 }, { "auxiliary_loss_clip": 0.01142434, "auxiliary_loss_mlp": 0.0103928, "balance_loss_clip": 1.0213393, "balance_loss_mlp": 1.04817319, "epoch": 0.25498271456485794, "flos": 25042485674880.0, "grad_norm": 2.0476348328802025, "language_loss": 0.83252794, "learning_rate": 3.392273210409512e-06, "loss": 0.85434508, "num_input_tokens_seen": 91537655, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.94140625, "step": 4241, "time_per_iteration": 2.7061164379119873 }, { "auxiliary_loss_clip": 0.01148796, "auxiliary_loss_mlp": 0.01046349, "balance_loss_clip": 1.02931356, "balance_loss_mlp": 1.04802847, "epoch": 0.2550428378175259, "flos": 26178412354560.0, "grad_norm": 3.304126412247526, "language_loss": 0.73110235, "learning_rate": 3.392001943388298e-06, "loss": 0.75305378, "num_input_tokens_seen": 91557545, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.921875, "step": 4242, "time_per_iteration": 2.6503584384918213 }, { "auxiliary_loss_clip": 0.01139251, "auxiliary_loss_mlp": 0.01291295, "balance_loss_clip": 1.02700639, "balance_loss_mlp": 1.04797781, "epoch": 0.2551029610701939, "flos": 15267170874240.0, "grad_norm": 2.810083269244404, "language_loss": 0.72278225, "learning_rate": 3.3917306266901146e-06, "loss": 0.74708772, "num_input_tokens_seen": 91574405, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9140625, "step": 4243, "time_per_iteration": 2.656507730484009 }, { "auxiliary_loss_clip": 0.01147209, "auxiliary_loss_mlp": 0.01044713, "balance_loss_clip": 1.02707028, "balance_loss_mlp": 1.04774332, "epoch": 0.2551630843228619, "flos": 18112193556480.0, "grad_norm": 1.5570217847296768, "language_loss": 0.81505442, "learning_rate": 3.3914592603246458e-06, "loss": 0.83697361, "num_input_tokens_seen": 91593755, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.90625, "step": 4244, "time_per_iteration": 2.5619020462036133 }, { "auxiliary_loss_clip": 0.01055026, "auxiliary_loss_mlp": 0.01004761, "balance_loss_clip": 1.00312829, "balance_loss_mlp": 1.01812601, "epoch": 0.25522320757552985, "flos": 70520192167680.0, "grad_norm": 0.6855331689083616, "language_loss": 0.57721531, "learning_rate": 3.391187844301575e-06, "loss": 0.59781313, "num_input_tokens_seen": 91660335, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.28125, "step": 4245, "time_per_iteration": 3.2598958015441895 }, { "auxiliary_loss_clip": 0.01138392, "auxiliary_loss_mlp": 0.01052901, "balance_loss_clip": 1.03580654, "balance_loss_mlp": 1.04597902, "epoch": 0.2552833308281978, "flos": 22893088757760.0, "grad_norm": 2.8730229781874828, "language_loss": 0.65074146, "learning_rate": 3.3909163786305884e-06, "loss": 0.67265445, "num_input_tokens_seen": 91678500, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.921875, "step": 4246, "time_per_iteration": 2.5753417015075684 }, { "auxiliary_loss_clip": 0.01139184, "auxiliary_loss_mlp": 0.01046368, "balance_loss_clip": 1.03114522, "balance_loss_mlp": 1.04480469, "epoch": 0.2553434540808658, "flos": 22053605022720.0, "grad_norm": 2.174397445714642, "language_loss": 0.81062186, "learning_rate": 3.390644863321374e-06, "loss": 0.83247739, "num_input_tokens_seen": 91696430, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.85546875, "step": 4247, "time_per_iteration": 2.5608935356140137 }, { "auxiliary_loss_clip": 0.01152335, "auxiliary_loss_mlp": 0.01049892, "balance_loss_clip": 1.03123534, "balance_loss_mlp": 1.04628658, "epoch": 0.25540357733353375, "flos": 16544190176640.0, "grad_norm": 2.480955663752869, "language_loss": 0.83104062, "learning_rate": 3.390373298383622e-06, "loss": 0.85306287, "num_input_tokens_seen": 91713270, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.96875, "step": 4248, "time_per_iteration": 2.5417568683624268 }, { "auxiliary_loss_clip": 0.01154867, "auxiliary_loss_mlp": 0.0104653, "balance_loss_clip": 1.03001976, "balance_loss_mlp": 1.04608488, "epoch": 0.2554637005862017, "flos": 17565022702080.0, "grad_norm": 2.3060830163458146, "language_loss": 0.84371567, "learning_rate": 3.390101683827023e-06, "loss": 0.86572957, "num_input_tokens_seen": 91728865, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.90625, "step": 4249, "time_per_iteration": 2.514559745788574 }, { "auxiliary_loss_clip": 0.01089425, "auxiliary_loss_mlp": 0.00999034, "balance_loss_clip": 0.99747211, "balance_loss_mlp": 1.01667786, "epoch": 0.2555238238388697, "flos": 72244763953920.0, "grad_norm": 0.754705992120447, "language_loss": 0.56280404, "learning_rate": 3.389830019661271e-06, "loss": 0.58368862, "num_input_tokens_seen": 91787470, "router_z_loss_clip": 0.015625, "router_z_loss_mlp": 0.28125, "step": 4250, "time_per_iteration": 3.2402093410491943 }, { "auxiliary_loss_clip": 0.01154612, "auxiliary_loss_mlp": 0.01037362, "balance_loss_clip": 1.01968324, "balance_loss_mlp": 1.04525065, "epoch": 0.25558394709153764, "flos": 24389414547840.0, "grad_norm": 1.9988038654084463, "language_loss": 0.80154628, "learning_rate": 3.3895583058960604e-06, "loss": 0.82346606, "num_input_tokens_seen": 91805640, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9140625, "step": 4251, "time_per_iteration": 2.6470985412597656 }, { "auxiliary_loss_clip": 0.01071648, "auxiliary_loss_mlp": 0.0100087, "balance_loss_clip": 0.99922448, "balance_loss_mlp": 1.01609755, "epoch": 0.2556440703442056, "flos": 69231213636480.0, "grad_norm": 0.7950415210012056, "language_loss": 0.66109502, "learning_rate": 3.3892865425410884e-06, "loss": 0.68182027, "num_input_tokens_seen": 91869695, "router_z_loss_clip": 0.01647949, "router_z_loss_mlp": 0.28125, "step": 4252, "time_per_iteration": 3.1967883110046387 }, { "auxiliary_loss_clip": 0.01142834, "auxiliary_loss_mlp": 0.01040412, "balance_loss_clip": 1.02506936, "balance_loss_mlp": 1.04632688, "epoch": 0.2557041935968736, "flos": 24863902231680.0, "grad_norm": 2.354701240364745, "language_loss": 0.73264569, "learning_rate": 3.389014729606054e-06, "loss": 0.75447816, "num_input_tokens_seen": 91889920, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.87890625, "step": 4253, "time_per_iteration": 4.063133478164673 }, { "auxiliary_loss_clip": 0.01164513, "auxiliary_loss_mlp": 0.01045118, "balance_loss_clip": 1.02914393, "balance_loss_mlp": 1.04778767, "epoch": 0.25576431684954154, "flos": 22492110257280.0, "grad_norm": 2.1904588732784545, "language_loss": 0.72076035, "learning_rate": 3.388742867100656e-06, "loss": 0.74285662, "num_input_tokens_seen": 91908665, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.890625, "step": 4254, "time_per_iteration": 2.5792689323425293 }, { "auxiliary_loss_clip": 0.01171774, "auxiliary_loss_mlp": 0.01294484, "balance_loss_clip": 1.02995944, "balance_loss_mlp": 1.04585183, "epoch": 0.2558244401022095, "flos": 19826748426240.0, "grad_norm": 2.663985053708746, "language_loss": 0.80873275, "learning_rate": 3.388470955034598e-06, "loss": 0.8333953, "num_input_tokens_seen": 91927855, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8984375, "step": 4255, "time_per_iteration": 2.6313164234161377 }, { "auxiliary_loss_clip": 0.01152285, "auxiliary_loss_mlp": 0.01042613, "balance_loss_clip": 1.02585196, "balance_loss_mlp": 1.04594696, "epoch": 0.2558845633548775, "flos": 23220486247680.0, "grad_norm": 1.468114266843699, "language_loss": 0.85434282, "learning_rate": 3.3881989934175822e-06, "loss": 0.87629175, "num_input_tokens_seen": 91948500, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8828125, "step": 4256, "time_per_iteration": 4.028311729431152 }, { "auxiliary_loss_clip": 0.01169313, "auxiliary_loss_mlp": 0.01049339, "balance_loss_clip": 1.03139794, "balance_loss_mlp": 1.04853392, "epoch": 0.2559446866075455, "flos": 16837867774080.0, "grad_norm": 1.726177208276294, "language_loss": 0.74996954, "learning_rate": 3.387926982259316e-06, "loss": 0.77215606, "num_input_tokens_seen": 91968375, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9375, "step": 4257, "time_per_iteration": 4.17704439163208 }, { "auxiliary_loss_clip": 0.01078659, "auxiliary_loss_mlp": 0.01008411, "balance_loss_clip": 1.00680184, "balance_loss_mlp": 1.01487207, "epoch": 0.25600480986021346, "flos": 57593786895360.0, "grad_norm": 0.7936474134720224, "language_loss": 0.65222406, "learning_rate": 3.387654921569505e-06, "loss": 0.67309475, "num_input_tokens_seen": 92028490, "router_z_loss_clip": 0.01611328, "router_z_loss_mlp": 0.27734375, "step": 4258, "time_per_iteration": 3.1585121154785156 }, { "auxiliary_loss_clip": 0.0117135, "auxiliary_loss_mlp": 0.01288054, "balance_loss_clip": 1.02282917, "balance_loss_mlp": 1.04839408, "epoch": 0.2560649331128814, "flos": 27819529868160.0, "grad_norm": 1.7325926479870475, "language_loss": 0.76395929, "learning_rate": 3.3873828113578604e-06, "loss": 0.78855336, "num_input_tokens_seen": 92048060, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.875, "step": 4259, "time_per_iteration": 2.7229018211364746 }, { "auxiliary_loss_clip": 0.0106865, "auxiliary_loss_mlp": 0.01005829, "balance_loss_clip": 1.00412452, "balance_loss_mlp": 1.01414251, "epoch": 0.2561250563655494, "flos": 70950509101440.0, "grad_norm": 0.7843173968676939, "language_loss": 0.58427513, "learning_rate": 3.387110651634092e-06, "loss": 0.60501993, "num_input_tokens_seen": 92118180, "router_z_loss_clip": 0.01708984, "router_z_loss_mlp": 0.27734375, "step": 4260, "time_per_iteration": 5.523035764694214 }, { "auxiliary_loss_clip": 0.01145133, "auxiliary_loss_mlp": 0.01048783, "balance_loss_clip": 1.03164053, "balance_loss_mlp": 1.04565907, "epoch": 0.25618517961821735, "flos": 27012329481600.0, "grad_norm": 2.2092363668552566, "language_loss": 0.77102131, "learning_rate": 3.3868384424079122e-06, "loss": 0.79296046, "num_input_tokens_seen": 92137570, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.90625, "step": 4261, "time_per_iteration": 2.590200185775757 }, { "auxiliary_loss_clip": 0.01135962, "auxiliary_loss_mlp": 0.01038113, "balance_loss_clip": 1.0230453, "balance_loss_mlp": 1.04800713, "epoch": 0.2562453028708853, "flos": 23068296322560.0, "grad_norm": 1.5702740382051352, "language_loss": 0.8303349, "learning_rate": 3.3865661836890356e-06, "loss": 0.85207564, "num_input_tokens_seen": 92157625, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.87890625, "step": 4262, "time_per_iteration": 2.638796091079712 }, { "auxiliary_loss_clip": 0.01149517, "auxiliary_loss_mlp": 0.01043282, "balance_loss_clip": 1.025913, "balance_loss_mlp": 1.04606605, "epoch": 0.2563054261235533, "flos": 15120942606720.0, "grad_norm": 2.2877531582600583, "language_loss": 0.72989041, "learning_rate": 3.3862938754871786e-06, "loss": 0.75181842, "num_input_tokens_seen": 92175350, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9453125, "step": 4263, "time_per_iteration": 2.4926183223724365 }, { "auxiliary_loss_clip": 0.01154431, "auxiliary_loss_mlp": 0.01299446, "balance_loss_clip": 1.03545344, "balance_loss_mlp": 1.04880381, "epoch": 0.25636554937622125, "flos": 27854865872640.0, "grad_norm": 1.779421906365733, "language_loss": 0.82839823, "learning_rate": 3.3860215178120597e-06, "loss": 0.85293698, "num_input_tokens_seen": 92196070, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.875, "step": 4264, "time_per_iteration": 2.7122581005096436 }, { "auxiliary_loss_clip": 0.01146568, "auxiliary_loss_mlp": 0.01043049, "balance_loss_clip": 1.02544141, "balance_loss_mlp": 1.04726243, "epoch": 0.2564256726288892, "flos": 28906509288960.0, "grad_norm": 1.7898729438562229, "language_loss": 0.746647, "learning_rate": 3.385749110673398e-06, "loss": 0.76854318, "num_input_tokens_seen": 92216310, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.90234375, "step": 4265, "time_per_iteration": 2.55009126663208 }, { "auxiliary_loss_clip": 0.01159469, "auxiliary_loss_mlp": 0.0103477, "balance_loss_clip": 1.0187726, "balance_loss_mlp": 1.0437988, "epoch": 0.2564857958815572, "flos": 18514931823360.0, "grad_norm": 4.215855756088298, "language_loss": 0.81758201, "learning_rate": 3.3854766540809143e-06, "loss": 0.83952439, "num_input_tokens_seen": 92234510, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.890625, "step": 4266, "time_per_iteration": 2.62196946144104 }, { "auxiliary_loss_clip": 0.01166899, "auxiliary_loss_mlp": 0.0103687, "balance_loss_clip": 1.02071691, "balance_loss_mlp": 1.04398203, "epoch": 0.25654591913422514, "flos": 25808280658560.0, "grad_norm": 1.5265141029253244, "language_loss": 0.79180419, "learning_rate": 3.3852041480443337e-06, "loss": 0.81384188, "num_input_tokens_seen": 92254070, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8671875, "step": 4267, "time_per_iteration": 2.5873401165008545 }, { "auxiliary_loss_clip": 0.01157478, "auxiliary_loss_mlp": 0.01040406, "balance_loss_clip": 1.02419376, "balance_loss_mlp": 1.04594493, "epoch": 0.2566060423868931, "flos": 19099665325440.0, "grad_norm": 1.5957525622170061, "language_loss": 0.79158759, "learning_rate": 3.3849315925733793e-06, "loss": 0.81356645, "num_input_tokens_seen": 92275060, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84765625, "step": 4268, "time_per_iteration": 2.6510138511657715 }, { "auxiliary_loss_clip": 0.01162758, "auxiliary_loss_mlp": 0.01289296, "balance_loss_clip": 1.02596259, "balance_loss_mlp": 1.04746783, "epoch": 0.25666616563956113, "flos": 23842674656640.0, "grad_norm": 1.4734047905764935, "language_loss": 0.67670774, "learning_rate": 3.384658987677779e-06, "loss": 0.70122826, "num_input_tokens_seen": 92293610, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8828125, "step": 4269, "time_per_iteration": 2.6753108501434326 }, { "auxiliary_loss_clip": 0.01155689, "auxiliary_loss_mlp": 0.01039917, "balance_loss_clip": 1.0235374, "balance_loss_mlp": 1.04785371, "epoch": 0.2567262888922291, "flos": 14604259420800.0, "grad_norm": 3.9747041116885073, "language_loss": 0.78731704, "learning_rate": 3.3843863333672617e-06, "loss": 0.80927306, "num_input_tokens_seen": 92308305, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.90234375, "step": 4270, "time_per_iteration": 2.608452558517456 }, { "auxiliary_loss_clip": 0.01177344, "auxiliary_loss_mlp": 0.01039252, "balance_loss_clip": 1.02064312, "balance_loss_mlp": 1.04718947, "epoch": 0.25678641214489706, "flos": 32923117877760.0, "grad_norm": 2.3329177627977873, "language_loss": 0.68452519, "learning_rate": 3.3841136296515574e-06, "loss": 0.70669115, "num_input_tokens_seen": 92329875, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.94140625, "step": 4271, "time_per_iteration": 2.6876885890960693 }, { "auxiliary_loss_clip": 0.01148216, "auxiliary_loss_mlp": 0.0129571, "balance_loss_clip": 1.03084838, "balance_loss_mlp": 1.04686093, "epoch": 0.256846535397565, "flos": 24098933260800.0, "grad_norm": 1.6948924130755165, "language_loss": 0.87434089, "learning_rate": 3.3838408765403974e-06, "loss": 0.89878011, "num_input_tokens_seen": 92348780, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.92578125, "step": 4272, "time_per_iteration": 2.6728806495666504 }, { "auxiliary_loss_clip": 0.01158188, "auxiliary_loss_mlp": 0.01043221, "balance_loss_clip": 1.02578092, "balance_loss_mlp": 1.04876947, "epoch": 0.256906658650233, "flos": 19718441942400.0, "grad_norm": 2.1662365479409282, "language_loss": 0.81433821, "learning_rate": 3.3835680740435164e-06, "loss": 0.83635235, "num_input_tokens_seen": 92368175, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9140625, "step": 4273, "time_per_iteration": 2.700162172317505 }, { "auxiliary_loss_clip": 0.01161325, "auxiliary_loss_mlp": 0.01037665, "balance_loss_clip": 1.02148843, "balance_loss_mlp": 1.04880285, "epoch": 0.25696678190290095, "flos": 22926018551040.0, "grad_norm": 2.085600748611511, "language_loss": 0.77356398, "learning_rate": 3.38329522217065e-06, "loss": 0.79555392, "num_input_tokens_seen": 92387755, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.85546875, "step": 4274, "time_per_iteration": 2.630154609680176 }, { "auxiliary_loss_clip": 0.01149048, "auxiliary_loss_mlp": 0.01035755, "balance_loss_clip": 1.01901793, "balance_loss_mlp": 1.04410434, "epoch": 0.2570269051555689, "flos": 27307838672640.0, "grad_norm": 1.75570477579612, "language_loss": 0.83858562, "learning_rate": 3.383022320931535e-06, "loss": 0.86043358, "num_input_tokens_seen": 92409850, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8671875, "step": 4275, "time_per_iteration": 2.639531135559082 }, { "auxiliary_loss_clip": 0.01155147, "auxiliary_loss_mlp": 0.01033024, "balance_loss_clip": 1.01608419, "balance_loss_mlp": 1.0467236, "epoch": 0.2570870284082369, "flos": 27563414918400.0, "grad_norm": 2.287350501270394, "language_loss": 0.78561735, "learning_rate": 3.3827493703359116e-06, "loss": 0.80749905, "num_input_tokens_seen": 92431250, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90234375, "step": 4276, "time_per_iteration": 2.6324567794799805 }, { "auxiliary_loss_clip": 0.01144993, "auxiliary_loss_mlp": 0.01042367, "balance_loss_clip": 1.02617788, "balance_loss_mlp": 1.04796934, "epoch": 0.25714715166090485, "flos": 28730834847360.0, "grad_norm": 1.7321851118751712, "language_loss": 0.79029119, "learning_rate": 3.38247637039352e-06, "loss": 0.81216478, "num_input_tokens_seen": 92452065, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8828125, "step": 4277, "time_per_iteration": 2.648899793624878 }, { "auxiliary_loss_clip": 0.01136503, "auxiliary_loss_mlp": 0.01038367, "balance_loss_clip": 1.02207077, "balance_loss_mlp": 1.0474062, "epoch": 0.2572072749135728, "flos": 20116152305280.0, "grad_norm": 2.357666572051431, "language_loss": 0.78272986, "learning_rate": 3.3822033211141018e-06, "loss": 0.80447865, "num_input_tokens_seen": 92470025, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.890625, "step": 4278, "time_per_iteration": 2.5400259494781494 }, { "auxiliary_loss_clip": 0.0115865, "auxiliary_loss_mlp": 0.01040611, "balance_loss_clip": 1.02350414, "balance_loss_mlp": 1.04870081, "epoch": 0.2572673981662408, "flos": 26030855283840.0, "grad_norm": 1.691103091858685, "language_loss": 0.74218464, "learning_rate": 3.381930222507403e-06, "loss": 0.7641772, "num_input_tokens_seen": 92489825, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.921875, "step": 4279, "time_per_iteration": 2.6433987617492676 }, { "auxiliary_loss_clip": 0.01144471, "auxiliary_loss_mlp": 0.01046657, "balance_loss_clip": 1.03015888, "balance_loss_mlp": 1.04480052, "epoch": 0.25732752141890874, "flos": 16106618695680.0, "grad_norm": 2.1338220687614577, "language_loss": 0.85113776, "learning_rate": 3.3816570745831696e-06, "loss": 0.87304902, "num_input_tokens_seen": 92507270, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.90625, "step": 4280, "time_per_iteration": 2.5139520168304443 }, { "auxiliary_loss_clip": 0.01147775, "auxiliary_loss_mlp": 0.01039499, "balance_loss_clip": 1.02220213, "balance_loss_mlp": 1.04775214, "epoch": 0.2573876446715767, "flos": 22524429519360.0, "grad_norm": 2.601120803374784, "language_loss": 0.79001069, "learning_rate": 3.3813838773511496e-06, "loss": 0.81188345, "num_input_tokens_seen": 92526300, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9140625, "step": 4281, "time_per_iteration": 2.713263750076294 }, { "auxiliary_loss_clip": 0.01165409, "auxiliary_loss_mlp": 0.0103854, "balance_loss_clip": 1.02192235, "balance_loss_mlp": 1.04866266, "epoch": 0.2574477679242447, "flos": 23950837486080.0, "grad_norm": 1.7675602187195665, "language_loss": 0.86929005, "learning_rate": 3.3811106308210916e-06, "loss": 0.89132953, "num_input_tokens_seen": 92546465, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8984375, "step": 4282, "time_per_iteration": 2.68306565284729 }, { "auxiliary_loss_clip": 0.01138663, "auxiliary_loss_mlp": 0.01043481, "balance_loss_clip": 1.02730417, "balance_loss_mlp": 1.04683065, "epoch": 0.2575078911769127, "flos": 21981711951360.0, "grad_norm": 2.686005603118987, "language_loss": 0.70380896, "learning_rate": 3.380837335002748e-06, "loss": 0.7256304, "num_input_tokens_seen": 92567260, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.91796875, "step": 4283, "time_per_iteration": 2.575704574584961 }, { "auxiliary_loss_clip": 0.01134196, "auxiliary_loss_mlp": 0.01289581, "balance_loss_clip": 1.0262816, "balance_loss_mlp": 1.04924858, "epoch": 0.25756801442958066, "flos": 21945406279680.0, "grad_norm": 1.8331431375444562, "language_loss": 0.80325139, "learning_rate": 3.380563989905872e-06, "loss": 0.82748914, "num_input_tokens_seen": 92585425, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84765625, "step": 4284, "time_per_iteration": 2.543891191482544 }, { "auxiliary_loss_clip": 0.01183899, "auxiliary_loss_mlp": 0.01039692, "balance_loss_clip": 1.02513719, "balance_loss_mlp": 1.04969382, "epoch": 0.2576281376822486, "flos": 35261980058880.0, "grad_norm": 2.7689432604843254, "language_loss": 0.69840014, "learning_rate": 3.3802905955402185e-06, "loss": 0.72063607, "num_input_tokens_seen": 92604770, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.890625, "step": 4285, "time_per_iteration": 2.7863175868988037 }, { "auxiliary_loss_clip": 0.01156501, "auxiliary_loss_mlp": 0.01038093, "balance_loss_clip": 1.02306056, "balance_loss_mlp": 1.04965162, "epoch": 0.2576882609349166, "flos": 14132285688960.0, "grad_norm": 2.3605711110277436, "language_loss": 0.58241951, "learning_rate": 3.3800171519155443e-06, "loss": 0.60436547, "num_input_tokens_seen": 92622635, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.88671875, "step": 4286, "time_per_iteration": 2.502403497695923 }, { "auxiliary_loss_clip": 0.01150735, "auxiliary_loss_mlp": 0.01051326, "balance_loss_clip": 1.03418386, "balance_loss_mlp": 1.04999804, "epoch": 0.25774838418758456, "flos": 23258336204160.0, "grad_norm": 1.698242824388542, "language_loss": 0.63870072, "learning_rate": 3.379743659041607e-06, "loss": 0.6607213, "num_input_tokens_seen": 92642960, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.921875, "step": 4287, "time_per_iteration": 2.6146068572998047 }, { "auxiliary_loss_clip": 0.01159825, "auxiliary_loss_mlp": 0.01040619, "balance_loss_clip": 1.02302337, "balance_loss_mlp": 1.04908943, "epoch": 0.2578085074402525, "flos": 22601745544320.0, "grad_norm": 1.9827124155778184, "language_loss": 0.71368319, "learning_rate": 3.3794701169281686e-06, "loss": 0.73568767, "num_input_tokens_seen": 92662455, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9296875, "step": 4288, "time_per_iteration": 2.583768129348755 }, { "auxiliary_loss_clip": 0.0116356, "auxiliary_loss_mlp": 0.01040384, "balance_loss_clip": 1.02492833, "balance_loss_mlp": 1.04870725, "epoch": 0.2578686306929205, "flos": 24571840746240.0, "grad_norm": 1.6915445426484532, "language_loss": 0.77064818, "learning_rate": 3.37919652558499e-06, "loss": 0.79268765, "num_input_tokens_seen": 92683520, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.87890625, "step": 4289, "time_per_iteration": 2.712677001953125 }, { "auxiliary_loss_clip": 0.01138783, "auxiliary_loss_mlp": 0.01286537, "balance_loss_clip": 1.02197266, "balance_loss_mlp": 1.04823351, "epoch": 0.25792875394558845, "flos": 18113953322880.0, "grad_norm": 3.1491832438493694, "language_loss": 0.85109317, "learning_rate": 3.3789228850218347e-06, "loss": 0.87534636, "num_input_tokens_seen": 92701450, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90625, "step": 4290, "time_per_iteration": 2.512500762939453 }, { "auxiliary_loss_clip": 0.01168408, "auxiliary_loss_mlp": 0.01050078, "balance_loss_clip": 1.0321492, "balance_loss_mlp": 1.04974174, "epoch": 0.2579888771982564, "flos": 17712902995200.0, "grad_norm": 2.748774729280066, "language_loss": 0.71994114, "learning_rate": 3.3786491952484686e-06, "loss": 0.74212605, "num_input_tokens_seen": 92720355, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9140625, "step": 4291, "time_per_iteration": 2.632296562194824 }, { "auxiliary_loss_clip": 0.01143824, "auxiliary_loss_mlp": 0.01045056, "balance_loss_clip": 1.02724576, "balance_loss_mlp": 1.05154562, "epoch": 0.2580490004509244, "flos": 16434878112000.0, "grad_norm": 4.535205161829959, "language_loss": 0.80617785, "learning_rate": 3.378375456274659e-06, "loss": 0.82806665, "num_input_tokens_seen": 92736755, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.921875, "step": 4292, "time_per_iteration": 2.567418098449707 }, { "auxiliary_loss_clip": 0.01143949, "auxiliary_loss_mlp": 0.01048998, "balance_loss_clip": 1.03185558, "balance_loss_mlp": 1.05258, "epoch": 0.25810912370359235, "flos": 33835141128960.0, "grad_norm": 2.814803042867267, "language_loss": 0.68000543, "learning_rate": 3.378101668110175e-06, "loss": 0.70193481, "num_input_tokens_seen": 92757655, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9140625, "step": 4293, "time_per_iteration": 2.6548068523406982 }, { "auxiliary_loss_clip": 0.01161535, "auxiliary_loss_mlp": 0.01041723, "balance_loss_clip": 1.02613091, "balance_loss_mlp": 1.04907405, "epoch": 0.2581692469562603, "flos": 25192197561600.0, "grad_norm": 1.8301595358584466, "language_loss": 0.75403929, "learning_rate": 3.377827830764788e-06, "loss": 0.77607191, "num_input_tokens_seen": 92776100, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.859375, "step": 4294, "time_per_iteration": 4.153322219848633 }, { "auxiliary_loss_clip": 0.01165177, "auxiliary_loss_mlp": 0.01044125, "balance_loss_clip": 1.02668428, "balance_loss_mlp": 1.04882598, "epoch": 0.2582293702089283, "flos": 34932212271360.0, "grad_norm": 2.82143170243467, "language_loss": 0.80581355, "learning_rate": 3.3775539442482695e-06, "loss": 0.82790649, "num_input_tokens_seen": 92798880, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.890625, "step": 4295, "time_per_iteration": 2.6534323692321777 }, { "auxiliary_loss_clip": 0.01215564, "auxiliary_loss_mlp": 0.0104566, "balance_loss_clip": 1.02779031, "balance_loss_mlp": 1.05100226, "epoch": 0.2582894934615963, "flos": 26833746038400.0, "grad_norm": 2.268048762315739, "language_loss": 0.72490728, "learning_rate": 3.377280008570394e-06, "loss": 0.74751949, "num_input_tokens_seen": 92817750, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9296875, "step": 4296, "time_per_iteration": 2.7301154136657715 }, { "auxiliary_loss_clip": 0.01186918, "auxiliary_loss_mlp": 0.01041821, "balance_loss_clip": 1.02489352, "balance_loss_mlp": 1.05169439, "epoch": 0.25834961671426426, "flos": 23515241253120.0, "grad_norm": 1.6013973740467746, "language_loss": 0.86905944, "learning_rate": 3.3770060237409382e-06, "loss": 0.89134687, "num_input_tokens_seen": 92837995, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90234375, "step": 4297, "time_per_iteration": 2.624788761138916 }, { "auxiliary_loss_clip": 0.01139344, "auxiliary_loss_mlp": 0.01049367, "balance_loss_clip": 1.03345227, "balance_loss_mlp": 1.04957104, "epoch": 0.25840973996693223, "flos": 22451028076800.0, "grad_norm": 1.8534748675917068, "language_loss": 0.84730333, "learning_rate": 3.3767319897696795e-06, "loss": 0.86919039, "num_input_tokens_seen": 92857245, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8984375, "step": 4298, "time_per_iteration": 3.926729679107666 }, { "auxiliary_loss_clip": 0.01148141, "auxiliary_loss_mlp": 0.01281235, "balance_loss_clip": 1.01671982, "balance_loss_mlp": 1.04936337, "epoch": 0.2584698632196002, "flos": 11290854366720.0, "grad_norm": 2.0994164267142694, "language_loss": 0.83333737, "learning_rate": 3.376457906666397e-06, "loss": 0.85763115, "num_input_tokens_seen": 92873265, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8984375, "step": 4299, "time_per_iteration": 4.176985502243042 }, { "auxiliary_loss_clip": 0.01155442, "auxiliary_loss_mlp": 0.0103402, "balance_loss_clip": 1.01821315, "balance_loss_mlp": 1.05089474, "epoch": 0.25852998647226816, "flos": 17929982839680.0, "grad_norm": 2.6762674598181206, "language_loss": 0.82857728, "learning_rate": 3.3761837744408728e-06, "loss": 0.85047185, "num_input_tokens_seen": 92890880, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 4300, "time_per_iteration": 2.6150386333465576 }, { "auxiliary_loss_clip": 0.01150816, "auxiliary_loss_mlp": 0.01044394, "balance_loss_clip": 1.02743077, "balance_loss_mlp": 1.05011153, "epoch": 0.2585901097249361, "flos": 33256117889280.0, "grad_norm": 2.1999205175253285, "language_loss": 0.67124236, "learning_rate": 3.375909593102889e-06, "loss": 0.69319445, "num_input_tokens_seen": 92910770, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.91796875, "step": 4301, "time_per_iteration": 4.314717769622803 }, { "auxiliary_loss_clip": 0.01152519, "auxiliary_loss_mlp": 0.01039519, "balance_loss_clip": 1.02250803, "balance_loss_mlp": 1.05006206, "epoch": 0.2586502329776041, "flos": 18441278985600.0, "grad_norm": 3.2540171209717594, "language_loss": 0.81466639, "learning_rate": 3.3756353626622325e-06, "loss": 0.83658671, "num_input_tokens_seen": 92929520, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9375, "step": 4302, "time_per_iteration": 2.5700645446777344 }, { "auxiliary_loss_clip": 0.01176789, "auxiliary_loss_mlp": 0.01045047, "balance_loss_clip": 1.02878642, "balance_loss_mlp": 1.04968905, "epoch": 0.25871035623027205, "flos": 17968120104960.0, "grad_norm": 1.850106264282461, "language_loss": 0.91897589, "learning_rate": 3.375361083128687e-06, "loss": 0.9411943, "num_input_tokens_seen": 92947890, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.9140625, "step": 4303, "time_per_iteration": 2.583359718322754 }, { "auxiliary_loss_clip": 0.01148231, "auxiliary_loss_mlp": 0.01036672, "balance_loss_clip": 1.02086484, "balance_loss_mlp": 1.05099332, "epoch": 0.25877047948294, "flos": 27777729415680.0, "grad_norm": 1.9527818595398139, "language_loss": 0.67118454, "learning_rate": 3.3750867545120434e-06, "loss": 0.69303358, "num_input_tokens_seen": 92967690, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8828125, "step": 4304, "time_per_iteration": 2.7336580753326416 }, { "auxiliary_loss_clip": 0.01167888, "auxiliary_loss_mlp": 0.01050943, "balance_loss_clip": 1.03405142, "balance_loss_mlp": 1.04939151, "epoch": 0.258830602735608, "flos": 27125843437440.0, "grad_norm": 2.300267885578565, "language_loss": 0.72851717, "learning_rate": 3.3748123768220902e-06, "loss": 0.75070548, "num_input_tokens_seen": 92986830, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.91796875, "step": 4305, "time_per_iteration": 2.6524007320404053 }, { "auxiliary_loss_clip": 0.01177377, "auxiliary_loss_mlp": 0.01292207, "balance_loss_clip": 1.02706122, "balance_loss_mlp": 1.04841948, "epoch": 0.25889072598827595, "flos": 17891486438400.0, "grad_norm": 2.1318131397751685, "language_loss": 0.75552285, "learning_rate": 3.3745379500686197e-06, "loss": 0.78021866, "num_input_tokens_seen": 93002740, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9296875, "step": 4306, "time_per_iteration": 2.631838083267212 }, { "auxiliary_loss_clip": 0.01067827, "auxiliary_loss_mlp": 0.01001839, "balance_loss_clip": 1.00007463, "balance_loss_mlp": 1.02149022, "epoch": 0.2589508492409439, "flos": 53934955724160.0, "grad_norm": 0.8234252238488821, "language_loss": 0.57078975, "learning_rate": 3.3742634742614256e-06, "loss": 0.59148645, "num_input_tokens_seen": 93058645, "router_z_loss_clip": 0.0177002, "router_z_loss_mlp": 0.28710938, "step": 4307, "time_per_iteration": 3.1245031356811523 }, { "auxiliary_loss_clip": 0.01155798, "auxiliary_loss_mlp": 0.01040578, "balance_loss_clip": 1.02434206, "balance_loss_mlp": 1.04990363, "epoch": 0.2590109724936119, "flos": 22125785402880.0, "grad_norm": 1.7496486887242897, "language_loss": 0.71857846, "learning_rate": 3.373988949410303e-06, "loss": 0.74054229, "num_input_tokens_seen": 93077140, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87890625, "step": 4308, "time_per_iteration": 2.5760257244110107 }, { "auxiliary_loss_clip": 0.01141939, "auxiliary_loss_mlp": 0.01044659, "balance_loss_clip": 1.02783823, "balance_loss_mlp": 1.04949141, "epoch": 0.2590710957462799, "flos": 13474294398720.0, "grad_norm": 3.1071958211827444, "language_loss": 0.83996272, "learning_rate": 3.3737143755250488e-06, "loss": 0.86182874, "num_input_tokens_seen": 93093580, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.921875, "step": 4309, "time_per_iteration": 2.5724740028381348 }, { "auxiliary_loss_clip": 0.01149749, "auxiliary_loss_mlp": 0.01045883, "balance_loss_clip": 1.03077912, "balance_loss_mlp": 1.05175662, "epoch": 0.25913121899894787, "flos": 22307098279680.0, "grad_norm": 1.6335957226035343, "language_loss": 0.84772027, "learning_rate": 3.3734397526154626e-06, "loss": 0.86967653, "num_input_tokens_seen": 93112345, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.890625, "step": 4310, "time_per_iteration": 2.5542006492614746 }, { "auxiliary_loss_clip": 0.01151423, "auxiliary_loss_mlp": 0.01049959, "balance_loss_clip": 1.03371048, "balance_loss_mlp": 1.05060434, "epoch": 0.25919134225161583, "flos": 25811728364160.0, "grad_norm": 1.9093198685979738, "language_loss": 0.76858169, "learning_rate": 3.373165080691344e-06, "loss": 0.79059547, "num_input_tokens_seen": 93131545, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.91796875, "step": 4311, "time_per_iteration": 2.671787738800049 }, { "auxiliary_loss_clip": 0.01158339, "auxiliary_loss_mlp": 0.01044887, "balance_loss_clip": 1.02835321, "balance_loss_mlp": 1.04893792, "epoch": 0.2592514655042838, "flos": 31212262108800.0, "grad_norm": 2.1075337531009173, "language_loss": 0.72215104, "learning_rate": 3.3728903597624967e-06, "loss": 0.7441833, "num_input_tokens_seen": 93150730, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9140625, "step": 4312, "time_per_iteration": 2.6740682125091553 }, { "auxiliary_loss_clip": 0.01138565, "auxiliary_loss_mlp": 0.01046785, "balance_loss_clip": 1.03051281, "balance_loss_mlp": 1.04855108, "epoch": 0.25931158875695176, "flos": 18474998878080.0, "grad_norm": 1.9688657531852938, "language_loss": 0.69891369, "learning_rate": 3.372615589838724e-06, "loss": 0.7207672, "num_input_tokens_seen": 93167895, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.90234375, "step": 4313, "time_per_iteration": 2.586653232574463 }, { "auxiliary_loss_clip": 0.01138915, "auxiliary_loss_mlp": 0.01054454, "balance_loss_clip": 1.03892124, "balance_loss_mlp": 1.04982996, "epoch": 0.2593717120096197, "flos": 19207935895680.0, "grad_norm": 1.6950064683974437, "language_loss": 0.80422235, "learning_rate": 3.3723407709298314e-06, "loss": 0.82615602, "num_input_tokens_seen": 93187650, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.890625, "step": 4314, "time_per_iteration": 2.5367722511291504 }, { "auxiliary_loss_clip": 0.01169544, "auxiliary_loss_mlp": 0.01053308, "balance_loss_clip": 1.03572488, "balance_loss_mlp": 1.04959297, "epoch": 0.2594318352622877, "flos": 31248100903680.0, "grad_norm": 3.043049575688694, "language_loss": 0.66945493, "learning_rate": 3.3720659030456262e-06, "loss": 0.69168347, "num_input_tokens_seen": 93207370, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.93359375, "step": 4315, "time_per_iteration": 2.755803108215332 }, { "auxiliary_loss_clip": 0.01173655, "auxiliary_loss_mlp": 0.01051011, "balance_loss_clip": 1.03560865, "balance_loss_mlp": 1.04801321, "epoch": 0.25949195851495566, "flos": 22237144542720.0, "grad_norm": 1.611523495657306, "language_loss": 0.79090881, "learning_rate": 3.371790986195919e-06, "loss": 0.81315541, "num_input_tokens_seen": 93227925, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.890625, "step": 4316, "time_per_iteration": 2.646636486053467 }, { "auxiliary_loss_clip": 0.01176649, "auxiliary_loss_mlp": 0.01043026, "balance_loss_clip": 1.02630091, "balance_loss_mlp": 1.04798365, "epoch": 0.2595520817676236, "flos": 28075716645120.0, "grad_norm": 1.8671664986493512, "language_loss": 0.77566606, "learning_rate": 3.37151602039052e-06, "loss": 0.79786277, "num_input_tokens_seen": 93250020, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.921875, "step": 4317, "time_per_iteration": 2.699522018432617 }, { "auxiliary_loss_clip": 0.01141261, "auxiliary_loss_mlp": 0.0105551, "balance_loss_clip": 1.03812957, "balance_loss_mlp": 1.05080676, "epoch": 0.2596122050202916, "flos": 20190954378240.0, "grad_norm": 2.0324505958219103, "language_loss": 0.77631688, "learning_rate": 3.3712410056392418e-06, "loss": 0.79828459, "num_input_tokens_seen": 93269070, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.90625, "step": 4318, "time_per_iteration": 2.754564046859741 }, { "auxiliary_loss_clip": 0.01138129, "auxiliary_loss_mlp": 0.01043138, "balance_loss_clip": 1.02573299, "balance_loss_mlp": 1.04870462, "epoch": 0.25967232827295955, "flos": 22527949052160.0, "grad_norm": 1.8350615569297382, "language_loss": 0.76289856, "learning_rate": 3.3709659419518994e-06, "loss": 0.78471124, "num_input_tokens_seen": 93290250, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.89453125, "step": 4319, "time_per_iteration": 2.5610101222991943 }, { "auxiliary_loss_clip": 0.01154011, "auxiliary_loss_mlp": 0.01040766, "balance_loss_clip": 1.02520907, "balance_loss_mlp": 1.04882908, "epoch": 0.2597324515256275, "flos": 21068252156160.0, "grad_norm": 1.6991530562558452, "language_loss": 0.7639758, "learning_rate": 3.3706908293383095e-06, "loss": 0.78592354, "num_input_tokens_seen": 93310090, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.875, "step": 4320, "time_per_iteration": 2.5488295555114746 }, { "auxiliary_loss_clip": 0.01165458, "auxiliary_loss_mlp": 0.01039227, "balance_loss_clip": 1.02308655, "balance_loss_mlp": 1.04844856, "epoch": 0.2597925747782955, "flos": 22050013662720.0, "grad_norm": 1.5261282363489659, "language_loss": 0.70878005, "learning_rate": 3.37041566780829e-06, "loss": 0.73082691, "num_input_tokens_seen": 93329570, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8984375, "step": 4321, "time_per_iteration": 2.586972713470459 }, { "auxiliary_loss_clip": 0.01160824, "auxiliary_loss_mlp": 0.01044299, "balance_loss_clip": 1.02735972, "balance_loss_mlp": 1.04834199, "epoch": 0.2598526980309635, "flos": 19536949497600.0, "grad_norm": 2.2172554191562037, "language_loss": 0.7461015, "learning_rate": 3.3701404573716597e-06, "loss": 0.76815271, "num_input_tokens_seen": 93347920, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9453125, "step": 4322, "time_per_iteration": 2.5163049697875977 }, { "auxiliary_loss_clip": 0.01165275, "auxiliary_loss_mlp": 0.01050629, "balance_loss_clip": 1.03331947, "balance_loss_mlp": 1.04636431, "epoch": 0.25991282128363147, "flos": 24495207079680.0, "grad_norm": 5.762413956726807, "language_loss": 0.73884642, "learning_rate": 3.3698651980382417e-06, "loss": 0.76100552, "num_input_tokens_seen": 93367145, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.921875, "step": 4323, "time_per_iteration": 2.5544633865356445 }, { "auxiliary_loss_clip": 0.0115415, "auxiliary_loss_mlp": 0.012926, "balance_loss_clip": 1.02715516, "balance_loss_mlp": 1.04858398, "epoch": 0.25997294453629943, "flos": 24201457655040.0, "grad_norm": 2.1693864458859133, "language_loss": 0.67443126, "learning_rate": 3.3695898898178573e-06, "loss": 0.69889879, "num_input_tokens_seen": 93386555, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.96875, "step": 4324, "time_per_iteration": 2.526219129562378 }, { "auxiliary_loss_clip": 0.01173658, "auxiliary_loss_mlp": 0.01044426, "balance_loss_clip": 1.02897668, "balance_loss_mlp": 1.04897928, "epoch": 0.2600330677889674, "flos": 31431460855680.0, "grad_norm": 2.005764575017957, "language_loss": 0.70354873, "learning_rate": 3.3693145327203336e-06, "loss": 0.72572958, "num_input_tokens_seen": 93405590, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.890625, "step": 4325, "time_per_iteration": 2.622022867202759 }, { "auxiliary_loss_clip": 0.01147324, "auxiliary_loss_mlp": 0.01036232, "balance_loss_clip": 1.01953065, "balance_loss_mlp": 1.04731798, "epoch": 0.26009319104163536, "flos": 32266527217920.0, "grad_norm": 2.04590330412878, "language_loss": 0.72715163, "learning_rate": 3.3690391267554963e-06, "loss": 0.74898714, "num_input_tokens_seen": 93424750, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.91015625, "step": 4326, "time_per_iteration": 2.6250128746032715 }, { "auxiliary_loss_clip": 0.01152191, "auxiliary_loss_mlp": 0.01287604, "balance_loss_clip": 1.02504885, "balance_loss_mlp": 1.04792738, "epoch": 0.26015331429430333, "flos": 26286754752000.0, "grad_norm": 1.8284703535491114, "language_loss": 0.86008549, "learning_rate": 3.3687636719331744e-06, "loss": 0.88448346, "num_input_tokens_seen": 93443465, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.859375, "step": 4327, "time_per_iteration": 2.5840003490448 }, { "auxiliary_loss_clip": 0.01144116, "auxiliary_loss_mlp": 0.01302188, "balance_loss_clip": 1.03565812, "balance_loss_mlp": 1.05107343, "epoch": 0.2602134375469713, "flos": 21142335957120.0, "grad_norm": 1.912408931953052, "language_loss": 0.80186749, "learning_rate": 3.368488168263198e-06, "loss": 0.82633054, "num_input_tokens_seen": 93462580, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.9296875, "step": 4328, "time_per_iteration": 2.5603742599487305 }, { "auxiliary_loss_clip": 0.01156966, "auxiliary_loss_mlp": 0.01292181, "balance_loss_clip": 1.0283916, "balance_loss_mlp": 1.05015111, "epoch": 0.26027356079963926, "flos": 25921327737600.0, "grad_norm": 3.2401018757097484, "language_loss": 0.87683254, "learning_rate": 3.3682126157553983e-06, "loss": 0.90132403, "num_input_tokens_seen": 93482790, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.890625, "step": 4329, "time_per_iteration": 2.587470531463623 }, { "auxiliary_loss_clip": 0.01136768, "auxiliary_loss_mlp": 0.01042064, "balance_loss_clip": 1.02601862, "balance_loss_mlp": 1.04876542, "epoch": 0.2603336840523072, "flos": 26359222440960.0, "grad_norm": 2.220223916351674, "language_loss": 0.7744329, "learning_rate": 3.3679370144196106e-06, "loss": 0.79622126, "num_input_tokens_seen": 93498795, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8828125, "step": 4330, "time_per_iteration": 2.6668035984039307 }, { "auxiliary_loss_clip": 0.01141501, "auxiliary_loss_mlp": 0.01054429, "balance_loss_clip": 1.03714347, "balance_loss_mlp": 1.05004227, "epoch": 0.2603938073049752, "flos": 23513661054720.0, "grad_norm": 4.152014097822723, "language_loss": 0.7529158, "learning_rate": 3.367661364265669e-06, "loss": 0.77487516, "num_input_tokens_seen": 93518335, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9140625, "step": 4331, "time_per_iteration": 2.491054058074951 }, { "auxiliary_loss_clip": 0.0116116, "auxiliary_loss_mlp": 0.01043019, "balance_loss_clip": 1.02691364, "balance_loss_mlp": 1.05368578, "epoch": 0.26045393055764315, "flos": 25374300537600.0, "grad_norm": 1.3712847336544576, "language_loss": 0.68958688, "learning_rate": 3.367385665303412e-06, "loss": 0.71162862, "num_input_tokens_seen": 93539170, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.890625, "step": 4332, "time_per_iteration": 2.5626320838928223 }, { "auxiliary_loss_clip": 0.0116911, "auxiliary_loss_mlp": 0.01049203, "balance_loss_clip": 1.03202474, "balance_loss_mlp": 1.04916024, "epoch": 0.2605140538103111, "flos": 27635272076160.0, "grad_norm": 1.963744348739088, "language_loss": 0.79181939, "learning_rate": 3.3671099175426773e-06, "loss": 0.81400245, "num_input_tokens_seen": 93558480, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9296875, "step": 4333, "time_per_iteration": 2.6411094665527344 }, { "auxiliary_loss_clip": 0.01136648, "auxiliary_loss_mlp": 0.01041059, "balance_loss_clip": 1.02535939, "balance_loss_mlp": 1.04996455, "epoch": 0.2605741770629791, "flos": 13769839503360.0, "grad_norm": 1.7238732413255726, "language_loss": 0.80609596, "learning_rate": 3.366834120993307e-06, "loss": 0.82787305, "num_input_tokens_seen": 93575220, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8671875, "step": 4334, "time_per_iteration": 2.514815330505371 }, { "auxiliary_loss_clip": 0.01157114, "auxiliary_loss_mlp": 0.01040982, "balance_loss_clip": 1.02360082, "balance_loss_mlp": 1.04926682, "epoch": 0.26063430031564705, "flos": 26031681296640.0, "grad_norm": 1.7822240536417457, "language_loss": 0.7938323, "learning_rate": 3.3665582756651424e-06, "loss": 0.81581318, "num_input_tokens_seen": 93597015, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.90234375, "step": 4335, "time_per_iteration": 2.612138509750366 }, { "auxiliary_loss_clip": 0.01067207, "auxiliary_loss_mlp": 0.01001007, "balance_loss_clip": 0.99917144, "balance_loss_mlp": 1.02893329, "epoch": 0.26069442356831507, "flos": 62443809820800.0, "grad_norm": 0.8635649321304302, "language_loss": 0.60809362, "learning_rate": 3.366282381568028e-06, "loss": 0.62877578, "num_input_tokens_seen": 93657775, "router_z_loss_clip": 0.01831055, "router_z_loss_mlp": 0.29101562, "step": 4336, "time_per_iteration": 4.510652303695679 }, { "auxiliary_loss_clip": 0.01167265, "auxiliary_loss_mlp": 0.01050417, "balance_loss_clip": 1.03412127, "balance_loss_mlp": 1.04952669, "epoch": 0.26075454682098304, "flos": 13626376583040.0, "grad_norm": 2.35526828858328, "language_loss": 0.7712968, "learning_rate": 3.3660064387118104e-06, "loss": 0.79347366, "num_input_tokens_seen": 93676145, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.91015625, "step": 4337, "time_per_iteration": 2.705306053161621 }, { "auxiliary_loss_clip": 0.01159579, "auxiliary_loss_mlp": 0.01045722, "balance_loss_clip": 1.02806759, "balance_loss_mlp": 1.05105102, "epoch": 0.260814670073651, "flos": 12126531260160.0, "grad_norm": 2.429182371850274, "language_loss": 0.74163747, "learning_rate": 3.3657304471063363e-06, "loss": 0.76369053, "num_input_tokens_seen": 93692480, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.90625, "step": 4338, "time_per_iteration": 2.6442112922668457 }, { "auxiliary_loss_clip": 0.01153305, "auxiliary_loss_mlp": 0.01042169, "balance_loss_clip": 1.02502668, "balance_loss_mlp": 1.05039239, "epoch": 0.26087479332631897, "flos": 15122522805120.0, "grad_norm": 2.4325559354399373, "language_loss": 0.80366206, "learning_rate": 3.3654544067614557e-06, "loss": 0.82561678, "num_input_tokens_seen": 93710165, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9375, "step": 4339, "time_per_iteration": 2.529849052429199 }, { "auxiliary_loss_clip": 0.0117452, "auxiliary_loss_mlp": 0.01041646, "balance_loss_clip": 1.0259105, "balance_loss_mlp": 1.05028915, "epoch": 0.26093491657898693, "flos": 24680937329280.0, "grad_norm": 1.5371248133602338, "language_loss": 0.76140296, "learning_rate": 3.36517831768702e-06, "loss": 0.78356463, "num_input_tokens_seen": 93730185, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8828125, "step": 4340, "time_per_iteration": 4.034888744354248 }, { "auxiliary_loss_clip": 0.01187939, "auxiliary_loss_mlp": 0.01044755, "balance_loss_clip": 1.02835155, "balance_loss_mlp": 1.05088973, "epoch": 0.2609950398316549, "flos": 25116138512640.0, "grad_norm": 1.6135273320005834, "language_loss": 0.82395172, "learning_rate": 3.3649021798928813e-06, "loss": 0.84627867, "num_input_tokens_seen": 93747690, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.91796875, "step": 4341, "time_per_iteration": 4.103615045547485 }, { "auxiliary_loss_clip": 0.01140243, "auxiliary_loss_mlp": 0.0104075, "balance_loss_clip": 1.02428746, "balance_loss_mlp": 1.05098259, "epoch": 0.26105516308432286, "flos": 28548588216960.0, "grad_norm": 1.7709323548834144, "language_loss": 0.76512074, "learning_rate": 3.364625993388895e-06, "loss": 0.78693068, "num_input_tokens_seen": 93767405, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.89453125, "step": 4342, "time_per_iteration": 2.653576374053955 }, { "auxiliary_loss_clip": 0.01147533, "auxiliary_loss_mlp": 0.01035438, "balance_loss_clip": 1.01818871, "balance_loss_mlp": 1.04882276, "epoch": 0.2611152863369908, "flos": 39530609447040.0, "grad_norm": 1.8346200781411404, "language_loss": 0.65818596, "learning_rate": 3.364349758184917e-06, "loss": 0.68001568, "num_input_tokens_seen": 93789950, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.89453125, "step": 4343, "time_per_iteration": 4.2568583488464355 }, { "auxiliary_loss_clip": 0.01187508, "auxiliary_loss_mlp": 0.01041328, "balance_loss_clip": 1.0244596, "balance_loss_mlp": 1.04965687, "epoch": 0.2611754095896588, "flos": 13735329511680.0, "grad_norm": 1.8994310799236016, "language_loss": 0.7344932, "learning_rate": 3.3640734742908066e-06, "loss": 0.75678158, "num_input_tokens_seen": 93807835, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.92578125, "step": 4344, "time_per_iteration": 2.5705103874206543 }, { "auxiliary_loss_clip": 0.01140605, "auxiliary_loss_mlp": 0.01039535, "balance_loss_clip": 1.02294087, "balance_loss_mlp": 1.0505898, "epoch": 0.26123553284232676, "flos": 21506649649920.0, "grad_norm": 3.417724324428439, "language_loss": 0.86694896, "learning_rate": 3.3637971417164213e-06, "loss": 0.88875037, "num_input_tokens_seen": 93825670, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.90234375, "step": 4345, "time_per_iteration": 2.6258978843688965 }, { "auxiliary_loss_clip": 0.01153296, "auxiliary_loss_mlp": 0.01043936, "balance_loss_clip": 1.02752042, "balance_loss_mlp": 1.04924226, "epoch": 0.2612956560949947, "flos": 21139786091520.0, "grad_norm": 2.001664112386005, "language_loss": 0.7639302, "learning_rate": 3.3635207604716254e-06, "loss": 0.7859025, "num_input_tokens_seen": 93844045, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.86328125, "step": 4346, "time_per_iteration": 2.5301969051361084 }, { "auxiliary_loss_clip": 0.01147581, "auxiliary_loss_mlp": 0.01043862, "balance_loss_clip": 1.02712476, "balance_loss_mlp": 1.04899895, "epoch": 0.2613557793476627, "flos": 25119011600640.0, "grad_norm": 1.4646589590248351, "language_loss": 0.75746131, "learning_rate": 3.36324433056628e-06, "loss": 0.77937579, "num_input_tokens_seen": 93864380, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.89453125, "step": 4347, "time_per_iteration": 2.666163682937622 }, { "auxiliary_loss_clip": 0.01167547, "auxiliary_loss_mlp": 0.01036866, "balance_loss_clip": 1.0200336, "balance_loss_mlp": 1.05128217, "epoch": 0.26141590260033065, "flos": 26067699659520.0, "grad_norm": 1.3097965945441188, "language_loss": 0.73444605, "learning_rate": 3.3629678520102517e-06, "loss": 0.75649017, "num_input_tokens_seen": 93885475, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.890625, "step": 4348, "time_per_iteration": 2.63824725151062 }, { "auxiliary_loss_clip": 0.01168209, "auxiliary_loss_mlp": 0.01050207, "balance_loss_clip": 1.03356552, "balance_loss_mlp": 1.05068493, "epoch": 0.2614760258529987, "flos": 25701518459520.0, "grad_norm": 2.2885748570482525, "language_loss": 0.90885925, "learning_rate": 3.3626913248134065e-06, "loss": 0.93104339, "num_input_tokens_seen": 93905545, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.90625, "step": 4349, "time_per_iteration": 2.652700424194336 }, { "auxiliary_loss_clip": 0.01145727, "auxiliary_loss_mlp": 0.01040248, "balance_loss_clip": 1.02427411, "balance_loss_mlp": 1.0500288, "epoch": 0.26153614910566664, "flos": 17457147181440.0, "grad_norm": 1.8744951365105489, "language_loss": 0.79995078, "learning_rate": 3.3624147489856134e-06, "loss": 0.82181054, "num_input_tokens_seen": 93924185, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.86328125, "step": 4350, "time_per_iteration": 2.5320968627929688 }, { "auxiliary_loss_clip": 0.01142764, "auxiliary_loss_mlp": 0.01043372, "balance_loss_clip": 1.02715921, "balance_loss_mlp": 1.0478456, "epoch": 0.2615962723583346, "flos": 17712831168000.0, "grad_norm": 2.1460733944606485, "language_loss": 0.6245141, "learning_rate": 3.3621381245367425e-06, "loss": 0.64637554, "num_input_tokens_seen": 93942825, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 4351, "time_per_iteration": 2.590001344680786 }, { "auxiliary_loss_clip": 0.01147476, "auxiliary_loss_mlp": 0.01041795, "balance_loss_clip": 1.02511764, "balance_loss_mlp": 1.04802132, "epoch": 0.26165639561100257, "flos": 23257725672960.0, "grad_norm": 1.9772604146701744, "language_loss": 0.83538431, "learning_rate": 3.361861451476665e-06, "loss": 0.85727704, "num_input_tokens_seen": 93962045, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.90234375, "step": 4352, "time_per_iteration": 2.5366270542144775 }, { "auxiliary_loss_clip": 0.0106231, "auxiliary_loss_mlp": 0.01004113, "balance_loss_clip": 1.00224102, "balance_loss_mlp": 1.0246222, "epoch": 0.26171651886367053, "flos": 66737970800640.0, "grad_norm": 0.7919923257139171, "language_loss": 0.70516455, "learning_rate": 3.361584729815256e-06, "loss": 0.72582877, "num_input_tokens_seen": 94021175, "router_z_loss_clip": 0.01867676, "router_z_loss_mlp": 0.2890625, "step": 4353, "time_per_iteration": 3.0442867279052734 }, { "auxiliary_loss_clip": 0.01155337, "auxiliary_loss_mlp": 0.0105032, "balance_loss_clip": 1.03241515, "balance_loss_mlp": 1.04958797, "epoch": 0.2617766421163385, "flos": 22349581090560.0, "grad_norm": 1.8331973599434594, "language_loss": 0.77304554, "learning_rate": 3.36130795956239e-06, "loss": 0.79510212, "num_input_tokens_seen": 94043370, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.875, "step": 4354, "time_per_iteration": 2.622450113296509 }, { "auxiliary_loss_clip": 0.01178171, "auxiliary_loss_mlp": 0.01049398, "balance_loss_clip": 1.03285146, "balance_loss_mlp": 1.04879427, "epoch": 0.26183676536900646, "flos": 26067125041920.0, "grad_norm": 2.263894244461629, "language_loss": 0.67492461, "learning_rate": 3.3610311407279456e-06, "loss": 0.6972003, "num_input_tokens_seen": 94063510, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9296875, "step": 4355, "time_per_iteration": 2.614664077758789 }, { "auxiliary_loss_clip": 0.01172324, "auxiliary_loss_mlp": 0.01043352, "balance_loss_clip": 1.02533925, "balance_loss_mlp": 1.04853082, "epoch": 0.26189688862167443, "flos": 20996466825600.0, "grad_norm": 2.0760620742575173, "language_loss": 0.66612267, "learning_rate": 3.3607542733218002e-06, "loss": 0.68827951, "num_input_tokens_seen": 94083865, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.87890625, "step": 4356, "time_per_iteration": 2.6268815994262695 }, { "auxiliary_loss_clip": 0.01059636, "auxiliary_loss_mlp": 0.01003025, "balance_loss_clip": 1.00110626, "balance_loss_mlp": 1.02224731, "epoch": 0.2619570118743424, "flos": 65798261141760.0, "grad_norm": 0.6812086946193507, "language_loss": 0.53148663, "learning_rate": 3.360477357353835e-06, "loss": 0.55211318, "num_input_tokens_seen": 94144095, "router_z_loss_clip": 0.01916504, "router_z_loss_mlp": 0.28710938, "step": 4357, "time_per_iteration": 3.073533058166504 }, { "auxiliary_loss_clip": 0.01151104, "auxiliary_loss_mlp": 0.01041265, "balance_loss_clip": 1.02453995, "balance_loss_mlp": 1.0500561, "epoch": 0.26201713512701036, "flos": 28766817296640.0, "grad_norm": 1.9196754699736993, "language_loss": 0.83911484, "learning_rate": 3.3602003928339325e-06, "loss": 0.86103857, "num_input_tokens_seen": 94163035, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.921875, "step": 4358, "time_per_iteration": 2.612294912338257 }, { "auxiliary_loss_clip": 0.01160181, "auxiliary_loss_mlp": 0.01044115, "balance_loss_clip": 1.02591181, "balance_loss_mlp": 1.04736781, "epoch": 0.2620772583796783, "flos": 26432516142720.0, "grad_norm": 2.8211039953878703, "language_loss": 0.67201793, "learning_rate": 3.359923379771977e-06, "loss": 0.69406092, "num_input_tokens_seen": 94182520, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9453125, "step": 4359, "time_per_iteration": 2.534669876098633 }, { "auxiliary_loss_clip": 0.01153071, "auxiliary_loss_mlp": 0.01040915, "balance_loss_clip": 1.0249294, "balance_loss_mlp": 1.04587507, "epoch": 0.2621373816323463, "flos": 20156552127360.0, "grad_norm": 1.833277416745242, "language_loss": 0.78173554, "learning_rate": 3.359646318177854e-06, "loss": 0.80367541, "num_input_tokens_seen": 94201795, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.890625, "step": 4360, "time_per_iteration": 2.546548366546631 }, { "auxiliary_loss_clip": 0.01150392, "auxiliary_loss_mlp": 0.01040544, "balance_loss_clip": 1.02472472, "balance_loss_mlp": 1.04572606, "epoch": 0.26219750488501425, "flos": 28621235473920.0, "grad_norm": 1.8728290115129675, "language_loss": 0.68239594, "learning_rate": 3.3593692080614515e-06, "loss": 0.70430535, "num_input_tokens_seen": 94222390, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 4361, "time_per_iteration": 2.570056676864624 }, { "auxiliary_loss_clip": 0.01146546, "auxiliary_loss_mlp": 0.01052038, "balance_loss_clip": 1.03512228, "balance_loss_mlp": 1.04656529, "epoch": 0.2622576281376823, "flos": 15042549173760.0, "grad_norm": 2.05194955881376, "language_loss": 0.84426951, "learning_rate": 3.3590920494326585e-06, "loss": 0.8662554, "num_input_tokens_seen": 94239980, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.91015625, "step": 4362, "time_per_iteration": 2.5527260303497314 }, { "auxiliary_loss_clip": 0.01161724, "auxiliary_loss_mlp": 0.010465, "balance_loss_clip": 1.02840376, "balance_loss_mlp": 1.04726148, "epoch": 0.26231775139035024, "flos": 26396174557440.0, "grad_norm": 2.062795973376926, "language_loss": 0.65041339, "learning_rate": 3.3588148423013665e-06, "loss": 0.67249566, "num_input_tokens_seen": 94260715, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.875, "step": 4363, "time_per_iteration": 2.579620838165283 }, { "auxiliary_loss_clip": 0.0106624, "auxiliary_loss_mlp": 0.01006213, "balance_loss_clip": 1.00429428, "balance_loss_mlp": 1.02046859, "epoch": 0.2623778746430182, "flos": 65408918647680.0, "grad_norm": 0.8814517694158275, "language_loss": 0.61199641, "learning_rate": 3.3585375866774683e-06, "loss": 0.63272095, "num_input_tokens_seen": 94321285, "router_z_loss_clip": 0.01916504, "router_z_loss_mlp": 0.28125, "step": 4364, "time_per_iteration": 3.2638230323791504 }, { "auxiliary_loss_clip": 0.01139191, "auxiliary_loss_mlp": 0.0105431, "balance_loss_clip": 1.03603554, "balance_loss_mlp": 1.04827213, "epoch": 0.26243799789568617, "flos": 12604215254400.0, "grad_norm": 2.9120949152679954, "language_loss": 0.72552299, "learning_rate": 3.3582602825708577e-06, "loss": 0.74745804, "num_input_tokens_seen": 94335420, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.91015625, "step": 4365, "time_per_iteration": 2.513120412826538 }, { "auxiliary_loss_clip": 0.01157494, "auxiliary_loss_mlp": 0.01298061, "balance_loss_clip": 1.03455842, "balance_loss_mlp": 1.05075252, "epoch": 0.26249812114835414, "flos": 28623821253120.0, "grad_norm": 6.474749075847076, "language_loss": 0.77074778, "learning_rate": 3.3579829299914314e-06, "loss": 0.79530329, "num_input_tokens_seen": 94357440, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.88671875, "step": 4366, "time_per_iteration": 2.656961679458618 }, { "auxiliary_loss_clip": 0.01138895, "auxiliary_loss_mlp": 0.01043815, "balance_loss_clip": 1.02765071, "balance_loss_mlp": 1.04902375, "epoch": 0.2625582444010221, "flos": 14465393441280.0, "grad_norm": 2.4720103427997318, "language_loss": 0.76074904, "learning_rate": 3.3577055289490875e-06, "loss": 0.78257614, "num_input_tokens_seen": 94375690, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8984375, "step": 4367, "time_per_iteration": 2.506535291671753 }, { "auxiliary_loss_clip": 0.01143513, "auxiliary_loss_mlp": 0.01042618, "balance_loss_clip": 1.0263226, "balance_loss_mlp": 1.0476048, "epoch": 0.26261836765369007, "flos": 16613174246400.0, "grad_norm": 1.9136833177833603, "language_loss": 0.69218183, "learning_rate": 3.357428079453726e-06, "loss": 0.71404314, "num_input_tokens_seen": 94393190, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8671875, "step": 4368, "time_per_iteration": 2.5306148529052734 }, { "auxiliary_loss_clip": 0.01151432, "auxiliary_loss_mlp": 0.01045543, "balance_loss_clip": 1.02853215, "balance_loss_mlp": 1.04583061, "epoch": 0.26267849090635803, "flos": 20519932066560.0, "grad_norm": 1.9165710867911747, "language_loss": 0.78944224, "learning_rate": 3.357150581515248e-06, "loss": 0.81141198, "num_input_tokens_seen": 94410975, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.87890625, "step": 4369, "time_per_iteration": 2.613487482070923 }, { "auxiliary_loss_clip": 0.01153995, "auxiliary_loss_mlp": 0.01041559, "balance_loss_clip": 1.02481055, "balance_loss_mlp": 1.0474813, "epoch": 0.262738614159026, "flos": 21323936142720.0, "grad_norm": 2.102008727345222, "language_loss": 0.83248645, "learning_rate": 3.3568730351435565e-06, "loss": 0.854442, "num_input_tokens_seen": 94429985, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8828125, "step": 4370, "time_per_iteration": 2.5975663661956787 }, { "auxiliary_loss_clip": 0.01148007, "auxiliary_loss_mlp": 0.01055162, "balance_loss_clip": 1.03632689, "balance_loss_mlp": 1.04754019, "epoch": 0.26279873741169396, "flos": 17603590930560.0, "grad_norm": 1.645891277096422, "language_loss": 0.71084017, "learning_rate": 3.356595440348557e-06, "loss": 0.73287189, "num_input_tokens_seen": 94448660, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.9140625, "step": 4371, "time_per_iteration": 2.596963405609131 }, { "auxiliary_loss_clip": 0.01047567, "auxiliary_loss_mlp": 0.01002948, "balance_loss_clip": 1.00109982, "balance_loss_mlp": 1.0195024, "epoch": 0.2628588606643619, "flos": 60949746587520.0, "grad_norm": 0.6851332202582705, "language_loss": 0.56537491, "learning_rate": 3.356317797140156e-06, "loss": 0.5858801, "num_input_tokens_seen": 94515630, "router_z_loss_clip": 0.01843262, "router_z_loss_mlp": 0.28125, "step": 4372, "time_per_iteration": 3.2570719718933105 }, { "auxiliary_loss_clip": 0.01151211, "auxiliary_loss_mlp": 0.01039459, "balance_loss_clip": 1.02361596, "balance_loss_mlp": 1.04691911, "epoch": 0.2629189839170299, "flos": 27016315891200.0, "grad_norm": 1.7557742883386307, "language_loss": 0.77990997, "learning_rate": 3.3560401055282617e-06, "loss": 0.80181664, "num_input_tokens_seen": 94535385, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.86328125, "step": 4373, "time_per_iteration": 2.6758627891540527 }, { "auxiliary_loss_clip": 0.01160643, "auxiliary_loss_mlp": 0.01042629, "balance_loss_clip": 1.02655411, "balance_loss_mlp": 1.04673147, "epoch": 0.26297910716969786, "flos": 17019863009280.0, "grad_norm": 2.51138854987291, "language_loss": 0.70589918, "learning_rate": 3.3557623655227835e-06, "loss": 0.72793192, "num_input_tokens_seen": 94552650, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.87109375, "step": 4374, "time_per_iteration": 2.5564699172973633 }, { "auxiliary_loss_clip": 0.01154039, "auxiliary_loss_mlp": 0.01044821, "balance_loss_clip": 1.02713001, "balance_loss_mlp": 1.04827046, "epoch": 0.2630392304223659, "flos": 24897370728960.0, "grad_norm": 2.1487845700404415, "language_loss": 0.80921048, "learning_rate": 3.355484577133634e-06, "loss": 0.83119905, "num_input_tokens_seen": 94574075, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.875, "step": 4375, "time_per_iteration": 2.673685073852539 }, { "auxiliary_loss_clip": 0.01151397, "auxiliary_loss_mlp": 0.01037555, "balance_loss_clip": 1.02233195, "balance_loss_mlp": 1.04694152, "epoch": 0.26309935367503384, "flos": 32854026067200.0, "grad_norm": 2.2946576106758307, "language_loss": 0.65551251, "learning_rate": 3.3552067403707272e-06, "loss": 0.67740202, "num_input_tokens_seen": 94594255, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8671875, "step": 4376, "time_per_iteration": 2.653733253479004 }, { "auxiliary_loss_clip": 0.011725, "auxiliary_loss_mlp": 0.01047931, "balance_loss_clip": 1.03145599, "balance_loss_mlp": 1.04749632, "epoch": 0.2631594769277018, "flos": 15887958652800.0, "grad_norm": 2.095499838490014, "language_loss": 0.69443297, "learning_rate": 3.3549288552439777e-06, "loss": 0.71663725, "num_input_tokens_seen": 94611410, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.890625, "step": 4377, "time_per_iteration": 4.0172741413116455 }, { "auxiliary_loss_clip": 0.01146385, "auxiliary_loss_mlp": 0.01042221, "balance_loss_clip": 1.02573431, "balance_loss_mlp": 1.04872012, "epoch": 0.2632196001803698, "flos": 50804943557760.0, "grad_norm": 1.6015405802150744, "language_loss": 0.78871274, "learning_rate": 3.3546509217633025e-06, "loss": 0.81059873, "num_input_tokens_seen": 94636575, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.88671875, "step": 4378, "time_per_iteration": 2.8535873889923096 }, { "auxiliary_loss_clip": 0.01153758, "auxiliary_loss_mlp": 0.01044417, "balance_loss_clip": 1.0295155, "balance_loss_mlp": 1.04883111, "epoch": 0.26327972343303774, "flos": 13733031041280.0, "grad_norm": 2.03291803968833, "language_loss": 0.76692605, "learning_rate": 3.3543729399386207e-06, "loss": 0.78890777, "num_input_tokens_seen": 94654345, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.87109375, "step": 4379, "time_per_iteration": 2.5163285732269287 }, { "auxiliary_loss_clip": 0.01168515, "auxiliary_loss_mlp": 0.01042328, "balance_loss_clip": 1.02386236, "balance_loss_mlp": 1.05059552, "epoch": 0.2633398466857057, "flos": 23769057732480.0, "grad_norm": 2.632375597009145, "language_loss": 0.77551121, "learning_rate": 3.354094909779852e-06, "loss": 0.79761964, "num_input_tokens_seen": 94673985, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9140625, "step": 4380, "time_per_iteration": 2.6195766925811768 }, { "auxiliary_loss_clip": 0.01156501, "auxiliary_loss_mlp": 0.01039475, "balance_loss_clip": 1.02286887, "balance_loss_mlp": 1.04732275, "epoch": 0.26339996993837367, "flos": 27600223380480.0, "grad_norm": 1.9390225753083636, "language_loss": 0.63654476, "learning_rate": 3.353816831296919e-06, "loss": 0.65850449, "num_input_tokens_seen": 94693145, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.91015625, "step": 4381, "time_per_iteration": 4.030869960784912 }, { "auxiliary_loss_clip": 0.01146354, "auxiliary_loss_mlp": 0.01042341, "balance_loss_clip": 1.0260216, "balance_loss_mlp": 1.04802465, "epoch": 0.26346009319104163, "flos": 16946317912320.0, "grad_norm": 2.164367462306115, "language_loss": 0.82494879, "learning_rate": 3.353538704499747e-06, "loss": 0.84683573, "num_input_tokens_seen": 94710185, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.890625, "step": 4382, "time_per_iteration": 3.998839855194092 }, { "auxiliary_loss_clip": 0.01169322, "auxiliary_loss_mlp": 0.01050085, "balance_loss_clip": 1.03250146, "balance_loss_mlp": 1.04931569, "epoch": 0.2635202164437096, "flos": 37232218915200.0, "grad_norm": 2.2309078703706584, "language_loss": 0.69668949, "learning_rate": 3.3532605293982592e-06, "loss": 0.71888357, "num_input_tokens_seen": 94730280, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9296875, "step": 4383, "time_per_iteration": 2.734455108642578 }, { "auxiliary_loss_clip": 0.01146874, "auxiliary_loss_mlp": 0.01039576, "balance_loss_clip": 1.02415061, "balance_loss_mlp": 1.04918897, "epoch": 0.26358033969637756, "flos": 20996359084800.0, "grad_norm": 1.7371446922040783, "language_loss": 0.68849635, "learning_rate": 3.3529823060023847e-06, "loss": 0.71036088, "num_input_tokens_seen": 94748560, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.88671875, "step": 4384, "time_per_iteration": 2.583129644393921 }, { "auxiliary_loss_clip": 0.01143811, "auxiliary_loss_mlp": 0.01036693, "balance_loss_clip": 1.02069521, "balance_loss_mlp": 1.04805076, "epoch": 0.26364046294904553, "flos": 27746092512000.0, "grad_norm": 2.1339907983066486, "language_loss": 0.69757056, "learning_rate": 3.352704034322052e-06, "loss": 0.71937567, "num_input_tokens_seen": 94767570, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8671875, "step": 4385, "time_per_iteration": 3.996103286743164 }, { "auxiliary_loss_clip": 0.01167962, "auxiliary_loss_mlp": 0.0104296, "balance_loss_clip": 1.02622318, "balance_loss_mlp": 1.05289602, "epoch": 0.2637005862017135, "flos": 22893088757760.0, "grad_norm": 1.9164191328669955, "language_loss": 0.85518193, "learning_rate": 3.352425714367191e-06, "loss": 0.8772912, "num_input_tokens_seen": 94784985, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.87890625, "step": 4386, "time_per_iteration": 2.5979199409484863 }, { "auxiliary_loss_clip": 0.01156447, "auxiliary_loss_mlp": 0.01043111, "balance_loss_clip": 1.02640986, "balance_loss_mlp": 1.05073214, "epoch": 0.26376070945438146, "flos": 15048834053760.0, "grad_norm": 2.8085898748980296, "language_loss": 0.77288604, "learning_rate": 3.352147346147736e-06, "loss": 0.79488164, "num_input_tokens_seen": 94802545, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.87890625, "step": 4387, "time_per_iteration": 2.488781452178955 }, { "auxiliary_loss_clip": 0.01149345, "auxiliary_loss_mlp": 0.01045555, "balance_loss_clip": 1.02980745, "balance_loss_mlp": 1.05379772, "epoch": 0.2638208327070494, "flos": 21141833166720.0, "grad_norm": 2.159322817566022, "language_loss": 0.76021409, "learning_rate": 3.35186892967362e-06, "loss": 0.78216308, "num_input_tokens_seen": 94820730, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.86328125, "step": 4388, "time_per_iteration": 2.563964366912842 }, { "auxiliary_loss_clip": 0.01138321, "auxiliary_loss_mlp": 0.01034939, "balance_loss_clip": 1.01851153, "balance_loss_mlp": 1.05030191, "epoch": 0.26388095595971744, "flos": 21725597001600.0, "grad_norm": 2.2694793229391634, "language_loss": 0.85983729, "learning_rate": 3.3515904649547797e-06, "loss": 0.88156992, "num_input_tokens_seen": 94839175, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8828125, "step": 4389, "time_per_iteration": 2.4963150024414062 }, { "auxiliary_loss_clip": 0.01063082, "auxiliary_loss_mlp": 0.0100516, "balance_loss_clip": 1.00294292, "balance_loss_mlp": 1.0255394, "epoch": 0.2639410792123854, "flos": 65515537192320.0, "grad_norm": 0.804878579186816, "language_loss": 0.60379744, "learning_rate": 3.351311952001152e-06, "loss": 0.62447989, "num_input_tokens_seen": 94898865, "router_z_loss_clip": 0.0222168, "router_z_loss_mlp": 0.28515625, "step": 4390, "time_per_iteration": 3.1468183994293213 }, { "auxiliary_loss_clip": 0.01159002, "auxiliary_loss_mlp": 0.01042107, "balance_loss_clip": 1.0248214, "balance_loss_mlp": 1.0502404, "epoch": 0.2640012024650534, "flos": 23948574929280.0, "grad_norm": 1.6993839395043995, "language_loss": 0.77234095, "learning_rate": 3.3510333908226765e-06, "loss": 0.79435205, "num_input_tokens_seen": 94917490, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.90625, "step": 4391, "time_per_iteration": 2.56075119972229 }, { "auxiliary_loss_clip": 0.01080821, "auxiliary_loss_mlp": 0.01248881, "balance_loss_clip": 0.99958992, "balance_loss_mlp": 1.02540219, "epoch": 0.26406132571772134, "flos": 56441163369600.0, "grad_norm": 0.8764006290989176, "language_loss": 0.58716136, "learning_rate": 3.3507547814292953e-06, "loss": 0.61045837, "num_input_tokens_seen": 94969065, "router_z_loss_clip": 0.0222168, "router_z_loss_mlp": 0.28515625, "step": 4392, "time_per_iteration": 3.2036168575286865 }, { "auxiliary_loss_clip": 0.01152441, "auxiliary_loss_mlp": 0.01043553, "balance_loss_clip": 1.02704811, "balance_loss_mlp": 1.0525198, "epoch": 0.2641214489703893, "flos": 22090557139200.0, "grad_norm": 1.7331151262690663, "language_loss": 0.68228507, "learning_rate": 3.35047612383095e-06, "loss": 0.70424509, "num_input_tokens_seen": 94988540, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.90625, "step": 4393, "time_per_iteration": 2.5716030597686768 }, { "auxiliary_loss_clip": 0.01152917, "auxiliary_loss_mlp": 0.01043112, "balance_loss_clip": 1.0249685, "balance_loss_mlp": 1.05144835, "epoch": 0.26418157222305727, "flos": 16544764794240.0, "grad_norm": 2.2389839938093674, "language_loss": 0.83723116, "learning_rate": 3.3501974180375857e-06, "loss": 0.85919142, "num_input_tokens_seen": 95004810, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.92578125, "step": 4394, "time_per_iteration": 2.5309524536132812 }, { "auxiliary_loss_clip": 0.01164056, "auxiliary_loss_mlp": 0.01047814, "balance_loss_clip": 1.02949166, "balance_loss_mlp": 1.05271018, "epoch": 0.26424169547572524, "flos": 18002486442240.0, "grad_norm": 3.5723378790778435, "language_loss": 0.70379663, "learning_rate": 3.349918664059149e-06, "loss": 0.72591531, "num_input_tokens_seen": 95024085, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.93359375, "step": 4395, "time_per_iteration": 2.6328468322753906 }, { "auxiliary_loss_clip": 0.01165098, "auxiliary_loss_mlp": 0.01037646, "balance_loss_clip": 1.02094424, "balance_loss_mlp": 1.05091417, "epoch": 0.2643018187283932, "flos": 16983162288000.0, "grad_norm": 2.0001665257178876, "language_loss": 0.86459434, "learning_rate": 3.3496398619055876e-06, "loss": 0.88662177, "num_input_tokens_seen": 95042515, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.87109375, "step": 4396, "time_per_iteration": 2.635362386703491 }, { "auxiliary_loss_clip": 0.01093745, "auxiliary_loss_mlp": 0.0100861, "balance_loss_clip": 1.00638127, "balance_loss_mlp": 1.02027154, "epoch": 0.26436194198106117, "flos": 59664359416320.0, "grad_norm": 0.7898593848036544, "language_loss": 0.55025887, "learning_rate": 3.3493610115868505e-06, "loss": 0.57128245, "num_input_tokens_seen": 95094835, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.28125, "step": 4397, "time_per_iteration": 3.002768039703369 }, { "auxiliary_loss_clip": 0.0114666, "auxiliary_loss_mlp": 0.01049642, "balance_loss_clip": 1.03369236, "balance_loss_mlp": 1.05119503, "epoch": 0.26442206523372913, "flos": 32921322197760.0, "grad_norm": 1.9415421194658127, "language_loss": 0.77689469, "learning_rate": 3.3490821131128905e-06, "loss": 0.79885769, "num_input_tokens_seen": 95113480, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8671875, "step": 4398, "time_per_iteration": 2.6077799797058105 }, { "auxiliary_loss_clip": 0.01150601, "auxiliary_loss_mlp": 0.0104659, "balance_loss_clip": 1.02965045, "balance_loss_mlp": 1.05224562, "epoch": 0.2644821884863971, "flos": 21031300039680.0, "grad_norm": 1.9034121281096092, "language_loss": 0.67077863, "learning_rate": 3.34880316649366e-06, "loss": 0.69275057, "num_input_tokens_seen": 95132580, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8984375, "step": 4399, "time_per_iteration": 2.6447970867156982 }, { "auxiliary_loss_clip": 0.01161506, "auxiliary_loss_mlp": 0.01038488, "balance_loss_clip": 1.02258539, "balance_loss_mlp": 1.05045891, "epoch": 0.26454231173906506, "flos": 20776801201920.0, "grad_norm": 1.9042453348624044, "language_loss": 0.86387581, "learning_rate": 3.3485241717391137e-06, "loss": 0.88587576, "num_input_tokens_seen": 95152375, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8359375, "step": 4400, "time_per_iteration": 2.5333704948425293 }, { "auxiliary_loss_clip": 0.01157953, "auxiliary_loss_mlp": 0.01039669, "balance_loss_clip": 1.02245474, "balance_loss_mlp": 1.05116212, "epoch": 0.264602434991733, "flos": 16618669027200.0, "grad_norm": 2.064381072725013, "language_loss": 0.75838637, "learning_rate": 3.348245128859209e-06, "loss": 0.78036261, "num_input_tokens_seen": 95170265, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.88671875, "step": 4401, "time_per_iteration": 2.608494997024536 }, { "auxiliary_loss_clip": 0.01166122, "auxiliary_loss_mlp": 0.01046868, "balance_loss_clip": 1.02883172, "balance_loss_mlp": 1.04734993, "epoch": 0.26466255824440105, "flos": 19062677295360.0, "grad_norm": 1.762051528161893, "language_loss": 0.8813656, "learning_rate": 3.3479660378639036e-06, "loss": 0.90349549, "num_input_tokens_seen": 95188655, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.91796875, "step": 4402, "time_per_iteration": 2.5011684894561768 }, { "auxiliary_loss_clip": 0.01165021, "auxiliary_loss_mlp": 0.01045902, "balance_loss_clip": 1.02880704, "balance_loss_mlp": 1.04813385, "epoch": 0.264722681497069, "flos": 22638554006400.0, "grad_norm": 1.6795407938708014, "language_loss": 0.78036451, "learning_rate": 3.3476868987631575e-06, "loss": 0.80247378, "num_input_tokens_seen": 95209615, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.89453125, "step": 4403, "time_per_iteration": 2.615813732147217 }, { "auxiliary_loss_clip": 0.01145016, "auxiliary_loss_mlp": 0.01037864, "balance_loss_clip": 1.02110314, "balance_loss_mlp": 1.04591084, "epoch": 0.264782804749737, "flos": 22492253911680.0, "grad_norm": 2.869388228472334, "language_loss": 0.87828422, "learning_rate": 3.3474077115669327e-06, "loss": 0.90011299, "num_input_tokens_seen": 95227810, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8984375, "step": 4404, "time_per_iteration": 2.5282154083251953 }, { "auxiliary_loss_clip": 0.01155787, "auxiliary_loss_mlp": 0.01041433, "balance_loss_clip": 1.02555418, "balance_loss_mlp": 1.0472939, "epoch": 0.26484292800240494, "flos": 16800269212800.0, "grad_norm": 1.9411195899821387, "language_loss": 0.76154447, "learning_rate": 3.347128476285193e-06, "loss": 0.78351665, "num_input_tokens_seen": 95245890, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.90625, "step": 4405, "time_per_iteration": 2.599170446395874 }, { "auxiliary_loss_clip": 0.01138981, "auxiliary_loss_mlp": 0.01039301, "balance_loss_clip": 1.02189636, "balance_loss_mlp": 1.04949009, "epoch": 0.2649030512550729, "flos": 20449583280000.0, "grad_norm": 1.791115644147841, "language_loss": 0.69740462, "learning_rate": 3.346849192927903e-06, "loss": 0.7191875, "num_input_tokens_seen": 95264955, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.89453125, "step": 4406, "time_per_iteration": 2.468763828277588 }, { "auxiliary_loss_clip": 0.01154709, "auxiliary_loss_mlp": 0.01048211, "balance_loss_clip": 1.03248739, "balance_loss_mlp": 1.0490849, "epoch": 0.2649631745077409, "flos": 22416123035520.0, "grad_norm": 1.6880417378551402, "language_loss": 0.83339566, "learning_rate": 3.3465698615050295e-06, "loss": 0.85542488, "num_input_tokens_seen": 95284245, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.875, "step": 4407, "time_per_iteration": 2.6431405544281006 }, { "auxiliary_loss_clip": 0.01160412, "auxiliary_loss_mlp": 0.01033476, "balance_loss_clip": 1.01718628, "balance_loss_mlp": 1.04628849, "epoch": 0.26502329776040884, "flos": 35116110927360.0, "grad_norm": 1.7202291285840778, "language_loss": 0.75771838, "learning_rate": 3.346290482026542e-06, "loss": 0.77965724, "num_input_tokens_seen": 95307125, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.875, "step": 4408, "time_per_iteration": 2.6606900691986084 }, { "auxiliary_loss_clip": 0.01162532, "auxiliary_loss_mlp": 0.01038978, "balance_loss_clip": 1.02289653, "balance_loss_mlp": 1.04927897, "epoch": 0.2650834210130768, "flos": 38687498438400.0, "grad_norm": 1.76284282447839, "language_loss": 0.70909476, "learning_rate": 3.3460110545024094e-06, "loss": 0.73110992, "num_input_tokens_seen": 95329150, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.859375, "step": 4409, "time_per_iteration": 2.7214696407318115 }, { "auxiliary_loss_clip": 0.01145916, "auxiliary_loss_mlp": 0.01042227, "balance_loss_clip": 1.02484643, "balance_loss_mlp": 1.04685092, "epoch": 0.26514354426574477, "flos": 24716847951360.0, "grad_norm": 1.667922518238131, "language_loss": 0.73103142, "learning_rate": 3.3457315789426054e-06, "loss": 0.75291276, "num_input_tokens_seen": 95349880, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.90234375, "step": 4410, "time_per_iteration": 2.617563247680664 }, { "auxiliary_loss_clip": 0.01150781, "auxiliary_loss_mlp": 0.01051859, "balance_loss_clip": 1.03570652, "balance_loss_mlp": 1.05015254, "epoch": 0.26520366751841273, "flos": 20340055733760.0, "grad_norm": 1.911445435016808, "language_loss": 0.72937787, "learning_rate": 3.345452055357103e-06, "loss": 0.75140429, "num_input_tokens_seen": 95368570, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.9140625, "step": 4411, "time_per_iteration": 2.6545870304107666 }, { "auxiliary_loss_clip": 0.01170645, "auxiliary_loss_mlp": 0.01041551, "balance_loss_clip": 1.02505207, "balance_loss_mlp": 1.04603684, "epoch": 0.2652637907710807, "flos": 22343870828160.0, "grad_norm": 1.9379088486938243, "language_loss": 0.81954443, "learning_rate": 3.345172483755878e-06, "loss": 0.84166634, "num_input_tokens_seen": 95387065, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8828125, "step": 4412, "time_per_iteration": 2.5888891220092773 }, { "auxiliary_loss_clip": 0.01144257, "auxiliary_loss_mlp": 0.0105273, "balance_loss_clip": 1.03668475, "balance_loss_mlp": 1.04663312, "epoch": 0.26532391402374866, "flos": 19354235990400.0, "grad_norm": 1.658522627156338, "language_loss": 0.74201941, "learning_rate": 3.3448928641489057e-06, "loss": 0.76398933, "num_input_tokens_seen": 95406345, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.890625, "step": 4413, "time_per_iteration": 2.5412282943725586 }, { "auxiliary_loss_clip": 0.01054624, "auxiliary_loss_mlp": 0.01032254, "balance_loss_clip": 1.03021586, "balance_loss_mlp": 1.0175755, "epoch": 0.26538403727641663, "flos": 44787611422080.0, "grad_norm": 0.8815434917513609, "language_loss": 0.56979001, "learning_rate": 3.344613196546168e-06, "loss": 0.59065878, "num_input_tokens_seen": 95463595, "router_z_loss_clip": 0.02038574, "router_z_loss_mlp": 0.27929688, "step": 4414, "time_per_iteration": 3.0544588565826416 }, { "auxiliary_loss_clip": 0.01151899, "auxiliary_loss_mlp": 0.01045864, "balance_loss_clip": 1.03041446, "balance_loss_mlp": 1.04857409, "epoch": 0.26544416052908465, "flos": 28182119708160.0, "grad_norm": 1.649622049183269, "language_loss": 0.74652678, "learning_rate": 3.3443334809576434e-06, "loss": 0.76850438, "num_input_tokens_seen": 95484115, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8515625, "step": 4415, "time_per_iteration": 2.605867385864258 }, { "auxiliary_loss_clip": 0.01175502, "auxiliary_loss_mlp": 0.01041639, "balance_loss_clip": 1.02245796, "balance_loss_mlp": 1.04773891, "epoch": 0.2655042837817526, "flos": 17565274097280.0, "grad_norm": 2.822379035656687, "language_loss": 0.86837947, "learning_rate": 3.344053717393315e-06, "loss": 0.89055097, "num_input_tokens_seen": 95501435, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.9140625, "step": 4416, "time_per_iteration": 2.5689072608947754 }, { "auxiliary_loss_clip": 0.01145554, "auxiliary_loss_mlp": 0.01042787, "balance_loss_clip": 1.02492929, "balance_loss_mlp": 1.04655051, "epoch": 0.2655644070344206, "flos": 23404636298880.0, "grad_norm": 1.5454066479090547, "language_loss": 0.76459587, "learning_rate": 3.343773905863167e-06, "loss": 0.78647923, "num_input_tokens_seen": 95520135, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.8984375, "step": 4417, "time_per_iteration": 2.515113592147827 }, { "auxiliary_loss_clip": 0.01151214, "auxiliary_loss_mlp": 0.01039059, "balance_loss_clip": 1.0207603, "balance_loss_mlp": 1.04639769, "epoch": 0.26562453028708854, "flos": 26468462678400.0, "grad_norm": 1.5568367179422096, "language_loss": 0.80345869, "learning_rate": 3.3434940463771847e-06, "loss": 0.82536137, "num_input_tokens_seen": 95541705, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.8671875, "step": 4418, "time_per_iteration": 2.6321868896484375 }, { "auxiliary_loss_clip": 0.01148579, "auxiliary_loss_mlp": 0.01045809, "balance_loss_clip": 1.0293107, "balance_loss_mlp": 1.0492928, "epoch": 0.2656846535397565, "flos": 19207576759680.0, "grad_norm": 2.152147735288132, "language_loss": 0.67202353, "learning_rate": 3.343214138945356e-06, "loss": 0.69396746, "num_input_tokens_seen": 95560300, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.90234375, "step": 4419, "time_per_iteration": 3.8977596759796143 }, { "auxiliary_loss_clip": 0.01137579, "auxiliary_loss_mlp": 0.01048214, "balance_loss_clip": 1.03003502, "balance_loss_mlp": 1.04724646, "epoch": 0.2657447767924245, "flos": 30551325903360.0, "grad_norm": 1.7362636383800365, "language_loss": 0.79288948, "learning_rate": 3.3429341835776695e-06, "loss": 0.81474739, "num_input_tokens_seen": 95580150, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.90234375, "step": 4420, "time_per_iteration": 2.5758469104766846 }, { "auxiliary_loss_clip": 0.01159957, "auxiliary_loss_mlp": 0.01053643, "balance_loss_clip": 1.03428316, "balance_loss_mlp": 1.04981995, "epoch": 0.26580490004509244, "flos": 20922742160640.0, "grad_norm": 2.0881918525254894, "language_loss": 0.81465149, "learning_rate": 3.342654180284117e-06, "loss": 0.83678746, "num_input_tokens_seen": 95597570, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.92578125, "step": 4421, "time_per_iteration": 2.6258034706115723 }, { "auxiliary_loss_clip": 0.01144265, "auxiliary_loss_mlp": 0.01039962, "balance_loss_clip": 1.02289104, "balance_loss_mlp": 1.04814458, "epoch": 0.2658650232977604, "flos": 43945682584320.0, "grad_norm": 1.740090617065316, "language_loss": 0.65451783, "learning_rate": 3.3423741290746897e-06, "loss": 0.67636007, "num_input_tokens_seen": 95619415, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.875, "step": 4422, "time_per_iteration": 2.70646595954895 }, { "auxiliary_loss_clip": 0.01147958, "auxiliary_loss_mlp": 0.01043986, "balance_loss_clip": 1.02754688, "balance_loss_mlp": 1.04786205, "epoch": 0.26592514655042837, "flos": 29716439109120.0, "grad_norm": 2.0989615789123484, "language_loss": 0.74003541, "learning_rate": 3.342094029959383e-06, "loss": 0.76195484, "num_input_tokens_seen": 95639155, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.90625, "step": 4423, "time_per_iteration": 4.037718772888184 }, { "auxiliary_loss_clip": 0.01146793, "auxiliary_loss_mlp": 0.01047826, "balance_loss_clip": 1.03079128, "balance_loss_mlp": 1.04647708, "epoch": 0.26598526980309634, "flos": 46677730014720.0, "grad_norm": 1.641778507472307, "language_loss": 0.77717674, "learning_rate": 3.341813882948193e-06, "loss": 0.79912293, "num_input_tokens_seen": 95663320, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9140625, "step": 4424, "time_per_iteration": 4.347491979598999 }, { "auxiliary_loss_clip": 0.01147707, "auxiliary_loss_mlp": 0.01045153, "balance_loss_clip": 1.02895892, "balance_loss_mlp": 1.04867136, "epoch": 0.2660453930557643, "flos": 11509442582400.0, "grad_norm": 2.0134445615241714, "language_loss": 0.79243773, "learning_rate": 3.341533688051117e-06, "loss": 0.81436634, "num_input_tokens_seen": 95680260, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8984375, "step": 4425, "time_per_iteration": 2.551563024520874 }, { "auxiliary_loss_clip": 0.01151946, "auxiliary_loss_mlp": 0.01044336, "balance_loss_clip": 1.02860022, "balance_loss_mlp": 1.0479331, "epoch": 0.26610551630843227, "flos": 24791578197120.0, "grad_norm": 1.6402798993385164, "language_loss": 0.80089545, "learning_rate": 3.3412534452781543e-06, "loss": 0.82285833, "num_input_tokens_seen": 95701140, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.859375, "step": 4426, "time_per_iteration": 4.201855897903442 }, { "auxiliary_loss_clip": 0.01139652, "auxiliary_loss_mlp": 0.01048398, "balance_loss_clip": 1.03144634, "balance_loss_mlp": 1.05107117, "epoch": 0.26616563956110023, "flos": 27636385397760.0, "grad_norm": 2.296640223410501, "language_loss": 0.76649022, "learning_rate": 3.3409731546393067e-06, "loss": 0.78837073, "num_input_tokens_seen": 95722060, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8828125, "step": 4427, "time_per_iteration": 2.592634439468384 }, { "auxiliary_loss_clip": 0.01160639, "auxiliary_loss_mlp": 0.01037071, "balance_loss_clip": 1.02046514, "balance_loss_mlp": 1.04712391, "epoch": 0.26622576281376825, "flos": 28362893880960.0, "grad_norm": 1.4836996598859251, "language_loss": 0.76576167, "learning_rate": 3.3406928161445756e-06, "loss": 0.78773874, "num_input_tokens_seen": 95742495, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8671875, "step": 4428, "time_per_iteration": 2.6305110454559326 }, { "auxiliary_loss_clip": 0.0116636, "auxiliary_loss_mlp": 0.01280758, "balance_loss_clip": 1.01730442, "balance_loss_mlp": 1.04869175, "epoch": 0.2662858860664362, "flos": 18041341979520.0, "grad_norm": 2.4921485253648004, "language_loss": 0.82798159, "learning_rate": 3.340412429803967e-06, "loss": 0.85245275, "num_input_tokens_seen": 95761510, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.90625, "step": 4429, "time_per_iteration": 2.5745632648468018 }, { "auxiliary_loss_clip": 0.01161753, "auxiliary_loss_mlp": 0.01038648, "balance_loss_clip": 1.02197015, "balance_loss_mlp": 1.04739177, "epoch": 0.2663460093191042, "flos": 22745818995840.0, "grad_norm": 1.7217745547421726, "language_loss": 0.71851158, "learning_rate": 3.3401319956274872e-06, "loss": 0.74051559, "num_input_tokens_seen": 95782385, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.87109375, "step": 4430, "time_per_iteration": 2.559629440307617 }, { "auxiliary_loss_clip": 0.0113888, "auxiliary_loss_mlp": 0.01050468, "balance_loss_clip": 1.03308702, "balance_loss_mlp": 1.0489831, "epoch": 0.26640613257177215, "flos": 16508782344960.0, "grad_norm": 2.5555379623845957, "language_loss": 0.8202858, "learning_rate": 3.3398515136251435e-06, "loss": 0.84217924, "num_input_tokens_seen": 95800595, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8984375, "step": 4431, "time_per_iteration": 2.5799388885498047 }, { "auxiliary_loss_clip": 0.01153387, "auxiliary_loss_mlp": 0.01051653, "balance_loss_clip": 1.03386688, "balance_loss_mlp": 1.05006695, "epoch": 0.2664662558244401, "flos": 23075945919360.0, "grad_norm": 2.4223088967455015, "language_loss": 0.76092029, "learning_rate": 3.3395709838069463e-06, "loss": 0.78297067, "num_input_tokens_seen": 95818480, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9453125, "step": 4432, "time_per_iteration": 2.509525775909424 }, { "auxiliary_loss_clip": 0.01173294, "auxiliary_loss_mlp": 0.01040154, "balance_loss_clip": 1.02364373, "balance_loss_mlp": 1.04828548, "epoch": 0.2665263790771081, "flos": 23769273214080.0, "grad_norm": 2.040152387396831, "language_loss": 0.82556164, "learning_rate": 3.3392904061829054e-06, "loss": 0.84769607, "num_input_tokens_seen": 95837205, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.88671875, "step": 4433, "time_per_iteration": 2.6523327827453613 }, { "auxiliary_loss_clip": 0.01145947, "auxiliary_loss_mlp": 0.01041353, "balance_loss_clip": 1.02444923, "balance_loss_mlp": 1.04914403, "epoch": 0.26658650232977604, "flos": 28001273708160.0, "grad_norm": 2.151998661248645, "language_loss": 0.76506019, "learning_rate": 3.3390097807630353e-06, "loss": 0.78693318, "num_input_tokens_seen": 95858395, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.875, "step": 4434, "time_per_iteration": 2.57668137550354 }, { "auxiliary_loss_clip": 0.01134424, "auxiliary_loss_mlp": 0.01042471, "balance_loss_clip": 1.02591324, "balance_loss_mlp": 1.04700875, "epoch": 0.266646625582444, "flos": 22163635359360.0, "grad_norm": 1.9363740839470025, "language_loss": 0.8233161, "learning_rate": 3.3387291075573508e-06, "loss": 0.84508502, "num_input_tokens_seen": 95877875, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.875, "step": 4435, "time_per_iteration": 2.5462677478790283 }, { "auxiliary_loss_clip": 0.01180558, "auxiliary_loss_mlp": 0.01054941, "balance_loss_clip": 1.03673744, "balance_loss_mlp": 1.0500896, "epoch": 0.266706748835112, "flos": 27853537069440.0, "grad_norm": 2.168982944261232, "language_loss": 0.87454343, "learning_rate": 3.3384483865758677e-06, "loss": 0.89689839, "num_input_tokens_seen": 95895820, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.94140625, "step": 4436, "time_per_iteration": 2.6085870265960693 }, { "auxiliary_loss_clip": 0.01158236, "auxiliary_loss_mlp": 0.01044204, "balance_loss_clip": 1.02780092, "balance_loss_mlp": 1.04883933, "epoch": 0.26676687208777994, "flos": 25812123413760.0, "grad_norm": 1.9163252969887794, "language_loss": 0.788396, "learning_rate": 3.3381676178286047e-06, "loss": 0.81042033, "num_input_tokens_seen": 95918025, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9140625, "step": 4437, "time_per_iteration": 2.6624279022216797 }, { "auxiliary_loss_clip": 0.01183965, "auxiliary_loss_mlp": 0.0104305, "balance_loss_clip": 1.02614665, "balance_loss_mlp": 1.05031276, "epoch": 0.2668269953404479, "flos": 36064583504640.0, "grad_norm": 1.8345343879739515, "language_loss": 0.64012659, "learning_rate": 3.337886801325582e-06, "loss": 0.66239679, "num_input_tokens_seen": 95937725, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.890625, "step": 4438, "time_per_iteration": 2.738715887069702 }, { "auxiliary_loss_clip": 0.01184098, "auxiliary_loss_mlp": 0.01040728, "balance_loss_clip": 1.02470624, "balance_loss_mlp": 1.04896307, "epoch": 0.26688711859311587, "flos": 26570987072640.0, "grad_norm": 1.8704037479191369, "language_loss": 0.75815165, "learning_rate": 3.3376059370768202e-06, "loss": 0.78039992, "num_input_tokens_seen": 95956335, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8984375, "step": 4439, "time_per_iteration": 2.7470715045928955 }, { "auxiliary_loss_clip": 0.01185574, "auxiliary_loss_mlp": 0.01040446, "balance_loss_clip": 1.02236247, "balance_loss_mlp": 1.04909825, "epoch": 0.26694724184578383, "flos": 26761565658240.0, "grad_norm": 2.9368017807126603, "language_loss": 0.71682787, "learning_rate": 3.337325025092344e-06, "loss": 0.73908806, "num_input_tokens_seen": 95977135, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.91796875, "step": 4440, "time_per_iteration": 2.6955618858337402 }, { "auxiliary_loss_clip": 0.01161166, "auxiliary_loss_mlp": 0.01044132, "balance_loss_clip": 1.02573776, "balance_loss_mlp": 1.05088592, "epoch": 0.2670073650984518, "flos": 20959586536320.0, "grad_norm": 1.8661566647256635, "language_loss": 0.66468501, "learning_rate": 3.337044065382177e-06, "loss": 0.68673807, "num_input_tokens_seen": 95995435, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.921875, "step": 4441, "time_per_iteration": 2.6342926025390625 }, { "auxiliary_loss_clip": 0.01147054, "auxiliary_loss_mlp": 0.01045165, "balance_loss_clip": 1.02741456, "balance_loss_mlp": 1.04819524, "epoch": 0.2670674883511198, "flos": 28366054277760.0, "grad_norm": 1.452919089255241, "language_loss": 0.76171649, "learning_rate": 3.3367630579563465e-06, "loss": 0.78363872, "num_input_tokens_seen": 96016340, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.8984375, "step": 4442, "time_per_iteration": 2.7401180267333984 }, { "auxiliary_loss_clip": 0.01093133, "auxiliary_loss_mlp": 0.01015481, "balance_loss_clip": 1.01310849, "balance_loss_mlp": 1.01979399, "epoch": 0.2671276116037878, "flos": 58971319430400.0, "grad_norm": 0.9290511232967265, "language_loss": 0.61194372, "learning_rate": 3.3364820028248816e-06, "loss": 0.63302982, "num_input_tokens_seen": 96071205, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.28515625, "step": 4443, "time_per_iteration": 3.1473982334136963 }, { "auxiliary_loss_clip": 0.01168412, "auxiliary_loss_mlp": 0.01040531, "balance_loss_clip": 1.02430701, "balance_loss_mlp": 1.05085778, "epoch": 0.26718773485645575, "flos": 43945072053120.0, "grad_norm": 2.5947201621510776, "language_loss": 0.76375759, "learning_rate": 3.336200899997812e-06, "loss": 0.78584695, "num_input_tokens_seen": 96094240, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.90625, "step": 4444, "time_per_iteration": 2.8243699073791504 }, { "auxiliary_loss_clip": 0.01148962, "auxiliary_loss_mlp": 0.01042425, "balance_loss_clip": 1.02449596, "balance_loss_mlp": 1.04953766, "epoch": 0.2672478581091237, "flos": 25228323665280.0, "grad_norm": 1.6876234111680957, "language_loss": 0.79869783, "learning_rate": 3.3359197494851687e-06, "loss": 0.82061166, "num_input_tokens_seen": 96114105, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.90625, "step": 4445, "time_per_iteration": 2.5475728511810303 }, { "auxiliary_loss_clip": 0.01158696, "auxiliary_loss_mlp": 0.01041481, "balance_loss_clip": 1.02326548, "balance_loss_mlp": 1.0488658, "epoch": 0.2673079813617917, "flos": 15268176455040.0, "grad_norm": 1.988143966034479, "language_loss": 0.89044297, "learning_rate": 3.335638551296986e-06, "loss": 0.91244471, "num_input_tokens_seen": 96132140, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.91796875, "step": 4446, "time_per_iteration": 2.60145902633667 }, { "auxiliary_loss_clip": 0.01149849, "auxiliary_loss_mlp": 0.01049752, "balance_loss_clip": 1.03278875, "balance_loss_mlp": 1.05095744, "epoch": 0.26736810461445965, "flos": 25812733944960.0, "grad_norm": 2.3379519200020793, "language_loss": 0.68102199, "learning_rate": 3.3353573054432997e-06, "loss": 0.70301801, "num_input_tokens_seen": 96152090, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8984375, "step": 4447, "time_per_iteration": 2.579591751098633 }, { "auxiliary_loss_clip": 0.01149763, "auxiliary_loss_mlp": 0.01039319, "balance_loss_clip": 1.02227187, "balance_loss_mlp": 1.04966629, "epoch": 0.2674282278671276, "flos": 24312709054080.0, "grad_norm": 1.818421956648323, "language_loss": 0.83568096, "learning_rate": 3.335076011934146e-06, "loss": 0.85757178, "num_input_tokens_seen": 96170015, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9140625, "step": 4448, "time_per_iteration": 2.628227949142456 }, { "auxiliary_loss_clip": 0.0116371, "auxiliary_loss_mlp": 0.01049387, "balance_loss_clip": 1.03201771, "balance_loss_mlp": 1.04891598, "epoch": 0.2674883511197956, "flos": 22815521337600.0, "grad_norm": 2.8858649995971293, "language_loss": 0.84197468, "learning_rate": 3.3347946707795627e-06, "loss": 0.8641057, "num_input_tokens_seen": 96188065, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.875, "step": 4449, "time_per_iteration": 2.5663981437683105 }, { "auxiliary_loss_clip": 0.01166126, "auxiliary_loss_mlp": 0.0105296, "balance_loss_clip": 1.03463733, "balance_loss_mlp": 1.05008447, "epoch": 0.26754847437246354, "flos": 25370170473600.0, "grad_norm": 3.2295888506701966, "language_loss": 0.83962572, "learning_rate": 3.3345132819895918e-06, "loss": 0.86181659, "num_input_tokens_seen": 96205780, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9765625, "step": 4450, "time_per_iteration": 2.6725423336029053 }, { "auxiliary_loss_clip": 0.0116151, "auxiliary_loss_mlp": 0.01049337, "balance_loss_clip": 1.03338671, "balance_loss_mlp": 1.04792309, "epoch": 0.2676085976251315, "flos": 20230420446720.0, "grad_norm": 1.6768662807029822, "language_loss": 0.80848378, "learning_rate": 3.3342318455742748e-06, "loss": 0.83059227, "num_input_tokens_seen": 96224990, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8671875, "step": 4451, "time_per_iteration": 2.609532356262207 }, { "auxiliary_loss_clip": 0.01150391, "auxiliary_loss_mlp": 0.01044195, "balance_loss_clip": 1.02837563, "balance_loss_mlp": 1.0524199, "epoch": 0.26766872087779947, "flos": 28038225824640.0, "grad_norm": 2.025081944948484, "language_loss": 0.86197221, "learning_rate": 3.333950361543655e-06, "loss": 0.88391805, "num_input_tokens_seen": 96245345, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.890625, "step": 4452, "time_per_iteration": 2.7110812664031982 }, { "auxiliary_loss_clip": 0.01161602, "auxiliary_loss_mlp": 0.0104513, "balance_loss_clip": 1.0271771, "balance_loss_mlp": 1.05153263, "epoch": 0.26772884413046744, "flos": 18325179250560.0, "grad_norm": 2.298277198814409, "language_loss": 0.83135521, "learning_rate": 3.333668829907778e-06, "loss": 0.85342246, "num_input_tokens_seen": 96259000, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.91796875, "step": 4453, "time_per_iteration": 2.5685977935791016 }, { "auxiliary_loss_clip": 0.01147072, "auxiliary_loss_mlp": 0.0105296, "balance_loss_clip": 1.0359726, "balance_loss_mlp": 1.04924011, "epoch": 0.2677889673831354, "flos": 22127509255680.0, "grad_norm": 2.32565357200011, "language_loss": 0.79886216, "learning_rate": 3.333387250676692e-06, "loss": 0.82086247, "num_input_tokens_seen": 96277000, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.890625, "step": 4454, "time_per_iteration": 2.6594438552856445 }, { "auxiliary_loss_clip": 0.01159196, "auxiliary_loss_mlp": 0.01048474, "balance_loss_clip": 1.03174877, "balance_loss_mlp": 1.05088687, "epoch": 0.2678490906358034, "flos": 23729699404800.0, "grad_norm": 1.741402858794122, "language_loss": 0.72825837, "learning_rate": 3.3331056238604437e-06, "loss": 0.7503351, "num_input_tokens_seen": 96297010, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.90625, "step": 4455, "time_per_iteration": 2.645455837249756 }, { "auxiliary_loss_clip": 0.01166636, "auxiliary_loss_mlp": 0.01044739, "balance_loss_clip": 1.02859807, "balance_loss_mlp": 1.05247438, "epoch": 0.2679092138884714, "flos": 21762872340480.0, "grad_norm": 2.065766633141292, "language_loss": 0.73682535, "learning_rate": 3.3328239494690856e-06, "loss": 0.75893915, "num_input_tokens_seen": 96315780, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.87109375, "step": 4456, "time_per_iteration": 2.6560847759246826 }, { "auxiliary_loss_clip": 0.01152392, "auxiliary_loss_mlp": 0.01042614, "balance_loss_clip": 1.02559066, "balance_loss_mlp": 1.05065835, "epoch": 0.26796933714113935, "flos": 19861186590720.0, "grad_norm": 2.327597977493363, "language_loss": 0.70998752, "learning_rate": 3.332542227512669e-06, "loss": 0.73193759, "num_input_tokens_seen": 96333465, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9296875, "step": 4457, "time_per_iteration": 2.5591583251953125 }, { "auxiliary_loss_clip": 0.01172739, "auxiliary_loss_mlp": 0.01052741, "balance_loss_clip": 1.0352056, "balance_loss_mlp": 1.05454731, "epoch": 0.2680294603938073, "flos": 20047886507520.0, "grad_norm": 1.9001531897106376, "language_loss": 0.79041088, "learning_rate": 3.332260458001248e-06, "loss": 0.8126657, "num_input_tokens_seen": 96352005, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9140625, "step": 4458, "time_per_iteration": 2.6505374908447266 }, { "auxiliary_loss_clip": 0.01075224, "auxiliary_loss_mlp": 0.01014791, "balance_loss_clip": 1.01255, "balance_loss_mlp": 1.02059484, "epoch": 0.2680895836464753, "flos": 72113763052800.0, "grad_norm": 0.8645369143352629, "language_loss": 0.58660758, "learning_rate": 3.3319786409448776e-06, "loss": 0.60750777, "num_input_tokens_seen": 96406265, "router_z_loss_clip": 0.02246094, "router_z_loss_mlp": 0.27929688, "step": 4459, "time_per_iteration": 3.2095530033111572 }, { "auxiliary_loss_clip": 0.01174448, "auxiliary_loss_mlp": 0.0104434, "balance_loss_clip": 1.0280683, "balance_loss_mlp": 1.04943776, "epoch": 0.26814970689914325, "flos": 20449044576000.0, "grad_norm": 2.6801745191929864, "language_loss": 0.85083491, "learning_rate": 3.3316967763536167e-06, "loss": 0.87302279, "num_input_tokens_seen": 96425225, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8828125, "step": 4460, "time_per_iteration": 4.009393930435181 }, { "auxiliary_loss_clip": 0.01148404, "auxiliary_loss_mlp": 0.01042732, "balance_loss_clip": 1.025792, "balance_loss_mlp": 1.05088758, "epoch": 0.2682098301518112, "flos": 17566674727680.0, "grad_norm": 2.0376601644533174, "language_loss": 0.68210733, "learning_rate": 3.331414864237523e-06, "loss": 0.70401865, "num_input_tokens_seen": 96443780, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.88671875, "step": 4461, "time_per_iteration": 2.555388927459717 }, { "auxiliary_loss_clip": 0.01164391, "auxiliary_loss_mlp": 0.01049825, "balance_loss_clip": 1.03315961, "balance_loss_mlp": 1.04943585, "epoch": 0.2682699534044792, "flos": 18333259810560.0, "grad_norm": 1.4860685860229879, "language_loss": 0.66985571, "learning_rate": 3.331132904606658e-06, "loss": 0.69199789, "num_input_tokens_seen": 96464530, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.875, "step": 4462, "time_per_iteration": 2.6974174976348877 }, { "auxiliary_loss_clip": 0.01166331, "auxiliary_loss_mlp": 0.01046199, "balance_loss_clip": 1.0284251, "balance_loss_mlp": 1.05101717, "epoch": 0.26833007665714714, "flos": 25301294144640.0, "grad_norm": 2.1107889928812282, "language_loss": 0.69533527, "learning_rate": 3.330850897471083e-06, "loss": 0.71746057, "num_input_tokens_seen": 96483345, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.88671875, "step": 4463, "time_per_iteration": 2.612902879714966 }, { "auxiliary_loss_clip": 0.01164257, "auxiliary_loss_mlp": 0.010427, "balance_loss_clip": 1.02541471, "balance_loss_mlp": 1.05317187, "epoch": 0.2683901999098151, "flos": 16099759198080.0, "grad_norm": 4.651015584079468, "language_loss": 0.77581638, "learning_rate": 3.3305688428408634e-06, "loss": 0.79788601, "num_input_tokens_seen": 96498305, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9296875, "step": 4464, "time_per_iteration": 3.9546492099761963 }, { "auxiliary_loss_clip": 0.01148387, "auxiliary_loss_mlp": 0.01042073, "balance_loss_clip": 1.02478814, "balance_loss_mlp": 1.050102, "epoch": 0.2684503231624831, "flos": 27308054154240.0, "grad_norm": 1.6593893909192592, "language_loss": 0.71053886, "learning_rate": 3.330286740726064e-06, "loss": 0.73244345, "num_input_tokens_seen": 96519740, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.890625, "step": 4465, "time_per_iteration": 4.0813562870025635 }, { "auxiliary_loss_clip": 0.01068086, "auxiliary_loss_mlp": 0.01005418, "balance_loss_clip": 1.00329614, "balance_loss_mlp": 1.02230644, "epoch": 0.26851044641515104, "flos": 71858007239040.0, "grad_norm": 0.6789205443508488, "language_loss": 0.53088367, "learning_rate": 3.3300045911367527e-06, "loss": 0.5516187, "num_input_tokens_seen": 96588870, "router_z_loss_clip": 0.02124023, "router_z_loss_mlp": 0.28125, "step": 4466, "time_per_iteration": 3.2587265968322754 }, { "auxiliary_loss_clip": 0.01155445, "auxiliary_loss_mlp": 0.01046282, "balance_loss_clip": 1.02899706, "balance_loss_mlp": 1.04942179, "epoch": 0.268570569667819, "flos": 18733771434240.0, "grad_norm": 2.834981075903952, "language_loss": 0.7386505, "learning_rate": 3.3297223940829993e-06, "loss": 0.7606678, "num_input_tokens_seen": 96605100, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8828125, "step": 4467, "time_per_iteration": 2.5823585987091064 }, { "auxiliary_loss_clip": 0.01162875, "auxiliary_loss_mlp": 0.01296939, "balance_loss_clip": 1.03119278, "balance_loss_mlp": 1.05422759, "epoch": 0.268630692920487, "flos": 18178376365440.0, "grad_norm": 3.072101426659116, "language_loss": 0.8087306, "learning_rate": 3.3294401495748733e-06, "loss": 0.83332872, "num_input_tokens_seen": 96621410, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.90625, "step": 4468, "time_per_iteration": 4.026928901672363 }, { "auxiliary_loss_clip": 0.01049886, "auxiliary_loss_mlp": 0.01004724, "balance_loss_clip": 1.00257874, "balance_loss_mlp": 1.02153015, "epoch": 0.268690816173155, "flos": 68731768978560.0, "grad_norm": 0.8421392015582257, "language_loss": 0.59491265, "learning_rate": 3.3291578576224487e-06, "loss": 0.61545873, "num_input_tokens_seen": 96684810, "router_z_loss_clip": 0.02148438, "router_z_loss_mlp": 0.28320312, "step": 4469, "time_per_iteration": 3.247283935546875 }, { "auxiliary_loss_clip": 0.01143105, "auxiliary_loss_mlp": 0.01045924, "balance_loss_clip": 1.02781618, "balance_loss_mlp": 1.05268228, "epoch": 0.26875093942582295, "flos": 23293636295040.0, "grad_norm": 1.9979041150519263, "language_loss": 0.8183623, "learning_rate": 3.328875518235799e-06, "loss": 0.84025264, "num_input_tokens_seen": 96701920, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.90625, "step": 4470, "time_per_iteration": 2.5787320137023926 }, { "auxiliary_loss_clip": 0.01154653, "auxiliary_loss_mlp": 0.01041171, "balance_loss_clip": 1.02430272, "balance_loss_mlp": 1.05195308, "epoch": 0.2688110626784909, "flos": 21543458112000.0, "grad_norm": 1.7592546577658283, "language_loss": 0.82522798, "learning_rate": 3.328593131425e-06, "loss": 0.84718621, "num_input_tokens_seen": 96721260, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.84375, "step": 4471, "time_per_iteration": 2.530425786972046 }, { "auxiliary_loss_clip": 0.01146412, "auxiliary_loss_mlp": 0.01041518, "balance_loss_clip": 1.02552021, "balance_loss_mlp": 1.05205953, "epoch": 0.2688711859311589, "flos": 28400600183040.0, "grad_norm": 2.05287228865814, "language_loss": 0.68843251, "learning_rate": 3.3283106972001303e-06, "loss": 0.71031177, "num_input_tokens_seen": 96740385, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8515625, "step": 4472, "time_per_iteration": 2.657679557800293 }, { "auxiliary_loss_clip": 0.01136767, "auxiliary_loss_mlp": 0.01041057, "balance_loss_clip": 1.02509499, "balance_loss_mlp": 1.05044246, "epoch": 0.26893130918382685, "flos": 25994944661760.0, "grad_norm": 1.515427738093528, "language_loss": 0.67513752, "learning_rate": 3.3280282155712684e-06, "loss": 0.69691569, "num_input_tokens_seen": 96761860, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.86328125, "step": 4473, "time_per_iteration": 2.585967540740967 }, { "auxiliary_loss_clip": 0.01156337, "auxiliary_loss_mlp": 0.0104835, "balance_loss_clip": 1.03232789, "balance_loss_mlp": 1.05243695, "epoch": 0.2689914324364948, "flos": 20339624770560.0, "grad_norm": 1.7566582713798435, "language_loss": 0.82547766, "learning_rate": 3.3277456865484956e-06, "loss": 0.84752452, "num_input_tokens_seen": 96781890, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.859375, "step": 4474, "time_per_iteration": 2.612928867340088 }, { "auxiliary_loss_clip": 0.0114656, "auxiliary_loss_mlp": 0.01051672, "balance_loss_clip": 1.03543544, "balance_loss_mlp": 1.05191922, "epoch": 0.2690515556891628, "flos": 19464553635840.0, "grad_norm": 2.011833383848885, "language_loss": 0.70637023, "learning_rate": 3.3274631101418942e-06, "loss": 0.72835255, "num_input_tokens_seen": 96800390, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.85546875, "step": 4475, "time_per_iteration": 2.6342413425445557 }, { "auxiliary_loss_clip": 0.01167146, "auxiliary_loss_mlp": 0.01293414, "balance_loss_clip": 1.02914238, "balance_loss_mlp": 1.05161166, "epoch": 0.26911167894183075, "flos": 18146631720960.0, "grad_norm": 1.6570406689386792, "language_loss": 0.7314862, "learning_rate": 3.32718048636155e-06, "loss": 0.75609183, "num_input_tokens_seen": 96816685, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.890625, "step": 4476, "time_per_iteration": 2.629032850265503 }, { "auxiliary_loss_clip": 0.01154524, "auxiliary_loss_mlp": 0.01044013, "balance_loss_clip": 1.02831292, "balance_loss_mlp": 1.05106473, "epoch": 0.2691718021944987, "flos": 19975131509760.0, "grad_norm": 1.654580444271934, "language_loss": 0.80553043, "learning_rate": 3.3268978152175474e-06, "loss": 0.82751578, "num_input_tokens_seen": 96836285, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.85546875, "step": 4477, "time_per_iteration": 2.577023506164551 }, { "auxiliary_loss_clip": 0.01174239, "auxiliary_loss_mlp": 0.01048268, "balance_loss_clip": 1.03126907, "balance_loss_mlp": 1.04952312, "epoch": 0.2692319254471667, "flos": 37447215770880.0, "grad_norm": 1.6098381041358425, "language_loss": 0.648646, "learning_rate": 3.3266150967199752e-06, "loss": 0.67087108, "num_input_tokens_seen": 96857745, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.88671875, "step": 4478, "time_per_iteration": 2.7820420265197754 }, { "auxiliary_loss_clip": 0.01147699, "auxiliary_loss_mlp": 0.01046023, "balance_loss_clip": 1.02967966, "balance_loss_mlp": 1.05054402, "epoch": 0.26929204869983464, "flos": 22127796564480.0, "grad_norm": 2.0215133350763907, "language_loss": 0.80676281, "learning_rate": 3.3263323308789225e-06, "loss": 0.82870007, "num_input_tokens_seen": 96877295, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87890625, "step": 4479, "time_per_iteration": 2.5821290016174316 }, { "auxiliary_loss_clip": 0.0115806, "auxiliary_loss_mlp": 0.01295765, "balance_loss_clip": 1.03134, "balance_loss_mlp": 1.05054545, "epoch": 0.2693521719525026, "flos": 19792813052160.0, "grad_norm": 2.022883014089009, "language_loss": 0.80807835, "learning_rate": 3.3260495177044806e-06, "loss": 0.83261657, "num_input_tokens_seen": 96896160, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8984375, "step": 4480, "time_per_iteration": 2.578479051589966 }, { "auxiliary_loss_clip": 0.01158001, "auxiliary_loss_mlp": 0.01042537, "balance_loss_clip": 1.02726603, "balance_loss_mlp": 1.04732025, "epoch": 0.2694122952051706, "flos": 20994383836800.0, "grad_norm": 1.9663513195954239, "language_loss": 0.77859223, "learning_rate": 3.325766657206743e-06, "loss": 0.80059767, "num_input_tokens_seen": 96915410, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8359375, "step": 4481, "time_per_iteration": 2.615988254547119 }, { "auxiliary_loss_clip": 0.01146489, "auxiliary_loss_mlp": 0.0104714, "balance_loss_clip": 1.0304029, "balance_loss_mlp": 1.04880273, "epoch": 0.2694724184578386, "flos": 25849291011840.0, "grad_norm": 1.7806112804644172, "language_loss": 0.73783636, "learning_rate": 3.3254837493958032e-06, "loss": 0.75977266, "num_input_tokens_seen": 96937865, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8828125, "step": 4482, "time_per_iteration": 2.589383602142334 }, { "auxiliary_loss_clip": 0.01157456, "auxiliary_loss_mlp": 0.01040242, "balance_loss_clip": 1.02344525, "balance_loss_mlp": 1.05191624, "epoch": 0.26953254171050656, "flos": 21726961718400.0, "grad_norm": 2.0151318529788917, "language_loss": 0.72367895, "learning_rate": 3.3252007942817575e-06, "loss": 0.74565589, "num_input_tokens_seen": 96957710, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.875, "step": 4483, "time_per_iteration": 2.6144766807556152 }, { "auxiliary_loss_clip": 0.01146591, "auxiliary_loss_mlp": 0.01047815, "balance_loss_clip": 1.03047037, "balance_loss_mlp": 1.04610777, "epoch": 0.2695926649631745, "flos": 19682926369920.0, "grad_norm": 2.0436523499952415, "language_loss": 0.86096114, "learning_rate": 3.324917791874705e-06, "loss": 0.88290524, "num_input_tokens_seen": 96975890, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9140625, "step": 4484, "time_per_iteration": 2.6563704013824463 }, { "auxiliary_loss_clip": 0.01144754, "auxiliary_loss_mlp": 0.01042197, "balance_loss_clip": 1.02606773, "balance_loss_mlp": 1.04725683, "epoch": 0.2696527882158425, "flos": 32886596724480.0, "grad_norm": 1.4549921667059749, "language_loss": 0.66279745, "learning_rate": 3.324634742184744e-06, "loss": 0.68466699, "num_input_tokens_seen": 96998595, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8828125, "step": 4485, "time_per_iteration": 2.8191826343536377 }, { "auxiliary_loss_clip": 0.01156127, "auxiliary_loss_mlp": 0.01043794, "balance_loss_clip": 1.02693152, "balance_loss_mlp": 1.04893994, "epoch": 0.26971291146851045, "flos": 12124843320960.0, "grad_norm": 2.9206650374046506, "language_loss": 0.72917461, "learning_rate": 3.324351645221977e-06, "loss": 0.75117385, "num_input_tokens_seen": 97013715, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.890625, "step": 4486, "time_per_iteration": 2.5209271907806396 }, { "auxiliary_loss_clip": 0.0114298, "auxiliary_loss_mlp": 0.01046781, "balance_loss_clip": 1.03123617, "balance_loss_mlp": 1.05357718, "epoch": 0.2697730347211784, "flos": 22634459856000.0, "grad_norm": 1.9357701012211823, "language_loss": 0.83983457, "learning_rate": 3.3240685009965065e-06, "loss": 0.86173213, "num_input_tokens_seen": 97031570, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.89453125, "step": 4487, "time_per_iteration": 2.6504225730895996 }, { "auxiliary_loss_clip": 0.01168867, "auxiliary_loss_mlp": 0.010463, "balance_loss_clip": 1.02884793, "balance_loss_mlp": 1.05058956, "epoch": 0.2698331579738464, "flos": 23513050523520.0, "grad_norm": 2.200655000592663, "language_loss": 0.71720922, "learning_rate": 3.3237853095184365e-06, "loss": 0.73936093, "num_input_tokens_seen": 97049815, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9140625, "step": 4488, "time_per_iteration": 2.5829591751098633 }, { "auxiliary_loss_clip": 0.01194054, "auxiliary_loss_mlp": 0.01050996, "balance_loss_clip": 1.03424668, "balance_loss_mlp": 1.05061591, "epoch": 0.26989328122651435, "flos": 24641040297600.0, "grad_norm": 1.8009377757250307, "language_loss": 0.83950347, "learning_rate": 3.3235020707978747e-06, "loss": 0.86195397, "num_input_tokens_seen": 97067570, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8984375, "step": 4489, "time_per_iteration": 2.732771873474121 }, { "auxiliary_loss_clip": 0.01159156, "auxiliary_loss_mlp": 0.01057001, "balance_loss_clip": 1.03885782, "balance_loss_mlp": 1.05076337, "epoch": 0.2699534044791823, "flos": 10772555068800.0, "grad_norm": 3.2985044597643762, "language_loss": 0.89434314, "learning_rate": 3.323218784844928e-06, "loss": 0.91650474, "num_input_tokens_seen": 97082180, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.90234375, "step": 4490, "time_per_iteration": 2.4957616329193115 }, { "auxiliary_loss_clip": 0.01179503, "auxiliary_loss_mlp": 0.01037704, "balance_loss_clip": 1.02155137, "balance_loss_mlp": 1.04906309, "epoch": 0.2700135277318503, "flos": 36171597098880.0, "grad_norm": 2.385924789601232, "language_loss": 0.72927308, "learning_rate": 3.322935451669706e-06, "loss": 0.75144511, "num_input_tokens_seen": 97103470, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8515625, "step": 4491, "time_per_iteration": 2.760000705718994 }, { "auxiliary_loss_clip": 0.0114035, "auxiliary_loss_mlp": 0.01043749, "balance_loss_clip": 1.02713156, "balance_loss_mlp": 1.05199194, "epoch": 0.27007365098451824, "flos": 17418614866560.0, "grad_norm": 3.051929151662201, "language_loss": 0.74303526, "learning_rate": 3.322652071282322e-06, "loss": 0.76487625, "num_input_tokens_seen": 97118100, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8828125, "step": 4492, "time_per_iteration": 2.51242995262146 }, { "auxiliary_loss_clip": 0.011563, "auxiliary_loss_mlp": 0.01042317, "balance_loss_clip": 1.02503133, "balance_loss_mlp": 1.05044651, "epoch": 0.2701337742371862, "flos": 23185688947200.0, "grad_norm": 2.209237337965219, "language_loss": 0.89000911, "learning_rate": 3.3223686436928874e-06, "loss": 0.91199529, "num_input_tokens_seen": 97136765, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.87890625, "step": 4493, "time_per_iteration": 2.5851566791534424 }, { "auxiliary_loss_clip": 0.01147903, "auxiliary_loss_mlp": 0.01040454, "balance_loss_clip": 1.02479005, "balance_loss_mlp": 1.05108047, "epoch": 0.2701938974898542, "flos": 24389450461440.0, "grad_norm": 1.4934209892768524, "language_loss": 0.71143126, "learning_rate": 3.322085168911517e-06, "loss": 0.73331481, "num_input_tokens_seen": 97157470, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.875, "step": 4494, "time_per_iteration": 2.573383092880249 }, { "auxiliary_loss_clip": 0.01150657, "auxiliary_loss_mlp": 0.01038121, "balance_loss_clip": 1.02258813, "balance_loss_mlp": 1.04849172, "epoch": 0.2702540207425222, "flos": 26214322976640.0, "grad_norm": 1.9989556651605243, "language_loss": 0.86502951, "learning_rate": 3.321801646948328e-06, "loss": 0.88691735, "num_input_tokens_seen": 97176905, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.84375, "step": 4495, "time_per_iteration": 2.629298210144043 }, { "auxiliary_loss_clip": 0.01139026, "auxiliary_loss_mlp": 0.01049125, "balance_loss_clip": 1.03232813, "balance_loss_mlp": 1.05156267, "epoch": 0.27031414399519016, "flos": 22926377687040.0, "grad_norm": 4.794976877409425, "language_loss": 0.76634216, "learning_rate": 3.321518077813438e-06, "loss": 0.78822362, "num_input_tokens_seen": 97196380, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.875, "step": 4496, "time_per_iteration": 2.55867075920105 }, { "auxiliary_loss_clip": 0.0105396, "auxiliary_loss_mlp": 0.01024521, "balance_loss_clip": 1.02247036, "balance_loss_mlp": 1.02639842, "epoch": 0.2703742672478581, "flos": 63019780404480.0, "grad_norm": 0.7037835281100225, "language_loss": 0.50141108, "learning_rate": 3.321234461516967e-06, "loss": 0.52219582, "num_input_tokens_seen": 97260100, "router_z_loss_clip": 0.02050781, "router_z_loss_mlp": 0.27539062, "step": 4497, "time_per_iteration": 3.222191095352173 }, { "auxiliary_loss_clip": 0.01174847, "auxiliary_loss_mlp": 0.01042725, "balance_loss_clip": 1.02681041, "balance_loss_mlp": 1.05252743, "epoch": 0.2704343905005261, "flos": 18840820942080.0, "grad_norm": 1.6097102607469274, "language_loss": 0.72465265, "learning_rate": 3.3209507980690375e-06, "loss": 0.74682838, "num_input_tokens_seen": 97277935, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.86328125, "step": 4498, "time_per_iteration": 2.6178297996520996 }, { "auxiliary_loss_clip": 0.01079823, "auxiliary_loss_mlp": 0.01008047, "balance_loss_clip": 1.00603199, "balance_loss_mlp": 1.02533388, "epoch": 0.27049451375319405, "flos": 71233412618880.0, "grad_norm": 0.755151550522861, "language_loss": 0.59125274, "learning_rate": 3.3206670874797717e-06, "loss": 0.61213142, "num_input_tokens_seen": 97338845, "router_z_loss_clip": 0.0201416, "router_z_loss_mlp": 0.2734375, "step": 4499, "time_per_iteration": 3.2046868801116943 }, { "auxiliary_loss_clip": 0.01154241, "auxiliary_loss_mlp": 0.01037609, "balance_loss_clip": 1.02061009, "balance_loss_mlp": 1.05007386, "epoch": 0.270554637005862, "flos": 24278594112000.0, "grad_norm": 5.349922686111312, "language_loss": 0.73490041, "learning_rate": 3.3203833297592943e-06, "loss": 0.75681895, "num_input_tokens_seen": 97356640, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.86328125, "step": 4500, "time_per_iteration": 2.6519014835357666 }, { "auxiliary_loss_clip": 0.01153228, "auxiliary_loss_mlp": 0.01044696, "balance_loss_clip": 1.02702904, "balance_loss_mlp": 1.04765224, "epoch": 0.27061476025853, "flos": 17632318832640.0, "grad_norm": 2.9578161609536653, "language_loss": 0.80290276, "learning_rate": 3.3200995249177324e-06, "loss": 0.82488197, "num_input_tokens_seen": 97372585, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.875, "step": 4501, "time_per_iteration": 2.4821372032165527 }, { "auxiliary_loss_clip": 0.0116841, "auxiliary_loss_mlp": 0.01050895, "balance_loss_clip": 1.03377652, "balance_loss_mlp": 1.05181694, "epoch": 0.27067488351119795, "flos": 22710123855360.0, "grad_norm": 1.8129715957113193, "language_loss": 0.72361642, "learning_rate": 3.3198156729652144e-06, "loss": 0.74580944, "num_input_tokens_seen": 97393315, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8984375, "step": 4502, "time_per_iteration": 4.079127788543701 }, { "auxiliary_loss_clip": 0.01156983, "auxiliary_loss_mlp": 0.01046363, "balance_loss_clip": 1.0279212, "balance_loss_mlp": 1.04691529, "epoch": 0.2707350067638659, "flos": 41719616087040.0, "grad_norm": 1.8665475620796221, "language_loss": 0.68527997, "learning_rate": 3.31953177391187e-06, "loss": 0.70731342, "num_input_tokens_seen": 97417860, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.921875, "step": 4503, "time_per_iteration": 2.7824361324310303 }, { "auxiliary_loss_clip": 0.0117389, "auxiliary_loss_mlp": 0.01049072, "balance_loss_clip": 1.032776, "balance_loss_mlp": 1.05097914, "epoch": 0.2707951300165339, "flos": 20193037367040.0, "grad_norm": 3.084256688563234, "language_loss": 0.68200493, "learning_rate": 3.319247827767831e-06, "loss": 0.7042346, "num_input_tokens_seen": 97436780, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.87109375, "step": 4504, "time_per_iteration": 2.5805041790008545 }, { "auxiliary_loss_clip": 0.01146668, "auxiliary_loss_mlp": 0.01044174, "balance_loss_clip": 1.02876031, "balance_loss_mlp": 1.05227804, "epoch": 0.27085525326920185, "flos": 21433966479360.0, "grad_norm": 1.4778049012245513, "language_loss": 0.75300634, "learning_rate": 3.31896383454323e-06, "loss": 0.7749148, "num_input_tokens_seen": 97456190, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.859375, "step": 4505, "time_per_iteration": 2.6139414310455322 }, { "auxiliary_loss_clip": 0.01142404, "auxiliary_loss_mlp": 0.01049971, "balance_loss_clip": 1.03204203, "balance_loss_mlp": 1.05108762, "epoch": 0.2709153765218698, "flos": 17675232606720.0, "grad_norm": 1.9004941046277877, "language_loss": 0.73012018, "learning_rate": 3.3186797942482025e-06, "loss": 0.75204402, "num_input_tokens_seen": 97474545, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9140625, "step": 4506, "time_per_iteration": 4.0061047077178955 }, { "auxiliary_loss_clip": 0.01158466, "auxiliary_loss_mlp": 0.01047386, "balance_loss_clip": 1.0297786, "balance_loss_mlp": 1.05133808, "epoch": 0.2709754997745378, "flos": 24456243801600.0, "grad_norm": 2.1870237082292348, "language_loss": 0.80970383, "learning_rate": 3.3183957068928855e-06, "loss": 0.83176231, "num_input_tokens_seen": 97494520, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.890625, "step": 4507, "time_per_iteration": 4.118772506713867 }, { "auxiliary_loss_clip": 0.01068862, "auxiliary_loss_mlp": 0.01022588, "balance_loss_clip": 1.02064466, "balance_loss_mlp": 1.02289367, "epoch": 0.2710356230272058, "flos": 65210798206080.0, "grad_norm": 0.9026292775043396, "language_loss": 0.5080812, "learning_rate": 3.318111572487417e-06, "loss": 0.52899569, "num_input_tokens_seen": 97552455, "router_z_loss_clip": 0.01940918, "router_z_loss_mlp": 0.27929688, "step": 4508, "time_per_iteration": 3.0971670150756836 }, { "auxiliary_loss_clip": 0.01179393, "auxiliary_loss_mlp": 0.01036845, "balance_loss_clip": 1.02120471, "balance_loss_mlp": 1.04973209, "epoch": 0.27109574627987376, "flos": 25484438615040.0, "grad_norm": 1.8093452086443833, "language_loss": 0.7450648, "learning_rate": 3.3178273910419376e-06, "loss": 0.76722717, "num_input_tokens_seen": 97572650, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.84375, "step": 4509, "time_per_iteration": 4.098143815994263 }, { "auxiliary_loss_clip": 0.01140518, "auxiliary_loss_mlp": 0.01038759, "balance_loss_clip": 1.02370286, "balance_loss_mlp": 1.04817593, "epoch": 0.2711558695325417, "flos": 19682782715520.0, "grad_norm": 1.7743248966780583, "language_loss": 0.71734703, "learning_rate": 3.3175431625665876e-06, "loss": 0.7391398, "num_input_tokens_seen": 97591150, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8359375, "step": 4510, "time_per_iteration": 2.5397164821624756 }, { "auxiliary_loss_clip": 0.01147375, "auxiliary_loss_mlp": 0.0103355, "balance_loss_clip": 1.01767123, "balance_loss_mlp": 1.05236149, "epoch": 0.2712159927852097, "flos": 18587758648320.0, "grad_norm": 2.4819138297694776, "language_loss": 0.69726342, "learning_rate": 3.317258887071512e-06, "loss": 0.71907264, "num_input_tokens_seen": 97607410, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.859375, "step": 4511, "time_per_iteration": 2.537036657333374 }, { "auxiliary_loss_clip": 0.01156752, "auxiliary_loss_mlp": 0.01045023, "balance_loss_clip": 1.02827382, "balance_loss_mlp": 1.04991555, "epoch": 0.27127611603787766, "flos": 25630235919360.0, "grad_norm": 2.1933945425414083, "language_loss": 0.81650513, "learning_rate": 3.3169745645668546e-06, "loss": 0.83852291, "num_input_tokens_seen": 97626870, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8828125, "step": 4512, "time_per_iteration": 2.5916695594787598 }, { "auxiliary_loss_clip": 0.01132746, "auxiliary_loss_mlp": 0.01038729, "balance_loss_clip": 1.02434039, "balance_loss_mlp": 1.04878008, "epoch": 0.2713362392905456, "flos": 23148952312320.0, "grad_norm": 1.6057227676280605, "language_loss": 0.80345589, "learning_rate": 3.3166901950627627e-06, "loss": 0.82517064, "num_input_tokens_seen": 97646595, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.83984375, "step": 4513, "time_per_iteration": 2.609365224838257 }, { "auxiliary_loss_clip": 0.01154223, "auxiliary_loss_mlp": 0.01041549, "balance_loss_clip": 1.02612364, "balance_loss_mlp": 1.04869926, "epoch": 0.2713963625432136, "flos": 18366045949440.0, "grad_norm": 1.7474908622158587, "language_loss": 0.87996888, "learning_rate": 3.3164057785693846e-06, "loss": 0.90192652, "num_input_tokens_seen": 97665485, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.875, "step": 4514, "time_per_iteration": 2.5456738471984863 }, { "auxiliary_loss_clip": 0.01153931, "auxiliary_loss_mlp": 0.01046972, "balance_loss_clip": 1.03105748, "balance_loss_mlp": 1.05109346, "epoch": 0.27145648579588155, "flos": 22491751121280.0, "grad_norm": 1.5954918500303326, "language_loss": 0.92148882, "learning_rate": 3.316121315096871e-06, "loss": 0.9434979, "num_input_tokens_seen": 97683800, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8515625, "step": 4515, "time_per_iteration": 2.6782710552215576 }, { "auxiliary_loss_clip": 0.01178085, "auxiliary_loss_mlp": 0.01056131, "balance_loss_clip": 1.03790426, "balance_loss_mlp": 1.05121136, "epoch": 0.2715166090485495, "flos": 19239177749760.0, "grad_norm": 2.6421422286489045, "language_loss": 0.73710889, "learning_rate": 3.3158368046553724e-06, "loss": 0.75945103, "num_input_tokens_seen": 97700505, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.90625, "step": 4516, "time_per_iteration": 2.5832889080047607 }, { "auxiliary_loss_clip": 0.01146106, "auxiliary_loss_mlp": 0.01045653, "balance_loss_clip": 1.02891564, "balance_loss_mlp": 1.05039871, "epoch": 0.2715767323012175, "flos": 17709598944000.0, "grad_norm": 2.187970389219424, "language_loss": 0.76350331, "learning_rate": 3.315552247255043e-06, "loss": 0.78542089, "num_input_tokens_seen": 97717410, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8671875, "step": 4517, "time_per_iteration": 2.681813955307007 }, { "auxiliary_loss_clip": 0.01163619, "auxiliary_loss_mlp": 0.01045196, "balance_loss_clip": 1.02842295, "balance_loss_mlp": 1.05016875, "epoch": 0.27163685555388545, "flos": 22382834106240.0, "grad_norm": 2.674109354487891, "language_loss": 0.76771653, "learning_rate": 3.3152676429060385e-06, "loss": 0.7898047, "num_input_tokens_seen": 97734545, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8671875, "step": 4518, "time_per_iteration": 2.6500096321105957 }, { "auxiliary_loss_clip": 0.01153939, "auxiliary_loss_mlp": 0.01046486, "balance_loss_clip": 1.03144145, "balance_loss_mlp": 1.04864669, "epoch": 0.2716969788065534, "flos": 22346708002560.0, "grad_norm": 1.8041367338749907, "language_loss": 0.67987436, "learning_rate": 3.3149829916185147e-06, "loss": 0.70187855, "num_input_tokens_seen": 97754000, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.87109375, "step": 4519, "time_per_iteration": 2.648810863494873 }, { "auxiliary_loss_clip": 0.01142184, "auxiliary_loss_mlp": 0.01046105, "balance_loss_clip": 1.03110886, "balance_loss_mlp": 1.04824793, "epoch": 0.2717571020592214, "flos": 25228467319680.0, "grad_norm": 1.8750657483529076, "language_loss": 0.7526195, "learning_rate": 3.314698293402631e-06, "loss": 0.7745024, "num_input_tokens_seen": 97772080, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8515625, "step": 4520, "time_per_iteration": 2.5723979473114014 }, { "auxiliary_loss_clip": 0.01150068, "auxiliary_loss_mlp": 0.01055428, "balance_loss_clip": 1.03752291, "balance_loss_mlp": 1.0528537, "epoch": 0.2718172253118894, "flos": 20189769229440.0, "grad_norm": 2.2076449667461975, "language_loss": 0.76197314, "learning_rate": 3.314413548268546e-06, "loss": 0.78402805, "num_input_tokens_seen": 97789370, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.8828125, "step": 4521, "time_per_iteration": 2.5663042068481445 }, { "auxiliary_loss_clip": 0.01157677, "auxiliary_loss_mlp": 0.01050794, "balance_loss_clip": 1.03458166, "balance_loss_mlp": 1.05088556, "epoch": 0.27187734856455736, "flos": 14319129260160.0, "grad_norm": 2.2119237034837647, "language_loss": 0.75004518, "learning_rate": 3.3141287562264232e-06, "loss": 0.77212983, "num_input_tokens_seen": 97807385, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.88671875, "step": 4522, "time_per_iteration": 2.5241618156433105 }, { "auxiliary_loss_clip": 0.01140278, "auxiliary_loss_mlp": 0.01039318, "balance_loss_clip": 1.02330852, "balance_loss_mlp": 1.05226457, "epoch": 0.27193747181722533, "flos": 21107682311040.0, "grad_norm": 1.7537924413697497, "language_loss": 0.7217831, "learning_rate": 3.3138439172864258e-06, "loss": 0.74357903, "num_input_tokens_seen": 97827930, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.87890625, "step": 4523, "time_per_iteration": 2.5603199005126953 }, { "auxiliary_loss_clip": 0.01150196, "auxiliary_loss_mlp": 0.01043238, "balance_loss_clip": 1.02691793, "balance_loss_mlp": 1.04758871, "epoch": 0.2719975950698933, "flos": 19682782715520.0, "grad_norm": 1.5053221512998856, "language_loss": 0.74726665, "learning_rate": 3.313559031458718e-06, "loss": 0.76920104, "num_input_tokens_seen": 97847440, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84375, "step": 4524, "time_per_iteration": 2.5560834407806396 }, { "auxiliary_loss_clip": 0.01138095, "auxiliary_loss_mlp": 0.01046454, "balance_loss_clip": 1.03051615, "balance_loss_mlp": 1.05056465, "epoch": 0.27205771832256126, "flos": 24754482426240.0, "grad_norm": 1.5392389575785677, "language_loss": 0.7631799, "learning_rate": 3.313274098753467e-06, "loss": 0.78502548, "num_input_tokens_seen": 97867620, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.875, "step": 4525, "time_per_iteration": 2.551682949066162 }, { "auxiliary_loss_clip": 0.01142617, "auxiliary_loss_mlp": 0.01053297, "balance_loss_clip": 1.03700078, "balance_loss_mlp": 1.04846716, "epoch": 0.2721178415752292, "flos": 21755581879680.0, "grad_norm": 1.91075446988448, "language_loss": 0.81835431, "learning_rate": 3.3129891191808423e-06, "loss": 0.84031349, "num_input_tokens_seen": 97884345, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8515625, "step": 4526, "time_per_iteration": 2.5705459117889404 }, { "auxiliary_loss_clip": 0.01142576, "auxiliary_loss_mlp": 0.01048424, "balance_loss_clip": 1.03123403, "balance_loss_mlp": 1.05126262, "epoch": 0.2721779648278972, "flos": 12676826597760.0, "grad_norm": 7.779974888628395, "language_loss": 0.76451355, "learning_rate": 3.312704092751013e-06, "loss": 0.78642356, "num_input_tokens_seen": 97901500, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9140625, "step": 4527, "time_per_iteration": 2.5427029132843018 }, { "auxiliary_loss_clip": 0.01163268, "auxiliary_loss_mlp": 0.01295836, "balance_loss_clip": 1.0316987, "balance_loss_mlp": 1.04940522, "epoch": 0.27223808808056515, "flos": 16253206099200.0, "grad_norm": 2.135142843455407, "language_loss": 0.82686102, "learning_rate": 3.312419019474151e-06, "loss": 0.85145205, "num_input_tokens_seen": 97917800, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8671875, "step": 4528, "time_per_iteration": 2.7114241123199463 }, { "auxiliary_loss_clip": 0.01153844, "auxiliary_loss_mlp": 0.01050116, "balance_loss_clip": 1.03463089, "balance_loss_mlp": 1.05067801, "epoch": 0.2722982113332331, "flos": 27745805203200.0, "grad_norm": 3.3959856381118243, "language_loss": 0.76995718, "learning_rate": 3.3121338993604306e-06, "loss": 0.79199678, "num_input_tokens_seen": 97937225, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8515625, "step": 4529, "time_per_iteration": 2.660301923751831 }, { "auxiliary_loss_clip": 0.01156848, "auxiliary_loss_mlp": 0.01048542, "balance_loss_clip": 1.03346241, "balance_loss_mlp": 1.05051231, "epoch": 0.2723583345859011, "flos": 21726243446400.0, "grad_norm": 2.2926483217302884, "language_loss": 0.81742042, "learning_rate": 3.3118487324200267e-06, "loss": 0.83947432, "num_input_tokens_seen": 97956845, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.88671875, "step": 4530, "time_per_iteration": 2.6747887134552 }, { "auxiliary_loss_clip": 0.01146299, "auxiliary_loss_mlp": 0.01041869, "balance_loss_clip": 1.02603805, "balance_loss_mlp": 1.04932261, "epoch": 0.27241845783856905, "flos": 17347260499200.0, "grad_norm": 2.033918976018382, "language_loss": 0.91217995, "learning_rate": 3.3115635186631156e-06, "loss": 0.93406159, "num_input_tokens_seen": 97972465, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8828125, "step": 4531, "time_per_iteration": 2.569916248321533 }, { "auxiliary_loss_clip": 0.01145576, "auxiliary_loss_mlp": 0.01042809, "balance_loss_clip": 1.02676296, "balance_loss_mlp": 1.04833865, "epoch": 0.272478581091237, "flos": 24754302858240.0, "grad_norm": 2.693621883201803, "language_loss": 0.76542574, "learning_rate": 3.3112782580998767e-06, "loss": 0.78730953, "num_input_tokens_seen": 97990770, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8828125, "step": 4532, "time_per_iteration": 2.755518913269043 }, { "auxiliary_loss_clip": 0.01159211, "auxiliary_loss_mlp": 0.01036104, "balance_loss_clip": 1.02004623, "balance_loss_mlp": 1.04849029, "epoch": 0.272538704343905, "flos": 17890624512000.0, "grad_norm": 9.417872595517865, "language_loss": 0.8919552, "learning_rate": 3.3109929507404895e-06, "loss": 0.91390836, "num_input_tokens_seen": 98005775, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8359375, "step": 4533, "time_per_iteration": 2.547581672668457 }, { "auxiliary_loss_clip": 0.01150937, "auxiliary_loss_mlp": 0.01036926, "balance_loss_clip": 1.02113104, "balance_loss_mlp": 1.04875481, "epoch": 0.272598827596573, "flos": 22932016122240.0, "grad_norm": 1.7545047309644235, "language_loss": 0.71268511, "learning_rate": 3.3107075965951355e-06, "loss": 0.73456371, "num_input_tokens_seen": 98025750, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84375, "step": 4534, "time_per_iteration": 2.6742196083068848 }, { "auxiliary_loss_clip": 0.01154062, "auxiliary_loss_mlp": 0.01043645, "balance_loss_clip": 1.02578735, "balance_loss_mlp": 1.04817176, "epoch": 0.27265895084924097, "flos": 24238409771520.0, "grad_norm": 1.7889996470739036, "language_loss": 0.91415346, "learning_rate": 3.3104221956739996e-06, "loss": 0.93613052, "num_input_tokens_seen": 98044955, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.8828125, "step": 4535, "time_per_iteration": 2.5534162521362305 }, { "auxiliary_loss_clip": 0.0115917, "auxiliary_loss_mlp": 0.01044388, "balance_loss_clip": 1.02766275, "balance_loss_mlp": 1.05350924, "epoch": 0.27271907410190893, "flos": 27013155494400.0, "grad_norm": 2.0335563598342405, "language_loss": 0.7255578, "learning_rate": 3.3101367479872667e-06, "loss": 0.74759334, "num_input_tokens_seen": 98065860, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.87890625, "step": 4536, "time_per_iteration": 2.652676582336426 }, { "auxiliary_loss_clip": 0.01143325, "auxiliary_loss_mlp": 0.01037662, "balance_loss_clip": 1.02106762, "balance_loss_mlp": 1.04748821, "epoch": 0.2727791973545769, "flos": 34452588942720.0, "grad_norm": 1.6208212094854457, "language_loss": 0.71556598, "learning_rate": 3.309851253545123e-06, "loss": 0.73737586, "num_input_tokens_seen": 98085450, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8671875, "step": 4537, "time_per_iteration": 2.7337074279785156 }, { "auxiliary_loss_clip": 0.01153163, "auxiliary_loss_mlp": 0.01041458, "balance_loss_clip": 1.02518559, "balance_loss_mlp": 1.04770041, "epoch": 0.27283932060724486, "flos": 15041723160960.0, "grad_norm": 2.36577680365254, "language_loss": 0.78136253, "learning_rate": 3.3095657123577572e-06, "loss": 0.80330873, "num_input_tokens_seen": 98099115, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.875, "step": 4538, "time_per_iteration": 2.600581645965576 }, { "auxiliary_loss_clip": 0.01144073, "auxiliary_loss_mlp": 0.01041508, "balance_loss_clip": 1.02529597, "balance_loss_mlp": 1.04872417, "epoch": 0.2728994438599128, "flos": 21652411040640.0, "grad_norm": 1.8776140065040008, "language_loss": 0.90220547, "learning_rate": 3.30928012443536e-06, "loss": 0.92406124, "num_input_tokens_seen": 98118415, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.86328125, "step": 4539, "time_per_iteration": 2.560664176940918 }, { "auxiliary_loss_clip": 0.01153504, "auxiliary_loss_mlp": 0.01041137, "balance_loss_clip": 1.02381587, "balance_loss_mlp": 1.04956937, "epoch": 0.2729595671125808, "flos": 17488424949120.0, "grad_norm": 2.5915820530920244, "language_loss": 0.88131475, "learning_rate": 3.308994489788123e-06, "loss": 0.90326118, "num_input_tokens_seen": 98136300, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.859375, "step": 4540, "time_per_iteration": 2.61480450630188 }, { "auxiliary_loss_clip": 0.01144761, "auxiliary_loss_mlp": 0.012849, "balance_loss_clip": 1.02115846, "balance_loss_mlp": 1.04780018, "epoch": 0.27301969036524876, "flos": 19318145800320.0, "grad_norm": 2.3073774793976622, "language_loss": 0.81978077, "learning_rate": 3.308708808426239e-06, "loss": 0.84407735, "num_input_tokens_seen": 98154580, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8828125, "step": 4541, "time_per_iteration": 2.554784059524536 }, { "auxiliary_loss_clip": 0.01151862, "auxiliary_loss_mlp": 0.0104884, "balance_loss_clip": 1.03263915, "balance_loss_mlp": 1.04769647, "epoch": 0.2730798136179167, "flos": 21065666376960.0, "grad_norm": 1.765513919226599, "language_loss": 0.79500967, "learning_rate": 3.308423080359905e-06, "loss": 0.81701666, "num_input_tokens_seen": 98173115, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.86328125, "step": 4542, "time_per_iteration": 2.6201112270355225 }, { "auxiliary_loss_clip": 0.01154597, "auxiliary_loss_mlp": 0.01039868, "balance_loss_clip": 1.02370298, "balance_loss_mlp": 1.05058527, "epoch": 0.2731399368705847, "flos": 19171737964800.0, "grad_norm": 2.020481263243143, "language_loss": 0.89396226, "learning_rate": 3.3081373055993167e-06, "loss": 0.91590691, "num_input_tokens_seen": 98190260, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.86328125, "step": 4543, "time_per_iteration": 4.05025053024292 }, { "auxiliary_loss_clip": 0.01158718, "auxiliary_loss_mlp": 0.01292647, "balance_loss_clip": 1.0278722, "balance_loss_mlp": 1.05068409, "epoch": 0.27320006012325265, "flos": 18290130554880.0, "grad_norm": 6.441842070591232, "language_loss": 0.6305443, "learning_rate": 3.3078514841546728e-06, "loss": 0.65505791, "num_input_tokens_seen": 98207115, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8984375, "step": 4544, "time_per_iteration": 2.546149969100952 }, { "auxiliary_loss_clip": 0.01156941, "auxiliary_loss_mlp": 0.0104645, "balance_loss_clip": 1.02839017, "balance_loss_mlp": 1.05253541, "epoch": 0.2732601833759206, "flos": 34860929731200.0, "grad_norm": 1.7102128537368986, "language_loss": 0.69860864, "learning_rate": 3.307565616036174e-06, "loss": 0.72064251, "num_input_tokens_seen": 98230610, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.859375, "step": 4545, "time_per_iteration": 2.7638185024261475 }, { "auxiliary_loss_clip": 0.01066846, "auxiliary_loss_mlp": 0.0100463, "balance_loss_clip": 1.00291371, "balance_loss_mlp": 1.02093339, "epoch": 0.2733203066285886, "flos": 53910824762880.0, "grad_norm": 0.7866206382225033, "language_loss": 0.61607403, "learning_rate": 3.3072797012540214e-06, "loss": 0.63678879, "num_input_tokens_seen": 98293585, "router_z_loss_clip": 0.01721191, "router_z_loss_mlp": 0.27539062, "step": 4546, "time_per_iteration": 3.2283058166503906 }, { "auxiliary_loss_clip": 0.01201652, "auxiliary_loss_mlp": 0.01052474, "balance_loss_clip": 1.035272, "balance_loss_mlp": 1.05281401, "epoch": 0.2733804298812566, "flos": 20660378244480.0, "grad_norm": 1.8266337580158676, "language_loss": 0.64594948, "learning_rate": 3.306993739818419e-06, "loss": 0.66849077, "num_input_tokens_seen": 98311680, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.859375, "step": 4547, "time_per_iteration": 2.6831295490264893 }, { "auxiliary_loss_clip": 0.01151658, "auxiliary_loss_mlp": 0.01288443, "balance_loss_clip": 1.02466798, "balance_loss_mlp": 1.05033207, "epoch": 0.27344055313392457, "flos": 25884339707520.0, "grad_norm": 2.022310282905737, "language_loss": 0.76952338, "learning_rate": 3.3067077317395722e-06, "loss": 0.79392433, "num_input_tokens_seen": 98330770, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8359375, "step": 4548, "time_per_iteration": 4.016105890274048 }, { "auxiliary_loss_clip": 0.01151552, "auxiliary_loss_mlp": 0.01047749, "balance_loss_clip": 1.03320551, "balance_loss_mlp": 1.0491755, "epoch": 0.27350067638659253, "flos": 22929753565440.0, "grad_norm": 1.8999936995239926, "language_loss": 0.82447642, "learning_rate": 3.3064216770276874e-06, "loss": 0.8464694, "num_input_tokens_seen": 98349860, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.84375, "step": 4549, "time_per_iteration": 4.1789631843566895 }, { "auxiliary_loss_clip": 0.01144221, "auxiliary_loss_mlp": 0.01045484, "balance_loss_clip": 1.02794886, "balance_loss_mlp": 1.04758096, "epoch": 0.2735607996392605, "flos": 16574821499520.0, "grad_norm": 2.252807124225989, "language_loss": 0.70902932, "learning_rate": 3.3061355756929733e-06, "loss": 0.73092639, "num_input_tokens_seen": 98367040, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.875, "step": 4550, "time_per_iteration": 4.1036927700042725 }, { "auxiliary_loss_clip": 0.01162101, "auxiliary_loss_mlp": 0.0103844, "balance_loss_clip": 1.02325296, "balance_loss_mlp": 1.0501008, "epoch": 0.27362092289192846, "flos": 19645291895040.0, "grad_norm": 3.321808032350485, "language_loss": 0.77814519, "learning_rate": 3.305849427745641e-06, "loss": 0.80015063, "num_input_tokens_seen": 98384010, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8515625, "step": 4551, "time_per_iteration": 2.616955518722534 }, { "auxiliary_loss_clip": 0.01163903, "auxiliary_loss_mlp": 0.01044365, "balance_loss_clip": 1.02764034, "balance_loss_mlp": 1.05058312, "epoch": 0.27368104614459643, "flos": 17639142416640.0, "grad_norm": 2.3151449736634255, "language_loss": 0.70559859, "learning_rate": 3.305563233195901e-06, "loss": 0.72768128, "num_input_tokens_seen": 98399625, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.86328125, "step": 4552, "time_per_iteration": 2.572216272354126 }, { "auxiliary_loss_clip": 0.01156634, "auxiliary_loss_mlp": 0.01041976, "balance_loss_clip": 1.02531064, "balance_loss_mlp": 1.05234814, "epoch": 0.2737411693972644, "flos": 21580015178880.0, "grad_norm": 2.1496061701821065, "language_loss": 0.71648532, "learning_rate": 3.305276992053968e-06, "loss": 0.73847139, "num_input_tokens_seen": 98417310, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8671875, "step": 4553, "time_per_iteration": 2.5461814403533936 }, { "auxiliary_loss_clip": 0.01144104, "auxiliary_loss_mlp": 0.01042517, "balance_loss_clip": 1.02520812, "balance_loss_mlp": 1.04910517, "epoch": 0.27380129264993236, "flos": 25484043565440.0, "grad_norm": 1.7230747740774792, "language_loss": 0.58827889, "learning_rate": 3.304990704330057e-06, "loss": 0.61014509, "num_input_tokens_seen": 98438670, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.859375, "step": 4554, "time_per_iteration": 2.6604714393615723 }, { "auxiliary_loss_clip": 0.01160592, "auxiliary_loss_mlp": 0.01037504, "balance_loss_clip": 1.02050447, "balance_loss_mlp": 1.05303538, "epoch": 0.2738614159026003, "flos": 18661196004480.0, "grad_norm": 1.7666053739398822, "language_loss": 0.73614365, "learning_rate": 3.304704370034384e-06, "loss": 0.75812459, "num_input_tokens_seen": 98456060, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8984375, "step": 4555, "time_per_iteration": 2.5994396209716797 }, { "auxiliary_loss_clip": 0.0115054, "auxiliary_loss_mlp": 0.0103734, "balance_loss_clip": 1.02103186, "balance_loss_mlp": 1.05367398, "epoch": 0.2739215391552683, "flos": 23477139901440.0, "grad_norm": 1.8821789013782833, "language_loss": 0.77443361, "learning_rate": 3.3044179891771684e-06, "loss": 0.79631233, "num_input_tokens_seen": 98473765, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.87890625, "step": 4556, "time_per_iteration": 2.6471967697143555 }, { "auxiliary_loss_clip": 0.01156391, "auxiliary_loss_mlp": 0.0105117, "balance_loss_clip": 1.03303874, "balance_loss_mlp": 1.05142891, "epoch": 0.27398166240793626, "flos": 17128636369920.0, "grad_norm": 1.8884218496021, "language_loss": 0.8239361, "learning_rate": 3.3041315617686298e-06, "loss": 0.84601176, "num_input_tokens_seen": 98490590, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9609375, "step": 4557, "time_per_iteration": 2.495164394378662 }, { "auxiliary_loss_clip": 0.01175453, "auxiliary_loss_mlp": 0.01038225, "balance_loss_clip": 1.02291822, "balance_loss_mlp": 1.05181646, "epoch": 0.2740417856606042, "flos": 23404744039680.0, "grad_norm": 1.9055148350008393, "language_loss": 0.72912139, "learning_rate": 3.303845087818991e-06, "loss": 0.75125813, "num_input_tokens_seen": 98510590, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.87109375, "step": 4558, "time_per_iteration": 2.675974130630493 }, { "auxiliary_loss_clip": 0.01153782, "auxiliary_loss_mlp": 0.01048826, "balance_loss_clip": 1.03207684, "balance_loss_mlp": 1.04849482, "epoch": 0.2741019089132722, "flos": 12780428400000.0, "grad_norm": 2.958932877808387, "language_loss": 0.68358213, "learning_rate": 3.3035585673384745e-06, "loss": 0.70560813, "num_input_tokens_seen": 98527875, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.875, "step": 4559, "time_per_iteration": 2.5471770763397217 }, { "auxiliary_loss_clip": 0.0115295, "auxiliary_loss_mlp": 0.01049965, "balance_loss_clip": 1.033216, "balance_loss_mlp": 1.04962897, "epoch": 0.27416203216594015, "flos": 20631542601600.0, "grad_norm": 1.6960516078716437, "language_loss": 0.71994901, "learning_rate": 3.3032720003373057e-06, "loss": 0.74197817, "num_input_tokens_seen": 98547575, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8515625, "step": 4560, "time_per_iteration": 2.5940020084381104 }, { "auxiliary_loss_clip": 0.01155024, "auxiliary_loss_mlp": 0.01043717, "balance_loss_clip": 1.02736127, "balance_loss_mlp": 1.04922819, "epoch": 0.27422215541860817, "flos": 26541576812160.0, "grad_norm": 1.7879857494222458, "language_loss": 0.81270146, "learning_rate": 3.302985386825712e-06, "loss": 0.83468884, "num_input_tokens_seen": 98566290, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87890625, "step": 4561, "time_per_iteration": 2.574481964111328 }, { "auxiliary_loss_clip": 0.01137364, "auxiliary_loss_mlp": 0.01039392, "balance_loss_clip": 1.02389503, "balance_loss_mlp": 1.04901564, "epoch": 0.27428227867127614, "flos": 23331163029120.0, "grad_norm": 1.7248341457459544, "language_loss": 0.75156897, "learning_rate": 3.302698726813921e-06, "loss": 0.77333653, "num_input_tokens_seen": 98586255, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8828125, "step": 4562, "time_per_iteration": 2.6372768878936768 }, { "auxiliary_loss_clip": 0.01135838, "auxiliary_loss_mlp": 0.01038176, "balance_loss_clip": 1.02273858, "balance_loss_mlp": 1.05004179, "epoch": 0.2743424019239441, "flos": 23035115134080.0, "grad_norm": 1.6762417559613256, "language_loss": 0.74308234, "learning_rate": 3.3024120203121637e-06, "loss": 0.76482248, "num_input_tokens_seen": 98606030, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.85546875, "step": 4563, "time_per_iteration": 2.511845827102661 }, { "auxiliary_loss_clip": 0.01166348, "auxiliary_loss_mlp": 0.01045184, "balance_loss_clip": 1.02706361, "balance_loss_mlp": 1.04903126, "epoch": 0.27440252517661207, "flos": 21981101420160.0, "grad_norm": 2.003009528551499, "language_loss": 0.62612915, "learning_rate": 3.302125267330672e-06, "loss": 0.64824444, "num_input_tokens_seen": 98625225, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.90234375, "step": 4564, "time_per_iteration": 2.6411662101745605 }, { "auxiliary_loss_clip": 0.01155856, "auxiliary_loss_mlp": 0.01041912, "balance_loss_clip": 1.02437687, "balance_loss_mlp": 1.05013609, "epoch": 0.27446264842928003, "flos": 40187451502080.0, "grad_norm": 1.609645330610846, "language_loss": 0.77857727, "learning_rate": 3.3018384678796786e-06, "loss": 0.80055493, "num_input_tokens_seen": 98649470, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.87890625, "step": 4565, "time_per_iteration": 2.67384934425354 }, { "auxiliary_loss_clip": 0.01172292, "auxiliary_loss_mlp": 0.01041646, "balance_loss_clip": 1.02591085, "balance_loss_mlp": 1.05021513, "epoch": 0.274522771681948, "flos": 13479681438720.0, "grad_norm": 1.9356921476872528, "language_loss": 0.68062955, "learning_rate": 3.3015516219694186e-06, "loss": 0.70276892, "num_input_tokens_seen": 98666915, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.859375, "step": 4566, "time_per_iteration": 2.5695197582244873 }, { "auxiliary_loss_clip": 0.01141969, "auxiliary_loss_mlp": 0.01046286, "balance_loss_clip": 1.03170645, "balance_loss_mlp": 1.05002308, "epoch": 0.27458289493461596, "flos": 28622133313920.0, "grad_norm": 1.6294399250881832, "language_loss": 0.61200571, "learning_rate": 3.3012647296101296e-06, "loss": 0.63388824, "num_input_tokens_seen": 98688240, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.828125, "step": 4567, "time_per_iteration": 2.5693602561950684 }, { "auxiliary_loss_clip": 0.011439, "auxiliary_loss_mlp": 0.01043518, "balance_loss_clip": 1.02674556, "balance_loss_mlp": 1.04905868, "epoch": 0.2746430181872839, "flos": 20119815492480.0, "grad_norm": 1.832297241220847, "language_loss": 0.82259107, "learning_rate": 3.30097779081205e-06, "loss": 0.84446526, "num_input_tokens_seen": 98708245, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.859375, "step": 4568, "time_per_iteration": 2.5665488243103027 }, { "auxiliary_loss_clip": 0.0114887, "auxiliary_loss_mlp": 0.01033395, "balance_loss_clip": 1.01714683, "balance_loss_mlp": 1.05233669, "epoch": 0.2747031414399519, "flos": 20193468330240.0, "grad_norm": 1.9622125767911351, "language_loss": 0.68338311, "learning_rate": 3.300690805585419e-06, "loss": 0.7052058, "num_input_tokens_seen": 98724575, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.875, "step": 4569, "time_per_iteration": 2.4937899112701416 }, { "auxiliary_loss_clip": 0.0115404, "auxiliary_loss_mlp": 0.01042616, "balance_loss_clip": 1.02590334, "balance_loss_mlp": 1.04862571, "epoch": 0.27476326469261986, "flos": 13516346246400.0, "grad_norm": 2.2260928715950103, "language_loss": 0.71170014, "learning_rate": 3.300403773940479e-06, "loss": 0.73366666, "num_input_tokens_seen": 98740700, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.875, "step": 4570, "time_per_iteration": 2.5542469024658203 }, { "auxiliary_loss_clip": 0.01080783, "auxiliary_loss_mlp": 0.01006763, "balance_loss_clip": 1.00516534, "balance_loss_mlp": 1.02601397, "epoch": 0.2748233879452878, "flos": 65937127121280.0, "grad_norm": 0.7387680739179158, "language_loss": 0.55776149, "learning_rate": 3.3001166958874738e-06, "loss": 0.57863694, "num_input_tokens_seen": 98803030, "router_z_loss_clip": 0.01599121, "router_z_loss_mlp": 0.28125, "step": 4571, "time_per_iteration": 3.295581102371216 }, { "auxiliary_loss_clip": 0.01176721, "auxiliary_loss_mlp": 0.01039159, "balance_loss_clip": 1.02341104, "balance_loss_mlp": 1.05219102, "epoch": 0.2748835111979558, "flos": 17384212615680.0, "grad_norm": 2.93965851477075, "language_loss": 0.77860522, "learning_rate": 3.299829571436648e-06, "loss": 0.80076396, "num_input_tokens_seen": 98820505, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.88671875, "step": 4572, "time_per_iteration": 2.586408853530884 }, { "auxiliary_loss_clip": 0.01131081, "auxiliary_loss_mlp": 0.01037031, "balance_loss_clip": 1.02291703, "balance_loss_mlp": 1.04918778, "epoch": 0.27494363445062375, "flos": 23587565287680.0, "grad_norm": 1.6857578611908395, "language_loss": 0.81231415, "learning_rate": 3.2995424005982475e-06, "loss": 0.83399522, "num_input_tokens_seen": 98842150, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8203125, "step": 4573, "time_per_iteration": 2.686232089996338 }, { "auxiliary_loss_clip": 0.01140091, "auxiliary_loss_mlp": 0.01041702, "balance_loss_clip": 1.02585888, "balance_loss_mlp": 1.04846323, "epoch": 0.2750037577032918, "flos": 17164582905600.0, "grad_norm": 2.1742015692418906, "language_loss": 0.79058969, "learning_rate": 3.299255183382522e-06, "loss": 0.81240761, "num_input_tokens_seen": 98861050, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.828125, "step": 4574, "time_per_iteration": 2.5571703910827637 }, { "auxiliary_loss_clip": 0.01169369, "auxiliary_loss_mlp": 0.01043031, "balance_loss_clip": 1.02785563, "balance_loss_mlp": 1.0480876, "epoch": 0.27506388095595974, "flos": 24491903028480.0, "grad_norm": 2.1976986064362305, "language_loss": 0.74137664, "learning_rate": 3.298967919799722e-06, "loss": 0.76350069, "num_input_tokens_seen": 98879695, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.85546875, "step": 4575, "time_per_iteration": 2.651573419570923 }, { "auxiliary_loss_clip": 0.01149503, "auxiliary_loss_mlp": 0.0103511, "balance_loss_clip": 1.02076912, "balance_loss_mlp": 1.04871726, "epoch": 0.2751240042086277, "flos": 38764706722560.0, "grad_norm": 1.6600710940932686, "language_loss": 0.71938306, "learning_rate": 3.2986806098600973e-06, "loss": 0.74122918, "num_input_tokens_seen": 98902035, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.828125, "step": 4576, "time_per_iteration": 2.710170269012451 }, { "auxiliary_loss_clip": 0.01162834, "auxiliary_loss_mlp": 0.01287105, "balance_loss_clip": 1.02337599, "balance_loss_mlp": 1.05004644, "epoch": 0.27518412746129567, "flos": 26907039740160.0, "grad_norm": 1.6084347303805826, "language_loss": 0.73338622, "learning_rate": 3.298393253573902e-06, "loss": 0.75788558, "num_input_tokens_seen": 98921835, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.859375, "step": 4577, "time_per_iteration": 2.714049816131592 }, { "auxiliary_loss_clip": 0.01154826, "auxiliary_loss_mlp": 0.01035012, "balance_loss_clip": 1.01901436, "balance_loss_mlp": 1.05052066, "epoch": 0.27524425071396363, "flos": 24900531125760.0, "grad_norm": 1.9082549126247412, "language_loss": 0.76499283, "learning_rate": 3.298105850951392e-06, "loss": 0.78689122, "num_input_tokens_seen": 98939610, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.86328125, "step": 4578, "time_per_iteration": 2.6210968494415283 }, { "auxiliary_loss_clip": 0.01154681, "auxiliary_loss_mlp": 0.01043175, "balance_loss_clip": 1.02437532, "balance_loss_mlp": 1.04910815, "epoch": 0.2753043739666316, "flos": 26288047641600.0, "grad_norm": 1.3872294964538392, "language_loss": 0.66415834, "learning_rate": 3.2978184020028232e-06, "loss": 0.68613696, "num_input_tokens_seen": 98962250, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.875, "step": 4579, "time_per_iteration": 2.772641658782959 }, { "auxiliary_loss_clip": 0.01163907, "auxiliary_loss_mlp": 0.01051282, "balance_loss_clip": 1.03392506, "balance_loss_mlp": 1.0501157, "epoch": 0.27536449721929956, "flos": 24206772867840.0, "grad_norm": 1.8483122884394216, "language_loss": 0.80024904, "learning_rate": 3.297530906738454e-06, "loss": 0.82240093, "num_input_tokens_seen": 98981845, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.875, "step": 4580, "time_per_iteration": 2.5900614261627197 }, { "auxiliary_loss_clip": 0.01146062, "auxiliary_loss_mlp": 0.0104642, "balance_loss_clip": 1.02850246, "balance_loss_mlp": 1.05063975, "epoch": 0.27542462047196753, "flos": 19537272720000.0, "grad_norm": 1.6139055887274538, "language_loss": 0.67430675, "learning_rate": 3.297243365168544e-06, "loss": 0.69623154, "num_input_tokens_seen": 99001855, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.8671875, "step": 4581, "time_per_iteration": 2.642794370651245 }, { "auxiliary_loss_clip": 0.0115474, "auxiliary_loss_mlp": 0.01046282, "balance_loss_clip": 1.03068924, "balance_loss_mlp": 1.048684, "epoch": 0.2754847437246355, "flos": 14319165173760.0, "grad_norm": 1.8016795875541476, "language_loss": 0.77871776, "learning_rate": 3.2969557773033555e-06, "loss": 0.80072796, "num_input_tokens_seen": 99019880, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8828125, "step": 4582, "time_per_iteration": 2.533825635910034 }, { "auxiliary_loss_clip": 0.01170808, "auxiliary_loss_mlp": 0.01038624, "balance_loss_clip": 1.02421188, "balance_loss_mlp": 1.05085731, "epoch": 0.27554486697730346, "flos": 18838773866880.0, "grad_norm": 1.7069468842772897, "language_loss": 0.84105408, "learning_rate": 3.296668143153152e-06, "loss": 0.86314833, "num_input_tokens_seen": 99037570, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8359375, "step": 4583, "time_per_iteration": 2.6334571838378906 }, { "auxiliary_loss_clip": 0.01144984, "auxiliary_loss_mlp": 0.01041187, "balance_loss_clip": 1.02555847, "balance_loss_mlp": 1.0484314, "epoch": 0.2756049902299714, "flos": 22382295402240.0, "grad_norm": 1.7824854758429376, "language_loss": 0.66852772, "learning_rate": 3.296380462728197e-06, "loss": 0.69038945, "num_input_tokens_seen": 99056875, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.875, "step": 4584, "time_per_iteration": 2.7008519172668457 }, { "auxiliary_loss_clip": 0.01168485, "auxiliary_loss_mlp": 0.01044924, "balance_loss_clip": 1.0294745, "balance_loss_mlp": 1.04870844, "epoch": 0.2756651134826394, "flos": 19573901614080.0, "grad_norm": 2.0432955127788213, "language_loss": 0.77112359, "learning_rate": 3.2960927360387585e-06, "loss": 0.79325771, "num_input_tokens_seen": 99074685, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83203125, "step": 4585, "time_per_iteration": 4.0840630531311035 }, { "auxiliary_loss_clip": 0.0114849, "auxiliary_loss_mlp": 0.01292266, "balance_loss_clip": 1.02838635, "balance_loss_mlp": 1.0520978, "epoch": 0.27572523673530736, "flos": 23586559706880.0, "grad_norm": 1.9510104037078726, "language_loss": 0.7173748, "learning_rate": 3.2958049630951038e-06, "loss": 0.74178231, "num_input_tokens_seen": 99095300, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.875, "step": 4586, "time_per_iteration": 2.7032477855682373 }, { "auxiliary_loss_clip": 0.01153468, "auxiliary_loss_mlp": 0.0103993, "balance_loss_clip": 1.02424216, "balance_loss_mlp": 1.05049908, "epoch": 0.2757853599879754, "flos": 22820118278400.0, "grad_norm": 1.481054786981422, "language_loss": 0.80130619, "learning_rate": 3.295517143907504e-06, "loss": 0.82324016, "num_input_tokens_seen": 99115965, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8515625, "step": 4587, "time_per_iteration": 2.582886219024658 }, { "auxiliary_loss_clip": 0.01144276, "auxiliary_loss_mlp": 0.01041635, "balance_loss_clip": 1.02681184, "balance_loss_mlp": 1.05103362, "epoch": 0.27584548324064334, "flos": 18551704371840.0, "grad_norm": 1.8365775202525931, "language_loss": 0.8221488, "learning_rate": 3.2952292784862286e-06, "loss": 0.84400791, "num_input_tokens_seen": 99134265, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.83984375, "step": 4588, "time_per_iteration": 2.5146162509918213 }, { "auxiliary_loss_clip": 0.01160097, "auxiliary_loss_mlp": 0.01042259, "balance_loss_clip": 1.02665436, "balance_loss_mlp": 1.04935598, "epoch": 0.2759056064933113, "flos": 23769883745280.0, "grad_norm": 1.3596139361180903, "language_loss": 0.75307322, "learning_rate": 3.2949413668415526e-06, "loss": 0.77509689, "num_input_tokens_seen": 99156185, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 4589, "time_per_iteration": 3.983424663543701 }, { "auxiliary_loss_clip": 0.0114094, "auxiliary_loss_mlp": 0.01052721, "balance_loss_clip": 1.03635335, "balance_loss_mlp": 1.04769444, "epoch": 0.27596572974597927, "flos": 24281898163200.0, "grad_norm": 1.948354203949789, "language_loss": 0.88030416, "learning_rate": 3.29465340898375e-06, "loss": 0.90224075, "num_input_tokens_seen": 99176735, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.83984375, "step": 4590, "time_per_iteration": 4.117035627365112 }, { "auxiliary_loss_clip": 0.01142478, "auxiliary_loss_mlp": 0.01050921, "balance_loss_clip": 1.03464866, "balance_loss_mlp": 1.04760289, "epoch": 0.27602585299864724, "flos": 35040985632000.0, "grad_norm": 1.6118514279150478, "language_loss": 0.7059114, "learning_rate": 3.2943654049230982e-06, "loss": 0.72784543, "num_input_tokens_seen": 99199765, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.859375, "step": 4591, "time_per_iteration": 2.682537078857422 }, { "auxiliary_loss_clip": 0.0116069, "auxiliary_loss_mlp": 0.01048595, "balance_loss_clip": 1.03196502, "balance_loss_mlp": 1.04816604, "epoch": 0.2760859762513152, "flos": 24309405002880.0, "grad_norm": 2.23582971163114, "language_loss": 0.79813188, "learning_rate": 3.2940773546698745e-06, "loss": 0.82022476, "num_input_tokens_seen": 99218435, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.859375, "step": 4592, "time_per_iteration": 4.125776767730713 }, { "auxiliary_loss_clip": 0.01064455, "auxiliary_loss_mlp": 0.0124963, "balance_loss_clip": 1.00086725, "balance_loss_mlp": 1.02735376, "epoch": 0.27614609950398317, "flos": 71260739890560.0, "grad_norm": 0.71739762080653, "language_loss": 0.61595124, "learning_rate": 3.2937892582343574e-06, "loss": 0.63909209, "num_input_tokens_seen": 99276200, "router_z_loss_clip": 0.01696777, "router_z_loss_mlp": 0.28125, "step": 4593, "time_per_iteration": 3.0848231315612793 }, { "auxiliary_loss_clip": 0.01142492, "auxiliary_loss_mlp": 0.0105156, "balance_loss_clip": 1.03630102, "balance_loss_mlp": 1.04859197, "epoch": 0.27620622275665113, "flos": 29674854138240.0, "grad_norm": 2.0447303058977724, "language_loss": 0.77440226, "learning_rate": 3.2935011156268313e-06, "loss": 0.79634273, "num_input_tokens_seen": 99297625, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84375, "step": 4594, "time_per_iteration": 2.639458179473877 }, { "auxiliary_loss_clip": 0.01142303, "auxiliary_loss_mlp": 0.01042253, "balance_loss_clip": 1.02720869, "balance_loss_mlp": 1.04909015, "epoch": 0.2762663460093191, "flos": 15378063137280.0, "grad_norm": 1.421528063040866, "language_loss": 0.91582727, "learning_rate": 3.293212926857577e-06, "loss": 0.93767285, "num_input_tokens_seen": 99315790, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.84375, "step": 4595, "time_per_iteration": 2.4997949600219727 }, { "auxiliary_loss_clip": 0.01144291, "auxiliary_loss_mlp": 0.01051957, "balance_loss_clip": 1.03572083, "balance_loss_mlp": 1.04905987, "epoch": 0.27632646926198706, "flos": 20704082117760.0, "grad_norm": 2.12500908432843, "language_loss": 0.69040674, "learning_rate": 3.2929246919368796e-06, "loss": 0.71236926, "num_input_tokens_seen": 99334615, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.86328125, "step": 4596, "time_per_iteration": 2.5627944469451904 }, { "auxiliary_loss_clip": 0.01157942, "auxiliary_loss_mlp": 0.01044378, "balance_loss_clip": 1.02830839, "balance_loss_mlp": 1.04919577, "epoch": 0.276386592514655, "flos": 32813374849920.0, "grad_norm": 1.898935538008348, "language_loss": 0.63324636, "learning_rate": 3.2926364108750263e-06, "loss": 0.65526962, "num_input_tokens_seen": 99356685, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.90625, "step": 4597, "time_per_iteration": 2.6481986045837402 }, { "auxiliary_loss_clip": 0.01132787, "auxiliary_loss_mlp": 0.01047505, "balance_loss_clip": 1.03235376, "balance_loss_mlp": 1.05027032, "epoch": 0.276446715767323, "flos": 18551704371840.0, "grad_norm": 2.3875106691965633, "language_loss": 0.86288953, "learning_rate": 3.292348083682304e-06, "loss": 0.88469243, "num_input_tokens_seen": 99374810, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.828125, "step": 4598, "time_per_iteration": 2.654205083847046 }, { "auxiliary_loss_clip": 0.01135142, "auxiliary_loss_mlp": 0.01044671, "balance_loss_clip": 1.02837503, "balance_loss_mlp": 1.04910529, "epoch": 0.27650683901999096, "flos": 22819615488000.0, "grad_norm": 2.3205211193992525, "language_loss": 0.79733187, "learning_rate": 3.2920597103690035e-06, "loss": 0.81912994, "num_input_tokens_seen": 99391290, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.859375, "step": 4599, "time_per_iteration": 2.5505478382110596 }, { "auxiliary_loss_clip": 0.01156908, "auxiliary_loss_mlp": 0.01050002, "balance_loss_clip": 1.0343622, "balance_loss_mlp": 1.04949093, "epoch": 0.276566962272659, "flos": 21361534704000.0, "grad_norm": 1.7185524939027121, "language_loss": 0.78461909, "learning_rate": 3.2917712909454148e-06, "loss": 0.80668819, "num_input_tokens_seen": 99409120, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.89453125, "step": 4600, "time_per_iteration": 2.5684659481048584 }, { "auxiliary_loss_clip": 0.01164535, "auxiliary_loss_mlp": 0.01045883, "balance_loss_clip": 1.02984953, "balance_loss_mlp": 1.04932523, "epoch": 0.27662708552532694, "flos": 17710604524800.0, "grad_norm": 1.8059483389552213, "language_loss": 0.72752321, "learning_rate": 3.291482825421832e-06, "loss": 0.74962741, "num_input_tokens_seen": 99426180, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.88671875, "step": 4601, "time_per_iteration": 2.536221504211426 }, { "auxiliary_loss_clip": 0.01142883, "auxiliary_loss_mlp": 0.01042915, "balance_loss_clip": 1.0264523, "balance_loss_mlp": 1.04891813, "epoch": 0.2766872087779949, "flos": 21252725429760.0, "grad_norm": 2.15279240116118, "language_loss": 0.79943436, "learning_rate": 3.2911943138085496e-06, "loss": 0.82129228, "num_input_tokens_seen": 99447720, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8515625, "step": 4602, "time_per_iteration": 2.5746889114379883 }, { "auxiliary_loss_clip": 0.01159457, "auxiliary_loss_mlp": 0.0105101, "balance_loss_clip": 1.03349781, "balance_loss_mlp": 1.05140877, "epoch": 0.2767473320306629, "flos": 12931900053120.0, "grad_norm": 1.9386151601484327, "language_loss": 0.77067482, "learning_rate": 3.290905756115863e-06, "loss": 0.79277945, "num_input_tokens_seen": 99464720, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.90625, "step": 4603, "time_per_iteration": 2.5611021518707275 }, { "auxiliary_loss_clip": 0.01148678, "auxiliary_loss_mlp": 0.01041465, "balance_loss_clip": 1.02638531, "balance_loss_mlp": 1.04807901, "epoch": 0.27680745528333084, "flos": 15012851604480.0, "grad_norm": 1.5675342123623157, "language_loss": 0.81262153, "learning_rate": 3.2906171523540706e-06, "loss": 0.83452296, "num_input_tokens_seen": 99482310, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.82421875, "step": 4604, "time_per_iteration": 2.533804178237915 }, { "auxiliary_loss_clip": 0.01173111, "auxiliary_loss_mlp": 0.01034935, "balance_loss_clip": 1.01879382, "balance_loss_mlp": 1.04951596, "epoch": 0.2768675785359988, "flos": 22637835734400.0, "grad_norm": 2.014481020038506, "language_loss": 0.70087522, "learning_rate": 3.2903285025334723e-06, "loss": 0.7229557, "num_input_tokens_seen": 99501255, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.875, "step": 4605, "time_per_iteration": 2.6006641387939453 }, { "auxiliary_loss_clip": 0.01060728, "auxiliary_loss_mlp": 0.01003171, "balance_loss_clip": 1.0015738, "balance_loss_mlp": 1.02455902, "epoch": 0.27692770178866677, "flos": 66130542881280.0, "grad_norm": 0.7197920114578291, "language_loss": 0.57166779, "learning_rate": 3.290039806664368e-06, "loss": 0.59230673, "num_input_tokens_seen": 99568925, "router_z_loss_clip": 0.01599121, "router_z_loss_mlp": 0.2734375, "step": 4606, "time_per_iteration": 3.308338165283203 }, { "auxiliary_loss_clip": 0.01160105, "auxiliary_loss_mlp": 0.01043985, "balance_loss_clip": 1.02720046, "balance_loss_mlp": 1.05367374, "epoch": 0.27698782504133473, "flos": 26464979059200.0, "grad_norm": 9.370126666933158, "language_loss": 0.69491589, "learning_rate": 3.2897510647570626e-06, "loss": 0.71695685, "num_input_tokens_seen": 99588455, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.88671875, "step": 4607, "time_per_iteration": 2.6249146461486816 }, { "auxiliary_loss_clip": 0.011409, "auxiliary_loss_mlp": 0.01032551, "balance_loss_clip": 1.01732802, "balance_loss_mlp": 1.04945469, "epoch": 0.2770479482940027, "flos": 25884806584320.0, "grad_norm": 1.546968625217961, "language_loss": 0.69721276, "learning_rate": 3.2894622768218587e-06, "loss": 0.71894729, "num_input_tokens_seen": 99609355, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.82421875, "step": 4608, "time_per_iteration": 2.562948703765869 }, { "auxiliary_loss_clip": 0.01185022, "auxiliary_loss_mlp": 0.01036736, "balance_loss_clip": 1.02076221, "balance_loss_mlp": 1.05281878, "epoch": 0.27710807154667066, "flos": 22857249962880.0, "grad_norm": 1.833454215305529, "language_loss": 0.72766638, "learning_rate": 3.289173442869063e-06, "loss": 0.74988395, "num_input_tokens_seen": 99628780, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.875, "step": 4609, "time_per_iteration": 2.614197254180908 }, { "auxiliary_loss_clip": 0.01162449, "auxiliary_loss_mlp": 0.01044014, "balance_loss_clip": 1.02737212, "balance_loss_mlp": 1.04768014, "epoch": 0.27716819479933863, "flos": 17711071401600.0, "grad_norm": 2.2587136633101452, "language_loss": 0.8381809, "learning_rate": 3.2888845629089833e-06, "loss": 0.86024547, "num_input_tokens_seen": 99644545, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.875, "step": 4610, "time_per_iteration": 2.501229763031006 }, { "auxiliary_loss_clip": 0.01186783, "auxiliary_loss_mlp": 0.01293705, "balance_loss_clip": 1.02891946, "balance_loss_mlp": 1.05110872, "epoch": 0.2772283180520066, "flos": 19646046080640.0, "grad_norm": 3.447566951344175, "language_loss": 0.69055986, "learning_rate": 3.2885956369519287e-06, "loss": 0.71536469, "num_input_tokens_seen": 99663125, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.90625, "step": 4611, "time_per_iteration": 2.6123175621032715 }, { "auxiliary_loss_clip": 0.01133353, "auxiliary_loss_mlp": 0.01035524, "balance_loss_clip": 1.01971722, "balance_loss_mlp": 1.05174279, "epoch": 0.27728844130467456, "flos": 21032628842880.0, "grad_norm": 1.792545861445858, "language_loss": 0.73414052, "learning_rate": 3.2883066650082106e-06, "loss": 0.75582933, "num_input_tokens_seen": 99682645, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8125, "step": 4612, "time_per_iteration": 2.5090906620025635 }, { "auxiliary_loss_clip": 0.01149011, "auxiliary_loss_mlp": 0.01291017, "balance_loss_clip": 1.02655578, "balance_loss_mlp": 1.05151582, "epoch": 0.2773485645573425, "flos": 18989204025600.0, "grad_norm": 2.4206358743979632, "language_loss": 0.66941917, "learning_rate": 3.288017647088142e-06, "loss": 0.69381946, "num_input_tokens_seen": 99700520, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.88671875, "step": 4613, "time_per_iteration": 2.5730643272399902 }, { "auxiliary_loss_clip": 0.01170648, "auxiliary_loss_mlp": 0.01042681, "balance_loss_clip": 1.02624202, "balance_loss_mlp": 1.04905891, "epoch": 0.27740868781001055, "flos": 21468440557440.0, "grad_norm": 1.9516496827367111, "language_loss": 0.79744625, "learning_rate": 3.2877285832020363e-06, "loss": 0.81957954, "num_input_tokens_seen": 99720355, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.859375, "step": 4614, "time_per_iteration": 2.556239128112793 }, { "auxiliary_loss_clip": 0.01147589, "auxiliary_loss_mlp": 0.01042585, "balance_loss_clip": 1.02664661, "balance_loss_mlp": 1.04989696, "epoch": 0.2774688110626785, "flos": 19827825834240.0, "grad_norm": 2.5757894561589536, "language_loss": 0.79988688, "learning_rate": 3.28743947336021e-06, "loss": 0.82178867, "num_input_tokens_seen": 99736090, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.88671875, "step": 4615, "time_per_iteration": 2.568357467651367 }, { "auxiliary_loss_clip": 0.01167161, "auxiliary_loss_mlp": 0.01045178, "balance_loss_clip": 1.0282625, "balance_loss_mlp": 1.05004132, "epoch": 0.2775289343153465, "flos": 18216226321920.0, "grad_norm": 2.9543148359700653, "language_loss": 0.63643622, "learning_rate": 3.2871503175729807e-06, "loss": 0.65855956, "num_input_tokens_seen": 99751805, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90234375, "step": 4616, "time_per_iteration": 2.506885528564453 }, { "auxiliary_loss_clip": 0.01146519, "auxiliary_loss_mlp": 0.01041465, "balance_loss_clip": 1.02530026, "balance_loss_mlp": 1.04962146, "epoch": 0.27758905756801444, "flos": 16472476673280.0, "grad_norm": 2.745140454107629, "language_loss": 0.82030559, "learning_rate": 3.286861115850667e-06, "loss": 0.84218538, "num_input_tokens_seen": 99770610, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.875, "step": 4617, "time_per_iteration": 2.5705678462982178 }, { "auxiliary_loss_clip": 0.01137623, "auxiliary_loss_mlp": 0.01043307, "balance_loss_clip": 1.0273571, "balance_loss_mlp": 1.04981565, "epoch": 0.2776491808206824, "flos": 18728240739840.0, "grad_norm": 2.1876457677353667, "language_loss": 0.76612908, "learning_rate": 3.286571868203591e-06, "loss": 0.78793836, "num_input_tokens_seen": 99787305, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.87890625, "step": 4618, "time_per_iteration": 2.513550281524658 }, { "auxiliary_loss_clip": 0.01139891, "auxiliary_loss_mlp": 0.01036285, "balance_loss_clip": 1.02090693, "balance_loss_mlp": 1.05216455, "epoch": 0.27770930407335037, "flos": 28038189911040.0, "grad_norm": 1.6200977867889275, "language_loss": 0.84662122, "learning_rate": 3.286282574642074e-06, "loss": 0.86838299, "num_input_tokens_seen": 99808940, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.87890625, "step": 4619, "time_per_iteration": 2.6274147033691406 }, { "auxiliary_loss_clip": 0.0115185, "auxiliary_loss_mlp": 0.01043012, "balance_loss_clip": 1.02868342, "balance_loss_mlp": 1.04983199, "epoch": 0.27776942732601834, "flos": 23549823072000.0, "grad_norm": 1.6761286591684053, "language_loss": 0.76559865, "learning_rate": 3.2859932351764413e-06, "loss": 0.78754729, "num_input_tokens_seen": 99829575, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.84375, "step": 4620, "time_per_iteration": 2.585670232772827 }, { "auxiliary_loss_clip": 0.011614, "auxiliary_loss_mlp": 0.01042389, "balance_loss_clip": 1.0269866, "balance_loss_mlp": 1.04989612, "epoch": 0.2778295505786863, "flos": 23908713811200.0, "grad_norm": 1.8900236528100496, "language_loss": 0.78073692, "learning_rate": 3.2857038498170175e-06, "loss": 0.80277479, "num_input_tokens_seen": 99847575, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84375, "step": 4621, "time_per_iteration": 2.5931272506713867 }, { "auxiliary_loss_clip": 0.01164689, "auxiliary_loss_mlp": 0.01049327, "balance_loss_clip": 1.03199399, "balance_loss_mlp": 1.050071, "epoch": 0.27788967383135427, "flos": 25554571920000.0, "grad_norm": 2.143445269858058, "language_loss": 0.88083655, "learning_rate": 3.2854144185741303e-06, "loss": 0.90297669, "num_input_tokens_seen": 99864995, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.87890625, "step": 4622, "time_per_iteration": 2.607962131500244 }, { "auxiliary_loss_clip": 0.0117029, "auxiliary_loss_mlp": 0.01051707, "balance_loss_clip": 1.03491032, "balance_loss_mlp": 1.04828978, "epoch": 0.27794979708402223, "flos": 16252631481600.0, "grad_norm": 2.011361505646684, "language_loss": 0.81276321, "learning_rate": 3.285124941458109e-06, "loss": 0.83498317, "num_input_tokens_seen": 99881540, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8671875, "step": 4623, "time_per_iteration": 2.5923047065734863 }, { "auxiliary_loss_clip": 0.01166772, "auxiliary_loss_mlp": 0.0104687, "balance_loss_clip": 1.0307529, "balance_loss_mlp": 1.05253947, "epoch": 0.2780099203366902, "flos": 20667632791680.0, "grad_norm": 1.8934243468431495, "language_loss": 0.81777608, "learning_rate": 3.2848354184792845e-06, "loss": 0.83991253, "num_input_tokens_seen": 99899595, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.875, "step": 4624, "time_per_iteration": 2.5597634315490723 }, { "auxiliary_loss_clip": 0.01154437, "auxiliary_loss_mlp": 0.01294099, "balance_loss_clip": 1.03031552, "balance_loss_mlp": 1.05158854, "epoch": 0.27807004358935816, "flos": 17739583822080.0, "grad_norm": 8.239360636095949, "language_loss": 0.76572776, "learning_rate": 3.284545849647989e-06, "loss": 0.79021317, "num_input_tokens_seen": 99913020, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84765625, "step": 4625, "time_per_iteration": 2.5360288619995117 }, { "auxiliary_loss_clip": 0.0115155, "auxiliary_loss_mlp": 0.01038284, "balance_loss_clip": 1.02211905, "balance_loss_mlp": 1.04766262, "epoch": 0.2781301668420261, "flos": 16727119165440.0, "grad_norm": 2.2026955251170652, "language_loss": 0.70422149, "learning_rate": 3.284256234974556e-06, "loss": 0.72611988, "num_input_tokens_seen": 99931405, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.859375, "step": 4626, "time_per_iteration": 4.032962322235107 }, { "auxiliary_loss_clip": 0.01176548, "auxiliary_loss_mlp": 0.01044714, "balance_loss_clip": 1.02695179, "balance_loss_mlp": 1.05134654, "epoch": 0.27819029009469415, "flos": 13844749317120.0, "grad_norm": 2.260363740843961, "language_loss": 0.92352563, "learning_rate": 3.2839665744693222e-06, "loss": 0.94573832, "num_input_tokens_seen": 99948100, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.8984375, "step": 4627, "time_per_iteration": 2.529308795928955 }, { "auxiliary_loss_clip": 0.01139366, "auxiliary_loss_mlp": 0.01040599, "balance_loss_clip": 1.02531648, "balance_loss_mlp": 1.05100918, "epoch": 0.2782504133473621, "flos": 27089286370560.0, "grad_norm": 3.0423407499050237, "language_loss": 0.85142386, "learning_rate": 3.2836768681426234e-06, "loss": 0.87322348, "num_input_tokens_seen": 99966470, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8828125, "step": 4628, "time_per_iteration": 2.6500861644744873 }, { "auxiliary_loss_clip": 0.01160942, "auxiliary_loss_mlp": 0.0104224, "balance_loss_clip": 1.02637339, "balance_loss_mlp": 1.04779637, "epoch": 0.2783105366000301, "flos": 21068826773760.0, "grad_norm": 1.7437586762251802, "language_loss": 0.79066432, "learning_rate": 3.2833871160047998e-06, "loss": 0.8126961, "num_input_tokens_seen": 99985930, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.859375, "step": 4629, "time_per_iteration": 2.5838563442230225 }, { "auxiliary_loss_clip": 0.01158592, "auxiliary_loss_mlp": 0.01040538, "balance_loss_clip": 1.02513635, "balance_loss_mlp": 1.05010974, "epoch": 0.27837065985269804, "flos": 26501823434880.0, "grad_norm": 1.5398793057590239, "language_loss": 0.84567106, "learning_rate": 3.2830973180661907e-06, "loss": 0.86766237, "num_input_tokens_seen": 100006235, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.81640625, "step": 4630, "time_per_iteration": 2.679154634475708 }, { "auxiliary_loss_clip": 0.01153566, "auxiliary_loss_mlp": 0.01040717, "balance_loss_clip": 1.02369368, "balance_loss_mlp": 1.04845273, "epoch": 0.278430783105366, "flos": 20223201813120.0, "grad_norm": 2.1638423905550743, "language_loss": 0.80588907, "learning_rate": 3.2828074743371394e-06, "loss": 0.82783198, "num_input_tokens_seen": 100023655, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.875, "step": 4631, "time_per_iteration": 5.417713642120361 }, { "auxiliary_loss_clip": 0.01141046, "auxiliary_loss_mlp": 0.01051027, "balance_loss_clip": 1.0348742, "balance_loss_mlp": 1.05380917, "epoch": 0.278490906358034, "flos": 25592888753280.0, "grad_norm": 1.6684596766099387, "language_loss": 0.70994389, "learning_rate": 3.2825175848279884e-06, "loss": 0.73186463, "num_input_tokens_seen": 100043280, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.875, "step": 4632, "time_per_iteration": 2.622652053833008 }, { "auxiliary_loss_clip": 0.01142866, "auxiliary_loss_mlp": 0.01044445, "balance_loss_clip": 1.02952015, "balance_loss_mlp": 1.05171013, "epoch": 0.27855102961070194, "flos": 16171544528640.0, "grad_norm": 1.7798703128056166, "language_loss": 0.8245337, "learning_rate": 3.2822276495490844e-06, "loss": 0.84640682, "num_input_tokens_seen": 100057690, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.82421875, "step": 4633, "time_per_iteration": 2.5012331008911133 }, { "auxiliary_loss_clip": 0.01155431, "auxiliary_loss_mlp": 0.01041351, "balance_loss_clip": 1.02312374, "balance_loss_mlp": 1.04937875, "epoch": 0.2786111528633699, "flos": 22927598749440.0, "grad_norm": 1.6425188853176382, "language_loss": 0.8791005, "learning_rate": 3.2819376685107733e-06, "loss": 0.90106833, "num_input_tokens_seen": 100075875, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.8828125, "step": 4634, "time_per_iteration": 4.036149024963379 }, { "auxiliary_loss_clip": 0.01156427, "auxiliary_loss_mlp": 0.01045337, "balance_loss_clip": 1.02811086, "balance_loss_mlp": 1.05187845, "epoch": 0.27867127611603787, "flos": 23404205335680.0, "grad_norm": 1.6229361824439628, "language_loss": 0.76611423, "learning_rate": 3.281647641723405e-06, "loss": 0.78813189, "num_input_tokens_seen": 100092930, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8671875, "step": 4635, "time_per_iteration": 2.6488451957702637 }, { "auxiliary_loss_clip": 0.01137094, "auxiliary_loss_mlp": 0.01045917, "balance_loss_clip": 1.02844095, "balance_loss_mlp": 1.04861009, "epoch": 0.27873139936870583, "flos": 19829010983040.0, "grad_norm": 5.017608312277395, "language_loss": 0.65076363, "learning_rate": 3.2813575691973288e-06, "loss": 0.67259383, "num_input_tokens_seen": 100110790, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.88671875, "step": 4636, "time_per_iteration": 2.634438991546631 }, { "auxiliary_loss_clip": 0.01148885, "auxiliary_loss_mlp": 0.01043568, "balance_loss_clip": 1.02664018, "balance_loss_mlp": 1.05104637, "epoch": 0.2787915226213738, "flos": 17707659609600.0, "grad_norm": 2.3936476769442923, "language_loss": 0.8328476, "learning_rate": 3.2810674509428973e-06, "loss": 0.85477215, "num_input_tokens_seen": 100126970, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.88671875, "step": 4637, "time_per_iteration": 2.4765989780426025 }, { "auxiliary_loss_clip": 0.01152734, "auxiliary_loss_mlp": 0.01043026, "balance_loss_clip": 1.0266943, "balance_loss_mlp": 1.05004919, "epoch": 0.27885164587404176, "flos": 22090557139200.0, "grad_norm": 1.5818854714041202, "language_loss": 0.75698191, "learning_rate": 3.2807772869704634e-06, "loss": 0.77893949, "num_input_tokens_seen": 100146720, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.84765625, "step": 4638, "time_per_iteration": 2.6573386192321777 }, { "auxiliary_loss_clip": 0.01148756, "auxiliary_loss_mlp": 0.01048829, "balance_loss_clip": 1.03221166, "balance_loss_mlp": 1.05134583, "epoch": 0.27891176912670973, "flos": 19207684500480.0, "grad_norm": 1.8040587027915866, "language_loss": 0.71377587, "learning_rate": 3.2804870772903826e-06, "loss": 0.73575175, "num_input_tokens_seen": 100165920, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8828125, "step": 4639, "time_per_iteration": 2.670761823654175 }, { "auxiliary_loss_clip": 0.01147116, "auxiliary_loss_mlp": 0.01037477, "balance_loss_clip": 1.01989985, "balance_loss_mlp": 1.05043626, "epoch": 0.27897189237937775, "flos": 27600007898880.0, "grad_norm": 2.6951666414910247, "language_loss": 0.65678334, "learning_rate": 3.2801968219130123e-06, "loss": 0.67862928, "num_input_tokens_seen": 100185525, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.87890625, "step": 4640, "time_per_iteration": 2.696976661682129 }, { "auxiliary_loss_clip": 0.01146702, "auxiliary_loss_mlp": 0.01044076, "balance_loss_clip": 1.02755404, "balance_loss_mlp": 1.05038166, "epoch": 0.2790320156320457, "flos": 21178210665600.0, "grad_norm": 2.214986698414106, "language_loss": 0.72843933, "learning_rate": 3.27990652084871e-06, "loss": 0.75034708, "num_input_tokens_seen": 100204850, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.875, "step": 4641, "time_per_iteration": 2.5825979709625244 }, { "auxiliary_loss_clip": 0.01168977, "auxiliary_loss_mlp": 0.01040613, "balance_loss_clip": 1.02291059, "balance_loss_mlp": 1.05169749, "epoch": 0.2790921388847137, "flos": 22783920347520.0, "grad_norm": 1.7465475054352542, "language_loss": 0.75274014, "learning_rate": 3.279616174107837e-06, "loss": 0.77483606, "num_input_tokens_seen": 100224520, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.90234375, "step": 4642, "time_per_iteration": 2.684497356414795 }, { "auxiliary_loss_clip": 0.01165516, "auxiliary_loss_mlp": 0.01043445, "balance_loss_clip": 1.02567101, "balance_loss_mlp": 1.05035377, "epoch": 0.27915226213738165, "flos": 23400649889280.0, "grad_norm": 1.7826023499728738, "language_loss": 0.86221701, "learning_rate": 3.2793257817007537e-06, "loss": 0.88430667, "num_input_tokens_seen": 100243935, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.8828125, "step": 4643, "time_per_iteration": 2.594839096069336 }, { "auxiliary_loss_clip": 0.0116615, "auxiliary_loss_mlp": 0.01045171, "balance_loss_clip": 1.02843392, "balance_loss_mlp": 1.05217469, "epoch": 0.2792123853900496, "flos": 22747794243840.0, "grad_norm": 1.810501977268121, "language_loss": 0.8287375, "learning_rate": 3.279035343637824e-06, "loss": 0.8508507, "num_input_tokens_seen": 100262290, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8671875, "step": 4644, "time_per_iteration": 2.5933241844177246 }, { "auxiliary_loss_clip": 0.01146075, "auxiliary_loss_mlp": 0.01038223, "balance_loss_clip": 1.02110434, "balance_loss_mlp": 1.0496068, "epoch": 0.2792725086427176, "flos": 15049372757760.0, "grad_norm": 1.9608345519553463, "language_loss": 0.78603721, "learning_rate": 3.2787448599294135e-06, "loss": 0.80788016, "num_input_tokens_seen": 100280015, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.875, "step": 4645, "time_per_iteration": 2.533719539642334 }, { "auxiliary_loss_clip": 0.01076047, "auxiliary_loss_mlp": 0.01003414, "balance_loss_clip": 1.0016737, "balance_loss_mlp": 1.03055644, "epoch": 0.27933263189538554, "flos": 62544861757440.0, "grad_norm": 0.7835699012054559, "language_loss": 0.62274861, "learning_rate": 3.2784543305858878e-06, "loss": 0.64354324, "num_input_tokens_seen": 100338935, "router_z_loss_clip": 0.01745605, "router_z_loss_mlp": 0.27734375, "step": 4646, "time_per_iteration": 3.1651368141174316 }, { "auxiliary_loss_clip": 0.01150063, "auxiliary_loss_mlp": 0.0103421, "balance_loss_clip": 1.01898682, "balance_loss_mlp": 1.04803514, "epoch": 0.2793927551480535, "flos": 25118365155840.0, "grad_norm": 1.6549647775629803, "language_loss": 0.89210784, "learning_rate": 3.2781637556176155e-06, "loss": 0.91395056, "num_input_tokens_seen": 100359905, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83984375, "step": 4647, "time_per_iteration": 2.6297850608825684 }, { "auxiliary_loss_clip": 0.01147044, "auxiliary_loss_mlp": 0.01045813, "balance_loss_clip": 1.02846766, "balance_loss_mlp": 1.04921031, "epoch": 0.27945287840072147, "flos": 21324582587520.0, "grad_norm": 1.6402830513659843, "language_loss": 0.87138903, "learning_rate": 3.2778731350349673e-06, "loss": 0.89331758, "num_input_tokens_seen": 100376955, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.890625, "step": 4648, "time_per_iteration": 2.5496115684509277 }, { "auxiliary_loss_clip": 0.01148804, "auxiliary_loss_mlp": 0.01036319, "balance_loss_clip": 1.01936722, "balance_loss_mlp": 1.04995537, "epoch": 0.27951300165338944, "flos": 27450547407360.0, "grad_norm": 2.2700318843641223, "language_loss": 0.72562075, "learning_rate": 3.2775824688483138e-06, "loss": 0.74747205, "num_input_tokens_seen": 100397545, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8984375, "step": 4649, "time_per_iteration": 2.6537537574768066 }, { "auxiliary_loss_clip": 0.01163144, "auxiliary_loss_mlp": 0.01040579, "balance_loss_clip": 1.02193427, "balance_loss_mlp": 1.04930949, "epoch": 0.2795731249060574, "flos": 15159008044800.0, "grad_norm": 2.5719019519986337, "language_loss": 0.80227256, "learning_rate": 3.2772917570680278e-06, "loss": 0.82430983, "num_input_tokens_seen": 100415080, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.875, "step": 4650, "time_per_iteration": 2.522641658782959 }, { "auxiliary_loss_clip": 0.01073098, "auxiliary_loss_mlp": 0.01250245, "balance_loss_clip": 1.00135899, "balance_loss_mlp": 1.02761304, "epoch": 0.27963324815872537, "flos": 60120103178880.0, "grad_norm": 0.8211092669434499, "language_loss": 0.58792681, "learning_rate": 3.2770009997044846e-06, "loss": 0.61116022, "num_input_tokens_seen": 100471105, "router_z_loss_clip": 0.01818848, "router_z_loss_mlp": 0.2734375, "step": 4651, "time_per_iteration": 3.2093136310577393 }, { "auxiliary_loss_clip": 0.01147257, "auxiliary_loss_mlp": 0.01043317, "balance_loss_clip": 1.02426744, "balance_loss_mlp": 1.04754019, "epoch": 0.27969337141139333, "flos": 21765960910080.0, "grad_norm": 1.7308294207380674, "language_loss": 0.73843348, "learning_rate": 3.2767101967680607e-06, "loss": 0.76033926, "num_input_tokens_seen": 100492520, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.90625, "step": 4652, "time_per_iteration": 2.6321303844451904 }, { "auxiliary_loss_clip": 0.01142101, "auxiliary_loss_mlp": 0.01050351, "balance_loss_clip": 1.03347087, "balance_loss_mlp": 1.05303526, "epoch": 0.27975349466406135, "flos": 39851398834560.0, "grad_norm": 1.986849772347747, "language_loss": 0.79391003, "learning_rate": 3.276419348269134e-06, "loss": 0.81583458, "num_input_tokens_seen": 100512870, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.890625, "step": 4653, "time_per_iteration": 2.756821393966675 }, { "auxiliary_loss_clip": 0.01152274, "auxiliary_loss_mlp": 0.01043706, "balance_loss_clip": 1.02737474, "balance_loss_mlp": 1.04834127, "epoch": 0.2798136179167293, "flos": 21579799697280.0, "grad_norm": 2.2263206855781674, "language_loss": 0.79236341, "learning_rate": 3.2761284542180842e-06, "loss": 0.81432325, "num_input_tokens_seen": 100531655, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.86328125, "step": 4654, "time_per_iteration": 2.6257314682006836 }, { "auxiliary_loss_clip": 0.01176474, "auxiliary_loss_mlp": 0.01043189, "balance_loss_clip": 1.02527165, "balance_loss_mlp": 1.0504638, "epoch": 0.2798737411693973, "flos": 21537676022400.0, "grad_norm": 1.8109738073271082, "language_loss": 0.80714238, "learning_rate": 3.2758375146252924e-06, "loss": 0.82933903, "num_input_tokens_seen": 100548005, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.8984375, "step": 4655, "time_per_iteration": 2.5853970050811768 }, { "auxiliary_loss_clip": 0.01161024, "auxiliary_loss_mlp": 0.01289791, "balance_loss_clip": 1.02462196, "balance_loss_mlp": 1.04680896, "epoch": 0.27993386442206525, "flos": 26981051713920.0, "grad_norm": 1.5857050343555386, "language_loss": 0.81224817, "learning_rate": 3.275546529501142e-06, "loss": 0.83675635, "num_input_tokens_seen": 100567980, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.875, "step": 4656, "time_per_iteration": 2.6348721981048584 }, { "auxiliary_loss_clip": 0.01154623, "auxiliary_loss_mlp": 0.01039186, "balance_loss_clip": 1.02198386, "balance_loss_mlp": 1.04993296, "epoch": 0.2799939876747332, "flos": 24349876652160.0, "grad_norm": 1.6269195898127102, "language_loss": 0.83174711, "learning_rate": 3.2752554988560165e-06, "loss": 0.8536852, "num_input_tokens_seen": 100588630, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8671875, "step": 4657, "time_per_iteration": 2.605008125305176 }, { "auxiliary_loss_clip": 0.01163165, "auxiliary_loss_mlp": 0.01048577, "balance_loss_clip": 1.0317328, "balance_loss_mlp": 1.04902697, "epoch": 0.2800541109274012, "flos": 33656988648960.0, "grad_norm": 1.8109696053210138, "language_loss": 0.6336599, "learning_rate": 3.274964422700303e-06, "loss": 0.65577734, "num_input_tokens_seen": 100608775, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.875, "step": 4658, "time_per_iteration": 2.7629830837249756 }, { "auxiliary_loss_clip": 0.01145769, "auxiliary_loss_mlp": 0.01043117, "balance_loss_clip": 1.0260818, "balance_loss_mlp": 1.04797912, "epoch": 0.28011423418006914, "flos": 21614417429760.0, "grad_norm": 1.8738458804552627, "language_loss": 0.7839371, "learning_rate": 3.274673301044388e-06, "loss": 0.80582601, "num_input_tokens_seen": 100627975, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.88671875, "step": 4659, "time_per_iteration": 2.555109977722168 }, { "auxiliary_loss_clip": 0.01167588, "auxiliary_loss_mlp": 0.01050404, "balance_loss_clip": 1.03339243, "balance_loss_mlp": 1.05169034, "epoch": 0.2801743574327371, "flos": 23112431159040.0, "grad_norm": 2.9471883138025112, "language_loss": 0.79183668, "learning_rate": 3.274382133898663e-06, "loss": 0.81401658, "num_input_tokens_seen": 100645430, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.890625, "step": 4660, "time_per_iteration": 2.6058526039123535 }, { "auxiliary_loss_clip": 0.01161777, "auxiliary_loss_mlp": 0.01041289, "balance_loss_clip": 1.02560127, "balance_loss_mlp": 1.04927325, "epoch": 0.2802344806854051, "flos": 12641418766080.0, "grad_norm": 2.0124329087534525, "language_loss": 0.80784124, "learning_rate": 3.2740909212735172e-06, "loss": 0.82987189, "num_input_tokens_seen": 100663775, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.85546875, "step": 4661, "time_per_iteration": 2.5616588592529297 }, { "auxiliary_loss_clip": 0.01157906, "auxiliary_loss_mlp": 0.01057683, "balance_loss_clip": 1.04113662, "balance_loss_mlp": 1.05297589, "epoch": 0.28029460393807304, "flos": 37267878142080.0, "grad_norm": 1.4412402020193866, "language_loss": 0.78941047, "learning_rate": 3.273799663179343e-06, "loss": 0.81156635, "num_input_tokens_seen": 100686085, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.875, "step": 4662, "time_per_iteration": 2.7217843532562256 }, { "auxiliary_loss_clip": 0.01159032, "auxiliary_loss_mlp": 0.0105025, "balance_loss_clip": 1.03184402, "balance_loss_mlp": 1.05044889, "epoch": 0.280354727190741, "flos": 20741106061440.0, "grad_norm": 1.659099155857, "language_loss": 0.69793493, "learning_rate": 3.273508359626536e-06, "loss": 0.7200278, "num_input_tokens_seen": 100705135, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.90625, "step": 4663, "time_per_iteration": 2.5506043434143066 }, { "auxiliary_loss_clip": 0.01158393, "auxiliary_loss_mlp": 0.01053929, "balance_loss_clip": 1.03535676, "balance_loss_mlp": 1.05176401, "epoch": 0.28041485044340897, "flos": 21471026336640.0, "grad_norm": 1.6684495786154916, "language_loss": 0.77125633, "learning_rate": 3.2732170106254923e-06, "loss": 0.79337966, "num_input_tokens_seen": 100724960, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.88671875, "step": 4664, "time_per_iteration": 2.648592710494995 }, { "auxiliary_loss_clip": 0.01160493, "auxiliary_loss_mlp": 0.01046325, "balance_loss_clip": 1.02969551, "balance_loss_mlp": 1.04879737, "epoch": 0.28047497369607693, "flos": 14794263388800.0, "grad_norm": 1.91188957813504, "language_loss": 0.7925052, "learning_rate": 3.272925616186607e-06, "loss": 0.81457335, "num_input_tokens_seen": 100741995, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.84765625, "step": 4665, "time_per_iteration": 2.5310823917388916 }, { "auxiliary_loss_clip": 0.01145947, "auxiliary_loss_mlp": 0.01050901, "balance_loss_clip": 1.03452158, "balance_loss_mlp": 1.05070305, "epoch": 0.2805350969487449, "flos": 23070738447360.0, "grad_norm": 2.042106334166876, "language_loss": 0.80611426, "learning_rate": 3.2726341763202823e-06, "loss": 0.8280828, "num_input_tokens_seen": 100758985, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.86328125, "step": 4666, "time_per_iteration": 2.6517860889434814 }, { "auxiliary_loss_clip": 0.01167813, "auxiliary_loss_mlp": 0.01054005, "balance_loss_clip": 1.03640926, "balance_loss_mlp": 1.05239272, "epoch": 0.2805952202014129, "flos": 20479855466880.0, "grad_norm": 2.0606151796173613, "language_loss": 0.84366894, "learning_rate": 3.2723426910369166e-06, "loss": 0.86588705, "num_input_tokens_seen": 100777820, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.8828125, "step": 4667, "time_per_iteration": 2.5643246173858643 }, { "auxiliary_loss_clip": 0.01167663, "auxiliary_loss_mlp": 0.01290716, "balance_loss_clip": 1.02677345, "balance_loss_mlp": 1.05056524, "epoch": 0.2806553434540809, "flos": 27417330305280.0, "grad_norm": 1.6085055527089265, "language_loss": 0.79417151, "learning_rate": 3.2720511603469136e-06, "loss": 0.81875527, "num_input_tokens_seen": 100798205, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8984375, "step": 4668, "time_per_iteration": 4.046337604522705 }, { "auxiliary_loss_clip": 0.01182611, "auxiliary_loss_mlp": 0.01047366, "balance_loss_clip": 1.03039122, "balance_loss_mlp": 1.0469774, "epoch": 0.28071546670674885, "flos": 26505019745280.0, "grad_norm": 1.4140622336381918, "language_loss": 0.76064324, "learning_rate": 3.2717595842606766e-06, "loss": 0.78294301, "num_input_tokens_seen": 100819800, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90625, "step": 4669, "time_per_iteration": 2.6577391624450684 }, { "auxiliary_loss_clip": 0.01147435, "auxiliary_loss_mlp": 0.01043327, "balance_loss_clip": 1.02623224, "balance_loss_mlp": 1.05099523, "epoch": 0.2807755899594168, "flos": 20558679863040.0, "grad_norm": 1.9843103229660222, "language_loss": 0.78682244, "learning_rate": 3.271467962788611e-06, "loss": 0.80873013, "num_input_tokens_seen": 100837880, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.875, "step": 4670, "time_per_iteration": 2.581052541732788 }, { "auxiliary_loss_clip": 0.01168488, "auxiliary_loss_mlp": 0.01047208, "balance_loss_clip": 1.02943373, "balance_loss_mlp": 1.05110705, "epoch": 0.2808357132120848, "flos": 24313319585280.0, "grad_norm": 1.9061305915918454, "language_loss": 0.78751791, "learning_rate": 3.271176295941125e-06, "loss": 0.80967486, "num_input_tokens_seen": 100856350, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.90625, "step": 4671, "time_per_iteration": 2.643174171447754 }, { "auxiliary_loss_clip": 0.01147053, "auxiliary_loss_mlp": 0.01042088, "balance_loss_clip": 1.02675748, "balance_loss_mlp": 1.05345488, "epoch": 0.28089583646475275, "flos": 26432408401920.0, "grad_norm": 1.8268964052488001, "language_loss": 0.75263828, "learning_rate": 3.270884583728626e-06, "loss": 0.7745297, "num_input_tokens_seen": 100876135, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.84375, "step": 4672, "time_per_iteration": 4.045693397521973 }, { "auxiliary_loss_clip": 0.01158628, "auxiliary_loss_mlp": 0.01045957, "balance_loss_clip": 1.02758741, "balance_loss_mlp": 1.05077839, "epoch": 0.2809559597174207, "flos": 23111820627840.0, "grad_norm": 2.4936196067996215, "language_loss": 0.75348204, "learning_rate": 3.2705928261615263e-06, "loss": 0.77552789, "num_input_tokens_seen": 100894790, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.8984375, "step": 4673, "time_per_iteration": 4.068954706192017 }, { "auxiliary_loss_clip": 0.01148515, "auxiliary_loss_mlp": 0.01038427, "balance_loss_clip": 1.02182078, "balance_loss_mlp": 1.05023217, "epoch": 0.2810160829700887, "flos": 20923496346240.0, "grad_norm": 2.200038764517837, "language_loss": 0.72147846, "learning_rate": 3.270301023250237e-06, "loss": 0.74334788, "num_input_tokens_seen": 100915100, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.89453125, "step": 4674, "time_per_iteration": 2.605750322341919 }, { "auxiliary_loss_clip": 0.01153885, "auxiliary_loss_mlp": 0.01043878, "balance_loss_clip": 1.0252099, "balance_loss_mlp": 1.05434442, "epoch": 0.28107620622275664, "flos": 14355901808640.0, "grad_norm": 1.9905346828918506, "language_loss": 0.76630002, "learning_rate": 3.270009175005171e-06, "loss": 0.78827763, "num_input_tokens_seen": 100932795, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.90625, "step": 4675, "time_per_iteration": 3.975053071975708 }, { "auxiliary_loss_clip": 0.01141292, "auxiliary_loss_mlp": 0.01046538, "balance_loss_clip": 1.02885985, "balance_loss_mlp": 1.05061042, "epoch": 0.2811363294754246, "flos": 20919078973440.0, "grad_norm": 1.790680839730517, "language_loss": 0.69909966, "learning_rate": 3.2697172814367447e-06, "loss": 0.72097796, "num_input_tokens_seen": 100950505, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.90625, "step": 4676, "time_per_iteration": 2.5726816654205322 }, { "auxiliary_loss_clip": 0.01137033, "auxiliary_loss_mlp": 0.01042505, "balance_loss_clip": 1.02616179, "balance_loss_mlp": 1.05296731, "epoch": 0.28119645272809257, "flos": 20594841880320.0, "grad_norm": 1.6537202495858123, "language_loss": 0.70409411, "learning_rate": 3.269425342555375e-06, "loss": 0.7258895, "num_input_tokens_seen": 100968790, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.83984375, "step": 4677, "time_per_iteration": 2.5912058353424072 }, { "auxiliary_loss_clip": 0.01148801, "auxiliary_loss_mlp": 0.01042088, "balance_loss_clip": 1.02357507, "balance_loss_mlp": 1.05148244, "epoch": 0.28125657598076054, "flos": 25337420248320.0, "grad_norm": 1.7980734218579368, "language_loss": 0.63837993, "learning_rate": 3.26913335837148e-06, "loss": 0.66028881, "num_input_tokens_seen": 100990205, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.8828125, "step": 4678, "time_per_iteration": 2.6043457984924316 }, { "auxiliary_loss_clip": 0.01149009, "auxiliary_loss_mlp": 0.01045856, "balance_loss_clip": 1.02964389, "balance_loss_mlp": 1.05245352, "epoch": 0.2813166992334285, "flos": 24827093769600.0, "grad_norm": 1.5999064909787466, "language_loss": 0.7082305, "learning_rate": 3.26884132889548e-06, "loss": 0.73017913, "num_input_tokens_seen": 101009815, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.875, "step": 4679, "time_per_iteration": 2.7586452960968018 }, { "auxiliary_loss_clip": 0.01147648, "auxiliary_loss_mlp": 0.01041226, "balance_loss_clip": 1.02434635, "balance_loss_mlp": 1.05007601, "epoch": 0.2813768224860965, "flos": 21760753438080.0, "grad_norm": 2.2233177684283443, "language_loss": 0.74507242, "learning_rate": 3.268549254137797e-06, "loss": 0.76696116, "num_input_tokens_seen": 101026780, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.88671875, "step": 4680, "time_per_iteration": 2.5195553302764893 }, { "auxiliary_loss_clip": 0.0114203, "auxiliary_loss_mlp": 0.01037889, "balance_loss_clip": 1.02164054, "balance_loss_mlp": 1.05265331, "epoch": 0.2814369457387645, "flos": 24316803204480.0, "grad_norm": 2.382823489615285, "language_loss": 0.77151257, "learning_rate": 3.2682571341088537e-06, "loss": 0.79331172, "num_input_tokens_seen": 101046215, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.89453125, "step": 4681, "time_per_iteration": 2.614414691925049 }, { "auxiliary_loss_clip": 0.01150891, "auxiliary_loss_mlp": 0.01037013, "balance_loss_clip": 1.01923895, "balance_loss_mlp": 1.05182219, "epoch": 0.28149706899143245, "flos": 18515326872960.0, "grad_norm": 1.9509096477702292, "language_loss": 0.73940074, "learning_rate": 3.2679649688190765e-06, "loss": 0.76127982, "num_input_tokens_seen": 101063365, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.90234375, "step": 4682, "time_per_iteration": 2.5306317806243896 }, { "auxiliary_loss_clip": 0.01138059, "auxiliary_loss_mlp": 0.01043555, "balance_loss_clip": 1.026842, "balance_loss_mlp": 1.05140746, "epoch": 0.2815571922441004, "flos": 24863255786880.0, "grad_norm": 1.4841185941316979, "language_loss": 0.80905807, "learning_rate": 3.2676727582788904e-06, "loss": 0.8308742, "num_input_tokens_seen": 101083835, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8671875, "step": 4683, "time_per_iteration": 2.6485724449157715 }, { "auxiliary_loss_clip": 0.01177877, "auxiliary_loss_mlp": 0.01046128, "balance_loss_clip": 1.02881908, "balance_loss_mlp": 1.05192029, "epoch": 0.2816173154967684, "flos": 19901622326400.0, "grad_norm": 2.8826886430875343, "language_loss": 0.76067877, "learning_rate": 3.2673805024987246e-06, "loss": 0.78291893, "num_input_tokens_seen": 101101740, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8984375, "step": 4684, "time_per_iteration": 2.6392977237701416 }, { "auxiliary_loss_clip": 0.01172469, "auxiliary_loss_mlp": 0.01041962, "balance_loss_clip": 1.0254879, "balance_loss_mlp": 1.05030918, "epoch": 0.28167743874943635, "flos": 17491333950720.0, "grad_norm": 1.8120449825190235, "language_loss": 0.76451188, "learning_rate": 3.2670882014890085e-06, "loss": 0.7866562, "num_input_tokens_seen": 101120480, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8671875, "step": 4685, "time_per_iteration": 2.628265142440796 }, { "auxiliary_loss_clip": 0.01156407, "auxiliary_loss_mlp": 0.01044568, "balance_loss_clip": 1.02806926, "balance_loss_mlp": 1.05260813, "epoch": 0.2817375620021043, "flos": 25302120157440.0, "grad_norm": 1.4522515931452893, "language_loss": 0.75516903, "learning_rate": 3.2667958552601747e-06, "loss": 0.77717876, "num_input_tokens_seen": 101142910, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.859375, "step": 4686, "time_per_iteration": 2.6415302753448486 }, { "auxiliary_loss_clip": 0.01161764, "auxiliary_loss_mlp": 0.01051204, "balance_loss_clip": 1.03337073, "balance_loss_mlp": 1.05282974, "epoch": 0.2817976852547723, "flos": 18693227957760.0, "grad_norm": 2.4329093016955317, "language_loss": 0.62324286, "learning_rate": 3.266503463822655e-06, "loss": 0.64537251, "num_input_tokens_seen": 101160030, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.90625, "step": 4687, "time_per_iteration": 2.5726490020751953 }, { "auxiliary_loss_clip": 0.01160963, "auxiliary_loss_mlp": 0.01051901, "balance_loss_clip": 1.0349977, "balance_loss_mlp": 1.0527432, "epoch": 0.28185780850744024, "flos": 22742263549440.0, "grad_norm": 2.3477022013852458, "language_loss": 0.7705133, "learning_rate": 3.266211027186884e-06, "loss": 0.79264188, "num_input_tokens_seen": 101177675, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90234375, "step": 4688, "time_per_iteration": 2.5786266326904297 }, { "auxiliary_loss_clip": 0.01168008, "auxiliary_loss_mlp": 0.01038761, "balance_loss_clip": 1.02213693, "balance_loss_mlp": 1.05316389, "epoch": 0.2819179317601082, "flos": 14933919467520.0, "grad_norm": 2.250737668032571, "language_loss": 0.77852488, "learning_rate": 3.265918545363299e-06, "loss": 0.8005926, "num_input_tokens_seen": 101192225, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.875, "step": 4689, "time_per_iteration": 2.6076817512512207 }, { "auxiliary_loss_clip": 0.01166483, "auxiliary_loss_mlp": 0.01040098, "balance_loss_clip": 1.02346849, "balance_loss_mlp": 1.0517838, "epoch": 0.2819780550127762, "flos": 23145325038720.0, "grad_norm": 1.6356690006555614, "language_loss": 0.78074706, "learning_rate": 3.2656260183623373e-06, "loss": 0.80281287, "num_input_tokens_seen": 101210870, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.87890625, "step": 4690, "time_per_iteration": 2.6171371936798096 }, { "auxiliary_loss_clip": 0.01140709, "auxiliary_loss_mlp": 0.01294358, "balance_loss_clip": 1.03067315, "balance_loss_mlp": 1.05193329, "epoch": 0.28203817826544414, "flos": 21616356764160.0, "grad_norm": 2.0939293525042904, "language_loss": 0.87233973, "learning_rate": 3.265333446194439e-06, "loss": 0.89669049, "num_input_tokens_seen": 101229965, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.88671875, "step": 4691, "time_per_iteration": 2.5768730640411377 }, { "auxiliary_loss_clip": 0.01163481, "auxiliary_loss_mlp": 0.01044671, "balance_loss_clip": 1.02755296, "balance_loss_mlp": 1.05385888, "epoch": 0.2820983015181121, "flos": 24026788794240.0, "grad_norm": 1.701794342293763, "language_loss": 0.82003909, "learning_rate": 3.2650408288700442e-06, "loss": 0.84212059, "num_input_tokens_seen": 101250980, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9140625, "step": 4692, "time_per_iteration": 2.6275360584259033 }, { "auxiliary_loss_clip": 0.01170938, "auxiliary_loss_mlp": 0.01037042, "balance_loss_clip": 1.01961374, "balance_loss_mlp": 1.05470777, "epoch": 0.2821584247707801, "flos": 30007925976960.0, "grad_norm": 1.5410759606937443, "language_loss": 0.74520671, "learning_rate": 3.264748166399596e-06, "loss": 0.76728654, "num_input_tokens_seen": 101273335, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.890625, "step": 4693, "time_per_iteration": 2.6488237380981445 }, { "auxiliary_loss_clip": 0.01171245, "auxiliary_loss_mlp": 0.01036702, "balance_loss_clip": 1.01983392, "balance_loss_mlp": 1.0536176, "epoch": 0.2822185480234481, "flos": 21396762967680.0, "grad_norm": 1.5308749547648026, "language_loss": 0.77218598, "learning_rate": 3.2644554587935397e-06, "loss": 0.79426551, "num_input_tokens_seen": 101292110, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.91015625, "step": 4694, "time_per_iteration": 2.665506601333618 }, { "auxiliary_loss_clip": 0.01149375, "auxiliary_loss_mlp": 0.01043266, "balance_loss_clip": 1.02682674, "balance_loss_mlp": 1.05412507, "epoch": 0.28227867127611606, "flos": 27452809964160.0, "grad_norm": 2.6048065814504473, "language_loss": 0.66716021, "learning_rate": 3.2641627060623205e-06, "loss": 0.68908656, "num_input_tokens_seen": 101312815, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.86328125, "step": 4695, "time_per_iteration": 2.594616174697876 }, { "auxiliary_loss_clip": 0.01163933, "auxiliary_loss_mlp": 0.01051251, "balance_loss_clip": 1.03251088, "balance_loss_mlp": 1.05538034, "epoch": 0.282338794528784, "flos": 22593736811520.0, "grad_norm": 2.902955715008614, "language_loss": 0.75395393, "learning_rate": 3.263869908216387e-06, "loss": 0.77610576, "num_input_tokens_seen": 101329045, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.90625, "step": 4696, "time_per_iteration": 2.6128249168395996 }, { "auxiliary_loss_clip": 0.01174344, "auxiliary_loss_mlp": 0.01047983, "balance_loss_clip": 1.03171086, "balance_loss_mlp": 1.05738664, "epoch": 0.282398917781452, "flos": 42010923386880.0, "grad_norm": 1.5670918746528866, "language_loss": 0.62604713, "learning_rate": 3.2635770652661866e-06, "loss": 0.64827037, "num_input_tokens_seen": 101352715, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.89453125, "step": 4697, "time_per_iteration": 2.7234058380126953 }, { "auxiliary_loss_clip": 0.01167706, "auxiliary_loss_mlp": 0.01036307, "balance_loss_clip": 1.0202136, "balance_loss_mlp": 1.05423093, "epoch": 0.28245904103411995, "flos": 23224724052480.0, "grad_norm": 1.5878350911217487, "language_loss": 0.72997648, "learning_rate": 3.263284177222171e-06, "loss": 0.75201654, "num_input_tokens_seen": 101374640, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8671875, "step": 4698, "time_per_iteration": 2.7228429317474365 }, { "auxiliary_loss_clip": 0.01140104, "auxiliary_loss_mlp": 0.01046865, "balance_loss_clip": 1.03059244, "balance_loss_mlp": 1.05367994, "epoch": 0.2825191642867879, "flos": 25374623760000.0, "grad_norm": 1.6276247167654971, "language_loss": 0.75032181, "learning_rate": 3.2629912440947927e-06, "loss": 0.77219152, "num_input_tokens_seen": 101393595, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.86328125, "step": 4699, "time_per_iteration": 2.561481237411499 }, { "auxiliary_loss_clip": 0.01155249, "auxiliary_loss_mlp": 0.01039501, "balance_loss_clip": 1.02153611, "balance_loss_mlp": 1.05645323, "epoch": 0.2825792875394559, "flos": 17236799199360.0, "grad_norm": 2.3503783006306476, "language_loss": 0.79881763, "learning_rate": 3.262698265894506e-06, "loss": 0.8207652, "num_input_tokens_seen": 101409265, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.8984375, "step": 4700, "time_per_iteration": 2.5988881587982178 }, { "auxiliary_loss_clip": 0.01156154, "auxiliary_loss_mlp": 0.01045551, "balance_loss_clip": 1.0305779, "balance_loss_mlp": 1.05554795, "epoch": 0.28263941079212385, "flos": 26723967096960.0, "grad_norm": 1.7353961114512946, "language_loss": 0.81159854, "learning_rate": 3.2624052426317664e-06, "loss": 0.8336156, "num_input_tokens_seen": 101428365, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.828125, "step": 4701, "time_per_iteration": 2.7438101768493652 }, { "auxiliary_loss_clip": 0.01195142, "auxiliary_loss_mlp": 0.01041224, "balance_loss_clip": 1.02480841, "balance_loss_mlp": 1.05408144, "epoch": 0.2826995340447918, "flos": 25921327737600.0, "grad_norm": 2.0779198619771155, "language_loss": 0.73226404, "learning_rate": 3.26211217431703e-06, "loss": 0.7546277, "num_input_tokens_seen": 101447280, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.875, "step": 4702, "time_per_iteration": 2.6604361534118652 }, { "auxiliary_loss_clip": 0.011422, "auxiliary_loss_mlp": 0.01040789, "balance_loss_clip": 1.02440977, "balance_loss_mlp": 1.05634618, "epoch": 0.2827596572974598, "flos": 22379709623040.0, "grad_norm": 1.865607155932029, "language_loss": 0.78767133, "learning_rate": 3.2618190609607577e-06, "loss": 0.80950129, "num_input_tokens_seen": 101465435, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.859375, "step": 4703, "time_per_iteration": 2.5172369480133057 }, { "auxiliary_loss_clip": 0.01160428, "auxiliary_loss_mlp": 0.0104744, "balance_loss_clip": 1.02986813, "balance_loss_mlp": 1.05429137, "epoch": 0.28281978055012774, "flos": 33547137880320.0, "grad_norm": 1.561794160138155, "language_loss": 0.68961543, "learning_rate": 3.2615259025734085e-06, "loss": 0.71169406, "num_input_tokens_seen": 101486355, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.8828125, "step": 4704, "time_per_iteration": 2.706552505493164 }, { "auxiliary_loss_clip": 0.01171878, "auxiliary_loss_mlp": 0.01044074, "balance_loss_clip": 1.02736044, "balance_loss_mlp": 1.05840135, "epoch": 0.2828799038027957, "flos": 23440870143360.0, "grad_norm": 2.1300572314870565, "language_loss": 0.7008729, "learning_rate": 3.261232699165445e-06, "loss": 0.72303241, "num_input_tokens_seen": 101505875, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.86328125, "step": 4705, "time_per_iteration": 2.7079532146453857 }, { "auxiliary_loss_clip": 0.01086209, "auxiliary_loss_mlp": 0.01008603, "balance_loss_clip": 1.0063498, "balance_loss_mlp": 1.04859328, "epoch": 0.2829400270554637, "flos": 69873690251520.0, "grad_norm": 0.7279035623714769, "language_loss": 0.59217417, "learning_rate": 3.2609394507473305e-06, "loss": 0.61312234, "num_input_tokens_seen": 101565045, "router_z_loss_clip": 0.02258301, "router_z_loss_mlp": 0.28710938, "step": 4706, "time_per_iteration": 3.3415255546569824 }, { "auxiliary_loss_clip": 0.01158145, "auxiliary_loss_mlp": 0.01039369, "balance_loss_clip": 1.02452731, "balance_loss_mlp": 1.05593979, "epoch": 0.2830001503081317, "flos": 24789028331520.0, "grad_norm": 1.4213248601679238, "language_loss": 0.8201912, "learning_rate": 3.2606461573295303e-06, "loss": 0.84216636, "num_input_tokens_seen": 101585825, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.84375, "step": 4707, "time_per_iteration": 2.704214572906494 }, { "auxiliary_loss_clip": 0.01164846, "auxiliary_loss_mlp": 0.01043495, "balance_loss_clip": 1.02585208, "balance_loss_mlp": 1.05633759, "epoch": 0.28306027356079966, "flos": 27669387018240.0, "grad_norm": 1.4570002280541259, "language_loss": 0.80329627, "learning_rate": 3.260352818922512e-06, "loss": 0.82537973, "num_input_tokens_seen": 101606105, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.90625, "step": 4708, "time_per_iteration": 2.6735129356384277 }, { "auxiliary_loss_clip": 0.01093959, "auxiliary_loss_mlp": 0.01001076, "balance_loss_clip": 0.99881059, "balance_loss_mlp": 1.04722738, "epoch": 0.2831203968134676, "flos": 60527938199040.0, "grad_norm": 0.8813198548017532, "language_loss": 0.62724841, "learning_rate": 3.2600594355367434e-06, "loss": 0.64819872, "num_input_tokens_seen": 101656875, "router_z_loss_clip": 0.02270508, "router_z_loss_mlp": 0.28710938, "step": 4709, "time_per_iteration": 4.46778416633606 }, { "auxiliary_loss_clip": 0.01159383, "auxiliary_loss_mlp": 0.01038361, "balance_loss_clip": 1.0223037, "balance_loss_mlp": 1.05617785, "epoch": 0.2831805200661356, "flos": 22054790171520.0, "grad_norm": 1.4331540573444066, "language_loss": 0.73962426, "learning_rate": 3.259766007182695e-06, "loss": 0.76160163, "num_input_tokens_seen": 101676225, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8515625, "step": 4710, "time_per_iteration": 2.604384422302246 }, { "auxiliary_loss_clip": 0.01171296, "auxiliary_loss_mlp": 0.01050713, "balance_loss_clip": 1.03330827, "balance_loss_mlp": 1.05602562, "epoch": 0.28324064331880355, "flos": 22600668136320.0, "grad_norm": 1.8541636519353544, "language_loss": 0.78780377, "learning_rate": 3.259472533870838e-06, "loss": 0.81002384, "num_input_tokens_seen": 101693710, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8828125, "step": 4711, "time_per_iteration": 2.575531005859375 }, { "auxiliary_loss_clip": 0.01152159, "auxiliary_loss_mlp": 0.01293068, "balance_loss_clip": 1.02784419, "balance_loss_mlp": 1.05402148, "epoch": 0.2833007665714715, "flos": 30404127968640.0, "grad_norm": 2.580098902694198, "language_loss": 0.70875913, "learning_rate": 3.2591790156116466e-06, "loss": 0.7332114, "num_input_tokens_seen": 101714010, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.89453125, "step": 4712, "time_per_iteration": 2.674661159515381 }, { "auxiliary_loss_clip": 0.01146226, "auxiliary_loss_mlp": 0.01054092, "balance_loss_clip": 1.036533, "balance_loss_mlp": 1.05707026, "epoch": 0.2833608898241395, "flos": 23549499849600.0, "grad_norm": 2.062089486534331, "language_loss": 0.81835175, "learning_rate": 3.258885452415595e-06, "loss": 0.84035492, "num_input_tokens_seen": 101732995, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.890625, "step": 4713, "time_per_iteration": 2.567894458770752 }, { "auxiliary_loss_clip": 0.01160797, "auxiliary_loss_mlp": 0.01043582, "balance_loss_clip": 1.02623689, "balance_loss_mlp": 1.05646789, "epoch": 0.28342101307680745, "flos": 20266726118400.0, "grad_norm": 1.9608858389724693, "language_loss": 0.75172859, "learning_rate": 3.2585918442931595e-06, "loss": 0.77377248, "num_input_tokens_seen": 101751385, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8671875, "step": 4714, "time_per_iteration": 4.008214235305786 }, { "auxiliary_loss_clip": 0.01147185, "auxiliary_loss_mlp": 0.01046046, "balance_loss_clip": 1.0296185, "balance_loss_mlp": 1.05250263, "epoch": 0.2834811363294754, "flos": 30847050576000.0, "grad_norm": 1.365585701733153, "language_loss": 0.78255546, "learning_rate": 3.258298191254818e-06, "loss": 0.80448782, "num_input_tokens_seen": 101773825, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.859375, "step": 4715, "time_per_iteration": 4.267711162567139 }, { "auxiliary_loss_clip": 0.01159021, "auxiliary_loss_mlp": 0.01044014, "balance_loss_clip": 1.02733684, "balance_loss_mlp": 1.05503225, "epoch": 0.2835412595821434, "flos": 22711021695360.0, "grad_norm": 2.3287057722922446, "language_loss": 0.74015248, "learning_rate": 3.2580044933110513e-06, "loss": 0.76218277, "num_input_tokens_seen": 101791920, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.86328125, "step": 4716, "time_per_iteration": 2.5700531005859375 }, { "auxiliary_loss_clip": 0.01162927, "auxiliary_loss_mlp": 0.01046634, "balance_loss_clip": 1.02882373, "balance_loss_mlp": 1.05108225, "epoch": 0.28360138283481134, "flos": 18077719478400.0, "grad_norm": 3.1225547889801963, "language_loss": 0.74500203, "learning_rate": 3.2577107504723403e-06, "loss": 0.76709759, "num_input_tokens_seen": 101809515, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9375, "step": 4717, "time_per_iteration": 3.9929733276367188 }, { "auxiliary_loss_clip": 0.01170341, "auxiliary_loss_mlp": 0.01043342, "balance_loss_clip": 1.0262233, "balance_loss_mlp": 1.05413461, "epoch": 0.2836615060874793, "flos": 17854785717120.0, "grad_norm": 1.5880995007930296, "language_loss": 0.66695964, "learning_rate": 3.2574169627491683e-06, "loss": 0.68909645, "num_input_tokens_seen": 101827735, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.890625, "step": 4718, "time_per_iteration": 2.5906729698181152 }, { "auxiliary_loss_clip": 0.0116051, "auxiliary_loss_mlp": 0.01045493, "balance_loss_clip": 1.02795696, "balance_loss_mlp": 1.0531249, "epoch": 0.2837216293401473, "flos": 25740302169600.0, "grad_norm": 1.8085187433763634, "language_loss": 0.72394508, "learning_rate": 3.2571231301520187e-06, "loss": 0.74600512, "num_input_tokens_seen": 101845970, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.8984375, "step": 4719, "time_per_iteration": 2.6420938968658447 }, { "auxiliary_loss_clip": 0.01148442, "auxiliary_loss_mlp": 0.01042225, "balance_loss_clip": 1.02552366, "balance_loss_mlp": 1.053339, "epoch": 0.2837817525928153, "flos": 20923532259840.0, "grad_norm": 2.0077253783875193, "language_loss": 0.80230069, "learning_rate": 3.2568292526913785e-06, "loss": 0.82420737, "num_input_tokens_seen": 101865040, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.86328125, "step": 4720, "time_per_iteration": 2.6586546897888184 }, { "auxiliary_loss_clip": 0.01150901, "auxiliary_loss_mlp": 0.01041344, "balance_loss_clip": 1.02407026, "balance_loss_mlp": 1.05367577, "epoch": 0.28384187584548326, "flos": 18916700423040.0, "grad_norm": 1.7990196712723552, "language_loss": 0.79739821, "learning_rate": 3.2565353303777353e-06, "loss": 0.81932056, "num_input_tokens_seen": 101883735, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8828125, "step": 4721, "time_per_iteration": 2.5731470584869385 }, { "auxiliary_loss_clip": 0.01174088, "auxiliary_loss_mlp": 0.01284821, "balance_loss_clip": 1.02036715, "balance_loss_mlp": 1.05177307, "epoch": 0.2839019990981512, "flos": 27343964776320.0, "grad_norm": 1.839159849644889, "language_loss": 0.82618511, "learning_rate": 3.256241363221578e-06, "loss": 0.85077417, "num_input_tokens_seen": 101903025, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8671875, "step": 4722, "time_per_iteration": 2.7967729568481445 }, { "auxiliary_loss_clip": 0.0114256, "auxiliary_loss_mlp": 0.0104108, "balance_loss_clip": 1.02509427, "balance_loss_mlp": 1.05493116, "epoch": 0.2839621223508192, "flos": 18114312458880.0, "grad_norm": 1.7351741639821663, "language_loss": 0.70396698, "learning_rate": 3.2559473512333986e-06, "loss": 0.72580338, "num_input_tokens_seen": 101922255, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.875, "step": 4723, "time_per_iteration": 2.5297133922576904 }, { "auxiliary_loss_clip": 0.01158102, "auxiliary_loss_mlp": 0.01044029, "balance_loss_clip": 1.02655315, "balance_loss_mlp": 1.05315959, "epoch": 0.28402224560348716, "flos": 26358360514560.0, "grad_norm": 1.6602675040745751, "language_loss": 0.7832492, "learning_rate": 3.2556532944236886e-06, "loss": 0.80527049, "num_input_tokens_seen": 101943100, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8671875, "step": 4724, "time_per_iteration": 2.698884963989258 }, { "auxiliary_loss_clip": 0.01159153, "auxiliary_loss_mlp": 0.01047571, "balance_loss_clip": 1.03077388, "balance_loss_mlp": 1.05197549, "epoch": 0.2840823688561551, "flos": 24060795995520.0, "grad_norm": 1.939597591153656, "language_loss": 0.92224121, "learning_rate": 3.2553591928029423e-06, "loss": 0.9443084, "num_input_tokens_seen": 101963160, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.890625, "step": 4725, "time_per_iteration": 2.598658800125122 }, { "auxiliary_loss_clip": 0.01151215, "auxiliary_loss_mlp": 0.01041376, "balance_loss_clip": 1.02348268, "balance_loss_mlp": 1.05281162, "epoch": 0.2841424921088231, "flos": 29459821368960.0, "grad_norm": 1.6920225132701994, "language_loss": 0.88638973, "learning_rate": 3.2550650463816557e-06, "loss": 0.90831566, "num_input_tokens_seen": 101984300, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.89453125, "step": 4726, "time_per_iteration": 2.69767165184021 }, { "auxiliary_loss_clip": 0.01152063, "auxiliary_loss_mlp": 0.01043978, "balance_loss_clip": 1.02657342, "balance_loss_mlp": 1.0533309, "epoch": 0.28420261536149105, "flos": 48100367053440.0, "grad_norm": 1.9286429810380974, "language_loss": 0.78748453, "learning_rate": 3.2547708551703256e-06, "loss": 0.80944496, "num_input_tokens_seen": 102005765, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8984375, "step": 4727, "time_per_iteration": 2.8353965282440186 }, { "auxiliary_loss_clip": 0.01145883, "auxiliary_loss_mlp": 0.01041056, "balance_loss_clip": 1.02541614, "balance_loss_mlp": 1.05256283, "epoch": 0.284262738614159, "flos": 25666146541440.0, "grad_norm": 2.372255277586458, "language_loss": 0.6679666, "learning_rate": 3.254476619179452e-06, "loss": 0.68983603, "num_input_tokens_seen": 102022755, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.84375, "step": 4728, "time_per_iteration": 2.648505687713623 }, { "auxiliary_loss_clip": 0.011646, "auxiliary_loss_mlp": 0.01047886, "balance_loss_clip": 1.03105378, "balance_loss_mlp": 1.05062401, "epoch": 0.284322861866827, "flos": 19718980646400.0, "grad_norm": 1.7787282296976696, "language_loss": 0.76254898, "learning_rate": 3.2541823384195344e-06, "loss": 0.78467381, "num_input_tokens_seen": 102041850, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.87109375, "step": 4729, "time_per_iteration": 2.6809871196746826 }, { "auxiliary_loss_clip": 0.01162496, "auxiliary_loss_mlp": 0.01051461, "balance_loss_clip": 1.03406823, "balance_loss_mlp": 1.05380487, "epoch": 0.28438298511949495, "flos": 23915250086400.0, "grad_norm": 1.6384344893093716, "language_loss": 0.66775066, "learning_rate": 3.253888012901075e-06, "loss": 0.68989021, "num_input_tokens_seen": 102059500, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.90625, "step": 4730, "time_per_iteration": 2.7409281730651855 }, { "auxiliary_loss_clip": 0.0116005, "auxiliary_loss_mlp": 0.0104207, "balance_loss_clip": 1.02465367, "balance_loss_mlp": 1.05343866, "epoch": 0.2844431083721629, "flos": 26067340523520.0, "grad_norm": 1.6831779917752596, "language_loss": 0.74793398, "learning_rate": 3.253593642634578e-06, "loss": 0.76995522, "num_input_tokens_seen": 102080460, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.88671875, "step": 4731, "time_per_iteration": 2.659367322921753 }, { "auxiliary_loss_clip": 0.01167801, "auxiliary_loss_mlp": 0.01035068, "balance_loss_clip": 1.01825964, "balance_loss_mlp": 1.05157638, "epoch": 0.2845032316248309, "flos": 25810435474560.0, "grad_norm": 1.5019762565159058, "language_loss": 0.83833933, "learning_rate": 3.2532992276305492e-06, "loss": 0.86036801, "num_input_tokens_seen": 102100950, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.890625, "step": 4732, "time_per_iteration": 2.5907959938049316 }, { "auxiliary_loss_clip": 0.01151248, "auxiliary_loss_mlp": 0.01049919, "balance_loss_clip": 1.03126287, "balance_loss_mlp": 1.05416632, "epoch": 0.2845633548774989, "flos": 19823192979840.0, "grad_norm": 1.9614346574738046, "language_loss": 0.78485978, "learning_rate": 3.253004767899494e-06, "loss": 0.80687141, "num_input_tokens_seen": 102119345, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.8828125, "step": 4733, "time_per_iteration": 2.5679008960723877 }, { "auxiliary_loss_clip": 0.01147485, "auxiliary_loss_mlp": 0.01051539, "balance_loss_clip": 1.03450418, "balance_loss_mlp": 1.05493665, "epoch": 0.28462347813016686, "flos": 23182815859200.0, "grad_norm": 2.722076385004832, "language_loss": 0.71138936, "learning_rate": 3.252710263451922e-06, "loss": 0.7333796, "num_input_tokens_seen": 102139050, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.92578125, "step": 4734, "time_per_iteration": 2.5565195083618164 }, { "auxiliary_loss_clip": 0.01152456, "auxiliary_loss_mlp": 0.01033159, "balance_loss_clip": 1.01660097, "balance_loss_mlp": 1.04821587, "epoch": 0.2846836013828348, "flos": 18660477732480.0, "grad_norm": 1.7927181233823861, "language_loss": 0.74005282, "learning_rate": 3.2524157142983432e-06, "loss": 0.76190895, "num_input_tokens_seen": 102157935, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8671875, "step": 4735, "time_per_iteration": 2.6335816383361816 }, { "auxiliary_loss_clip": 0.01157739, "auxiliary_loss_mlp": 0.01045363, "balance_loss_clip": 1.02952003, "balance_loss_mlp": 1.05277884, "epoch": 0.2847437246355028, "flos": 14173511523840.0, "grad_norm": 1.9899922319253927, "language_loss": 0.79329389, "learning_rate": 3.252121120449269e-06, "loss": 0.8153249, "num_input_tokens_seen": 102175325, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.875, "step": 4736, "time_per_iteration": 2.5323503017425537 }, { "auxiliary_loss_clip": 0.01143039, "auxiliary_loss_mlp": 0.01047423, "balance_loss_clip": 1.02992356, "balance_loss_mlp": 1.05435658, "epoch": 0.28480384788817076, "flos": 29278364837760.0, "grad_norm": 2.671113855031217, "language_loss": 0.58980644, "learning_rate": 3.251826481915213e-06, "loss": 0.61171108, "num_input_tokens_seen": 102196625, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.88671875, "step": 4737, "time_per_iteration": 2.6512179374694824 }, { "auxiliary_loss_clip": 0.01144926, "auxiliary_loss_mlp": 0.01039395, "balance_loss_clip": 1.02277756, "balance_loss_mlp": 1.05111432, "epoch": 0.2848639711408387, "flos": 22601314581120.0, "grad_norm": 1.987375229669505, "language_loss": 0.86520737, "learning_rate": 3.2515317987066894e-06, "loss": 0.88705063, "num_input_tokens_seen": 102214975, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.84765625, "step": 4738, "time_per_iteration": 2.5167298316955566 }, { "auxiliary_loss_clip": 0.0116805, "auxiliary_loss_mlp": 0.01049594, "balance_loss_clip": 1.03103328, "balance_loss_mlp": 1.051548, "epoch": 0.2849240943935067, "flos": 17599460866560.0, "grad_norm": 2.4707174573539867, "language_loss": 0.89823931, "learning_rate": 3.2512370708342155e-06, "loss": 0.92041576, "num_input_tokens_seen": 102231885, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.8984375, "step": 4739, "time_per_iteration": 2.5975823402404785 }, { "auxiliary_loss_clip": 0.01150923, "auxiliary_loss_mlp": 0.01041267, "balance_loss_clip": 1.02488732, "balance_loss_mlp": 1.05560124, "epoch": 0.28498421764617465, "flos": 24862573428480.0, "grad_norm": 1.5664542979270162, "language_loss": 0.72148466, "learning_rate": 3.25094229830831e-06, "loss": 0.74340653, "num_input_tokens_seen": 102252725, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.86328125, "step": 4740, "time_per_iteration": 2.63812518119812 }, { "auxiliary_loss_clip": 0.01156131, "auxiliary_loss_mlp": 0.01041459, "balance_loss_clip": 1.02572381, "balance_loss_mlp": 1.05359149, "epoch": 0.2850443408988426, "flos": 22782555630720.0, "grad_norm": 1.5475310202960337, "language_loss": 0.77886736, "learning_rate": 3.2506474811394907e-06, "loss": 0.80084324, "num_input_tokens_seen": 102271730, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84375, "step": 4741, "time_per_iteration": 2.804166793823242 }, { "auxiliary_loss_clip": 0.01167631, "auxiliary_loss_mlp": 0.01042623, "balance_loss_clip": 1.02515841, "balance_loss_mlp": 1.04936755, "epoch": 0.2851044641515106, "flos": 18844053166080.0, "grad_norm": 1.7687288207114293, "language_loss": 0.76484096, "learning_rate": 3.2503526193382796e-06, "loss": 0.78694355, "num_input_tokens_seen": 102291325, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9140625, "step": 4742, "time_per_iteration": 2.5614876747131348 }, { "auxiliary_loss_clip": 0.01145128, "auxiliary_loss_mlp": 0.01051556, "balance_loss_clip": 1.03254223, "balance_loss_mlp": 1.05307293, "epoch": 0.28516458740417855, "flos": 18880502492160.0, "grad_norm": 2.005784542077737, "language_loss": 0.57383728, "learning_rate": 3.2500577129152004e-06, "loss": 0.59580415, "num_input_tokens_seen": 102309000, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.921875, "step": 4743, "time_per_iteration": 2.5773861408233643 }, { "auxiliary_loss_clip": 0.01171187, "auxiliary_loss_mlp": 0.0105205, "balance_loss_clip": 1.03437114, "balance_loss_mlp": 1.05412924, "epoch": 0.2852247106568465, "flos": 25299821687040.0, "grad_norm": 1.7221900093793283, "language_loss": 0.74510384, "learning_rate": 3.2497627618807767e-06, "loss": 0.76733619, "num_input_tokens_seen": 102329240, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.90234375, "step": 4744, "time_per_iteration": 2.621908187866211 }, { "auxiliary_loss_clip": 0.01149201, "auxiliary_loss_mlp": 0.0104096, "balance_loss_clip": 1.02486718, "balance_loss_mlp": 1.05376959, "epoch": 0.2852848339095145, "flos": 11655383541120.0, "grad_norm": 2.1418254334358506, "language_loss": 0.77164042, "learning_rate": 3.2494677662455355e-06, "loss": 0.79354197, "num_input_tokens_seen": 102344440, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8671875, "step": 4745, "time_per_iteration": 2.6371397972106934 }, { "auxiliary_loss_clip": 0.01142752, "auxiliary_loss_mlp": 0.01040704, "balance_loss_clip": 1.02484918, "balance_loss_mlp": 1.05236375, "epoch": 0.2853449571621825, "flos": 12933228856320.0, "grad_norm": 2.7286177963602927, "language_loss": 0.82796484, "learning_rate": 3.249172726020003e-06, "loss": 0.84979939, "num_input_tokens_seen": 102360985, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8125, "step": 4746, "time_per_iteration": 2.5091426372528076 }, { "auxiliary_loss_clip": 0.0117761, "auxiliary_loss_mlp": 0.01038119, "balance_loss_clip": 1.0209645, "balance_loss_mlp": 1.05029738, "epoch": 0.28540508041485046, "flos": 20010575255040.0, "grad_norm": 1.6689654996644503, "language_loss": 0.79657865, "learning_rate": 3.248877641214709e-06, "loss": 0.81873596, "num_input_tokens_seen": 102380320, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9140625, "step": 4747, "time_per_iteration": 2.6488656997680664 }, { "auxiliary_loss_clip": 0.0108174, "auxiliary_loss_mlp": 0.01016799, "balance_loss_clip": 1.01428354, "balance_loss_mlp": 1.04340482, "epoch": 0.28546520366751843, "flos": 68139349966080.0, "grad_norm": 0.7697740527946763, "language_loss": 0.60437453, "learning_rate": 3.248582511840185e-06, "loss": 0.62535995, "num_input_tokens_seen": 102439140, "router_z_loss_clip": 0.02514648, "router_z_loss_mlp": 0.29492188, "step": 4748, "time_per_iteration": 3.111927032470703 }, { "auxiliary_loss_clip": 0.01140109, "auxiliary_loss_mlp": 0.01042379, "balance_loss_clip": 1.02557015, "balance_loss_mlp": 1.05222845, "epoch": 0.2855253269201864, "flos": 13251540205440.0, "grad_norm": 2.102019651171674, "language_loss": 0.80710518, "learning_rate": 3.2482873379069627e-06, "loss": 0.82893002, "num_input_tokens_seen": 102450990, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.87890625, "step": 4749, "time_per_iteration": 2.5474135875701904 }, { "auxiliary_loss_clip": 0.01145981, "auxiliary_loss_mlp": 0.01037177, "balance_loss_clip": 1.02083373, "balance_loss_mlp": 1.05025351, "epoch": 0.28558545017285436, "flos": 28620876337920.0, "grad_norm": 2.154649265236868, "language_loss": 0.70241672, "learning_rate": 3.2479921194255764e-06, "loss": 0.72424835, "num_input_tokens_seen": 102471820, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8671875, "step": 4750, "time_per_iteration": 4.106597185134888 }, { "auxiliary_loss_clip": 0.01143434, "auxiliary_loss_mlp": 0.010351, "balance_loss_clip": 1.01899457, "balance_loss_mlp": 1.05119157, "epoch": 0.2856455734255223, "flos": 34130470752000.0, "grad_norm": 3.3106294082081127, "language_loss": 0.8200146, "learning_rate": 3.2476968564065613e-06, "loss": 0.84179997, "num_input_tokens_seen": 102492625, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.83203125, "step": 4751, "time_per_iteration": 2.7108592987060547 }, { "auxiliary_loss_clip": 0.01138361, "auxiliary_loss_mlp": 0.0104026, "balance_loss_clip": 1.02414298, "balance_loss_mlp": 1.05258822, "epoch": 0.2857056966781903, "flos": 39786149779200.0, "grad_norm": 2.266670232052874, "language_loss": 0.79816264, "learning_rate": 3.247401548860455e-06, "loss": 0.81994891, "num_input_tokens_seen": 102514145, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.859375, "step": 4752, "time_per_iteration": 2.7394402027130127 }, { "auxiliary_loss_clip": 0.01146888, "auxiliary_loss_mlp": 0.01041895, "balance_loss_clip": 1.02435899, "balance_loss_mlp": 1.05051744, "epoch": 0.28576581993085826, "flos": 21872292145920.0, "grad_norm": 1.732139999362965, "language_loss": 0.78352082, "learning_rate": 3.247106196797796e-06, "loss": 0.80540866, "num_input_tokens_seen": 102532365, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.875, "step": 4753, "time_per_iteration": 2.6252248287200928 }, { "auxiliary_loss_clip": 0.01142329, "auxiliary_loss_mlp": 0.01039247, "balance_loss_clip": 1.02069807, "balance_loss_mlp": 1.05194068, "epoch": 0.2858259431835262, "flos": 19091656592640.0, "grad_norm": 2.3006536277598886, "language_loss": 0.89667606, "learning_rate": 3.2468108002291256e-06, "loss": 0.91849184, "num_input_tokens_seen": 102548425, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.90234375, "step": 4754, "time_per_iteration": 2.53324818611145 }, { "auxiliary_loss_clip": 0.01143532, "auxiliary_loss_mlp": 0.01045302, "balance_loss_clip": 1.02905345, "balance_loss_mlp": 1.04980099, "epoch": 0.2858860664361942, "flos": 20334309557760.0, "grad_norm": 1.7149652230823589, "language_loss": 0.82466614, "learning_rate": 3.2465153591649835e-06, "loss": 0.84655452, "num_input_tokens_seen": 102566370, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84765625, "step": 4755, "time_per_iteration": 3.9567201137542725 }, { "auxiliary_loss_clip": 0.0117225, "auxiliary_loss_mlp": 0.01043478, "balance_loss_clip": 1.02745676, "balance_loss_mlp": 1.04870462, "epoch": 0.28594618968886215, "flos": 24461738582400.0, "grad_norm": 1.7100758486645278, "language_loss": 0.84055257, "learning_rate": 3.2462198736159157e-06, "loss": 0.86270988, "num_input_tokens_seen": 102588715, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.87890625, "step": 4756, "time_per_iteration": 4.193747520446777 }, { "auxiliary_loss_clip": 0.01095226, "auxiliary_loss_mlp": 0.01001902, "balance_loss_clip": 0.99932712, "balance_loss_mlp": 1.03864455, "epoch": 0.2860063129415301, "flos": 71652850709760.0, "grad_norm": 0.8626238786776776, "language_loss": 0.6109668, "learning_rate": 3.245924343592466e-06, "loss": 0.6319381, "num_input_tokens_seen": 102656715, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.29296875, "step": 4757, "time_per_iteration": 3.3887619972229004 }, { "auxiliary_loss_clip": 0.0113876, "auxiliary_loss_mlp": 0.01035726, "balance_loss_clip": 1.01914442, "balance_loss_mlp": 1.05302465, "epoch": 0.2860664361941981, "flos": 20558679863040.0, "grad_norm": 2.1056858828301066, "language_loss": 0.65432644, "learning_rate": 3.2456287691051815e-06, "loss": 0.67607129, "num_input_tokens_seen": 102676545, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.85546875, "step": 4758, "time_per_iteration": 2.570478916168213 }, { "auxiliary_loss_clip": 0.01146188, "auxiliary_loss_mlp": 0.01037048, "balance_loss_clip": 1.02049577, "balance_loss_mlp": 1.04992914, "epoch": 0.2861265594468661, "flos": 35996389534080.0, "grad_norm": 1.6841095363814185, "language_loss": 0.62668717, "learning_rate": 3.24533315016461e-06, "loss": 0.64851952, "num_input_tokens_seen": 102702875, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.875, "step": 4759, "time_per_iteration": 4.9038238525390625 }, { "auxiliary_loss_clip": 0.01148037, "auxiliary_loss_mlp": 0.01294926, "balance_loss_clip": 1.03001845, "balance_loss_mlp": 1.05375528, "epoch": 0.28618668269953407, "flos": 20047419630720.0, "grad_norm": 1.7767203893844916, "language_loss": 0.73925257, "learning_rate": 3.245037486781302e-06, "loss": 0.76368213, "num_input_tokens_seen": 102723160, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8515625, "step": 4760, "time_per_iteration": 2.5816755294799805 }, { "auxiliary_loss_clip": 0.0115079, "auxiliary_loss_mlp": 0.01035822, "balance_loss_clip": 1.02046788, "balance_loss_mlp": 1.05087149, "epoch": 0.28624680595220203, "flos": 24971849579520.0, "grad_norm": 1.8707028947494397, "language_loss": 0.72597933, "learning_rate": 3.2447417789658083e-06, "loss": 0.74784547, "num_input_tokens_seen": 102743855, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8203125, "step": 4761, "time_per_iteration": 2.736814498901367 }, { "auxiliary_loss_clip": 0.0119003, "auxiliary_loss_mlp": 0.01046214, "balance_loss_clip": 1.02952492, "balance_loss_mlp": 1.04945421, "epoch": 0.28630692920487, "flos": 22492253911680.0, "grad_norm": 2.114029146533993, "language_loss": 0.7431199, "learning_rate": 3.244446026728683e-06, "loss": 0.76548243, "num_input_tokens_seen": 102761370, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8671875, "step": 4762, "time_per_iteration": 2.636482000350952 }, { "auxiliary_loss_clip": 0.0116029, "auxiliary_loss_mlp": 0.01043633, "balance_loss_clip": 1.02775407, "balance_loss_mlp": 1.05009592, "epoch": 0.28636705245753796, "flos": 21249888255360.0, "grad_norm": 1.6351552210417468, "language_loss": 0.76327008, "learning_rate": 3.2441502300804803e-06, "loss": 0.78530931, "num_input_tokens_seen": 102780885, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83984375, "step": 4763, "time_per_iteration": 2.656562328338623 }, { "auxiliary_loss_clip": 0.01159953, "auxiliary_loss_mlp": 0.01038521, "balance_loss_clip": 1.02199817, "balance_loss_mlp": 1.04725432, "epoch": 0.28642717571020593, "flos": 24095772864000.0, "grad_norm": 1.7067866182709592, "language_loss": 0.76881611, "learning_rate": 3.2438543890317557e-06, "loss": 0.79080093, "num_input_tokens_seen": 102801000, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.85546875, "step": 4764, "time_per_iteration": 2.729966163635254 }, { "auxiliary_loss_clip": 0.01165682, "auxiliary_loss_mlp": 0.01044008, "balance_loss_clip": 1.02597165, "balance_loss_mlp": 1.05178523, "epoch": 0.2864872989628739, "flos": 22601386408320.0, "grad_norm": 2.595651993849566, "language_loss": 0.7088635, "learning_rate": 3.2435585035930676e-06, "loss": 0.73096037, "num_input_tokens_seen": 102820230, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.8671875, "step": 4765, "time_per_iteration": 2.765766143798828 }, { "auxiliary_loss_clip": 0.01139302, "auxiliary_loss_mlp": 0.01037406, "balance_loss_clip": 1.02181983, "balance_loss_mlp": 1.04750919, "epoch": 0.28654742221554186, "flos": 32745073138560.0, "grad_norm": 1.45572018195677, "language_loss": 0.75918674, "learning_rate": 3.2432625737749754e-06, "loss": 0.78095382, "num_input_tokens_seen": 102842670, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.828125, "step": 4766, "time_per_iteration": 2.754953145980835 }, { "auxiliary_loss_clip": 0.01144412, "auxiliary_loss_mlp": 0.01040691, "balance_loss_clip": 1.02435923, "balance_loss_mlp": 1.05103195, "epoch": 0.2866075454682098, "flos": 26981626331520.0, "grad_norm": 1.6500843527531033, "language_loss": 0.77806485, "learning_rate": 3.2429665995880397e-06, "loss": 0.79991591, "num_input_tokens_seen": 102864480, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84375, "step": 4767, "time_per_iteration": 2.636547565460205 }, { "auxiliary_loss_clip": 0.01133992, "auxiliary_loss_mlp": 0.01043285, "balance_loss_clip": 1.02698958, "balance_loss_mlp": 1.04913771, "epoch": 0.2866676687208778, "flos": 23253847004160.0, "grad_norm": 4.207410034477566, "language_loss": 0.65084791, "learning_rate": 3.242670581042824e-06, "loss": 0.67262071, "num_input_tokens_seen": 102883740, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.84765625, "step": 4768, "time_per_iteration": 2.561483144760132 }, { "auxiliary_loss_clip": 0.01154962, "auxiliary_loss_mlp": 0.01040679, "balance_loss_clip": 1.02394223, "balance_loss_mlp": 1.04870081, "epoch": 0.28672779197354575, "flos": 21579727870080.0, "grad_norm": 1.602957681952046, "language_loss": 0.70461315, "learning_rate": 3.2423745181498907e-06, "loss": 0.72656953, "num_input_tokens_seen": 102902945, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.87890625, "step": 4769, "time_per_iteration": 2.6192891597747803 }, { "auxiliary_loss_clip": 0.01171427, "auxiliary_loss_mlp": 0.01033966, "balance_loss_clip": 1.0185405, "balance_loss_mlp": 1.04914832, "epoch": 0.2867879152262137, "flos": 19865568049920.0, "grad_norm": 1.5679298154422472, "language_loss": 0.74821287, "learning_rate": 3.2420784109198076e-06, "loss": 0.77026689, "num_input_tokens_seen": 102922405, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.859375, "step": 4770, "time_per_iteration": 2.611727476119995 }, { "auxiliary_loss_clip": 0.01174409, "auxiliary_loss_mlp": 0.01040472, "balance_loss_clip": 1.0241164, "balance_loss_mlp": 1.05127978, "epoch": 0.2868480384788817, "flos": 28213325648640.0, "grad_norm": 2.697295690275625, "language_loss": 0.67569208, "learning_rate": 3.241782259363141e-06, "loss": 0.69784093, "num_input_tokens_seen": 102938980, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87109375, "step": 4771, "time_per_iteration": 2.654761552810669 }, { "auxiliary_loss_clip": 0.0107362, "auxiliary_loss_mlp": 0.01006256, "balance_loss_clip": 1.00428867, "balance_loss_mlp": 1.03632796, "epoch": 0.2869081617315497, "flos": 65424286690560.0, "grad_norm": 0.7700141840569813, "language_loss": 0.56907773, "learning_rate": 3.241486063490459e-06, "loss": 0.58987647, "num_input_tokens_seen": 103000405, "router_z_loss_clip": 0.01965332, "router_z_loss_mlp": 0.28125, "step": 4772, "time_per_iteration": 3.2082090377807617 }, { "auxiliary_loss_clip": 0.01182255, "auxiliary_loss_mlp": 0.01290149, "balance_loss_clip": 1.02640843, "balance_loss_mlp": 1.05053794, "epoch": 0.28696828498421767, "flos": 18660729127680.0, "grad_norm": 2.419230006101106, "language_loss": 0.82968891, "learning_rate": 3.241189823312334e-06, "loss": 0.85441297, "num_input_tokens_seen": 103017970, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8671875, "step": 4773, "time_per_iteration": 2.5928125381469727 }, { "auxiliary_loss_clip": 0.01134025, "auxiliary_loss_mlp": 0.01040484, "balance_loss_clip": 1.02358007, "balance_loss_mlp": 1.04694867, "epoch": 0.28702840823688563, "flos": 23659745667840.0, "grad_norm": 4.671012350176489, "language_loss": 0.77535892, "learning_rate": 3.2408935388393358e-06, "loss": 0.79710406, "num_input_tokens_seen": 103036385, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.87109375, "step": 4774, "time_per_iteration": 2.681628465652466 }, { "auxiliary_loss_clip": 0.01153019, "auxiliary_loss_mlp": 0.01042332, "balance_loss_clip": 1.02528465, "balance_loss_mlp": 1.05025613, "epoch": 0.2870885314895536, "flos": 13804744544640.0, "grad_norm": 2.123490701769595, "language_loss": 0.73559248, "learning_rate": 3.24059721008204e-06, "loss": 0.75754595, "num_input_tokens_seen": 103052170, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.84375, "step": 4775, "time_per_iteration": 2.6561014652252197 }, { "auxiliary_loss_clip": 0.01146082, "auxiliary_loss_mlp": 0.01041941, "balance_loss_clip": 1.02553773, "balance_loss_mlp": 1.05027521, "epoch": 0.28714865474222157, "flos": 17786771314560.0, "grad_norm": 1.6331237851062228, "language_loss": 0.88221127, "learning_rate": 3.2403008370510207e-06, "loss": 0.90409148, "num_input_tokens_seen": 103070510, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8671875, "step": 4776, "time_per_iteration": 2.5522620677948 }, { "auxiliary_loss_clip": 0.01144638, "auxiliary_loss_mlp": 0.01039139, "balance_loss_clip": 1.02337968, "balance_loss_mlp": 1.05047178, "epoch": 0.28720877799488953, "flos": 15997486199040.0, "grad_norm": 1.8177557949364718, "language_loss": 0.74216956, "learning_rate": 3.240004419756855e-06, "loss": 0.76400727, "num_input_tokens_seen": 103089590, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8515625, "step": 4777, "time_per_iteration": 2.5368006229400635 }, { "auxiliary_loss_clip": 0.01160899, "auxiliary_loss_mlp": 0.01041419, "balance_loss_clip": 1.02503943, "balance_loss_mlp": 1.05276728, "epoch": 0.2872689012475575, "flos": 20923137210240.0, "grad_norm": 2.0088157645592637, "language_loss": 0.79786217, "learning_rate": 3.239707958210121e-06, "loss": 0.81988537, "num_input_tokens_seen": 103109080, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8984375, "step": 4778, "time_per_iteration": 2.5940799713134766 }, { "auxiliary_loss_clip": 0.01151815, "auxiliary_loss_mlp": 0.01043737, "balance_loss_clip": 1.02698863, "balance_loss_mlp": 1.05025339, "epoch": 0.28732902450022546, "flos": 21325121291520.0, "grad_norm": 1.8303336890831603, "language_loss": 0.7373448, "learning_rate": 3.239411452421399e-06, "loss": 0.75930035, "num_input_tokens_seen": 103127755, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8359375, "step": 4779, "time_per_iteration": 2.5695784091949463 }, { "auxiliary_loss_clip": 0.01155655, "auxiliary_loss_mlp": 0.01041767, "balance_loss_clip": 1.0243032, "balance_loss_mlp": 1.05114043, "epoch": 0.2873891477528934, "flos": 20850382212480.0, "grad_norm": 1.5819897839321375, "language_loss": 0.75927556, "learning_rate": 3.2391149024012705e-06, "loss": 0.78124976, "num_input_tokens_seen": 103147035, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.86328125, "step": 4780, "time_per_iteration": 2.5944015979766846 }, { "auxiliary_loss_clip": 0.01160483, "auxiliary_loss_mlp": 0.01043828, "balance_loss_clip": 1.0285151, "balance_loss_mlp": 1.04916763, "epoch": 0.2874492710055614, "flos": 17420051410560.0, "grad_norm": 1.6405436564271885, "language_loss": 0.81436855, "learning_rate": 3.238818308160318e-06, "loss": 0.83641171, "num_input_tokens_seen": 103165410, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.84375, "step": 4781, "time_per_iteration": 2.54512357711792 }, { "auxiliary_loss_clip": 0.01173913, "auxiliary_loss_mlp": 0.01043412, "balance_loss_clip": 1.02607942, "balance_loss_mlp": 1.0497148, "epoch": 0.28750939425822936, "flos": 13406818700160.0, "grad_norm": 2.3245588193436073, "language_loss": 0.86180687, "learning_rate": 3.2385216697091277e-06, "loss": 0.88398015, "num_input_tokens_seen": 103183710, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8828125, "step": 4782, "time_per_iteration": 2.569716453552246 }, { "auxiliary_loss_clip": 0.01163001, "auxiliary_loss_mlp": 0.01043521, "balance_loss_clip": 1.02675986, "balance_loss_mlp": 1.05082095, "epoch": 0.2875695175108973, "flos": 21870029589120.0, "grad_norm": 3.2649369541094653, "language_loss": 0.70995152, "learning_rate": 3.238224987058284e-06, "loss": 0.73201668, "num_input_tokens_seen": 103203790, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8515625, "step": 4783, "time_per_iteration": 2.6227331161499023 }, { "auxiliary_loss_clip": 0.01155628, "auxiliary_loss_mlp": 0.01042964, "balance_loss_clip": 1.02654839, "balance_loss_mlp": 1.05291271, "epoch": 0.2876296407635653, "flos": 26245457089920.0, "grad_norm": 2.079213225662365, "language_loss": 0.76824713, "learning_rate": 3.2379282602183757e-06, "loss": 0.79023308, "num_input_tokens_seen": 103223925, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84765625, "step": 4784, "time_per_iteration": 2.670219898223877 }, { "auxiliary_loss_clip": 0.01163545, "auxiliary_loss_mlp": 0.01043809, "balance_loss_clip": 1.02816892, "balance_loss_mlp": 1.05293036, "epoch": 0.28768976401623325, "flos": 25373654092800.0, "grad_norm": 1.4214803289642157, "language_loss": 0.75758147, "learning_rate": 3.237631489199993e-06, "loss": 0.77965498, "num_input_tokens_seen": 103244760, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 4785, "time_per_iteration": 2.734668016433716 }, { "auxiliary_loss_clip": 0.01137098, "auxiliary_loss_mlp": 0.01043426, "balance_loss_clip": 1.02757072, "balance_loss_mlp": 1.05128527, "epoch": 0.28774988726890127, "flos": 30664372982400.0, "grad_norm": 2.1368099844158235, "language_loss": 0.83187246, "learning_rate": 3.2373346740137254e-06, "loss": 0.85367775, "num_input_tokens_seen": 103261995, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.859375, "step": 4786, "time_per_iteration": 2.614064931869507 }, { "auxiliary_loss_clip": 0.01165928, "auxiliary_loss_mlp": 0.01284069, "balance_loss_clip": 1.02219892, "balance_loss_mlp": 1.05340612, "epoch": 0.28781001052156924, "flos": 20595452411520.0, "grad_norm": 1.9273428544866884, "language_loss": 0.79776824, "learning_rate": 3.237037814670166e-06, "loss": 0.82226831, "num_input_tokens_seen": 103279780, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.85546875, "step": 4787, "time_per_iteration": 2.831129312515259 }, { "auxiliary_loss_clip": 0.01155326, "auxiliary_loss_mlp": 0.01038389, "balance_loss_clip": 1.02225995, "balance_loss_mlp": 1.05044365, "epoch": 0.2878701337742372, "flos": 26542330997760.0, "grad_norm": 1.734248450393175, "language_loss": 0.82850468, "learning_rate": 3.2367409111799082e-06, "loss": 0.85044175, "num_input_tokens_seen": 103300580, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8671875, "step": 4788, "time_per_iteration": 2.6440269947052 }, { "auxiliary_loss_clip": 0.0117747, "auxiliary_loss_mlp": 0.01044196, "balance_loss_clip": 1.02762556, "balance_loss_mlp": 1.05337369, "epoch": 0.28793025702690517, "flos": 28146855530880.0, "grad_norm": 1.7878807568531756, "language_loss": 0.73574406, "learning_rate": 3.23644396355355e-06, "loss": 0.75796068, "num_input_tokens_seen": 103320430, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.88671875, "step": 4789, "time_per_iteration": 2.7544891834259033 }, { "auxiliary_loss_clip": 0.01141141, "auxiliary_loss_mlp": 0.01040335, "balance_loss_clip": 1.02483821, "balance_loss_mlp": 1.04910159, "epoch": 0.28799038027957313, "flos": 23805471144960.0, "grad_norm": 1.6957823361351536, "language_loss": 0.83997166, "learning_rate": 3.2361469718016867e-06, "loss": 0.86178648, "num_input_tokens_seen": 103337695, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.83203125, "step": 4790, "time_per_iteration": 2.6015260219573975 }, { "auxiliary_loss_clip": 0.01147419, "auxiliary_loss_mlp": 0.01043214, "balance_loss_clip": 1.02625036, "balance_loss_mlp": 1.05141211, "epoch": 0.2880505035322411, "flos": 22344122223360.0, "grad_norm": 1.8676384712853098, "language_loss": 0.7787354, "learning_rate": 3.2358499359349177e-06, "loss": 0.80064178, "num_input_tokens_seen": 103357010, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8671875, "step": 4791, "time_per_iteration": 2.67332124710083 }, { "auxiliary_loss_clip": 0.01158863, "auxiliary_loss_mlp": 0.01038766, "balance_loss_clip": 1.02243412, "balance_loss_mlp": 1.05162776, "epoch": 0.28811062678490906, "flos": 18004246208640.0, "grad_norm": 3.5862317882361627, "language_loss": 0.70479167, "learning_rate": 3.2355528559638436e-06, "loss": 0.7267679, "num_input_tokens_seen": 103375600, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.890625, "step": 4792, "time_per_iteration": 3.9605445861816406 }, { "auxiliary_loss_clip": 0.01174009, "auxiliary_loss_mlp": 0.01036903, "balance_loss_clip": 1.02046371, "balance_loss_mlp": 1.05230916, "epoch": 0.28817075003757703, "flos": 22090880361600.0, "grad_norm": 1.6852431179664573, "language_loss": 0.79588509, "learning_rate": 3.235255731899066e-06, "loss": 0.81799418, "num_input_tokens_seen": 103395225, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.859375, "step": 4793, "time_per_iteration": 2.6542487144470215 }, { "auxiliary_loss_clip": 0.01155838, "auxiliary_loss_mlp": 0.01040189, "balance_loss_clip": 1.02502513, "balance_loss_mlp": 1.05436134, "epoch": 0.288230873290245, "flos": 41683130847360.0, "grad_norm": 1.699108337549197, "language_loss": 0.78158742, "learning_rate": 3.2349585637511896e-06, "loss": 0.80354768, "num_input_tokens_seen": 103417245, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8359375, "step": 4794, "time_per_iteration": 2.8350837230682373 }, { "auxiliary_loss_clip": 0.01148807, "auxiliary_loss_mlp": 0.01046027, "balance_loss_clip": 1.02946925, "balance_loss_mlp": 1.05277014, "epoch": 0.28829099654291296, "flos": 18624423456000.0, "grad_norm": 2.0016607747035393, "language_loss": 0.82914639, "learning_rate": 3.2346613515308176e-06, "loss": 0.85109472, "num_input_tokens_seen": 103435500, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.875, "step": 4795, "time_per_iteration": 2.6525611877441406 }, { "auxiliary_loss_clip": 0.01143611, "auxiliary_loss_mlp": 0.01042684, "balance_loss_clip": 1.02704394, "balance_loss_mlp": 1.05199111, "epoch": 0.2883511197955809, "flos": 24674832017280.0, "grad_norm": 1.9983958085122642, "language_loss": 0.74498749, "learning_rate": 3.2343640952485586e-06, "loss": 0.76685041, "num_input_tokens_seen": 103451040, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.828125, "step": 4796, "time_per_iteration": 2.6474475860595703 }, { "auxiliary_loss_clip": 0.01152197, "auxiliary_loss_mlp": 0.01043216, "balance_loss_clip": 1.02463078, "balance_loss_mlp": 1.05292797, "epoch": 0.2884112430482489, "flos": 23112143850240.0, "grad_norm": 1.895135695105714, "language_loss": 0.72624296, "learning_rate": 3.23406679491502e-06, "loss": 0.74819708, "num_input_tokens_seen": 103471330, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.90625, "step": 4797, "time_per_iteration": 4.140578508377075 }, { "auxiliary_loss_clip": 0.01164866, "auxiliary_loss_mlp": 0.01285431, "balance_loss_clip": 1.02104664, "balance_loss_mlp": 1.05169904, "epoch": 0.28847136630091685, "flos": 16873347432960.0, "grad_norm": 2.2788834183758326, "language_loss": 0.74464017, "learning_rate": 3.2337694505408117e-06, "loss": 0.7691431, "num_input_tokens_seen": 103488060, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.859375, "step": 4798, "time_per_iteration": 4.080370664596558 }, { "auxiliary_loss_clip": 0.01149158, "auxiliary_loss_mlp": 0.01045933, "balance_loss_clip": 1.02782476, "balance_loss_mlp": 1.05128503, "epoch": 0.2885314895535849, "flos": 25657527277440.0, "grad_norm": 3.1581319035743545, "language_loss": 0.64743793, "learning_rate": 3.2334720621365457e-06, "loss": 0.66938877, "num_input_tokens_seen": 103503600, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.88671875, "step": 4799, "time_per_iteration": 2.6125056743621826 }, { "auxiliary_loss_clip": 0.0113995, "auxiliary_loss_mlp": 0.01050455, "balance_loss_clip": 1.03347969, "balance_loss_mlp": 1.05118477, "epoch": 0.28859161280625284, "flos": 21107251347840.0, "grad_norm": 1.5847081362873652, "language_loss": 0.82380944, "learning_rate": 3.2331746297128345e-06, "loss": 0.8457135, "num_input_tokens_seen": 103524195, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.88671875, "step": 4800, "time_per_iteration": 4.122687578201294 }, { "auxiliary_loss_clip": 0.01164038, "auxiliary_loss_mlp": 0.01039774, "balance_loss_clip": 1.02347779, "balance_loss_mlp": 1.05364752, "epoch": 0.2886517360589208, "flos": 26469540086400.0, "grad_norm": 1.6684973917498063, "language_loss": 0.9101907, "learning_rate": 3.2328771532802934e-06, "loss": 0.93222886, "num_input_tokens_seen": 103545235, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.828125, "step": 4801, "time_per_iteration": 2.6247429847717285 }, { "auxiliary_loss_clip": 0.01158446, "auxiliary_loss_mlp": 0.01287372, "balance_loss_clip": 1.02311325, "balance_loss_mlp": 1.05307448, "epoch": 0.28871185931158877, "flos": 25265275781760.0, "grad_norm": 2.0517433032239993, "language_loss": 0.73633206, "learning_rate": 3.232579632849537e-06, "loss": 0.76079023, "num_input_tokens_seen": 103563305, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.87109375, "step": 4802, "time_per_iteration": 2.75683856010437 }, { "auxiliary_loss_clip": 0.01097489, "auxiliary_loss_mlp": 0.01005178, "balance_loss_clip": 1.00327098, "balance_loss_mlp": 1.04227877, "epoch": 0.28877198256425674, "flos": 66665431284480.0, "grad_norm": 0.7767831040387262, "language_loss": 0.62943971, "learning_rate": 3.232282068431185e-06, "loss": 0.65046644, "num_input_tokens_seen": 103625025, "router_z_loss_clip": 0.01904297, "router_z_loss_mlp": 0.28125, "step": 4803, "time_per_iteration": 3.20047926902771 }, { "auxiliary_loss_clip": 0.01170718, "auxiliary_loss_mlp": 0.01048158, "balance_loss_clip": 1.03222597, "balance_loss_mlp": 1.04997659, "epoch": 0.2888321058169247, "flos": 20303031790080.0, "grad_norm": 2.1801018765833975, "language_loss": 0.70798874, "learning_rate": 3.2319844600358554e-06, "loss": 0.7301774, "num_input_tokens_seen": 103644235, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8515625, "step": 4804, "time_per_iteration": 2.6414425373077393 }, { "auxiliary_loss_clip": 0.01147533, "auxiliary_loss_mlp": 0.0104194, "balance_loss_clip": 1.02403522, "balance_loss_mlp": 1.04972231, "epoch": 0.28889222906959267, "flos": 25516721963520.0, "grad_norm": 4.8545544529425015, "language_loss": 0.68203825, "learning_rate": 3.231686807674169e-06, "loss": 0.703933, "num_input_tokens_seen": 103664700, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.890625, "step": 4805, "time_per_iteration": 2.693964958190918 }, { "auxiliary_loss_clip": 0.01147893, "auxiliary_loss_mlp": 0.01047062, "balance_loss_clip": 1.03017008, "balance_loss_mlp": 1.05106854, "epoch": 0.28895235232226063, "flos": 32671312560000.0, "grad_norm": 2.5347570701057993, "language_loss": 0.69471884, "learning_rate": 3.2313891113567496e-06, "loss": 0.71666837, "num_input_tokens_seen": 103686595, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.87890625, "step": 4806, "time_per_iteration": 2.6990463733673096 }, { "auxiliary_loss_clip": 0.01154113, "auxiliary_loss_mlp": 0.01043723, "balance_loss_clip": 1.02720046, "balance_loss_mlp": 1.05250585, "epoch": 0.2890124755749286, "flos": 29714679342720.0, "grad_norm": 1.396799202708116, "language_loss": 0.71218777, "learning_rate": 3.2310913710942193e-06, "loss": 0.73416615, "num_input_tokens_seen": 103707525, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8359375, "step": 4807, "time_per_iteration": 2.7326602935791016 }, { "auxiliary_loss_clip": 0.01163821, "auxiliary_loss_mlp": 0.01041269, "balance_loss_clip": 1.02468729, "balance_loss_mlp": 1.05163193, "epoch": 0.28907259882759656, "flos": 22674464628480.0, "grad_norm": 1.801675324181088, "language_loss": 0.81266308, "learning_rate": 3.2307935868972055e-06, "loss": 0.834714, "num_input_tokens_seen": 103727905, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.85546875, "step": 4808, "time_per_iteration": 2.6519854068756104 }, { "auxiliary_loss_clip": 0.01161359, "auxiliary_loss_mlp": 0.01049038, "balance_loss_clip": 1.03185952, "balance_loss_mlp": 1.05216193, "epoch": 0.2891327220802645, "flos": 22566050403840.0, "grad_norm": 1.5601982250022655, "language_loss": 0.78245151, "learning_rate": 3.2304957587763344e-06, "loss": 0.80455548, "num_input_tokens_seen": 103748335, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.82421875, "step": 4809, "time_per_iteration": 2.7527294158935547 }, { "auxiliary_loss_clip": 0.01177164, "auxiliary_loss_mlp": 0.0104616, "balance_loss_clip": 1.0281713, "balance_loss_mlp": 1.05039704, "epoch": 0.2891928453329325, "flos": 21652806090240.0, "grad_norm": 1.8322708724602346, "language_loss": 0.79101574, "learning_rate": 3.2301978867422352e-06, "loss": 0.81324899, "num_input_tokens_seen": 103767020, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.90625, "step": 4810, "time_per_iteration": 2.6469831466674805 }, { "auxiliary_loss_clip": 0.0117221, "auxiliary_loss_mlp": 0.01040487, "balance_loss_clip": 1.02478135, "balance_loss_mlp": 1.05232692, "epoch": 0.28925296858560046, "flos": 23222102359680.0, "grad_norm": 1.9073578793105441, "language_loss": 0.76813626, "learning_rate": 3.2298999708055375e-06, "loss": 0.79026318, "num_input_tokens_seen": 103786355, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84375, "step": 4811, "time_per_iteration": 2.741947889328003 }, { "auxiliary_loss_clip": 0.01145616, "auxiliary_loss_mlp": 0.0104997, "balance_loss_clip": 1.03252935, "balance_loss_mlp": 1.05151451, "epoch": 0.2893130918382685, "flos": 28621666437120.0, "grad_norm": 1.4973507787720577, "language_loss": 0.77229118, "learning_rate": 3.229602010976873e-06, "loss": 0.79424703, "num_input_tokens_seen": 103809345, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.8515625, "step": 4812, "time_per_iteration": 2.609896183013916 }, { "auxiliary_loss_clip": 0.01097729, "auxiliary_loss_mlp": 0.01016837, "balance_loss_clip": 1.01483428, "balance_loss_mlp": 1.04275191, "epoch": 0.28937321509093644, "flos": 72301288982400.0, "grad_norm": 0.8404981511366251, "language_loss": 0.60237503, "learning_rate": 3.2293040072668768e-06, "loss": 0.62352061, "num_input_tokens_seen": 103871180, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.28320312, "step": 4813, "time_per_iteration": 3.3618249893188477 }, { "auxiliary_loss_clip": 0.01162483, "auxiliary_loss_mlp": 0.01040355, "balance_loss_clip": 1.02358186, "balance_loss_mlp": 1.05113947, "epoch": 0.2894333383436044, "flos": 16216397637120.0, "grad_norm": 2.5282456477973985, "language_loss": 0.82562149, "learning_rate": 3.229005959686182e-06, "loss": 0.84764981, "num_input_tokens_seen": 103889040, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.83984375, "step": 4814, "time_per_iteration": 2.6450302600860596 }, { "auxiliary_loss_clip": 0.01175027, "auxiliary_loss_mlp": 0.01045876, "balance_loss_clip": 1.02902007, "balance_loss_mlp": 1.05073559, "epoch": 0.2894934615962724, "flos": 24828278918400.0, "grad_norm": 3.43561361045067, "language_loss": 0.72607195, "learning_rate": 3.2287078682454255e-06, "loss": 0.748281, "num_input_tokens_seen": 103910380, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.890625, "step": 4815, "time_per_iteration": 2.6132452487945557 }, { "auxiliary_loss_clip": 0.01141605, "auxiliary_loss_mlp": 0.0104569, "balance_loss_clip": 1.03066981, "balance_loss_mlp": 1.05124474, "epoch": 0.28955358484894034, "flos": 20449978329600.0, "grad_norm": 1.7024471300598605, "language_loss": 0.70729113, "learning_rate": 3.2284097329552465e-06, "loss": 0.72916412, "num_input_tokens_seen": 103929955, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 4816, "time_per_iteration": 2.6530566215515137 }, { "auxiliary_loss_clip": 0.01156731, "auxiliary_loss_mlp": 0.01041275, "balance_loss_clip": 1.02395415, "balance_loss_mlp": 1.04895377, "epoch": 0.2896137081016083, "flos": 22565188477440.0, "grad_norm": 2.6906436909054037, "language_loss": 0.74184114, "learning_rate": 3.2281115538262844e-06, "loss": 0.76382118, "num_input_tokens_seen": 103948020, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8984375, "step": 4817, "time_per_iteration": 2.681140899658203 }, { "auxiliary_loss_clip": 0.01166083, "auxiliary_loss_mlp": 0.01051444, "balance_loss_clip": 1.03357434, "balance_loss_mlp": 1.05168438, "epoch": 0.28967383135427627, "flos": 26687948734080.0, "grad_norm": 1.636101532043853, "language_loss": 0.76374787, "learning_rate": 3.227813330869179e-06, "loss": 0.78592312, "num_input_tokens_seen": 103968740, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.87890625, "step": 4818, "time_per_iteration": 2.7768173217773438 }, { "auxiliary_loss_clip": 0.01181956, "auxiliary_loss_mlp": 0.01039861, "balance_loss_clip": 1.02271903, "balance_loss_mlp": 1.04978025, "epoch": 0.28973395460694423, "flos": 15558262692480.0, "grad_norm": 2.185798724002761, "language_loss": 0.79440141, "learning_rate": 3.2275150640945742e-06, "loss": 0.81661958, "num_input_tokens_seen": 103986005, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8671875, "step": 4819, "time_per_iteration": 2.626783609390259 }, { "auxiliary_loss_clip": 0.01167465, "auxiliary_loss_mlp": 0.01044698, "balance_loss_clip": 1.02656686, "balance_loss_mlp": 1.05050826, "epoch": 0.2897940778596122, "flos": 18697465762560.0, "grad_norm": 2.216363113875426, "language_loss": 0.78205764, "learning_rate": 3.227216753513115e-06, "loss": 0.80417931, "num_input_tokens_seen": 104005070, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.8984375, "step": 4820, "time_per_iteration": 2.6873133182525635 }, { "auxiliary_loss_clip": 0.01175056, "auxiliary_loss_mlp": 0.01034186, "balance_loss_clip": 1.01703155, "balance_loss_mlp": 1.05135107, "epoch": 0.28985420111228016, "flos": 18770292587520.0, "grad_norm": 3.1315917362530636, "language_loss": 0.72893906, "learning_rate": 3.2269183991354464e-06, "loss": 0.75103146, "num_input_tokens_seen": 104022945, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.87890625, "step": 4821, "time_per_iteration": 2.577606201171875 }, { "auxiliary_loss_clip": 0.01144254, "auxiliary_loss_mlp": 0.01036587, "balance_loss_clip": 1.01940894, "balance_loss_mlp": 1.04970932, "epoch": 0.28991432436494813, "flos": 23069840607360.0, "grad_norm": 2.038074522441825, "language_loss": 0.72340244, "learning_rate": 3.226620000972216e-06, "loss": 0.74521089, "num_input_tokens_seen": 104042080, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.85546875, "step": 4822, "time_per_iteration": 2.651090145111084 }, { "auxiliary_loss_clip": 0.01145219, "auxiliary_loss_mlp": 0.01045855, "balance_loss_clip": 1.02859354, "balance_loss_mlp": 1.05121815, "epoch": 0.2899744476176161, "flos": 17603195880960.0, "grad_norm": 1.7975647440996827, "language_loss": 0.66361785, "learning_rate": 3.2263215590340726e-06, "loss": 0.68552864, "num_input_tokens_seen": 104060975, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8515625, "step": 4823, "time_per_iteration": 2.522794008255005 }, { "auxiliary_loss_clip": 0.01151626, "auxiliary_loss_mlp": 0.01039926, "balance_loss_clip": 1.02377343, "balance_loss_mlp": 1.04920793, "epoch": 0.29003457087028406, "flos": 22309360836480.0, "grad_norm": 1.8306978686798583, "language_loss": 0.80769038, "learning_rate": 3.2260230733316683e-06, "loss": 0.82960594, "num_input_tokens_seen": 104081395, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84375, "step": 4824, "time_per_iteration": 2.656271457672119 }, { "auxiliary_loss_clip": 0.01136556, "auxiliary_loss_mlp": 0.01039764, "balance_loss_clip": 1.02227581, "balance_loss_mlp": 1.049402, "epoch": 0.2900946941229521, "flos": 21944975316480.0, "grad_norm": 1.9326369362801525, "language_loss": 0.72679508, "learning_rate": 3.2257245438756534e-06, "loss": 0.74855822, "num_input_tokens_seen": 104099995, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.87109375, "step": 4825, "time_per_iteration": 2.5303590297698975 }, { "auxiliary_loss_clip": 0.01137729, "auxiliary_loss_mlp": 0.01039078, "balance_loss_clip": 1.02190042, "balance_loss_mlp": 1.05092978, "epoch": 0.29015481737562004, "flos": 17932173569280.0, "grad_norm": 2.3172986240331115, "language_loss": 0.73531622, "learning_rate": 3.2254259706766824e-06, "loss": 0.75708437, "num_input_tokens_seen": 104118930, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8671875, "step": 4826, "time_per_iteration": 2.6011011600494385 }, { "auxiliary_loss_clip": 0.01141478, "auxiliary_loss_mlp": 0.01037571, "balance_loss_clip": 1.02250278, "balance_loss_mlp": 1.04721677, "epoch": 0.290214940628288, "flos": 22783525297920.0, "grad_norm": 1.8598985676647286, "language_loss": 0.68506706, "learning_rate": 3.2251273537454113e-06, "loss": 0.70685756, "num_input_tokens_seen": 104136940, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8515625, "step": 4827, "time_per_iteration": 2.557945728302002 }, { "auxiliary_loss_clip": 0.01148097, "auxiliary_loss_mlp": 0.01039549, "balance_loss_clip": 1.02239442, "balance_loss_mlp": 1.05244112, "epoch": 0.290275063880956, "flos": 20006481104640.0, "grad_norm": 2.041642009934138, "language_loss": 0.80639738, "learning_rate": 3.224828693092496e-06, "loss": 0.82827377, "num_input_tokens_seen": 104154280, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.86328125, "step": 4828, "time_per_iteration": 2.747232437133789 }, { "auxiliary_loss_clip": 0.01136385, "auxiliary_loss_mlp": 0.01044527, "balance_loss_clip": 1.02808166, "balance_loss_mlp": 1.05209363, "epoch": 0.29033518713362394, "flos": 22053605022720.0, "grad_norm": 2.1550309019443863, "language_loss": 0.80056942, "learning_rate": 3.2245299887285954e-06, "loss": 0.82237852, "num_input_tokens_seen": 104172605, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84375, "step": 4829, "time_per_iteration": 2.5978481769561768 }, { "auxiliary_loss_clip": 0.01151953, "auxiliary_loss_mlp": 0.01040555, "balance_loss_clip": 1.02443838, "balance_loss_mlp": 1.05224133, "epoch": 0.2903953103862919, "flos": 25630056351360.0, "grad_norm": 1.6218773315437094, "language_loss": 0.82332629, "learning_rate": 3.224231240664369e-06, "loss": 0.84525132, "num_input_tokens_seen": 104194120, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.81640625, "step": 4830, "time_per_iteration": 2.642843008041382 }, { "auxiliary_loss_clip": 0.01146143, "auxiliary_loss_mlp": 0.0104616, "balance_loss_clip": 1.02922082, "balance_loss_mlp": 1.05025744, "epoch": 0.29045543363895987, "flos": 16945851035520.0, "grad_norm": 2.6749026830047353, "language_loss": 0.79324818, "learning_rate": 3.223932448910479e-06, "loss": 0.81517118, "num_input_tokens_seen": 104210875, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.87109375, "step": 4831, "time_per_iteration": 2.5479133129119873 }, { "auxiliary_loss_clip": 0.01133098, "auxiliary_loss_mlp": 0.01041428, "balance_loss_clip": 1.02613378, "balance_loss_mlp": 1.05080295, "epoch": 0.29051555689162784, "flos": 26395492199040.0, "grad_norm": 1.6639616543016138, "language_loss": 0.74263299, "learning_rate": 3.2236336134775883e-06, "loss": 0.76437825, "num_input_tokens_seen": 104229875, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.82421875, "step": 4832, "time_per_iteration": 2.5996358394622803 }, { "auxiliary_loss_clip": 0.01148973, "auxiliary_loss_mlp": 0.01035437, "balance_loss_clip": 1.02112019, "balance_loss_mlp": 1.04826355, "epoch": 0.2905756801442958, "flos": 21103875469440.0, "grad_norm": 1.8184240963346199, "language_loss": 0.75762457, "learning_rate": 3.2233347343763614e-06, "loss": 0.77946866, "num_input_tokens_seen": 104250405, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.828125, "step": 4833, "time_per_iteration": 4.057356595993042 }, { "auxiliary_loss_clip": 0.01150954, "auxiliary_loss_mlp": 0.01034738, "balance_loss_clip": 1.0193125, "balance_loss_mlp": 1.05014062, "epoch": 0.29063580339696377, "flos": 15706071158400.0, "grad_norm": 1.9594178344633926, "language_loss": 0.64699548, "learning_rate": 3.2230358116174645e-06, "loss": 0.66885245, "num_input_tokens_seen": 104269185, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 4834, "time_per_iteration": 2.5712697505950928 }, { "auxiliary_loss_clip": 0.01150475, "auxiliary_loss_mlp": 0.01030995, "balance_loss_clip": 1.01629066, "balance_loss_mlp": 1.05008078, "epoch": 0.29069592664963173, "flos": 24644990793600.0, "grad_norm": 2.4366456414603013, "language_loss": 0.71739519, "learning_rate": 3.2227368452115658e-06, "loss": 0.73920989, "num_input_tokens_seen": 104289400, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8203125, "step": 4835, "time_per_iteration": 2.65826678276062 }, { "auxiliary_loss_clip": 0.01150762, "auxiliary_loss_mlp": 0.0103804, "balance_loss_clip": 1.02337694, "balance_loss_mlp": 1.04942501, "epoch": 0.2907560499022997, "flos": 24973753000320.0, "grad_norm": 1.620372957558484, "language_loss": 0.79497707, "learning_rate": 3.2224378351693337e-06, "loss": 0.81686515, "num_input_tokens_seen": 104310485, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8359375, "step": 4836, "time_per_iteration": 2.594619035720825 }, { "auxiliary_loss_clip": 0.01159077, "auxiliary_loss_mlp": 0.01042043, "balance_loss_clip": 1.02598596, "balance_loss_mlp": 1.0496757, "epoch": 0.29081617315496766, "flos": 18657496903680.0, "grad_norm": 1.5937614416731465, "language_loss": 0.80824524, "learning_rate": 3.2221387815014405e-06, "loss": 0.83025646, "num_input_tokens_seen": 104327330, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.82421875, "step": 4837, "time_per_iteration": 2.5732462406158447 }, { "auxiliary_loss_clip": 0.01150079, "auxiliary_loss_mlp": 0.01042767, "balance_loss_clip": 1.02724552, "balance_loss_mlp": 1.04698634, "epoch": 0.2908762964076356, "flos": 35331035955840.0, "grad_norm": 2.3057667371126485, "language_loss": 0.67381012, "learning_rate": 3.2218396842185576e-06, "loss": 0.69573861, "num_input_tokens_seen": 104350350, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8515625, "step": 4838, "time_per_iteration": 2.6803722381591797 }, { "auxiliary_loss_clip": 0.01137398, "auxiliary_loss_mlp": 0.01291734, "balance_loss_clip": 1.02804458, "balance_loss_mlp": 1.05052805, "epoch": 0.29093641966030365, "flos": 23076305055360.0, "grad_norm": 1.6897356452527263, "language_loss": 0.71437442, "learning_rate": 3.2215405433313595e-06, "loss": 0.7386657, "num_input_tokens_seen": 104369995, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8671875, "step": 4839, "time_per_iteration": 3.991765260696411 }, { "auxiliary_loss_clip": 0.01133452, "auxiliary_loss_mlp": 0.0104119, "balance_loss_clip": 1.02584803, "balance_loss_mlp": 1.04794991, "epoch": 0.2909965429129716, "flos": 35955415094400.0, "grad_norm": 2.3795270799344634, "language_loss": 0.75609696, "learning_rate": 3.221241358850521e-06, "loss": 0.77784336, "num_input_tokens_seen": 104392285, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.85546875, "step": 4840, "time_per_iteration": 4.160989284515381 }, { "auxiliary_loss_clip": 0.01162104, "auxiliary_loss_mlp": 0.01044038, "balance_loss_clip": 1.02804005, "balance_loss_mlp": 1.05122101, "epoch": 0.2910566661656396, "flos": 30880231764480.0, "grad_norm": 1.624309529556096, "language_loss": 0.60530645, "learning_rate": 3.2209421307867205e-06, "loss": 0.62736785, "num_input_tokens_seen": 104412640, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83984375, "step": 4841, "time_per_iteration": 2.658749580383301 }, { "auxiliary_loss_clip": 0.01154432, "auxiliary_loss_mlp": 0.01044308, "balance_loss_clip": 1.02718949, "balance_loss_mlp": 1.05141008, "epoch": 0.29111678941830754, "flos": 30010188533760.0, "grad_norm": 1.3096709664039738, "language_loss": 0.71397191, "learning_rate": 3.2206428591506358e-06, "loss": 0.73595929, "num_input_tokens_seen": 104435245, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8515625, "step": 4842, "time_per_iteration": 4.112670421600342 }, { "auxiliary_loss_clip": 0.0114249, "auxiliary_loss_mlp": 0.01035021, "balance_loss_clip": 1.02033448, "balance_loss_mlp": 1.04928923, "epoch": 0.2911769126709755, "flos": 22857393617280.0, "grad_norm": 1.5321233600031998, "language_loss": 0.73200011, "learning_rate": 3.220343543952947e-06, "loss": 0.75377518, "num_input_tokens_seen": 104455395, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.84375, "step": 4843, "time_per_iteration": 2.5944714546203613 }, { "auxiliary_loss_clip": 0.01171549, "auxiliary_loss_mlp": 0.01038686, "balance_loss_clip": 1.02314067, "balance_loss_mlp": 1.04969668, "epoch": 0.2912370359236435, "flos": 21650507619840.0, "grad_norm": 2.0767778840277176, "language_loss": 0.58169687, "learning_rate": 3.2200441852043367e-06, "loss": 0.60379922, "num_input_tokens_seen": 104473350, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.85546875, "step": 4844, "time_per_iteration": 2.5696704387664795 }, { "auxiliary_loss_clip": 0.01147262, "auxiliary_loss_mlp": 0.01037676, "balance_loss_clip": 1.020522, "balance_loss_mlp": 1.05290914, "epoch": 0.29129715917631144, "flos": 22893340152960.0, "grad_norm": 1.777455209866888, "language_loss": 0.86575288, "learning_rate": 3.2197447829154875e-06, "loss": 0.88760221, "num_input_tokens_seen": 104492265, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.85546875, "step": 4845, "time_per_iteration": 2.5536561012268066 }, { "auxiliary_loss_clip": 0.01155495, "auxiliary_loss_mlp": 0.01051987, "balance_loss_clip": 1.03430843, "balance_loss_mlp": 1.05040491, "epoch": 0.2913572824289794, "flos": 22674464628480.0, "grad_norm": 1.667332315330575, "language_loss": 0.66723353, "learning_rate": 3.2194453370970844e-06, "loss": 0.68930829, "num_input_tokens_seen": 104510755, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.87109375, "step": 4846, "time_per_iteration": 2.548083782196045 }, { "auxiliary_loss_clip": 0.01155597, "auxiliary_loss_mlp": 0.01041079, "balance_loss_clip": 1.0254153, "balance_loss_mlp": 1.05338895, "epoch": 0.29141740568164737, "flos": 23107403255040.0, "grad_norm": 3.85536577360056, "language_loss": 0.70398319, "learning_rate": 3.219145847759814e-06, "loss": 0.72594994, "num_input_tokens_seen": 104530830, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.83984375, "step": 4847, "time_per_iteration": 2.5629100799560547 }, { "auxiliary_loss_clip": 0.0114268, "auxiliary_loss_mlp": 0.01038966, "balance_loss_clip": 1.02225256, "balance_loss_mlp": 1.04940116, "epoch": 0.29147752893431533, "flos": 23587026583680.0, "grad_norm": 1.4993132374494709, "language_loss": 0.74069762, "learning_rate": 3.218846314914365e-06, "loss": 0.76251411, "num_input_tokens_seen": 104550115, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.84375, "step": 4848, "time_per_iteration": 2.624513864517212 }, { "auxiliary_loss_clip": 0.01145054, "auxiliary_loss_mlp": 0.0104428, "balance_loss_clip": 1.02730417, "balance_loss_mlp": 1.04986298, "epoch": 0.2915376521869833, "flos": 20591968792320.0, "grad_norm": 5.11402064890887, "language_loss": 0.76798445, "learning_rate": 3.218546738571425e-06, "loss": 0.78987777, "num_input_tokens_seen": 104566255, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.86328125, "step": 4849, "time_per_iteration": 2.71220064163208 }, { "auxiliary_loss_clip": 0.01147162, "auxiliary_loss_mlp": 0.01039058, "balance_loss_clip": 1.02342916, "balance_loss_mlp": 1.05057621, "epoch": 0.29159777543965126, "flos": 20811490761600.0, "grad_norm": 1.7168054149345315, "language_loss": 0.78365362, "learning_rate": 3.2182471187416874e-06, "loss": 0.80551589, "num_input_tokens_seen": 104585235, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.875, "step": 4850, "time_per_iteration": 2.717667818069458 }, { "auxiliary_loss_clip": 0.01150229, "auxiliary_loss_mlp": 0.0104148, "balance_loss_clip": 1.02505302, "balance_loss_mlp": 1.04928267, "epoch": 0.29165789869231923, "flos": 24244155947520.0, "grad_norm": 1.870037966306588, "language_loss": 0.75440264, "learning_rate": 3.2179474554358438e-06, "loss": 0.77631974, "num_input_tokens_seen": 104605315, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.83203125, "step": 4851, "time_per_iteration": 2.6755940914154053 }, { "auxiliary_loss_clip": 0.01145065, "auxiliary_loss_mlp": 0.0104367, "balance_loss_clip": 1.0280658, "balance_loss_mlp": 1.05313647, "epoch": 0.29171802194498725, "flos": 28949925853440.0, "grad_norm": 1.4447018531207274, "language_loss": 0.77313507, "learning_rate": 3.2176477486645883e-06, "loss": 0.79502249, "num_input_tokens_seen": 104626055, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.83203125, "step": 4852, "time_per_iteration": 2.6514155864715576 }, { "auxiliary_loss_clip": 0.01153814, "auxiliary_loss_mlp": 0.01047679, "balance_loss_clip": 1.0317049, "balance_loss_mlp": 1.05104184, "epoch": 0.2917781451976552, "flos": 22598226011520.0, "grad_norm": 1.6012495821758577, "language_loss": 0.78025198, "learning_rate": 3.2173479984386165e-06, "loss": 0.8022669, "num_input_tokens_seen": 104646005, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8515625, "step": 4853, "time_per_iteration": 2.6157989501953125 }, { "auxiliary_loss_clip": 0.01162409, "auxiliary_loss_mlp": 0.0104104, "balance_loss_clip": 1.02423108, "balance_loss_mlp": 1.04950333, "epoch": 0.2918382684503232, "flos": 21574448570880.0, "grad_norm": 2.7022365029382054, "language_loss": 0.88052529, "learning_rate": 3.217048204768626e-06, "loss": 0.90255976, "num_input_tokens_seen": 104661620, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.859375, "step": 4854, "time_per_iteration": 2.595104932785034 }, { "auxiliary_loss_clip": 0.01146509, "auxiliary_loss_mlp": 0.01053735, "balance_loss_clip": 1.0347805, "balance_loss_mlp": 1.05197811, "epoch": 0.29189839170299114, "flos": 24353503925760.0, "grad_norm": 2.147166623474565, "language_loss": 0.86674035, "learning_rate": 3.2167483676653167e-06, "loss": 0.8887428, "num_input_tokens_seen": 104681445, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.8515625, "step": 4855, "time_per_iteration": 2.5952956676483154 }, { "auxiliary_loss_clip": 0.01075274, "auxiliary_loss_mlp": 0.01004311, "balance_loss_clip": 1.00237989, "balance_loss_mlp": 1.03822327, "epoch": 0.2919585149556591, "flos": 71316726215040.0, "grad_norm": 0.8277536982079504, "language_loss": 0.60153651, "learning_rate": 3.216448487139387e-06, "loss": 0.62233233, "num_input_tokens_seen": 104747945, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.27734375, "step": 4856, "time_per_iteration": 3.235462188720703 }, { "auxiliary_loss_clip": 0.01152975, "auxiliary_loss_mlp": 0.01291862, "balance_loss_clip": 1.02818155, "balance_loss_mlp": 1.05003822, "epoch": 0.2920186382083271, "flos": 15633208419840.0, "grad_norm": 4.352611849032131, "language_loss": 0.67841274, "learning_rate": 3.2161485632015397e-06, "loss": 0.70286107, "num_input_tokens_seen": 104766225, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8515625, "step": 4857, "time_per_iteration": 2.5654213428497314 }, { "auxiliary_loss_clip": 0.01143626, "auxiliary_loss_mlp": 0.0128885, "balance_loss_clip": 1.02545667, "balance_loss_mlp": 1.05069947, "epoch": 0.29207876146099504, "flos": 28366018364160.0, "grad_norm": 1.9049281995630407, "language_loss": 0.83768791, "learning_rate": 3.2158485958624794e-06, "loss": 0.86201262, "num_input_tokens_seen": 104785345, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83984375, "step": 4858, "time_per_iteration": 2.6561837196350098 }, { "auxiliary_loss_clip": 0.01136485, "auxiliary_loss_mlp": 0.01040827, "balance_loss_clip": 1.02488875, "balance_loss_mlp": 1.05211449, "epoch": 0.292138884713663, "flos": 21870963342720.0, "grad_norm": 1.9037299745697365, "language_loss": 0.7783367, "learning_rate": 3.2155485851329095e-06, "loss": 0.80010974, "num_input_tokens_seen": 104804560, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84375, "step": 4859, "time_per_iteration": 2.567714214324951 }, { "auxiliary_loss_clip": 0.01174111, "auxiliary_loss_mlp": 0.0104013, "balance_loss_clip": 1.02272511, "balance_loss_mlp": 1.04874158, "epoch": 0.29219900796633097, "flos": 20992552243200.0, "grad_norm": 2.8387219316706265, "language_loss": 0.68741393, "learning_rate": 3.215248531023538e-06, "loss": 0.70955634, "num_input_tokens_seen": 104821105, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.89453125, "step": 4860, "time_per_iteration": 2.5720510482788086 }, { "auxiliary_loss_clip": 0.01151338, "auxiliary_loss_mlp": 0.01041947, "balance_loss_clip": 1.02743959, "balance_loss_mlp": 1.05208063, "epoch": 0.29225913121899894, "flos": 35004608133120.0, "grad_norm": 2.151990248294516, "language_loss": 0.7531929, "learning_rate": 3.2149484335450722e-06, "loss": 0.77512574, "num_input_tokens_seen": 104841440, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8125, "step": 4861, "time_per_iteration": 2.7581050395965576 }, { "auxiliary_loss_clip": 0.01142338, "auxiliary_loss_mlp": 0.01046337, "balance_loss_clip": 1.03075624, "balance_loss_mlp": 1.05046928, "epoch": 0.2923192544716669, "flos": 13515663888000.0, "grad_norm": 1.5827368812512204, "language_loss": 0.91592765, "learning_rate": 3.2146482927082216e-06, "loss": 0.93781435, "num_input_tokens_seen": 104858210, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.828125, "step": 4862, "time_per_iteration": 2.6407432556152344 }, { "auxiliary_loss_clip": 0.01142788, "auxiliary_loss_mlp": 0.01039801, "balance_loss_clip": 1.02425575, "balance_loss_mlp": 1.04870546, "epoch": 0.29237937772433487, "flos": 19463512141440.0, "grad_norm": 2.5463149264214913, "language_loss": 0.73493063, "learning_rate": 3.214348108523698e-06, "loss": 0.75675654, "num_input_tokens_seen": 104875620, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.84765625, "step": 4863, "time_per_iteration": 2.677236318588257 }, { "auxiliary_loss_clip": 0.01185632, "auxiliary_loss_mlp": 0.01039566, "balance_loss_clip": 1.02439058, "balance_loss_mlp": 1.0508492, "epoch": 0.29243950097700283, "flos": 20850597694080.0, "grad_norm": 1.9209735673131636, "language_loss": 0.77570897, "learning_rate": 3.214047881002214e-06, "loss": 0.79796094, "num_input_tokens_seen": 104894600, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.80859375, "step": 4864, "time_per_iteration": 2.701913356781006 }, { "auxiliary_loss_clip": 0.01165454, "auxiliary_loss_mlp": 0.01045985, "balance_loss_clip": 1.02833021, "balance_loss_mlp": 1.0519762, "epoch": 0.29249962422967085, "flos": 23584225322880.0, "grad_norm": 2.2222686786422856, "language_loss": 0.81409109, "learning_rate": 3.2137476101544848e-06, "loss": 0.83620548, "num_input_tokens_seen": 104914530, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.86328125, "step": 4865, "time_per_iteration": 2.7258739471435547 }, { "auxiliary_loss_clip": 0.0117954, "auxiliary_loss_mlp": 0.01045844, "balance_loss_clip": 1.0286777, "balance_loss_mlp": 1.04868364, "epoch": 0.2925597474823388, "flos": 22273342473600.0, "grad_norm": 1.8195576424755522, "language_loss": 0.84710908, "learning_rate": 3.213447295991225e-06, "loss": 0.86936283, "num_input_tokens_seen": 104933460, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.859375, "step": 4866, "time_per_iteration": 2.6628501415252686 }, { "auxiliary_loss_clip": 0.01149096, "auxiliary_loss_mlp": 0.01039442, "balance_loss_clip": 1.02382588, "balance_loss_mlp": 1.04875088, "epoch": 0.2926198707350068, "flos": 34456108475520.0, "grad_norm": 1.699176126544481, "language_loss": 0.75327206, "learning_rate": 3.2131469385231525e-06, "loss": 0.77515751, "num_input_tokens_seen": 104954495, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.82421875, "step": 4867, "time_per_iteration": 2.73618745803833 }, { "auxiliary_loss_clip": 0.01180216, "auxiliary_loss_mlp": 0.01045842, "balance_loss_clip": 1.02867579, "balance_loss_mlp": 1.04839706, "epoch": 0.29267999398767475, "flos": 20704153944960.0, "grad_norm": 1.8034102149193567, "language_loss": 0.72798395, "learning_rate": 3.212846537760986e-06, "loss": 0.7502445, "num_input_tokens_seen": 104971915, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8671875, "step": 4868, "time_per_iteration": 2.578273057937622 }, { "auxiliary_loss_clip": 0.01168211, "auxiliary_loss_mlp": 0.01033617, "balance_loss_clip": 1.01788139, "balance_loss_mlp": 1.04975295, "epoch": 0.2927401172403427, "flos": 18368667642240.0, "grad_norm": 1.3560825812373474, "language_loss": 0.74637258, "learning_rate": 3.212546093715447e-06, "loss": 0.76839083, "num_input_tokens_seen": 104991335, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.828125, "step": 4869, "time_per_iteration": 2.664047956466675 }, { "auxiliary_loss_clip": 0.01155202, "auxiliary_loss_mlp": 0.01037789, "balance_loss_clip": 1.02174318, "balance_loss_mlp": 1.05232549, "epoch": 0.2928002404930107, "flos": 26104041244800.0, "grad_norm": 1.54370763043455, "language_loss": 0.76580024, "learning_rate": 3.2122456063972567e-06, "loss": 0.78773022, "num_input_tokens_seen": 105012015, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8515625, "step": 4870, "time_per_iteration": 2.595214366912842 }, { "auxiliary_loss_clip": 0.01150434, "auxiliary_loss_mlp": 0.01046834, "balance_loss_clip": 1.02931058, "balance_loss_mlp": 1.05336761, "epoch": 0.29286036374567864, "flos": 21324726241920.0, "grad_norm": 2.2704534265891554, "language_loss": 0.67909777, "learning_rate": 3.2119450758171393e-06, "loss": 0.70107043, "num_input_tokens_seen": 105031460, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.87890625, "step": 4871, "time_per_iteration": 2.603214740753174 }, { "auxiliary_loss_clip": 0.01151203, "auxiliary_loss_mlp": 0.01042981, "balance_loss_clip": 1.0271852, "balance_loss_mlp": 1.04926419, "epoch": 0.2929204869983466, "flos": 29569492569600.0, "grad_norm": 1.9740672720580648, "language_loss": 0.77252805, "learning_rate": 3.2116445019858196e-06, "loss": 0.79446995, "num_input_tokens_seen": 105052965, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8359375, "step": 4872, "time_per_iteration": 2.6866698265075684 }, { "auxiliary_loss_clip": 0.01183853, "auxiliary_loss_mlp": 0.0104545, "balance_loss_clip": 1.0276401, "balance_loss_mlp": 1.05172145, "epoch": 0.2929806102510146, "flos": 19058259922560.0, "grad_norm": 1.9326405694224529, "language_loss": 0.73041987, "learning_rate": 3.211343884914024e-06, "loss": 0.75271291, "num_input_tokens_seen": 105071840, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.87890625, "step": 4873, "time_per_iteration": 2.6836485862731934 }, { "auxiliary_loss_clip": 0.01146891, "auxiliary_loss_mlp": 0.01043413, "balance_loss_clip": 1.02636671, "balance_loss_mlp": 1.05072427, "epoch": 0.29304073350368254, "flos": 21944221130880.0, "grad_norm": 2.00858452765957, "language_loss": 0.78071034, "learning_rate": 3.211043224612481e-06, "loss": 0.80261338, "num_input_tokens_seen": 105089445, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.87109375, "step": 4874, "time_per_iteration": 2.532092809677124 }, { "auxiliary_loss_clip": 0.01177616, "auxiliary_loss_mlp": 0.01044974, "balance_loss_clip": 1.02756953, "balance_loss_mlp": 1.05167699, "epoch": 0.2931008567563505, "flos": 15450818135040.0, "grad_norm": 2.247689855321676, "language_loss": 0.77725089, "learning_rate": 3.2107425210919204e-06, "loss": 0.79947674, "num_input_tokens_seen": 105106210, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8984375, "step": 4875, "time_per_iteration": 3.9922666549682617 }, { "auxiliary_loss_clip": 0.01153146, "auxiliary_loss_mlp": 0.01034537, "balance_loss_clip": 1.01802635, "balance_loss_mlp": 1.05755842, "epoch": 0.29316098000901847, "flos": 16983162288000.0, "grad_norm": 1.6323936017556728, "language_loss": 0.69326693, "learning_rate": 3.2104417743630742e-06, "loss": 0.71514368, "num_input_tokens_seen": 105124200, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8671875, "step": 4876, "time_per_iteration": 2.572838544845581 }, { "auxiliary_loss_clip": 0.01144505, "auxiliary_loss_mlp": 0.0104158, "balance_loss_clip": 1.02546275, "balance_loss_mlp": 1.05111217, "epoch": 0.29322110326168643, "flos": 16357705741440.0, "grad_norm": 1.901759338348325, "language_loss": 0.81884658, "learning_rate": 3.2101409844366743e-06, "loss": 0.84070748, "num_input_tokens_seen": 105140400, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84375, "step": 4877, "time_per_iteration": 2.4975218772888184 }, { "auxiliary_loss_clip": 0.01149342, "auxiliary_loss_mlp": 0.01043104, "balance_loss_clip": 1.02761877, "balance_loss_mlp": 1.05369937, "epoch": 0.29328122651435445, "flos": 13990869843840.0, "grad_norm": 1.9650388370180765, "language_loss": 0.67292798, "learning_rate": 3.209840151323456e-06, "loss": 0.69485247, "num_input_tokens_seen": 105157535, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8671875, "step": 4878, "time_per_iteration": 2.602573871612549 }, { "auxiliary_loss_clip": 0.01165783, "auxiliary_loss_mlp": 0.01042118, "balance_loss_clip": 1.02599525, "balance_loss_mlp": 1.05334246, "epoch": 0.2933413497670224, "flos": 25264593423360.0, "grad_norm": 1.8827592084048794, "language_loss": 0.73283434, "learning_rate": 3.2095392750341543e-06, "loss": 0.75491333, "num_input_tokens_seen": 105175185, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8515625, "step": 4879, "time_per_iteration": 2.630061149597168 }, { "auxiliary_loss_clip": 0.01167703, "auxiliary_loss_mlp": 0.01296799, "balance_loss_clip": 1.03149271, "balance_loss_mlp": 1.05298626, "epoch": 0.2934014730196904, "flos": 32123746656000.0, "grad_norm": 1.9111049639212792, "language_loss": 0.66256624, "learning_rate": 3.209238355579507e-06, "loss": 0.68721128, "num_input_tokens_seen": 105194540, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.875, "step": 4880, "time_per_iteration": 4.258726596832275 }, { "auxiliary_loss_clip": 0.01146401, "auxiliary_loss_mlp": 0.01054754, "balance_loss_clip": 1.03819585, "balance_loss_mlp": 1.05075407, "epoch": 0.29346159627235835, "flos": 24352498344960.0, "grad_norm": 2.6415089946798664, "language_loss": 0.70766318, "learning_rate": 3.2089373929702542e-06, "loss": 0.7296747, "num_input_tokens_seen": 105213215, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8671875, "step": 4881, "time_per_iteration": 4.16362738609314 }, { "auxiliary_loss_clip": 0.01156762, "auxiliary_loss_mlp": 0.01289943, "balance_loss_clip": 1.02553344, "balance_loss_mlp": 1.05250645, "epoch": 0.2935217195250263, "flos": 22746752749440.0, "grad_norm": 1.5011962871705957, "language_loss": 0.83167976, "learning_rate": 3.2086363872171344e-06, "loss": 0.85614687, "num_input_tokens_seen": 105231585, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.86328125, "step": 4882, "time_per_iteration": 2.724318504333496 }, { "auxiliary_loss_clip": 0.01139908, "auxiliary_loss_mlp": 0.01046026, "balance_loss_clip": 1.02876425, "balance_loss_mlp": 1.05084157, "epoch": 0.2935818427776943, "flos": 21725561088000.0, "grad_norm": 2.127606968470689, "language_loss": 0.71757078, "learning_rate": 3.208335338330892e-06, "loss": 0.73943013, "num_input_tokens_seen": 105250120, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.890625, "step": 4883, "time_per_iteration": 2.6547486782073975 }, { "auxiliary_loss_clip": 0.01156459, "auxiliary_loss_mlp": 0.0103947, "balance_loss_clip": 1.02250624, "balance_loss_mlp": 1.05229139, "epoch": 0.29364196603036224, "flos": 23804968354560.0, "grad_norm": 2.1421646924899065, "language_loss": 0.92243814, "learning_rate": 3.2080342463222693e-06, "loss": 0.94439745, "num_input_tokens_seen": 105266065, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8671875, "step": 4884, "time_per_iteration": 4.865550518035889 }, { "auxiliary_loss_clip": 0.01153471, "auxiliary_loss_mlp": 0.01040616, "balance_loss_clip": 1.02437997, "balance_loss_mlp": 1.05682898, "epoch": 0.2937020892830302, "flos": 23470064922240.0, "grad_norm": 2.0268788287357364, "language_loss": 0.73168325, "learning_rate": 3.207733111202011e-06, "loss": 0.75362408, "num_input_tokens_seen": 105282155, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87890625, "step": 4885, "time_per_iteration": 2.6002345085144043 }, { "auxiliary_loss_clip": 0.01145421, "auxiliary_loss_mlp": 0.01044007, "balance_loss_clip": 1.02808619, "balance_loss_mlp": 1.05118799, "epoch": 0.2937622125356982, "flos": 24272740195200.0, "grad_norm": 2.061585628419, "language_loss": 0.85138381, "learning_rate": 3.2074319329808656e-06, "loss": 0.87327802, "num_input_tokens_seen": 105299225, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8515625, "step": 4886, "time_per_iteration": 2.6042428016662598 }, { "auxiliary_loss_clip": 0.01164634, "auxiliary_loss_mlp": 0.01041481, "balance_loss_clip": 1.02530432, "balance_loss_mlp": 1.04973412, "epoch": 0.29382233578836614, "flos": 20662461233280.0, "grad_norm": 2.08404326173495, "language_loss": 0.77235478, "learning_rate": 3.2071307116695803e-06, "loss": 0.79441595, "num_input_tokens_seen": 105315710, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.875, "step": 4887, "time_per_iteration": 2.6192803382873535 }, { "auxiliary_loss_clip": 0.01158023, "auxiliary_loss_mlp": 0.0104363, "balance_loss_clip": 1.02757263, "balance_loss_mlp": 1.05219293, "epoch": 0.2938824590410341, "flos": 16545052103040.0, "grad_norm": 1.9984889194103537, "language_loss": 0.78784579, "learning_rate": 3.2068294472789044e-06, "loss": 0.80986226, "num_input_tokens_seen": 105333505, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.87890625, "step": 4888, "time_per_iteration": 2.5450644493103027 }, { "auxiliary_loss_clip": 0.01154334, "auxiliary_loss_mlp": 0.01035884, "balance_loss_clip": 1.02054143, "balance_loss_mlp": 1.05088568, "epoch": 0.29394258229370207, "flos": 37925474382720.0, "grad_norm": 1.6644576524775314, "language_loss": 0.55110383, "learning_rate": 3.20652813981959e-06, "loss": 0.57300597, "num_input_tokens_seen": 105355605, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.85546875, "step": 4889, "time_per_iteration": 2.720806121826172 }, { "auxiliary_loss_clip": 0.01147367, "auxiliary_loss_mlp": 0.0103889, "balance_loss_clip": 1.02155709, "balance_loss_mlp": 1.05050945, "epoch": 0.29400270554637004, "flos": 20044690197120.0, "grad_norm": 1.7796621273712008, "language_loss": 0.84335494, "learning_rate": 3.2062267893023903e-06, "loss": 0.86521757, "num_input_tokens_seen": 105374225, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.87890625, "step": 4890, "time_per_iteration": 2.57114577293396 }, { "auxiliary_loss_clip": 0.01148303, "auxiliary_loss_mlp": 0.01046443, "balance_loss_clip": 1.02924156, "balance_loss_mlp": 1.05190897, "epoch": 0.294062828799038, "flos": 15266380775040.0, "grad_norm": 2.129419134937705, "language_loss": 0.72370911, "learning_rate": 3.205925395738059e-06, "loss": 0.74565649, "num_input_tokens_seen": 105391565, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.875, "step": 4891, "time_per_iteration": 2.549675226211548 }, { "auxiliary_loss_clip": 0.01136355, "auxiliary_loss_mlp": 0.01043394, "balance_loss_clip": 1.02607286, "balance_loss_mlp": 1.04899299, "epoch": 0.294122952051706, "flos": 22747147799040.0, "grad_norm": 1.6313294213468932, "language_loss": 0.77126116, "learning_rate": 3.205623959137353e-06, "loss": 0.79305863, "num_input_tokens_seen": 105409840, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.875, "step": 4892, "time_per_iteration": 2.6653382778167725 }, { "auxiliary_loss_clip": 0.01136021, "auxiliary_loss_mlp": 0.0103706, "balance_loss_clip": 1.01991761, "balance_loss_mlp": 1.05001068, "epoch": 0.294183075304374, "flos": 24972891073920.0, "grad_norm": 1.8026007205573635, "language_loss": 0.78316295, "learning_rate": 3.205322479511028e-06, "loss": 0.80489373, "num_input_tokens_seen": 105428645, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.859375, "step": 4893, "time_per_iteration": 2.741746664047241 }, { "auxiliary_loss_clip": 0.01164565, "auxiliary_loss_mlp": 0.01045396, "balance_loss_clip": 1.02883768, "balance_loss_mlp": 1.0504998, "epoch": 0.29424319855704195, "flos": 30952986762240.0, "grad_norm": 1.7593584954064605, "language_loss": 0.84702277, "learning_rate": 3.205020956869845e-06, "loss": 0.86912245, "num_input_tokens_seen": 105447480, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.875, "step": 4894, "time_per_iteration": 2.662562847137451 }, { "auxiliary_loss_clip": 0.01143016, "auxiliary_loss_mlp": 0.01039114, "balance_loss_clip": 1.02392721, "balance_loss_mlp": 1.0481863, "epoch": 0.2943033218097099, "flos": 15231583474560.0, "grad_norm": 2.1206554633991836, "language_loss": 0.90399623, "learning_rate": 3.204719391224563e-06, "loss": 0.92581749, "num_input_tokens_seen": 105464600, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.859375, "step": 4895, "time_per_iteration": 2.6950411796569824 }, { "auxiliary_loss_clip": 0.01151271, "auxiliary_loss_mlp": 0.01043329, "balance_loss_clip": 1.02610385, "balance_loss_mlp": 1.05307257, "epoch": 0.2943634450623779, "flos": 21725884310400.0, "grad_norm": 2.1032073831952207, "language_loss": 0.86732924, "learning_rate": 3.2044177825859457e-06, "loss": 0.88927525, "num_input_tokens_seen": 105481510, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.890625, "step": 4896, "time_per_iteration": 2.5165212154388428 }, { "auxiliary_loss_clip": 0.01150105, "auxiliary_loss_mlp": 0.01050548, "balance_loss_clip": 1.03406155, "balance_loss_mlp": 1.0540756, "epoch": 0.29442356831504585, "flos": 22602104680320.0, "grad_norm": 1.64645069210967, "language_loss": 0.73395884, "learning_rate": 3.2041161309647555e-06, "loss": 0.75596535, "num_input_tokens_seen": 105501390, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87109375, "step": 4897, "time_per_iteration": 2.6093719005584717 }, { "auxiliary_loss_clip": 0.01149087, "auxiliary_loss_mlp": 0.01047307, "balance_loss_clip": 1.02950883, "balance_loss_mlp": 1.04877281, "epoch": 0.2944836915677138, "flos": 20011401267840.0, "grad_norm": 2.0104789514487447, "language_loss": 0.73491883, "learning_rate": 3.2038144363717572e-06, "loss": 0.75688279, "num_input_tokens_seen": 105519600, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.91015625, "step": 4898, "time_per_iteration": 2.5661487579345703 }, { "auxiliary_loss_clip": 0.01156609, "auxiliary_loss_mlp": 0.01043739, "balance_loss_clip": 1.02464199, "balance_loss_mlp": 1.05505943, "epoch": 0.2945438148203818, "flos": 20045875345920.0, "grad_norm": 2.585554610686618, "language_loss": 0.70088583, "learning_rate": 3.203512698817719e-06, "loss": 0.7228893, "num_input_tokens_seen": 105535970, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.921875, "step": 4899, "time_per_iteration": 2.5893614292144775 }, { "auxiliary_loss_clip": 0.01154304, "auxiliary_loss_mlp": 0.01290475, "balance_loss_clip": 1.02588248, "balance_loss_mlp": 1.04982662, "epoch": 0.29460393807304974, "flos": 23733542160000.0, "grad_norm": 2.190104870019144, "language_loss": 0.79000717, "learning_rate": 3.2032109183134086e-06, "loss": 0.81445497, "num_input_tokens_seen": 105556735, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8671875, "step": 4900, "time_per_iteration": 2.5658769607543945 }, { "auxiliary_loss_clip": 0.01152611, "auxiliary_loss_mlp": 0.01042132, "balance_loss_clip": 1.0259428, "balance_loss_mlp": 1.04668403, "epoch": 0.2946640613257177, "flos": 14976079056000.0, "grad_norm": 1.9981513813629994, "language_loss": 0.80735803, "learning_rate": 3.202909094869595e-06, "loss": 0.82930541, "num_input_tokens_seen": 105574875, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.875, "step": 4901, "time_per_iteration": 2.5553181171417236 }, { "auxiliary_loss_clip": 0.01149255, "auxiliary_loss_mlp": 0.01035458, "balance_loss_clip": 1.0202347, "balance_loss_mlp": 1.04946768, "epoch": 0.2947241845783857, "flos": 24243904552320.0, "grad_norm": 1.949167717897845, "language_loss": 0.57371312, "learning_rate": 3.2026072284970504e-06, "loss": 0.59556025, "num_input_tokens_seen": 105594225, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 4902, "time_per_iteration": 2.5621907711029053 }, { "auxiliary_loss_clip": 0.01152352, "auxiliary_loss_mlp": 0.01043382, "balance_loss_clip": 1.0275811, "balance_loss_mlp": 1.0482403, "epoch": 0.29478430783105364, "flos": 19938394874880.0, "grad_norm": 2.3432160554442687, "language_loss": 0.75392938, "learning_rate": 3.202305319206547e-06, "loss": 0.77588677, "num_input_tokens_seen": 105614000, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 4903, "time_per_iteration": 2.556959390640259 }, { "auxiliary_loss_clip": 0.01154964, "auxiliary_loss_mlp": 0.01048055, "balance_loss_clip": 1.03072202, "balance_loss_mlp": 1.05048633, "epoch": 0.2948444310837216, "flos": 27381347856000.0, "grad_norm": 1.955769945434623, "language_loss": 0.62301433, "learning_rate": 3.20200336700886e-06, "loss": 0.64504457, "num_input_tokens_seen": 105634575, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8671875, "step": 4904, "time_per_iteration": 2.612175464630127 }, { "auxiliary_loss_clip": 0.01147786, "auxiliary_loss_mlp": 0.01042525, "balance_loss_clip": 1.02662289, "balance_loss_mlp": 1.0488528, "epoch": 0.2949045543363896, "flos": 23405462311680.0, "grad_norm": 1.4377704876736197, "language_loss": 0.73190922, "learning_rate": 3.2017013719147644e-06, "loss": 0.75381231, "num_input_tokens_seen": 105654385, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8984375, "step": 4905, "time_per_iteration": 2.6154282093048096 }, { "auxiliary_loss_clip": 0.01162211, "auxiliary_loss_mlp": 0.010451, "balance_loss_clip": 1.02842283, "balance_loss_mlp": 1.04937196, "epoch": 0.2949646775890576, "flos": 23951483930880.0, "grad_norm": 1.8876130805712834, "language_loss": 0.81509829, "learning_rate": 3.201399333935038e-06, "loss": 0.83717132, "num_input_tokens_seen": 105673570, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.859375, "step": 4906, "time_per_iteration": 2.663421630859375 }, { "auxiliary_loss_clip": 0.01168544, "auxiliary_loss_mlp": 0.01039527, "balance_loss_clip": 1.02352905, "balance_loss_mlp": 1.0490855, "epoch": 0.29502480084172555, "flos": 22784315397120.0, "grad_norm": 1.8495202818272403, "language_loss": 0.87570441, "learning_rate": 3.2010972530804595e-06, "loss": 0.89778513, "num_input_tokens_seen": 105691940, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83203125, "step": 4907, "time_per_iteration": 2.6143577098846436 }, { "auxiliary_loss_clip": 0.01139131, "auxiliary_loss_mlp": 0.01044364, "balance_loss_clip": 1.02731752, "balance_loss_mlp": 1.0499692, "epoch": 0.2950849240943935, "flos": 19646656611840.0, "grad_norm": 2.1571484667954204, "language_loss": 0.81996429, "learning_rate": 3.20079512936181e-06, "loss": 0.84179926, "num_input_tokens_seen": 105709825, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.890625, "step": 4908, "time_per_iteration": 2.6124308109283447 }, { "auxiliary_loss_clip": 0.01061065, "auxiliary_loss_mlp": 0.01007146, "balance_loss_clip": 1.00516689, "balance_loss_mlp": 1.03301394, "epoch": 0.2951450473470615, "flos": 71002829260800.0, "grad_norm": 0.8056583869420065, "language_loss": 0.57266438, "learning_rate": 3.2004929627898707e-06, "loss": 0.59334648, "num_input_tokens_seen": 105766880, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.28125, "step": 4909, "time_per_iteration": 2.9905381202697754 }, { "auxiliary_loss_clip": 0.01147388, "auxiliary_loss_mlp": 0.01290106, "balance_loss_clip": 1.02669454, "balance_loss_mlp": 1.05380166, "epoch": 0.29520517059972945, "flos": 22966310632320.0, "grad_norm": 1.5354416566343172, "language_loss": 0.86755949, "learning_rate": 3.200190753375426e-06, "loss": 0.89193445, "num_input_tokens_seen": 105786875, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84765625, "step": 4910, "time_per_iteration": 2.64435076713562 }, { "auxiliary_loss_clip": 0.01140761, "auxiliary_loss_mlp": 0.01042393, "balance_loss_clip": 1.02660978, "balance_loss_mlp": 1.0492382, "epoch": 0.2952652938523974, "flos": 20485673470080.0, "grad_norm": 3.2593524931767592, "language_loss": 0.72630674, "learning_rate": 3.1998885011292604e-06, "loss": 0.74813831, "num_input_tokens_seen": 105805315, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.828125, "step": 4911, "time_per_iteration": 2.5307509899139404 }, { "auxiliary_loss_clip": 0.01148834, "auxiliary_loss_mlp": 0.01039504, "balance_loss_clip": 1.02410173, "balance_loss_mlp": 1.05534852, "epoch": 0.2953254171050654, "flos": 19646584784640.0, "grad_norm": 1.7042707204156091, "language_loss": 0.9001947, "learning_rate": 3.199586206062161e-06, "loss": 0.92207807, "num_input_tokens_seen": 105825125, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84375, "step": 4912, "time_per_iteration": 2.772629499435425 }, { "auxiliary_loss_clip": 0.01152894, "auxiliary_loss_mlp": 0.0105076, "balance_loss_clip": 1.03436899, "balance_loss_mlp": 1.05002058, "epoch": 0.29538554035773334, "flos": 22747973811840.0, "grad_norm": 1.3364473740353644, "language_loss": 0.82905358, "learning_rate": 3.1992838681849153e-06, "loss": 0.85109007, "num_input_tokens_seen": 105846085, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84765625, "step": 4913, "time_per_iteration": 2.593198537826538 }, { "auxiliary_loss_clip": 0.01138169, "auxiliary_loss_mlp": 0.01044668, "balance_loss_clip": 1.02834857, "balance_loss_mlp": 1.05096555, "epoch": 0.2954456636104013, "flos": 21871861182720.0, "grad_norm": 1.7313765553346854, "language_loss": 0.76495284, "learning_rate": 3.1989814875083134e-06, "loss": 0.78678119, "num_input_tokens_seen": 105865400, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.87109375, "step": 4914, "time_per_iteration": 2.592282295227051 }, { "auxiliary_loss_clip": 0.01153266, "auxiliary_loss_mlp": 0.01044318, "balance_loss_clip": 1.02739012, "balance_loss_mlp": 1.05066049, "epoch": 0.2955057868630693, "flos": 40442560871040.0, "grad_norm": 1.6876462553329177, "language_loss": 0.81637359, "learning_rate": 3.198679064043146e-06, "loss": 0.83834946, "num_input_tokens_seen": 105887920, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.84765625, "step": 4915, "time_per_iteration": 2.746795892715454 }, { "auxiliary_loss_clip": 0.01193365, "auxiliary_loss_mlp": 0.01037127, "balance_loss_clip": 1.02009177, "balance_loss_mlp": 1.05175662, "epoch": 0.29556591011573724, "flos": 22564506119040.0, "grad_norm": 1.9271924924549442, "language_loss": 0.84492898, "learning_rate": 3.1983765978002067e-06, "loss": 0.86723387, "num_input_tokens_seen": 105904035, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.87890625, "step": 4916, "time_per_iteration": 2.6649725437164307 }, { "auxiliary_loss_clip": 0.01142185, "auxiliary_loss_mlp": 0.01036357, "balance_loss_clip": 1.02029943, "balance_loss_mlp": 1.04844272, "epoch": 0.2956260333684052, "flos": 22089300163200.0, "grad_norm": 2.0566738121405477, "language_loss": 0.69682407, "learning_rate": 3.198074088790289e-06, "loss": 0.71860945, "num_input_tokens_seen": 105922685, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84765625, "step": 4917, "time_per_iteration": 4.022572755813599 }, { "auxiliary_loss_clip": 0.01155488, "auxiliary_loss_mlp": 0.01041684, "balance_loss_clip": 1.02504253, "balance_loss_mlp": 1.05130005, "epoch": 0.2956861566210732, "flos": 16435488643200.0, "grad_norm": 2.027426069161834, "language_loss": 0.91132069, "learning_rate": 3.197771537024189e-06, "loss": 0.93329245, "num_input_tokens_seen": 105940425, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.86328125, "step": 4918, "time_per_iteration": 2.5512778759002686 }, { "auxiliary_loss_clip": 0.01137563, "auxiliary_loss_mlp": 0.01040661, "balance_loss_clip": 1.02421033, "balance_loss_mlp": 1.0512619, "epoch": 0.2957462798737412, "flos": 25812087500160.0, "grad_norm": 1.8577064054119488, "language_loss": 0.72620606, "learning_rate": 3.197468942512703e-06, "loss": 0.74798834, "num_input_tokens_seen": 105960550, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.86328125, "step": 4919, "time_per_iteration": 2.6197898387908936 }, { "auxiliary_loss_clip": 0.01151836, "auxiliary_loss_mlp": 0.01041953, "balance_loss_clip": 1.02476299, "balance_loss_mlp": 1.04909623, "epoch": 0.29580640312640916, "flos": 16690849407360.0, "grad_norm": 1.7341996711759513, "language_loss": 0.75450879, "learning_rate": 3.1971663052666317e-06, "loss": 0.7764467, "num_input_tokens_seen": 105978820, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8515625, "step": 4920, "time_per_iteration": 2.4786951541900635 }, { "auxiliary_loss_clip": 0.01156174, "auxiliary_loss_mlp": 0.01043208, "balance_loss_clip": 1.02558851, "balance_loss_mlp": 1.05020952, "epoch": 0.2958665263790771, "flos": 23945594100480.0, "grad_norm": 2.7545326952796003, "language_loss": 0.6867823, "learning_rate": 3.196863625296775e-06, "loss": 0.70877612, "num_input_tokens_seen": 105997545, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.87890625, "step": 4921, "time_per_iteration": 2.6248302459716797 }, { "auxiliary_loss_clip": 0.01156527, "auxiliary_loss_mlp": 0.01043993, "balance_loss_clip": 1.02609921, "balance_loss_mlp": 1.05052137, "epoch": 0.2959266496317451, "flos": 18478410670080.0, "grad_norm": 3.123759488806789, "language_loss": 0.74572241, "learning_rate": 3.1965609026139327e-06, "loss": 0.76772767, "num_input_tokens_seen": 106015320, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.8828125, "step": 4922, "time_per_iteration": 4.031205892562866 }, { "auxiliary_loss_clip": 0.01140649, "auxiliary_loss_mlp": 0.01289081, "balance_loss_clip": 1.02400672, "balance_loss_mlp": 1.04993784, "epoch": 0.29598677288441305, "flos": 25957489754880.0, "grad_norm": 1.6531832505188753, "language_loss": 0.76472455, "learning_rate": 3.1962581372289105e-06, "loss": 0.78902185, "num_input_tokens_seen": 106034555, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.90625, "step": 4923, "time_per_iteration": 4.149626970291138 }, { "auxiliary_loss_clip": 0.0115389, "auxiliary_loss_mlp": 0.01042633, "balance_loss_clip": 1.02498984, "balance_loss_mlp": 1.04936481, "epoch": 0.296046896137081, "flos": 25155999630720.0, "grad_norm": 1.744078495236276, "language_loss": 0.86392987, "learning_rate": 3.195955329152512e-06, "loss": 0.88589501, "num_input_tokens_seen": 106054200, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.8671875, "step": 4924, "time_per_iteration": 2.625920534133911 }, { "auxiliary_loss_clip": 0.01142878, "auxiliary_loss_mlp": 0.01035969, "balance_loss_clip": 1.0188024, "balance_loss_mlp": 1.04943395, "epoch": 0.296107019389749, "flos": 21761148487680.0, "grad_norm": 1.4315121582854329, "language_loss": 0.81808913, "learning_rate": 3.1956524783955453e-06, "loss": 0.83987761, "num_input_tokens_seen": 106074700, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.84375, "step": 4925, "time_per_iteration": 4.249699115753174 }, { "auxiliary_loss_clip": 0.01168081, "auxiliary_loss_mlp": 0.01035976, "balance_loss_clip": 1.01929879, "balance_loss_mlp": 1.0475831, "epoch": 0.29616714264241695, "flos": 17960039544960.0, "grad_norm": 2.7719963947661745, "language_loss": 0.86248517, "learning_rate": 3.195349584968816e-06, "loss": 0.88452578, "num_input_tokens_seen": 106091415, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.84375, "step": 4926, "time_per_iteration": 2.6444079875946045 }, { "auxiliary_loss_clip": 0.0116265, "auxiliary_loss_mlp": 0.01036621, "balance_loss_clip": 1.02022934, "balance_loss_mlp": 1.04764354, "epoch": 0.2962272658950849, "flos": 15012779777280.0, "grad_norm": 1.897848694715968, "language_loss": 0.85774887, "learning_rate": 3.1950466488831357e-06, "loss": 0.87974155, "num_input_tokens_seen": 106109135, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8828125, "step": 4927, "time_per_iteration": 2.7058370113372803 }, { "auxiliary_loss_clip": 0.01143701, "auxiliary_loss_mlp": 0.01036598, "balance_loss_clip": 1.02025437, "balance_loss_mlp": 1.04973781, "epoch": 0.2962873891477529, "flos": 14720861946240.0, "grad_norm": 1.5905650397476392, "language_loss": 0.8067956, "learning_rate": 3.194743670149314e-06, "loss": 0.82859862, "num_input_tokens_seen": 106125750, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8515625, "step": 4928, "time_per_iteration": 2.475214958190918 }, { "auxiliary_loss_clip": 0.01161241, "auxiliary_loss_mlp": 0.0104883, "balance_loss_clip": 1.02783704, "balance_loss_mlp": 1.05093968, "epoch": 0.29634751240042084, "flos": 26723787528960.0, "grad_norm": 2.4111066714429183, "language_loss": 0.73423016, "learning_rate": 3.194440648778164e-06, "loss": 0.75633085, "num_input_tokens_seen": 106142835, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.92578125, "step": 4929, "time_per_iteration": 2.600275993347168 }, { "auxiliary_loss_clip": 0.0115943, "auxiliary_loss_mlp": 0.01048969, "balance_loss_clip": 1.03038478, "balance_loss_mlp": 1.05134892, "epoch": 0.2964076356530888, "flos": 14571293713920.0, "grad_norm": 2.271698206313163, "language_loss": 0.72019339, "learning_rate": 3.1941375847805e-06, "loss": 0.74227732, "num_input_tokens_seen": 106160680, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.8984375, "step": 4930, "time_per_iteration": 2.5067710876464844 }, { "auxiliary_loss_clip": 0.01087791, "auxiliary_loss_mlp": 0.01007371, "balance_loss_clip": 1.00539196, "balance_loss_mlp": 1.0331707, "epoch": 0.29646775890575683, "flos": 63104315063040.0, "grad_norm": 0.8110825794904147, "language_loss": 0.6070497, "learning_rate": 3.193834478167137e-06, "loss": 0.62800133, "num_input_tokens_seen": 106224415, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.27734375, "step": 4931, "time_per_iteration": 3.149635076522827 }, { "auxiliary_loss_clip": 0.0110514, "auxiliary_loss_mlp": 0.01003404, "balance_loss_clip": 1.00143707, "balance_loss_mlp": 1.03295612, "epoch": 0.2965278821584248, "flos": 63067686168960.0, "grad_norm": 0.7416738624372289, "language_loss": 0.52392244, "learning_rate": 3.1935313289488926e-06, "loss": 0.54500788, "num_input_tokens_seen": 106279140, "router_z_loss_clip": 0.01965332, "router_z_loss_mlp": 0.27734375, "step": 4932, "time_per_iteration": 3.0881659984588623 }, { "auxiliary_loss_clip": 0.01145718, "auxiliary_loss_mlp": 0.01041091, "balance_loss_clip": 1.02422285, "balance_loss_mlp": 1.0515734, "epoch": 0.29658800541109276, "flos": 23768734510080.0, "grad_norm": 1.6257317532358446, "language_loss": 0.81586593, "learning_rate": 3.193228137136585e-06, "loss": 0.83773404, "num_input_tokens_seen": 106298190, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8515625, "step": 4933, "time_per_iteration": 2.570627450942993 }, { "auxiliary_loss_clip": 0.01160263, "auxiliary_loss_mlp": 0.01034447, "balance_loss_clip": 1.01761425, "balance_loss_mlp": 1.05102384, "epoch": 0.2966481286637607, "flos": 23988543788160.0, "grad_norm": 1.7657705176098326, "language_loss": 0.75208044, "learning_rate": 3.1929249027410347e-06, "loss": 0.77402759, "num_input_tokens_seen": 106319065, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8203125, "step": 4934, "time_per_iteration": 2.6604084968566895 }, { "auxiliary_loss_clip": 0.01157809, "auxiliary_loss_mlp": 0.01047669, "balance_loss_clip": 1.02999067, "balance_loss_mlp": 1.05055285, "epoch": 0.2967082519164287, "flos": 17165157523200.0, "grad_norm": 2.1667521158047625, "language_loss": 0.61963975, "learning_rate": 3.1926216257730634e-06, "loss": 0.64169455, "num_input_tokens_seen": 106338040, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.89453125, "step": 4935, "time_per_iteration": 2.6377758979797363 }, { "auxiliary_loss_clip": 0.01145969, "auxiliary_loss_mlp": 0.01043915, "balance_loss_clip": 1.02629566, "balance_loss_mlp": 1.05141807, "epoch": 0.29676837516909665, "flos": 29387712816000.0, "grad_norm": 1.6024923831132094, "language_loss": 0.79930639, "learning_rate": 3.1923183062434936e-06, "loss": 0.82120526, "num_input_tokens_seen": 106358900, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.85546875, "step": 4936, "time_per_iteration": 2.7391598224639893 }, { "auxiliary_loss_clip": 0.0117658, "auxiliary_loss_mlp": 0.01048824, "balance_loss_clip": 1.03094244, "balance_loss_mlp": 1.05212379, "epoch": 0.2968284984217646, "flos": 34751222616960.0, "grad_norm": 1.712166685643963, "language_loss": 0.74671471, "learning_rate": 3.1920149441631505e-06, "loss": 0.7689687, "num_input_tokens_seen": 106381805, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.88671875, "step": 4937, "time_per_iteration": 2.697971820831299 }, { "auxiliary_loss_clip": 0.01172083, "auxiliary_loss_mlp": 0.01045254, "balance_loss_clip": 1.02768254, "balance_loss_mlp": 1.05008352, "epoch": 0.2968886216744326, "flos": 21544104556800.0, "grad_norm": 1.675625920674556, "language_loss": 0.78148973, "learning_rate": 3.1917115395428608e-06, "loss": 0.80366313, "num_input_tokens_seen": 106402365, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.859375, "step": 4938, "time_per_iteration": 2.7167351245880127 }, { "auxiliary_loss_clip": 0.01158638, "auxiliary_loss_mlp": 0.01048922, "balance_loss_clip": 1.03096902, "balance_loss_mlp": 1.05279016, "epoch": 0.29694874492710055, "flos": 12787323811200.0, "grad_norm": 5.090972291468501, "language_loss": 0.7694515, "learning_rate": 3.191408092393451e-06, "loss": 0.79152715, "num_input_tokens_seen": 106419800, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.8828125, "step": 4939, "time_per_iteration": 2.568704605102539 }, { "auxiliary_loss_clip": 0.01168777, "auxiliary_loss_mlp": 0.01048371, "balance_loss_clip": 1.0314908, "balance_loss_mlp": 1.05223489, "epoch": 0.2970088681797685, "flos": 24569973239040.0, "grad_norm": 1.690385494981451, "language_loss": 0.78455341, "learning_rate": 3.1911046027257516e-06, "loss": 0.80672491, "num_input_tokens_seen": 106440300, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.89453125, "step": 4940, "time_per_iteration": 2.7697598934173584 }, { "auxiliary_loss_clip": 0.01157506, "auxiliary_loss_mlp": 0.0104756, "balance_loss_clip": 1.02825975, "balance_loss_mlp": 1.05030668, "epoch": 0.2970689914324365, "flos": 23659171050240.0, "grad_norm": 1.5316772761016018, "language_loss": 0.75487053, "learning_rate": 3.1908010705505925e-06, "loss": 0.77692115, "num_input_tokens_seen": 106460035, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.890625, "step": 4941, "time_per_iteration": 2.703826427459717 }, { "auxiliary_loss_clip": 0.01150669, "auxiliary_loss_mlp": 0.01053799, "balance_loss_clip": 1.03551257, "balance_loss_mlp": 1.051512, "epoch": 0.29712911468510445, "flos": 39670301439360.0, "grad_norm": 2.2033779503249473, "language_loss": 0.73967093, "learning_rate": 3.1904974958788065e-06, "loss": 0.76171553, "num_input_tokens_seen": 106481095, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.90625, "step": 4942, "time_per_iteration": 2.8421154022216797 }, { "auxiliary_loss_clip": 0.01169276, "auxiliary_loss_mlp": 0.01052495, "balance_loss_clip": 1.03416085, "balance_loss_mlp": 1.05133605, "epoch": 0.2971892379377724, "flos": 26395312631040.0, "grad_norm": 2.0198338386168926, "language_loss": 0.70176744, "learning_rate": 3.190193878721227e-06, "loss": 0.7239852, "num_input_tokens_seen": 106501590, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.90625, "step": 4943, "time_per_iteration": 2.667734384536743 }, { "auxiliary_loss_clip": 0.01161757, "auxiliary_loss_mlp": 0.0105241, "balance_loss_clip": 1.03375363, "balance_loss_mlp": 1.05308187, "epoch": 0.2972493611904404, "flos": 17603195880960.0, "grad_norm": 2.110448380528473, "language_loss": 0.79506516, "learning_rate": 3.1898902190886898e-06, "loss": 0.8172068, "num_input_tokens_seen": 106519430, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.91015625, "step": 4944, "time_per_iteration": 2.5746700763702393 }, { "auxiliary_loss_clip": 0.01153036, "auxiliary_loss_mlp": 0.01044914, "balance_loss_clip": 1.0286895, "balance_loss_mlp": 1.05047464, "epoch": 0.2973094844431084, "flos": 20412774817920.0, "grad_norm": 1.9725827144838703, "language_loss": 0.82783639, "learning_rate": 3.1895865169920316e-06, "loss": 0.84981591, "num_input_tokens_seen": 106535870, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84375, "step": 4945, "time_per_iteration": 2.5962142944335938 }, { "auxiliary_loss_clip": 0.01171495, "auxiliary_loss_mlp": 0.01041134, "balance_loss_clip": 1.02475417, "balance_loss_mlp": 1.05119729, "epoch": 0.29736960769577636, "flos": 17493488766720.0, "grad_norm": 1.8615888019709295, "language_loss": 0.66676486, "learning_rate": 3.18928277244209e-06, "loss": 0.68889117, "num_input_tokens_seen": 106553560, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8515625, "step": 4946, "time_per_iteration": 2.562244176864624 }, { "auxiliary_loss_clip": 0.01152823, "auxiliary_loss_mlp": 0.01283597, "balance_loss_clip": 1.01926327, "balance_loss_mlp": 1.0550797, "epoch": 0.2974297309484443, "flos": 26103969417600.0, "grad_norm": 1.9973332894643885, "language_loss": 0.74084234, "learning_rate": 3.1889789854497052e-06, "loss": 0.76520658, "num_input_tokens_seen": 106574115, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.88671875, "step": 4947, "time_per_iteration": 2.62436580657959 }, { "auxiliary_loss_clip": 0.01167998, "auxiliary_loss_mlp": 0.01051795, "balance_loss_clip": 1.03428316, "balance_loss_mlp": 1.05289829, "epoch": 0.2974898542011123, "flos": 25666433850240.0, "grad_norm": 1.9846123803720999, "language_loss": 0.73024249, "learning_rate": 3.188675156025719e-06, "loss": 0.75244039, "num_input_tokens_seen": 106593070, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.88671875, "step": 4948, "time_per_iteration": 2.614260196685791 }, { "auxiliary_loss_clip": 0.01134921, "auxiliary_loss_mlp": 0.01038029, "balance_loss_clip": 1.0214467, "balance_loss_mlp": 1.04913664, "epoch": 0.29754997745378026, "flos": 18661339658880.0, "grad_norm": 2.8841342197805244, "language_loss": 0.83956134, "learning_rate": 3.1883712841809752e-06, "loss": 0.86129081, "num_input_tokens_seen": 106610695, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.859375, "step": 4949, "time_per_iteration": 2.568843126296997 }, { "auxiliary_loss_clip": 0.01143759, "auxiliary_loss_mlp": 0.01044872, "balance_loss_clip": 1.02775359, "balance_loss_mlp": 1.0497098, "epoch": 0.2976101007064482, "flos": 22274599449600.0, "grad_norm": 2.050230376523607, "language_loss": 0.70971906, "learning_rate": 3.188067369926316e-06, "loss": 0.73160535, "num_input_tokens_seen": 106631300, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8515625, "step": 4950, "time_per_iteration": 2.6267378330230713 }, { "auxiliary_loss_clip": 0.01171492, "auxiliary_loss_mlp": 0.01043704, "balance_loss_clip": 1.0270865, "balance_loss_mlp": 1.0537641, "epoch": 0.2976702239591162, "flos": 21945657674880.0, "grad_norm": 1.8936575310305077, "language_loss": 0.83094305, "learning_rate": 3.1877634132725887e-06, "loss": 0.85309494, "num_input_tokens_seen": 106650065, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8203125, "step": 4951, "time_per_iteration": 2.6073484420776367 }, { "auxiliary_loss_clip": 0.01152327, "auxiliary_loss_mlp": 0.01035636, "balance_loss_clip": 1.01791, "balance_loss_mlp": 1.04784548, "epoch": 0.29773034721178415, "flos": 24637197542400.0, "grad_norm": 3.120885550462058, "language_loss": 0.7363615, "learning_rate": 3.187459414230641e-06, "loss": 0.75824106, "num_input_tokens_seen": 106668230, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.8671875, "step": 4952, "time_per_iteration": 2.5770647525787354 }, { "auxiliary_loss_clip": 0.01191107, "auxiliary_loss_mlp": 0.01039042, "balance_loss_clip": 1.02079141, "balance_loss_mlp": 1.05074096, "epoch": 0.2977904704644521, "flos": 20557566541440.0, "grad_norm": 1.8000931829064795, "language_loss": 0.84022129, "learning_rate": 3.187155372811321e-06, "loss": 0.86252272, "num_input_tokens_seen": 106687785, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.86328125, "step": 4953, "time_per_iteration": 2.6541550159454346 }, { "auxiliary_loss_clip": 0.01154985, "auxiliary_loss_mlp": 0.01040167, "balance_loss_clip": 1.02396643, "balance_loss_mlp": 1.05028403, "epoch": 0.2978505937171201, "flos": 18916449027840.0, "grad_norm": 2.084504285434988, "language_loss": 0.73693407, "learning_rate": 3.186851289025479e-06, "loss": 0.75888562, "num_input_tokens_seen": 106706875, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8671875, "step": 4954, "time_per_iteration": 2.5345659255981445 }, { "auxiliary_loss_clip": 0.01135362, "auxiliary_loss_mlp": 0.01035296, "balance_loss_clip": 1.01921511, "balance_loss_mlp": 1.05085588, "epoch": 0.29791071696978805, "flos": 19317750750720.0, "grad_norm": 1.9544400831202442, "language_loss": 0.75615907, "learning_rate": 3.186547162883968e-06, "loss": 0.77786559, "num_input_tokens_seen": 106725105, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84375, "step": 4955, "time_per_iteration": 2.6009135246276855 }, { "auxiliary_loss_clip": 0.01175127, "auxiliary_loss_mlp": 0.01039482, "balance_loss_clip": 1.02292395, "balance_loss_mlp": 1.05214119, "epoch": 0.297970840222456, "flos": 18806813740800.0, "grad_norm": 1.6065053216826681, "language_loss": 0.72285789, "learning_rate": 3.1862429943976404e-06, "loss": 0.74500406, "num_input_tokens_seen": 106744780, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.875, "step": 4956, "time_per_iteration": 2.579230308532715 }, { "auxiliary_loss_clip": 0.01148206, "auxiliary_loss_mlp": 0.01046818, "balance_loss_clip": 1.02878165, "balance_loss_mlp": 1.04957485, "epoch": 0.298030963475124, "flos": 22852760762880.0, "grad_norm": 2.810762660005782, "language_loss": 0.79235083, "learning_rate": 3.1859387835773525e-06, "loss": 0.81430107, "num_input_tokens_seen": 106764670, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.8984375, "step": 4957, "time_per_iteration": 2.6143441200256348 }, { "auxiliary_loss_clip": 0.01147685, "auxiliary_loss_mlp": 0.01046546, "balance_loss_clip": 1.02833116, "balance_loss_mlp": 1.05041242, "epoch": 0.298091086727792, "flos": 21868485304320.0, "grad_norm": 1.574110898532213, "language_loss": 0.70239627, "learning_rate": 3.1856345304339593e-06, "loss": 0.72433865, "num_input_tokens_seen": 106783695, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.8828125, "step": 4958, "time_per_iteration": 3.9531824588775635 }, { "auxiliary_loss_clip": 0.01162866, "auxiliary_loss_mlp": 0.01051797, "balance_loss_clip": 1.03473842, "balance_loss_mlp": 1.05173731, "epoch": 0.29815120998045996, "flos": 21175014355200.0, "grad_norm": 1.5290960019273423, "language_loss": 0.78847492, "learning_rate": 3.1853302349783197e-06, "loss": 0.81062156, "num_input_tokens_seen": 106803150, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.84375, "step": 4959, "time_per_iteration": 2.570946216583252 }, { "auxiliary_loss_clip": 0.01144972, "auxiliary_loss_mlp": 0.01049489, "balance_loss_clip": 1.03264475, "balance_loss_mlp": 1.05007219, "epoch": 0.29821133323312793, "flos": 19896271200000.0, "grad_norm": 1.6512316144352062, "language_loss": 0.79706335, "learning_rate": 3.185025897221293e-06, "loss": 0.81900799, "num_input_tokens_seen": 106820705, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.859375, "step": 4960, "time_per_iteration": 2.668224811553955 }, { "auxiliary_loss_clip": 0.01162971, "auxiliary_loss_mlp": 0.01048614, "balance_loss_clip": 1.02986288, "balance_loss_mlp": 1.0477972, "epoch": 0.2982714564857959, "flos": 12750766744320.0, "grad_norm": 2.484756643636686, "language_loss": 0.73741317, "learning_rate": 3.1847215171737406e-06, "loss": 0.75952893, "num_input_tokens_seen": 106837335, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.8828125, "step": 4961, "time_per_iteration": 2.5457348823547363 }, { "auxiliary_loss_clip": 0.01145424, "auxiliary_loss_mlp": 0.01039886, "balance_loss_clip": 1.02355421, "balance_loss_mlp": 1.04968762, "epoch": 0.29833157973846386, "flos": 22271905929600.0, "grad_norm": 2.149770130963568, "language_loss": 0.62522376, "learning_rate": 3.1844170948465246e-06, "loss": 0.6470769, "num_input_tokens_seen": 106856250, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8671875, "step": 4962, "time_per_iteration": 2.604820489883423 }, { "auxiliary_loss_clip": 0.01148572, "auxiliary_loss_mlp": 0.01048299, "balance_loss_clip": 1.03022718, "balance_loss_mlp": 1.05239367, "epoch": 0.2983917029911318, "flos": 15372999319680.0, "grad_norm": 1.9463184051738458, "language_loss": 0.83542395, "learning_rate": 3.184112630250509e-06, "loss": 0.85739267, "num_input_tokens_seen": 106873370, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.87109375, "step": 4963, "time_per_iteration": 3.907763957977295 }, { "auxiliary_loss_clip": 0.01164161, "auxiliary_loss_mlp": 0.01041268, "balance_loss_clip": 1.02338696, "balance_loss_mlp": 1.05288148, "epoch": 0.2984518262437998, "flos": 15377632174080.0, "grad_norm": 2.820344125288906, "language_loss": 0.66887844, "learning_rate": 3.1838081233965595e-06, "loss": 0.69093275, "num_input_tokens_seen": 106890330, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.84375, "step": 4964, "time_per_iteration": 4.043238639831543 }, { "auxiliary_loss_clip": 0.0114134, "auxiliary_loss_mlp": 0.01038657, "balance_loss_clip": 1.02274239, "balance_loss_mlp": 1.04784989, "epoch": 0.29851194949646775, "flos": 18108458542080.0, "grad_norm": 1.6932166595303346, "language_loss": 0.71169448, "learning_rate": 3.1835035742955435e-06, "loss": 0.7334944, "num_input_tokens_seen": 106909190, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84375, "step": 4965, "time_per_iteration": 2.587656021118164 }, { "auxiliary_loss_clip": 0.01140017, "auxiliary_loss_mlp": 0.01048336, "balance_loss_clip": 1.03118193, "balance_loss_mlp": 1.05355763, "epoch": 0.2985720727491357, "flos": 22018233104640.0, "grad_norm": 1.8222756282908763, "language_loss": 0.66311395, "learning_rate": 3.1831989829583286e-06, "loss": 0.68499744, "num_input_tokens_seen": 106927825, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8671875, "step": 4966, "time_per_iteration": 2.5013513565063477 }, { "auxiliary_loss_clip": 0.01141043, "auxiliary_loss_mlp": 0.01041478, "balance_loss_clip": 1.02450299, "balance_loss_mlp": 1.05370593, "epoch": 0.2986321960018037, "flos": 13041355772160.0, "grad_norm": 4.600411186879456, "language_loss": 0.73547208, "learning_rate": 3.182894349395787e-06, "loss": 0.75729728, "num_input_tokens_seen": 106943155, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.875, "step": 4967, "time_per_iteration": 3.927375555038452 }, { "auxiliary_loss_clip": 0.01149971, "auxiliary_loss_mlp": 0.01034312, "balance_loss_clip": 1.01845694, "balance_loss_mlp": 1.04657245, "epoch": 0.29869231925447165, "flos": 14465034305280.0, "grad_norm": 1.954716801078302, "language_loss": 0.71149039, "learning_rate": 3.1825896736187876e-06, "loss": 0.73333323, "num_input_tokens_seen": 106960295, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 4968, "time_per_iteration": 2.519087076187134 }, { "auxiliary_loss_clip": 0.01145065, "auxiliary_loss_mlp": 0.01038043, "balance_loss_clip": 1.02104425, "balance_loss_mlp": 1.04916584, "epoch": 0.2987524425071396, "flos": 31650228639360.0, "grad_norm": 2.8486403171750427, "language_loss": 0.76817799, "learning_rate": 3.182284955638205e-06, "loss": 0.79000902, "num_input_tokens_seen": 106982870, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8671875, "step": 4969, "time_per_iteration": 2.6391870975494385 }, { "auxiliary_loss_clip": 0.0113527, "auxiliary_loss_mlp": 0.01034457, "balance_loss_clip": 1.01934099, "balance_loss_mlp": 1.05042934, "epoch": 0.2988125657598076, "flos": 21433427775360.0, "grad_norm": 1.5952889527292617, "language_loss": 0.70062172, "learning_rate": 3.181980195464913e-06, "loss": 0.72231901, "num_input_tokens_seen": 107002405, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.84765625, "step": 4970, "time_per_iteration": 2.5645930767059326 }, { "auxiliary_loss_clip": 0.01147932, "auxiliary_loss_mlp": 0.0104312, "balance_loss_clip": 1.02505934, "balance_loss_mlp": 1.04967427, "epoch": 0.2988726890124756, "flos": 18076965292800.0, "grad_norm": 2.1373398050019916, "language_loss": 0.85343987, "learning_rate": 3.1816753931097894e-06, "loss": 0.87535042, "num_input_tokens_seen": 107017310, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.89453125, "step": 4971, "time_per_iteration": 2.5371944904327393 }, { "auxiliary_loss_clip": 0.01153182, "auxiliary_loss_mlp": 0.01047787, "balance_loss_clip": 1.03085899, "balance_loss_mlp": 1.05256438, "epoch": 0.29893281226514357, "flos": 21755653706880.0, "grad_norm": 1.8770415027301004, "language_loss": 0.79163873, "learning_rate": 3.1813705485837095e-06, "loss": 0.8136484, "num_input_tokens_seen": 107034645, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.828125, "step": 4972, "time_per_iteration": 2.606255054473877 }, { "auxiliary_loss_clip": 0.01136568, "auxiliary_loss_mlp": 0.01046916, "balance_loss_clip": 1.03021502, "balance_loss_mlp": 1.0513289, "epoch": 0.29899293551781153, "flos": 16836718538880.0, "grad_norm": 1.9838082967102209, "language_loss": 0.85607749, "learning_rate": 3.1810656618975544e-06, "loss": 0.87791234, "num_input_tokens_seen": 107051125, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8515625, "step": 4973, "time_per_iteration": 2.4771153926849365 }, { "auxiliary_loss_clip": 0.01150993, "auxiliary_loss_mlp": 0.01038577, "balance_loss_clip": 1.02182794, "balance_loss_mlp": 1.05267096, "epoch": 0.2990530587704795, "flos": 11729215946880.0, "grad_norm": 1.5725016913065455, "language_loss": 0.77295262, "learning_rate": 3.180760733062204e-06, "loss": 0.79484826, "num_input_tokens_seen": 107068815, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8046875, "step": 4974, "time_per_iteration": 2.6250476837158203 }, { "auxiliary_loss_clip": 0.01157279, "auxiliary_loss_mlp": 0.01292953, "balance_loss_clip": 1.02949071, "balance_loss_mlp": 1.05204368, "epoch": 0.29911318202314746, "flos": 28039877850240.0, "grad_norm": 2.4252993956233655, "language_loss": 0.71978402, "learning_rate": 3.1804557620885396e-06, "loss": 0.74428642, "num_input_tokens_seen": 107090420, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.875, "step": 4975, "time_per_iteration": 2.64392352104187 }, { "auxiliary_loss_clip": 0.01145327, "auxiliary_loss_mlp": 0.01039213, "balance_loss_clip": 1.02196312, "balance_loss_mlp": 1.05197811, "epoch": 0.2991733052758154, "flos": 18733555952640.0, "grad_norm": 2.182289646323565, "language_loss": 0.75998646, "learning_rate": 3.1801507489874453e-06, "loss": 0.7818318, "num_input_tokens_seen": 107107255, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.84375, "step": 4976, "time_per_iteration": 2.556299924850464 }, { "auxiliary_loss_clip": 0.01142483, "auxiliary_loss_mlp": 0.01040879, "balance_loss_clip": 1.02489269, "balance_loss_mlp": 1.05016088, "epoch": 0.2992334285284834, "flos": 15559160532480.0, "grad_norm": 2.2955213062584106, "language_loss": 0.86325705, "learning_rate": 3.1798456937698073e-06, "loss": 0.88509059, "num_input_tokens_seen": 107123840, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8359375, "step": 4977, "time_per_iteration": 2.488213300704956 }, { "auxiliary_loss_clip": 0.01154941, "auxiliary_loss_mlp": 0.01042492, "balance_loss_clip": 1.02618396, "balance_loss_mlp": 1.05202174, "epoch": 0.29929355178115136, "flos": 21797561900160.0, "grad_norm": 1.6549958695564517, "language_loss": 0.67968345, "learning_rate": 3.1795405964465114e-06, "loss": 0.70165777, "num_input_tokens_seen": 107143475, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8515625, "step": 4978, "time_per_iteration": 2.6261379718780518 }, { "auxiliary_loss_clip": 0.01156675, "auxiliary_loss_mlp": 0.01037907, "balance_loss_clip": 1.02109861, "balance_loss_mlp": 1.05368602, "epoch": 0.2993536750338193, "flos": 21178533888000.0, "grad_norm": 1.9162340562436455, "language_loss": 0.76255506, "learning_rate": 3.1792354570284452e-06, "loss": 0.78450096, "num_input_tokens_seen": 107161725, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8515625, "step": 4979, "time_per_iteration": 2.5472915172576904 }, { "auxiliary_loss_clip": 0.01168628, "auxiliary_loss_mlp": 0.01040574, "balance_loss_clip": 1.02368236, "balance_loss_mlp": 1.04715157, "epoch": 0.2994137982864873, "flos": 32122130544000.0, "grad_norm": 2.0535971742237984, "language_loss": 0.68418032, "learning_rate": 3.1789302755264996e-06, "loss": 0.70627236, "num_input_tokens_seen": 107183935, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8515625, "step": 4980, "time_per_iteration": 2.7461423873901367 }, { "auxiliary_loss_clip": 0.01144533, "auxiliary_loss_mlp": 0.01292913, "balance_loss_clip": 1.02866983, "balance_loss_mlp": 1.05331922, "epoch": 0.29947392153915525, "flos": 21105419754240.0, "grad_norm": 2.1842745467902005, "language_loss": 0.72679323, "learning_rate": 3.178625051951564e-06, "loss": 0.75116771, "num_input_tokens_seen": 107204285, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8203125, "step": 4981, "time_per_iteration": 2.531712770462036 }, { "auxiliary_loss_clip": 0.01145617, "auxiliary_loss_mlp": 0.01041611, "balance_loss_clip": 1.02537513, "balance_loss_mlp": 1.0491035, "epoch": 0.2995340447918232, "flos": 21542632099200.0, "grad_norm": 1.6369952730010617, "language_loss": 0.86386293, "learning_rate": 3.1783197863145335e-06, "loss": 0.88573521, "num_input_tokens_seen": 107225265, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.875, "step": 4982, "time_per_iteration": 2.6250650882720947 }, { "auxiliary_loss_clip": 0.01158862, "auxiliary_loss_mlp": 0.01042103, "balance_loss_clip": 1.02417397, "balance_loss_mlp": 1.0520004, "epoch": 0.2995941680444912, "flos": 16725143917440.0, "grad_norm": 2.3041143448636774, "language_loss": 0.86478961, "learning_rate": 3.1780144786262997e-06, "loss": 0.88679922, "num_input_tokens_seen": 107241335, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.88671875, "step": 4983, "time_per_iteration": 2.4905102252960205 }, { "auxiliary_loss_clip": 0.0114279, "auxiliary_loss_mlp": 0.01041669, "balance_loss_clip": 1.02608299, "balance_loss_mlp": 1.04936647, "epoch": 0.2996542912971592, "flos": 20923496346240.0, "grad_norm": 2.210400390535917, "language_loss": 0.78652781, "learning_rate": 3.17770912889776e-06, "loss": 0.80837244, "num_input_tokens_seen": 107259375, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.84375, "step": 4984, "time_per_iteration": 2.631362199783325 }, { "auxiliary_loss_clip": 0.01155009, "auxiliary_loss_mlp": 0.01045975, "balance_loss_clip": 1.02882028, "balance_loss_mlp": 1.05100918, "epoch": 0.29971441454982717, "flos": 25079868754560.0, "grad_norm": 1.537590043936892, "language_loss": 0.78694868, "learning_rate": 3.17740373713981e-06, "loss": 0.80895853, "num_input_tokens_seen": 107279890, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.859375, "step": 4985, "time_per_iteration": 2.5659687519073486 }, { "auxiliary_loss_clip": 0.01174671, "auxiliary_loss_mlp": 0.01043129, "balance_loss_clip": 1.02508068, "balance_loss_mlp": 1.05036879, "epoch": 0.29977453780249513, "flos": 52555911840000.0, "grad_norm": 1.8840951736715605, "language_loss": 0.71182859, "learning_rate": 3.1770983033633504e-06, "loss": 0.73400652, "num_input_tokens_seen": 107303430, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.88671875, "step": 4986, "time_per_iteration": 2.8831851482391357 }, { "auxiliary_loss_clip": 0.0115427, "auxiliary_loss_mlp": 0.01042366, "balance_loss_clip": 1.02548647, "balance_loss_mlp": 1.04855609, "epoch": 0.2998346610551631, "flos": 22237144542720.0, "grad_norm": 1.848387538646409, "language_loss": 0.73021388, "learning_rate": 3.1767928275792796e-06, "loss": 0.75218028, "num_input_tokens_seen": 107323700, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8828125, "step": 4987, "time_per_iteration": 2.5449793338775635 }, { "auxiliary_loss_clip": 0.01151794, "auxiliary_loss_mlp": 0.01037331, "balance_loss_clip": 1.02141607, "balance_loss_mlp": 1.05028129, "epoch": 0.29989478430783106, "flos": 16873203778560.0, "grad_norm": 1.6231998128085332, "language_loss": 0.80426645, "learning_rate": 3.1764873097984997e-06, "loss": 0.82615775, "num_input_tokens_seen": 107341965, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.83203125, "step": 4988, "time_per_iteration": 2.571751117706299 }, { "auxiliary_loss_clip": 0.01151748, "auxiliary_loss_mlp": 0.01294294, "balance_loss_clip": 1.03031921, "balance_loss_mlp": 1.04949105, "epoch": 0.29995490756049903, "flos": 23768878164480.0, "grad_norm": 1.5841333778006883, "language_loss": 0.7066527, "learning_rate": 3.1761817500319143e-06, "loss": 0.73111314, "num_input_tokens_seen": 107362615, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.83984375, "step": 4989, "time_per_iteration": 2.5712289810180664 }, { "auxiliary_loss_clip": 0.01165529, "auxiliary_loss_mlp": 0.01045611, "balance_loss_clip": 1.02839696, "balance_loss_mlp": 1.0510397, "epoch": 0.300015030813167, "flos": 14465321614080.0, "grad_norm": 2.6338976804944036, "language_loss": 0.85389435, "learning_rate": 3.175876148290428e-06, "loss": 0.87600577, "num_input_tokens_seen": 107378980, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.87109375, "step": 4990, "time_per_iteration": 2.5891315937042236 }, { "auxiliary_loss_clip": 0.0114557, "auxiliary_loss_mlp": 0.0103988, "balance_loss_clip": 1.02233243, "balance_loss_mlp": 1.04871488, "epoch": 0.30007515406583496, "flos": 25191982080000.0, "grad_norm": 1.9522390004421455, "language_loss": 0.66891372, "learning_rate": 3.175570504584946e-06, "loss": 0.69076824, "num_input_tokens_seen": 107397640, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.8828125, "step": 4991, "time_per_iteration": 2.6080596446990967 }, { "auxiliary_loss_clip": 0.01155963, "auxiliary_loss_mlp": 0.01041587, "balance_loss_clip": 1.0231576, "balance_loss_mlp": 1.04823279, "epoch": 0.3001352773185029, "flos": 19391188106880.0, "grad_norm": 1.7406210345590571, "language_loss": 0.78547263, "learning_rate": 3.175264818926377e-06, "loss": 0.80744821, "num_input_tokens_seen": 107416020, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.8984375, "step": 4992, "time_per_iteration": 2.5873987674713135 }, { "auxiliary_loss_clip": 0.0113412, "auxiliary_loss_mlp": 0.01037432, "balance_loss_clip": 1.02056336, "balance_loss_mlp": 1.04923987, "epoch": 0.3001954005711709, "flos": 21543853161600.0, "grad_norm": 3.104782249837756, "language_loss": 0.82687533, "learning_rate": 3.17495909132563e-06, "loss": 0.84859085, "num_input_tokens_seen": 107436340, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.84765625, "step": 4993, "time_per_iteration": 2.561886787414551 }, { "auxiliary_loss_clip": 0.01175262, "auxiliary_loss_mlp": 0.01051769, "balance_loss_clip": 1.03259981, "balance_loss_mlp": 1.04815161, "epoch": 0.30025552382383885, "flos": 17384320356480.0, "grad_norm": 2.051060365219215, "language_loss": 0.85954916, "learning_rate": 3.174653321793615e-06, "loss": 0.88181949, "num_input_tokens_seen": 107454585, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.9140625, "step": 4994, "time_per_iteration": 2.602994918823242 }, { "auxiliary_loss_clip": 0.01163375, "auxiliary_loss_mlp": 0.01039462, "balance_loss_clip": 1.02313042, "balance_loss_mlp": 1.04947877, "epoch": 0.3003156470765068, "flos": 29533330552320.0, "grad_norm": 1.8666461437770616, "language_loss": 0.80925769, "learning_rate": 3.1743475103412446e-06, "loss": 0.83128607, "num_input_tokens_seen": 107477180, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8671875, "step": 4995, "time_per_iteration": 2.708822727203369 }, { "auxiliary_loss_clip": 0.0115152, "auxiliary_loss_mlp": 0.01040017, "balance_loss_clip": 1.02326846, "balance_loss_mlp": 1.05012703, "epoch": 0.3003757703291748, "flos": 43646402465280.0, "grad_norm": 1.6560938392989406, "language_loss": 0.67412257, "learning_rate": 3.174041656979432e-06, "loss": 0.69603795, "num_input_tokens_seen": 107500250, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8359375, "step": 4996, "time_per_iteration": 2.7344348430633545 }, { "auxiliary_loss_clip": 0.01151548, "auxiliary_loss_mlp": 0.01042436, "balance_loss_clip": 1.02522254, "balance_loss_mlp": 1.04820776, "epoch": 0.30043589358184275, "flos": 22528380015360.0, "grad_norm": 1.7826802150142977, "language_loss": 0.75163496, "learning_rate": 3.1737357617190935e-06, "loss": 0.77357477, "num_input_tokens_seen": 107520070, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.85546875, "step": 4997, "time_per_iteration": 2.6258544921875 }, { "auxiliary_loss_clip": 0.01141953, "auxiliary_loss_mlp": 0.0104418, "balance_loss_clip": 1.02815866, "balance_loss_mlp": 1.04837298, "epoch": 0.30049601683451077, "flos": 20995892208000.0, "grad_norm": 1.9278247623750577, "language_loss": 0.77977097, "learning_rate": 3.1734298245711443e-06, "loss": 0.80163229, "num_input_tokens_seen": 107539285, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84765625, "step": 4998, "time_per_iteration": 2.506899833679199 }, { "auxiliary_loss_clip": 0.01133425, "auxiliary_loss_mlp": 0.01045466, "balance_loss_clip": 1.02937293, "balance_loss_mlp": 1.05034339, "epoch": 0.30055614008717874, "flos": 23916004272000.0, "grad_norm": 1.7281207286755436, "language_loss": 0.73537749, "learning_rate": 3.1731238455465033e-06, "loss": 0.75716645, "num_input_tokens_seen": 107560260, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.83203125, "step": 4999, "time_per_iteration": 3.98490047454834 }, { "auxiliary_loss_clip": 0.01146125, "auxiliary_loss_mlp": 0.01050285, "balance_loss_clip": 1.03360796, "balance_loss_mlp": 1.05076039, "epoch": 0.3006162633398467, "flos": 19169798630400.0, "grad_norm": 1.6069742007512535, "language_loss": 0.75802028, "learning_rate": 3.1728178246560903e-06, "loss": 0.77998441, "num_input_tokens_seen": 107579260, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.86328125, "step": 5000, "time_per_iteration": 2.544860601425171 }, { "auxiliary_loss_clip": 0.0113324, "auxiliary_loss_mlp": 0.0104358, "balance_loss_clip": 1.02851224, "balance_loss_mlp": 1.05109549, "epoch": 0.30067638659251467, "flos": 14679241061760.0, "grad_norm": 1.8995393261194509, "language_loss": 0.81922585, "learning_rate": 3.172511761910825e-06, "loss": 0.840994, "num_input_tokens_seen": 107595245, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 5001, "time_per_iteration": 2.5156829357147217 }, { "auxiliary_loss_clip": 0.01145406, "auxiliary_loss_mlp": 0.01054645, "balance_loss_clip": 1.03675175, "balance_loss_mlp": 1.05099726, "epoch": 0.30073650984518263, "flos": 23368007404800.0, "grad_norm": 2.1212885952711464, "language_loss": 0.80465245, "learning_rate": 3.1722056573216315e-06, "loss": 0.82665294, "num_input_tokens_seen": 107613985, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.85546875, "step": 5002, "time_per_iteration": 2.6284048557281494 }, { "auxiliary_loss_clip": 0.01176044, "auxiliary_loss_mlp": 0.01038809, "balance_loss_clip": 1.02235794, "balance_loss_mlp": 1.05086398, "epoch": 0.3007966330978506, "flos": 22966633854720.0, "grad_norm": 1.835286655936966, "language_loss": 0.70988953, "learning_rate": 3.1718995108994336e-06, "loss": 0.73203802, "num_input_tokens_seen": 107631435, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8984375, "step": 5003, "time_per_iteration": 2.560933828353882 }, { "auxiliary_loss_clip": 0.0116536, "auxiliary_loss_mlp": 0.010469, "balance_loss_clip": 1.03063965, "balance_loss_mlp": 1.05075097, "epoch": 0.30085675635051856, "flos": 27818452460160.0, "grad_norm": 1.8944387531155118, "language_loss": 0.70467716, "learning_rate": 3.1715933226551562e-06, "loss": 0.72679973, "num_input_tokens_seen": 107650530, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8828125, "step": 5004, "time_per_iteration": 2.620441198348999 }, { "auxiliary_loss_clip": 0.01158026, "auxiliary_loss_mlp": 0.01055182, "balance_loss_clip": 1.03786111, "balance_loss_mlp": 1.05267322, "epoch": 0.3009168796031865, "flos": 10882729059840.0, "grad_norm": 2.4778024210920493, "language_loss": 0.81590343, "learning_rate": 3.171287092599727e-06, "loss": 0.83803552, "num_input_tokens_seen": 107662240, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.875, "step": 5005, "time_per_iteration": 3.887697696685791 }, { "auxiliary_loss_clip": 0.01143598, "auxiliary_loss_mlp": 0.0104075, "balance_loss_clip": 1.02491903, "balance_loss_mlp": 1.04980278, "epoch": 0.3009770028558545, "flos": 23805399317760.0, "grad_norm": 2.443757774746574, "language_loss": 0.74949586, "learning_rate": 3.1709808207440745e-06, "loss": 0.7713393, "num_input_tokens_seen": 107680330, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 5006, "time_per_iteration": 4.186584711074829 }, { "auxiliary_loss_clip": 0.01160739, "auxiliary_loss_mlp": 0.01298116, "balance_loss_clip": 1.03451347, "balance_loss_mlp": 1.05101037, "epoch": 0.30103712610852246, "flos": 26468211283200.0, "grad_norm": 2.3709647781043994, "language_loss": 0.7059145, "learning_rate": 3.170674507099128e-06, "loss": 0.73050302, "num_input_tokens_seen": 107700020, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.828125, "step": 5007, "time_per_iteration": 2.67919921875 }, { "auxiliary_loss_clip": 0.01145332, "auxiliary_loss_mlp": 0.01042058, "balance_loss_clip": 1.02448618, "balance_loss_mlp": 1.0511018, "epoch": 0.3010972493611904, "flos": 22856459863680.0, "grad_norm": 2.153473604917921, "language_loss": 0.76172417, "learning_rate": 3.17036815167582e-06, "loss": 0.78359807, "num_input_tokens_seen": 107718575, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.85546875, "step": 5008, "time_per_iteration": 2.5821688175201416 }, { "auxiliary_loss_clip": 0.01146682, "auxiliary_loss_mlp": 0.01043071, "balance_loss_clip": 1.02603579, "balance_loss_mlp": 1.05271149, "epoch": 0.3011573726138584, "flos": 24053685102720.0, "grad_norm": 1.9027225869806115, "language_loss": 0.84760928, "learning_rate": 3.170061754485084e-06, "loss": 0.86950678, "num_input_tokens_seen": 107738635, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8515625, "step": 5009, "time_per_iteration": 3.978621006011963 }, { "auxiliary_loss_clip": 0.01150149, "auxiliary_loss_mlp": 0.01042669, "balance_loss_clip": 1.02381051, "balance_loss_mlp": 1.0519278, "epoch": 0.30121749586652635, "flos": 20259687052800.0, "grad_norm": 2.1910957296734903, "language_loss": 0.83706474, "learning_rate": 3.1697553155378527e-06, "loss": 0.85899293, "num_input_tokens_seen": 107753415, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.890625, "step": 5010, "time_per_iteration": 2.521270751953125 }, { "auxiliary_loss_clip": 0.01161257, "auxiliary_loss_mlp": 0.0103823, "balance_loss_clip": 1.02208865, "balance_loss_mlp": 1.05036199, "epoch": 0.3012776191191944, "flos": 26943058103040.0, "grad_norm": 2.4162757744611096, "language_loss": 0.85176373, "learning_rate": 3.1694488348450636e-06, "loss": 0.87375855, "num_input_tokens_seen": 107773840, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84375, "step": 5011, "time_per_iteration": 2.6957223415374756 }, { "auxiliary_loss_clip": 0.01155911, "auxiliary_loss_mlp": 0.01040676, "balance_loss_clip": 1.02365327, "balance_loss_mlp": 1.05002713, "epoch": 0.30133774237186234, "flos": 20412307941120.0, "grad_norm": 2.0847889921628164, "language_loss": 0.71883392, "learning_rate": 3.169142312417654e-06, "loss": 0.74079978, "num_input_tokens_seen": 107792020, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.87890625, "step": 5012, "time_per_iteration": 2.6209542751312256 }, { "auxiliary_loss_clip": 0.01145255, "auxiliary_loss_mlp": 0.01039524, "balance_loss_clip": 1.02343035, "balance_loss_mlp": 1.05004358, "epoch": 0.3013978656245303, "flos": 19792453916160.0, "grad_norm": 2.8686468565671093, "language_loss": 0.87587309, "learning_rate": 3.1688357482665622e-06, "loss": 0.89772093, "num_input_tokens_seen": 107809595, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.86328125, "step": 5013, "time_per_iteration": 2.5674495697021484 }, { "auxiliary_loss_clip": 0.01158771, "auxiliary_loss_mlp": 0.01040768, "balance_loss_clip": 1.02132511, "balance_loss_mlp": 1.05123734, "epoch": 0.30145798887719827, "flos": 16249650652800.0, "grad_norm": 2.257465336170233, "language_loss": 0.82726216, "learning_rate": 3.1685291424027293e-06, "loss": 0.84925759, "num_input_tokens_seen": 107827230, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.89453125, "step": 5014, "time_per_iteration": 2.588487148284912 }, { "auxiliary_loss_clip": 0.01161204, "auxiliary_loss_mlp": 0.0103778, "balance_loss_clip": 1.02127004, "balance_loss_mlp": 1.05246937, "epoch": 0.30151811212986623, "flos": 24571733005440.0, "grad_norm": 1.602161947211767, "language_loss": 0.68289381, "learning_rate": 3.1682224948370973e-06, "loss": 0.70488364, "num_input_tokens_seen": 107847195, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8203125, "step": 5015, "time_per_iteration": 2.5953598022460938 }, { "auxiliary_loss_clip": 0.01155808, "auxiliary_loss_mlp": 0.01041103, "balance_loss_clip": 1.02324569, "balance_loss_mlp": 1.05230641, "epoch": 0.3015782353825342, "flos": 21872076664320.0, "grad_norm": 4.766146695606986, "language_loss": 0.74754465, "learning_rate": 3.1679158055806096e-06, "loss": 0.76951379, "num_input_tokens_seen": 107866420, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.85546875, "step": 5016, "time_per_iteration": 2.597790479660034 }, { "auxiliary_loss_clip": 0.0114784, "auxiliary_loss_mlp": 0.01039605, "balance_loss_clip": 1.02224839, "balance_loss_mlp": 1.05201077, "epoch": 0.30163835863520216, "flos": 28769331248640.0, "grad_norm": 1.6783765551322989, "language_loss": 0.65698975, "learning_rate": 3.1676090746442105e-06, "loss": 0.67886418, "num_input_tokens_seen": 107889090, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8671875, "step": 5017, "time_per_iteration": 2.749807119369507 }, { "auxiliary_loss_clip": 0.01165903, "auxiliary_loss_mlp": 0.01042567, "balance_loss_clip": 1.02567518, "balance_loss_mlp": 1.05152249, "epoch": 0.30169848188787013, "flos": 22966202891520.0, "grad_norm": 2.1972479850290974, "language_loss": 0.68711615, "learning_rate": 3.1673023020388473e-06, "loss": 0.70920086, "num_input_tokens_seen": 107907520, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8671875, "step": 5018, "time_per_iteration": 2.6988613605499268 }, { "auxiliary_loss_clip": 0.01133559, "auxiliary_loss_mlp": 0.01040588, "balance_loss_clip": 1.02577055, "balance_loss_mlp": 1.05155861, "epoch": 0.3017586051405381, "flos": 21835268202240.0, "grad_norm": 2.300034601569296, "language_loss": 0.79473919, "learning_rate": 3.1669954877754677e-06, "loss": 0.81648064, "num_input_tokens_seen": 107925650, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 5019, "time_per_iteration": 2.5355238914489746 }, { "auxiliary_loss_clip": 0.01158482, "auxiliary_loss_mlp": 0.01041204, "balance_loss_clip": 1.02483702, "balance_loss_mlp": 1.05554056, "epoch": 0.30181872839320606, "flos": 22160403135360.0, "grad_norm": 1.8693639708537437, "language_loss": 0.69629306, "learning_rate": 3.1666886318650206e-06, "loss": 0.71828997, "num_input_tokens_seen": 107943975, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8515625, "step": 5020, "time_per_iteration": 2.667581081390381 }, { "auxiliary_loss_clip": 0.01140859, "auxiliary_loss_mlp": 0.01050038, "balance_loss_clip": 1.03351557, "balance_loss_mlp": 1.05251801, "epoch": 0.301878851645874, "flos": 18114168804480.0, "grad_norm": 2.3277581429380056, "language_loss": 0.78211141, "learning_rate": 3.1663817343184576e-06, "loss": 0.80402035, "num_input_tokens_seen": 107962950, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8828125, "step": 5021, "time_per_iteration": 2.5410103797912598 }, { "auxiliary_loss_clip": 0.01144139, "auxiliary_loss_mlp": 0.01036952, "balance_loss_clip": 1.02078724, "balance_loss_mlp": 1.04969501, "epoch": 0.301938974898542, "flos": 17602226213760.0, "grad_norm": 2.0545831724576904, "language_loss": 0.76385629, "learning_rate": 3.166074795146731e-06, "loss": 0.78566718, "num_input_tokens_seen": 107979700, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.85546875, "step": 5022, "time_per_iteration": 2.6376075744628906 }, { "auxiliary_loss_clip": 0.01136124, "auxiliary_loss_mlp": 0.01043051, "balance_loss_clip": 1.02640963, "balance_loss_mlp": 1.04998326, "epoch": 0.30199909815120995, "flos": 11181219079680.0, "grad_norm": 1.8273114475131942, "language_loss": 0.69611579, "learning_rate": 3.1657678143607943e-06, "loss": 0.71790755, "num_input_tokens_seen": 107996645, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.86328125, "step": 5023, "time_per_iteration": 2.523780345916748 }, { "auxiliary_loss_clip": 0.01156686, "auxiliary_loss_mlp": 0.01040473, "balance_loss_clip": 1.02457047, "balance_loss_mlp": 1.05014038, "epoch": 0.302059221403878, "flos": 21907843632000.0, "grad_norm": 2.522293324123847, "language_loss": 0.71523255, "learning_rate": 3.165460791971603e-06, "loss": 0.73720419, "num_input_tokens_seen": 108015020, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.88671875, "step": 5024, "time_per_iteration": 2.6290981769561768 }, { "auxiliary_loss_clip": 0.01143878, "auxiliary_loss_mlp": 0.01046047, "balance_loss_clip": 1.02950072, "balance_loss_mlp": 1.05124116, "epoch": 0.30211934465654594, "flos": 26396390039040.0, "grad_norm": 1.6434365984412689, "language_loss": 0.74230134, "learning_rate": 3.1651537279901135e-06, "loss": 0.76420057, "num_input_tokens_seen": 108036430, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.83984375, "step": 5025, "time_per_iteration": 2.5852699279785156 }, { "auxiliary_loss_clip": 0.01143623, "auxiliary_loss_mlp": 0.01043004, "balance_loss_clip": 1.02726865, "balance_loss_mlp": 1.05215645, "epoch": 0.3021794679092139, "flos": 23400470321280.0, "grad_norm": 1.5204443542543473, "language_loss": 0.66972524, "learning_rate": 3.1648466224272854e-06, "loss": 0.6915915, "num_input_tokens_seen": 108054250, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.82421875, "step": 5026, "time_per_iteration": 2.6228158473968506 }, { "auxiliary_loss_clip": 0.01145278, "auxiliary_loss_mlp": 0.0104306, "balance_loss_clip": 1.02653766, "balance_loss_mlp": 1.05151272, "epoch": 0.30223959116188187, "flos": 20260979942400.0, "grad_norm": 2.1105943541231134, "language_loss": 0.85127699, "learning_rate": 3.1645394752940772e-06, "loss": 0.87316036, "num_input_tokens_seen": 108071495, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.84375, "step": 5027, "time_per_iteration": 2.5205812454223633 }, { "auxiliary_loss_clip": 0.01152785, "auxiliary_loss_mlp": 0.01044316, "balance_loss_clip": 1.0284251, "balance_loss_mlp": 1.05123448, "epoch": 0.30229971441454984, "flos": 26687840993280.0, "grad_norm": 1.514927608616211, "language_loss": 0.78688729, "learning_rate": 3.164232286601451e-06, "loss": 0.80885828, "num_input_tokens_seen": 108092135, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83203125, "step": 5028, "time_per_iteration": 2.653257369995117 }, { "auxiliary_loss_clip": 0.01154203, "auxiliary_loss_mlp": 0.01048929, "balance_loss_clip": 1.03227544, "balance_loss_mlp": 1.05164111, "epoch": 0.3023598376672178, "flos": 34345323953280.0, "grad_norm": 1.7139123234792628, "language_loss": 0.77022654, "learning_rate": 3.1639250563603686e-06, "loss": 0.79225791, "num_input_tokens_seen": 108112945, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.84765625, "step": 5029, "time_per_iteration": 2.719963550567627 }, { "auxiliary_loss_clip": 0.01164853, "auxiliary_loss_mlp": 0.01045609, "balance_loss_clip": 1.02688098, "balance_loss_mlp": 1.05117488, "epoch": 0.30241996091988577, "flos": 23112143850240.0, "grad_norm": 2.302586056425674, "language_loss": 0.82228106, "learning_rate": 3.1636177845817954e-06, "loss": 0.84438562, "num_input_tokens_seen": 108130325, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.8671875, "step": 5030, "time_per_iteration": 2.656137466430664 }, { "auxiliary_loss_clip": 0.01172451, "auxiliary_loss_mlp": 0.01290199, "balance_loss_clip": 1.02527535, "balance_loss_mlp": 1.04895878, "epoch": 0.30248008417255373, "flos": 19390002958080.0, "grad_norm": 1.6123545827055583, "language_loss": 0.70231348, "learning_rate": 3.1633104712766967e-06, "loss": 0.72694004, "num_input_tokens_seen": 108150300, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.875, "step": 5031, "time_per_iteration": 2.671664237976074 }, { "auxiliary_loss_clip": 0.01146631, "auxiliary_loss_mlp": 0.01040784, "balance_loss_clip": 1.02377307, "balance_loss_mlp": 1.0523057, "epoch": 0.3025402074252217, "flos": 23769704177280.0, "grad_norm": 1.8486579542148611, "language_loss": 0.82224488, "learning_rate": 3.1630031164560395e-06, "loss": 0.84411901, "num_input_tokens_seen": 108170330, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.85546875, "step": 5032, "time_per_iteration": 2.668642044067383 }, { "auxiliary_loss_clip": 0.01173302, "auxiliary_loss_mlp": 0.01052569, "balance_loss_clip": 1.0339483, "balance_loss_mlp": 1.05675459, "epoch": 0.30260033067788966, "flos": 25994118648960.0, "grad_norm": 3.790166792359258, "language_loss": 0.72900295, "learning_rate": 3.162695720130793e-06, "loss": 0.75126171, "num_input_tokens_seen": 108191265, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.8984375, "step": 5033, "time_per_iteration": 2.6862974166870117 }, { "auxiliary_loss_clip": 0.01141402, "auxiliary_loss_mlp": 0.01052094, "balance_loss_clip": 1.03463018, "balance_loss_mlp": 1.04875708, "epoch": 0.3026604539305576, "flos": 25374551932800.0, "grad_norm": 1.4637389953078508, "language_loss": 0.7405833, "learning_rate": 3.1623882823119267e-06, "loss": 0.76251823, "num_input_tokens_seen": 108211615, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.8359375, "step": 5034, "time_per_iteration": 2.665276527404785 }, { "auxiliary_loss_clip": 0.01138045, "auxiliary_loss_mlp": 0.01033839, "balance_loss_clip": 1.01699448, "balance_loss_mlp": 1.05191255, "epoch": 0.3027205771832256, "flos": 25812733944960.0, "grad_norm": 1.751031388945944, "language_loss": 0.71694916, "learning_rate": 3.1620808030104127e-06, "loss": 0.73866802, "num_input_tokens_seen": 108231080, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.859375, "step": 5035, "time_per_iteration": 2.6404905319213867 }, { "auxiliary_loss_clip": 0.01142585, "auxiliary_loss_mlp": 0.01039341, "balance_loss_clip": 1.02397513, "balance_loss_mlp": 1.04924011, "epoch": 0.30278070043589356, "flos": 27344539393920.0, "grad_norm": 2.1007539299239575, "language_loss": 0.87116277, "learning_rate": 3.1617732822372237e-06, "loss": 0.89298207, "num_input_tokens_seen": 108251125, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.84375, "step": 5036, "time_per_iteration": 2.6447324752807617 }, { "auxiliary_loss_clip": 0.01159309, "auxiliary_loss_mlp": 0.01050272, "balance_loss_clip": 1.03254533, "balance_loss_mlp": 1.0485456, "epoch": 0.3028408236885616, "flos": 24786227070720.0, "grad_norm": 1.5305611111712931, "language_loss": 0.77871132, "learning_rate": 3.1614657200033355e-06, "loss": 0.80080718, "num_input_tokens_seen": 108272545, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.83984375, "step": 5037, "time_per_iteration": 2.6428420543670654 }, { "auxiliary_loss_clip": 0.01164774, "auxiliary_loss_mlp": 0.01043996, "balance_loss_clip": 1.02669835, "balance_loss_mlp": 1.05103993, "epoch": 0.30290094694122954, "flos": 12932474670720.0, "grad_norm": 4.556569687151379, "language_loss": 0.77932805, "learning_rate": 3.1611581163197228e-06, "loss": 0.80141574, "num_input_tokens_seen": 108289725, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8671875, "step": 5038, "time_per_iteration": 2.547990322113037 }, { "auxiliary_loss_clip": 0.01148663, "auxiliary_loss_mlp": 0.01042284, "balance_loss_clip": 1.02677441, "balance_loss_mlp": 1.04910851, "epoch": 0.3029610701938975, "flos": 25916443488000.0, "grad_norm": 1.7570541884500162, "language_loss": 0.74031049, "learning_rate": 3.160850471197364e-06, "loss": 0.76221991, "num_input_tokens_seen": 108310690, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.81640625, "step": 5039, "time_per_iteration": 2.694122552871704 }, { "auxiliary_loss_clip": 0.01130594, "auxiliary_loss_mlp": 0.0128828, "balance_loss_clip": 1.0262562, "balance_loss_mlp": 1.04927039, "epoch": 0.3030211934465655, "flos": 21980993679360.0, "grad_norm": 3.0066953439493886, "language_loss": 0.79818654, "learning_rate": 3.160542784647238e-06, "loss": 0.82237518, "num_input_tokens_seen": 108328905, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8125, "step": 5040, "time_per_iteration": 2.535170078277588 }, { "auxiliary_loss_clip": 0.01150419, "auxiliary_loss_mlp": 0.01036575, "balance_loss_clip": 1.02137017, "balance_loss_mlp": 1.05078888, "epoch": 0.30308131669923344, "flos": 20991977625600.0, "grad_norm": 1.6215623021937142, "language_loss": 0.82172799, "learning_rate": 3.1602350566803254e-06, "loss": 0.84359789, "num_input_tokens_seen": 108346680, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 5041, "time_per_iteration": 3.9661436080932617 }, { "auxiliary_loss_clip": 0.01113531, "auxiliary_loss_mlp": 0.0105218, "balance_loss_clip": 1.04958153, "balance_loss_mlp": 1.04913688, "epoch": 0.3031414399519014, "flos": 60548875827840.0, "grad_norm": 0.7759860102426609, "language_loss": 0.59503156, "learning_rate": 3.1599272873076076e-06, "loss": 0.61668867, "num_input_tokens_seen": 108413885, "router_z_loss_clip": 0.02600098, "router_z_loss_mlp": 0.2890625, "step": 5042, "time_per_iteration": 3.377969980239868 }, { "auxiliary_loss_clip": 0.01134442, "auxiliary_loss_mlp": 0.01034615, "balance_loss_clip": 1.01870036, "balance_loss_mlp": 1.04893625, "epoch": 0.30320156320456937, "flos": 21907664064000.0, "grad_norm": 1.681936691412705, "language_loss": 0.71212202, "learning_rate": 3.159619476540069e-06, "loss": 0.73381257, "num_input_tokens_seen": 108433640, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.85546875, "step": 5043, "time_per_iteration": 2.5491316318511963 }, { "auxiliary_loss_clip": 0.0116736, "auxiliary_loss_mlp": 0.01037444, "balance_loss_clip": 1.02144587, "balance_loss_mlp": 1.0476824, "epoch": 0.30326168645723733, "flos": 21652770176640.0, "grad_norm": 2.296186423391231, "language_loss": 0.69766009, "learning_rate": 3.1593116243886943e-06, "loss": 0.71970814, "num_input_tokens_seen": 108452640, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83984375, "step": 5044, "time_per_iteration": 2.649226188659668 }, { "auxiliary_loss_clip": 0.01137546, "auxiliary_loss_mlp": 0.01040004, "balance_loss_clip": 1.02428031, "balance_loss_mlp": 1.04706204, "epoch": 0.3033218097099053, "flos": 21871286565120.0, "grad_norm": 1.3396886791914109, "language_loss": 0.77186674, "learning_rate": 3.1590037308644695e-06, "loss": 0.79364228, "num_input_tokens_seen": 108472470, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8125, "step": 5045, "time_per_iteration": 2.5856387615203857 }, { "auxiliary_loss_clip": 0.01132954, "auxiliary_loss_mlp": 0.01290173, "balance_loss_clip": 1.02565789, "balance_loss_mlp": 1.04865336, "epoch": 0.30338193296257326, "flos": 27089717333760.0, "grad_norm": 1.6129939725938534, "language_loss": 0.72475952, "learning_rate": 3.158695795978383e-06, "loss": 0.74899077, "num_input_tokens_seen": 108493025, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.84375, "step": 5046, "time_per_iteration": 4.097904682159424 }, { "auxiliary_loss_clip": 0.01140737, "auxiliary_loss_mlp": 0.01038105, "balance_loss_clip": 1.01973486, "balance_loss_mlp": 1.04601097, "epoch": 0.30344205621524123, "flos": 19534363718400.0, "grad_norm": 1.659336507295084, "language_loss": 0.80904043, "learning_rate": 3.1583878197414237e-06, "loss": 0.83082891, "num_input_tokens_seen": 108513480, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.85546875, "step": 5047, "time_per_iteration": 2.61934757232666 }, { "auxiliary_loss_clip": 0.01165068, "auxiliary_loss_mlp": 0.01041422, "balance_loss_clip": 1.02575803, "balance_loss_mlp": 1.04647279, "epoch": 0.3035021794679092, "flos": 23910976368000.0, "grad_norm": 1.9944967227157455, "language_loss": 0.72096598, "learning_rate": 3.1580798021645833e-06, "loss": 0.74303091, "num_input_tokens_seen": 108533155, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.82421875, "step": 5048, "time_per_iteration": 4.150184154510498 }, { "auxiliary_loss_clip": 0.01148324, "auxiliary_loss_mlp": 0.01036512, "balance_loss_clip": 1.02109814, "balance_loss_mlp": 1.04558527, "epoch": 0.30356230272057716, "flos": 16143606725760.0, "grad_norm": 2.031275975528803, "language_loss": 0.75442296, "learning_rate": 3.157771743258854e-06, "loss": 0.77627134, "num_input_tokens_seen": 108551900, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84765625, "step": 5049, "time_per_iteration": 2.553507089614868 }, { "auxiliary_loss_clip": 0.0113914, "auxiliary_loss_mlp": 0.01040202, "balance_loss_clip": 1.02445412, "balance_loss_mlp": 1.04520488, "epoch": 0.3036224259732452, "flos": 28914697589760.0, "grad_norm": 1.611176706116896, "language_loss": 0.82046211, "learning_rate": 3.1574636430352287e-06, "loss": 0.84225553, "num_input_tokens_seen": 108574005, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84765625, "step": 5050, "time_per_iteration": 4.155571699142456 }, { "auxiliary_loss_clip": 0.01157748, "auxiliary_loss_mlp": 0.01041469, "balance_loss_clip": 1.0245173, "balance_loss_mlp": 1.04656887, "epoch": 0.30368254922591315, "flos": 21105599322240.0, "grad_norm": 2.0923271925917843, "language_loss": 0.73531646, "learning_rate": 3.1571555015047036e-06, "loss": 0.7573086, "num_input_tokens_seen": 108592715, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.83984375, "step": 5051, "time_per_iteration": 2.5518712997436523 }, { "auxiliary_loss_clip": 0.01174038, "auxiliary_loss_mlp": 0.01282561, "balance_loss_clip": 1.01938581, "balance_loss_mlp": 1.04564524, "epoch": 0.3037426724785811, "flos": 23002293081600.0, "grad_norm": 1.81137356115037, "language_loss": 0.76558477, "learning_rate": 3.156847318678275e-06, "loss": 0.79015076, "num_input_tokens_seen": 108611770, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8359375, "step": 5052, "time_per_iteration": 2.6532251834869385 }, { "auxiliary_loss_clip": 0.01131264, "auxiliary_loss_mlp": 0.01044649, "balance_loss_clip": 1.02894378, "balance_loss_mlp": 1.04487836, "epoch": 0.3038027957312491, "flos": 15632705629440.0, "grad_norm": 1.9817219494693075, "language_loss": 0.8231765, "learning_rate": 3.156539094566941e-06, "loss": 0.84493566, "num_input_tokens_seen": 108629070, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.86328125, "step": 5053, "time_per_iteration": 2.5127429962158203 }, { "auxiliary_loss_clip": 0.01145273, "auxiliary_loss_mlp": 0.01038, "balance_loss_clip": 1.02214539, "balance_loss_mlp": 1.04485691, "epoch": 0.30386291898391704, "flos": 12713994195840.0, "grad_norm": 1.7754944241190327, "language_loss": 0.70791757, "learning_rate": 3.1562308291817024e-06, "loss": 0.72975034, "num_input_tokens_seen": 108646315, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83203125, "step": 5054, "time_per_iteration": 2.6161837577819824 }, { "auxiliary_loss_clip": 0.01130019, "auxiliary_loss_mlp": 0.01036982, "balance_loss_clip": 1.02062643, "balance_loss_mlp": 1.04607987, "epoch": 0.303923042236585, "flos": 26359437922560.0, "grad_norm": 2.415717083357656, "language_loss": 0.6879406, "learning_rate": 3.15592252253356e-06, "loss": 0.70961058, "num_input_tokens_seen": 108665920, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.83984375, "step": 5055, "time_per_iteration": 2.525722026824951 }, { "auxiliary_loss_clip": 0.01148186, "auxiliary_loss_mlp": 0.01037628, "balance_loss_clip": 1.02054501, "balance_loss_mlp": 1.04547119, "epoch": 0.30398316548925297, "flos": 19719232041600.0, "grad_norm": 3.772293156842981, "language_loss": 0.67270821, "learning_rate": 3.1556141746335153e-06, "loss": 0.69456637, "num_input_tokens_seen": 108683485, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.84375, "step": 5056, "time_per_iteration": 2.6228818893432617 }, { "auxiliary_loss_clip": 0.01146591, "auxiliary_loss_mlp": 0.01043503, "balance_loss_clip": 1.02822018, "balance_loss_mlp": 1.04739189, "epoch": 0.30404328874192094, "flos": 24239846315520.0, "grad_norm": 1.6232779162797006, "language_loss": 0.82670128, "learning_rate": 3.155305785492574e-06, "loss": 0.84860229, "num_input_tokens_seen": 108702700, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 5057, "time_per_iteration": 2.603086233139038 }, { "auxiliary_loss_clip": 0.01143802, "auxiliary_loss_mlp": 0.01038711, "balance_loss_clip": 1.02297497, "balance_loss_mlp": 1.04442394, "epoch": 0.3041034119945889, "flos": 24498942094080.0, "grad_norm": 2.5346418605466927, "language_loss": 0.89106715, "learning_rate": 3.1549973551217408e-06, "loss": 0.91289234, "num_input_tokens_seen": 108721860, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8203125, "step": 5058, "time_per_iteration": 2.6925315856933594 }, { "auxiliary_loss_clip": 0.01153932, "auxiliary_loss_mlp": 0.01040342, "balance_loss_clip": 1.02433228, "balance_loss_mlp": 1.04402554, "epoch": 0.30416353524725687, "flos": 28288881907200.0, "grad_norm": 1.825536586106516, "language_loss": 0.71728826, "learning_rate": 3.1546888835320227e-06, "loss": 0.73923105, "num_input_tokens_seen": 108743215, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83203125, "step": 5059, "time_per_iteration": 2.641563653945923 }, { "auxiliary_loss_clip": 0.01171596, "auxiliary_loss_mlp": 0.01040372, "balance_loss_clip": 1.02479148, "balance_loss_mlp": 1.04441023, "epoch": 0.30422365849992483, "flos": 23660392112640.0, "grad_norm": 4.53201675971978, "language_loss": 0.73127925, "learning_rate": 3.1543803707344284e-06, "loss": 0.75339901, "num_input_tokens_seen": 108765505, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.82421875, "step": 5060, "time_per_iteration": 2.664569139480591 }, { "auxiliary_loss_clip": 0.01130283, "auxiliary_loss_mlp": 0.01036469, "balance_loss_clip": 1.02059078, "balance_loss_mlp": 1.04686213, "epoch": 0.3042837817525928, "flos": 22998773548800.0, "grad_norm": 1.6876103064074603, "language_loss": 0.76684165, "learning_rate": 3.154071816739969e-06, "loss": 0.78850913, "num_input_tokens_seen": 108783370, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83203125, "step": 5061, "time_per_iteration": 2.5979368686676025 }, { "auxiliary_loss_clip": 0.01160231, "auxiliary_loss_mlp": 0.01038827, "balance_loss_clip": 1.02149463, "balance_loss_mlp": 1.04822326, "epoch": 0.30434390500526076, "flos": 22082332924800.0, "grad_norm": 4.156222950182953, "language_loss": 0.81819332, "learning_rate": 3.1537632215596542e-06, "loss": 0.84018385, "num_input_tokens_seen": 108797430, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8515625, "step": 5062, "time_per_iteration": 2.619518280029297 }, { "auxiliary_loss_clip": 0.01125231, "auxiliary_loss_mlp": 0.0103402, "balance_loss_clip": 1.01948285, "balance_loss_mlp": 1.0438869, "epoch": 0.3044040282579287, "flos": 25483504861440.0, "grad_norm": 2.4784721460614074, "language_loss": 0.74445295, "learning_rate": 3.153454585204498e-06, "loss": 0.76604545, "num_input_tokens_seen": 108816945, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8125, "step": 5063, "time_per_iteration": 2.6488494873046875 }, { "auxiliary_loss_clip": 0.0113763, "auxiliary_loss_mlp": 0.01288637, "balance_loss_clip": 1.02424669, "balance_loss_mlp": 1.04634166, "epoch": 0.30446415151059675, "flos": 21945478106880.0, "grad_norm": 1.6019326831032406, "language_loss": 0.83858311, "learning_rate": 3.153145907685515e-06, "loss": 0.86284578, "num_input_tokens_seen": 108836615, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.82421875, "step": 5064, "time_per_iteration": 2.579850435256958 }, { "auxiliary_loss_clip": 0.01137435, "auxiliary_loss_mlp": 0.0104042, "balance_loss_clip": 1.02408862, "balance_loss_mlp": 1.04538786, "epoch": 0.3045242747632647, "flos": 16435416816000.0, "grad_norm": 2.3335591706620464, "language_loss": 0.75902593, "learning_rate": 3.152837189013721e-06, "loss": 0.78080451, "num_input_tokens_seen": 108855165, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8359375, "step": 5065, "time_per_iteration": 2.557512044906616 }, { "auxiliary_loss_clip": 0.0114794, "auxiliary_loss_mlp": 0.01046547, "balance_loss_clip": 1.02935719, "balance_loss_mlp": 1.04455447, "epoch": 0.3045843980159327, "flos": 31540341957120.0, "grad_norm": 1.7920569364841, "language_loss": 0.61888319, "learning_rate": 3.1525284292001323e-06, "loss": 0.64082801, "num_input_tokens_seen": 108874690, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8515625, "step": 5066, "time_per_iteration": 2.723900318145752 }, { "auxiliary_loss_clip": 0.01155946, "auxiliary_loss_mlp": 0.01045989, "balance_loss_clip": 1.0293473, "balance_loss_mlp": 1.05060649, "epoch": 0.30464452126860064, "flos": 17853636481920.0, "grad_norm": 1.8506425988773523, "language_loss": 0.82298142, "learning_rate": 3.1522196282557698e-06, "loss": 0.84500074, "num_input_tokens_seen": 108893140, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.875, "step": 5067, "time_per_iteration": 2.697287082672119 }, { "auxiliary_loss_clip": 0.01129406, "auxiliary_loss_mlp": 0.01045534, "balance_loss_clip": 1.02975106, "balance_loss_mlp": 1.04781997, "epoch": 0.3047046445212686, "flos": 20631398947200.0, "grad_norm": 1.9033580894848705, "language_loss": 0.63229889, "learning_rate": 3.1519107861916516e-06, "loss": 0.65404832, "num_input_tokens_seen": 108911880, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.81640625, "step": 5068, "time_per_iteration": 2.509185791015625 }, { "auxiliary_loss_clip": 0.01137393, "auxiliary_loss_mlp": 0.01037115, "balance_loss_clip": 1.02165341, "balance_loss_mlp": 1.04652691, "epoch": 0.3047647677739366, "flos": 21287594557440.0, "grad_norm": 4.14348881010855, "language_loss": 0.74907774, "learning_rate": 3.151601903018801e-06, "loss": 0.77082282, "num_input_tokens_seen": 108930440, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8203125, "step": 5069, "time_per_iteration": 2.667109489440918 }, { "auxiliary_loss_clip": 0.01154182, "auxiliary_loss_mlp": 0.01044598, "balance_loss_clip": 1.02892828, "balance_loss_mlp": 1.04701829, "epoch": 0.30482489102660454, "flos": 20995928121600.0, "grad_norm": 1.8419555158996943, "language_loss": 0.75421888, "learning_rate": 3.1512929787482405e-06, "loss": 0.77620667, "num_input_tokens_seen": 108949125, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.80078125, "step": 5070, "time_per_iteration": 2.6663029193878174 }, { "auxiliary_loss_clip": 0.01135409, "auxiliary_loss_mlp": 0.01291957, "balance_loss_clip": 1.02748168, "balance_loss_mlp": 1.04852676, "epoch": 0.3048850142792725, "flos": 26290812988800.0, "grad_norm": 1.7773180455734343, "language_loss": 0.81573063, "learning_rate": 3.150984013390995e-06, "loss": 0.84000432, "num_input_tokens_seen": 108972190, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8671875, "step": 5071, "time_per_iteration": 2.645859718322754 }, { "auxiliary_loss_clip": 0.01151264, "auxiliary_loss_mlp": 0.01044241, "balance_loss_clip": 1.02713442, "balance_loss_mlp": 1.04717112, "epoch": 0.30494513753194047, "flos": 22346241125760.0, "grad_norm": 1.7661736459945094, "language_loss": 0.75871789, "learning_rate": 3.1506750069580916e-06, "loss": 0.78067291, "num_input_tokens_seen": 108990325, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.859375, "step": 5072, "time_per_iteration": 2.5986757278442383 }, { "auxiliary_loss_clip": 0.01157575, "auxiliary_loss_mlp": 0.01042818, "balance_loss_clip": 1.02698731, "balance_loss_mlp": 1.04862082, "epoch": 0.30500526078460843, "flos": 19537667769600.0, "grad_norm": 2.034397915437441, "language_loss": 0.70674938, "learning_rate": 3.150365959460556e-06, "loss": 0.72875333, "num_input_tokens_seen": 109009505, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8203125, "step": 5073, "time_per_iteration": 2.7168869972229004 }, { "auxiliary_loss_clip": 0.01145781, "auxiliary_loss_mlp": 0.01045485, "balance_loss_clip": 1.02930796, "balance_loss_mlp": 1.04973233, "epoch": 0.3050653840372764, "flos": 14465321614080.0, "grad_norm": 2.2695288265470897, "language_loss": 0.75848621, "learning_rate": 3.150056870909419e-06, "loss": 0.78039891, "num_input_tokens_seen": 109026350, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87109375, "step": 5074, "time_per_iteration": 2.569758653640747 }, { "auxiliary_loss_clip": 0.01148955, "auxiliary_loss_mlp": 0.01035544, "balance_loss_clip": 1.02018988, "balance_loss_mlp": 1.04839599, "epoch": 0.30512550728994436, "flos": 24243796811520.0, "grad_norm": 2.0163824410152382, "language_loss": 0.74185812, "learning_rate": 3.1497477413157107e-06, "loss": 0.76370311, "num_input_tokens_seen": 109044165, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 5075, "time_per_iteration": 2.63419771194458 }, { "auxiliary_loss_clip": 0.011357, "auxiliary_loss_mlp": 0.01043979, "balance_loss_clip": 1.02582335, "balance_loss_mlp": 1.04932034, "epoch": 0.30518563054261233, "flos": 16360542915840.0, "grad_norm": 1.8807250933813708, "language_loss": 0.75337386, "learning_rate": 3.1494385706904625e-06, "loss": 0.77517056, "num_input_tokens_seen": 109060665, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.86328125, "step": 5076, "time_per_iteration": 2.516434669494629 }, { "auxiliary_loss_clip": 0.01152358, "auxiliary_loss_mlp": 0.0103909, "balance_loss_clip": 1.02358127, "balance_loss_mlp": 1.05058289, "epoch": 0.30524575379528035, "flos": 21579584215680.0, "grad_norm": 1.5909235913744992, "language_loss": 0.79202664, "learning_rate": 3.149129359044709e-06, "loss": 0.81394112, "num_input_tokens_seen": 109080035, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8359375, "step": 5077, "time_per_iteration": 2.5934460163116455 }, { "auxiliary_loss_clip": 0.01163959, "auxiliary_loss_mlp": 0.01030701, "balance_loss_clip": 1.01608562, "balance_loss_mlp": 1.04894733, "epoch": 0.3053058770479483, "flos": 16545231671040.0, "grad_norm": 2.487457048644254, "language_loss": 0.74977052, "learning_rate": 3.148820106389484e-06, "loss": 0.77171719, "num_input_tokens_seen": 109097385, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 5078, "time_per_iteration": 2.5877318382263184 }, { "auxiliary_loss_clip": 0.01165681, "auxiliary_loss_mlp": 0.01046538, "balance_loss_clip": 1.03108871, "balance_loss_mlp": 1.04799199, "epoch": 0.3053660003006163, "flos": 12312907954560.0, "grad_norm": 2.5861291884492634, "language_loss": 0.66366333, "learning_rate": 3.1485108127358246e-06, "loss": 0.68578553, "num_input_tokens_seen": 109115495, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.81640625, "step": 5079, "time_per_iteration": 2.576444387435913 }, { "auxiliary_loss_clip": 0.01140506, "auxiliary_loss_mlp": 0.01033914, "balance_loss_clip": 1.01864386, "balance_loss_mlp": 1.04803979, "epoch": 0.30542612355328425, "flos": 23112287504640.0, "grad_norm": 1.7828973673884456, "language_loss": 0.8018465, "learning_rate": 3.1482014780947693e-06, "loss": 0.8235907, "num_input_tokens_seen": 109134235, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8359375, "step": 5080, "time_per_iteration": 2.6098897457122803 }, { "auxiliary_loss_clip": 0.01139321, "auxiliary_loss_mlp": 0.01289506, "balance_loss_clip": 1.02645683, "balance_loss_mlp": 1.04747272, "epoch": 0.3054862468059522, "flos": 24389450461440.0, "grad_norm": 2.3182308402287557, "language_loss": 0.80832803, "learning_rate": 3.147892102477356e-06, "loss": 0.83261633, "num_input_tokens_seen": 109152760, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.828125, "step": 5081, "time_per_iteration": 2.5489249229431152 }, { "auxiliary_loss_clip": 0.01138699, "auxiliary_loss_mlp": 0.01039273, "balance_loss_clip": 1.02385926, "balance_loss_mlp": 1.04729676, "epoch": 0.3055463700586202, "flos": 29386096704000.0, "grad_norm": 1.8420338894801664, "language_loss": 0.7189374, "learning_rate": 3.147582685894627e-06, "loss": 0.74071717, "num_input_tokens_seen": 109173925, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.82421875, "step": 5082, "time_per_iteration": 4.079789400100708 }, { "auxiliary_loss_clip": 0.01142568, "auxiliary_loss_mlp": 0.01034586, "balance_loss_clip": 1.01777768, "balance_loss_mlp": 1.04752541, "epoch": 0.30560649331128814, "flos": 25591775431680.0, "grad_norm": 2.002655026481869, "language_loss": 0.72790581, "learning_rate": 3.1472732283576226e-06, "loss": 0.7496773, "num_input_tokens_seen": 109192510, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.86328125, "step": 5083, "time_per_iteration": 2.550511598587036 }, { "auxiliary_loss_clip": 0.01148941, "auxiliary_loss_mlp": 0.01284719, "balance_loss_clip": 1.02071905, "balance_loss_mlp": 1.04614973, "epoch": 0.3056666165639561, "flos": 19128321400320.0, "grad_norm": 1.7606550952064641, "language_loss": 0.71298444, "learning_rate": 3.146963729877389e-06, "loss": 0.73732102, "num_input_tokens_seen": 109210885, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.84765625, "step": 5084, "time_per_iteration": 2.5903515815734863 }, { "auxiliary_loss_clip": 0.01162178, "auxiliary_loss_mlp": 0.01033831, "balance_loss_clip": 1.01759481, "balance_loss_mlp": 1.04911399, "epoch": 0.30572673981662407, "flos": 15523860441600.0, "grad_norm": 1.8483094095874948, "language_loss": 0.78496689, "learning_rate": 3.1466541904649698e-06, "loss": 0.80692697, "num_input_tokens_seen": 109229180, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 5085, "time_per_iteration": 2.5268123149871826 }, { "auxiliary_loss_clip": 0.01127745, "auxiliary_loss_mlp": 0.01038661, "balance_loss_clip": 1.02305079, "balance_loss_mlp": 1.04665565, "epoch": 0.30578686306929204, "flos": 21506541909120.0, "grad_norm": 1.6420644295422453, "language_loss": 0.78296185, "learning_rate": 3.1463446101314118e-06, "loss": 0.80462587, "num_input_tokens_seen": 109249510, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.80859375, "step": 5086, "time_per_iteration": 2.5788803100585938 }, { "auxiliary_loss_clip": 0.011516, "auxiliary_loss_mlp": 0.01039399, "balance_loss_clip": 1.02312732, "balance_loss_mlp": 1.04982185, "epoch": 0.30584698632196, "flos": 20954271323520.0, "grad_norm": 2.103890694573405, "language_loss": 0.76659548, "learning_rate": 3.1460349888877645e-06, "loss": 0.78850543, "num_input_tokens_seen": 109268200, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8359375, "step": 5087, "time_per_iteration": 2.5459301471710205 }, { "auxiliary_loss_clip": 0.01142301, "auxiliary_loss_mlp": 0.01038392, "balance_loss_clip": 1.02172673, "balance_loss_mlp": 1.04789031, "epoch": 0.30590710957462797, "flos": 24681116897280.0, "grad_norm": 2.03700334584049, "language_loss": 0.72366512, "learning_rate": 3.1457253267450756e-06, "loss": 0.74547207, "num_input_tokens_seen": 109288370, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.85546875, "step": 5088, "time_per_iteration": 4.142530918121338 }, { "auxiliary_loss_clip": 0.01146285, "auxiliary_loss_mlp": 0.0103841, "balance_loss_clip": 1.02157748, "balance_loss_mlp": 1.04790306, "epoch": 0.30596723282729593, "flos": 17086907744640.0, "grad_norm": 2.3667893733212946, "language_loss": 0.7905277, "learning_rate": 3.145415623714397e-06, "loss": 0.81237465, "num_input_tokens_seen": 109306730, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.89453125, "step": 5089, "time_per_iteration": 4.088387727737427 }, { "auxiliary_loss_clip": 0.01149453, "auxiliary_loss_mlp": 0.01041497, "balance_loss_clip": 1.02512932, "balance_loss_mlp": 1.04841244, "epoch": 0.30602735607996395, "flos": 22857106308480.0, "grad_norm": 1.7533201218135825, "language_loss": 0.77211809, "learning_rate": 3.145105879806781e-06, "loss": 0.79402757, "num_input_tokens_seen": 109327360, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.83203125, "step": 5090, "time_per_iteration": 2.5938448905944824 }, { "auxiliary_loss_clip": 0.01152046, "auxiliary_loss_mlp": 0.0104542, "balance_loss_clip": 1.02861094, "balance_loss_mlp": 1.04678774, "epoch": 0.3060874793326319, "flos": 29861482227840.0, "grad_norm": 1.8341093884592445, "language_loss": 0.76179206, "learning_rate": 3.144796095033282e-06, "loss": 0.78376675, "num_input_tokens_seen": 109348135, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.875, "step": 5091, "time_per_iteration": 4.029203176498413 }, { "auxiliary_loss_clip": 0.01188544, "auxiliary_loss_mlp": 0.01037125, "balance_loss_clip": 1.02041173, "balance_loss_mlp": 1.04771817, "epoch": 0.3061476025852999, "flos": 20448577699200.0, "grad_norm": 1.62610083006406, "language_loss": 0.7191568, "learning_rate": 3.1444862694049548e-06, "loss": 0.74141347, "num_input_tokens_seen": 109366220, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8671875, "step": 5092, "time_per_iteration": 2.6233303546905518 }, { "auxiliary_loss_clip": 0.01148793, "auxiliary_loss_mlp": 0.01040857, "balance_loss_clip": 1.02551484, "balance_loss_mlp": 1.04761219, "epoch": 0.30620772583796785, "flos": 19391475415680.0, "grad_norm": 1.8601467015030506, "language_loss": 0.8252579, "learning_rate": 3.144176402932857e-06, "loss": 0.84715438, "num_input_tokens_seen": 109385260, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8359375, "step": 5093, "time_per_iteration": 2.5538625717163086 }, { "auxiliary_loss_clip": 0.01129757, "auxiliary_loss_mlp": 0.01286981, "balance_loss_clip": 1.02465558, "balance_loss_mlp": 1.0469054, "epoch": 0.3062678490906358, "flos": 24024562151040.0, "grad_norm": 2.3739182406538406, "language_loss": 0.74681896, "learning_rate": 3.143866495628046e-06, "loss": 0.77098638, "num_input_tokens_seen": 109405025, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.828125, "step": 5094, "time_per_iteration": 2.5789601802825928 }, { "auxiliary_loss_clip": 0.01148549, "auxiliary_loss_mlp": 0.01039387, "balance_loss_clip": 1.02405047, "balance_loss_mlp": 1.04850793, "epoch": 0.3063279723433038, "flos": 19754639873280.0, "grad_norm": 1.9624904901659734, "language_loss": 0.75656164, "learning_rate": 3.1435565475015827e-06, "loss": 0.77844095, "num_input_tokens_seen": 109422465, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.82421875, "step": 5095, "time_per_iteration": 2.5397775173187256 }, { "auxiliary_loss_clip": 0.01138264, "auxiliary_loss_mlp": 0.01037122, "balance_loss_clip": 1.02181578, "balance_loss_mlp": 1.04786491, "epoch": 0.30638809559597174, "flos": 22450022496000.0, "grad_norm": 1.9804058769464967, "language_loss": 0.8088603, "learning_rate": 3.143246558564528e-06, "loss": 0.83061415, "num_input_tokens_seen": 109440575, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 5096, "time_per_iteration": 2.554661750793457 }, { "auxiliary_loss_clip": 0.01141567, "auxiliary_loss_mlp": 0.01036365, "balance_loss_clip": 1.02095759, "balance_loss_mlp": 1.0486964, "epoch": 0.3064482188486397, "flos": 17165157523200.0, "grad_norm": 2.07892398418319, "language_loss": 0.82207012, "learning_rate": 3.1429365288279437e-06, "loss": 0.84384942, "num_input_tokens_seen": 109459050, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83984375, "step": 5097, "time_per_iteration": 2.5142405033111572 }, { "auxiliary_loss_clip": 0.01131119, "auxiliary_loss_mlp": 0.01041361, "balance_loss_clip": 1.0245645, "balance_loss_mlp": 1.0469569, "epoch": 0.3065083421013077, "flos": 23768483114880.0, "grad_norm": 1.946749584348108, "language_loss": 0.78003335, "learning_rate": 3.142626458302895e-06, "loss": 0.80175817, "num_input_tokens_seen": 109475860, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.83984375, "step": 5098, "time_per_iteration": 2.5452780723571777 }, { "auxiliary_loss_clip": 0.01167609, "auxiliary_loss_mlp": 0.01040395, "balance_loss_clip": 1.024647, "balance_loss_mlp": 1.05024362, "epoch": 0.30656846535397564, "flos": 26431833784320.0, "grad_norm": 1.7267562373590768, "language_loss": 0.83903915, "learning_rate": 3.1423163470004473e-06, "loss": 0.86111915, "num_input_tokens_seen": 109494760, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8203125, "step": 5099, "time_per_iteration": 2.6078169345855713 }, { "auxiliary_loss_clip": 0.01131782, "auxiliary_loss_mlp": 0.01042969, "balance_loss_clip": 1.02669144, "balance_loss_mlp": 1.04607749, "epoch": 0.3066285886066436, "flos": 26651786716800.0, "grad_norm": 2.206154688375747, "language_loss": 0.85465395, "learning_rate": 3.1420061949316676e-06, "loss": 0.87640154, "num_input_tokens_seen": 109516480, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.85546875, "step": 5100, "time_per_iteration": 2.558715343475342 }, { "auxiliary_loss_clip": 0.01157461, "auxiliary_loss_mlp": 0.01035948, "balance_loss_clip": 1.01949692, "balance_loss_mlp": 1.04715347, "epoch": 0.30668871185931157, "flos": 15049947375360.0, "grad_norm": 1.995125812573454, "language_loss": 0.79696059, "learning_rate": 3.141696002107624e-06, "loss": 0.81889468, "num_input_tokens_seen": 109534615, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8359375, "step": 5101, "time_per_iteration": 2.53690767288208 }, { "auxiliary_loss_clip": 0.01161843, "auxiliary_loss_mlp": 0.01043544, "balance_loss_clip": 1.0266279, "balance_loss_mlp": 1.04994798, "epoch": 0.30674883511197953, "flos": 20082109190400.0, "grad_norm": 2.1042456230039606, "language_loss": 0.803698, "learning_rate": 3.1413857685393873e-06, "loss": 0.8257519, "num_input_tokens_seen": 109554040, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8515625, "step": 5102, "time_per_iteration": 2.7302749156951904 }, { "auxiliary_loss_clip": 0.01135021, "auxiliary_loss_mlp": 0.0103866, "balance_loss_clip": 1.02177966, "balance_loss_mlp": 1.05108762, "epoch": 0.30680895836464755, "flos": 22893807029760.0, "grad_norm": 1.6985352527962212, "language_loss": 0.88421059, "learning_rate": 3.1410754942380287e-06, "loss": 0.90594739, "num_input_tokens_seen": 109574345, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8359375, "step": 5103, "time_per_iteration": 2.5172414779663086 }, { "auxiliary_loss_clip": 0.01140742, "auxiliary_loss_mlp": 0.01045626, "balance_loss_clip": 1.02963388, "balance_loss_mlp": 1.04876137, "epoch": 0.3068690816173155, "flos": 23696159080320.0, "grad_norm": 1.7098987717095617, "language_loss": 0.73705739, "learning_rate": 3.1407651792146204e-06, "loss": 0.75892103, "num_input_tokens_seen": 109593670, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.828125, "step": 5104, "time_per_iteration": 2.554847002029419 }, { "auxiliary_loss_clip": 0.01143071, "auxiliary_loss_mlp": 0.01289734, "balance_loss_clip": 1.02589667, "balance_loss_mlp": 1.04743218, "epoch": 0.3069292048699835, "flos": 23551044134400.0, "grad_norm": 1.6951159112526075, "language_loss": 0.72423708, "learning_rate": 3.1404548234802376e-06, "loss": 0.7485652, "num_input_tokens_seen": 109613385, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8671875, "step": 5105, "time_per_iteration": 2.5711841583251953 }, { "auxiliary_loss_clip": 0.01151654, "auxiliary_loss_mlp": 0.0104612, "balance_loss_clip": 1.02913237, "balance_loss_mlp": 1.04855466, "epoch": 0.30698932812265145, "flos": 24531656405760.0, "grad_norm": 1.754020621295186, "language_loss": 0.87357175, "learning_rate": 3.140144427045955e-06, "loss": 0.89554942, "num_input_tokens_seen": 109632395, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8515625, "step": 5106, "time_per_iteration": 2.561342239379883 }, { "auxiliary_loss_clip": 0.01164185, "auxiliary_loss_mlp": 0.01047442, "balance_loss_clip": 1.03055, "balance_loss_mlp": 1.04890668, "epoch": 0.3070494513753194, "flos": 20996430912000.0, "grad_norm": 2.010134132238575, "language_loss": 0.71431404, "learning_rate": 3.1398339899228512e-06, "loss": 0.73643029, "num_input_tokens_seen": 109651380, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.88671875, "step": 5107, "time_per_iteration": 2.71806263923645 }, { "auxiliary_loss_clip": 0.01155715, "auxiliary_loss_mlp": 0.01048606, "balance_loss_clip": 1.03252435, "balance_loss_mlp": 1.04664505, "epoch": 0.3071095746279874, "flos": 19025940660480.0, "grad_norm": 2.074124994031799, "language_loss": 0.72571933, "learning_rate": 3.139523512122005e-06, "loss": 0.74776256, "num_input_tokens_seen": 109670240, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.828125, "step": 5108, "time_per_iteration": 2.567858934402466 }, { "auxiliary_loss_clip": 0.01151245, "auxiliary_loss_mlp": 0.01042223, "balance_loss_clip": 1.02649975, "balance_loss_mlp": 1.04923487, "epoch": 0.30716969788065535, "flos": 21215521918080.0, "grad_norm": 1.4932649032008865, "language_loss": 0.85965526, "learning_rate": 3.1392129936544947e-06, "loss": 0.88159001, "num_input_tokens_seen": 109690810, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.83984375, "step": 5109, "time_per_iteration": 2.6266403198242188 }, { "auxiliary_loss_clip": 0.01149639, "auxiliary_loss_mlp": 0.01033946, "balance_loss_clip": 1.01875341, "balance_loss_mlp": 1.04768324, "epoch": 0.3072298211333233, "flos": 25772765086080.0, "grad_norm": 2.12778482263162, "language_loss": 0.67910433, "learning_rate": 3.1389024345314033e-06, "loss": 0.70094019, "num_input_tokens_seen": 109711145, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83984375, "step": 5110, "time_per_iteration": 2.573809862136841 }, { "auxiliary_loss_clip": 0.01136886, "auxiliary_loss_mlp": 0.01033253, "balance_loss_clip": 1.01882899, "balance_loss_mlp": 1.04690051, "epoch": 0.3072899443859913, "flos": 25848931875840.0, "grad_norm": 1.8882106035944886, "language_loss": 0.76906562, "learning_rate": 3.1385918347638142e-06, "loss": 0.79076701, "num_input_tokens_seen": 109731425, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8125, "step": 5111, "time_per_iteration": 2.6208395957946777 }, { "auxiliary_loss_clip": 0.01130833, "auxiliary_loss_mlp": 0.01040404, "balance_loss_clip": 1.02503848, "balance_loss_mlp": 1.04758346, "epoch": 0.30735006763865924, "flos": 25922800195200.0, "grad_norm": 1.9283792341690746, "language_loss": 0.66980624, "learning_rate": 3.1382811943628107e-06, "loss": 0.6915186, "num_input_tokens_seen": 109752720, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.83203125, "step": 5112, "time_per_iteration": 2.5739316940307617 }, { "auxiliary_loss_clip": 0.01145139, "auxiliary_loss_mlp": 0.01041718, "balance_loss_clip": 1.02390862, "balance_loss_mlp": 1.04845095, "epoch": 0.3074101908913272, "flos": 30917004312960.0, "grad_norm": 1.9580155730554463, "language_loss": 0.79564524, "learning_rate": 3.1379705133394793e-06, "loss": 0.81751382, "num_input_tokens_seen": 109772840, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.87890625, "step": 5113, "time_per_iteration": 2.8490102291107178 }, { "auxiliary_loss_clip": 0.01157038, "auxiliary_loss_mlp": 0.01286794, "balance_loss_clip": 1.02346206, "balance_loss_mlp": 1.04495406, "epoch": 0.30747031414399517, "flos": 18401058731520.0, "grad_norm": 1.8585633611928907, "language_loss": 0.76473665, "learning_rate": 3.1376597917049084e-06, "loss": 0.78917491, "num_input_tokens_seen": 109790150, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84765625, "step": 5114, "time_per_iteration": 2.640490770339966 }, { "auxiliary_loss_clip": 0.01158335, "auxiliary_loss_mlp": 0.010341, "balance_loss_clip": 1.01696944, "balance_loss_mlp": 1.04681349, "epoch": 0.30753043739666314, "flos": 22633166966400.0, "grad_norm": 3.6226515589889745, "language_loss": 0.62212777, "learning_rate": 3.1373490294701853e-06, "loss": 0.64405215, "num_input_tokens_seen": 109807985, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.84765625, "step": 5115, "time_per_iteration": 2.6077218055725098 }, { "auxiliary_loss_clip": 0.01140769, "auxiliary_loss_mlp": 0.01038431, "balance_loss_clip": 1.02264822, "balance_loss_mlp": 1.04696107, "epoch": 0.3075905606493311, "flos": 27344072517120.0, "grad_norm": 1.8654124181464053, "language_loss": 0.82839692, "learning_rate": 3.1370382266464007e-06, "loss": 0.85018891, "num_input_tokens_seen": 109825920, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84765625, "step": 5116, "time_per_iteration": 2.570796012878418 }, { "auxiliary_loss_clip": 0.0114692, "auxiliary_loss_mlp": 0.01042233, "balance_loss_clip": 1.02631855, "balance_loss_mlp": 1.04635966, "epoch": 0.3076506839019991, "flos": 22090808534400.0, "grad_norm": 1.928985254552651, "language_loss": 0.76178253, "learning_rate": 3.136727383244647e-06, "loss": 0.783674, "num_input_tokens_seen": 109846220, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.83203125, "step": 5117, "time_per_iteration": 2.6247668266296387 }, { "auxiliary_loss_clip": 0.01159499, "auxiliary_loss_mlp": 0.01034832, "balance_loss_clip": 1.018417, "balance_loss_mlp": 1.04765117, "epoch": 0.3077108071546671, "flos": 21289533891840.0, "grad_norm": 2.3077306412332623, "language_loss": 0.71816212, "learning_rate": 3.136416499276017e-06, "loss": 0.74010539, "num_input_tokens_seen": 109863870, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8515625, "step": 5118, "time_per_iteration": 2.7259020805358887 }, { "auxiliary_loss_clip": 0.0116541, "auxiliary_loss_mlp": 0.01037847, "balance_loss_clip": 1.02262378, "balance_loss_mlp": 1.04575455, "epoch": 0.30777093040733505, "flos": 21430985650560.0, "grad_norm": 1.9527077749457056, "language_loss": 0.74130046, "learning_rate": 3.136105574751605e-06, "loss": 0.76333308, "num_input_tokens_seen": 109883500, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83984375, "step": 5119, "time_per_iteration": 2.6700026988983154 }, { "auxiliary_loss_clip": 0.01143675, "auxiliary_loss_mlp": 0.01045606, "balance_loss_clip": 1.02870262, "balance_loss_mlp": 1.04844296, "epoch": 0.307831053660003, "flos": 23149275534720.0, "grad_norm": 1.756518717575538, "language_loss": 0.80107599, "learning_rate": 3.135794609682508e-06, "loss": 0.82296878, "num_input_tokens_seen": 109904620, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.86328125, "step": 5120, "time_per_iteration": 2.591531276702881 }, { "auxiliary_loss_clip": 0.0113577, "auxiliary_loss_mlp": 0.01045585, "balance_loss_clip": 1.03025496, "balance_loss_mlp": 1.04415989, "epoch": 0.307891176912671, "flos": 17019755268480.0, "grad_norm": 1.6925280081970286, "language_loss": 0.79723358, "learning_rate": 3.135483604079823e-06, "loss": 0.81904715, "num_input_tokens_seen": 109922275, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.82421875, "step": 5121, "time_per_iteration": 2.543701171875 }, { "auxiliary_loss_clip": 0.01147731, "auxiliary_loss_mlp": 0.01033104, "balance_loss_clip": 1.01850116, "balance_loss_mlp": 1.04728436, "epoch": 0.30795130016533895, "flos": 27705046245120.0, "grad_norm": 1.7978815578536005, "language_loss": 0.82667851, "learning_rate": 3.1351725579546484e-06, "loss": 0.8484869, "num_input_tokens_seen": 109944265, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.82421875, "step": 5122, "time_per_iteration": 2.6005303859710693 }, { "auxiliary_loss_clip": 0.01072247, "auxiliary_loss_mlp": 0.01000767, "balance_loss_clip": 0.99901468, "balance_loss_mlp": 1.03491485, "epoch": 0.3080114234180069, "flos": 69058699591680.0, "grad_norm": 0.8613425429682974, "language_loss": 0.58603418, "learning_rate": 3.134861471318086e-06, "loss": 0.60676432, "num_input_tokens_seen": 110014160, "router_z_loss_clip": 0.01757812, "router_z_loss_mlp": 0.28515625, "step": 5123, "time_per_iteration": 3.466860294342041 }, { "auxiliary_loss_clip": 0.0114213, "auxiliary_loss_mlp": 0.01283744, "balance_loss_clip": 1.01933813, "balance_loss_mlp": 1.04617417, "epoch": 0.3080715466706749, "flos": 24060221377920.0, "grad_norm": 1.4880885940291642, "language_loss": 0.83149481, "learning_rate": 3.1345503441812357e-06, "loss": 0.85575354, "num_input_tokens_seen": 110034865, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.87109375, "step": 5124, "time_per_iteration": 3.9882404804229736 }, { "auxiliary_loss_clip": 0.0113134, "auxiliary_loss_mlp": 0.0103556, "balance_loss_clip": 1.02027702, "balance_loss_mlp": 1.04679585, "epoch": 0.30813166992334284, "flos": 25848680480640.0, "grad_norm": 1.9220375164566195, "language_loss": 0.79236615, "learning_rate": 3.1342391765552032e-06, "loss": 0.81403518, "num_input_tokens_seen": 110052930, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84375, "step": 5125, "time_per_iteration": 2.6266229152679443 }, { "auxiliary_loss_clip": 0.01141899, "auxiliary_loss_mlp": 0.01040252, "balance_loss_clip": 1.02281117, "balance_loss_mlp": 1.04578674, "epoch": 0.3081917931760108, "flos": 20449619193600.0, "grad_norm": 2.628982904209024, "language_loss": 0.63512194, "learning_rate": 3.1339279684510916e-06, "loss": 0.65694344, "num_input_tokens_seen": 110071765, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.8671875, "step": 5126, "time_per_iteration": 2.5664174556732178 }, { "auxiliary_loss_clip": 0.01162374, "auxiliary_loss_mlp": 0.01040249, "balance_loss_clip": 1.02484727, "balance_loss_mlp": 1.04585648, "epoch": 0.3082519164286788, "flos": 22166257052160.0, "grad_norm": 1.5532706871888573, "language_loss": 0.8661598, "learning_rate": 3.1336167198800072e-06, "loss": 0.88818604, "num_input_tokens_seen": 110092660, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.80859375, "step": 5127, "time_per_iteration": 2.6833815574645996 }, { "auxiliary_loss_clip": 0.011631, "auxiliary_loss_mlp": 0.01043186, "balance_loss_clip": 1.0276171, "balance_loss_mlp": 1.04472613, "epoch": 0.30831203968134674, "flos": 28913404700160.0, "grad_norm": 2.1439451348510272, "language_loss": 0.68587601, "learning_rate": 3.133305430853059e-06, "loss": 0.70793885, "num_input_tokens_seen": 110114960, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 5128, "time_per_iteration": 2.6722373962402344 }, { "auxiliary_loss_clip": 0.01142111, "auxiliary_loss_mlp": 0.01284617, "balance_loss_clip": 1.02126658, "balance_loss_mlp": 1.04724407, "epoch": 0.3083721629340147, "flos": 25667726739840.0, "grad_norm": 1.8131562616988341, "language_loss": 0.71045268, "learning_rate": 3.132994101381354e-06, "loss": 0.73471999, "num_input_tokens_seen": 110135750, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.859375, "step": 5129, "time_per_iteration": 2.6709935665130615 }, { "auxiliary_loss_clip": 0.01089021, "auxiliary_loss_mlp": 0.01006568, "balance_loss_clip": 1.00468469, "balance_loss_mlp": 1.03396106, "epoch": 0.3084322861866827, "flos": 68212679581440.0, "grad_norm": 0.8235636319430116, "language_loss": 0.59217054, "learning_rate": 3.132682731476005e-06, "loss": 0.6131264, "num_input_tokens_seen": 110189480, "router_z_loss_clip": 0.01879883, "router_z_loss_mlp": 0.28515625, "step": 5130, "time_per_iteration": 6.038992643356323 }, { "auxiliary_loss_clip": 0.01149751, "auxiliary_loss_mlp": 0.01038919, "balance_loss_clip": 1.02239645, "balance_loss_mlp": 1.04678166, "epoch": 0.3084924094393507, "flos": 20296495514880.0, "grad_norm": 3.222433238495182, "language_loss": 0.72540188, "learning_rate": 3.1323713211481227e-06, "loss": 0.74728858, "num_input_tokens_seen": 110206445, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8515625, "step": 5131, "time_per_iteration": 2.5862534046173096 }, { "auxiliary_loss_clip": 0.01154265, "auxiliary_loss_mlp": 0.01034082, "balance_loss_clip": 1.01947939, "balance_loss_mlp": 1.04616594, "epoch": 0.30855253269201866, "flos": 23949831905280.0, "grad_norm": 1.665650355976738, "language_loss": 0.70998693, "learning_rate": 3.1320598704088204e-06, "loss": 0.73187041, "num_input_tokens_seen": 110226845, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8125, "step": 5132, "time_per_iteration": 2.68266224861145 }, { "auxiliary_loss_clip": 0.01125551, "auxiliary_loss_mlp": 0.01288495, "balance_loss_clip": 1.02582324, "balance_loss_mlp": 1.04597366, "epoch": 0.3086126559446866, "flos": 19281876042240.0, "grad_norm": 1.864121314485671, "language_loss": 0.8983407, "learning_rate": 3.1317483792692136e-06, "loss": 0.92248118, "num_input_tokens_seen": 110244095, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.796875, "step": 5133, "time_per_iteration": 2.5611343383789062 }, { "auxiliary_loss_clip": 0.01148571, "auxiliary_loss_mlp": 0.01045806, "balance_loss_clip": 1.03032112, "balance_loss_mlp": 1.04450965, "epoch": 0.3086727791973546, "flos": 33760770019200.0, "grad_norm": 1.961887953004088, "language_loss": 0.670955, "learning_rate": 3.131436847740418e-06, "loss": 0.69289875, "num_input_tokens_seen": 110264240, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.859375, "step": 5134, "time_per_iteration": 4.256523847579956 }, { "auxiliary_loss_clip": 0.01140673, "auxiliary_loss_mlp": 0.01044312, "balance_loss_clip": 1.0286715, "balance_loss_mlp": 1.04624891, "epoch": 0.30873290245002255, "flos": 16034151006720.0, "grad_norm": 1.9914237446151988, "language_loss": 0.82295257, "learning_rate": 3.1311252758335523e-06, "loss": 0.84480238, "num_input_tokens_seen": 110282450, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8515625, "step": 5135, "time_per_iteration": 2.5656514167785645 }, { "auxiliary_loss_clip": 0.01085249, "auxiliary_loss_mlp": 0.01006829, "balance_loss_clip": 1.00511205, "balance_loss_mlp": 1.03045225, "epoch": 0.3087930257026905, "flos": 65048304055680.0, "grad_norm": 0.699277053561479, "language_loss": 0.55337727, "learning_rate": 3.130813663559735e-06, "loss": 0.57429808, "num_input_tokens_seen": 110343715, "router_z_loss_clip": 0.01721191, "router_z_loss_mlp": 0.28125, "step": 5136, "time_per_iteration": 3.245633840560913 }, { "auxiliary_loss_clip": 0.01146263, "auxiliary_loss_mlp": 0.01285631, "balance_loss_clip": 1.02288651, "balance_loss_mlp": 1.04536796, "epoch": 0.3088531489553585, "flos": 74738829824640.0, "grad_norm": 2.0210520358874913, "language_loss": 0.75947499, "learning_rate": 3.130502010930087e-06, "loss": 0.78379393, "num_input_tokens_seen": 110368430, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 5137, "time_per_iteration": 3.0148370265960693 }, { "auxiliary_loss_clip": 0.01154835, "auxiliary_loss_mlp": 0.01036684, "balance_loss_clip": 1.02206337, "balance_loss_mlp": 1.04536366, "epoch": 0.30891327220802645, "flos": 21142300043520.0, "grad_norm": 1.7748424498913362, "language_loss": 0.79610735, "learning_rate": 3.1301903179557293e-06, "loss": 0.81802249, "num_input_tokens_seen": 110386735, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.828125, "step": 5138, "time_per_iteration": 2.6399803161621094 }, { "auxiliary_loss_clip": 0.01165529, "auxiliary_loss_mlp": 0.01037091, "balance_loss_clip": 1.02065825, "balance_loss_mlp": 1.04448795, "epoch": 0.3089733954606944, "flos": 25664494515840.0, "grad_norm": 2.684091410141027, "language_loss": 0.81434429, "learning_rate": 3.1298785846477868e-06, "loss": 0.83637053, "num_input_tokens_seen": 110406820, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84765625, "step": 5139, "time_per_iteration": 2.7126471996307373 }, { "auxiliary_loss_clip": 0.01144094, "auxiliary_loss_mlp": 0.01037044, "balance_loss_clip": 1.01990736, "balance_loss_mlp": 1.04595947, "epoch": 0.3090335187133624, "flos": 19427350124160.0, "grad_norm": 1.9517848291377484, "language_loss": 0.77206403, "learning_rate": 3.129566811017384e-06, "loss": 0.79387546, "num_input_tokens_seen": 110424225, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.890625, "step": 5140, "time_per_iteration": 2.5769405364990234 }, { "auxiliary_loss_clip": 0.01150159, "auxiliary_loss_mlp": 0.01038709, "balance_loss_clip": 1.0240705, "balance_loss_mlp": 1.04440749, "epoch": 0.30909364196603034, "flos": 20011329440640.0, "grad_norm": 2.120663232053305, "language_loss": 0.78306693, "learning_rate": 3.1292549970756476e-06, "loss": 0.8049556, "num_input_tokens_seen": 110443310, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.79296875, "step": 5141, "time_per_iteration": 2.563443660736084 }, { "auxiliary_loss_clip": 0.01151863, "auxiliary_loss_mlp": 0.01039127, "balance_loss_clip": 1.0228312, "balance_loss_mlp": 1.04693532, "epoch": 0.3091537652186983, "flos": 19677575243520.0, "grad_norm": 2.0492248830417514, "language_loss": 0.86604208, "learning_rate": 3.128943142833705e-06, "loss": 0.88795197, "num_input_tokens_seen": 110460215, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8671875, "step": 5142, "time_per_iteration": 2.561204195022583 }, { "auxiliary_loss_clip": 0.01137148, "auxiliary_loss_mlp": 0.01042865, "balance_loss_clip": 1.0266763, "balance_loss_mlp": 1.04329276, "epoch": 0.3092138884713663, "flos": 17020042577280.0, "grad_norm": 1.836870487509836, "language_loss": 0.79029167, "learning_rate": 3.128631248302686e-06, "loss": 0.81209171, "num_input_tokens_seen": 110479385, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8515625, "step": 5143, "time_per_iteration": 2.4843978881835938 }, { "auxiliary_loss_clip": 0.01152659, "auxiliary_loss_mlp": 0.01038931, "balance_loss_clip": 1.02304053, "balance_loss_mlp": 1.0434227, "epoch": 0.3092740117240343, "flos": 25009986844800.0, "grad_norm": 1.5977230760198091, "language_loss": 0.72161835, "learning_rate": 3.12831931349372e-06, "loss": 0.74353427, "num_input_tokens_seen": 110499885, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.82421875, "step": 5144, "time_per_iteration": 2.6230149269104004 }, { "auxiliary_loss_clip": 0.01139084, "auxiliary_loss_mlp": 0.01037811, "balance_loss_clip": 1.0216701, "balance_loss_mlp": 1.04432082, "epoch": 0.30933413497670226, "flos": 25590410714880.0, "grad_norm": 1.7680572712548366, "language_loss": 0.7358824, "learning_rate": 3.128007338417941e-06, "loss": 0.75765133, "num_input_tokens_seen": 110519690, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 5145, "time_per_iteration": 2.543309211730957 }, { "auxiliary_loss_clip": 0.01145152, "auxiliary_loss_mlp": 0.01038297, "balance_loss_clip": 1.02275229, "balance_loss_mlp": 1.04326677, "epoch": 0.3093942582293702, "flos": 24389665943040.0, "grad_norm": 1.7377240504438172, "language_loss": 0.76175094, "learning_rate": 3.127695323086481e-06, "loss": 0.78358537, "num_input_tokens_seen": 110540520, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8359375, "step": 5146, "time_per_iteration": 2.640183448791504 }, { "auxiliary_loss_clip": 0.01146153, "auxiliary_loss_mlp": 0.0103129, "balance_loss_clip": 1.01628733, "balance_loss_mlp": 1.04500806, "epoch": 0.3094543814820382, "flos": 19646441130240.0, "grad_norm": 1.7763368923281109, "language_loss": 0.66409922, "learning_rate": 3.1273832675104766e-06, "loss": 0.68587357, "num_input_tokens_seen": 110557950, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83203125, "step": 5147, "time_per_iteration": 2.487619638442993 }, { "auxiliary_loss_clip": 0.01143118, "auxiliary_loss_mlp": 0.01042907, "balance_loss_clip": 1.02757716, "balance_loss_mlp": 1.0485661, "epoch": 0.30951450473470615, "flos": 25663812157440.0, "grad_norm": 1.793997963743096, "language_loss": 0.74464613, "learning_rate": 3.1270711717010623e-06, "loss": 0.76650637, "num_input_tokens_seen": 110578215, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.85546875, "step": 5148, "time_per_iteration": 2.579200029373169 }, { "auxiliary_loss_clip": 0.01146359, "auxiliary_loss_mlp": 0.0104473, "balance_loss_clip": 1.02776623, "balance_loss_mlp": 1.04776418, "epoch": 0.3095746279873741, "flos": 12020415505920.0, "grad_norm": 2.693556576547642, "language_loss": 0.7256093, "learning_rate": 3.126759035669378e-06, "loss": 0.74752021, "num_input_tokens_seen": 110592990, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8984375, "step": 5149, "time_per_iteration": 2.4529523849487305 }, { "auxiliary_loss_clip": 0.0116541, "auxiliary_loss_mlp": 0.01041772, "balance_loss_clip": 1.0252614, "balance_loss_mlp": 1.04440844, "epoch": 0.3096347512400421, "flos": 23623044946560.0, "grad_norm": 1.83718706144141, "language_loss": 0.84542263, "learning_rate": 3.1264468594265612e-06, "loss": 0.86749446, "num_input_tokens_seen": 110612130, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.85546875, "step": 5150, "time_per_iteration": 2.728388547897339 }, { "auxiliary_loss_clip": 0.01165188, "auxiliary_loss_mlp": 0.0104003, "balance_loss_clip": 1.02366281, "balance_loss_mlp": 1.04511666, "epoch": 0.30969487449271005, "flos": 22529313768960.0, "grad_norm": 1.8279837359498727, "language_loss": 0.78688329, "learning_rate": 3.126134642983754e-06, "loss": 0.8089354, "num_input_tokens_seen": 110632045, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.84375, "step": 5151, "time_per_iteration": 2.5672988891601562 }, { "auxiliary_loss_clip": 0.01140795, "auxiliary_loss_mlp": 0.01042041, "balance_loss_clip": 1.02500606, "balance_loss_mlp": 1.04610324, "epoch": 0.309754997745378, "flos": 15267925059840.0, "grad_norm": 3.5543788085440413, "language_loss": 0.67190003, "learning_rate": 3.125822386352098e-06, "loss": 0.69372845, "num_input_tokens_seen": 110649340, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.859375, "step": 5152, "time_per_iteration": 2.5777230262756348 }, { "auxiliary_loss_clip": 0.01172769, "auxiliary_loss_mlp": 0.01046681, "balance_loss_clip": 1.03064704, "balance_loss_mlp": 1.04489326, "epoch": 0.309815120998046, "flos": 26979291947520.0, "grad_norm": 1.8913076749599715, "language_loss": 0.82912099, "learning_rate": 3.1255100895427373e-06, "loss": 0.8513155, "num_input_tokens_seen": 110668450, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83203125, "step": 5153, "time_per_iteration": 2.6591575145721436 }, { "auxiliary_loss_clip": 0.01149984, "auxiliary_loss_mlp": 0.01286933, "balance_loss_clip": 1.02389264, "balance_loss_mlp": 1.04762745, "epoch": 0.30987524425071394, "flos": 21143161969920.0, "grad_norm": 2.0397102156682876, "language_loss": 0.73765528, "learning_rate": 3.1251977525668167e-06, "loss": 0.76202452, "num_input_tokens_seen": 110689410, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84375, "step": 5154, "time_per_iteration": 2.687145471572876 }, { "auxiliary_loss_clip": 0.01138382, "auxiliary_loss_mlp": 0.01032797, "balance_loss_clip": 1.01766896, "balance_loss_mlp": 1.04596567, "epoch": 0.3099353675033819, "flos": 15268284195840.0, "grad_norm": 2.3898078996851595, "language_loss": 0.76183349, "learning_rate": 3.1248853754354824e-06, "loss": 0.78354526, "num_input_tokens_seen": 110707350, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.83203125, "step": 5155, "time_per_iteration": 2.48830509185791 }, { "auxiliary_loss_clip": 0.01150913, "auxiliary_loss_mlp": 0.01039528, "balance_loss_clip": 1.02355337, "balance_loss_mlp": 1.047351, "epoch": 0.30999549075604993, "flos": 15413794191360.0, "grad_norm": 1.9200802216665052, "language_loss": 0.78963447, "learning_rate": 3.1245729581598826e-06, "loss": 0.81153888, "num_input_tokens_seen": 110724910, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.85546875, "step": 5156, "time_per_iteration": 2.5871119499206543 }, { "auxiliary_loss_clip": 0.01140292, "auxiliary_loss_mlp": 0.01042752, "balance_loss_clip": 1.02571714, "balance_loss_mlp": 1.04454565, "epoch": 0.3100556140087179, "flos": 23184539712000.0, "grad_norm": 2.9568347847732666, "language_loss": 0.74905139, "learning_rate": 3.1242605007511664e-06, "loss": 0.77088183, "num_input_tokens_seen": 110744010, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8671875, "step": 5157, "time_per_iteration": 2.518948554992676 }, { "auxiliary_loss_clip": 0.01153328, "auxiliary_loss_mlp": 0.01038353, "balance_loss_clip": 1.02396441, "balance_loss_mlp": 1.04431415, "epoch": 0.31011573726138586, "flos": 25742169676800.0, "grad_norm": 1.6419689639737685, "language_loss": 0.6910398, "learning_rate": 3.1239480032204857e-06, "loss": 0.71295661, "num_input_tokens_seen": 110765835, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.81640625, "step": 5158, "time_per_iteration": 2.6308083534240723 }, { "auxiliary_loss_clip": 0.01162913, "auxiliary_loss_mlp": 0.01042555, "balance_loss_clip": 1.02816653, "balance_loss_mlp": 1.04530227, "epoch": 0.3101758605140538, "flos": 20011329440640.0, "grad_norm": 2.4458524174730627, "language_loss": 0.84627271, "learning_rate": 3.123635465578991e-06, "loss": 0.86832744, "num_input_tokens_seen": 110784655, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8203125, "step": 5159, "time_per_iteration": 2.542008399963379 }, { "auxiliary_loss_clip": 0.01145393, "auxiliary_loss_mlp": 0.01040183, "balance_loss_clip": 1.02429283, "balance_loss_mlp": 1.04450822, "epoch": 0.3102359837667218, "flos": 19135683688320.0, "grad_norm": 1.9289560551308869, "language_loss": 0.84860337, "learning_rate": 3.123322887837837e-06, "loss": 0.87045908, "num_input_tokens_seen": 110802545, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83203125, "step": 5160, "time_per_iteration": 2.5816147327423096 }, { "auxiliary_loss_clip": 0.0113801, "auxiliary_loss_mlp": 0.0104015, "balance_loss_clip": 1.02441478, "balance_loss_mlp": 1.04698598, "epoch": 0.31029610701938976, "flos": 22265405568000.0, "grad_norm": 1.9722377205872244, "language_loss": 0.75747192, "learning_rate": 3.123010270008179e-06, "loss": 0.77925354, "num_input_tokens_seen": 110820265, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8203125, "step": 5161, "time_per_iteration": 2.5238914489746094 }, { "auxiliary_loss_clip": 0.01139441, "auxiliary_loss_mlp": 0.01040491, "balance_loss_clip": 1.02482677, "balance_loss_mlp": 1.04458451, "epoch": 0.3103562302720577, "flos": 20805349536000.0, "grad_norm": 2.062094795532094, "language_loss": 0.82041192, "learning_rate": 3.1226976121011734e-06, "loss": 0.84221125, "num_input_tokens_seen": 110836195, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.859375, "step": 5162, "time_per_iteration": 2.5472352504730225 }, { "auxiliary_loss_clip": 0.01135012, "auxiliary_loss_mlp": 0.01035247, "balance_loss_clip": 1.02095389, "balance_loss_mlp": 1.04468989, "epoch": 0.3104163535247257, "flos": 22344158136960.0, "grad_norm": 1.5697298143746328, "language_loss": 0.83154726, "learning_rate": 3.1223849141279774e-06, "loss": 0.85324985, "num_input_tokens_seen": 110856420, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8203125, "step": 5163, "time_per_iteration": 2.563051223754883 }, { "auxiliary_loss_clip": 0.0113219, "auxiliary_loss_mlp": 0.01040593, "balance_loss_clip": 1.02544737, "balance_loss_mlp": 1.04674053, "epoch": 0.31047647677739365, "flos": 21689363157120.0, "grad_norm": 1.7006231385426926, "language_loss": 0.76005316, "learning_rate": 3.1220721760997517e-06, "loss": 0.78178102, "num_input_tokens_seen": 110876650, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.85546875, "step": 5164, "time_per_iteration": 2.5270347595214844 }, { "auxiliary_loss_clip": 0.01169615, "auxiliary_loss_mlp": 0.01040128, "balance_loss_clip": 1.02373719, "balance_loss_mlp": 1.04958761, "epoch": 0.3105366000300616, "flos": 18917275040640.0, "grad_norm": 2.4428045624504255, "language_loss": 0.74647355, "learning_rate": 3.1217593980276554e-06, "loss": 0.76857102, "num_input_tokens_seen": 110894445, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.83984375, "step": 5165, "time_per_iteration": 2.6603195667266846 }, { "auxiliary_loss_clip": 0.01155152, "auxiliary_loss_mlp": 0.01038174, "balance_loss_clip": 1.02286768, "balance_loss_mlp": 1.04561353, "epoch": 0.3105967232827296, "flos": 18260397072000.0, "grad_norm": 1.7316165028914698, "language_loss": 0.75787026, "learning_rate": 3.1214465799228525e-06, "loss": 0.77980357, "num_input_tokens_seen": 110912855, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 5166, "time_per_iteration": 4.0340657234191895 }, { "auxiliary_loss_clip": 0.01149437, "auxiliary_loss_mlp": 0.01040437, "balance_loss_clip": 1.0244031, "balance_loss_mlp": 1.04727483, "epoch": 0.31065684653539755, "flos": 17672144037120.0, "grad_norm": 2.4445306111796907, "language_loss": 0.73833382, "learning_rate": 3.121133721796505e-06, "loss": 0.76023257, "num_input_tokens_seen": 110928025, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84375, "step": 5167, "time_per_iteration": 2.5799763202667236 }, { "auxiliary_loss_clip": 0.01094385, "auxiliary_loss_mlp": 0.01010791, "balance_loss_clip": 1.00888336, "balance_loss_mlp": 1.0309025, "epoch": 0.3107169697880655, "flos": 68531996511360.0, "grad_norm": 0.7144021823152313, "language_loss": 0.53004998, "learning_rate": 3.1208208236597795e-06, "loss": 0.55110174, "num_input_tokens_seen": 110992215, "router_z_loss_clip": 0.01904297, "router_z_loss_mlp": 0.27929688, "step": 5168, "time_per_iteration": 3.269892930984497 }, { "auxiliary_loss_clip": 0.01142972, "auxiliary_loss_mlp": 0.01040369, "balance_loss_clip": 1.02385831, "balance_loss_mlp": 1.04825115, "epoch": 0.3107770930407335, "flos": 13188733274880.0, "grad_norm": 1.8554418126127579, "language_loss": 0.78326201, "learning_rate": 3.1205078855238417e-06, "loss": 0.80509537, "num_input_tokens_seen": 111010400, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.859375, "step": 5169, "time_per_iteration": 2.655061721801758 }, { "auxiliary_loss_clip": 0.01150591, "auxiliary_loss_mlp": 0.01042436, "balance_loss_clip": 1.02659297, "balance_loss_mlp": 1.04749024, "epoch": 0.3108372162934015, "flos": 31580849520000.0, "grad_norm": 1.662928525064586, "language_loss": 0.64367926, "learning_rate": 3.12019490739986e-06, "loss": 0.66560954, "num_input_tokens_seen": 111033960, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 5170, "time_per_iteration": 2.7433481216430664 }, { "auxiliary_loss_clip": 0.01162546, "auxiliary_loss_mlp": 0.01286778, "balance_loss_clip": 1.0233016, "balance_loss_mlp": 1.05079961, "epoch": 0.31089733954606946, "flos": 28729829266560.0, "grad_norm": 3.103866067723541, "language_loss": 0.780559, "learning_rate": 3.1198818892990037e-06, "loss": 0.80505228, "num_input_tokens_seen": 111053265, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84375, "step": 5171, "time_per_iteration": 4.019669771194458 }, { "auxiliary_loss_clip": 0.01171015, "auxiliary_loss_mlp": 0.01048401, "balance_loss_clip": 1.03224206, "balance_loss_mlp": 1.04924071, "epoch": 0.3109574627987374, "flos": 19683249592320.0, "grad_norm": 1.7635775373276552, "language_loss": 0.83741939, "learning_rate": 3.1195688312324426e-06, "loss": 0.85961354, "num_input_tokens_seen": 111071130, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.859375, "step": 5172, "time_per_iteration": 4.084698915481567 }, { "auxiliary_loss_clip": 0.011693, "auxiliary_loss_mlp": 0.01045089, "balance_loss_clip": 1.02928174, "balance_loss_mlp": 1.04856372, "epoch": 0.3110175860514054, "flos": 14683981656960.0, "grad_norm": 2.030869418137835, "language_loss": 0.84243393, "learning_rate": 3.11925573321135e-06, "loss": 0.86457777, "num_input_tokens_seen": 111089560, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84765625, "step": 5173, "time_per_iteration": 2.6033132076263428 }, { "auxiliary_loss_clip": 0.0112984, "auxiliary_loss_mlp": 0.01035101, "balance_loss_clip": 1.02060521, "balance_loss_mlp": 1.04976726, "epoch": 0.31107770930407336, "flos": 25739655724800.0, "grad_norm": 4.386823549799088, "language_loss": 0.83494282, "learning_rate": 3.1189425952469003e-06, "loss": 0.85659224, "num_input_tokens_seen": 111109960, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80078125, "step": 5174, "time_per_iteration": 2.554412364959717 }, { "auxiliary_loss_clip": 0.01151563, "auxiliary_loss_mlp": 0.01043516, "balance_loss_clip": 1.02799535, "balance_loss_mlp": 1.05016196, "epoch": 0.3111378325567413, "flos": 19208259118080.0, "grad_norm": 2.2650994304212535, "language_loss": 0.85280502, "learning_rate": 3.1186294173502667e-06, "loss": 0.8747558, "num_input_tokens_seen": 111127960, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8359375, "step": 5175, "time_per_iteration": 4.100980281829834 }, { "auxiliary_loss_clip": 0.01179668, "auxiliary_loss_mlp": 0.01038776, "balance_loss_clip": 1.0226233, "balance_loss_mlp": 1.05332923, "epoch": 0.3111979558094093, "flos": 23696374561920.0, "grad_norm": 1.6871576333616107, "language_loss": 0.83233887, "learning_rate": 3.118316199532627e-06, "loss": 0.8545233, "num_input_tokens_seen": 111146730, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.81640625, "step": 5176, "time_per_iteration": 2.6574110984802246 }, { "auxiliary_loss_clip": 0.01138881, "auxiliary_loss_mlp": 0.01041608, "balance_loss_clip": 1.02556252, "balance_loss_mlp": 1.04655933, "epoch": 0.31125807906207725, "flos": 21033023892480.0, "grad_norm": 2.000694111840445, "language_loss": 0.80612099, "learning_rate": 3.1180029418051586e-06, "loss": 0.82792586, "num_input_tokens_seen": 111166295, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8359375, "step": 5177, "time_per_iteration": 2.563631296157837 }, { "auxiliary_loss_clip": 0.0114724, "auxiliary_loss_mlp": 0.0128544, "balance_loss_clip": 1.02276778, "balance_loss_mlp": 1.05441356, "epoch": 0.3113182023147452, "flos": 23076628277760.0, "grad_norm": 1.6575055455484444, "language_loss": 0.80393088, "learning_rate": 3.117689644179041e-06, "loss": 0.82825768, "num_input_tokens_seen": 111185665, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83984375, "step": 5178, "time_per_iteration": 2.6185781955718994 }, { "auxiliary_loss_clip": 0.01154938, "auxiliary_loss_mlp": 0.01040437, "balance_loss_clip": 1.02507067, "balance_loss_mlp": 1.05070806, "epoch": 0.3113783255674132, "flos": 11838994888320.0, "grad_norm": 2.0225055653332427, "language_loss": 0.81325114, "learning_rate": 3.1173763066654556e-06, "loss": 0.83520484, "num_input_tokens_seen": 111201615, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.859375, "step": 5179, "time_per_iteration": 2.546844005584717 }, { "auxiliary_loss_clip": 0.01162731, "auxiliary_loss_mlp": 0.01044947, "balance_loss_clip": 1.02928245, "balance_loss_mlp": 1.05288684, "epoch": 0.31143844882008115, "flos": 16289547684480.0, "grad_norm": 1.5809320137276208, "language_loss": 0.78342545, "learning_rate": 3.1170629292755837e-06, "loss": 0.80550218, "num_input_tokens_seen": 111220515, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8359375, "step": 5180, "time_per_iteration": 2.5608859062194824 }, { "auxiliary_loss_clip": 0.01169767, "auxiliary_loss_mlp": 0.01032042, "balance_loss_clip": 1.01596069, "balance_loss_mlp": 1.05106187, "epoch": 0.3114985720727491, "flos": 23217792727680.0, "grad_norm": 1.6811881029863918, "language_loss": 0.83196825, "learning_rate": 3.1167495120206094e-06, "loss": 0.85398638, "num_input_tokens_seen": 111240395, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.828125, "step": 5181, "time_per_iteration": 2.658024549484253 }, { "auxiliary_loss_clip": 0.01158498, "auxiliary_loss_mlp": 0.0103724, "balance_loss_clip": 1.02360296, "balance_loss_mlp": 1.05091858, "epoch": 0.3115586953254171, "flos": 30044626698240.0, "grad_norm": 8.35617195842016, "language_loss": 0.73785162, "learning_rate": 3.116436054911717e-06, "loss": 0.75980902, "num_input_tokens_seen": 111261100, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.8046875, "step": 5182, "time_per_iteration": 2.729147434234619 }, { "auxiliary_loss_clip": 0.01173861, "auxiliary_loss_mlp": 0.01289537, "balance_loss_clip": 1.02534723, "balance_loss_mlp": 1.05126572, "epoch": 0.3116188185780851, "flos": 25666326109440.0, "grad_norm": 2.5000570485443494, "language_loss": 0.70751715, "learning_rate": 3.116122557960094e-06, "loss": 0.73215109, "num_input_tokens_seen": 111281320, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.87109375, "step": 5183, "time_per_iteration": 2.6395957469940186 }, { "auxiliary_loss_clip": 0.01095068, "auxiliary_loss_mlp": 0.01015644, "balance_loss_clip": 1.01364124, "balance_loss_mlp": 1.04043007, "epoch": 0.31167894183075306, "flos": 69510058917120.0, "grad_norm": 0.946707667103642, "language_loss": 0.59661102, "learning_rate": 3.115809021176928e-06, "loss": 0.6177181, "num_input_tokens_seen": 111341405, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.27734375, "step": 5184, "time_per_iteration": 3.208940029144287 }, { "auxiliary_loss_clip": 0.01153025, "auxiliary_loss_mlp": 0.01044772, "balance_loss_clip": 1.02942371, "balance_loss_mlp": 1.05122328, "epoch": 0.31173906508342103, "flos": 14939845211520.0, "grad_norm": 2.034773955117025, "language_loss": 0.70026958, "learning_rate": 3.1154954445734088e-06, "loss": 0.72224754, "num_input_tokens_seen": 111358975, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.83984375, "step": 5185, "time_per_iteration": 2.561047077178955 }, { "auxiliary_loss_clip": 0.01165113, "auxiliary_loss_mlp": 0.01045714, "balance_loss_clip": 1.02971625, "balance_loss_mlp": 1.05323625, "epoch": 0.311799188336089, "flos": 16176033728640.0, "grad_norm": 2.3985362953651523, "language_loss": 0.63631487, "learning_rate": 3.115181828160726e-06, "loss": 0.65842307, "num_input_tokens_seen": 111375845, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84765625, "step": 5186, "time_per_iteration": 2.6013779640197754 }, { "auxiliary_loss_clip": 0.01157393, "auxiliary_loss_mlp": 0.01047375, "balance_loss_clip": 1.0300777, "balance_loss_mlp": 1.05162978, "epoch": 0.31185931158875696, "flos": 25009627708800.0, "grad_norm": 2.2192010896850034, "language_loss": 0.86468643, "learning_rate": 3.1148681719500723e-06, "loss": 0.88673407, "num_input_tokens_seen": 111394150, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.87890625, "step": 5187, "time_per_iteration": 2.626315116882324 }, { "auxiliary_loss_clip": 0.01134706, "auxiliary_loss_mlp": 0.01293895, "balance_loss_clip": 1.03039563, "balance_loss_mlp": 1.05039418, "epoch": 0.3119194348414249, "flos": 37232901273600.0, "grad_norm": 1.573830452640449, "language_loss": 0.62694848, "learning_rate": 3.114554475952642e-06, "loss": 0.65123451, "num_input_tokens_seen": 111418355, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84375, "step": 5188, "time_per_iteration": 2.724985122680664 }, { "auxiliary_loss_clip": 0.01163949, "auxiliary_loss_mlp": 0.01043038, "balance_loss_clip": 1.02706361, "balance_loss_mlp": 1.05313492, "epoch": 0.3119795580940929, "flos": 15012779777280.0, "grad_norm": 2.298373802070639, "language_loss": 0.83298194, "learning_rate": 3.1142407401796283e-06, "loss": 0.85505188, "num_input_tokens_seen": 111435445, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8359375, "step": 5189, "time_per_iteration": 2.588804006576538 }, { "auxiliary_loss_clip": 0.01205378, "auxiliary_loss_mlp": 0.01034106, "balance_loss_clip": 1.01856065, "balance_loss_mlp": 1.05109441, "epoch": 0.31203968134676086, "flos": 15998168557440.0, "grad_norm": 2.036994985365738, "language_loss": 0.79055303, "learning_rate": 3.113926964642229e-06, "loss": 0.81294787, "num_input_tokens_seen": 111453430, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.828125, "step": 5190, "time_per_iteration": 2.660919189453125 }, { "auxiliary_loss_clip": 0.01172722, "auxiliary_loss_mlp": 0.01045336, "balance_loss_clip": 1.02942193, "balance_loss_mlp": 1.05245841, "epoch": 0.3120998045994288, "flos": 23837359443840.0, "grad_norm": 1.453265367501302, "language_loss": 0.75172251, "learning_rate": 3.1136131493516426e-06, "loss": 0.77390307, "num_input_tokens_seen": 111475325, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84375, "step": 5191, "time_per_iteration": 2.655580759048462 }, { "auxiliary_loss_clip": 0.0107006, "auxiliary_loss_mlp": 0.01260765, "balance_loss_clip": 1.01212335, "balance_loss_mlp": 1.03360581, "epoch": 0.3121599278520968, "flos": 69184205712000.0, "grad_norm": 0.8472734551644623, "language_loss": 0.63936251, "learning_rate": 3.1132992943190664e-06, "loss": 0.66267079, "num_input_tokens_seen": 111533960, "router_z_loss_clip": 0.01660156, "router_z_loss_mlp": 0.27539062, "step": 5192, "time_per_iteration": 3.232194185256958 }, { "auxiliary_loss_clip": 0.01144012, "auxiliary_loss_mlp": 0.01044768, "balance_loss_clip": 1.02744699, "balance_loss_mlp": 1.04958344, "epoch": 0.31222005110476475, "flos": 23806368984960.0, "grad_norm": 1.6294973366639705, "language_loss": 0.7984494, "learning_rate": 3.1129853995557033e-06, "loss": 0.82033718, "num_input_tokens_seen": 111554055, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.859375, "step": 5193, "time_per_iteration": 2.6321332454681396 }, { "auxiliary_loss_clip": 0.01151656, "auxiliary_loss_mlp": 0.0104472, "balance_loss_clip": 1.02787566, "balance_loss_mlp": 1.04755092, "epoch": 0.3122801743574327, "flos": 25226132935680.0, "grad_norm": 2.2247957223875345, "language_loss": 0.71958327, "learning_rate": 3.1126714650727534e-06, "loss": 0.74154699, "num_input_tokens_seen": 111574305, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.86328125, "step": 5194, "time_per_iteration": 2.6574323177337646 }, { "auxiliary_loss_clip": 0.01135579, "auxiliary_loss_mlp": 0.01040293, "balance_loss_clip": 1.02523732, "balance_loss_mlp": 1.05174351, "epoch": 0.3123402976101007, "flos": 22966490200320.0, "grad_norm": 1.5508956097096567, "language_loss": 0.76309818, "learning_rate": 3.112357490881421e-06, "loss": 0.78485692, "num_input_tokens_seen": 111595680, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8359375, "step": 5195, "time_per_iteration": 2.6466879844665527 }, { "auxiliary_loss_clip": 0.01133678, "auxiliary_loss_mlp": 0.0103844, "balance_loss_clip": 1.02277637, "balance_loss_mlp": 1.04769826, "epoch": 0.3124004208627687, "flos": 25192089820800.0, "grad_norm": 1.3489185015789134, "language_loss": 0.77750289, "learning_rate": 3.112043476992911e-06, "loss": 0.79922408, "num_input_tokens_seen": 111618135, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.859375, "step": 5196, "time_per_iteration": 2.7117786407470703 }, { "auxiliary_loss_clip": 0.01074457, "auxiliary_loss_mlp": 0.01254044, "balance_loss_clip": 1.00526047, "balance_loss_mlp": 1.02878475, "epoch": 0.31246054411543667, "flos": 67485165517440.0, "grad_norm": 0.7758530437955973, "language_loss": 0.54741883, "learning_rate": 3.1117294234184304e-06, "loss": 0.57070386, "num_input_tokens_seen": 111682220, "router_z_loss_clip": 0.01867676, "router_z_loss_mlp": 0.28125, "step": 5197, "time_per_iteration": 3.2645554542541504 }, { "auxiliary_loss_clip": 0.01133121, "auxiliary_loss_mlp": 0.01035821, "balance_loss_clip": 1.02040124, "balance_loss_mlp": 1.05042231, "epoch": 0.31252066736810463, "flos": 17858520731520.0, "grad_norm": 1.541743818758113, "language_loss": 0.66370428, "learning_rate": 3.111415330169186e-06, "loss": 0.68539369, "num_input_tokens_seen": 111700815, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 5198, "time_per_iteration": 2.530285596847534 }, { "auxiliary_loss_clip": 0.0116276, "auxiliary_loss_mlp": 0.01041669, "balance_loss_clip": 1.02466965, "balance_loss_mlp": 1.04929471, "epoch": 0.3125807906207726, "flos": 18475034791680.0, "grad_norm": 5.005463377268425, "language_loss": 0.69259852, "learning_rate": 3.111101197256387e-06, "loss": 0.71464276, "num_input_tokens_seen": 111718195, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8671875, "step": 5199, "time_per_iteration": 2.619659423828125 }, { "auxiliary_loss_clip": 0.01140387, "auxiliary_loss_mlp": 0.01043302, "balance_loss_clip": 1.02692235, "balance_loss_mlp": 1.04754376, "epoch": 0.31264091387344056, "flos": 18946541646720.0, "grad_norm": 1.763713746667729, "language_loss": 0.78634977, "learning_rate": 3.110787024691245e-06, "loss": 0.80818671, "num_input_tokens_seen": 111734440, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8359375, "step": 5200, "time_per_iteration": 2.5836079120635986 }, { "auxiliary_loss_clip": 0.01138611, "auxiliary_loss_mlp": 0.01032322, "balance_loss_clip": 1.0172658, "balance_loss_mlp": 1.04881859, "epoch": 0.3127010371261085, "flos": 21468512384640.0, "grad_norm": 1.9442349520165163, "language_loss": 0.83309448, "learning_rate": 3.1104728124849714e-06, "loss": 0.8548038, "num_input_tokens_seen": 111751960, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80859375, "step": 5201, "time_per_iteration": 2.495046854019165 }, { "auxiliary_loss_clip": 0.01135553, "auxiliary_loss_mlp": 0.01038371, "balance_loss_clip": 1.02237296, "balance_loss_mlp": 1.05116796, "epoch": 0.3127611603787765, "flos": 15336047203200.0, "grad_norm": 1.8006895315322617, "language_loss": 0.69251198, "learning_rate": 3.110158560648779e-06, "loss": 0.71425122, "num_input_tokens_seen": 111769585, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84375, "step": 5202, "time_per_iteration": 2.499974250793457 }, { "auxiliary_loss_clip": 0.01134923, "auxiliary_loss_mlp": 0.01036662, "balance_loss_clip": 1.02220225, "balance_loss_mlp": 1.04942, "epoch": 0.31282128363144446, "flos": 17602980399360.0, "grad_norm": 1.7088819061772322, "language_loss": 0.84008271, "learning_rate": 3.109844269193884e-06, "loss": 0.86179852, "num_input_tokens_seen": 111787880, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8515625, "step": 5203, "time_per_iteration": 2.5040371417999268 }, { "auxiliary_loss_clip": 0.01171418, "auxiliary_loss_mlp": 0.01034543, "balance_loss_clip": 1.01933217, "balance_loss_mlp": 1.0505358, "epoch": 0.3128814068841124, "flos": 26756753235840.0, "grad_norm": 2.1419951776727646, "language_loss": 0.60405886, "learning_rate": 3.109529938131501e-06, "loss": 0.62611854, "num_input_tokens_seen": 111805950, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84765625, "step": 5204, "time_per_iteration": 2.628002643585205 }, { "auxiliary_loss_clip": 0.0114751, "auxiliary_loss_mlp": 0.01032934, "balance_loss_clip": 1.01904643, "balance_loss_mlp": 1.04853129, "epoch": 0.3129415301367804, "flos": 22272372806400.0, "grad_norm": 1.69376733739103, "language_loss": 0.6594404, "learning_rate": 3.109215567472849e-06, "loss": 0.68124485, "num_input_tokens_seen": 111826135, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.80859375, "step": 5205, "time_per_iteration": 2.5316553115844727 }, { "auxiliary_loss_clip": 0.01142459, "auxiliary_loss_mlp": 0.01040298, "balance_loss_clip": 1.02444363, "balance_loss_mlp": 1.04969645, "epoch": 0.31300165338944835, "flos": 26464907232000.0, "grad_norm": 1.9733003305894687, "language_loss": 0.76657414, "learning_rate": 3.1089011572291464e-06, "loss": 0.78840172, "num_input_tokens_seen": 111844700, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8359375, "step": 5206, "time_per_iteration": 2.57460880279541 }, { "auxiliary_loss_clip": 0.01132088, "auxiliary_loss_mlp": 0.01032, "balance_loss_clip": 1.01687253, "balance_loss_mlp": 1.04849124, "epoch": 0.3130617766421163, "flos": 21944652094080.0, "grad_norm": 2.0925413806326834, "language_loss": 0.83283001, "learning_rate": 3.1085867074116143e-06, "loss": 0.85447091, "num_input_tokens_seen": 111861585, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8359375, "step": 5207, "time_per_iteration": 2.5442209243774414 }, { "auxiliary_loss_clip": 0.01139225, "auxiliary_loss_mlp": 0.01038516, "balance_loss_clip": 1.02453303, "balance_loss_mlp": 1.04959059, "epoch": 0.3131218998947843, "flos": 23292774368640.0, "grad_norm": 1.6782461192928708, "language_loss": 0.71160775, "learning_rate": 3.108272218031475e-06, "loss": 0.73338521, "num_input_tokens_seen": 111882950, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.80859375, "step": 5208, "time_per_iteration": 3.9740231037139893 }, { "auxiliary_loss_clip": 0.01155786, "auxiliary_loss_mlp": 0.01041942, "balance_loss_clip": 1.02595615, "balance_loss_mlp": 1.05340028, "epoch": 0.3131820231474523, "flos": 21139642437120.0, "grad_norm": 2.319197122463746, "language_loss": 0.74454689, "learning_rate": 3.1079576890999498e-06, "loss": 0.7665242, "num_input_tokens_seen": 111901640, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84375, "step": 5209, "time_per_iteration": 2.610058546066284 }, { "auxiliary_loss_clip": 0.01134328, "auxiliary_loss_mlp": 0.01039432, "balance_loss_clip": 1.02518606, "balance_loss_mlp": 1.04969072, "epoch": 0.31324214640012027, "flos": 23909863046400.0, "grad_norm": 1.8589224607273658, "language_loss": 0.773983, "learning_rate": 3.107643120628265e-06, "loss": 0.79572058, "num_input_tokens_seen": 111919615, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.84765625, "step": 5210, "time_per_iteration": 2.5933079719543457 }, { "auxiliary_loss_clip": 0.01140426, "auxiliary_loss_mlp": 0.01038215, "balance_loss_clip": 1.02314723, "balance_loss_mlp": 1.05128551, "epoch": 0.31330226965278823, "flos": 22236929061120.0, "grad_norm": 2.027707112651965, "language_loss": 0.78969294, "learning_rate": 3.1073285126276467e-06, "loss": 0.81147933, "num_input_tokens_seen": 111938485, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 5211, "time_per_iteration": 2.587008237838745 }, { "auxiliary_loss_clip": 0.01141304, "auxiliary_loss_mlp": 0.01036948, "balance_loss_clip": 1.02217758, "balance_loss_mlp": 1.05069709, "epoch": 0.3133623929054562, "flos": 19753993428480.0, "grad_norm": 1.9337171793051202, "language_loss": 0.79769766, "learning_rate": 3.1070138651093217e-06, "loss": 0.81948018, "num_input_tokens_seen": 111956425, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.81640625, "step": 5212, "time_per_iteration": 2.625279664993286 }, { "auxiliary_loss_clip": 0.01155641, "auxiliary_loss_mlp": 0.01047383, "balance_loss_clip": 1.03137279, "balance_loss_mlp": 1.05071414, "epoch": 0.31342251615812416, "flos": 27162256849920.0, "grad_norm": 1.916768818096531, "language_loss": 0.70739007, "learning_rate": 3.10669917808452e-06, "loss": 0.7294203, "num_input_tokens_seen": 111975915, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8671875, "step": 5213, "time_per_iteration": 5.529375076293945 }, { "auxiliary_loss_clip": 0.01147232, "auxiliary_loss_mlp": 0.01039616, "balance_loss_clip": 1.0243926, "balance_loss_mlp": 1.05281019, "epoch": 0.31348263941079213, "flos": 20229809915520.0, "grad_norm": 2.0610564660229596, "language_loss": 0.77150023, "learning_rate": 3.106384451564471e-06, "loss": 0.7933687, "num_input_tokens_seen": 111995055, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.85546875, "step": 5214, "time_per_iteration": 2.5734593868255615 }, { "auxiliary_loss_clip": 0.01165599, "auxiliary_loss_mlp": 0.01033917, "balance_loss_clip": 1.01951647, "balance_loss_mlp": 1.05014229, "epoch": 0.3135427626634601, "flos": 24607643627520.0, "grad_norm": 1.5379826417171438, "language_loss": 0.82975101, "learning_rate": 3.106069685560407e-06, "loss": 0.8517462, "num_input_tokens_seen": 112015830, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.80078125, "step": 5215, "time_per_iteration": 2.6239116191864014 }, { "auxiliary_loss_clip": 0.01142683, "auxiliary_loss_mlp": 0.01037574, "balance_loss_clip": 1.02189803, "balance_loss_mlp": 1.05003536, "epoch": 0.31360288591612806, "flos": 20959873845120.0, "grad_norm": 1.999278542866579, "language_loss": 0.79647923, "learning_rate": 3.1057548800835613e-06, "loss": 0.81828177, "num_input_tokens_seen": 112035065, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 5216, "time_per_iteration": 3.9637656211853027 }, { "auxiliary_loss_clip": 0.01155529, "auxiliary_loss_mlp": 0.01044118, "balance_loss_clip": 1.02685666, "balance_loss_mlp": 1.05101049, "epoch": 0.313663009168796, "flos": 26980513009920.0, "grad_norm": 1.7796509712099409, "language_loss": 0.68506587, "learning_rate": 3.105440035145168e-06, "loss": 0.7070623, "num_input_tokens_seen": 112058405, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8671875, "step": 5217, "time_per_iteration": 2.678279399871826 }, { "auxiliary_loss_clip": 0.01153003, "auxiliary_loss_mlp": 0.01037802, "balance_loss_clip": 1.02273381, "balance_loss_mlp": 1.05063248, "epoch": 0.313723132421464, "flos": 18040911016320.0, "grad_norm": 1.6664285828750414, "language_loss": 0.80553097, "learning_rate": 3.105125150756463e-06, "loss": 0.82743907, "num_input_tokens_seen": 112076420, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.84765625, "step": 5218, "time_per_iteration": 2.5783920288085938 }, { "auxiliary_loss_clip": 0.01132193, "auxiliary_loss_mlp": 0.01042708, "balance_loss_clip": 1.02689481, "balance_loss_mlp": 1.04776645, "epoch": 0.31378325567413196, "flos": 22488913946880.0, "grad_norm": 2.4008241699748583, "language_loss": 0.69162983, "learning_rate": 3.1048102269286843e-06, "loss": 0.71337891, "num_input_tokens_seen": 112090775, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84375, "step": 5219, "time_per_iteration": 2.576721429824829 }, { "auxiliary_loss_clip": 0.01151069, "auxiliary_loss_mlp": 0.01044258, "balance_loss_clip": 1.02934504, "balance_loss_mlp": 1.05008781, "epoch": 0.3138433789267999, "flos": 22419247518720.0, "grad_norm": 2.292831630522082, "language_loss": 0.79788923, "learning_rate": 3.104495263673071e-06, "loss": 0.81984252, "num_input_tokens_seen": 112110980, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.83203125, "step": 5220, "time_per_iteration": 2.5874407291412354 }, { "auxiliary_loss_clip": 0.01139969, "auxiliary_loss_mlp": 0.01041922, "balance_loss_clip": 1.02748573, "balance_loss_mlp": 1.04820871, "epoch": 0.3139035021794679, "flos": 13005912026880.0, "grad_norm": 1.6568749531813725, "language_loss": 0.73045135, "learning_rate": 3.1041802610008624e-06, "loss": 0.75227022, "num_input_tokens_seen": 112129020, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.828125, "step": 5221, "time_per_iteration": 2.540639877319336 }, { "auxiliary_loss_clip": 0.01147081, "auxiliary_loss_mlp": 0.01037942, "balance_loss_clip": 1.02305329, "balance_loss_mlp": 1.04803824, "epoch": 0.31396362543213585, "flos": 16945994689920.0, "grad_norm": 2.0228307821200455, "language_loss": 0.81240392, "learning_rate": 3.103865218923301e-06, "loss": 0.83425415, "num_input_tokens_seen": 112147865, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 5222, "time_per_iteration": 2.5807273387908936 }, { "auxiliary_loss_clip": 0.01144233, "auxiliary_loss_mlp": 0.01041463, "balance_loss_clip": 1.02485693, "balance_loss_mlp": 1.04937994, "epoch": 0.31402374868480387, "flos": 20156731695360.0, "grad_norm": 2.2291636197572733, "language_loss": 0.70025969, "learning_rate": 3.103550137451629e-06, "loss": 0.72211659, "num_input_tokens_seen": 112166745, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.859375, "step": 5223, "time_per_iteration": 2.559225559234619 }, { "auxiliary_loss_clip": 0.01139547, "auxiliary_loss_mlp": 0.01284491, "balance_loss_clip": 1.02249765, "balance_loss_mlp": 1.04664278, "epoch": 0.31408387193747184, "flos": 21251073404160.0, "grad_norm": 1.6121272102668807, "language_loss": 0.80715692, "learning_rate": 3.1032350165970915e-06, "loss": 0.83139735, "num_input_tokens_seen": 112185895, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8359375, "step": 5224, "time_per_iteration": 2.577338933944702 }, { "auxiliary_loss_clip": 0.01151742, "auxiliary_loss_mlp": 0.01044321, "balance_loss_clip": 1.02754784, "balance_loss_mlp": 1.05337024, "epoch": 0.3141439951901398, "flos": 27484267299840.0, "grad_norm": 2.301772616976714, "language_loss": 0.58562058, "learning_rate": 3.102919856370934e-06, "loss": 0.60758114, "num_input_tokens_seen": 112204465, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.89453125, "step": 5225, "time_per_iteration": 2.608085870742798 }, { "auxiliary_loss_clip": 0.0116439, "auxiliary_loss_mlp": 0.01033544, "balance_loss_clip": 1.01928639, "balance_loss_mlp": 1.04895258, "epoch": 0.31420411844280777, "flos": 17852235851520.0, "grad_norm": 1.7854104909766657, "language_loss": 0.81509817, "learning_rate": 3.102604656784404e-06, "loss": 0.83707744, "num_input_tokens_seen": 112221635, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 5226, "time_per_iteration": 2.6028037071228027 }, { "auxiliary_loss_clip": 0.01143613, "auxiliary_loss_mlp": 0.01046963, "balance_loss_clip": 1.03089356, "balance_loss_mlp": 1.04978096, "epoch": 0.31426424169547573, "flos": 21616967295360.0, "grad_norm": 1.9233370725947256, "language_loss": 0.73835659, "learning_rate": 3.10228941784875e-06, "loss": 0.76026237, "num_input_tokens_seen": 112241240, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84765625, "step": 5227, "time_per_iteration": 2.593153715133667 }, { "auxiliary_loss_clip": 0.01145073, "auxiliary_loss_mlp": 0.01041698, "balance_loss_clip": 1.02598596, "balance_loss_mlp": 1.05153644, "epoch": 0.3143243649481437, "flos": 30920631586560.0, "grad_norm": 1.702037712521077, "language_loss": 0.67486084, "learning_rate": 3.101974139575222e-06, "loss": 0.69672859, "num_input_tokens_seen": 112262350, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84375, "step": 5228, "time_per_iteration": 2.625748872756958 }, { "auxiliary_loss_clip": 0.01150781, "auxiliary_loss_mlp": 0.01046273, "balance_loss_clip": 1.03137779, "balance_loss_mlp": 1.04872382, "epoch": 0.31438448820081166, "flos": 22821411168000.0, "grad_norm": 2.0486665366462007, "language_loss": 0.79528052, "learning_rate": 3.1016588219750716e-06, "loss": 0.81725109, "num_input_tokens_seen": 112283710, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.84375, "step": 5229, "time_per_iteration": 2.6410698890686035 }, { "auxiliary_loss_clip": 0.01151384, "auxiliary_loss_mlp": 0.01038077, "balance_loss_clip": 1.02206683, "balance_loss_mlp": 1.0509932, "epoch": 0.3144446114534796, "flos": 23292127923840.0, "grad_norm": 1.7751826829341064, "language_loss": 0.69316494, "learning_rate": 3.1013434650595522e-06, "loss": 0.71505952, "num_input_tokens_seen": 112304285, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.828125, "step": 5230, "time_per_iteration": 2.6223206520080566 }, { "auxiliary_loss_clip": 0.01158222, "auxiliary_loss_mlp": 0.01040817, "balance_loss_clip": 1.02461636, "balance_loss_mlp": 1.04590774, "epoch": 0.3145047347061476, "flos": 31355976424320.0, "grad_norm": 1.5666921852735554, "language_loss": 0.79418671, "learning_rate": 3.101028068839917e-06, "loss": 0.81617713, "num_input_tokens_seen": 112325110, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8515625, "step": 5231, "time_per_iteration": 2.6920583248138428 }, { "auxiliary_loss_clip": 0.01147949, "auxiliary_loss_mlp": 0.01044299, "balance_loss_clip": 1.02754951, "balance_loss_mlp": 1.04795289, "epoch": 0.31456485795881556, "flos": 10889552643840.0, "grad_norm": 2.0383296037484775, "language_loss": 0.8472898, "learning_rate": 3.100712633327422e-06, "loss": 0.86921227, "num_input_tokens_seen": 112339855, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8203125, "step": 5232, "time_per_iteration": 2.527726411819458 }, { "auxiliary_loss_clip": 0.01165516, "auxiliary_loss_mlp": 0.01044181, "balance_loss_clip": 1.0279808, "balance_loss_mlp": 1.04724479, "epoch": 0.3146249812114835, "flos": 17092438439040.0, "grad_norm": 2.4764656901177156, "language_loss": 0.79993582, "learning_rate": 3.100397158533325e-06, "loss": 0.82203275, "num_input_tokens_seen": 112358480, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.82421875, "step": 5233, "time_per_iteration": 2.673250436782837 }, { "auxiliary_loss_clip": 0.01170444, "auxiliary_loss_mlp": 0.01041386, "balance_loss_clip": 1.02554274, "balance_loss_mlp": 1.04914594, "epoch": 0.3146851044641515, "flos": 55291442889600.0, "grad_norm": 1.7500394484528963, "language_loss": 0.70727479, "learning_rate": 3.100081644468883e-06, "loss": 0.72939312, "num_input_tokens_seen": 112382350, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 5234, "time_per_iteration": 2.8706071376800537 }, { "auxiliary_loss_clip": 0.01160479, "auxiliary_loss_mlp": 0.0103579, "balance_loss_clip": 1.02020979, "balance_loss_mlp": 1.04842937, "epoch": 0.31474522771681945, "flos": 27015884928000.0, "grad_norm": 2.1361017875185047, "language_loss": 0.72652936, "learning_rate": 3.0997660911453575e-06, "loss": 0.748492, "num_input_tokens_seen": 112400260, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8515625, "step": 5235, "time_per_iteration": 2.6278398036956787 }, { "auxiliary_loss_clip": 0.01139131, "auxiliary_loss_mlp": 0.01041549, "balance_loss_clip": 1.02620637, "balance_loss_mlp": 1.04716873, "epoch": 0.3148053509694875, "flos": 21251935330560.0, "grad_norm": 2.0704020827125498, "language_loss": 0.78254718, "learning_rate": 3.0994504985740096e-06, "loss": 0.80435395, "num_input_tokens_seen": 112419400, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 5236, "time_per_iteration": 2.6741766929626465 }, { "auxiliary_loss_clip": 0.01135186, "auxiliary_loss_mlp": 0.0104357, "balance_loss_clip": 1.02720237, "balance_loss_mlp": 1.04814541, "epoch": 0.31486547422215544, "flos": 31248675521280.0, "grad_norm": 1.6874341307453102, "language_loss": 0.7565527, "learning_rate": 3.099134866766101e-06, "loss": 0.77834028, "num_input_tokens_seen": 112440825, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87109375, "step": 5237, "time_per_iteration": 2.638657569885254 }, { "auxiliary_loss_clip": 0.01128972, "auxiliary_loss_mlp": 0.01035906, "balance_loss_clip": 1.02160645, "balance_loss_mlp": 1.04766488, "epoch": 0.3149255974748234, "flos": 19828615933440.0, "grad_norm": 1.765755500494831, "language_loss": 0.79617542, "learning_rate": 3.0988191957328967e-06, "loss": 0.81782413, "num_input_tokens_seen": 112459180, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8125, "step": 5238, "time_per_iteration": 2.5271222591400146 }, { "auxiliary_loss_clip": 0.01065419, "auxiliary_loss_mlp": 0.01008722, "balance_loss_clip": 1.00659966, "balance_loss_mlp": 1.02846897, "epoch": 0.31498572072749137, "flos": 67683965339520.0, "grad_norm": 0.8393985783164112, "language_loss": 0.6796062, "learning_rate": 3.0985034854856615e-06, "loss": 0.7003476, "num_input_tokens_seen": 112516680, "router_z_loss_clip": 0.02124023, "router_z_loss_mlp": 0.27929688, "step": 5239, "time_per_iteration": 3.1172165870666504 }, { "auxiliary_loss_clip": 0.01153371, "auxiliary_loss_mlp": 0.010426, "balance_loss_clip": 1.02476609, "balance_loss_mlp": 1.04830766, "epoch": 0.31504584398015933, "flos": 19793136274560.0, "grad_norm": 1.992733890205861, "language_loss": 0.82828093, "learning_rate": 3.098187736035663e-06, "loss": 0.85024071, "num_input_tokens_seen": 112535895, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.87109375, "step": 5240, "time_per_iteration": 2.6083221435546875 }, { "auxiliary_loss_clip": 0.01152423, "auxiliary_loss_mlp": 0.01286787, "balance_loss_clip": 1.02395618, "balance_loss_mlp": 1.05094326, "epoch": 0.3151059672328273, "flos": 26615409217920.0, "grad_norm": 1.6173530380137369, "language_loss": 0.81083584, "learning_rate": 3.097871947394168e-06, "loss": 0.83522791, "num_input_tokens_seen": 112557490, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 5241, "time_per_iteration": 2.558396816253662 }, { "auxiliary_loss_clip": 0.01131891, "auxiliary_loss_mlp": 0.01040487, "balance_loss_clip": 1.02513289, "balance_loss_mlp": 1.04996443, "epoch": 0.31516609048549526, "flos": 24204438483840.0, "grad_norm": 1.7783090601560236, "language_loss": 0.74360394, "learning_rate": 3.0975561195724477e-06, "loss": 0.76532769, "num_input_tokens_seen": 112577075, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8203125, "step": 5242, "time_per_iteration": 2.550943613052368 }, { "auxiliary_loss_clip": 0.01158948, "auxiliary_loss_mlp": 0.01038307, "balance_loss_clip": 1.02155781, "balance_loss_mlp": 1.04818678, "epoch": 0.31522621373816323, "flos": 25958710817280.0, "grad_norm": 1.8021950110900828, "language_loss": 0.73649144, "learning_rate": 3.0972402525817732e-06, "loss": 0.75846398, "num_input_tokens_seen": 112597620, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8359375, "step": 5243, "time_per_iteration": 2.587944746017456 }, { "auxiliary_loss_clip": 0.0113978, "auxiliary_loss_mlp": 0.0104339, "balance_loss_clip": 1.02712977, "balance_loss_mlp": 1.04678845, "epoch": 0.3152863369908312, "flos": 21908813299200.0, "grad_norm": 1.646095204805339, "language_loss": 0.64555681, "learning_rate": 3.0969243464334166e-06, "loss": 0.6673885, "num_input_tokens_seen": 112617150, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.83984375, "step": 5244, "time_per_iteration": 2.6442360877990723 }, { "auxiliary_loss_clip": 0.01170646, "auxiliary_loss_mlp": 0.01044161, "balance_loss_clip": 1.02873552, "balance_loss_mlp": 1.04858875, "epoch": 0.31534646024349916, "flos": 16281072074880.0, "grad_norm": 3.0988572632337426, "language_loss": 0.91328585, "learning_rate": 3.0966084011386517e-06, "loss": 0.93543392, "num_input_tokens_seen": 112631090, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.86328125, "step": 5245, "time_per_iteration": 2.5041182041168213 }, { "auxiliary_loss_clip": 0.01144109, "auxiliary_loss_mlp": 0.01049496, "balance_loss_clip": 1.03275919, "balance_loss_mlp": 1.04900527, "epoch": 0.3154065834961671, "flos": 24717243000960.0, "grad_norm": 1.8265568820353364, "language_loss": 0.75117683, "learning_rate": 3.0962924167087526e-06, "loss": 0.77311289, "num_input_tokens_seen": 112651220, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.86328125, "step": 5246, "time_per_iteration": 2.6084673404693604 }, { "auxiliary_loss_clip": 0.01133462, "auxiliary_loss_mlp": 0.01046506, "balance_loss_clip": 1.03075862, "balance_loss_mlp": 1.04739666, "epoch": 0.3154667067488351, "flos": 35371148469120.0, "grad_norm": 1.4972343000455628, "language_loss": 0.61371827, "learning_rate": 3.0959763931549985e-06, "loss": 0.63551801, "num_input_tokens_seen": 112671560, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.859375, "step": 5247, "time_per_iteration": 2.629760265350342 }, { "auxiliary_loss_clip": 0.01143263, "auxiliary_loss_mlp": 0.01044077, "balance_loss_clip": 1.02789986, "balance_loss_mlp": 1.04762769, "epoch": 0.31552683000150306, "flos": 17456464823040.0, "grad_norm": 2.4329192714190957, "language_loss": 0.82662833, "learning_rate": 3.0956603304886653e-06, "loss": 0.84850168, "num_input_tokens_seen": 112689790, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8671875, "step": 5248, "time_per_iteration": 2.568342685699463 }, { "auxiliary_loss_clip": 0.01151602, "auxiliary_loss_mlp": 0.01048894, "balance_loss_clip": 1.03126311, "balance_loss_mlp": 1.04798794, "epoch": 0.3155869532541711, "flos": 18405763413120.0, "grad_norm": 1.9856811565870478, "language_loss": 0.84764355, "learning_rate": 3.095344228721034e-06, "loss": 0.86964846, "num_input_tokens_seen": 112708265, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.859375, "step": 5249, "time_per_iteration": 3.9311516284942627 }, { "auxiliary_loss_clip": 0.01162882, "auxiliary_loss_mlp": 0.01043023, "balance_loss_clip": 1.02629757, "balance_loss_mlp": 1.05006838, "epoch": 0.31564707650683904, "flos": 21579763783680.0, "grad_norm": 1.8742878349950913, "language_loss": 0.84973407, "learning_rate": 3.0950280878633844e-06, "loss": 0.87179309, "num_input_tokens_seen": 112727820, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.859375, "step": 5250, "time_per_iteration": 2.5630531311035156 }, { "auxiliary_loss_clip": 0.01149816, "auxiliary_loss_mlp": 0.01043078, "balance_loss_clip": 1.02653134, "balance_loss_mlp": 1.04730892, "epoch": 0.315707199759507, "flos": 21030976817280.0, "grad_norm": 2.2979116884105166, "language_loss": 0.68274868, "learning_rate": 3.094711907926999e-06, "loss": 0.70467764, "num_input_tokens_seen": 112743140, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8515625, "step": 5251, "time_per_iteration": 2.577160120010376 }, { "auxiliary_loss_clip": 0.01132715, "auxiliary_loss_mlp": 0.01039335, "balance_loss_clip": 1.02321815, "balance_loss_mlp": 1.04861355, "epoch": 0.31576732301217497, "flos": 26828861788800.0, "grad_norm": 1.7894342516529578, "language_loss": 0.78780723, "learning_rate": 3.0943956889231613e-06, "loss": 0.80952775, "num_input_tokens_seen": 112764705, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84375, "step": 5252, "time_per_iteration": 2.574918746948242 }, { "auxiliary_loss_clip": 0.01141512, "auxiliary_loss_mlp": 0.01040601, "balance_loss_clip": 1.02504396, "balance_loss_mlp": 1.04993105, "epoch": 0.31582744626484294, "flos": 22711165349760.0, "grad_norm": 1.6755782683895262, "language_loss": 0.74433243, "learning_rate": 3.0940794308631574e-06, "loss": 0.76615357, "num_input_tokens_seen": 112785310, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.82421875, "step": 5253, "time_per_iteration": 2.6149659156799316 }, { "auxiliary_loss_clip": 0.01150024, "auxiliary_loss_mlp": 0.01042756, "balance_loss_clip": 1.02606618, "balance_loss_mlp": 1.04708958, "epoch": 0.3158875695175109, "flos": 23951914894080.0, "grad_norm": 1.9065108876422567, "language_loss": 0.73645651, "learning_rate": 3.0937631337582723e-06, "loss": 0.75838435, "num_input_tokens_seen": 112802905, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.84765625, "step": 5254, "time_per_iteration": 3.9463582038879395 }, { "auxiliary_loss_clip": 0.01142392, "auxiliary_loss_mlp": 0.0103894, "balance_loss_clip": 1.02243507, "balance_loss_mlp": 1.048159, "epoch": 0.31594769277017887, "flos": 13261883322240.0, "grad_norm": 1.7327293984732428, "language_loss": 0.77454787, "learning_rate": 3.093446797619795e-06, "loss": 0.79636121, "num_input_tokens_seen": 112820305, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8515625, "step": 5255, "time_per_iteration": 4.052876234054565 }, { "auxiliary_loss_clip": 0.01149497, "auxiliary_loss_mlp": 0.01039817, "balance_loss_clip": 1.0219475, "balance_loss_mlp": 1.04818058, "epoch": 0.31600781602284683, "flos": 23368258800000.0, "grad_norm": 1.8564062643148747, "language_loss": 0.78258145, "learning_rate": 3.093130422459013e-06, "loss": 0.80447459, "num_input_tokens_seen": 112841185, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.83984375, "step": 5256, "time_per_iteration": 2.651020050048828 }, { "auxiliary_loss_clip": 0.01156368, "auxiliary_loss_mlp": 0.01039449, "balance_loss_clip": 1.02407098, "balance_loss_mlp": 1.04986417, "epoch": 0.3160679392755148, "flos": 19828580019840.0, "grad_norm": 1.6051442505494884, "language_loss": 0.71147454, "learning_rate": 3.0928140082872194e-06, "loss": 0.73343265, "num_input_tokens_seen": 112860570, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.796875, "step": 5257, "time_per_iteration": 2.536229133605957 }, { "auxiliary_loss_clip": 0.01141919, "auxiliary_loss_mlp": 0.01041723, "balance_loss_clip": 1.02505708, "balance_loss_mlp": 1.04802608, "epoch": 0.31612806252818276, "flos": 20193216935040.0, "grad_norm": 2.2209884590866786, "language_loss": 0.7653656, "learning_rate": 3.092497555115704e-06, "loss": 0.78720194, "num_input_tokens_seen": 112877975, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8515625, "step": 5258, "time_per_iteration": 4.077234506607056 }, { "auxiliary_loss_clip": 0.01149751, "auxiliary_loss_mlp": 0.01043872, "balance_loss_clip": 1.02732563, "balance_loss_mlp": 1.04741192, "epoch": 0.31618818578085073, "flos": 24235967646720.0, "grad_norm": 3.832161169390392, "language_loss": 0.7256735, "learning_rate": 3.0921810629557614e-06, "loss": 0.74760973, "num_input_tokens_seen": 112896170, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.84375, "step": 5259, "time_per_iteration": 2.6493868827819824 }, { "auxiliary_loss_clip": 0.0115537, "auxiliary_loss_mlp": 0.01293767, "balance_loss_clip": 1.03079009, "balance_loss_mlp": 1.05023432, "epoch": 0.3162483090335187, "flos": 25081844002560.0, "grad_norm": 2.1951398247505187, "language_loss": 0.66446972, "learning_rate": 3.0918645318186863e-06, "loss": 0.68896103, "num_input_tokens_seen": 112916180, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8671875, "step": 5260, "time_per_iteration": 2.6785128116607666 }, { "auxiliary_loss_clip": 0.01138466, "auxiliary_loss_mlp": 0.01036204, "balance_loss_clip": 1.02045679, "balance_loss_mlp": 1.04547083, "epoch": 0.31630843228618666, "flos": 26323383646080.0, "grad_norm": 1.9453714533263602, "language_loss": 0.71674764, "learning_rate": 3.091547961715775e-06, "loss": 0.73849434, "num_input_tokens_seen": 112936745, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84375, "step": 5261, "time_per_iteration": 2.7030575275421143 }, { "auxiliary_loss_clip": 0.01061666, "auxiliary_loss_mlp": 0.01008079, "balance_loss_clip": 1.0058732, "balance_loss_mlp": 1.02530408, "epoch": 0.3163685555388547, "flos": 66758441552640.0, "grad_norm": 0.7576869970181056, "language_loss": 0.50615108, "learning_rate": 3.0912313526583237e-06, "loss": 0.52684855, "num_input_tokens_seen": 112994845, "router_z_loss_clip": 0.02209473, "router_z_loss_mlp": 0.2734375, "step": 5262, "time_per_iteration": 3.213083505630493 }, { "auxiliary_loss_clip": 0.01145192, "auxiliary_loss_mlp": 0.0103474, "balance_loss_clip": 1.01917148, "balance_loss_mlp": 1.05010509, "epoch": 0.31642867879152264, "flos": 25995662933760.0, "grad_norm": 1.5265705771294613, "language_loss": 0.85091174, "learning_rate": 3.0909147046576333e-06, "loss": 0.87271106, "num_input_tokens_seen": 113015125, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.859375, "step": 5263, "time_per_iteration": 2.677220344543457 }, { "auxiliary_loss_clip": 0.01146184, "auxiliary_loss_mlp": 0.01042312, "balance_loss_clip": 1.02713668, "balance_loss_mlp": 1.04783249, "epoch": 0.3164888020441906, "flos": 25774955815680.0, "grad_norm": 1.5020701752742514, "language_loss": 0.81664449, "learning_rate": 3.0905980177250026e-06, "loss": 0.83852947, "num_input_tokens_seen": 113035535, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8046875, "step": 5264, "time_per_iteration": 2.6241796016693115 }, { "auxiliary_loss_clip": 0.01154501, "auxiliary_loss_mlp": 0.010366, "balance_loss_clip": 1.02014947, "balance_loss_mlp": 1.05066705, "epoch": 0.3165489252968586, "flos": 19756220071680.0, "grad_norm": 1.805107763616358, "language_loss": 0.79356474, "learning_rate": 3.090281291871734e-06, "loss": 0.81547582, "num_input_tokens_seen": 113052720, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.859375, "step": 5265, "time_per_iteration": 2.603332996368408 }, { "auxiliary_loss_clip": 0.01145815, "auxiliary_loss_mlp": 0.01043872, "balance_loss_clip": 1.02714694, "balance_loss_mlp": 1.0510962, "epoch": 0.31660904854952654, "flos": 23183929180800.0, "grad_norm": 1.748951201656327, "language_loss": 0.75283194, "learning_rate": 3.089964527109131e-06, "loss": 0.77472883, "num_input_tokens_seen": 113071435, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.859375, "step": 5266, "time_per_iteration": 2.5694003105163574 }, { "auxiliary_loss_clip": 0.01160863, "auxiliary_loss_mlp": 0.01037536, "balance_loss_clip": 1.02153802, "balance_loss_mlp": 1.04891205, "epoch": 0.3166691718021945, "flos": 20408501099520.0, "grad_norm": 2.0571659634263053, "language_loss": 0.79632109, "learning_rate": 3.0896477234484976e-06, "loss": 0.81830513, "num_input_tokens_seen": 113088645, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8515625, "step": 5267, "time_per_iteration": 2.585794448852539 }, { "auxiliary_loss_clip": 0.01070621, "auxiliary_loss_mlp": 0.01002298, "balance_loss_clip": 0.99991435, "balance_loss_mlp": 1.0253396, "epoch": 0.31672929505486247, "flos": 70144781172480.0, "grad_norm": 0.7765878309108751, "language_loss": 0.57914793, "learning_rate": 3.08933088090114e-06, "loss": 0.59987712, "num_input_tokens_seen": 113152775, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.27734375, "step": 5268, "time_per_iteration": 3.139152765274048 }, { "auxiliary_loss_clip": 0.01133263, "auxiliary_loss_mlp": 0.01035632, "balance_loss_clip": 1.01919317, "balance_loss_mlp": 1.04794335, "epoch": 0.31678941830753043, "flos": 14355758154240.0, "grad_norm": 1.9445789058417324, "language_loss": 0.72820371, "learning_rate": 3.0890139994783653e-06, "loss": 0.74989265, "num_input_tokens_seen": 113171410, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8515625, "step": 5269, "time_per_iteration": 2.4942502975463867 }, { "auxiliary_loss_clip": 0.01158725, "auxiliary_loss_mlp": 0.01040803, "balance_loss_clip": 1.02474523, "balance_loss_mlp": 1.04889727, "epoch": 0.3168495415601984, "flos": 22747722416640.0, "grad_norm": 1.905866510466324, "language_loss": 0.79721928, "learning_rate": 3.0886970791914822e-06, "loss": 0.81921458, "num_input_tokens_seen": 113189965, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83203125, "step": 5270, "time_per_iteration": 2.552340269088745 }, { "auxiliary_loss_clip": 0.01138205, "auxiliary_loss_mlp": 0.01043572, "balance_loss_clip": 1.02647686, "balance_loss_mlp": 1.05053675, "epoch": 0.31690966481286637, "flos": 20115254465280.0, "grad_norm": 2.1370035784631765, "language_loss": 0.79688954, "learning_rate": 3.088380120051801e-06, "loss": 0.81870735, "num_input_tokens_seen": 113206355, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.875, "step": 5271, "time_per_iteration": 2.46356201171875 }, { "auxiliary_loss_clip": 0.01135182, "auxiliary_loss_mlp": 0.01037854, "balance_loss_clip": 1.02103329, "balance_loss_mlp": 1.0500443, "epoch": 0.31696978806553433, "flos": 21178928937600.0, "grad_norm": 2.020263026976218, "language_loss": 0.72925854, "learning_rate": 3.088063122070633e-06, "loss": 0.75098884, "num_input_tokens_seen": 113225440, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8515625, "step": 5272, "time_per_iteration": 2.523733615875244 }, { "auxiliary_loss_clip": 0.01153578, "auxiliary_loss_mlp": 0.01047592, "balance_loss_clip": 1.0287571, "balance_loss_mlp": 1.04787278, "epoch": 0.3170299113182023, "flos": 42997030439040.0, "grad_norm": 2.1950634271991283, "language_loss": 0.69035232, "learning_rate": 3.0877460852592902e-06, "loss": 0.71236408, "num_input_tokens_seen": 113248840, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.87890625, "step": 5273, "time_per_iteration": 2.687998056411743 }, { "auxiliary_loss_clip": 0.0114087, "auxiliary_loss_mlp": 0.01285551, "balance_loss_clip": 1.02099991, "balance_loss_mlp": 1.04801035, "epoch": 0.31709003457087026, "flos": 24460158384000.0, "grad_norm": 1.7039348089306332, "language_loss": 0.67999876, "learning_rate": 3.0874290096290888e-06, "loss": 0.70426297, "num_input_tokens_seen": 113269630, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.83984375, "step": 5274, "time_per_iteration": 2.570657968521118 }, { "auxiliary_loss_clip": 0.01155924, "auxiliary_loss_mlp": 0.01042393, "balance_loss_clip": 1.02686, "balance_loss_mlp": 1.04864395, "epoch": 0.3171501578235382, "flos": 24135310759680.0, "grad_norm": 1.5974793017506856, "language_loss": 0.80554128, "learning_rate": 3.0871118951913423e-06, "loss": 0.82752442, "num_input_tokens_seen": 113291200, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.80859375, "step": 5275, "time_per_iteration": 2.5805981159210205 }, { "auxiliary_loss_clip": 0.01141781, "auxiliary_loss_mlp": 0.01040187, "balance_loss_clip": 1.02360559, "balance_loss_mlp": 1.04705012, "epoch": 0.31721028107620625, "flos": 18879712392960.0, "grad_norm": 2.0463200636506027, "language_loss": 0.72651839, "learning_rate": 3.0867947419573693e-06, "loss": 0.74833816, "num_input_tokens_seen": 113310170, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.859375, "step": 5276, "time_per_iteration": 2.623875617980957 }, { "auxiliary_loss_clip": 0.01146652, "auxiliary_loss_mlp": 0.01036238, "balance_loss_clip": 1.0207293, "balance_loss_mlp": 1.04875422, "epoch": 0.3172704043288742, "flos": 23147874904320.0, "grad_norm": 1.445576339324312, "language_loss": 0.77874601, "learning_rate": 3.0864775499384873e-06, "loss": 0.8005749, "num_input_tokens_seen": 113331140, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.80078125, "step": 5277, "time_per_iteration": 2.6302740573883057 }, { "auxiliary_loss_clip": 0.01152149, "auxiliary_loss_mlp": 0.01037469, "balance_loss_clip": 1.02006412, "balance_loss_mlp": 1.04835403, "epoch": 0.3173305275815422, "flos": 17858520731520.0, "grad_norm": 1.7331717220181064, "language_loss": 0.79008466, "learning_rate": 3.086160319146016e-06, "loss": 0.81198084, "num_input_tokens_seen": 113350030, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.859375, "step": 5278, "time_per_iteration": 2.6438968181610107 }, { "auxiliary_loss_clip": 0.01062691, "auxiliary_loss_mlp": 0.01016817, "balance_loss_clip": 1.01445663, "balance_loss_mlp": 1.0267303, "epoch": 0.31739065083421014, "flos": 59973476883840.0, "grad_norm": 0.8755852683496365, "language_loss": 0.62796259, "learning_rate": 3.0858430495912772e-06, "loss": 0.64875764, "num_input_tokens_seen": 113395820, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.27539062, "step": 5279, "time_per_iteration": 2.900681495666504 }, { "auxiliary_loss_clip": 0.01137418, "auxiliary_loss_mlp": 0.01045124, "balance_loss_clip": 1.0280652, "balance_loss_mlp": 1.04860711, "epoch": 0.3174507740868781, "flos": 23800981944960.0, "grad_norm": 1.5722567639763971, "language_loss": 0.81013334, "learning_rate": 3.0855257412855933e-06, "loss": 0.83195877, "num_input_tokens_seen": 113416835, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.890625, "step": 5280, "time_per_iteration": 2.6253042221069336 }, { "auxiliary_loss_clip": 0.01161861, "auxiliary_loss_mlp": 0.01050073, "balance_loss_clip": 1.03464746, "balance_loss_mlp": 1.05090642, "epoch": 0.31751089733954607, "flos": 27638899349760.0, "grad_norm": 1.4896615756598066, "language_loss": 0.78354502, "learning_rate": 3.0852083942402874e-06, "loss": 0.80566436, "num_input_tokens_seen": 113440850, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84375, "step": 5281, "time_per_iteration": 2.722503662109375 }, { "auxiliary_loss_clip": 0.01152859, "auxiliary_loss_mlp": 0.01043455, "balance_loss_clip": 1.02632463, "balance_loss_mlp": 1.05047584, "epoch": 0.31757102059221404, "flos": 23769273214080.0, "grad_norm": 1.5011168432658504, "language_loss": 0.78404796, "learning_rate": 3.084891008466686e-06, "loss": 0.80601108, "num_input_tokens_seen": 113461000, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.84765625, "step": 5282, "time_per_iteration": 2.6261563301086426 }, { "auxiliary_loss_clip": 0.01155369, "auxiliary_loss_mlp": 0.01041418, "balance_loss_clip": 1.02538395, "balance_loss_mlp": 1.05044043, "epoch": 0.317631143844882, "flos": 25264521596160.0, "grad_norm": 2.260724871003695, "language_loss": 0.67003518, "learning_rate": 3.0845735839761145e-06, "loss": 0.69200301, "num_input_tokens_seen": 113480820, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8671875, "step": 5283, "time_per_iteration": 2.6344950199127197 }, { "auxiliary_loss_clip": 0.01061124, "auxiliary_loss_mlp": 0.01006669, "balance_loss_clip": 1.00440454, "balance_loss_mlp": 1.02481604, "epoch": 0.31769126709754997, "flos": 55825939221120.0, "grad_norm": 0.7371526955172467, "language_loss": 0.52746451, "learning_rate": 3.084256120779902e-06, "loss": 0.54814243, "num_input_tokens_seen": 113536910, "router_z_loss_clip": 0.02270508, "router_z_loss_mlp": 0.2734375, "step": 5284, "time_per_iteration": 3.175089120864868 }, { "auxiliary_loss_clip": 0.01159702, "auxiliary_loss_mlp": 0.01042822, "balance_loss_clip": 1.02700269, "balance_loss_mlp": 1.05405402, "epoch": 0.31775139035021793, "flos": 16690562098560.0, "grad_norm": 1.6835359114210178, "language_loss": 0.7055918, "learning_rate": 3.0839386188893777e-06, "loss": 0.72761703, "num_input_tokens_seen": 113555480, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.875, "step": 5285, "time_per_iteration": 2.5930049419403076 }, { "auxiliary_loss_clip": 0.01051661, "auxiliary_loss_mlp": 0.01008633, "balance_loss_clip": 1.00647533, "balance_loss_mlp": 1.02411544, "epoch": 0.3178115136028859, "flos": 64227241019520.0, "grad_norm": 0.8165856977961028, "language_loss": 0.60515022, "learning_rate": 3.083621078315872e-06, "loss": 0.62575316, "num_input_tokens_seen": 113616790, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.27734375, "step": 5286, "time_per_iteration": 3.174893617630005 }, { "auxiliary_loss_clip": 0.0113652, "auxiliary_loss_mlp": 0.01051762, "balance_loss_clip": 1.03467965, "balance_loss_mlp": 1.04862952, "epoch": 0.31787163685555386, "flos": 18697465762560.0, "grad_norm": 2.1516764007428995, "language_loss": 0.72438955, "learning_rate": 3.083303499070718e-06, "loss": 0.74627233, "num_input_tokens_seen": 113635320, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.87890625, "step": 5287, "time_per_iteration": 2.6535322666168213 }, { "auxiliary_loss_clip": 0.01143868, "auxiliary_loss_mlp": 0.01053041, "balance_loss_clip": 1.03544581, "balance_loss_mlp": 1.0472405, "epoch": 0.31793176010822183, "flos": 21324762155520.0, "grad_norm": 1.7961476901502669, "language_loss": 0.75632834, "learning_rate": 3.082985881165248e-06, "loss": 0.77829742, "num_input_tokens_seen": 113654000, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.87890625, "step": 5288, "time_per_iteration": 2.5714833736419678 }, { "auxiliary_loss_clip": 0.01136611, "auxiliary_loss_mlp": 0.01037708, "balance_loss_clip": 1.02281284, "balance_loss_mlp": 1.04469013, "epoch": 0.31799188336088985, "flos": 20958688696320.0, "grad_norm": 1.8774537505160656, "language_loss": 0.87699652, "learning_rate": 3.082668224610798e-06, "loss": 0.8987397, "num_input_tokens_seen": 113672375, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.828125, "step": 5289, "time_per_iteration": 2.767239570617676 }, { "auxiliary_loss_clip": 0.01139564, "auxiliary_loss_mlp": 0.01043787, "balance_loss_clip": 1.02854013, "balance_loss_mlp": 1.04879773, "epoch": 0.3180520066135578, "flos": 22491930689280.0, "grad_norm": 1.4948907865532524, "language_loss": 0.67792195, "learning_rate": 3.0823505294187044e-06, "loss": 0.69975549, "num_input_tokens_seen": 113692385, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.81640625, "step": 5290, "time_per_iteration": 2.537480354309082 }, { "auxiliary_loss_clip": 0.01151603, "auxiliary_loss_mlp": 0.01043629, "balance_loss_clip": 1.02726161, "balance_loss_mlp": 1.04571342, "epoch": 0.3181121298662258, "flos": 27235335070080.0, "grad_norm": 1.768778562265533, "language_loss": 0.80200195, "learning_rate": 3.0820327956003045e-06, "loss": 0.82395428, "num_input_tokens_seen": 113712145, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87890625, "step": 5291, "time_per_iteration": 3.985386610031128 }, { "auxiliary_loss_clip": 0.01171323, "auxiliary_loss_mlp": 0.01037984, "balance_loss_clip": 1.02055526, "balance_loss_mlp": 1.04905796, "epoch": 0.31817225311889374, "flos": 23180158252800.0, "grad_norm": 2.1481303773701566, "language_loss": 0.79590768, "learning_rate": 3.0817150231669367e-06, "loss": 0.81800073, "num_input_tokens_seen": 113731435, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8671875, "step": 5292, "time_per_iteration": 2.640732526779175 }, { "auxiliary_loss_clip": 0.01137092, "auxiliary_loss_mlp": 0.01041074, "balance_loss_clip": 1.02688777, "balance_loss_mlp": 1.04779863, "epoch": 0.3182323763715617, "flos": 23258803080960.0, "grad_norm": 1.6167562879948338, "language_loss": 0.75163817, "learning_rate": 3.081397212129943e-06, "loss": 0.7734198, "num_input_tokens_seen": 113750825, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.80078125, "step": 5293, "time_per_iteration": 2.519341230392456 }, { "auxiliary_loss_clip": 0.01139389, "auxiliary_loss_mlp": 0.01046014, "balance_loss_clip": 1.03081489, "balance_loss_mlp": 1.0485723, "epoch": 0.3182924996242297, "flos": 29016683280000.0, "grad_norm": 5.950010419905098, "language_loss": 0.73685312, "learning_rate": 3.0810793625006637e-06, "loss": 0.75870717, "num_input_tokens_seen": 113770010, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 5294, "time_per_iteration": 2.5970733165740967 }, { "auxiliary_loss_clip": 0.01135032, "auxiliary_loss_mlp": 0.01041184, "balance_loss_clip": 1.02388644, "balance_loss_mlp": 1.04819417, "epoch": 0.31835262287689764, "flos": 20449188230400.0, "grad_norm": 1.8893802315523303, "language_loss": 0.76036835, "learning_rate": 3.080761474290443e-06, "loss": 0.78213048, "num_input_tokens_seen": 113788640, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8671875, "step": 5295, "time_per_iteration": 2.515139579772949 }, { "auxiliary_loss_clip": 0.01164011, "auxiliary_loss_mlp": 0.0104128, "balance_loss_clip": 1.02493668, "balance_loss_mlp": 1.04939222, "epoch": 0.3184127461295656, "flos": 25119478477440.0, "grad_norm": 1.5815574159764278, "language_loss": 0.69143116, "learning_rate": 3.0804435475106265e-06, "loss": 0.71348405, "num_input_tokens_seen": 113809515, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.875, "step": 5296, "time_per_iteration": 4.009310007095337 }, { "auxiliary_loss_clip": 0.01150413, "auxiliary_loss_mlp": 0.01040471, "balance_loss_clip": 1.02461624, "balance_loss_mlp": 1.04734159, "epoch": 0.31847286938223357, "flos": 25551231955200.0, "grad_norm": 2.2264453586149466, "language_loss": 0.77600992, "learning_rate": 3.0801255821725578e-06, "loss": 0.7979188, "num_input_tokens_seen": 113829770, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 5297, "time_per_iteration": 4.158074140548706 }, { "auxiliary_loss_clip": 0.01157075, "auxiliary_loss_mlp": 0.01037104, "balance_loss_clip": 1.02067709, "balance_loss_mlp": 1.04753065, "epoch": 0.31853299263490154, "flos": 27782470010880.0, "grad_norm": 1.7987676771945187, "language_loss": 0.79296625, "learning_rate": 3.0798075782875854e-06, "loss": 0.81490809, "num_input_tokens_seen": 113849320, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.82421875, "step": 5298, "time_per_iteration": 2.572540760040283 }, { "auxiliary_loss_clip": 0.01153708, "auxiliary_loss_mlp": 0.0104099, "balance_loss_clip": 1.02544558, "balance_loss_mlp": 1.04810572, "epoch": 0.3185931158875695, "flos": 22706747976960.0, "grad_norm": 1.8469841088785584, "language_loss": 0.74134707, "learning_rate": 3.0794895358670587e-06, "loss": 0.76329404, "num_input_tokens_seen": 113867860, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.87890625, "step": 5299, "time_per_iteration": 4.0564329624176025 }, { "auxiliary_loss_clip": 0.01152418, "auxiliary_loss_mlp": 0.01046275, "balance_loss_clip": 1.03015804, "balance_loss_mlp": 1.04665577, "epoch": 0.31865323914023747, "flos": 24571517523840.0, "grad_norm": 1.8595736146532926, "language_loss": 0.77778631, "learning_rate": 3.079171454922327e-06, "loss": 0.79977322, "num_input_tokens_seen": 113886375, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.87890625, "step": 5300, "time_per_iteration": 2.6086878776550293 }, { "auxiliary_loss_clip": 0.0114915, "auxiliary_loss_mlp": 0.01043036, "balance_loss_clip": 1.02628708, "balance_loss_mlp": 1.04600966, "epoch": 0.31871336239290543, "flos": 18186564666240.0, "grad_norm": 2.1345786600780556, "language_loss": 0.84113359, "learning_rate": 3.0788533354647425e-06, "loss": 0.86305547, "num_input_tokens_seen": 113904065, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8515625, "step": 5301, "time_per_iteration": 2.5027058124542236 }, { "auxiliary_loss_clip": 0.01145493, "auxiliary_loss_mlp": 0.01048983, "balance_loss_clip": 1.03243661, "balance_loss_mlp": 1.05363274, "epoch": 0.31877348564557345, "flos": 21826756679040.0, "grad_norm": 2.0216145659251357, "language_loss": 0.76828879, "learning_rate": 3.078535177505657e-06, "loss": 0.79023355, "num_input_tokens_seen": 113918415, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.828125, "step": 5302, "time_per_iteration": 2.5506534576416016 }, { "auxiliary_loss_clip": 0.01128725, "auxiliary_loss_mlp": 0.01042235, "balance_loss_clip": 1.02531934, "balance_loss_mlp": 1.04731393, "epoch": 0.3188336088982414, "flos": 22015252275840.0, "grad_norm": 1.6700870023372947, "language_loss": 0.79579765, "learning_rate": 3.0782169810564256e-06, "loss": 0.81750727, "num_input_tokens_seen": 113938135, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8125, "step": 5303, "time_per_iteration": 2.563645601272583 }, { "auxiliary_loss_clip": 0.01147864, "auxiliary_loss_mlp": 0.01042942, "balance_loss_clip": 1.02577555, "balance_loss_mlp": 1.04827607, "epoch": 0.3188937321509094, "flos": 20047886507520.0, "grad_norm": 1.8298571559609402, "language_loss": 0.72698295, "learning_rate": 3.0778987461284035e-06, "loss": 0.74889106, "num_input_tokens_seen": 113957125, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.90625, "step": 5304, "time_per_iteration": 2.6890618801116943 }, { "auxiliary_loss_clip": 0.01147756, "auxiliary_loss_mlp": 0.0104362, "balance_loss_clip": 1.02852201, "balance_loss_mlp": 1.04840362, "epoch": 0.31895385540357735, "flos": 25848105863040.0, "grad_norm": 1.9999881352069726, "language_loss": 0.71813619, "learning_rate": 3.077580472732948e-06, "loss": 0.74004996, "num_input_tokens_seen": 113974875, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 5305, "time_per_iteration": 2.6244657039642334 }, { "auxiliary_loss_clip": 0.01150664, "auxiliary_loss_mlp": 0.01035558, "balance_loss_clip": 1.02080035, "balance_loss_mlp": 1.04883003, "epoch": 0.3190139786562453, "flos": 23477714519040.0, "grad_norm": 2.386355978720458, "language_loss": 0.63854766, "learning_rate": 3.077262160881417e-06, "loss": 0.66040993, "num_input_tokens_seen": 113994450, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.83984375, "step": 5306, "time_per_iteration": 2.6679348945617676 }, { "auxiliary_loss_clip": 0.01153282, "auxiliary_loss_mlp": 0.01042898, "balance_loss_clip": 1.02561235, "balance_loss_mlp": 1.05018866, "epoch": 0.3190741019089133, "flos": 29095543589760.0, "grad_norm": 2.123483001785107, "language_loss": 0.78996307, "learning_rate": 3.07694381058517e-06, "loss": 0.81192487, "num_input_tokens_seen": 114013945, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8515625, "step": 5307, "time_per_iteration": 2.5817456245422363 }, { "auxiliary_loss_clip": 0.01143857, "auxiliary_loss_mlp": 0.01039321, "balance_loss_clip": 1.02270293, "balance_loss_mlp": 1.04500341, "epoch": 0.31913422516158124, "flos": 17129534209920.0, "grad_norm": 1.6045582952863344, "language_loss": 0.77515459, "learning_rate": 3.07662542185557e-06, "loss": 0.79698634, "num_input_tokens_seen": 114031375, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8125, "step": 5308, "time_per_iteration": 2.602266311645508 }, { "auxiliary_loss_clip": 0.01153729, "auxiliary_loss_mlp": 0.01047335, "balance_loss_clip": 1.02884555, "balance_loss_mlp": 1.0518043, "epoch": 0.3191943484142492, "flos": 16069846147200.0, "grad_norm": 2.5038735598152972, "language_loss": 0.73682588, "learning_rate": 3.0763069947039774e-06, "loss": 0.75883651, "num_input_tokens_seen": 114048465, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9296875, "step": 5309, "time_per_iteration": 2.483362913131714 }, { "auxiliary_loss_clip": 0.0114142, "auxiliary_loss_mlp": 0.01040266, "balance_loss_clip": 1.02513885, "balance_loss_mlp": 1.04803944, "epoch": 0.3192544716669172, "flos": 22966166977920.0, "grad_norm": 2.334610450199674, "language_loss": 0.82578176, "learning_rate": 3.0759885291417574e-06, "loss": 0.84759867, "num_input_tokens_seen": 114068415, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.84375, "step": 5310, "time_per_iteration": 2.671320915222168 }, { "auxiliary_loss_clip": 0.01147712, "auxiliary_loss_mlp": 0.01041199, "balance_loss_clip": 1.02617884, "balance_loss_mlp": 1.04592609, "epoch": 0.31931459491958514, "flos": 26870339018880.0, "grad_norm": 1.4809544179639922, "language_loss": 0.78254843, "learning_rate": 3.0756700251802745e-06, "loss": 0.80443764, "num_input_tokens_seen": 114088565, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83984375, "step": 5311, "time_per_iteration": 2.5736589431762695 }, { "auxiliary_loss_clip": 0.01158364, "auxiliary_loss_mlp": 0.01039009, "balance_loss_clip": 1.02243853, "balance_loss_mlp": 1.0471127, "epoch": 0.3193747181722531, "flos": 21836525178240.0, "grad_norm": 1.8700287274381273, "language_loss": 0.8413738, "learning_rate": 3.0753514828308942e-06, "loss": 0.86334753, "num_input_tokens_seen": 114107160, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.84375, "step": 5312, "time_per_iteration": 2.6073310375213623 }, { "auxiliary_loss_clip": 0.01163883, "auxiliary_loss_mlp": 0.01049985, "balance_loss_clip": 1.03247309, "balance_loss_mlp": 1.04713285, "epoch": 0.31943484142492107, "flos": 18324999682560.0, "grad_norm": 2.1612706487985713, "language_loss": 0.78737104, "learning_rate": 3.0750329021049863e-06, "loss": 0.80950969, "num_input_tokens_seen": 114123420, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.89453125, "step": 5313, "time_per_iteration": 2.5261974334716797 }, { "auxiliary_loss_clip": 0.01140976, "auxiliary_loss_mlp": 0.01039121, "balance_loss_clip": 1.02318239, "balance_loss_mlp": 1.04766715, "epoch": 0.31949496467758903, "flos": 21615818060160.0, "grad_norm": 2.3653572369579803, "language_loss": 0.85707772, "learning_rate": 3.074714283013919e-06, "loss": 0.87887871, "num_input_tokens_seen": 114139230, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84375, "step": 5314, "time_per_iteration": 2.537051200866699 }, { "auxiliary_loss_clip": 0.01150203, "auxiliary_loss_mlp": 0.01041562, "balance_loss_clip": 1.02534914, "balance_loss_mlp": 1.04701638, "epoch": 0.31955508793025705, "flos": 21760214734080.0, "grad_norm": 2.474689905464093, "language_loss": 0.79987723, "learning_rate": 3.074395625569064e-06, "loss": 0.82179487, "num_input_tokens_seen": 114159290, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8515625, "step": 5315, "time_per_iteration": 2.5230305194854736 }, { "auxiliary_loss_clip": 0.01161071, "auxiliary_loss_mlp": 0.01056031, "balance_loss_clip": 1.03850698, "balance_loss_mlp": 1.04808664, "epoch": 0.319615211182925, "flos": 17164331510400.0, "grad_norm": 1.7753284052035103, "language_loss": 0.67325234, "learning_rate": 3.074076929781793e-06, "loss": 0.69542342, "num_input_tokens_seen": 114177655, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.859375, "step": 5316, "time_per_iteration": 2.5802857875823975 }, { "auxiliary_loss_clip": 0.01141534, "auxiliary_loss_mlp": 0.01039491, "balance_loss_clip": 1.02500701, "balance_loss_mlp": 1.04838002, "epoch": 0.319675334435593, "flos": 28112812416000.0, "grad_norm": 1.4281843340614773, "language_loss": 0.69036442, "learning_rate": 3.073758195663479e-06, "loss": 0.71217471, "num_input_tokens_seen": 114200880, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.84375, "step": 5317, "time_per_iteration": 2.673912286758423 }, { "auxiliary_loss_clip": 0.01069247, "auxiliary_loss_mlp": 0.01025964, "balance_loss_clip": 1.02378273, "balance_loss_mlp": 1.02402663, "epoch": 0.31973545768826095, "flos": 69501119408640.0, "grad_norm": 0.7421043174022933, "language_loss": 0.53095734, "learning_rate": 3.0734394232254967e-06, "loss": 0.55190945, "num_input_tokens_seen": 114267145, "router_z_loss_clip": 0.02185059, "router_z_loss_mlp": 0.27734375, "step": 5318, "time_per_iteration": 3.3014307022094727 }, { "auxiliary_loss_clip": 0.01129373, "auxiliary_loss_mlp": 0.0104493, "balance_loss_clip": 1.02968311, "balance_loss_mlp": 1.04750824, "epoch": 0.3197955809409289, "flos": 13699203408000.0, "grad_norm": 1.7933915196116017, "language_loss": 0.84111428, "learning_rate": 3.0731206124792225e-06, "loss": 0.86285722, "num_input_tokens_seen": 114284630, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 5319, "time_per_iteration": 2.521238088607788 }, { "auxiliary_loss_clip": 0.01138652, "auxiliary_loss_mlp": 0.01046779, "balance_loss_clip": 1.03075671, "balance_loss_mlp": 1.04624045, "epoch": 0.3198557041935969, "flos": 33218124278400.0, "grad_norm": 1.4994301563404977, "language_loss": 0.62892675, "learning_rate": 3.0728017634360345e-06, "loss": 0.65078104, "num_input_tokens_seen": 114305830, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83203125, "step": 5320, "time_per_iteration": 2.7350940704345703 }, { "auxiliary_loss_clip": 0.01155582, "auxiliary_loss_mlp": 0.0103999, "balance_loss_clip": 1.02487445, "balance_loss_mlp": 1.05058742, "epoch": 0.31991582744626484, "flos": 23732033788800.0, "grad_norm": 4.4362265382848705, "language_loss": 0.70408058, "learning_rate": 3.072482876107311e-06, "loss": 0.72603631, "num_input_tokens_seen": 114325165, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8671875, "step": 5321, "time_per_iteration": 2.593247652053833 }, { "auxiliary_loss_clip": 0.01141043, "auxiliary_loss_mlp": 0.01313257, "balance_loss_clip": 1.04880881, "balance_loss_mlp": 1.0501914, "epoch": 0.3199759506989328, "flos": 18550842445440.0, "grad_norm": 3.3104333406883657, "language_loss": 0.86321217, "learning_rate": 3.072163950504432e-06, "loss": 0.88775516, "num_input_tokens_seen": 114341310, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90625, "step": 5322, "time_per_iteration": 2.533688545227051 }, { "auxiliary_loss_clip": 0.01149681, "auxiliary_loss_mlp": 0.01043344, "balance_loss_clip": 1.02773952, "balance_loss_mlp": 1.04703808, "epoch": 0.3200360739516008, "flos": 22418888382720.0, "grad_norm": 1.6390097819413114, "language_loss": 0.83102471, "learning_rate": 3.0718449866387805e-06, "loss": 0.85295492, "num_input_tokens_seen": 114360355, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.84375, "step": 5323, "time_per_iteration": 2.6069140434265137 }, { "auxiliary_loss_clip": 0.01137955, "auxiliary_loss_mlp": 0.01039371, "balance_loss_clip": 1.02308691, "balance_loss_mlp": 1.04655385, "epoch": 0.32009619720426874, "flos": 20595236929920.0, "grad_norm": 1.7938170245440739, "language_loss": 0.77898765, "learning_rate": 3.071525984521738e-06, "loss": 0.80076087, "num_input_tokens_seen": 114379220, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8203125, "step": 5324, "time_per_iteration": 2.5374739170074463 }, { "auxiliary_loss_clip": 0.0114904, "auxiliary_loss_mlp": 0.01039447, "balance_loss_clip": 1.02335417, "balance_loss_mlp": 1.04676294, "epoch": 0.3201563204569367, "flos": 18147637301760.0, "grad_norm": 2.015895966578965, "language_loss": 0.80229008, "learning_rate": 3.0712069441646896e-06, "loss": 0.82417488, "num_input_tokens_seen": 114396365, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84375, "step": 5325, "time_per_iteration": 2.5622293949127197 }, { "auxiliary_loss_clip": 0.01142964, "auxiliary_loss_mlp": 0.01038506, "balance_loss_clip": 1.02317619, "balance_loss_mlp": 1.04956961, "epoch": 0.32021644370960467, "flos": 31684235840640.0, "grad_norm": 1.6497556154479676, "language_loss": 0.74930823, "learning_rate": 3.0708878655790207e-06, "loss": 0.77112293, "num_input_tokens_seen": 114416780, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.84765625, "step": 5326, "time_per_iteration": 2.6041836738586426 }, { "auxiliary_loss_clip": 0.01158645, "auxiliary_loss_mlp": 0.01040446, "balance_loss_clip": 1.02467465, "balance_loss_mlp": 1.04969525, "epoch": 0.32027656696227264, "flos": 26865921646080.0, "grad_norm": 1.4244637690048938, "language_loss": 0.81157088, "learning_rate": 3.070568748776118e-06, "loss": 0.83356178, "num_input_tokens_seen": 114437405, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8203125, "step": 5327, "time_per_iteration": 2.643848180770874 }, { "auxiliary_loss_clip": 0.01162429, "auxiliary_loss_mlp": 0.01042709, "balance_loss_clip": 1.02619863, "balance_loss_mlp": 1.04924083, "epoch": 0.32033669021494066, "flos": 24169928492160.0, "grad_norm": 1.4733868079007948, "language_loss": 0.77504468, "learning_rate": 3.0702495937673713e-06, "loss": 0.79709613, "num_input_tokens_seen": 114458505, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.859375, "step": 5328, "time_per_iteration": 2.5446276664733887 }, { "auxiliary_loss_clip": 0.0116691, "auxiliary_loss_mlp": 0.01045001, "balance_loss_clip": 1.02785921, "balance_loss_mlp": 1.04624367, "epoch": 0.3203968134676086, "flos": 24460768915200.0, "grad_norm": 1.9514655193329467, "language_loss": 0.7403034, "learning_rate": 3.0699304005641686e-06, "loss": 0.7624225, "num_input_tokens_seen": 114479050, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8515625, "step": 5329, "time_per_iteration": 2.6722536087036133 }, { "auxiliary_loss_clip": 0.01145928, "auxiliary_loss_mlp": 0.01038703, "balance_loss_clip": 1.02367091, "balance_loss_mlp": 1.04755867, "epoch": 0.3204569367202766, "flos": 18004713085440.0, "grad_norm": 1.4425013472690333, "language_loss": 0.70703971, "learning_rate": 3.069611169177903e-06, "loss": 0.72888601, "num_input_tokens_seen": 114497415, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 5330, "time_per_iteration": 2.4992945194244385 }, { "auxiliary_loss_clip": 0.01172895, "auxiliary_loss_mlp": 0.01047082, "balance_loss_clip": 1.02949834, "balance_loss_mlp": 1.04840374, "epoch": 0.32051705997294455, "flos": 30589678650240.0, "grad_norm": 1.8056763495621373, "language_loss": 0.79958445, "learning_rate": 3.069291899619966e-06, "loss": 0.82178426, "num_input_tokens_seen": 114518785, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.8828125, "step": 5331, "time_per_iteration": 2.6899006366729736 }, { "auxiliary_loss_clip": 0.01058638, "auxiliary_loss_mlp": 0.01038789, "balance_loss_clip": 1.03655934, "balance_loss_mlp": 1.02217126, "epoch": 0.3205771832256125, "flos": 68417979765120.0, "grad_norm": 0.8530017649332519, "language_loss": 0.57853544, "learning_rate": 3.0689725919017517e-06, "loss": 0.59950972, "num_input_tokens_seen": 114577710, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.2734375, "step": 5332, "time_per_iteration": 4.45250940322876 }, { "auxiliary_loss_clip": 0.01161654, "auxiliary_loss_mlp": 0.010369, "balance_loss_clip": 1.02083635, "balance_loss_mlp": 1.04823697, "epoch": 0.3206373064782805, "flos": 30443953173120.0, "grad_norm": 1.6323067743096473, "language_loss": 0.73367828, "learning_rate": 3.068653246034655e-06, "loss": 0.75566387, "num_input_tokens_seen": 114598640, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8671875, "step": 5333, "time_per_iteration": 2.6782946586608887 }, { "auxiliary_loss_clip": 0.01149901, "auxiliary_loss_mlp": 0.0104346, "balance_loss_clip": 1.02702117, "balance_loss_mlp": 1.04768884, "epoch": 0.32069742973094845, "flos": 22054502862720.0, "grad_norm": 2.451901590607878, "language_loss": 0.70403826, "learning_rate": 3.0683338620300728e-06, "loss": 0.72597182, "num_input_tokens_seen": 114618780, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84375, "step": 5334, "time_per_iteration": 2.636697769165039 }, { "auxiliary_loss_clip": 0.01165877, "auxiliary_loss_mlp": 0.01041616, "balance_loss_clip": 1.02558279, "balance_loss_mlp": 1.04572165, "epoch": 0.3207575529836164, "flos": 22054000072320.0, "grad_norm": 1.748157352726352, "language_loss": 0.77492392, "learning_rate": 3.068014439899404e-06, "loss": 0.7969988, "num_input_tokens_seen": 114637525, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84375, "step": 5335, "time_per_iteration": 2.5663511753082275 }, { "auxiliary_loss_clip": 0.01140209, "auxiliary_loss_mlp": 0.01040126, "balance_loss_clip": 1.02417564, "balance_loss_mlp": 1.04797888, "epoch": 0.3208176762362844, "flos": 34057536186240.0, "grad_norm": 1.4479431147571435, "language_loss": 0.68112552, "learning_rate": 3.0676949796540458e-06, "loss": 0.7029289, "num_input_tokens_seen": 114659705, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.83203125, "step": 5336, "time_per_iteration": 2.6983633041381836 }, { "auxiliary_loss_clip": 0.01157565, "auxiliary_loss_mlp": 0.01045729, "balance_loss_clip": 1.02985024, "balance_loss_mlp": 1.0466485, "epoch": 0.32087779948895234, "flos": 21798711135360.0, "grad_norm": 1.6798493694306946, "language_loss": 0.78933001, "learning_rate": 3.067375481305401e-06, "loss": 0.81136298, "num_input_tokens_seen": 114678340, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83984375, "step": 5337, "time_per_iteration": 4.009607553482056 }, { "auxiliary_loss_clip": 0.01132831, "auxiliary_loss_mlp": 0.01038947, "balance_loss_clip": 1.02367628, "balance_loss_mlp": 1.04415226, "epoch": 0.3209379227416203, "flos": 21434110133760.0, "grad_norm": 1.5356927613075375, "language_loss": 0.73877978, "learning_rate": 3.0670559448648707e-06, "loss": 0.76049757, "num_input_tokens_seen": 114696980, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.796875, "step": 5338, "time_per_iteration": 4.128960609436035 }, { "auxiliary_loss_clip": 0.01148171, "auxiliary_loss_mlp": 0.01040838, "balance_loss_clip": 1.02432716, "balance_loss_mlp": 1.04395485, "epoch": 0.3209980459942883, "flos": 25849075530240.0, "grad_norm": 1.8626651791320248, "language_loss": 0.62505686, "learning_rate": 3.0667363703438588e-06, "loss": 0.64694691, "num_input_tokens_seen": 114717330, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.859375, "step": 5339, "time_per_iteration": 2.613799810409546 }, { "auxiliary_loss_clip": 0.01128385, "auxiliary_loss_mlp": 0.01039952, "balance_loss_clip": 1.02395427, "balance_loss_mlp": 1.0442965, "epoch": 0.32105816924695624, "flos": 19099162535040.0, "grad_norm": 1.798788479113573, "language_loss": 0.81729198, "learning_rate": 3.0664167577537696e-06, "loss": 0.83897537, "num_input_tokens_seen": 114736320, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83984375, "step": 5340, "time_per_iteration": 2.4754037857055664 }, { "auxiliary_loss_clip": 0.01157879, "auxiliary_loss_mlp": 0.01042803, "balance_loss_clip": 1.02678144, "balance_loss_mlp": 1.04650223, "epoch": 0.3211182924996242, "flos": 16581860565120.0, "grad_norm": 1.8673113462694502, "language_loss": 0.76271921, "learning_rate": 3.0660971071060095e-06, "loss": 0.78472602, "num_input_tokens_seen": 114754575, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84375, "step": 5341, "time_per_iteration": 3.918208122253418 }, { "auxiliary_loss_clip": 0.01153275, "auxiliary_loss_mlp": 0.01040719, "balance_loss_clip": 1.02510238, "balance_loss_mlp": 1.04569101, "epoch": 0.3211784157522922, "flos": 22672202071680.0, "grad_norm": 1.563628733938274, "language_loss": 0.79362059, "learning_rate": 3.0657774184119854e-06, "loss": 0.81556052, "num_input_tokens_seen": 114773590, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.80859375, "step": 5342, "time_per_iteration": 2.5868959426879883 }, { "auxiliary_loss_clip": 0.01149273, "auxiliary_loss_mlp": 0.0103831, "balance_loss_clip": 1.02235961, "balance_loss_mlp": 1.04665971, "epoch": 0.3212385390049602, "flos": 20558787603840.0, "grad_norm": 2.3049503435401864, "language_loss": 0.74512619, "learning_rate": 3.065457691683108e-06, "loss": 0.76700199, "num_input_tokens_seen": 114790775, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84765625, "step": 5343, "time_per_iteration": 2.481687307357788 }, { "auxiliary_loss_clip": 0.01173048, "auxiliary_loss_mlp": 0.01037487, "balance_loss_clip": 1.02154863, "balance_loss_mlp": 1.0454917, "epoch": 0.32129866225762815, "flos": 24791147233920.0, "grad_norm": 1.5312067739471327, "language_loss": 0.82443941, "learning_rate": 3.0651379269307853e-06, "loss": 0.84654474, "num_input_tokens_seen": 114809835, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.83203125, "step": 5344, "time_per_iteration": 2.704226016998291 }, { "auxiliary_loss_clip": 0.01165433, "auxiliary_loss_mlp": 0.01039146, "balance_loss_clip": 1.02300513, "balance_loss_mlp": 1.0442307, "epoch": 0.3213587855102961, "flos": 18366871962240.0, "grad_norm": 2.7602051100637364, "language_loss": 0.79798806, "learning_rate": 3.06481812416643e-06, "loss": 0.82003391, "num_input_tokens_seen": 114826505, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8515625, "step": 5345, "time_per_iteration": 2.509549140930176 }, { "auxiliary_loss_clip": 0.01148074, "auxiliary_loss_mlp": 0.01041178, "balance_loss_clip": 1.02550197, "balance_loss_mlp": 1.04503441, "epoch": 0.3214189087629641, "flos": 27015992668800.0, "grad_norm": 1.7377850025776096, "language_loss": 0.82939839, "learning_rate": 3.0644982834014545e-06, "loss": 0.85129094, "num_input_tokens_seen": 114846140, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84765625, "step": 5346, "time_per_iteration": 2.636986017227173 }, { "auxiliary_loss_clip": 0.01156996, "auxiliary_loss_mlp": 0.01040755, "balance_loss_clip": 1.02404141, "balance_loss_mlp": 1.04327035, "epoch": 0.32147903201563205, "flos": 23148269953920.0, "grad_norm": 1.8375312949552935, "language_loss": 0.81954277, "learning_rate": 3.0641784046472745e-06, "loss": 0.84152031, "num_input_tokens_seen": 114866660, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8671875, "step": 5347, "time_per_iteration": 2.577300548553467 }, { "auxiliary_loss_clip": 0.01136456, "auxiliary_loss_mlp": 0.01045831, "balance_loss_clip": 1.02860534, "balance_loss_mlp": 1.04408062, "epoch": 0.3215391552683, "flos": 16580747243520.0, "grad_norm": 2.2201005423528812, "language_loss": 0.79605073, "learning_rate": 3.063858487915304e-06, "loss": 0.8178736, "num_input_tokens_seen": 114882820, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.83203125, "step": 5348, "time_per_iteration": 2.5480802059173584 }, { "auxiliary_loss_clip": 0.01147892, "auxiliary_loss_mlp": 0.01048752, "balance_loss_clip": 1.032933, "balance_loss_mlp": 1.04530287, "epoch": 0.321599278520968, "flos": 17821820010240.0, "grad_norm": 3.154019796029814, "language_loss": 0.85264766, "learning_rate": 3.0635385332169606e-06, "loss": 0.87461412, "num_input_tokens_seen": 114900745, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84375, "step": 5349, "time_per_iteration": 2.532581329345703 }, { "auxiliary_loss_clip": 0.01146959, "auxiliary_loss_mlp": 0.01040124, "balance_loss_clip": 1.02383983, "balance_loss_mlp": 1.04502106, "epoch": 0.32165940177363594, "flos": 16251769555200.0, "grad_norm": 2.4969229859662114, "language_loss": 0.80395484, "learning_rate": 3.063218540563663e-06, "loss": 0.82582569, "num_input_tokens_seen": 114917940, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8359375, "step": 5350, "time_per_iteration": 2.6309702396392822 }, { "auxiliary_loss_clip": 0.0113611, "auxiliary_loss_mlp": 0.01044107, "balance_loss_clip": 1.02868104, "balance_loss_mlp": 1.04415846, "epoch": 0.3217195250263039, "flos": 27599900158080.0, "grad_norm": 1.817920327770674, "language_loss": 0.79723471, "learning_rate": 3.06289850996683e-06, "loss": 0.81903684, "num_input_tokens_seen": 114937735, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 5351, "time_per_iteration": 2.605811357498169 }, { "auxiliary_loss_clip": 0.01151386, "auxiliary_loss_mlp": 0.01044129, "balance_loss_clip": 1.02746344, "balance_loss_mlp": 1.04257131, "epoch": 0.3217796482789719, "flos": 21470595373440.0, "grad_norm": 3.2303464869504817, "language_loss": 0.75697857, "learning_rate": 3.062578441437884e-06, "loss": 0.7789337, "num_input_tokens_seen": 114956630, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.82421875, "step": 5352, "time_per_iteration": 2.618650197982788 }, { "auxiliary_loss_clip": 0.01164242, "auxiliary_loss_mlp": 0.01039872, "balance_loss_clip": 1.02274132, "balance_loss_mlp": 1.04464364, "epoch": 0.32183977153163984, "flos": 21215593745280.0, "grad_norm": 1.9293750914581245, "language_loss": 0.81459045, "learning_rate": 3.062258334988246e-06, "loss": 0.83663154, "num_input_tokens_seen": 114976470, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8359375, "step": 5353, "time_per_iteration": 2.6715235710144043 }, { "auxiliary_loss_clip": 0.01179153, "auxiliary_loss_mlp": 0.01041089, "balance_loss_clip": 1.02518654, "balance_loss_mlp": 1.04489517, "epoch": 0.3218998947843078, "flos": 24608182331520.0, "grad_norm": 1.532214378840955, "language_loss": 0.72942626, "learning_rate": 3.0619381906293414e-06, "loss": 0.7516287, "num_input_tokens_seen": 114996710, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8125, "step": 5354, "time_per_iteration": 2.6074914932250977 }, { "auxiliary_loss_clip": 0.01154552, "auxiliary_loss_mlp": 0.010385, "balance_loss_clip": 1.02351546, "balance_loss_mlp": 1.04325664, "epoch": 0.3219600180369758, "flos": 22270577126400.0, "grad_norm": 1.6418188527135513, "language_loss": 0.835868, "learning_rate": 3.0616180083725943e-06, "loss": 0.85779846, "num_input_tokens_seen": 115015775, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.84765625, "step": 5355, "time_per_iteration": 2.7094595432281494 }, { "auxiliary_loss_clip": 0.01133937, "auxiliary_loss_mlp": 0.01044179, "balance_loss_clip": 1.02824116, "balance_loss_mlp": 1.0475378, "epoch": 0.3220201412896438, "flos": 14939126939520.0, "grad_norm": 1.979527988179526, "language_loss": 0.71484083, "learning_rate": 3.0612977882294306e-06, "loss": 0.73662198, "num_input_tokens_seen": 115034265, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.86328125, "step": 5356, "time_per_iteration": 2.488358974456787 }, { "auxiliary_loss_clip": 0.01172112, "auxiliary_loss_mlp": 0.01042381, "balance_loss_clip": 1.02521515, "balance_loss_mlp": 1.04561388, "epoch": 0.32208026454231176, "flos": 22667389649280.0, "grad_norm": 2.39312633715905, "language_loss": 0.6749264, "learning_rate": 3.0609775302112793e-06, "loss": 0.69707131, "num_input_tokens_seen": 115051945, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.90625, "step": 5357, "time_per_iteration": 2.6382949352264404 }, { "auxiliary_loss_clip": 0.01066886, "auxiliary_loss_mlp": 0.01004062, "balance_loss_clip": 1.00205922, "balance_loss_mlp": 1.02165663, "epoch": 0.3221403877949797, "flos": 64605130053120.0, "grad_norm": 0.7033178529124061, "language_loss": 0.58276606, "learning_rate": 3.060657234329569e-06, "loss": 0.60347557, "num_input_tokens_seen": 115119090, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.27148438, "step": 5358, "time_per_iteration": 3.247868299484253 }, { "auxiliary_loss_clip": 0.01173583, "auxiliary_loss_mlp": 0.01042004, "balance_loss_clip": 1.02555287, "balance_loss_mlp": 1.04381275, "epoch": 0.3222005110476477, "flos": 20157019004160.0, "grad_norm": 2.0676968069167136, "language_loss": 0.83424294, "learning_rate": 3.06033690059573e-06, "loss": 0.85639882, "num_input_tokens_seen": 115137755, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8515625, "step": 5359, "time_per_iteration": 2.6988420486450195 }, { "auxiliary_loss_clip": 0.01138001, "auxiliary_loss_mlp": 0.01039185, "balance_loss_clip": 1.02331853, "balance_loss_mlp": 1.04386878, "epoch": 0.32226063430031565, "flos": 22674177319680.0, "grad_norm": 1.5843170590240585, "language_loss": 0.79782915, "learning_rate": 3.060016529021195e-06, "loss": 0.81960094, "num_input_tokens_seen": 115158150, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.85546875, "step": 5360, "time_per_iteration": 2.519529104232788 }, { "auxiliary_loss_clip": 0.0105664, "auxiliary_loss_mlp": 0.01003364, "balance_loss_clip": 1.00139737, "balance_loss_mlp": 1.02073503, "epoch": 0.3223207575529836, "flos": 63828525075840.0, "grad_norm": 0.6535730332212986, "language_loss": 0.56963432, "learning_rate": 3.0596961196173965e-06, "loss": 0.5902344, "num_input_tokens_seen": 115212755, "router_z_loss_clip": 0.01965332, "router_z_loss_mlp": 0.2734375, "step": 5361, "time_per_iteration": 3.066864252090454 }, { "auxiliary_loss_clip": 0.0117267, "auxiliary_loss_mlp": 0.01043926, "balance_loss_clip": 1.02796388, "balance_loss_mlp": 1.04547358, "epoch": 0.3223808808056516, "flos": 26870123537280.0, "grad_norm": 2.0482007319392928, "language_loss": 0.70862365, "learning_rate": 3.0593756723957695e-06, "loss": 0.7307896, "num_input_tokens_seen": 115233090, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.82421875, "step": 5362, "time_per_iteration": 2.746640682220459 }, { "auxiliary_loss_clip": 0.01137004, "auxiliary_loss_mlp": 0.01040298, "balance_loss_clip": 1.02574861, "balance_loss_mlp": 1.04537678, "epoch": 0.32244100405831955, "flos": 26761350176640.0, "grad_norm": 2.5921358524573814, "language_loss": 0.74244589, "learning_rate": 3.0590551873677493e-06, "loss": 0.76421893, "num_input_tokens_seen": 115252645, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.82421875, "step": 5363, "time_per_iteration": 2.5803818702697754 }, { "auxiliary_loss_clip": 0.01139303, "auxiliary_loss_mlp": 0.01040751, "balance_loss_clip": 1.02428842, "balance_loss_mlp": 1.04303265, "epoch": 0.3225011273109875, "flos": 23803029020160.0, "grad_norm": 2.812917196709283, "language_loss": 0.76784611, "learning_rate": 3.058734664544774e-06, "loss": 0.78964663, "num_input_tokens_seen": 115269085, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.87109375, "step": 5364, "time_per_iteration": 2.588528633117676 }, { "auxiliary_loss_clip": 0.01144998, "auxiliary_loss_mlp": 0.01041597, "balance_loss_clip": 1.02443087, "balance_loss_mlp": 1.04379225, "epoch": 0.3225612505636555, "flos": 17274505501440.0, "grad_norm": 2.318160356144933, "language_loss": 0.7688089, "learning_rate": 3.0584141039382828e-06, "loss": 0.79067481, "num_input_tokens_seen": 115286470, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.83203125, "step": 5365, "time_per_iteration": 2.528383255004883 }, { "auxiliary_loss_clip": 0.01148981, "auxiliary_loss_mlp": 0.01043464, "balance_loss_clip": 1.02747774, "balance_loss_mlp": 1.04792142, "epoch": 0.32262137381632344, "flos": 23366247638400.0, "grad_norm": 1.5642735948227215, "language_loss": 0.76631755, "learning_rate": 3.0580935055597135e-06, "loss": 0.78824198, "num_input_tokens_seen": 115307000, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83203125, "step": 5366, "time_per_iteration": 2.6192264556884766 }, { "auxiliary_loss_clip": 0.01145353, "auxiliary_loss_mlp": 0.01038663, "balance_loss_clip": 1.02214074, "balance_loss_mlp": 1.0442853, "epoch": 0.3226814970689914, "flos": 23258803080960.0, "grad_norm": 2.2863854111930646, "language_loss": 0.71506357, "learning_rate": 3.057772869420509e-06, "loss": 0.73690379, "num_input_tokens_seen": 115325925, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.83203125, "step": 5367, "time_per_iteration": 2.6175448894500732 }, { "auxiliary_loss_clip": 0.01124994, "auxiliary_loss_mlp": 0.01037206, "balance_loss_clip": 1.02231693, "balance_loss_mlp": 1.04313707, "epoch": 0.32274162032165943, "flos": 16395196561920.0, "grad_norm": 3.422410256383468, "language_loss": 0.74406195, "learning_rate": 3.057452195532112e-06, "loss": 0.76568389, "num_input_tokens_seen": 115343705, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 5368, "time_per_iteration": 2.6063101291656494 }, { "auxiliary_loss_clip": 0.01142333, "auxiliary_loss_mlp": 0.0103864, "balance_loss_clip": 1.02297616, "balance_loss_mlp": 1.04390991, "epoch": 0.3228017435743274, "flos": 27855081354240.0, "grad_norm": 1.6353795437515866, "language_loss": 0.78521931, "learning_rate": 3.057131483905967e-06, "loss": 0.80702901, "num_input_tokens_seen": 115364170, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8046875, "step": 5369, "time_per_iteration": 2.696481466293335 }, { "auxiliary_loss_clip": 0.01151912, "auxiliary_loss_mlp": 0.01031726, "balance_loss_clip": 1.01725388, "balance_loss_mlp": 1.0449276, "epoch": 0.32286186682699536, "flos": 19608770741760.0, "grad_norm": 2.172101708302971, "language_loss": 0.83458591, "learning_rate": 3.0568107345535173e-06, "loss": 0.85642225, "num_input_tokens_seen": 115382495, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80078125, "step": 5370, "time_per_iteration": 2.686187505722046 }, { "auxiliary_loss_clip": 0.01129858, "auxiliary_loss_mlp": 0.01038175, "balance_loss_clip": 1.02246356, "balance_loss_mlp": 1.04657722, "epoch": 0.3229219900796633, "flos": 24134017870080.0, "grad_norm": 2.0477511191770748, "language_loss": 0.83413827, "learning_rate": 3.0564899474862112e-06, "loss": 0.85581863, "num_input_tokens_seen": 115399450, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.83203125, "step": 5371, "time_per_iteration": 2.6322715282440186 }, { "auxiliary_loss_clip": 0.01141861, "auxiliary_loss_mlp": 0.01289153, "balance_loss_clip": 1.02475572, "balance_loss_mlp": 1.04652405, "epoch": 0.3229821133323313, "flos": 17748705876480.0, "grad_norm": 2.5705293609406032, "language_loss": 0.88255632, "learning_rate": 3.056169122715497e-06, "loss": 0.90686655, "num_input_tokens_seen": 115417700, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.86328125, "step": 5372, "time_per_iteration": 2.630120277404785 }, { "auxiliary_loss_clip": 0.01129234, "auxiliary_loss_mlp": 0.01043563, "balance_loss_clip": 1.02793455, "balance_loss_mlp": 1.04719472, "epoch": 0.32304223658499925, "flos": 22346025644160.0, "grad_norm": 3.76566324120057, "language_loss": 0.72961867, "learning_rate": 3.055848260252823e-06, "loss": 0.75134659, "num_input_tokens_seen": 115435840, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 5373, "time_per_iteration": 2.525064468383789 }, { "auxiliary_loss_clip": 0.01144241, "auxiliary_loss_mlp": 0.01035381, "balance_loss_clip": 1.02031922, "balance_loss_mlp": 1.0459919, "epoch": 0.3231023598376672, "flos": 18478302929280.0, "grad_norm": 14.564106562672698, "language_loss": 0.80943966, "learning_rate": 3.055527360109641e-06, "loss": 0.83123589, "num_input_tokens_seen": 115454210, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 5374, "time_per_iteration": 3.9569666385650635 }, { "auxiliary_loss_clip": 0.01152229, "auxiliary_loss_mlp": 0.01039943, "balance_loss_clip": 1.02420688, "balance_loss_mlp": 1.04449511, "epoch": 0.3231624830903352, "flos": 27636313570560.0, "grad_norm": 2.101867202427954, "language_loss": 0.87561619, "learning_rate": 3.0552064222974024e-06, "loss": 0.89753795, "num_input_tokens_seen": 115471785, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.80859375, "step": 5375, "time_per_iteration": 2.6325924396514893 }, { "auxiliary_loss_clip": 0.01129736, "auxiliary_loss_mlp": 0.01035308, "balance_loss_clip": 1.0190593, "balance_loss_mlp": 1.04418612, "epoch": 0.32322260634300315, "flos": 21726423014400.0, "grad_norm": 3.374238093158515, "language_loss": 0.76082665, "learning_rate": 3.054885446827561e-06, "loss": 0.78247708, "num_input_tokens_seen": 115491405, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.85546875, "step": 5376, "time_per_iteration": 2.5104148387908936 }, { "auxiliary_loss_clip": 0.0115115, "auxiliary_loss_mlp": 0.01031911, "balance_loss_clip": 1.01758158, "balance_loss_mlp": 1.04330587, "epoch": 0.3232827295956711, "flos": 22637656166400.0, "grad_norm": 1.6778136808521305, "language_loss": 0.66584927, "learning_rate": 3.0545644337115716e-06, "loss": 0.68767989, "num_input_tokens_seen": 115511555, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.80859375, "step": 5377, "time_per_iteration": 2.573176622390747 }, { "auxiliary_loss_clip": 0.01153975, "auxiliary_loss_mlp": 0.01045137, "balance_loss_clip": 1.02792335, "balance_loss_mlp": 1.04580104, "epoch": 0.3233428528483391, "flos": 26322593546880.0, "grad_norm": 1.5485467380197708, "language_loss": 0.72593975, "learning_rate": 3.0542433829608902e-06, "loss": 0.74793088, "num_input_tokens_seen": 115532860, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8125, "step": 5378, "time_per_iteration": 3.9844448566436768 }, { "auxiliary_loss_clip": 0.01134726, "auxiliary_loss_mlp": 0.01035888, "balance_loss_clip": 1.02098656, "balance_loss_mlp": 1.04271245, "epoch": 0.32340297610100704, "flos": 28585217111040.0, "grad_norm": 2.738376125535063, "language_loss": 0.81821382, "learning_rate": 3.0539222945869742e-06, "loss": 0.83991992, "num_input_tokens_seen": 115553850, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.828125, "step": 5379, "time_per_iteration": 2.6250226497650146 }, { "auxiliary_loss_clip": 0.01138095, "auxiliary_loss_mlp": 0.01036268, "balance_loss_clip": 1.0209856, "balance_loss_mlp": 1.04411244, "epoch": 0.323463099353675, "flos": 30773792787840.0, "grad_norm": 1.619770277094345, "language_loss": 0.78789282, "learning_rate": 3.0536011686012827e-06, "loss": 0.80963653, "num_input_tokens_seen": 115575530, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84765625, "step": 5380, "time_per_iteration": 4.184469699859619 }, { "auxiliary_loss_clip": 0.01139118, "auxiliary_loss_mlp": 0.01042956, "balance_loss_clip": 1.02804279, "balance_loss_mlp": 1.04712546, "epoch": 0.32352322260634303, "flos": 25228610974080.0, "grad_norm": 1.809305460987415, "language_loss": 0.77150941, "learning_rate": 3.0532800050152752e-06, "loss": 0.79333019, "num_input_tokens_seen": 115594885, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.828125, "step": 5381, "time_per_iteration": 2.564606189727783 }, { "auxiliary_loss_clip": 0.01135104, "auxiliary_loss_mlp": 0.01036595, "balance_loss_clip": 1.02230215, "balance_loss_mlp": 1.04615378, "epoch": 0.323583345859011, "flos": 23330480670720.0, "grad_norm": 1.845730368156252, "language_loss": 0.71895099, "learning_rate": 3.052958803840414e-06, "loss": 0.74066794, "num_input_tokens_seen": 115614080, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 5382, "time_per_iteration": 2.6590287685394287 }, { "auxiliary_loss_clip": 0.01149042, "auxiliary_loss_mlp": 0.01041039, "balance_loss_clip": 1.02521992, "balance_loss_mlp": 1.04540324, "epoch": 0.32364346911167896, "flos": 26207499392640.0, "grad_norm": 2.150452181898266, "language_loss": 0.70852077, "learning_rate": 3.0526375650881617e-06, "loss": 0.73042154, "num_input_tokens_seen": 115632820, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 5383, "time_per_iteration": 4.123288631439209 }, { "auxiliary_loss_clip": 0.01135956, "auxiliary_loss_mlp": 0.01039763, "balance_loss_clip": 1.02508867, "balance_loss_mlp": 1.04700935, "epoch": 0.3237035923643469, "flos": 23695764030720.0, "grad_norm": 2.103024113411186, "language_loss": 0.78522968, "learning_rate": 3.0523162887699824e-06, "loss": 0.80698687, "num_input_tokens_seen": 115652860, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80078125, "step": 5384, "time_per_iteration": 2.5878098011016846 }, { "auxiliary_loss_clip": 0.0117692, "auxiliary_loss_mlp": 0.010445, "balance_loss_clip": 1.02720261, "balance_loss_mlp": 1.04682851, "epoch": 0.3237637156170149, "flos": 14428728633600.0, "grad_norm": 2.1790306361374046, "language_loss": 0.75283647, "learning_rate": 3.051994974897342e-06, "loss": 0.77505064, "num_input_tokens_seen": 115670940, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8515625, "step": 5385, "time_per_iteration": 2.6488380432128906 }, { "auxiliary_loss_clip": 0.01148961, "auxiliary_loss_mlp": 0.01040416, "balance_loss_clip": 1.02473354, "balance_loss_mlp": 1.04869008, "epoch": 0.32382383886968286, "flos": 31062981185280.0, "grad_norm": 1.8920293940757145, "language_loss": 0.71481979, "learning_rate": 3.051673623481706e-06, "loss": 0.73671353, "num_input_tokens_seen": 115691155, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.828125, "step": 5386, "time_per_iteration": 2.787262201309204 }, { "auxiliary_loss_clip": 0.01133641, "auxiliary_loss_mlp": 0.01039586, "balance_loss_clip": 1.02308714, "balance_loss_mlp": 1.04642928, "epoch": 0.3238839621223508, "flos": 23256935573760.0, "grad_norm": 1.6068678972180033, "language_loss": 0.94163835, "learning_rate": 3.0513522345345446e-06, "loss": 0.96337062, "num_input_tokens_seen": 115710340, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.87109375, "step": 5387, "time_per_iteration": 2.6753783226013184 }, { "auxiliary_loss_clip": 0.01141869, "auxiliary_loss_mlp": 0.01045033, "balance_loss_clip": 1.02829635, "balance_loss_mlp": 1.04794014, "epoch": 0.3239440853750188, "flos": 22964658606720.0, "grad_norm": 2.0267792965725575, "language_loss": 0.7728219, "learning_rate": 3.0510308080673256e-06, "loss": 0.79469097, "num_input_tokens_seen": 115726745, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8515625, "step": 5388, "time_per_iteration": 2.5436336994171143 }, { "auxiliary_loss_clip": 0.01147178, "auxiliary_loss_mlp": 0.01032072, "balance_loss_clip": 1.01634789, "balance_loss_mlp": 1.04452515, "epoch": 0.32400420862768675, "flos": 36246614653440.0, "grad_norm": 1.9334233807743113, "language_loss": 0.72005689, "learning_rate": 3.0507093440915214e-06, "loss": 0.74184942, "num_input_tokens_seen": 115749385, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84765625, "step": 5389, "time_per_iteration": 2.788490056991577 }, { "auxiliary_loss_clip": 0.01153422, "auxiliary_loss_mlp": 0.01040773, "balance_loss_clip": 1.02401209, "balance_loss_mlp": 1.04611087, "epoch": 0.3240643318803547, "flos": 21616500418560.0, "grad_norm": 1.766710176596492, "language_loss": 0.81058115, "learning_rate": 3.0503878426186028e-06, "loss": 0.83252311, "num_input_tokens_seen": 115768105, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8046875, "step": 5390, "time_per_iteration": 2.646101474761963 }, { "auxiliary_loss_clip": 0.01150663, "auxiliary_loss_mlp": 0.01043591, "balance_loss_clip": 1.0279392, "balance_loss_mlp": 1.04930913, "epoch": 0.3241244551330227, "flos": 23295611543040.0, "grad_norm": 1.899185338870307, "language_loss": 0.7297979, "learning_rate": 3.050066303660044e-06, "loss": 0.75174046, "num_input_tokens_seen": 115787340, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 5391, "time_per_iteration": 2.6153981685638428 }, { "auxiliary_loss_clip": 0.01144365, "auxiliary_loss_mlp": 0.01041586, "balance_loss_clip": 1.02656591, "balance_loss_mlp": 1.04574502, "epoch": 0.32418457838569065, "flos": 14097236993280.0, "grad_norm": 1.8257021635155994, "language_loss": 0.77197397, "learning_rate": 3.0497447272273203e-06, "loss": 0.79383349, "num_input_tokens_seen": 115805565, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80859375, "step": 5392, "time_per_iteration": 2.4668142795562744 }, { "auxiliary_loss_clip": 0.01150361, "auxiliary_loss_mlp": 0.01041691, "balance_loss_clip": 1.02504933, "balance_loss_mlp": 1.04661632, "epoch": 0.3242447016383586, "flos": 29752672953600.0, "grad_norm": 1.7372751763081922, "language_loss": 0.62108004, "learning_rate": 3.049423113331907e-06, "loss": 0.64300054, "num_input_tokens_seen": 115826725, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.859375, "step": 5393, "time_per_iteration": 2.651538372039795 }, { "auxiliary_loss_clip": 0.01135632, "auxiliary_loss_mlp": 0.01037151, "balance_loss_clip": 1.02114129, "balance_loss_mlp": 1.04418731, "epoch": 0.3243048248910266, "flos": 24351205455360.0, "grad_norm": 1.5088527489009815, "language_loss": 0.82626522, "learning_rate": 3.049101461985283e-06, "loss": 0.84799302, "num_input_tokens_seen": 115846955, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.82421875, "step": 5394, "time_per_iteration": 2.5351181030273438 }, { "auxiliary_loss_clip": 0.01142955, "auxiliary_loss_mlp": 0.0104597, "balance_loss_clip": 1.03217769, "balance_loss_mlp": 1.04683828, "epoch": 0.3243649481436946, "flos": 24353037048960.0, "grad_norm": 1.784814097303431, "language_loss": 0.81573427, "learning_rate": 3.048779773198926e-06, "loss": 0.83762348, "num_input_tokens_seen": 115865975, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78515625, "step": 5395, "time_per_iteration": 2.609987497329712 }, { "auxiliary_loss_clip": 0.01135805, "auxiliary_loss_mlp": 0.01042367, "balance_loss_clip": 1.02739453, "balance_loss_mlp": 1.04872417, "epoch": 0.32442507139636256, "flos": 22925228451840.0, "grad_norm": 4.907655845157836, "language_loss": 0.83193326, "learning_rate": 3.048458046984317e-06, "loss": 0.85371494, "num_input_tokens_seen": 115884950, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.78125, "step": 5396, "time_per_iteration": 2.5194811820983887 }, { "auxiliary_loss_clip": 0.01159007, "auxiliary_loss_mlp": 0.01043932, "balance_loss_clip": 1.02796984, "balance_loss_mlp": 1.04621387, "epoch": 0.32448519464903053, "flos": 22200192426240.0, "grad_norm": 2.2125871367343386, "language_loss": 0.75664496, "learning_rate": 3.0481362833529363e-06, "loss": 0.77867436, "num_input_tokens_seen": 115904170, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.859375, "step": 5397, "time_per_iteration": 2.577176094055176 }, { "auxiliary_loss_clip": 0.01153773, "auxiliary_loss_mlp": 0.01034424, "balance_loss_clip": 1.01902795, "balance_loss_mlp": 1.04662967, "epoch": 0.3245453179016985, "flos": 18838450644480.0, "grad_norm": 2.3342070056591653, "language_loss": 0.6659252, "learning_rate": 3.0478144823162686e-06, "loss": 0.6878072, "num_input_tokens_seen": 115919255, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8046875, "step": 5398, "time_per_iteration": 2.5455996990203857 }, { "auxiliary_loss_clip": 0.01144981, "auxiliary_loss_mlp": 0.01033391, "balance_loss_clip": 1.01798928, "balance_loss_mlp": 1.04552889, "epoch": 0.32460544115436646, "flos": 21178390233600.0, "grad_norm": 1.476097977634892, "language_loss": 0.72885609, "learning_rate": 3.0474926438857976e-06, "loss": 0.75063986, "num_input_tokens_seen": 115938535, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8125, "step": 5399, "time_per_iteration": 2.633122205734253 }, { "auxiliary_loss_clip": 0.01156157, "auxiliary_loss_mlp": 0.01036113, "balance_loss_clip": 1.02084255, "balance_loss_mlp": 1.04733372, "epoch": 0.3246655644070344, "flos": 21981137333760.0, "grad_norm": 3.4091676516479423, "language_loss": 0.71314985, "learning_rate": 3.047170768073008e-06, "loss": 0.73507255, "num_input_tokens_seen": 115955005, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 5400, "time_per_iteration": 2.7739474773406982 }, { "auxiliary_loss_clip": 0.01138898, "auxiliary_loss_mlp": 0.01036867, "balance_loss_clip": 1.02175164, "balance_loss_mlp": 1.04646397, "epoch": 0.3247256876597024, "flos": 32159729105280.0, "grad_norm": 1.9609944114300055, "language_loss": 0.79329443, "learning_rate": 3.046848854889388e-06, "loss": 0.81505215, "num_input_tokens_seen": 115975305, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8359375, "step": 5401, "time_per_iteration": 2.6658167839050293 }, { "auxiliary_loss_clip": 0.01149064, "auxiliary_loss_mlp": 0.01043188, "balance_loss_clip": 1.02655792, "balance_loss_mlp": 1.0468781, "epoch": 0.32478581091237035, "flos": 20997544233600.0, "grad_norm": 1.373261360310668, "language_loss": 0.87537265, "learning_rate": 3.0465269043464243e-06, "loss": 0.89729524, "num_input_tokens_seen": 115994810, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.84375, "step": 5402, "time_per_iteration": 2.6061947345733643 }, { "auxiliary_loss_clip": 0.01158904, "auxiliary_loss_mlp": 0.01038918, "balance_loss_clip": 1.0245887, "balance_loss_mlp": 1.04461002, "epoch": 0.3248459341650383, "flos": 17924990849280.0, "grad_norm": 1.8699658639453132, "language_loss": 0.84326804, "learning_rate": 3.0462049164556082e-06, "loss": 0.86524624, "num_input_tokens_seen": 116011095, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78125, "step": 5403, "time_per_iteration": 2.5742852687835693 }, { "auxiliary_loss_clip": 0.01144161, "auxiliary_loss_mlp": 0.01036161, "balance_loss_clip": 1.02266073, "balance_loss_mlp": 1.04804778, "epoch": 0.3249060574177063, "flos": 24535606901760.0, "grad_norm": 2.032228779039135, "language_loss": 0.87737978, "learning_rate": 3.0458828912284293e-06, "loss": 0.89918303, "num_input_tokens_seen": 116028805, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78125, "step": 5404, "time_per_iteration": 2.64345121383667 }, { "auxiliary_loss_clip": 0.01135572, "auxiliary_loss_mlp": 0.01039842, "balance_loss_clip": 1.02424896, "balance_loss_mlp": 1.04687667, "epoch": 0.32496618067037425, "flos": 25994765093760.0, "grad_norm": 1.503431619476609, "language_loss": 0.724271, "learning_rate": 3.0455608286763803e-06, "loss": 0.74602515, "num_input_tokens_seen": 116047765, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.796875, "step": 5405, "time_per_iteration": 2.5790181159973145 }, { "auxiliary_loss_clip": 0.01153747, "auxiliary_loss_mlp": 0.01036486, "balance_loss_clip": 1.0209831, "balance_loss_mlp": 1.04609215, "epoch": 0.3250263039230422, "flos": 19573757959680.0, "grad_norm": 1.6891285349934053, "language_loss": 0.82855517, "learning_rate": 3.045238728810955e-06, "loss": 0.85045743, "num_input_tokens_seen": 116068385, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.80859375, "step": 5406, "time_per_iteration": 2.6441965103149414 }, { "auxiliary_loss_clip": 0.01127462, "auxiliary_loss_mlp": 0.01037314, "balance_loss_clip": 1.02285385, "balance_loss_mlp": 1.04767907, "epoch": 0.3250864271757102, "flos": 16763640318720.0, "grad_norm": 1.8048240789132275, "language_loss": 0.87653077, "learning_rate": 3.0449165916436485e-06, "loss": 0.89817858, "num_input_tokens_seen": 116085350, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 5407, "time_per_iteration": 2.458893060684204 }, { "auxiliary_loss_clip": 0.01144597, "auxiliary_loss_mlp": 0.01038832, "balance_loss_clip": 1.02362132, "balance_loss_mlp": 1.04601967, "epoch": 0.3251465504283782, "flos": 27819458040960.0, "grad_norm": 1.502918483388092, "language_loss": 0.69419134, "learning_rate": 3.044594417185956e-06, "loss": 0.71602565, "num_input_tokens_seen": 116107560, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80859375, "step": 5408, "time_per_iteration": 2.736516237258911 }, { "auxiliary_loss_clip": 0.0114077, "auxiliary_loss_mlp": 0.01034628, "balance_loss_clip": 1.01937532, "balance_loss_mlp": 1.0442338, "epoch": 0.32520667368104617, "flos": 19063144172160.0, "grad_norm": 2.033866116476812, "language_loss": 0.77724737, "learning_rate": 3.044272205449376e-06, "loss": 0.79900134, "num_input_tokens_seen": 116125980, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.875, "step": 5409, "time_per_iteration": 2.510864019393921 }, { "auxiliary_loss_clip": 0.01172254, "auxiliary_loss_mlp": 0.01046511, "balance_loss_clip": 1.03137183, "balance_loss_mlp": 1.0438478, "epoch": 0.32526679693371413, "flos": 29382146208000.0, "grad_norm": 1.664740661833315, "language_loss": 0.82966733, "learning_rate": 3.0439499564454073e-06, "loss": 0.85185498, "num_input_tokens_seen": 116146530, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.84375, "step": 5410, "time_per_iteration": 2.6781227588653564 }, { "auxiliary_loss_clip": 0.01158735, "auxiliary_loss_mlp": 0.01035884, "balance_loss_clip": 1.0214119, "balance_loss_mlp": 1.0433706, "epoch": 0.3253269201863821, "flos": 20704513080960.0, "grad_norm": 2.8839079913248993, "language_loss": 0.71202409, "learning_rate": 3.04362767018555e-06, "loss": 0.73397028, "num_input_tokens_seen": 116165695, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 5411, "time_per_iteration": 2.5377845764160156 }, { "auxiliary_loss_clip": 0.01162948, "auxiliary_loss_mlp": 0.01035743, "balance_loss_clip": 1.02177203, "balance_loss_mlp": 1.04639602, "epoch": 0.32538704343905006, "flos": 29094142959360.0, "grad_norm": 1.5180235200512582, "language_loss": 0.82877862, "learning_rate": 3.0433053466813053e-06, "loss": 0.85076553, "num_input_tokens_seen": 116185375, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.8046875, "step": 5412, "time_per_iteration": 2.8449223041534424 }, { "auxiliary_loss_clip": 0.01156413, "auxiliary_loss_mlp": 0.01040274, "balance_loss_clip": 1.02401423, "balance_loss_mlp": 1.04462433, "epoch": 0.325447166691718, "flos": 24676124906880.0, "grad_norm": 1.8892752425242594, "language_loss": 0.80727327, "learning_rate": 3.042982985944177e-06, "loss": 0.82924008, "num_input_tokens_seen": 116204335, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8515625, "step": 5413, "time_per_iteration": 2.639033555984497 }, { "auxiliary_loss_clip": 0.01145544, "auxiliary_loss_mlp": 0.01036626, "balance_loss_clip": 1.020437, "balance_loss_mlp": 1.04605436, "epoch": 0.325507289944386, "flos": 21543134889600.0, "grad_norm": 1.6297767697273355, "language_loss": 0.767609, "learning_rate": 3.0426605879856685e-06, "loss": 0.78943068, "num_input_tokens_seen": 116222840, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.81640625, "step": 5414, "time_per_iteration": 2.6951076984405518 }, { "auxiliary_loss_clip": 0.01078393, "auxiliary_loss_mlp": 0.01011381, "balance_loss_clip": 1.00875854, "balance_loss_mlp": 1.0250082, "epoch": 0.32556741319705396, "flos": 71518722347520.0, "grad_norm": 0.9185582658811056, "language_loss": 0.63939464, "learning_rate": 3.0423381528172864e-06, "loss": 0.66029239, "num_input_tokens_seen": 116274940, "router_z_loss_clip": 0.02624512, "router_z_loss_mlp": 0.265625, "step": 5415, "time_per_iteration": 3.1237430572509766 }, { "auxiliary_loss_clip": 0.01151966, "auxiliary_loss_mlp": 0.01036117, "balance_loss_clip": 1.02131128, "balance_loss_mlp": 1.04413891, "epoch": 0.3256275364497219, "flos": 23732428838400.0, "grad_norm": 1.5253528177670448, "language_loss": 0.73840058, "learning_rate": 3.042015680450536e-06, "loss": 0.76028144, "num_input_tokens_seen": 116297300, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80859375, "step": 5416, "time_per_iteration": 4.020952939987183 }, { "auxiliary_loss_clip": 0.01050437, "auxiliary_loss_mlp": 0.01005033, "balance_loss_clip": 1.00267243, "balance_loss_mlp": 1.02382934, "epoch": 0.3256876597023899, "flos": 67289199891840.0, "grad_norm": 0.7778750939454218, "language_loss": 0.57939655, "learning_rate": 3.041693170896926e-06, "loss": 0.59995127, "num_input_tokens_seen": 116362370, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.265625, "step": 5417, "time_per_iteration": 3.24186635017395 }, { "auxiliary_loss_clip": 0.01067648, "auxiliary_loss_mlp": 0.01001004, "balance_loss_clip": 0.99866736, "balance_loss_mlp": 1.02294111, "epoch": 0.32574778295505785, "flos": 71282323964160.0, "grad_norm": 0.8225117967780053, "language_loss": 0.63498342, "learning_rate": 3.0413706241679674e-06, "loss": 0.65566993, "num_input_tokens_seen": 116430365, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.265625, "step": 5418, "time_per_iteration": 3.2953407764434814 }, { "auxiliary_loss_clip": 0.01145661, "auxiliary_loss_mlp": 0.01043092, "balance_loss_clip": 1.02822089, "balance_loss_mlp": 1.04578042, "epoch": 0.3258079062077258, "flos": 20776370238720.0, "grad_norm": 1.808534838609019, "language_loss": 0.69291276, "learning_rate": 3.041048040275169e-06, "loss": 0.71480024, "num_input_tokens_seen": 116447525, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 5419, "time_per_iteration": 4.012064695358276 }, { "auxiliary_loss_clip": 0.01136522, "auxiliary_loss_mlp": 0.01037199, "balance_loss_clip": 1.02114177, "balance_loss_mlp": 1.04502225, "epoch": 0.3258680294603938, "flos": 22235456603520.0, "grad_norm": 1.6386914371336958, "language_loss": 0.76884139, "learning_rate": 3.0407254192300444e-06, "loss": 0.7905786, "num_input_tokens_seen": 116466310, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.82421875, "step": 5420, "time_per_iteration": 2.5900182723999023 }, { "auxiliary_loss_clip": 0.01139555, "auxiliary_loss_mlp": 0.01289033, "balance_loss_clip": 1.02567589, "balance_loss_mlp": 1.04633498, "epoch": 0.3259281527130618, "flos": 26979974305920.0, "grad_norm": 1.51528604943008, "language_loss": 0.79905874, "learning_rate": 3.040402761044107e-06, "loss": 0.82334459, "num_input_tokens_seen": 116487825, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.83984375, "step": 5421, "time_per_iteration": 4.2723166942596436 }, { "auxiliary_loss_clip": 0.01123284, "auxiliary_loss_mlp": 0.01036657, "balance_loss_clip": 1.02241111, "balance_loss_mlp": 1.04449677, "epoch": 0.32598827596572977, "flos": 26214251149440.0, "grad_norm": 2.1425010847050427, "language_loss": 0.75293022, "learning_rate": 3.040080065728871e-06, "loss": 0.77452964, "num_input_tokens_seen": 116509950, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 5422, "time_per_iteration": 2.563945770263672 }, { "auxiliary_loss_clip": 0.01156266, "auxiliary_loss_mlp": 0.01289743, "balance_loss_clip": 1.02671039, "balance_loss_mlp": 1.04693973, "epoch": 0.32604839921839773, "flos": 17639752947840.0, "grad_norm": 1.9283793119127453, "language_loss": 0.62614584, "learning_rate": 3.0397573332958527e-06, "loss": 0.65060604, "num_input_tokens_seen": 116527695, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.828125, "step": 5423, "time_per_iteration": 2.583709955215454 }, { "auxiliary_loss_clip": 0.01137057, "auxiliary_loss_mlp": 0.01031822, "balance_loss_clip": 1.01883364, "balance_loss_mlp": 1.04240346, "epoch": 0.3261085224710657, "flos": 23622721724160.0, "grad_norm": 1.5919066643400601, "language_loss": 0.74612182, "learning_rate": 3.039434563756569e-06, "loss": 0.76781058, "num_input_tokens_seen": 116547800, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.765625, "step": 5424, "time_per_iteration": 3.9654595851898193 }, { "auxiliary_loss_clip": 0.01140472, "auxiliary_loss_mlp": 0.01040757, "balance_loss_clip": 1.02673829, "balance_loss_mlp": 1.04426742, "epoch": 0.32616864572373366, "flos": 23260455106560.0, "grad_norm": 1.5716607384906638, "language_loss": 0.76919734, "learning_rate": 3.0391117571225407e-06, "loss": 0.79100966, "num_input_tokens_seen": 116568460, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 5425, "time_per_iteration": 2.609199047088623 }, { "auxiliary_loss_clip": 0.01155602, "auxiliary_loss_mlp": 0.01042762, "balance_loss_clip": 1.026335, "balance_loss_mlp": 1.04403281, "epoch": 0.32622876897640163, "flos": 25593427457280.0, "grad_norm": 2.158086300402193, "language_loss": 0.77797192, "learning_rate": 3.0387889134052866e-06, "loss": 0.79995555, "num_input_tokens_seen": 116588705, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84765625, "step": 5426, "time_per_iteration": 2.6525328159332275 }, { "auxiliary_loss_clip": 0.01162193, "auxiliary_loss_mlp": 0.01039078, "balance_loss_clip": 1.02299666, "balance_loss_mlp": 1.04561353, "epoch": 0.3262888922290696, "flos": 22418996123520.0, "grad_norm": 1.7964305325356806, "language_loss": 0.74195063, "learning_rate": 3.0384660326163277e-06, "loss": 0.76396334, "num_input_tokens_seen": 116608845, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8125, "step": 5427, "time_per_iteration": 2.664259195327759 }, { "auxiliary_loss_clip": 0.01154098, "auxiliary_loss_mlp": 0.0104162, "balance_loss_clip": 1.02560973, "balance_loss_mlp": 1.04391479, "epoch": 0.32634901548173756, "flos": 19718908819200.0, "grad_norm": 2.0964830157798997, "language_loss": 0.79054368, "learning_rate": 3.0381431147671875e-06, "loss": 0.81250083, "num_input_tokens_seen": 116628145, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8359375, "step": 5428, "time_per_iteration": 2.6524829864501953 }, { "auxiliary_loss_clip": 0.01134275, "auxiliary_loss_mlp": 0.0103933, "balance_loss_clip": 1.02424991, "balance_loss_mlp": 1.04413438, "epoch": 0.3264091387344055, "flos": 16142924367360.0, "grad_norm": 3.6486748516702625, "language_loss": 0.70688534, "learning_rate": 3.03782015986939e-06, "loss": 0.72862148, "num_input_tokens_seen": 116646920, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 5429, "time_per_iteration": 2.544734239578247 }, { "auxiliary_loss_clip": 0.01152838, "auxiliary_loss_mlp": 0.01042353, "balance_loss_clip": 1.02803588, "balance_loss_mlp": 1.04543054, "epoch": 0.3264692619870735, "flos": 16399075230720.0, "grad_norm": 1.6295842492031745, "language_loss": 0.78309768, "learning_rate": 3.037497167934461e-06, "loss": 0.80504954, "num_input_tokens_seen": 116665100, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.80859375, "step": 5430, "time_per_iteration": 2.653336524963379 }, { "auxiliary_loss_clip": 0.01166886, "auxiliary_loss_mlp": 0.01042824, "balance_loss_clip": 1.02582479, "balance_loss_mlp": 1.0471251, "epoch": 0.32652938523974145, "flos": 22382331315840.0, "grad_norm": 1.9168854394345385, "language_loss": 0.84041142, "learning_rate": 3.037174138973927e-06, "loss": 0.86250854, "num_input_tokens_seen": 116682205, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.83984375, "step": 5431, "time_per_iteration": 2.6135427951812744 }, { "auxiliary_loss_clip": 0.01132778, "auxiliary_loss_mlp": 0.01035463, "balance_loss_clip": 1.02109861, "balance_loss_mlp": 1.04398036, "epoch": 0.3265895084924094, "flos": 21908059113600.0, "grad_norm": 1.6249831531693884, "language_loss": 0.70443487, "learning_rate": 3.0368510729993147e-06, "loss": 0.72611731, "num_input_tokens_seen": 116702575, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.796875, "step": 5432, "time_per_iteration": 2.696223735809326 }, { "auxiliary_loss_clip": 0.01161383, "auxiliary_loss_mlp": 0.01033925, "balance_loss_clip": 1.01942348, "balance_loss_mlp": 1.04293275, "epoch": 0.3266496317450774, "flos": 16067152627200.0, "grad_norm": 1.4611273559189208, "language_loss": 0.83893842, "learning_rate": 3.0365279700221555e-06, "loss": 0.86089146, "num_input_tokens_seen": 116720885, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 5433, "time_per_iteration": 2.6100945472717285 }, { "auxiliary_loss_clip": 0.01143173, "auxiliary_loss_mlp": 0.0103505, "balance_loss_clip": 1.02044702, "balance_loss_mlp": 1.04620266, "epoch": 0.3267097549977454, "flos": 22528236360960.0, "grad_norm": 1.4698647628228654, "language_loss": 0.85714024, "learning_rate": 3.036204830053979e-06, "loss": 0.87892252, "num_input_tokens_seen": 116740395, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.79296875, "step": 5434, "time_per_iteration": 2.633254051208496 }, { "auxiliary_loss_clip": 0.01145404, "auxiliary_loss_mlp": 0.01037964, "balance_loss_clip": 1.02252674, "balance_loss_mlp": 1.0440836, "epoch": 0.32676987825041337, "flos": 27270419679360.0, "grad_norm": 2.6980736869285753, "language_loss": 0.8762145, "learning_rate": 3.035881653106318e-06, "loss": 0.89804816, "num_input_tokens_seen": 116758870, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83203125, "step": 5435, "time_per_iteration": 2.61454701423645 }, { "auxiliary_loss_clip": 0.01124293, "auxiliary_loss_mlp": 0.01040309, "balance_loss_clip": 1.02600431, "balance_loss_mlp": 1.04335666, "epoch": 0.32683000150308134, "flos": 11508257433600.0, "grad_norm": 2.244264438927787, "language_loss": 0.76421088, "learning_rate": 3.035558439190705e-06, "loss": 0.78585696, "num_input_tokens_seen": 116773440, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.80859375, "step": 5436, "time_per_iteration": 2.5366008281707764 }, { "auxiliary_loss_clip": 0.01124405, "auxiliary_loss_mlp": 0.01035405, "balance_loss_clip": 1.02109373, "balance_loss_mlp": 1.04465318, "epoch": 0.3268901247557493, "flos": 25630200005760.0, "grad_norm": 1.5587252116424275, "language_loss": 0.71877116, "learning_rate": 3.0352351883186753e-06, "loss": 0.74036926, "num_input_tokens_seen": 116794375, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 5437, "time_per_iteration": 2.597907066345215 }, { "auxiliary_loss_clip": 0.01153781, "auxiliary_loss_mlp": 0.01041305, "balance_loss_clip": 1.02437782, "balance_loss_mlp": 1.04210114, "epoch": 0.32695024800841727, "flos": 24860849575680.0, "grad_norm": 1.5725984619257583, "language_loss": 0.63507688, "learning_rate": 3.034911900501765e-06, "loss": 0.65702778, "num_input_tokens_seen": 116815095, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.84375, "step": 5438, "time_per_iteration": 2.620314359664917 }, { "auxiliary_loss_clip": 0.01135315, "auxiliary_loss_mlp": 0.01036407, "balance_loss_clip": 1.02150607, "balance_loss_mlp": 1.0450232, "epoch": 0.32701037126108523, "flos": 28839249072000.0, "grad_norm": 1.506171845679458, "language_loss": 0.74633479, "learning_rate": 3.0345885757515104e-06, "loss": 0.7680521, "num_input_tokens_seen": 116836630, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 5439, "time_per_iteration": 2.608220338821411 }, { "auxiliary_loss_clip": 0.01135947, "auxiliary_loss_mlp": 0.0104387, "balance_loss_clip": 1.02824771, "balance_loss_mlp": 1.04581273, "epoch": 0.3270704945137532, "flos": 27965075777280.0, "grad_norm": 2.0773228222369817, "language_loss": 0.74349004, "learning_rate": 3.034265214079451e-06, "loss": 0.76528823, "num_input_tokens_seen": 116856880, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8125, "step": 5440, "time_per_iteration": 2.669320583343506 }, { "auxiliary_loss_clip": 0.01132533, "auxiliary_loss_mlp": 0.01292383, "balance_loss_clip": 1.03099442, "balance_loss_mlp": 1.04386711, "epoch": 0.32713061776642116, "flos": 23690700213120.0, "grad_norm": 2.002227054393919, "language_loss": 0.84406221, "learning_rate": 3.0339418154971262e-06, "loss": 0.86831135, "num_input_tokens_seen": 116873770, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 5441, "time_per_iteration": 2.6305861473083496 }, { "auxiliary_loss_clip": 0.0113657, "auxiliary_loss_mlp": 0.01038443, "balance_loss_clip": 1.02333951, "balance_loss_mlp": 1.04473269, "epoch": 0.3271907410190891, "flos": 22455625017600.0, "grad_norm": 2.4541153945254695, "language_loss": 0.8681004, "learning_rate": 3.0336183800160786e-06, "loss": 0.88985056, "num_input_tokens_seen": 116891225, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.828125, "step": 5442, "time_per_iteration": 2.598054885864258 }, { "auxiliary_loss_clip": 0.01156639, "auxiliary_loss_mlp": 0.01036221, "balance_loss_clip": 1.01942444, "balance_loss_mlp": 1.04538417, "epoch": 0.3272508642717571, "flos": 22820118278400.0, "grad_norm": 1.7142140179176557, "language_loss": 0.77148795, "learning_rate": 3.033294907647849e-06, "loss": 0.7934165, "num_input_tokens_seen": 116912300, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.84375, "step": 5443, "time_per_iteration": 2.6232104301452637 }, { "auxiliary_loss_clip": 0.01152375, "auxiliary_loss_mlp": 0.01287447, "balance_loss_clip": 1.02454627, "balance_loss_mlp": 1.04158521, "epoch": 0.32731098752442506, "flos": 11801360413440.0, "grad_norm": 6.7093727083358585, "language_loss": 0.81919211, "learning_rate": 3.0329713984039824e-06, "loss": 0.84359038, "num_input_tokens_seen": 116929425, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.84375, "step": 5444, "time_per_iteration": 2.5480527877807617 }, { "auxiliary_loss_clip": 0.01162426, "auxiliary_loss_mlp": 0.01036447, "balance_loss_clip": 1.02128386, "balance_loss_mlp": 1.0451138, "epoch": 0.327371110777093, "flos": 21027780506880.0, "grad_norm": 2.1011058183948728, "language_loss": 0.59030759, "learning_rate": 3.032647852296024e-06, "loss": 0.61229634, "num_input_tokens_seen": 116948255, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8203125, "step": 5445, "time_per_iteration": 2.599151372909546 }, { "auxiliary_loss_clip": 0.01138139, "auxiliary_loss_mlp": 0.01039019, "balance_loss_clip": 1.02325404, "balance_loss_mlp": 1.04661846, "epoch": 0.327431234029761, "flos": 19062102677760.0, "grad_norm": 2.2736713226561918, "language_loss": 0.88237643, "learning_rate": 3.0323242693355195e-06, "loss": 0.90414798, "num_input_tokens_seen": 116964905, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.828125, "step": 5446, "time_per_iteration": 2.5010969638824463 }, { "auxiliary_loss_clip": 0.01151393, "auxiliary_loss_mlp": 0.01039981, "balance_loss_clip": 1.02263641, "balance_loss_mlp": 1.04670608, "epoch": 0.32749135728242895, "flos": 25849219184640.0, "grad_norm": 1.6174578980983714, "language_loss": 0.78698897, "learning_rate": 3.0320006495340175e-06, "loss": 0.80890274, "num_input_tokens_seen": 116983650, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8671875, "step": 5447, "time_per_iteration": 2.693857192993164 }, { "auxiliary_loss_clip": 0.01142512, "auxiliary_loss_mlp": 0.01284208, "balance_loss_clip": 1.0232414, "balance_loss_mlp": 1.04354179, "epoch": 0.327551480535097, "flos": 20120533764480.0, "grad_norm": 2.4478816259960383, "language_loss": 0.73216468, "learning_rate": 3.0316769929030672e-06, "loss": 0.75643188, "num_input_tokens_seen": 117003265, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.80859375, "step": 5448, "time_per_iteration": 2.6479737758636475 }, { "auxiliary_loss_clip": 0.01127538, "auxiliary_loss_mlp": 0.01045422, "balance_loss_clip": 1.03035355, "balance_loss_mlp": 1.04598045, "epoch": 0.32761160378776494, "flos": 28803553931520.0, "grad_norm": 1.589599245297722, "language_loss": 0.66832733, "learning_rate": 3.0313532994542185e-06, "loss": 0.69005692, "num_input_tokens_seen": 117025370, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.81640625, "step": 5449, "time_per_iteration": 2.6951639652252197 }, { "auxiliary_loss_clip": 0.0115307, "auxiliary_loss_mlp": 0.01036903, "balance_loss_clip": 1.02236557, "balance_loss_mlp": 1.04512978, "epoch": 0.3276717270404329, "flos": 26937778803840.0, "grad_norm": 1.617546517406089, "language_loss": 0.65481853, "learning_rate": 3.0310295691990234e-06, "loss": 0.67671824, "num_input_tokens_seen": 117044350, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8125, "step": 5450, "time_per_iteration": 2.5950539112091064 }, { "auxiliary_loss_clip": 0.01136963, "auxiliary_loss_mlp": 0.01041606, "balance_loss_clip": 1.02651381, "balance_loss_mlp": 1.04572916, "epoch": 0.32773185029310087, "flos": 25338425829120.0, "grad_norm": 1.3856372212896066, "language_loss": 0.77095783, "learning_rate": 3.030705802149035e-06, "loss": 0.79274344, "num_input_tokens_seen": 117064450, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8203125, "step": 5451, "time_per_iteration": 2.6690049171447754 }, { "auxiliary_loss_clip": 0.01139164, "auxiliary_loss_mlp": 0.01044674, "balance_loss_clip": 1.02855659, "balance_loss_mlp": 1.04640889, "epoch": 0.32779197354576883, "flos": 26391721271040.0, "grad_norm": 2.397034053559661, "language_loss": 0.71036005, "learning_rate": 3.030381998315808e-06, "loss": 0.73219836, "num_input_tokens_seen": 117083060, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8359375, "step": 5452, "time_per_iteration": 2.594810962677002 }, { "auxiliary_loss_clip": 0.01141288, "auxiliary_loss_mlp": 0.0103682, "balance_loss_clip": 1.02238369, "balance_loss_mlp": 1.04452372, "epoch": 0.3278520967984368, "flos": 24899381890560.0, "grad_norm": 1.6378319321004613, "language_loss": 0.78530836, "learning_rate": 3.030058157710899e-06, "loss": 0.80708945, "num_input_tokens_seen": 117101860, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 5453, "time_per_iteration": 2.6639113426208496 }, { "auxiliary_loss_clip": 0.01138922, "auxiliary_loss_mlp": 0.0103426, "balance_loss_clip": 1.01966333, "balance_loss_mlp": 1.04740143, "epoch": 0.32791222005110476, "flos": 29752996176000.0, "grad_norm": 2.630220985852309, "language_loss": 0.75600749, "learning_rate": 3.0297342803458624e-06, "loss": 0.77773935, "num_input_tokens_seen": 117123100, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.82421875, "step": 5454, "time_per_iteration": 2.6156346797943115 }, { "auxiliary_loss_clip": 0.01159029, "auxiliary_loss_mlp": 0.01035121, "balance_loss_clip": 1.02207971, "balance_loss_mlp": 1.04444897, "epoch": 0.32797234330377273, "flos": 16508064072960.0, "grad_norm": 2.026762402172695, "language_loss": 0.76787663, "learning_rate": 3.029410366232259e-06, "loss": 0.78981817, "num_input_tokens_seen": 117140515, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.78125, "step": 5455, "time_per_iteration": 2.6122357845306396 }, { "auxiliary_loss_clip": 0.01139869, "auxiliary_loss_mlp": 0.01041589, "balance_loss_clip": 1.02495944, "balance_loss_mlp": 1.04565692, "epoch": 0.3280324665564407, "flos": 26577918397440.0, "grad_norm": 1.6250493792099847, "language_loss": 0.73608387, "learning_rate": 3.0290864153816467e-06, "loss": 0.75789851, "num_input_tokens_seen": 117161485, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.84765625, "step": 5456, "time_per_iteration": 2.7656056880950928 }, { "auxiliary_loss_clip": 0.01127309, "auxiliary_loss_mlp": 0.01047562, "balance_loss_clip": 1.03208852, "balance_loss_mlp": 1.04463673, "epoch": 0.32809258980910866, "flos": 22929969047040.0, "grad_norm": 1.9198660846381737, "language_loss": 0.78321552, "learning_rate": 3.028762427805588e-06, "loss": 0.80496418, "num_input_tokens_seen": 117181870, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 5457, "time_per_iteration": 3.9353113174438477 }, { "auxiliary_loss_clip": 0.01157917, "auxiliary_loss_mlp": 0.01035124, "balance_loss_clip": 1.0203898, "balance_loss_mlp": 1.04469597, "epoch": 0.3281527130617766, "flos": 22783848520320.0, "grad_norm": 33.95132895858181, "language_loss": 0.78670371, "learning_rate": 3.028438403515645e-06, "loss": 0.8086341, "num_input_tokens_seen": 117201380, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.86328125, "step": 5458, "time_per_iteration": 2.724276304244995 }, { "auxiliary_loss_clip": 0.01134276, "auxiliary_loss_mlp": 0.01036456, "balance_loss_clip": 1.02107823, "balance_loss_mlp": 1.04413533, "epoch": 0.3282128363144446, "flos": 21250678354560.0, "grad_norm": 1.61540626475352, "language_loss": 0.73241067, "learning_rate": 3.0281143425233795e-06, "loss": 0.75411803, "num_input_tokens_seen": 117221040, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8125, "step": 5459, "time_per_iteration": 2.558540105819702 }, { "auxiliary_loss_clip": 0.01147747, "auxiliary_loss_mlp": 0.01039, "balance_loss_clip": 1.02372301, "balance_loss_mlp": 1.04680943, "epoch": 0.32827295956711255, "flos": 30843064166400.0, "grad_norm": 1.8990258722748043, "language_loss": 0.84474009, "learning_rate": 3.02779024484036e-06, "loss": 0.86660755, "num_input_tokens_seen": 117241395, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83203125, "step": 5460, "time_per_iteration": 2.713423252105713 }, { "auxiliary_loss_clip": 0.01126192, "auxiliary_loss_mlp": 0.01035348, "balance_loss_clip": 1.02082849, "balance_loss_mlp": 1.04476643, "epoch": 0.3283330828197806, "flos": 25915006944000.0, "grad_norm": 1.7726004929409684, "language_loss": 0.76938379, "learning_rate": 3.0274661104781483e-06, "loss": 0.79099923, "num_input_tokens_seen": 117259340, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8125, "step": 5461, "time_per_iteration": 4.000823497772217 }, { "auxiliary_loss_clip": 0.01138468, "auxiliary_loss_mlp": 0.01037342, "balance_loss_clip": 1.0203433, "balance_loss_mlp": 1.04646063, "epoch": 0.32839320607244854, "flos": 38582065042560.0, "grad_norm": 2.1713738171184804, "language_loss": 0.63138479, "learning_rate": 3.027141939448315e-06, "loss": 0.65314287, "num_input_tokens_seen": 117282375, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.828125, "step": 5462, "time_per_iteration": 4.3351805210113525 }, { "auxiliary_loss_clip": 0.01125944, "auxiliary_loss_mlp": 0.01280721, "balance_loss_clip": 1.01882029, "balance_loss_mlp": 1.04604816, "epoch": 0.3284533293251165, "flos": 26650888876800.0, "grad_norm": 1.372104706817876, "language_loss": 0.77682275, "learning_rate": 3.0268177317624275e-06, "loss": 0.80088943, "num_input_tokens_seen": 117303830, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.796875, "step": 5463, "time_per_iteration": 2.583024740219116 }, { "auxiliary_loss_clip": 0.01154967, "auxiliary_loss_mlp": 0.01040876, "balance_loss_clip": 1.02570105, "balance_loss_mlp": 1.04566288, "epoch": 0.32851345257778447, "flos": 15304158904320.0, "grad_norm": 2.011951404783137, "language_loss": 0.69822341, "learning_rate": 3.0264934874320566e-06, "loss": 0.72018182, "num_input_tokens_seen": 117320665, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.828125, "step": 5464, "time_per_iteration": 2.5565242767333984 }, { "auxiliary_loss_clip": 0.0113641, "auxiliary_loss_mlp": 0.01043612, "balance_loss_clip": 1.02826965, "balance_loss_mlp": 1.04860759, "epoch": 0.32857357583045244, "flos": 23513732881920.0, "grad_norm": 1.7387624438897078, "language_loss": 0.72228342, "learning_rate": 3.026169206468774e-06, "loss": 0.74408364, "num_input_tokens_seen": 117339795, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.7890625, "step": 5465, "time_per_iteration": 2.585909128189087 }, { "auxiliary_loss_clip": 0.01144981, "auxiliary_loss_mlp": 0.01039602, "balance_loss_clip": 1.02506423, "balance_loss_mlp": 1.04677451, "epoch": 0.3286336990831204, "flos": 20995209849600.0, "grad_norm": 1.4717596233757375, "language_loss": 0.83432686, "learning_rate": 3.025844888884152e-06, "loss": 0.8561728, "num_input_tokens_seen": 117359525, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.80078125, "step": 5466, "time_per_iteration": 4.08076548576355 }, { "auxiliary_loss_clip": 0.01145703, "auxiliary_loss_mlp": 0.0103979, "balance_loss_clip": 1.02507925, "balance_loss_mlp": 1.04678726, "epoch": 0.32869382233578837, "flos": 23658811914240.0, "grad_norm": 2.1333008157514355, "language_loss": 0.79511172, "learning_rate": 3.0255205346897646e-06, "loss": 0.81696665, "num_input_tokens_seen": 117380320, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 5467, "time_per_iteration": 2.6327433586120605 }, { "auxiliary_loss_clip": 0.01146226, "auxiliary_loss_mlp": 0.01036812, "balance_loss_clip": 1.02158904, "balance_loss_mlp": 1.04652429, "epoch": 0.32875394558845633, "flos": 25336522408320.0, "grad_norm": 1.5533978269340263, "language_loss": 0.74880981, "learning_rate": 3.0251961438971866e-06, "loss": 0.77064013, "num_input_tokens_seen": 117400695, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 5468, "time_per_iteration": 2.640169382095337 }, { "auxiliary_loss_clip": 0.01153517, "auxiliary_loss_mlp": 0.01038197, "balance_loss_clip": 1.02118611, "balance_loss_mlp": 1.0492171, "epoch": 0.3288140688411243, "flos": 14903108576640.0, "grad_norm": 1.6961580852834746, "language_loss": 0.77862448, "learning_rate": 3.024871716517996e-06, "loss": 0.80054164, "num_input_tokens_seen": 117418800, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.859375, "step": 5469, "time_per_iteration": 2.6535604000091553 }, { "auxiliary_loss_clip": 0.01157366, "auxiliary_loss_mlp": 0.01041236, "balance_loss_clip": 1.02682376, "balance_loss_mlp": 1.04746068, "epoch": 0.32887419209379226, "flos": 18551345235840.0, "grad_norm": 4.753874464911175, "language_loss": 0.82100093, "learning_rate": 3.0245472525637706e-06, "loss": 0.84298694, "num_input_tokens_seen": 117438220, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.828125, "step": 5470, "time_per_iteration": 2.696197509765625 }, { "auxiliary_loss_clip": 0.01130172, "auxiliary_loss_mlp": 0.01041354, "balance_loss_clip": 1.02554667, "balance_loss_mlp": 1.04640293, "epoch": 0.3289343153464602, "flos": 48105610439040.0, "grad_norm": 1.7876992107559482, "language_loss": 0.67850959, "learning_rate": 3.0242227520460885e-06, "loss": 0.70022482, "num_input_tokens_seen": 117462560, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8359375, "step": 5471, "time_per_iteration": 2.846752166748047 }, { "auxiliary_loss_clip": 0.01135828, "auxiliary_loss_mlp": 0.01048664, "balance_loss_clip": 1.03079414, "balance_loss_mlp": 1.04930615, "epoch": 0.3289944385991282, "flos": 27600295207680.0, "grad_norm": 1.5433473678993752, "language_loss": 0.65132701, "learning_rate": 3.023898214976531e-06, "loss": 0.67317188, "num_input_tokens_seen": 117483665, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.8671875, "step": 5472, "time_per_iteration": 2.616699695587158 }, { "auxiliary_loss_clip": 0.01142315, "auxiliary_loss_mlp": 0.0105008, "balance_loss_clip": 1.03364098, "balance_loss_mlp": 1.04850209, "epoch": 0.32905456185179616, "flos": 20120318282880.0, "grad_norm": 1.7262022457133508, "language_loss": 0.88048875, "learning_rate": 3.02357364136668e-06, "loss": 0.90241271, "num_input_tokens_seen": 117503565, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8515625, "step": 5473, "time_per_iteration": 2.6197826862335205 }, { "auxiliary_loss_clip": 0.01161102, "auxiliary_loss_mlp": 0.01041926, "balance_loss_clip": 1.02502251, "balance_loss_mlp": 1.05058646, "epoch": 0.3291146851044642, "flos": 23180230080000.0, "grad_norm": 1.8111556598607184, "language_loss": 0.78190947, "learning_rate": 3.023249031228119e-06, "loss": 0.80393976, "num_input_tokens_seen": 117521460, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8359375, "step": 5474, "time_per_iteration": 2.615478277206421 }, { "auxiliary_loss_clip": 0.01072233, "auxiliary_loss_mlp": 0.01036391, "balance_loss_clip": 1.03342283, "balance_loss_mlp": 1.02827108, "epoch": 0.32917480835713214, "flos": 67621912594560.0, "grad_norm": 0.8124771886511848, "language_loss": 0.60163641, "learning_rate": 3.0229243845724323e-06, "loss": 0.62272263, "num_input_tokens_seen": 117580550, "router_z_loss_clip": 0.02966309, "router_z_loss_mlp": 0.265625, "step": 5475, "time_per_iteration": 3.166700601577759 }, { "auxiliary_loss_clip": 0.01160648, "auxiliary_loss_mlp": 0.01049599, "balance_loss_clip": 1.03258812, "balance_loss_mlp": 1.04567528, "epoch": 0.3292349316098001, "flos": 27964537073280.0, "grad_norm": 3.056064787303929, "language_loss": 0.76634884, "learning_rate": 3.022599701411205e-06, "loss": 0.78845131, "num_input_tokens_seen": 117600645, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8828125, "step": 5476, "time_per_iteration": 2.655221462249756 }, { "auxiliary_loss_clip": 0.01160209, "auxiliary_loss_mlp": 0.0104532, "balance_loss_clip": 1.03025174, "balance_loss_mlp": 1.05070853, "epoch": 0.3292950548624681, "flos": 20263673462400.0, "grad_norm": 1.7330589650653145, "language_loss": 0.74409688, "learning_rate": 3.0222749817560252e-06, "loss": 0.76615214, "num_input_tokens_seen": 117618880, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.828125, "step": 5477, "time_per_iteration": 2.7217798233032227 }, { "auxiliary_loss_clip": 0.01127045, "auxiliary_loss_mlp": 0.01040986, "balance_loss_clip": 1.02603769, "balance_loss_mlp": 1.04733825, "epoch": 0.32935517811513604, "flos": 20812999132800.0, "grad_norm": 2.333929428277253, "language_loss": 0.75809693, "learning_rate": 3.0219502256184804e-06, "loss": 0.77977729, "num_input_tokens_seen": 117636445, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.796875, "step": 5478, "time_per_iteration": 2.563401699066162 }, { "auxiliary_loss_clip": 0.01141594, "auxiliary_loss_mlp": 0.01041617, "balance_loss_clip": 1.02680552, "balance_loss_mlp": 1.051157, "epoch": 0.329415301367804, "flos": 18441853603200.0, "grad_norm": 1.961768116406801, "language_loss": 0.80196059, "learning_rate": 3.0216254330101617e-06, "loss": 0.8237927, "num_input_tokens_seen": 117653105, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 5479, "time_per_iteration": 2.636204481124878 }, { "auxiliary_loss_clip": 0.0107156, "auxiliary_loss_mlp": 0.01015636, "balance_loss_clip": 1.01277471, "balance_loss_mlp": 1.02697062, "epoch": 0.32947542462047197, "flos": 66323024887680.0, "grad_norm": 0.7502662568642742, "language_loss": 0.56566912, "learning_rate": 3.0213006039426587e-06, "loss": 0.58654106, "num_input_tokens_seen": 117719225, "router_z_loss_clip": 0.02856445, "router_z_loss_mlp": 0.265625, "step": 5480, "time_per_iteration": 3.2333014011383057 }, { "auxiliary_loss_clip": 0.01140927, "auxiliary_loss_mlp": 0.01297491, "balance_loss_clip": 1.03484035, "balance_loss_mlp": 1.04878747, "epoch": 0.32953554787313993, "flos": 23221599569280.0, "grad_norm": 2.0431766855274396, "language_loss": 0.77304423, "learning_rate": 3.0209757384275643e-06, "loss": 0.79742837, "num_input_tokens_seen": 117738725, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.83203125, "step": 5481, "time_per_iteration": 2.6303253173828125 }, { "auxiliary_loss_clip": 0.01152288, "auxiliary_loss_mlp": 0.01044508, "balance_loss_clip": 1.02940416, "balance_loss_mlp": 1.05016232, "epoch": 0.3295956711258079, "flos": 27009492307200.0, "grad_norm": 1.5226952000129195, "language_loss": 0.78446245, "learning_rate": 3.020650836476472e-06, "loss": 0.8064304, "num_input_tokens_seen": 117757765, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.84375, "step": 5482, "time_per_iteration": 2.7566299438476562 }, { "auxiliary_loss_clip": 0.01141276, "auxiliary_loss_mlp": 0.01048785, "balance_loss_clip": 1.03259695, "balance_loss_mlp": 1.04810476, "epoch": 0.32965579437847586, "flos": 19171702051200.0, "grad_norm": 1.705868130366423, "language_loss": 0.73960823, "learning_rate": 3.0203258981009767e-06, "loss": 0.76150882, "num_input_tokens_seen": 117776810, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84375, "step": 5483, "time_per_iteration": 2.563641309738159 }, { "auxiliary_loss_clip": 0.01140273, "auxiliary_loss_mlp": 0.01039168, "balance_loss_clip": 1.02539301, "balance_loss_mlp": 1.04913902, "epoch": 0.32971591763114383, "flos": 30482521401600.0, "grad_norm": 1.6131406129215224, "language_loss": 0.75928521, "learning_rate": 3.0200009233126745e-06, "loss": 0.78107965, "num_input_tokens_seen": 117797730, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.8203125, "step": 5484, "time_per_iteration": 2.7249794006347656 }, { "auxiliary_loss_clip": 0.01132219, "auxiliary_loss_mlp": 0.0104583, "balance_loss_clip": 1.03016639, "balance_loss_mlp": 1.05005431, "epoch": 0.3297760408838118, "flos": 16289583598080.0, "grad_norm": 1.9696355914736676, "language_loss": 0.7178632, "learning_rate": 3.0196759121231636e-06, "loss": 0.73964369, "num_input_tokens_seen": 117815365, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 5485, "time_per_iteration": 2.4907424449920654 }, { "auxiliary_loss_clip": 0.01173065, "auxiliary_loss_mlp": 0.0104061, "balance_loss_clip": 1.02634656, "balance_loss_mlp": 1.0480423, "epoch": 0.32983616413647976, "flos": 29530924341120.0, "grad_norm": 2.1176066028974536, "language_loss": 0.804322, "learning_rate": 3.0193508645440424e-06, "loss": 0.82645875, "num_input_tokens_seen": 117836095, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 5486, "time_per_iteration": 2.730365753173828 }, { "auxiliary_loss_clip": 0.0114496, "auxiliary_loss_mlp": 0.01041802, "balance_loss_clip": 1.02641225, "balance_loss_mlp": 1.04673886, "epoch": 0.3298962873891478, "flos": 20631398947200.0, "grad_norm": 1.9058183345605675, "language_loss": 0.84196115, "learning_rate": 3.0190257805869106e-06, "loss": 0.86382878, "num_input_tokens_seen": 117854655, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.80078125, "step": 5487, "time_per_iteration": 2.6070947647094727 }, { "auxiliary_loss_clip": 0.01172536, "auxiliary_loss_mlp": 0.01046409, "balance_loss_clip": 1.02963674, "balance_loss_mlp": 1.0507853, "epoch": 0.32995641064181574, "flos": 14976007228800.0, "grad_norm": 2.2244847744865055, "language_loss": 0.74944264, "learning_rate": 3.01870066026337e-06, "loss": 0.77163208, "num_input_tokens_seen": 117873300, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.859375, "step": 5488, "time_per_iteration": 2.7129898071289062 }, { "auxiliary_loss_clip": 0.01132958, "auxiliary_loss_mlp": 0.01040052, "balance_loss_clip": 1.02457285, "balance_loss_mlp": 1.05041313, "epoch": 0.3300165338944837, "flos": 18661447399680.0, "grad_norm": 2.7396773102525667, "language_loss": 0.72276783, "learning_rate": 3.018375503585023e-06, "loss": 0.7444979, "num_input_tokens_seen": 117891540, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.82421875, "step": 5489, "time_per_iteration": 2.5150084495544434 }, { "auxiliary_loss_clip": 0.0112829, "auxiliary_loss_mlp": 0.01035207, "balance_loss_clip": 1.01959062, "balance_loss_mlp": 1.04651093, "epoch": 0.3300766571471517, "flos": 25583730785280.0, "grad_norm": 3.253473923258049, "language_loss": 0.88588876, "learning_rate": 3.018050310563474e-06, "loss": 0.90752369, "num_input_tokens_seen": 117907690, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.81640625, "step": 5490, "time_per_iteration": 2.707688331604004 }, { "auxiliary_loss_clip": 0.01136637, "auxiliary_loss_mlp": 0.01034851, "balance_loss_clip": 1.01965213, "balance_loss_mlp": 1.04691613, "epoch": 0.33013678039981964, "flos": 11363501623680.0, "grad_norm": 1.8340623426414624, "language_loss": 0.82911587, "learning_rate": 3.0177250812103286e-06, "loss": 0.85083079, "num_input_tokens_seen": 117925640, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 5491, "time_per_iteration": 2.4863386154174805 }, { "auxiliary_loss_clip": 0.01147947, "auxiliary_loss_mlp": 0.01044422, "balance_loss_clip": 1.02881145, "balance_loss_mlp": 1.0487318, "epoch": 0.3301969036524876, "flos": 24821203939200.0, "grad_norm": 1.825757483565839, "language_loss": 0.77845514, "learning_rate": 3.017399815537193e-06, "loss": 0.80037892, "num_input_tokens_seen": 117944525, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8125, "step": 5492, "time_per_iteration": 2.6704890727996826 }, { "auxiliary_loss_clip": 0.0113654, "auxiliary_loss_mlp": 0.01042485, "balance_loss_clip": 1.02577186, "balance_loss_mlp": 1.05215228, "epoch": 0.33025702690515557, "flos": 15961144613760.0, "grad_norm": 3.2430379249899297, "language_loss": 0.7474702, "learning_rate": 3.0170745135556744e-06, "loss": 0.76926041, "num_input_tokens_seen": 117962515, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.84375, "step": 5493, "time_per_iteration": 2.5080177783966064 }, { "auxiliary_loss_clip": 0.01089574, "auxiliary_loss_mlp": 0.01008227, "balance_loss_clip": 1.00561607, "balance_loss_mlp": 1.02706754, "epoch": 0.33031715015782354, "flos": 59416755989760.0, "grad_norm": 0.7802485080712477, "language_loss": 0.53881502, "learning_rate": 3.0167491752773826e-06, "loss": 0.55979311, "num_input_tokens_seen": 118018780, "router_z_loss_clip": 0.02612305, "router_z_loss_mlp": 0.26171875, "step": 5494, "time_per_iteration": 3.2372937202453613 }, { "auxiliary_loss_clip": 0.01150913, "auxiliary_loss_mlp": 0.01038441, "balance_loss_clip": 1.0227294, "balance_loss_mlp": 1.05212665, "epoch": 0.3303772734104915, "flos": 23184360144000.0, "grad_norm": 1.4686121119054816, "language_loss": 0.87147957, "learning_rate": 3.0164238007139285e-06, "loss": 0.89337313, "num_input_tokens_seen": 118038610, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.80859375, "step": 5495, "time_per_iteration": 2.6139559745788574 }, { "auxiliary_loss_clip": 0.01161906, "auxiliary_loss_mlp": 0.01049231, "balance_loss_clip": 1.03212452, "balance_loss_mlp": 1.05077529, "epoch": 0.33043739666315947, "flos": 33071896010880.0, "grad_norm": 1.9245698703007343, "language_loss": 0.73865783, "learning_rate": 3.0160983898769233e-06, "loss": 0.76076913, "num_input_tokens_seen": 118055905, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.84375, "step": 5496, "time_per_iteration": 2.7542576789855957 }, { "auxiliary_loss_clip": 0.01148623, "auxiliary_loss_mlp": 0.01033024, "balance_loss_clip": 1.01816499, "balance_loss_mlp": 1.04982781, "epoch": 0.33049751991582743, "flos": 24895431394560.0, "grad_norm": 2.1048109759895177, "language_loss": 0.72437572, "learning_rate": 3.015772942777981e-06, "loss": 0.74619216, "num_input_tokens_seen": 118073695, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80859375, "step": 5497, "time_per_iteration": 2.658750534057617 }, { "auxiliary_loss_clip": 0.01148535, "auxiliary_loss_mlp": 0.01037211, "balance_loss_clip": 1.02144015, "balance_loss_mlp": 1.05013895, "epoch": 0.3305576431684954, "flos": 29460575554560.0, "grad_norm": 1.6690940044273825, "language_loss": 0.79992592, "learning_rate": 3.015447459428714e-06, "loss": 0.82178336, "num_input_tokens_seen": 118094030, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8046875, "step": 5498, "time_per_iteration": 2.6451005935668945 }, { "auxiliary_loss_clip": 0.01139419, "auxiliary_loss_mlp": 0.0104306, "balance_loss_clip": 1.0270381, "balance_loss_mlp": 1.0467726, "epoch": 0.33061776642116336, "flos": 22632305040000.0, "grad_norm": 1.7989737262461492, "language_loss": 0.75609988, "learning_rate": 3.01512193984074e-06, "loss": 0.77792466, "num_input_tokens_seen": 118111665, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8359375, "step": 5499, "time_per_iteration": 4.01926326751709 }, { "auxiliary_loss_clip": 0.01148006, "auxiliary_loss_mlp": 0.01039918, "balance_loss_clip": 1.02467132, "balance_loss_mlp": 1.04872441, "epoch": 0.3306778896738313, "flos": 25776320532480.0, "grad_norm": 1.6096206436415608, "language_loss": 0.78760952, "learning_rate": 3.0147963840256748e-06, "loss": 0.80948877, "num_input_tokens_seen": 118132435, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.81640625, "step": 5500, "time_per_iteration": 2.6804988384246826 }, { "auxiliary_loss_clip": 0.01143253, "auxiliary_loss_mlp": 0.01036611, "balance_loss_clip": 1.02056503, "balance_loss_mlp": 1.05157208, "epoch": 0.33073801292649935, "flos": 36940552479360.0, "grad_norm": 2.1132800838066172, "language_loss": 0.66195524, "learning_rate": 3.0144707919951376e-06, "loss": 0.68375391, "num_input_tokens_seen": 118155255, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83203125, "step": 5501, "time_per_iteration": 2.735637664794922 }, { "auxiliary_loss_clip": 0.01142538, "auxiliary_loss_mlp": 0.01040468, "balance_loss_clip": 1.02354026, "balance_loss_mlp": 1.04965591, "epoch": 0.3307981361791673, "flos": 12967738848000.0, "grad_norm": 2.0992220639869474, "language_loss": 0.77422917, "learning_rate": 3.014145163760747e-06, "loss": 0.79605925, "num_input_tokens_seen": 118169865, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.83984375, "step": 5502, "time_per_iteration": 3.8968231678009033 }, { "auxiliary_loss_clip": 0.01155848, "auxiliary_loss_mlp": 0.01041117, "balance_loss_clip": 1.02501142, "balance_loss_mlp": 1.05357933, "epoch": 0.3308582594318353, "flos": 25374372364800.0, "grad_norm": 1.5906166098997294, "language_loss": 0.72397441, "learning_rate": 3.013819499334124e-06, "loss": 0.74594402, "num_input_tokens_seen": 118190760, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84375, "step": 5503, "time_per_iteration": 2.645782947540283 }, { "auxiliary_loss_clip": 0.01149252, "auxiliary_loss_mlp": 0.0103622, "balance_loss_clip": 1.02022183, "balance_loss_mlp": 1.0481137, "epoch": 0.33091838268450324, "flos": 26468570419200.0, "grad_norm": 1.52571692592084, "language_loss": 0.75136709, "learning_rate": 3.0134937987268913e-06, "loss": 0.77322185, "num_input_tokens_seen": 118213620, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83203125, "step": 5504, "time_per_iteration": 4.154864072799683 }, { "auxiliary_loss_clip": 0.01150407, "auxiliary_loss_mlp": 0.01042422, "balance_loss_clip": 1.02693665, "balance_loss_mlp": 1.04921877, "epoch": 0.3309785059371712, "flos": 24971167221120.0, "grad_norm": 1.5188059862449095, "language_loss": 0.69536787, "learning_rate": 3.013168061950672e-06, "loss": 0.71729624, "num_input_tokens_seen": 118235010, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83203125, "step": 5505, "time_per_iteration": 2.682365894317627 }, { "auxiliary_loss_clip": 0.01141604, "auxiliary_loss_mlp": 0.01048736, "balance_loss_clip": 1.03326249, "balance_loss_mlp": 1.05185962, "epoch": 0.3310386291898392, "flos": 20446710192000.0, "grad_norm": 1.6014569183525031, "language_loss": 0.81921774, "learning_rate": 3.0128422890170908e-06, "loss": 0.84112114, "num_input_tokens_seen": 118255820, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8046875, "step": 5506, "time_per_iteration": 2.56710147857666 }, { "auxiliary_loss_clip": 0.0114327, "auxiliary_loss_mlp": 0.01044487, "balance_loss_clip": 1.02870333, "balance_loss_mlp": 1.05248237, "epoch": 0.33109875244250714, "flos": 23182672204800.0, "grad_norm": 1.6864321409070837, "language_loss": 0.79031438, "learning_rate": 3.0125164799377727e-06, "loss": 0.8121919, "num_input_tokens_seen": 118274160, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.81640625, "step": 5507, "time_per_iteration": 4.1904284954071045 }, { "auxiliary_loss_clip": 0.01146469, "auxiliary_loss_mlp": 0.01049869, "balance_loss_clip": 1.03416908, "balance_loss_mlp": 1.04677081, "epoch": 0.3311588756951751, "flos": 24168384207360.0, "grad_norm": 1.466864708092067, "language_loss": 0.71781456, "learning_rate": 3.0121906347243473e-06, "loss": 0.73977792, "num_input_tokens_seen": 118294385, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8203125, "step": 5508, "time_per_iteration": 2.685657024383545 }, { "auxiliary_loss_clip": 0.01147621, "auxiliary_loss_mlp": 0.01037749, "balance_loss_clip": 1.02315819, "balance_loss_mlp": 1.05044329, "epoch": 0.33121899894784307, "flos": 28145742209280.0, "grad_norm": 1.8234819667906812, "language_loss": 0.71835494, "learning_rate": 3.011864753388441e-06, "loss": 0.74020869, "num_input_tokens_seen": 118313105, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.79296875, "step": 5509, "time_per_iteration": 2.6517043113708496 }, { "auxiliary_loss_clip": 0.01140529, "auxiliary_loss_mlp": 0.01035913, "balance_loss_clip": 1.02067769, "balance_loss_mlp": 1.0497191, "epoch": 0.33127912220051103, "flos": 29567660976000.0, "grad_norm": 1.7887058716477027, "language_loss": 0.72777539, "learning_rate": 3.0115388359416845e-06, "loss": 0.74953979, "num_input_tokens_seen": 118335250, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.81640625, "step": 5510, "time_per_iteration": 2.6799566745758057 }, { "auxiliary_loss_clip": 0.01156226, "auxiliary_loss_mlp": 0.01043579, "balance_loss_clip": 1.02892864, "balance_loss_mlp": 1.04905391, "epoch": 0.331339245453179, "flos": 14428836374400.0, "grad_norm": 2.4141302026371196, "language_loss": 0.87896788, "learning_rate": 3.011212882395709e-06, "loss": 0.90096593, "num_input_tokens_seen": 118351470, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80078125, "step": 5511, "time_per_iteration": 2.5628888607025146 }, { "auxiliary_loss_clip": 0.01162631, "auxiliary_loss_mlp": 0.01034763, "balance_loss_clip": 1.02082181, "balance_loss_mlp": 1.05035639, "epoch": 0.33139936870584696, "flos": 20887118847360.0, "grad_norm": 1.6161682784791567, "language_loss": 0.73018539, "learning_rate": 3.010886892762147e-06, "loss": 0.75215936, "num_input_tokens_seen": 118370970, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 5512, "time_per_iteration": 2.6433000564575195 }, { "auxiliary_loss_clip": 0.01156168, "auxiliary_loss_mlp": 0.01044583, "balance_loss_clip": 1.02935958, "balance_loss_mlp": 1.0495975, "epoch": 0.33145949195851493, "flos": 36284356869120.0, "grad_norm": 1.888520260073367, "language_loss": 0.72314644, "learning_rate": 3.0105608670526317e-06, "loss": 0.74515396, "num_input_tokens_seen": 118393125, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.796875, "step": 5513, "time_per_iteration": 2.7078235149383545 }, { "auxiliary_loss_clip": 0.0116116, "auxiliary_loss_mlp": 0.01037172, "balance_loss_clip": 1.02129316, "balance_loss_mlp": 1.04967666, "epoch": 0.33151961521118295, "flos": 14279735018880.0, "grad_norm": 1.9663733353344712, "language_loss": 0.68359298, "learning_rate": 3.010234805278799e-06, "loss": 0.7055763, "num_input_tokens_seen": 118410860, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84765625, "step": 5514, "time_per_iteration": 2.561185359954834 }, { "auxiliary_loss_clip": 0.01150955, "auxiliary_loss_mlp": 0.01048505, "balance_loss_clip": 1.03063571, "balance_loss_mlp": 1.04985881, "epoch": 0.3315797384638509, "flos": 20774323163520.0, "grad_norm": 1.9406869719673037, "language_loss": 0.66666043, "learning_rate": 3.0099087074522844e-06, "loss": 0.68865502, "num_input_tokens_seen": 118429570, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.83203125, "step": 5515, "time_per_iteration": 2.599766254425049 }, { "auxiliary_loss_clip": 0.01138986, "auxiliary_loss_mlp": 0.01040624, "balance_loss_clip": 1.02500784, "balance_loss_mlp": 1.04771614, "epoch": 0.3316398617165189, "flos": 24679464871680.0, "grad_norm": 1.563130056701649, "language_loss": 0.69429463, "learning_rate": 3.009582573584726e-06, "loss": 0.7160908, "num_input_tokens_seen": 118450285, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.82421875, "step": 5516, "time_per_iteration": 2.617149591445923 }, { "auxiliary_loss_clip": 0.01142427, "auxiliary_loss_mlp": 0.01042338, "balance_loss_clip": 1.02663779, "balance_loss_mlp": 1.04858768, "epoch": 0.33169998496918685, "flos": 18587974129920.0, "grad_norm": 2.128751847501041, "language_loss": 0.8092072, "learning_rate": 3.0092564036877624e-06, "loss": 0.83105487, "num_input_tokens_seen": 118468270, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84765625, "step": 5517, "time_per_iteration": 2.5632057189941406 }, { "auxiliary_loss_clip": 0.0113671, "auxiliary_loss_mlp": 0.01035919, "balance_loss_clip": 1.0214709, "balance_loss_mlp": 1.04920208, "epoch": 0.3317601082218548, "flos": 20193647898240.0, "grad_norm": 1.8041601992892746, "language_loss": 0.74188089, "learning_rate": 3.0089301977730343e-06, "loss": 0.7636072, "num_input_tokens_seen": 118486615, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 5518, "time_per_iteration": 2.5717554092407227 }, { "auxiliary_loss_clip": 0.01072386, "auxiliary_loss_mlp": 0.01005145, "balance_loss_clip": 1.00273657, "balance_loss_mlp": 1.02767992, "epoch": 0.3318202314745228, "flos": 68974703637120.0, "grad_norm": 0.6243263555973751, "language_loss": 0.54332811, "learning_rate": 3.008603955852182e-06, "loss": 0.56410336, "num_input_tokens_seen": 118553580, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.265625, "step": 5519, "time_per_iteration": 3.2988064289093018 }, { "auxiliary_loss_clip": 0.01139647, "auxiliary_loss_mlp": 0.01039384, "balance_loss_clip": 1.02290952, "balance_loss_mlp": 1.0490768, "epoch": 0.33188035472719074, "flos": 21500113374720.0, "grad_norm": 2.141147050626537, "language_loss": 0.78434551, "learning_rate": 3.00827767793685e-06, "loss": 0.80613589, "num_input_tokens_seen": 118570280, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.81640625, "step": 5520, "time_per_iteration": 2.588463068008423 }, { "auxiliary_loss_clip": 0.01145563, "auxiliary_loss_mlp": 0.0103569, "balance_loss_clip": 1.0209676, "balance_loss_mlp": 1.04890978, "epoch": 0.3319404779798587, "flos": 28870490926080.0, "grad_norm": 2.151681043798962, "language_loss": 0.76444805, "learning_rate": 3.0079513640386806e-06, "loss": 0.78626055, "num_input_tokens_seen": 118590455, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7890625, "step": 5521, "time_per_iteration": 2.7642173767089844 }, { "auxiliary_loss_clip": 0.01138991, "auxiliary_loss_mlp": 0.01044744, "balance_loss_clip": 1.02832913, "balance_loss_mlp": 1.04627681, "epoch": 0.33200060123252667, "flos": 23076915586560.0, "grad_norm": 2.20281718076745, "language_loss": 0.69924498, "learning_rate": 3.00762501416932e-06, "loss": 0.72108239, "num_input_tokens_seen": 118609495, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8359375, "step": 5522, "time_per_iteration": 2.607611656188965 }, { "auxiliary_loss_clip": 0.01145734, "auxiliary_loss_mlp": 0.01036911, "balance_loss_clip": 1.0223918, "balance_loss_mlp": 1.04823399, "epoch": 0.33206072448519464, "flos": 21142479611520.0, "grad_norm": 1.831500167369775, "language_loss": 0.73652923, "learning_rate": 3.007298628340414e-06, "loss": 0.75835574, "num_input_tokens_seen": 118628720, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 5523, "time_per_iteration": 2.6529908180236816 }, { "auxiliary_loss_clip": 0.01144016, "auxiliary_loss_mlp": 0.01039074, "balance_loss_clip": 1.02375007, "balance_loss_mlp": 1.04671371, "epoch": 0.3321208477378626, "flos": 13079097987840.0, "grad_norm": 1.6747817036178767, "language_loss": 0.81948119, "learning_rate": 3.0069722065636114e-06, "loss": 0.84131217, "num_input_tokens_seen": 118645955, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.79296875, "step": 5524, "time_per_iteration": 2.538243293762207 }, { "auxiliary_loss_clip": 0.01154141, "auxiliary_loss_mlp": 0.01287447, "balance_loss_clip": 1.02629066, "balance_loss_mlp": 1.04798651, "epoch": 0.33218097099053057, "flos": 21215414177280.0, "grad_norm": 1.5426088539782978, "language_loss": 0.8251887, "learning_rate": 3.006645748850561e-06, "loss": 0.84960455, "num_input_tokens_seen": 118665605, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.796875, "step": 5525, "time_per_iteration": 2.661163091659546 }, { "auxiliary_loss_clip": 0.01052739, "auxiliary_loss_mlp": 0.01006368, "balance_loss_clip": 1.00419855, "balance_loss_mlp": 1.02660441, "epoch": 0.33224109424319853, "flos": 64348979189760.0, "grad_norm": 0.7611461045762513, "language_loss": 0.52518499, "learning_rate": 3.006319255212913e-06, "loss": 0.54577601, "num_input_tokens_seen": 118728155, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.26171875, "step": 5526, "time_per_iteration": 3.1817409992218018 }, { "auxiliary_loss_clip": 0.01157482, "auxiliary_loss_mlp": 0.0129619, "balance_loss_clip": 1.03286123, "balance_loss_mlp": 1.04745817, "epoch": 0.33230121749586655, "flos": 17346003523200.0, "grad_norm": 2.0777073366705925, "language_loss": 0.77389193, "learning_rate": 3.0059927256623195e-06, "loss": 0.79842865, "num_input_tokens_seen": 118743955, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.828125, "step": 5527, "time_per_iteration": 2.5961496829986572 }, { "auxiliary_loss_clip": 0.01166326, "auxiliary_loss_mlp": 0.01052008, "balance_loss_clip": 1.03731561, "balance_loss_mlp": 1.05017066, "epoch": 0.3323613407485345, "flos": 20997041443200.0, "grad_norm": 1.7145583201534471, "language_loss": 0.71827716, "learning_rate": 3.005666160210434e-06, "loss": 0.74046052, "num_input_tokens_seen": 118763275, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 5528, "time_per_iteration": 2.598494529724121 }, { "auxiliary_loss_clip": 0.01145937, "auxiliary_loss_mlp": 0.01036421, "balance_loss_clip": 1.02236676, "balance_loss_mlp": 1.04653072, "epoch": 0.3324214640012025, "flos": 13152535344000.0, "grad_norm": 1.663212077474246, "language_loss": 0.82630569, "learning_rate": 3.005339558868909e-06, "loss": 0.84812927, "num_input_tokens_seen": 118781110, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.81640625, "step": 5529, "time_per_iteration": 2.5761635303497314 }, { "auxiliary_loss_clip": 0.01150031, "auxiliary_loss_mlp": 0.01286224, "balance_loss_clip": 1.02365887, "balance_loss_mlp": 1.04807091, "epoch": 0.33248158725387045, "flos": 22273522041600.0, "grad_norm": 2.059962211505333, "language_loss": 0.6948809, "learning_rate": 3.0050129216494017e-06, "loss": 0.71924347, "num_input_tokens_seen": 118800620, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83984375, "step": 5530, "time_per_iteration": 2.5547292232513428 }, { "auxiliary_loss_clip": 0.01138772, "auxiliary_loss_mlp": 0.01050658, "balance_loss_clip": 1.03362274, "balance_loss_mlp": 1.04786825, "epoch": 0.3325417105065384, "flos": 20740998320640.0, "grad_norm": 2.456191158942166, "language_loss": 0.76106131, "learning_rate": 3.004686248563569e-06, "loss": 0.78295565, "num_input_tokens_seen": 118818725, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8203125, "step": 5531, "time_per_iteration": 2.604008913040161 }, { "auxiliary_loss_clip": 0.01148607, "auxiliary_loss_mlp": 0.01043005, "balance_loss_clip": 1.02771044, "balance_loss_mlp": 1.04796779, "epoch": 0.3326018337592064, "flos": 24790536702720.0, "grad_norm": 1.7527361833915829, "language_loss": 0.73710519, "learning_rate": 3.0043595396230675e-06, "loss": 0.75902128, "num_input_tokens_seen": 118839390, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 5532, "time_per_iteration": 2.6706783771514893 }, { "auxiliary_loss_clip": 0.01149151, "auxiliary_loss_mlp": 0.01033456, "balance_loss_clip": 1.01906753, "balance_loss_mlp": 1.05006564, "epoch": 0.33266195701187434, "flos": 14501699112960.0, "grad_norm": 4.057237845877337, "language_loss": 0.66097254, "learning_rate": 3.004032794839558e-06, "loss": 0.68279862, "num_input_tokens_seen": 118856275, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8203125, "step": 5533, "time_per_iteration": 2.5808675289154053 }, { "auxiliary_loss_clip": 0.01171333, "auxiliary_loss_mlp": 0.01040321, "balance_loss_clip": 1.02448976, "balance_loss_mlp": 1.04603076, "epoch": 0.3327220802645423, "flos": 15304410299520.0, "grad_norm": 2.5574037620703445, "language_loss": 0.71115053, "learning_rate": 3.0037060142247006e-06, "loss": 0.73326707, "num_input_tokens_seen": 118873830, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.80859375, "step": 5534, "time_per_iteration": 2.5922117233276367 }, { "auxiliary_loss_clip": 0.01154838, "auxiliary_loss_mlp": 0.01036384, "balance_loss_clip": 1.02147126, "balance_loss_mlp": 1.04783881, "epoch": 0.3327822035172103, "flos": 23477534951040.0, "grad_norm": 1.627645092761764, "language_loss": 0.6691044, "learning_rate": 3.0033791977901582e-06, "loss": 0.69101661, "num_input_tokens_seen": 118891560, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 5535, "time_per_iteration": 2.653808355331421 }, { "auxiliary_loss_clip": 0.01137292, "auxiliary_loss_mlp": 0.01287284, "balance_loss_clip": 1.0252378, "balance_loss_mlp": 1.04703069, "epoch": 0.33284232676987824, "flos": 25374516019200.0, "grad_norm": 2.5522611021595187, "language_loss": 0.72764391, "learning_rate": 3.0030523455475923e-06, "loss": 0.75188965, "num_input_tokens_seen": 118910260, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 5536, "time_per_iteration": 2.6423213481903076 }, { "auxiliary_loss_clip": 0.01137016, "auxiliary_loss_mlp": 0.01037762, "balance_loss_clip": 1.02278924, "balance_loss_mlp": 1.04697299, "epoch": 0.3329024500225462, "flos": 23694363400320.0, "grad_norm": 1.5056985465570618, "language_loss": 0.81609136, "learning_rate": 3.0027254575086683e-06, "loss": 0.83783913, "num_input_tokens_seen": 118929985, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 5537, "time_per_iteration": 2.616806983947754 }, { "auxiliary_loss_clip": 0.01143341, "auxiliary_loss_mlp": 0.01041443, "balance_loss_clip": 1.02652991, "balance_loss_mlp": 1.05158806, "epoch": 0.33296257327521417, "flos": 31723163205120.0, "grad_norm": 1.778715158204715, "language_loss": 0.71221387, "learning_rate": 3.0023985336850526e-06, "loss": 0.73406172, "num_input_tokens_seen": 118951355, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.828125, "step": 5538, "time_per_iteration": 2.6476852893829346 }, { "auxiliary_loss_clip": 0.01133431, "auxiliary_loss_mlp": 0.01035485, "balance_loss_clip": 1.02047694, "balance_loss_mlp": 1.04602051, "epoch": 0.33302269652788213, "flos": 22744705674240.0, "grad_norm": 2.6036525688172625, "language_loss": 0.74134165, "learning_rate": 3.0020715740884112e-06, "loss": 0.76303077, "num_input_tokens_seen": 118970910, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.78515625, "step": 5539, "time_per_iteration": 2.5544400215148926 }, { "auxiliary_loss_clip": 0.01134786, "auxiliary_loss_mlp": 0.0104378, "balance_loss_clip": 1.02734113, "balance_loss_mlp": 1.04826498, "epoch": 0.33308281978055015, "flos": 11473747441920.0, "grad_norm": 1.9211722470658141, "language_loss": 0.70545387, "learning_rate": 3.001744578730413e-06, "loss": 0.72723949, "num_input_tokens_seen": 118989200, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8671875, "step": 5540, "time_per_iteration": 4.007315158843994 }, { "auxiliary_loss_clip": 0.01134959, "auxiliary_loss_mlp": 0.01035525, "balance_loss_clip": 1.02074301, "balance_loss_mlp": 1.04574227, "epoch": 0.3331429430332181, "flos": 38213693112960.0, "grad_norm": 1.5612102165325823, "language_loss": 0.60488784, "learning_rate": 3.0014175476227284e-06, "loss": 0.6265927, "num_input_tokens_seen": 119011030, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80078125, "step": 5541, "time_per_iteration": 2.8112432956695557 }, { "auxiliary_loss_clip": 0.01148287, "auxiliary_loss_mlp": 0.01040208, "balance_loss_clip": 1.02418602, "balance_loss_mlp": 1.04732084, "epoch": 0.3332030662858861, "flos": 22528667324160.0, "grad_norm": 1.7168516691800753, "language_loss": 0.68866384, "learning_rate": 3.0010904807770267e-06, "loss": 0.71054876, "num_input_tokens_seen": 119030620, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83203125, "step": 5542, "time_per_iteration": 2.6692049503326416 }, { "auxiliary_loss_clip": 0.0113778, "auxiliary_loss_mlp": 0.01042244, "balance_loss_clip": 1.02746165, "balance_loss_mlp": 1.04818487, "epoch": 0.33326318953855405, "flos": 15997773507840.0, "grad_norm": 1.5788759148537028, "language_loss": 0.75781614, "learning_rate": 3.0007633782049808e-06, "loss": 0.77961636, "num_input_tokens_seen": 119048015, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80859375, "step": 5543, "time_per_iteration": 2.567274570465088 }, { "auxiliary_loss_clip": 0.01191201, "auxiliary_loss_mlp": 0.01035512, "balance_loss_clip": 1.02058697, "balance_loss_mlp": 1.05378163, "epoch": 0.333323312791222, "flos": 25593535198080.0, "grad_norm": 1.8342071824779163, "language_loss": 0.74873102, "learning_rate": 3.000436239918264e-06, "loss": 0.77099818, "num_input_tokens_seen": 119066280, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8359375, "step": 5544, "time_per_iteration": 4.076531887054443 }, { "auxiliary_loss_clip": 0.01127304, "auxiliary_loss_mlp": 0.01032872, "balance_loss_clip": 1.01824474, "balance_loss_mlp": 1.04704309, "epoch": 0.33338343604389, "flos": 25119550304640.0, "grad_norm": 1.503975277432619, "language_loss": 0.70416069, "learning_rate": 3.0001090659285514e-06, "loss": 0.72576249, "num_input_tokens_seen": 119087680, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80078125, "step": 5545, "time_per_iteration": 2.6488473415374756 }, { "auxiliary_loss_clip": 0.01154942, "auxiliary_loss_mlp": 0.01038967, "balance_loss_clip": 1.02413702, "balance_loss_mlp": 1.04946458, "epoch": 0.33344355929655795, "flos": 16947287579520.0, "grad_norm": 1.9010303600113763, "language_loss": 0.69111067, "learning_rate": 2.9997818562475194e-06, "loss": 0.71304977, "num_input_tokens_seen": 119105820, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78125, "step": 5546, "time_per_iteration": 4.129936695098877 }, { "auxiliary_loss_clip": 0.01158103, "auxiliary_loss_mlp": 0.01036606, "balance_loss_clip": 1.02171707, "balance_loss_mlp": 1.04844379, "epoch": 0.3335036825492259, "flos": 27889591345920.0, "grad_norm": 2.3889167024445004, "language_loss": 0.64893544, "learning_rate": 2.999454610886844e-06, "loss": 0.67088258, "num_input_tokens_seen": 119126630, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.828125, "step": 5547, "time_per_iteration": 2.669262170791626 }, { "auxiliary_loss_clip": 0.01147942, "auxiliary_loss_mlp": 0.01031576, "balance_loss_clip": 1.01825428, "balance_loss_mlp": 1.05132663, "epoch": 0.3335638058018939, "flos": 16179553261440.0, "grad_norm": 2.4357050700705307, "language_loss": 0.85475087, "learning_rate": 2.999127329858205e-06, "loss": 0.87654614, "num_input_tokens_seen": 119143375, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.78515625, "step": 5548, "time_per_iteration": 2.618741512298584 }, { "auxiliary_loss_clip": 0.01141006, "auxiliary_loss_mlp": 0.0103909, "balance_loss_clip": 1.02313936, "balance_loss_mlp": 1.04970646, "epoch": 0.33362392905456184, "flos": 39896108288640.0, "grad_norm": 2.0626865870037787, "language_loss": 0.74456906, "learning_rate": 2.9988000131732813e-06, "loss": 0.76637, "num_input_tokens_seen": 119166450, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.828125, "step": 5549, "time_per_iteration": 4.236039876937866 }, { "auxiliary_loss_clip": 0.01142759, "auxiliary_loss_mlp": 0.01035925, "balance_loss_clip": 1.02030873, "balance_loss_mlp": 1.051373, "epoch": 0.3336840523072298, "flos": 44271212567040.0, "grad_norm": 2.02885575095874, "language_loss": 0.68351603, "learning_rate": 2.998472660843755e-06, "loss": 0.70530289, "num_input_tokens_seen": 119189645, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.82421875, "step": 5550, "time_per_iteration": 2.7412095069885254 }, { "auxiliary_loss_clip": 0.01159499, "auxiliary_loss_mlp": 0.01041587, "balance_loss_clip": 1.02749014, "balance_loss_mlp": 1.05133748, "epoch": 0.33374417555989777, "flos": 15085678429440.0, "grad_norm": 1.6702332962078152, "language_loss": 0.6089431, "learning_rate": 2.998145272881307e-06, "loss": 0.63095397, "num_input_tokens_seen": 119208045, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8125, "step": 5551, "time_per_iteration": 2.6682653427124023 }, { "auxiliary_loss_clip": 0.01157783, "auxiliary_loss_mlp": 0.01038885, "balance_loss_clip": 1.02350068, "balance_loss_mlp": 1.05081773, "epoch": 0.33380429881256574, "flos": 15849174942720.0, "grad_norm": 1.949140856375598, "language_loss": 0.7027356, "learning_rate": 2.997817849297622e-06, "loss": 0.7247023, "num_input_tokens_seen": 119224910, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.796875, "step": 5552, "time_per_iteration": 2.547898530960083 }, { "auxiliary_loss_clip": 0.01158609, "auxiliary_loss_mlp": 0.01038028, "balance_loss_clip": 1.02352071, "balance_loss_mlp": 1.05066502, "epoch": 0.33386442206523376, "flos": 13480327883520.0, "grad_norm": 1.8204685416077837, "language_loss": 0.82651293, "learning_rate": 2.997490390104385e-06, "loss": 0.84847933, "num_input_tokens_seen": 119243290, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8125, "step": 5553, "time_per_iteration": 2.610384702682495 }, { "auxiliary_loss_clip": 0.01149709, "auxiliary_loss_mlp": 0.01289648, "balance_loss_clip": 1.02676022, "balance_loss_mlp": 1.05091596, "epoch": 0.3339245453179017, "flos": 16690669839360.0, "grad_norm": 2.2056635547668026, "language_loss": 0.8115958, "learning_rate": 2.9971628953132815e-06, "loss": 0.83598942, "num_input_tokens_seen": 119261195, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8125, "step": 5554, "time_per_iteration": 2.5770819187164307 }, { "auxiliary_loss_clip": 0.01141195, "auxiliary_loss_mlp": 0.01040875, "balance_loss_clip": 1.02616501, "balance_loss_mlp": 1.0513413, "epoch": 0.3339846685705697, "flos": 24610624456320.0, "grad_norm": 1.520299459866833, "language_loss": 0.81345719, "learning_rate": 2.9968353649359996e-06, "loss": 0.83527792, "num_input_tokens_seen": 119282845, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80859375, "step": 5555, "time_per_iteration": 2.66180157661438 }, { "auxiliary_loss_clip": 0.01137979, "auxiliary_loss_mlp": 0.01040749, "balance_loss_clip": 1.02672398, "balance_loss_mlp": 1.04901826, "epoch": 0.33404479182323765, "flos": 30953812775040.0, "grad_norm": 1.7050553224280507, "language_loss": 0.74006474, "learning_rate": 2.996507798984227e-06, "loss": 0.76185203, "num_input_tokens_seen": 119304430, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 5556, "time_per_iteration": 2.6563022136688232 }, { "auxiliary_loss_clip": 0.01139435, "auxiliary_loss_mlp": 0.01038436, "balance_loss_clip": 1.02382112, "balance_loss_mlp": 1.05185533, "epoch": 0.3341049150759056, "flos": 23513301918720.0, "grad_norm": 1.362959571405967, "language_loss": 0.82187921, "learning_rate": 2.9961801974696546e-06, "loss": 0.84365791, "num_input_tokens_seen": 119323830, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78515625, "step": 5557, "time_per_iteration": 2.695939779281616 }, { "auxiliary_loss_clip": 0.01149989, "auxiliary_loss_mlp": 0.01038082, "balance_loss_clip": 1.02338386, "balance_loss_mlp": 1.05095911, "epoch": 0.3341650383285736, "flos": 24026824707840.0, "grad_norm": 1.4311714934389268, "language_loss": 0.80120796, "learning_rate": 2.995852560403974e-06, "loss": 0.82308877, "num_input_tokens_seen": 119346340, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 5558, "time_per_iteration": 2.6204581260681152 }, { "auxiliary_loss_clip": 0.0115128, "auxiliary_loss_mlp": 0.01035353, "balance_loss_clip": 1.02132237, "balance_loss_mlp": 1.05199623, "epoch": 0.33422516158124155, "flos": 24901967669760.0, "grad_norm": 1.4678900614267516, "language_loss": 0.81598115, "learning_rate": 2.9955248877988767e-06, "loss": 0.83784747, "num_input_tokens_seen": 119367285, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8125, "step": 5559, "time_per_iteration": 2.672846794128418 }, { "auxiliary_loss_clip": 0.01156926, "auxiliary_loss_mlp": 0.01041672, "balance_loss_clip": 1.0270865, "balance_loss_mlp": 1.05104518, "epoch": 0.3342852848339095, "flos": 18333403464960.0, "grad_norm": 1.7218183289035058, "language_loss": 0.71598601, "learning_rate": 2.9951971796660565e-06, "loss": 0.73797196, "num_input_tokens_seen": 119385370, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.796875, "step": 5560, "time_per_iteration": 2.5403389930725098 }, { "auxiliary_loss_clip": 0.01154982, "auxiliary_loss_mlp": 0.01043994, "balance_loss_clip": 1.0279845, "balance_loss_mlp": 1.05300045, "epoch": 0.3343454080865775, "flos": 30046530119040.0, "grad_norm": 1.494115773739197, "language_loss": 0.75063497, "learning_rate": 2.994869436017209e-06, "loss": 0.77262473, "num_input_tokens_seen": 119409150, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84375, "step": 5561, "time_per_iteration": 2.6852102279663086 }, { "auxiliary_loss_clip": 0.01151692, "auxiliary_loss_mlp": 0.0103567, "balance_loss_clip": 1.02078044, "balance_loss_mlp": 1.05248213, "epoch": 0.33440553133924544, "flos": 16398823835520.0, "grad_norm": 1.5710808092993995, "language_loss": 0.69616699, "learning_rate": 2.9945416568640314e-06, "loss": 0.71804059, "num_input_tokens_seen": 119426475, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8125, "step": 5562, "time_per_iteration": 2.629873275756836 }, { "auxiliary_loss_clip": 0.01141068, "auxiliary_loss_mlp": 0.01044297, "balance_loss_clip": 1.03037989, "balance_loss_mlp": 1.05254102, "epoch": 0.3344656545919134, "flos": 24242072958720.0, "grad_norm": 1.5316340813536322, "language_loss": 0.64857775, "learning_rate": 2.99421384221822e-06, "loss": 0.67043138, "num_input_tokens_seen": 119446900, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.796875, "step": 5563, "time_per_iteration": 2.735032796859741 }, { "auxiliary_loss_clip": 0.0117946, "auxiliary_loss_mlp": 0.01045144, "balance_loss_clip": 1.02952754, "balance_loss_mlp": 1.05155206, "epoch": 0.3345257778445814, "flos": 52118843149440.0, "grad_norm": 1.471666832286301, "language_loss": 0.74163687, "learning_rate": 2.9938859920914735e-06, "loss": 0.76388288, "num_input_tokens_seen": 119470945, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.83203125, "step": 5564, "time_per_iteration": 3.0354177951812744 }, { "auxiliary_loss_clip": 0.01089374, "auxiliary_loss_mlp": 0.01010005, "balance_loss_clip": 1.00739431, "balance_loss_mlp": 1.03579712, "epoch": 0.33458590109724934, "flos": 68048602254720.0, "grad_norm": 0.7801980788145261, "language_loss": 0.55476511, "learning_rate": 2.9935581064954934e-06, "loss": 0.57575893, "num_input_tokens_seen": 119529925, "router_z_loss_clip": 0.02612305, "router_z_loss_mlp": 0.265625, "step": 5565, "time_per_iteration": 3.156519889831543 }, { "auxiliary_loss_clip": 0.0113044, "auxiliary_loss_mlp": 0.01039208, "balance_loss_clip": 1.02499807, "balance_loss_mlp": 1.05240297, "epoch": 0.3346460243499173, "flos": 37414788768000.0, "grad_norm": 1.7959971159441745, "language_loss": 0.64544809, "learning_rate": 2.9932301854419794e-06, "loss": 0.6671446, "num_input_tokens_seen": 119550700, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 5566, "time_per_iteration": 2.733222723007202 }, { "auxiliary_loss_clip": 0.01148498, "auxiliary_loss_mlp": 0.01039019, "balance_loss_clip": 1.02417731, "balance_loss_mlp": 1.05171907, "epoch": 0.3347061476025853, "flos": 18697358021760.0, "grad_norm": 1.7865913951037709, "language_loss": 0.77278328, "learning_rate": 2.9929022289426352e-06, "loss": 0.79465854, "num_input_tokens_seen": 119569295, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 5567, "time_per_iteration": 2.5637240409851074 }, { "auxiliary_loss_clip": 0.01150525, "auxiliary_loss_mlp": 0.0103604, "balance_loss_clip": 1.02038765, "balance_loss_mlp": 1.05173683, "epoch": 0.3347662708552533, "flos": 13917827537280.0, "grad_norm": 1.682768847150815, "language_loss": 0.76402628, "learning_rate": 2.9925742370091645e-06, "loss": 0.78589189, "num_input_tokens_seen": 119587375, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8125, "step": 5568, "time_per_iteration": 2.638350009918213 }, { "auxiliary_loss_clip": 0.01159111, "auxiliary_loss_mlp": 0.01040835, "balance_loss_clip": 1.02595723, "balance_loss_mlp": 1.05047274, "epoch": 0.33482639410792125, "flos": 19750402068480.0, "grad_norm": 1.9548821716406855, "language_loss": 0.70420229, "learning_rate": 2.992246209653272e-06, "loss": 0.72620177, "num_input_tokens_seen": 119604530, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.81640625, "step": 5569, "time_per_iteration": 2.790217638015747 }, { "auxiliary_loss_clip": 0.0115781, "auxiliary_loss_mlp": 0.01037053, "balance_loss_clip": 1.02113903, "balance_loss_mlp": 1.04841542, "epoch": 0.3348865173605892, "flos": 16102991422080.0, "grad_norm": 1.8733922271443721, "language_loss": 0.90046024, "learning_rate": 2.9919181468866653e-06, "loss": 0.92240894, "num_input_tokens_seen": 119621025, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.82421875, "step": 5570, "time_per_iteration": 2.664642095565796 }, { "auxiliary_loss_clip": 0.01156163, "auxiliary_loss_mlp": 0.01032461, "balance_loss_clip": 1.0177145, "balance_loss_mlp": 1.0498147, "epoch": 0.3349466406132572, "flos": 25008945350400.0, "grad_norm": 1.524729502575732, "language_loss": 0.79681921, "learning_rate": 2.9915900487210514e-06, "loss": 0.81870538, "num_input_tokens_seen": 119641725, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.796875, "step": 5571, "time_per_iteration": 2.6812281608581543 }, { "auxiliary_loss_clip": 0.01056433, "auxiliary_loss_mlp": 0.01250053, "balance_loss_clip": 1.00090873, "balance_loss_mlp": 1.02990246, "epoch": 0.33500676386592515, "flos": 54319991564160.0, "grad_norm": 0.9021926605253908, "language_loss": 0.56012201, "learning_rate": 2.991261915168139e-06, "loss": 0.58318686, "num_input_tokens_seen": 119693560, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.265625, "step": 5572, "time_per_iteration": 3.1686930656433105 }, { "auxiliary_loss_clip": 0.01133613, "auxiliary_loss_mlp": 0.01045444, "balance_loss_clip": 1.03028011, "balance_loss_mlp": 1.05377436, "epoch": 0.3350668871185931, "flos": 26797332625920.0, "grad_norm": 3.435252906951395, "language_loss": 0.77864033, "learning_rate": 2.990933746239639e-06, "loss": 0.80043089, "num_input_tokens_seen": 119712935, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.80078125, "step": 5573, "time_per_iteration": 2.70851469039917 }, { "auxiliary_loss_clip": 0.01159706, "auxiliary_loss_mlp": 0.01047192, "balance_loss_clip": 1.0307765, "balance_loss_mlp": 1.0505383, "epoch": 0.3351270103712611, "flos": 33510508986240.0, "grad_norm": 2.4897791835138454, "language_loss": 0.72848177, "learning_rate": 2.9906055419472622e-06, "loss": 0.75055075, "num_input_tokens_seen": 119731680, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.828125, "step": 5574, "time_per_iteration": 2.6724417209625244 }, { "auxiliary_loss_clip": 0.01135648, "auxiliary_loss_mlp": 0.01033518, "balance_loss_clip": 1.01879525, "balance_loss_mlp": 1.0477519, "epoch": 0.33518713362392905, "flos": 26506240807680.0, "grad_norm": 1.593068137516351, "language_loss": 0.87706906, "learning_rate": 2.9902773023027224e-06, "loss": 0.89876068, "num_input_tokens_seen": 119752155, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7890625, "step": 5575, "time_per_iteration": 2.687074899673462 }, { "auxiliary_loss_clip": 0.01152415, "auxiliary_loss_mlp": 0.0104731, "balance_loss_clip": 1.03064418, "balance_loss_mlp": 1.04961824, "epoch": 0.335247256876597, "flos": 17232345912960.0, "grad_norm": 2.6187520805593496, "language_loss": 0.82941282, "learning_rate": 2.9899490273177327e-06, "loss": 0.85141015, "num_input_tokens_seen": 119769195, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8515625, "step": 5576, "time_per_iteration": 2.6145145893096924 }, { "auxiliary_loss_clip": 0.01139421, "auxiliary_loss_mlp": 0.01040613, "balance_loss_clip": 1.02484155, "balance_loss_mlp": 1.0480305, "epoch": 0.335307380129265, "flos": 25629373992960.0, "grad_norm": 1.9877370362822018, "language_loss": 0.73518825, "learning_rate": 2.9896207170040084e-06, "loss": 0.75698858, "num_input_tokens_seen": 119786810, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.828125, "step": 5577, "time_per_iteration": 2.7375261783599854 }, { "auxiliary_loss_clip": 0.01163091, "auxiliary_loss_mlp": 0.01039422, "balance_loss_clip": 1.02336454, "balance_loss_mlp": 1.05505073, "epoch": 0.33536750338193294, "flos": 19680089195520.0, "grad_norm": 1.5869838343847789, "language_loss": 0.81803524, "learning_rate": 2.989292371373266e-06, "loss": 0.84006035, "num_input_tokens_seen": 119805395, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.80859375, "step": 5578, "time_per_iteration": 2.5860202312469482 }, { "auxiliary_loss_clip": 0.01173313, "auxiliary_loss_mlp": 0.01287085, "balance_loss_clip": 1.02438414, "balance_loss_mlp": 1.05128324, "epoch": 0.3354276266346009, "flos": 18332613365760.0, "grad_norm": 1.668002766081858, "language_loss": 0.71917534, "learning_rate": 2.9889639904372246e-06, "loss": 0.74377936, "num_input_tokens_seen": 119823135, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.76953125, "step": 5579, "time_per_iteration": 2.6762731075286865 }, { "auxiliary_loss_clip": 0.01168255, "auxiliary_loss_mlp": 0.01037322, "balance_loss_clip": 1.02197957, "balance_loss_mlp": 1.05105567, "epoch": 0.3354877498872689, "flos": 17858556645120.0, "grad_norm": 1.8966629296816804, "language_loss": 0.81159341, "learning_rate": 2.988635574207602e-06, "loss": 0.83364916, "num_input_tokens_seen": 119842265, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8203125, "step": 5580, "time_per_iteration": 2.571563959121704 }, { "auxiliary_loss_clip": 0.01150844, "auxiliary_loss_mlp": 0.01033532, "balance_loss_clip": 1.0179038, "balance_loss_mlp": 1.05051804, "epoch": 0.3355478731399369, "flos": 24717745791360.0, "grad_norm": 1.738703437884445, "language_loss": 0.77954948, "learning_rate": 2.988307122696119e-06, "loss": 0.80139327, "num_input_tokens_seen": 119862500, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 5581, "time_per_iteration": 2.6893727779388428 }, { "auxiliary_loss_clip": 0.0116133, "auxiliary_loss_mlp": 0.01044531, "balance_loss_clip": 1.02740097, "balance_loss_mlp": 1.04983759, "epoch": 0.33560799639260486, "flos": 16873886136960.0, "grad_norm": 2.2583868380321896, "language_loss": 0.74716741, "learning_rate": 2.9879786359144967e-06, "loss": 0.76922601, "num_input_tokens_seen": 119880160, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.84765625, "step": 5582, "time_per_iteration": 3.9853434562683105 }, { "auxiliary_loss_clip": 0.01149972, "auxiliary_loss_mlp": 0.01043482, "balance_loss_clip": 1.02805638, "balance_loss_mlp": 1.04879785, "epoch": 0.3356681196452728, "flos": 18333511205760.0, "grad_norm": 1.5719897706687413, "language_loss": 0.82184148, "learning_rate": 2.9876501138744577e-06, "loss": 0.84377611, "num_input_tokens_seen": 119899040, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83203125, "step": 5583, "time_per_iteration": 2.552500009536743 }, { "auxiliary_loss_clip": 0.01146796, "auxiliary_loss_mlp": 0.01042876, "balance_loss_clip": 1.02837384, "balance_loss_mlp": 1.04830515, "epoch": 0.3357282428979408, "flos": 34750612085760.0, "grad_norm": 1.63657333503534, "language_loss": 0.77460837, "learning_rate": 2.9873215565877274e-06, "loss": 0.79650515, "num_input_tokens_seen": 119921120, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 5584, "time_per_iteration": 2.828310966491699 }, { "auxiliary_loss_clip": 0.01149568, "auxiliary_loss_mlp": 0.01042289, "balance_loss_clip": 1.0269115, "balance_loss_mlp": 1.04860306, "epoch": 0.33578836615060875, "flos": 21580087006080.0, "grad_norm": 2.259163636281998, "language_loss": 0.76051533, "learning_rate": 2.9869929640660303e-06, "loss": 0.78243387, "num_input_tokens_seen": 119940165, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83203125, "step": 5585, "time_per_iteration": 2.5948219299316406 }, { "auxiliary_loss_clip": 0.01133818, "auxiliary_loss_mlp": 0.01042661, "balance_loss_clip": 1.0268898, "balance_loss_mlp": 1.0463438, "epoch": 0.3358484894032767, "flos": 24530291688960.0, "grad_norm": 1.7906547359357872, "language_loss": 0.77521598, "learning_rate": 2.9866643363210928e-06, "loss": 0.7969808, "num_input_tokens_seen": 119959730, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.78515625, "step": 5586, "time_per_iteration": 4.050603151321411 }, { "auxiliary_loss_clip": 0.01163786, "auxiliary_loss_mlp": 0.01049627, "balance_loss_clip": 1.03182876, "balance_loss_mlp": 1.05178773, "epoch": 0.3359086126559447, "flos": 22455589104000.0, "grad_norm": 1.5803700003287857, "language_loss": 0.80685496, "learning_rate": 2.9863356733646437e-06, "loss": 0.82898909, "num_input_tokens_seen": 119979315, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.8515625, "step": 5587, "time_per_iteration": 4.155569076538086 }, { "auxiliary_loss_clip": 0.01126411, "auxiliary_loss_mlp": 0.01039558, "balance_loss_clip": 1.02530682, "balance_loss_mlp": 1.04912376, "epoch": 0.33596873590861265, "flos": 16543687386240.0, "grad_norm": 1.8618903981077333, "language_loss": 0.6737355, "learning_rate": 2.9860069752084115e-06, "loss": 0.69539523, "num_input_tokens_seen": 119996140, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 5588, "time_per_iteration": 2.5042471885681152 }, { "auxiliary_loss_clip": 0.01148509, "auxiliary_loss_mlp": 0.01293432, "balance_loss_clip": 1.03001893, "balance_loss_mlp": 1.0495019, "epoch": 0.3360288591612806, "flos": 31175812782720.0, "grad_norm": 3.122614940349009, "language_loss": 0.69772089, "learning_rate": 2.985678241864126e-06, "loss": 0.72214031, "num_input_tokens_seen": 120017720, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.80859375, "step": 5589, "time_per_iteration": 2.679579734802246 }, { "auxiliary_loss_clip": 0.01147813, "auxiliary_loss_mlp": 0.01040412, "balance_loss_clip": 1.02377081, "balance_loss_mlp": 1.04842544, "epoch": 0.3360889824139486, "flos": 23696913265920.0, "grad_norm": 1.856344684491191, "language_loss": 0.67229986, "learning_rate": 2.9853494733435204e-06, "loss": 0.69418216, "num_input_tokens_seen": 120036335, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.81640625, "step": 5590, "time_per_iteration": 4.051928997039795 }, { "auxiliary_loss_clip": 0.01154281, "auxiliary_loss_mlp": 0.01041213, "balance_loss_clip": 1.02554917, "balance_loss_mlp": 1.05007923, "epoch": 0.33614910566661654, "flos": 19318109886720.0, "grad_norm": 1.9272584288032235, "language_loss": 0.72989887, "learning_rate": 2.985020669658326e-06, "loss": 0.75185382, "num_input_tokens_seen": 120056120, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.77734375, "step": 5591, "time_per_iteration": 2.5906014442443848 }, { "auxiliary_loss_clip": 0.01128811, "auxiliary_loss_mlp": 0.01043362, "balance_loss_clip": 1.02818656, "balance_loss_mlp": 1.04909968, "epoch": 0.3362092289192845, "flos": 16472261191680.0, "grad_norm": 1.8669118902415234, "language_loss": 0.69612384, "learning_rate": 2.984691830820278e-06, "loss": 0.71784556, "num_input_tokens_seen": 120073650, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.796875, "step": 5592, "time_per_iteration": 2.6191091537475586 }, { "auxiliary_loss_clip": 0.01144778, "auxiliary_loss_mlp": 0.01040451, "balance_loss_clip": 1.02606881, "balance_loss_mlp": 1.04827285, "epoch": 0.33626935217195253, "flos": 24243581329920.0, "grad_norm": 2.0175247080234557, "language_loss": 0.76870954, "learning_rate": 2.9843629568411114e-06, "loss": 0.7905618, "num_input_tokens_seen": 120093260, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7890625, "step": 5593, "time_per_iteration": 2.675938129425049 }, { "auxiliary_loss_clip": 0.01156321, "auxiliary_loss_mlp": 0.0129102, "balance_loss_clip": 1.02596879, "balance_loss_mlp": 1.04694223, "epoch": 0.3363294754246205, "flos": 19718765164800.0, "grad_norm": 2.2202755242670134, "language_loss": 0.72771633, "learning_rate": 2.984034047732563e-06, "loss": 0.75218976, "num_input_tokens_seen": 120111830, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.828125, "step": 5594, "time_per_iteration": 2.6698033809661865 }, { "auxiliary_loss_clip": 0.01169229, "auxiliary_loss_mlp": 0.01290354, "balance_loss_clip": 1.02624822, "balance_loss_mlp": 1.05014777, "epoch": 0.33638959867728846, "flos": 22596286677120.0, "grad_norm": 1.6073528343578833, "language_loss": 0.80195361, "learning_rate": 2.983705103506371e-06, "loss": 0.82654947, "num_input_tokens_seen": 120130470, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.83203125, "step": 5595, "time_per_iteration": 2.568347692489624 }, { "auxiliary_loss_clip": 0.01146675, "auxiliary_loss_mlp": 0.01038477, "balance_loss_clip": 1.02417231, "balance_loss_mlp": 1.0483042, "epoch": 0.3364497219299564, "flos": 20994742972800.0, "grad_norm": 1.562530807415852, "language_loss": 0.81078482, "learning_rate": 2.983376124174274e-06, "loss": 0.83263636, "num_input_tokens_seen": 120150735, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.80859375, "step": 5596, "time_per_iteration": 2.5927462577819824 }, { "auxiliary_loss_clip": 0.01144063, "auxiliary_loss_mlp": 0.01040582, "balance_loss_clip": 1.02658129, "balance_loss_mlp": 1.04621422, "epoch": 0.3365098451826244, "flos": 25228610974080.0, "grad_norm": 5.733960224871695, "language_loss": 0.75741684, "learning_rate": 2.9830471097480133e-06, "loss": 0.77926326, "num_input_tokens_seen": 120173230, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80078125, "step": 5597, "time_per_iteration": 2.5857529640197754 }, { "auxiliary_loss_clip": 0.01137336, "auxiliary_loss_mlp": 0.01037526, "balance_loss_clip": 1.02271974, "balance_loss_mlp": 1.05042052, "epoch": 0.33656996843529235, "flos": 24571697091840.0, "grad_norm": 1.7053661266782292, "language_loss": 0.78883207, "learning_rate": 2.982718060239329e-06, "loss": 0.81058073, "num_input_tokens_seen": 120191860, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78125, "step": 5598, "time_per_iteration": 2.639530658721924 }, { "auxiliary_loss_clip": 0.01133763, "auxiliary_loss_mlp": 0.01038064, "balance_loss_clip": 1.02213776, "balance_loss_mlp": 1.04851317, "epoch": 0.3366300916879603, "flos": 44091120752640.0, "grad_norm": 1.32301623909866, "language_loss": 0.64245212, "learning_rate": 2.9823889756599652e-06, "loss": 0.66417032, "num_input_tokens_seen": 120219195, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8515625, "step": 5599, "time_per_iteration": 2.781857967376709 }, { "auxiliary_loss_clip": 0.01162302, "auxiliary_loss_mlp": 0.01047813, "balance_loss_clip": 1.03055155, "balance_loss_mlp": 1.04974627, "epoch": 0.3366902149406283, "flos": 13879869840000.0, "grad_norm": 2.581164781795032, "language_loss": 0.82225287, "learning_rate": 2.9820598560216653e-06, "loss": 0.84435403, "num_input_tokens_seen": 120232950, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.85546875, "step": 5600, "time_per_iteration": 2.5251882076263428 }, { "auxiliary_loss_clip": 0.0113964, "auxiliary_loss_mlp": 0.01048948, "balance_loss_clip": 1.03213918, "balance_loss_mlp": 1.04674375, "epoch": 0.33675033819329625, "flos": 16253098358400.0, "grad_norm": 2.0924471369368765, "language_loss": 0.83083761, "learning_rate": 2.9817307013361764e-06, "loss": 0.85272348, "num_input_tokens_seen": 120248865, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.83984375, "step": 5601, "time_per_iteration": 2.540565252304077 }, { "auxiliary_loss_clip": 0.01136015, "auxiliary_loss_mlp": 0.01035136, "balance_loss_clip": 1.02082443, "balance_loss_mlp": 1.05100846, "epoch": 0.3368104614459642, "flos": 17055809544960.0, "grad_norm": 1.516356456745473, "language_loss": 0.83439988, "learning_rate": 2.9814015116152437e-06, "loss": 0.85611141, "num_input_tokens_seen": 120267820, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.76171875, "step": 5602, "time_per_iteration": 2.5194091796875 }, { "auxiliary_loss_clip": 0.01130036, "auxiliary_loss_mlp": 0.01048452, "balance_loss_clip": 1.03310394, "balance_loss_mlp": 1.04877436, "epoch": 0.3368705846986322, "flos": 17858628472320.0, "grad_norm": 2.3493889693916956, "language_loss": 0.69358921, "learning_rate": 2.9810722868706154e-06, "loss": 0.71537411, "num_input_tokens_seen": 120286540, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8125, "step": 5603, "time_per_iteration": 2.553136110305786 }, { "auxiliary_loss_clip": 0.0115054, "auxiliary_loss_mlp": 0.01038993, "balance_loss_clip": 1.02311397, "balance_loss_mlp": 1.04925919, "epoch": 0.33693070795130015, "flos": 22929502170240.0, "grad_norm": 2.3194165880562903, "language_loss": 0.8312403, "learning_rate": 2.980743027114041e-06, "loss": 0.85313559, "num_input_tokens_seen": 120307305, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83203125, "step": 5604, "time_per_iteration": 2.5941715240478516 }, { "auxiliary_loss_clip": 0.01156331, "auxiliary_loss_mlp": 0.01040164, "balance_loss_clip": 1.02426124, "balance_loss_mlp": 1.04719341, "epoch": 0.3369908312039681, "flos": 22017443005440.0, "grad_norm": 1.313375953487047, "language_loss": 0.73714626, "learning_rate": 2.98041373235727e-06, "loss": 0.75911117, "num_input_tokens_seen": 120327845, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8203125, "step": 5605, "time_per_iteration": 2.7421231269836426 }, { "auxiliary_loss_clip": 0.01147586, "auxiliary_loss_mlp": 0.01038866, "balance_loss_clip": 1.02333307, "balance_loss_mlp": 1.04612422, "epoch": 0.33705095445663613, "flos": 11801970944640.0, "grad_norm": 3.1769662290102816, "language_loss": 0.83518565, "learning_rate": 2.980084402612056e-06, "loss": 0.85705012, "num_input_tokens_seen": 120343255, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.84375, "step": 5606, "time_per_iteration": 2.5972251892089844 }, { "auxiliary_loss_clip": 0.01142162, "auxiliary_loss_mlp": 0.01036805, "balance_loss_clip": 1.0223037, "balance_loss_mlp": 1.04472709, "epoch": 0.3371110777093041, "flos": 25046400257280.0, "grad_norm": 2.1144452860490848, "language_loss": 0.67767608, "learning_rate": 2.97975503789015e-06, "loss": 0.69946581, "num_input_tokens_seen": 120361745, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 5607, "time_per_iteration": 2.5863659381866455 }, { "auxiliary_loss_clip": 0.01140555, "auxiliary_loss_mlp": 0.01040252, "balance_loss_clip": 1.02424192, "balance_loss_mlp": 1.04755259, "epoch": 0.33717120096197206, "flos": 26579031719040.0, "grad_norm": 1.7985215287109373, "language_loss": 0.70966172, "learning_rate": 2.979425638203307e-06, "loss": 0.73146975, "num_input_tokens_seen": 120380565, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83984375, "step": 5608, "time_per_iteration": 2.601938009262085 }, { "auxiliary_loss_clip": 0.01182513, "auxiliary_loss_mlp": 0.01042597, "balance_loss_clip": 1.0277679, "balance_loss_mlp": 1.04626918, "epoch": 0.33723132421464, "flos": 15158541168000.0, "grad_norm": 1.7715889401729001, "language_loss": 0.78989565, "learning_rate": 2.9790962035632823e-06, "loss": 0.81214678, "num_input_tokens_seen": 120399235, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.82421875, "step": 5609, "time_per_iteration": 2.5691771507263184 }, { "auxiliary_loss_clip": 0.01138595, "auxiliary_loss_mlp": 0.01043787, "balance_loss_clip": 1.02679944, "balance_loss_mlp": 1.04781079, "epoch": 0.337291447467308, "flos": 23436093634560.0, "grad_norm": 1.6076284637491345, "language_loss": 0.82650334, "learning_rate": 2.978766733981833e-06, "loss": 0.84832716, "num_input_tokens_seen": 120420095, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8203125, "step": 5610, "time_per_iteration": 2.5730419158935547 }, { "auxiliary_loss_clip": 0.01135354, "auxiliary_loss_mlp": 0.01043471, "balance_loss_clip": 1.02663898, "balance_loss_mlp": 1.046857, "epoch": 0.33735157071997596, "flos": 17238163916160.0, "grad_norm": 2.02385011838069, "language_loss": 0.82057238, "learning_rate": 2.9784372294707165e-06, "loss": 0.84236062, "num_input_tokens_seen": 120437690, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.796875, "step": 5611, "time_per_iteration": 2.5315475463867188 }, { "auxiliary_loss_clip": 0.01148389, "auxiliary_loss_mlp": 0.0104355, "balance_loss_clip": 1.02615714, "balance_loss_mlp": 1.04809272, "epoch": 0.3374116939726439, "flos": 28257388657920.0, "grad_norm": 2.3840505971901416, "language_loss": 0.79758751, "learning_rate": 2.9781076900416923e-06, "loss": 0.81950688, "num_input_tokens_seen": 120459240, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.82421875, "step": 5612, "time_per_iteration": 2.5962073802948 }, { "auxiliary_loss_clip": 0.01171973, "auxiliary_loss_mlp": 0.0103952, "balance_loss_clip": 1.02365303, "balance_loss_mlp": 1.04478097, "epoch": 0.3374718172253119, "flos": 35919396731520.0, "grad_norm": 2.376019219326733, "language_loss": 0.70001245, "learning_rate": 2.97777811570652e-06, "loss": 0.72212732, "num_input_tokens_seen": 120481090, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.828125, "step": 5613, "time_per_iteration": 2.7459864616394043 }, { "auxiliary_loss_clip": 0.01149146, "auxiliary_loss_mlp": 0.01039669, "balance_loss_clip": 1.02382576, "balance_loss_mlp": 1.04758334, "epoch": 0.33753194047797985, "flos": 18186672407040.0, "grad_norm": 1.8186285165054326, "language_loss": 0.79493189, "learning_rate": 2.977448506476962e-06, "loss": 0.81682003, "num_input_tokens_seen": 120500045, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8359375, "step": 5614, "time_per_iteration": 2.4855282306671143 }, { "auxiliary_loss_clip": 0.01146617, "auxiliary_loss_mlp": 0.01040557, "balance_loss_clip": 1.02328348, "balance_loss_mlp": 1.04611659, "epoch": 0.3375920637306478, "flos": 23148916398720.0, "grad_norm": 1.7992591539804883, "language_loss": 0.91257626, "learning_rate": 2.977118862364781e-06, "loss": 0.934448, "num_input_tokens_seen": 120521125, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.82421875, "step": 5615, "time_per_iteration": 2.5907490253448486 }, { "auxiliary_loss_clip": 0.01131141, "auxiliary_loss_mlp": 0.01036975, "balance_loss_clip": 1.02182317, "balance_loss_mlp": 1.04328752, "epoch": 0.3376521869833158, "flos": 23112215677440.0, "grad_norm": 1.8253513101784486, "language_loss": 0.80853212, "learning_rate": 2.9767891833817424e-06, "loss": 0.83021325, "num_input_tokens_seen": 120539180, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7890625, "step": 5616, "time_per_iteration": 2.533764600753784 }, { "auxiliary_loss_clip": 0.01142301, "auxiliary_loss_mlp": 0.01294165, "balance_loss_clip": 1.02801824, "balance_loss_mlp": 1.04714346, "epoch": 0.33771231023598375, "flos": 19274585581440.0, "grad_norm": 1.6675870675556002, "language_loss": 0.84042871, "learning_rate": 2.976459469539609e-06, "loss": 0.86479342, "num_input_tokens_seen": 120556280, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.86328125, "step": 5617, "time_per_iteration": 2.576324701309204 }, { "auxiliary_loss_clip": 0.01181971, "auxiliary_loss_mlp": 0.01040665, "balance_loss_clip": 1.02506018, "balance_loss_mlp": 1.04583049, "epoch": 0.3377724334886517, "flos": 18150187167360.0, "grad_norm": 1.5916966543342268, "language_loss": 0.80397546, "learning_rate": 2.97612972085015e-06, "loss": 0.8262018, "num_input_tokens_seen": 120575395, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 5618, "time_per_iteration": 2.583350658416748 }, { "auxiliary_loss_clip": 0.01165109, "auxiliary_loss_mlp": 0.010364, "balance_loss_clip": 1.0207603, "balance_loss_mlp": 1.04588854, "epoch": 0.3378325567413197, "flos": 25775997310080.0, "grad_norm": 1.524877296431027, "language_loss": 0.70774162, "learning_rate": 2.9757999373251315e-06, "loss": 0.72975671, "num_input_tokens_seen": 120596075, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.83203125, "step": 5619, "time_per_iteration": 2.644223690032959 }, { "auxiliary_loss_clip": 0.01146839, "auxiliary_loss_mlp": 0.01049004, "balance_loss_clip": 1.03337574, "balance_loss_mlp": 1.04691958, "epoch": 0.3378926799939877, "flos": 21317112558720.0, "grad_norm": 2.3717056530142813, "language_loss": 0.70033634, "learning_rate": 2.9754701189763236e-06, "loss": 0.72229481, "num_input_tokens_seen": 120614195, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 5620, "time_per_iteration": 2.5479650497436523 }, { "auxiliary_loss_clip": 0.01128641, "auxiliary_loss_mlp": 0.01046031, "balance_loss_clip": 1.0308795, "balance_loss_mlp": 1.04810131, "epoch": 0.33795280324665566, "flos": 24900028335360.0, "grad_norm": 1.6972537069977802, "language_loss": 0.68538642, "learning_rate": 2.975140265815496e-06, "loss": 0.70713305, "num_input_tokens_seen": 120634475, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8046875, "step": 5621, "time_per_iteration": 2.5805108547210693 }, { "auxiliary_loss_clip": 0.01144083, "auxiliary_loss_mlp": 0.01040647, "balance_loss_clip": 1.02562666, "balance_loss_mlp": 1.04614687, "epoch": 0.33801292649932363, "flos": 24753943722240.0, "grad_norm": 1.695233576081416, "language_loss": 0.82687771, "learning_rate": 2.9748103778544213e-06, "loss": 0.84872508, "num_input_tokens_seen": 120654980, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 5622, "time_per_iteration": 2.569035768508911 }, { "auxiliary_loss_clip": 0.01151268, "auxiliary_loss_mlp": 0.0104095, "balance_loss_clip": 1.02637053, "balance_loss_mlp": 1.04463184, "epoch": 0.3380730497519916, "flos": 26723967096960.0, "grad_norm": 1.5661931581637831, "language_loss": 0.73701519, "learning_rate": 2.974480455104871e-06, "loss": 0.75893742, "num_input_tokens_seen": 120676245, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 5623, "time_per_iteration": 2.6524553298950195 }, { "auxiliary_loss_clip": 0.01067811, "auxiliary_loss_mlp": 0.01019014, "balance_loss_clip": 1.01688051, "balance_loss_mlp": 1.02401495, "epoch": 0.33813317300465956, "flos": 70035756416640.0, "grad_norm": 0.7509185802313377, "language_loss": 0.54967678, "learning_rate": 2.9741504975786206e-06, "loss": 0.57054496, "num_input_tokens_seen": 120741965, "router_z_loss_clip": 0.0213623, "router_z_loss_mlp": 0.2578125, "step": 5624, "time_per_iteration": 4.801892518997192 }, { "auxiliary_loss_clip": 0.01156077, "auxiliary_loss_mlp": 0.0104654, "balance_loss_clip": 1.03068495, "balance_loss_mlp": 1.04693556, "epoch": 0.3381932962573275, "flos": 24097317148800.0, "grad_norm": 2.376363397370931, "language_loss": 0.72855771, "learning_rate": 2.9738205052874444e-06, "loss": 0.75058389, "num_input_tokens_seen": 120760410, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.82421875, "step": 5625, "time_per_iteration": 2.6329758167266846 }, { "auxiliary_loss_clip": 0.01137903, "auxiliary_loss_mlp": 0.01047085, "balance_loss_clip": 1.03070557, "balance_loss_mlp": 1.04535973, "epoch": 0.3382534195099955, "flos": 19278248768640.0, "grad_norm": 1.7039002252869255, "language_loss": 0.70714283, "learning_rate": 2.9734904782431196e-06, "loss": 0.72899276, "num_input_tokens_seen": 120777705, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8359375, "step": 5626, "time_per_iteration": 2.6195809841156006 }, { "auxiliary_loss_clip": 0.01143927, "auxiliary_loss_mlp": 0.01038619, "balance_loss_clip": 1.02247763, "balance_loss_mlp": 1.04613113, "epoch": 0.33831354276266346, "flos": 25226240676480.0, "grad_norm": 1.5382584679508586, "language_loss": 0.81150573, "learning_rate": 2.973160416457423e-06, "loss": 0.83333123, "num_input_tokens_seen": 120798660, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.796875, "step": 5627, "time_per_iteration": 2.6736397743225098 }, { "auxiliary_loss_clip": 0.01141853, "auxiliary_loss_mlp": 0.01294401, "balance_loss_clip": 1.03041327, "balance_loss_mlp": 1.04870558, "epoch": 0.3383736660153314, "flos": 23112000195840.0, "grad_norm": 1.7004398157875884, "language_loss": 0.80278772, "learning_rate": 2.9728303199421354e-06, "loss": 0.82715023, "num_input_tokens_seen": 120816705, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.84375, "step": 5628, "time_per_iteration": 3.947838306427002 }, { "auxiliary_loss_clip": 0.01163503, "auxiliary_loss_mlp": 0.01036677, "balance_loss_clip": 1.02069092, "balance_loss_mlp": 1.04552925, "epoch": 0.3384337892679994, "flos": 23477139901440.0, "grad_norm": 2.07530708304615, "language_loss": 0.77278531, "learning_rate": 2.9725001887090358e-06, "loss": 0.79478711, "num_input_tokens_seen": 120835375, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8203125, "step": 5629, "time_per_iteration": 4.181494235992432 }, { "auxiliary_loss_clip": 0.01147485, "auxiliary_loss_mlp": 0.01040684, "balance_loss_clip": 1.02332735, "balance_loss_mlp": 1.04583883, "epoch": 0.33849391252066735, "flos": 19425805839360.0, "grad_norm": 1.7424275369818747, "language_loss": 0.84938538, "learning_rate": 2.9721700227699055e-06, "loss": 0.87126708, "num_input_tokens_seen": 120854260, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8359375, "step": 5630, "time_per_iteration": 2.5806939601898193 }, { "auxiliary_loss_clip": 0.01166211, "auxiliary_loss_mlp": 0.01047258, "balance_loss_clip": 1.03222585, "balance_loss_mlp": 1.04822969, "epoch": 0.3385540357733353, "flos": 21064840364160.0, "grad_norm": 2.054104988611429, "language_loss": 0.71754837, "learning_rate": 2.9718398221365285e-06, "loss": 0.73968303, "num_input_tokens_seen": 120871590, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.82421875, "step": 5631, "time_per_iteration": 2.657320022583008 }, { "auxiliary_loss_clip": 0.01069321, "auxiliary_loss_mlp": 0.01001833, "balance_loss_clip": 0.99973452, "balance_loss_mlp": 1.02606988, "epoch": 0.3386141590260033, "flos": 69208013450880.0, "grad_norm": 0.8437499494917097, "language_loss": 0.56164187, "learning_rate": 2.9715095868206874e-06, "loss": 0.58235335, "num_input_tokens_seen": 120925550, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.2578125, "step": 5632, "time_per_iteration": 4.642168045043945 }, { "auxiliary_loss_clip": 0.01137178, "auxiliary_loss_mlp": 0.01033623, "balance_loss_clip": 1.01774454, "balance_loss_mlp": 1.04669082, "epoch": 0.3386742822786713, "flos": 25519487310720.0, "grad_norm": 1.5436219912615232, "language_loss": 0.802104, "learning_rate": 2.9711793168341686e-06, "loss": 0.82381195, "num_input_tokens_seen": 120947620, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8125, "step": 5633, "time_per_iteration": 2.767313241958618 }, { "auxiliary_loss_clip": 0.01156542, "auxiliary_loss_mlp": 0.01038111, "balance_loss_clip": 1.02350211, "balance_loss_mlp": 1.04765296, "epoch": 0.33873440553133927, "flos": 23623116773760.0, "grad_norm": 2.0345980683945943, "language_loss": 0.59345669, "learning_rate": 2.9708490121887587e-06, "loss": 0.61540323, "num_input_tokens_seen": 120965205, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8203125, "step": 5634, "time_per_iteration": 2.5943524837493896 }, { "auxiliary_loss_clip": 0.01144117, "auxiliary_loss_mlp": 0.01036995, "balance_loss_clip": 1.02227879, "balance_loss_mlp": 1.04503298, "epoch": 0.33879452878400723, "flos": 17088882992640.0, "grad_norm": 2.3569044275877293, "language_loss": 0.92771333, "learning_rate": 2.9705186728962436e-06, "loss": 0.94952452, "num_input_tokens_seen": 120983560, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80859375, "step": 5635, "time_per_iteration": 2.5613973140716553 }, { "auxiliary_loss_clip": 0.01154674, "auxiliary_loss_mlp": 0.01033807, "balance_loss_clip": 1.01908422, "balance_loss_mlp": 1.04956138, "epoch": 0.3388546520366752, "flos": 15742053607680.0, "grad_norm": 1.8845135588503883, "language_loss": 0.7468161, "learning_rate": 2.9701882989684145e-06, "loss": 0.7687009, "num_input_tokens_seen": 121001400, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 5636, "time_per_iteration": 2.574796438217163 }, { "auxiliary_loss_clip": 0.01136333, "auxiliary_loss_mlp": 0.01041008, "balance_loss_clip": 1.02570176, "balance_loss_mlp": 1.04754996, "epoch": 0.33891477528934316, "flos": 22418744728320.0, "grad_norm": 1.6303892853709747, "language_loss": 0.8324033, "learning_rate": 2.96985789041706e-06, "loss": 0.85417676, "num_input_tokens_seen": 121021760, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.796875, "step": 5637, "time_per_iteration": 2.639751434326172 }, { "auxiliary_loss_clip": 0.01166831, "auxiliary_loss_mlp": 0.01040148, "balance_loss_clip": 1.02342308, "balance_loss_mlp": 1.04892719, "epoch": 0.3389748985420111, "flos": 17274828723840.0, "grad_norm": 1.6368977651338994, "language_loss": 0.69926786, "learning_rate": 2.9695274472539725e-06, "loss": 0.72133768, "num_input_tokens_seen": 121041070, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8203125, "step": 5638, "time_per_iteration": 2.600006341934204 }, { "auxiliary_loss_clip": 0.01148015, "auxiliary_loss_mlp": 0.01047212, "balance_loss_clip": 1.03157139, "balance_loss_mlp": 1.04925847, "epoch": 0.3390350217946791, "flos": 27744979190400.0, "grad_norm": 1.6415476804385625, "language_loss": 0.80923116, "learning_rate": 2.9691969694909443e-06, "loss": 0.83118337, "num_input_tokens_seen": 121060890, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.80859375, "step": 5639, "time_per_iteration": 2.705023765563965 }, { "auxiliary_loss_clip": 0.01132171, "auxiliary_loss_mlp": 0.01045682, "balance_loss_clip": 1.02933812, "balance_loss_mlp": 1.04778135, "epoch": 0.33909514504734706, "flos": 20339804338560.0, "grad_norm": 1.9174252421983782, "language_loss": 0.6752274, "learning_rate": 2.9688664571397696e-06, "loss": 0.69700587, "num_input_tokens_seen": 121079135, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84375, "step": 5640, "time_per_iteration": 2.5548624992370605 }, { "auxiliary_loss_clip": 0.01147872, "auxiliary_loss_mlp": 0.0104038, "balance_loss_clip": 1.02460909, "balance_loss_mlp": 1.04917216, "epoch": 0.339155268300015, "flos": 14830030356480.0, "grad_norm": 1.75848069492526, "language_loss": 0.69975424, "learning_rate": 2.9685359102122432e-06, "loss": 0.72163677, "num_input_tokens_seen": 121097685, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8125, "step": 5641, "time_per_iteration": 2.591686248779297 }, { "auxiliary_loss_clip": 0.01131922, "auxiliary_loss_mlp": 0.01042742, "balance_loss_clip": 1.02735257, "balance_loss_mlp": 1.04951334, "epoch": 0.339215391552683, "flos": 26067951054720.0, "grad_norm": 1.748122282227298, "language_loss": 0.87262815, "learning_rate": 2.9682053287201615e-06, "loss": 0.89437479, "num_input_tokens_seen": 121115640, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.82421875, "step": 5642, "time_per_iteration": 2.581746816635132 }, { "auxiliary_loss_clip": 0.01132241, "auxiliary_loss_mlp": 0.01034218, "balance_loss_clip": 1.02099204, "balance_loss_mlp": 1.04817033, "epoch": 0.33927551480535095, "flos": 14574705505920.0, "grad_norm": 1.7142946838169753, "language_loss": 0.83587086, "learning_rate": 2.967874712675322e-06, "loss": 0.85753548, "num_input_tokens_seen": 121132485, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75390625, "step": 5643, "time_per_iteration": 2.6371171474456787 }, { "auxiliary_loss_clip": 0.01144797, "auxiliary_loss_mlp": 0.01048428, "balance_loss_clip": 1.0335865, "balance_loss_mlp": 1.04894233, "epoch": 0.3393356380580189, "flos": 23805578885760.0, "grad_norm": 1.4794946719552677, "language_loss": 0.77046973, "learning_rate": 2.9675440620895233e-06, "loss": 0.79240197, "num_input_tokens_seen": 121152935, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78125, "step": 5644, "time_per_iteration": 2.6088616847991943 }, { "auxiliary_loss_clip": 0.01136378, "auxiliary_loss_mlp": 0.01040583, "balance_loss_clip": 1.0247643, "balance_loss_mlp": 1.0465374, "epoch": 0.3393957613106869, "flos": 17347871030400.0, "grad_norm": 2.8642727950360314, "language_loss": 0.63105309, "learning_rate": 2.9672133769745664e-06, "loss": 0.65282267, "num_input_tokens_seen": 121169835, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.80859375, "step": 5645, "time_per_iteration": 2.6374716758728027 }, { "auxiliary_loss_clip": 0.01133866, "auxiliary_loss_mlp": 0.01036014, "balance_loss_clip": 1.02183962, "balance_loss_mlp": 1.04672301, "epoch": 0.3394558845633549, "flos": 28433960939520.0, "grad_norm": 3.7783912635632473, "language_loss": 0.76809764, "learning_rate": 2.966882657342252e-06, "loss": 0.78979647, "num_input_tokens_seen": 121190290, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78515625, "step": 5646, "time_per_iteration": 2.733035087585449 }, { "auxiliary_loss_clip": 0.01140182, "auxiliary_loss_mlp": 0.01041361, "balance_loss_clip": 1.02531552, "balance_loss_mlp": 1.0473398, "epoch": 0.33951600781602287, "flos": 22086929865600.0, "grad_norm": 2.2548095146079334, "language_loss": 0.79265237, "learning_rate": 2.9665519032043825e-06, "loss": 0.81446785, "num_input_tokens_seen": 121209060, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8359375, "step": 5647, "time_per_iteration": 2.561000108718872 }, { "auxiliary_loss_clip": 0.01148693, "auxiliary_loss_mlp": 0.01290007, "balance_loss_clip": 1.02719378, "balance_loss_mlp": 1.05078053, "epoch": 0.33957613106869083, "flos": 23878262056320.0, "grad_norm": 2.0207257763509587, "language_loss": 0.77255023, "learning_rate": 2.9662211145727618e-06, "loss": 0.79693723, "num_input_tokens_seen": 121227480, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.80078125, "step": 5648, "time_per_iteration": 2.720883369445801 }, { "auxiliary_loss_clip": 0.01107862, "auxiliary_loss_mlp": 0.01012597, "balance_loss_clip": 1.01026058, "balance_loss_mlp": 1.02807498, "epoch": 0.3396362543213588, "flos": 71242642414080.0, "grad_norm": 0.7783454452512338, "language_loss": 0.56383646, "learning_rate": 2.965890291459195e-06, "loss": 0.58504105, "num_input_tokens_seen": 121291305, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.26171875, "step": 5649, "time_per_iteration": 3.2540669441223145 }, { "auxiliary_loss_clip": 0.0114452, "auxiliary_loss_mlp": 0.01293718, "balance_loss_clip": 1.03075218, "balance_loss_mlp": 1.04633546, "epoch": 0.33969637757402676, "flos": 25921615046400.0, "grad_norm": 2.3071519047516906, "language_loss": 0.85626733, "learning_rate": 2.9655594338754887e-06, "loss": 0.88064969, "num_input_tokens_seen": 121312740, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8046875, "step": 5650, "time_per_iteration": 2.8186135292053223 }, { "auxiliary_loss_clip": 0.01136085, "auxiliary_loss_mlp": 0.01033709, "balance_loss_clip": 1.0192374, "balance_loss_mlp": 1.04793525, "epoch": 0.33975650082669473, "flos": 35261728663680.0, "grad_norm": 1.6389512838776876, "language_loss": 0.70834553, "learning_rate": 2.9652285418334496e-06, "loss": 0.73004347, "num_input_tokens_seen": 121334220, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.79296875, "step": 5651, "time_per_iteration": 2.6759793758392334 }, { "auxiliary_loss_clip": 0.01163661, "auxiliary_loss_mlp": 0.01039591, "balance_loss_clip": 1.02434993, "balance_loss_mlp": 1.04847062, "epoch": 0.3398166240793627, "flos": 16647001879680.0, "grad_norm": 1.7324966177214915, "language_loss": 0.80922598, "learning_rate": 2.964897615344886e-06, "loss": 0.83125848, "num_input_tokens_seen": 121351870, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.796875, "step": 5652, "time_per_iteration": 2.6097888946533203 }, { "auxiliary_loss_clip": 0.01131397, "auxiliary_loss_mlp": 0.01044374, "balance_loss_clip": 1.02854323, "balance_loss_mlp": 1.04896832, "epoch": 0.33987674733203066, "flos": 24062196625920.0, "grad_norm": 1.8240546268697824, "language_loss": 0.774077, "learning_rate": 2.9645666544216097e-06, "loss": 0.79583478, "num_input_tokens_seen": 121373400, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.82421875, "step": 5653, "time_per_iteration": 2.5639517307281494 }, { "auxiliary_loss_clip": 0.01136363, "auxiliary_loss_mlp": 0.01034488, "balance_loss_clip": 1.01969385, "balance_loss_mlp": 1.04762626, "epoch": 0.3399368705846986, "flos": 13250678279040.0, "grad_norm": 2.530373234536734, "language_loss": 0.86375397, "learning_rate": 2.9642356590754298e-06, "loss": 0.88546246, "num_input_tokens_seen": 121385225, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 5654, "time_per_iteration": 2.6740634441375732 }, { "auxiliary_loss_clip": 0.01134459, "auxiliary_loss_mlp": 0.01043223, "balance_loss_clip": 1.02828646, "balance_loss_mlp": 1.04516029, "epoch": 0.3399969938373666, "flos": 27012832272000.0, "grad_norm": 2.9889061700051984, "language_loss": 0.65072256, "learning_rate": 2.9639046293181603e-06, "loss": 0.67249942, "num_input_tokens_seen": 121404735, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8046875, "step": 5655, "time_per_iteration": 2.609872579574585 }, { "auxiliary_loss_clip": 0.01137848, "auxiliary_loss_mlp": 0.01040429, "balance_loss_clip": 1.02635658, "balance_loss_mlp": 1.04918385, "epoch": 0.34005711709003456, "flos": 28550096588160.0, "grad_norm": 1.4545188545861312, "language_loss": 0.76773942, "learning_rate": 2.963573565161613e-06, "loss": 0.78952217, "num_input_tokens_seen": 121426780, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 5656, "time_per_iteration": 2.7249069213867188 }, { "auxiliary_loss_clip": 0.01160414, "auxiliary_loss_mlp": 0.01038361, "balance_loss_clip": 1.02260184, "balance_loss_mlp": 1.04792118, "epoch": 0.3401172403427025, "flos": 21617003208960.0, "grad_norm": 1.9392679011801004, "language_loss": 0.83122218, "learning_rate": 2.963242466617605e-06, "loss": 0.85320997, "num_input_tokens_seen": 121447245, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.85546875, "step": 5657, "time_per_iteration": 2.6426284313201904 }, { "auxiliary_loss_clip": 0.01142714, "auxiliary_loss_mlp": 0.01042979, "balance_loss_clip": 1.02850699, "balance_loss_mlp": 1.04698479, "epoch": 0.3401773635953705, "flos": 25885776251520.0, "grad_norm": 1.896994244301312, "language_loss": 0.85259354, "learning_rate": 2.9629113336979505e-06, "loss": 0.87445045, "num_input_tokens_seen": 121468165, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 5658, "time_per_iteration": 2.6536312103271484 }, { "auxiliary_loss_clip": 0.01066884, "auxiliary_loss_mlp": 0.01012994, "balance_loss_clip": 1.01084793, "balance_loss_mlp": 1.03232694, "epoch": 0.3402374868480385, "flos": 65507995336320.0, "grad_norm": 0.8196349367344307, "language_loss": 0.60008025, "learning_rate": 2.962580166414467e-06, "loss": 0.62087893, "num_input_tokens_seen": 121523795, "router_z_loss_clip": 0.02148438, "router_z_loss_mlp": 0.2578125, "step": 5659, "time_per_iteration": 3.2165257930755615 }, { "auxiliary_loss_clip": 0.01127434, "auxiliary_loss_mlp": 0.01038375, "balance_loss_clip": 1.02375436, "balance_loss_mlp": 1.05046594, "epoch": 0.34029761010070647, "flos": 24060580513920.0, "grad_norm": 1.6791970448163422, "language_loss": 0.67765194, "learning_rate": 2.9622489647789742e-06, "loss": 0.69931, "num_input_tokens_seen": 121542950, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.76953125, "step": 5660, "time_per_iteration": 2.6311140060424805 }, { "auxiliary_loss_clip": 0.0113223, "auxiliary_loss_mlp": 0.01045068, "balance_loss_clip": 1.03012466, "balance_loss_mlp": 1.05222905, "epoch": 0.34035773335337444, "flos": 27599720590080.0, "grad_norm": 2.8924573290422035, "language_loss": 0.6719861, "learning_rate": 2.9619177288032904e-06, "loss": 0.69375908, "num_input_tokens_seen": 121562765, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.80078125, "step": 5661, "time_per_iteration": 2.6020920276641846 }, { "auxiliary_loss_clip": 0.01146111, "auxiliary_loss_mlp": 0.01036346, "balance_loss_clip": 1.02163529, "balance_loss_mlp": 1.05059683, "epoch": 0.3404178566060424, "flos": 20812783651200.0, "grad_norm": 1.7799175647179404, "language_loss": 0.78620195, "learning_rate": 2.9615864584992374e-06, "loss": 0.80802655, "num_input_tokens_seen": 121581610, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.77734375, "step": 5662, "time_per_iteration": 2.5341639518737793 }, { "auxiliary_loss_clip": 0.01155023, "auxiliary_loss_mlp": 0.01041368, "balance_loss_clip": 1.02558494, "balance_loss_mlp": 1.04812741, "epoch": 0.34047797985871037, "flos": 26833566470400.0, "grad_norm": 1.9043840607006486, "language_loss": 0.73026788, "learning_rate": 2.961255153878637e-06, "loss": 0.75223178, "num_input_tokens_seen": 121601885, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.80078125, "step": 5663, "time_per_iteration": 2.644421339035034 }, { "auxiliary_loss_clip": 0.01152039, "auxiliary_loss_mlp": 0.01037033, "balance_loss_clip": 1.02369952, "balance_loss_mlp": 1.04890311, "epoch": 0.34053810311137833, "flos": 19682639061120.0, "grad_norm": 1.6628293750515117, "language_loss": 0.85887623, "learning_rate": 2.9609238149533132e-06, "loss": 0.88076693, "num_input_tokens_seen": 121621335, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.76171875, "step": 5664, "time_per_iteration": 2.587956190109253 }, { "auxiliary_loss_clip": 0.01129105, "auxiliary_loss_mlp": 0.01038363, "balance_loss_clip": 1.02418947, "balance_loss_mlp": 1.04894257, "epoch": 0.3405982263640463, "flos": 21725740656000.0, "grad_norm": 1.7402247276681604, "language_loss": 0.68977821, "learning_rate": 2.9605924417350904e-06, "loss": 0.71145296, "num_input_tokens_seen": 121641310, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.80078125, "step": 5665, "time_per_iteration": 3.9648778438568115 }, { "auxiliary_loss_clip": 0.01126984, "auxiliary_loss_mlp": 0.01036487, "balance_loss_clip": 1.02113247, "balance_loss_mlp": 1.0471369, "epoch": 0.34065834961671426, "flos": 18041629288320.0, "grad_norm": 2.52254936246733, "language_loss": 0.73011804, "learning_rate": 2.960261034235794e-06, "loss": 0.75175279, "num_input_tokens_seen": 121659625, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.796875, "step": 5666, "time_per_iteration": 2.6188013553619385 }, { "auxiliary_loss_clip": 0.01138167, "auxiliary_loss_mlp": 0.01044444, "balance_loss_clip": 1.03001952, "balance_loss_mlp": 1.0474968, "epoch": 0.3407184728693822, "flos": 21397337585280.0, "grad_norm": 1.8785898772994427, "language_loss": 0.73174798, "learning_rate": 2.959929592467251e-06, "loss": 0.75357407, "num_input_tokens_seen": 121679205, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 5667, "time_per_iteration": 2.532052755355835 }, { "auxiliary_loss_clip": 0.01061046, "auxiliary_loss_mlp": 0.01007227, "balance_loss_clip": 1.00505781, "balance_loss_mlp": 1.02691567, "epoch": 0.3407785961220502, "flos": 68688101018880.0, "grad_norm": 0.8427846841844853, "language_loss": 0.63287157, "learning_rate": 2.959598116441291e-06, "loss": 0.65355432, "num_input_tokens_seen": 121751085, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.2578125, "step": 5668, "time_per_iteration": 3.3205363750457764 }, { "auxiliary_loss_clip": 0.01198394, "auxiliary_loss_mlp": 0.01042431, "balance_loss_clip": 1.02670693, "balance_loss_mlp": 1.04739451, "epoch": 0.34083871937471816, "flos": 14064379027200.0, "grad_norm": 1.803384350951194, "language_loss": 0.71956152, "learning_rate": 2.959266606169741e-06, "loss": 0.74196982, "num_input_tokens_seen": 121768565, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.796875, "step": 5669, "time_per_iteration": 4.152010917663574 }, { "auxiliary_loss_clip": 0.01155083, "auxiliary_loss_mlp": 0.01281672, "balance_loss_clip": 1.01960015, "balance_loss_mlp": 1.04684794, "epoch": 0.3408988426273861, "flos": 17085435287040.0, "grad_norm": 1.650729334937591, "language_loss": 0.80631161, "learning_rate": 2.9589350616644353e-06, "loss": 0.83067912, "num_input_tokens_seen": 121784925, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 5670, "time_per_iteration": 2.6062567234039307 }, { "auxiliary_loss_clip": 0.01164768, "auxiliary_loss_mlp": 0.01038672, "balance_loss_clip": 1.02364564, "balance_loss_mlp": 1.04600036, "epoch": 0.3409589658800541, "flos": 24024562151040.0, "grad_norm": 1.681749167619811, "language_loss": 0.77106583, "learning_rate": 2.9586034829372026e-06, "loss": 0.79310018, "num_input_tokens_seen": 121804425, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.828125, "step": 5671, "time_per_iteration": 4.255752086639404 }, { "auxiliary_loss_clip": 0.01127441, "auxiliary_loss_mlp": 0.01293944, "balance_loss_clip": 1.03053045, "balance_loss_mlp": 1.04693961, "epoch": 0.34101908913272205, "flos": 21142012734720.0, "grad_norm": 2.593248788310313, "language_loss": 0.73967898, "learning_rate": 2.958271869999878e-06, "loss": 0.76389283, "num_input_tokens_seen": 121825145, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8046875, "step": 5672, "time_per_iteration": 2.708904504776001 }, { "auxiliary_loss_clip": 0.01137909, "auxiliary_loss_mlp": 0.01034708, "balance_loss_clip": 1.02071309, "balance_loss_mlp": 1.04783368, "epoch": 0.3410792123853901, "flos": 15702012921600.0, "grad_norm": 1.7691596463898431, "language_loss": 0.73408055, "learning_rate": 2.9579402228642956e-06, "loss": 0.7558068, "num_input_tokens_seen": 121842185, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.8125, "step": 5673, "time_per_iteration": 2.559274196624756 }, { "auxiliary_loss_clip": 0.01134841, "auxiliary_loss_mlp": 0.01035855, "balance_loss_clip": 1.02120399, "balance_loss_mlp": 1.04768801, "epoch": 0.34113933563805804, "flos": 23036012974080.0, "grad_norm": 1.697802004786957, "language_loss": 0.79627311, "learning_rate": 2.9576085415422902e-06, "loss": 0.81798005, "num_input_tokens_seen": 121862260, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 5674, "time_per_iteration": 4.073877811431885 }, { "auxiliary_loss_clip": 0.01050147, "auxiliary_loss_mlp": 0.010024, "balance_loss_clip": 1.00019419, "balance_loss_mlp": 1.02451229, "epoch": 0.341199458890726, "flos": 69614235336960.0, "grad_norm": 0.7891463533999166, "language_loss": 0.56098181, "learning_rate": 2.957276826045699e-06, "loss": 0.58150727, "num_input_tokens_seen": 121923560, "router_z_loss_clip": 0.02209473, "router_z_loss_mlp": 0.25585938, "step": 5675, "time_per_iteration": 3.2867372035980225 }, { "auxiliary_loss_clip": 0.01142224, "auxiliary_loss_mlp": 0.01288619, "balance_loss_clip": 1.02683783, "balance_loss_mlp": 1.04706562, "epoch": 0.34125958214339397, "flos": 22346348866560.0, "grad_norm": 1.573838943080515, "language_loss": 0.78726679, "learning_rate": 2.9569450763863606e-06, "loss": 0.81157523, "num_input_tokens_seen": 121943515, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7734375, "step": 5676, "time_per_iteration": 2.640047311782837 }, { "auxiliary_loss_clip": 0.01136693, "auxiliary_loss_mlp": 0.01034306, "balance_loss_clip": 1.01923835, "balance_loss_mlp": 1.04678738, "epoch": 0.34131970539606193, "flos": 21871933009920.0, "grad_norm": 1.6607971795655878, "language_loss": 0.85679293, "learning_rate": 2.9566132925761143e-06, "loss": 0.87850285, "num_input_tokens_seen": 121962540, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80859375, "step": 5677, "time_per_iteration": 2.6296839714050293 }, { "auxiliary_loss_clip": 0.01144815, "auxiliary_loss_mlp": 0.01044477, "balance_loss_clip": 1.02929568, "balance_loss_mlp": 1.04787016, "epoch": 0.3413798286487299, "flos": 24935723475840.0, "grad_norm": 1.784557522683281, "language_loss": 0.79557979, "learning_rate": 2.9562814746267996e-06, "loss": 0.8174727, "num_input_tokens_seen": 121979830, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.79296875, "step": 5678, "time_per_iteration": 2.599087953567505 }, { "auxiliary_loss_clip": 0.01136633, "auxiliary_loss_mlp": 0.01035492, "balance_loss_clip": 1.02075839, "balance_loss_mlp": 1.0467484, "epoch": 0.34143995190139786, "flos": 25374372364800.0, "grad_norm": 1.7161352944312875, "language_loss": 0.74973011, "learning_rate": 2.9559496225502594e-06, "loss": 0.77145141, "num_input_tokens_seen": 121999055, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80859375, "step": 5679, "time_per_iteration": 2.754603385925293 }, { "auxiliary_loss_clip": 0.01059585, "auxiliary_loss_mlp": 0.01007151, "balance_loss_clip": 1.00498128, "balance_loss_mlp": 1.02576029, "epoch": 0.34150007515406583, "flos": 67782578129280.0, "grad_norm": 0.7156989279358629, "language_loss": 0.59435582, "learning_rate": 2.955617736358336e-06, "loss": 0.61502314, "num_input_tokens_seen": 122067015, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.25390625, "step": 5680, "time_per_iteration": 3.2219269275665283 }, { "auxiliary_loss_clip": 0.01141219, "auxiliary_loss_mlp": 0.01033643, "balance_loss_clip": 1.01982105, "balance_loss_mlp": 1.04620314, "epoch": 0.3415601984067338, "flos": 20302421258880.0, "grad_norm": 4.0084248586484055, "language_loss": 0.72368842, "learning_rate": 2.955285816062874e-06, "loss": 0.74543703, "num_input_tokens_seen": 122085295, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 5681, "time_per_iteration": 2.6153018474578857 }, { "auxiliary_loss_clip": 0.01140735, "auxiliary_loss_mlp": 0.01041183, "balance_loss_clip": 1.02737284, "balance_loss_mlp": 1.04507899, "epoch": 0.34162032165940176, "flos": 26031178506240.0, "grad_norm": 2.0080854732284, "language_loss": 0.71141499, "learning_rate": 2.9549538616757183e-06, "loss": 0.73323417, "num_input_tokens_seen": 122104020, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.78125, "step": 5682, "time_per_iteration": 2.6745526790618896 }, { "auxiliary_loss_clip": 0.01125109, "auxiliary_loss_mlp": 0.01040772, "balance_loss_clip": 1.02553678, "balance_loss_mlp": 1.045017, "epoch": 0.3416804449120697, "flos": 28803338449920.0, "grad_norm": 1.7904266623348428, "language_loss": 0.84015596, "learning_rate": 2.9546218732087154e-06, "loss": 0.86181474, "num_input_tokens_seen": 122125080, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80078125, "step": 5683, "time_per_iteration": 2.6609573364257812 }, { "auxiliary_loss_clip": 0.01145466, "auxiliary_loss_mlp": 0.01048709, "balance_loss_clip": 1.03306842, "balance_loss_mlp": 1.04584229, "epoch": 0.3417405681647377, "flos": 22601601889920.0, "grad_norm": 1.7485270465794267, "language_loss": 0.71006155, "learning_rate": 2.9542898506737135e-06, "loss": 0.73200327, "num_input_tokens_seen": 122146350, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 5684, "time_per_iteration": 2.607682943344116 }, { "auxiliary_loss_clip": 0.01124167, "auxiliary_loss_mlp": 0.01036446, "balance_loss_clip": 1.02268958, "balance_loss_mlp": 1.04616141, "epoch": 0.34180069141740566, "flos": 24716237420160.0, "grad_norm": 1.3989002727144741, "language_loss": 0.74943185, "learning_rate": 2.953957794082562e-06, "loss": 0.771038, "num_input_tokens_seen": 122168085, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.78125, "step": 5685, "time_per_iteration": 2.677180290222168 }, { "auxiliary_loss_clip": 0.01134415, "auxiliary_loss_mlp": 0.01040222, "balance_loss_clip": 1.02624512, "balance_loss_mlp": 1.04686451, "epoch": 0.3418608146700737, "flos": 30518755246080.0, "grad_norm": 1.9673212563587397, "language_loss": 0.70325792, "learning_rate": 2.9536257034471107e-06, "loss": 0.7250042, "num_input_tokens_seen": 122191040, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78515625, "step": 5686, "time_per_iteration": 2.6598429679870605 }, { "auxiliary_loss_clip": 0.01134793, "auxiliary_loss_mlp": 0.01039176, "balance_loss_clip": 1.02375674, "balance_loss_mlp": 1.04554892, "epoch": 0.34192093792274164, "flos": 15122343237120.0, "grad_norm": 2.203461654756108, "language_loss": 0.77420866, "learning_rate": 2.9532935787792114e-06, "loss": 0.79594839, "num_input_tokens_seen": 122209225, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8046875, "step": 5687, "time_per_iteration": 2.6128990650177 }, { "auxiliary_loss_clip": 0.01144791, "auxiliary_loss_mlp": 0.01037422, "balance_loss_clip": 1.02259254, "balance_loss_mlp": 1.04891944, "epoch": 0.3419810611754096, "flos": 13187799521280.0, "grad_norm": 1.983385009795363, "language_loss": 0.86449385, "learning_rate": 2.9529614200907157e-06, "loss": 0.886316, "num_input_tokens_seen": 122226160, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.77734375, "step": 5688, "time_per_iteration": 2.578754186630249 }, { "auxiliary_loss_clip": 0.01151204, "auxiliary_loss_mlp": 0.01036927, "balance_loss_clip": 1.02101207, "balance_loss_mlp": 1.04720509, "epoch": 0.34204118442807757, "flos": 19536267139200.0, "grad_norm": 1.8579052530571443, "language_loss": 0.79670119, "learning_rate": 2.9526292273934787e-06, "loss": 0.81858248, "num_input_tokens_seen": 122243115, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.859375, "step": 5689, "time_per_iteration": 2.614788293838501 }, { "auxiliary_loss_clip": 0.01135647, "auxiliary_loss_mlp": 0.01044256, "balance_loss_clip": 1.02899742, "balance_loss_mlp": 1.04593945, "epoch": 0.34210130768074554, "flos": 15194846839680.0, "grad_norm": 1.8621376822190159, "language_loss": 0.73677683, "learning_rate": 2.9522970006993547e-06, "loss": 0.75857586, "num_input_tokens_seen": 122261105, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80859375, "step": 5690, "time_per_iteration": 2.6410632133483887 }, { "auxiliary_loss_clip": 0.01133203, "auxiliary_loss_mlp": 0.01031648, "balance_loss_clip": 1.01746762, "balance_loss_mlp": 1.04442906, "epoch": 0.3421614309334135, "flos": 24936226266240.0, "grad_norm": 1.7866601074610955, "language_loss": 0.75569403, "learning_rate": 2.9519647400202003e-06, "loss": 0.7773425, "num_input_tokens_seen": 122279995, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.796875, "step": 5691, "time_per_iteration": 2.6368422508239746 }, { "auxiliary_loss_clip": 0.01133025, "auxiliary_loss_mlp": 0.01035814, "balance_loss_clip": 1.02125883, "balance_loss_mlp": 1.04607916, "epoch": 0.34222155418608147, "flos": 21908633731200.0, "grad_norm": 2.117143559482731, "language_loss": 0.68101293, "learning_rate": 2.9516324453678733e-06, "loss": 0.70270133, "num_input_tokens_seen": 122299070, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.77734375, "step": 5692, "time_per_iteration": 2.5985043048858643 }, { "auxiliary_loss_clip": 0.01136738, "auxiliary_loss_mlp": 0.01041251, "balance_loss_clip": 1.02509832, "balance_loss_mlp": 1.04669452, "epoch": 0.34228167743874943, "flos": 18114061063680.0, "grad_norm": 1.997228190122135, "language_loss": 0.72661614, "learning_rate": 2.9513001167542316e-06, "loss": 0.74839604, "num_input_tokens_seen": 122316800, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8125, "step": 5693, "time_per_iteration": 2.54506254196167 }, { "auxiliary_loss_clip": 0.01151056, "auxiliary_loss_mlp": 0.01039543, "balance_loss_clip": 1.02574492, "balance_loss_mlp": 1.04537225, "epoch": 0.3423418006914174, "flos": 21288600138240.0, "grad_norm": 1.74952596744509, "language_loss": 0.7567842, "learning_rate": 2.9509677541911363e-06, "loss": 0.77869022, "num_input_tokens_seen": 122335275, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7890625, "step": 5694, "time_per_iteration": 2.671419858932495 }, { "auxiliary_loss_clip": 0.01150206, "auxiliary_loss_mlp": 0.01036806, "balance_loss_clip": 1.02269745, "balance_loss_mlp": 1.04668915, "epoch": 0.34240192394408536, "flos": 19823480288640.0, "grad_norm": 1.802403622628876, "language_loss": 0.7928564, "learning_rate": 2.9506353576904483e-06, "loss": 0.81472653, "num_input_tokens_seen": 122353215, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 5695, "time_per_iteration": 2.542891025543213 }, { "auxiliary_loss_clip": 0.01161498, "auxiliary_loss_mlp": 0.01039733, "balance_loss_clip": 1.02467728, "balance_loss_mlp": 1.04600239, "epoch": 0.3424620471967533, "flos": 24535535074560.0, "grad_norm": 1.730175034058756, "language_loss": 0.73043644, "learning_rate": 2.9503029272640296e-06, "loss": 0.75244874, "num_input_tokens_seen": 122372495, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 5696, "time_per_iteration": 2.654170036315918 }, { "auxiliary_loss_clip": 0.01153032, "auxiliary_loss_mlp": 0.01045042, "balance_loss_clip": 1.03080893, "balance_loss_mlp": 1.04602695, "epoch": 0.3425221704494213, "flos": 25848895962240.0, "grad_norm": 1.8220109413337897, "language_loss": 0.70640093, "learning_rate": 2.9499704629237436e-06, "loss": 0.72838169, "num_input_tokens_seen": 122394600, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 5697, "time_per_iteration": 2.6575732231140137 }, { "auxiliary_loss_clip": 0.01143061, "auxiliary_loss_mlp": 0.01032064, "balance_loss_clip": 1.01710379, "balance_loss_mlp": 1.04783499, "epoch": 0.34258229370208926, "flos": 21540513196800.0, "grad_norm": 1.720706005974759, "language_loss": 0.82020903, "learning_rate": 2.9496379646814555e-06, "loss": 0.84196031, "num_input_tokens_seen": 122414700, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7734375, "step": 5698, "time_per_iteration": 2.7758562564849854 }, { "auxiliary_loss_clip": 0.01171531, "auxiliary_loss_mlp": 0.01050149, "balance_loss_clip": 1.03298283, "balance_loss_mlp": 1.04475236, "epoch": 0.3426424169547573, "flos": 23652778429440.0, "grad_norm": 1.8222358443450626, "language_loss": 0.68542588, "learning_rate": 2.949305432549031e-06, "loss": 0.70764267, "num_input_tokens_seen": 122432760, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.81640625, "step": 5699, "time_per_iteration": 2.643981456756592 }, { "auxiliary_loss_clip": 0.011448, "auxiliary_loss_mlp": 0.01033978, "balance_loss_clip": 1.01931548, "balance_loss_mlp": 1.04699755, "epoch": 0.34270254020742524, "flos": 24644883052800.0, "grad_norm": 2.578533802722475, "language_loss": 0.72626704, "learning_rate": 2.9489728665383382e-06, "loss": 0.74805486, "num_input_tokens_seen": 122449105, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80078125, "step": 5700, "time_per_iteration": 2.7253036499023438 }, { "auxiliary_loss_clip": 0.01144356, "auxiliary_loss_mlp": 0.01031229, "balance_loss_clip": 1.01726401, "balance_loss_mlp": 1.04745507, "epoch": 0.3427626634600932, "flos": 20996754134400.0, "grad_norm": 3.097468185795334, "language_loss": 0.8164829, "learning_rate": 2.948640266661244e-06, "loss": 0.83823872, "num_input_tokens_seen": 122468700, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.79296875, "step": 5701, "time_per_iteration": 2.545597553253174 }, { "auxiliary_loss_clip": 0.01134961, "auxiliary_loss_mlp": 0.01038122, "balance_loss_clip": 1.02372122, "balance_loss_mlp": 1.04537261, "epoch": 0.3428227867127612, "flos": 21433786911360.0, "grad_norm": 1.804831039981302, "language_loss": 0.71535528, "learning_rate": 2.94830763292962e-06, "loss": 0.73708606, "num_input_tokens_seen": 122488160, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 5702, "time_per_iteration": 2.6859982013702393 }, { "auxiliary_loss_clip": 0.01066012, "auxiliary_loss_mlp": 0.01002314, "balance_loss_clip": 0.99985808, "balance_loss_mlp": 1.0231024, "epoch": 0.34288290996542914, "flos": 55731782695680.0, "grad_norm": 0.7812406395427706, "language_loss": 0.57398045, "learning_rate": 2.9479749653553347e-06, "loss": 0.59466374, "num_input_tokens_seen": 122542890, "router_z_loss_clip": 0.02453613, "router_z_loss_mlp": 0.2578125, "step": 5703, "time_per_iteration": 3.0716562271118164 }, { "auxiliary_loss_clip": 0.01140021, "auxiliary_loss_mlp": 0.01041067, "balance_loss_clip": 1.02421045, "balance_loss_mlp": 1.04926014, "epoch": 0.3429430332180971, "flos": 20156803522560.0, "grad_norm": 1.8932187571005408, "language_loss": 0.74731636, "learning_rate": 2.947642263950262e-06, "loss": 0.76912725, "num_input_tokens_seen": 122561770, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.81640625, "step": 5704, "time_per_iteration": 2.6107263565063477 }, { "auxiliary_loss_clip": 0.01152265, "auxiliary_loss_mlp": 0.01038141, "balance_loss_clip": 1.02493811, "balance_loss_mlp": 1.04696798, "epoch": 0.34300315647076507, "flos": 17965857548160.0, "grad_norm": 1.7456894886975336, "language_loss": 0.72880369, "learning_rate": 2.947309528726274e-06, "loss": 0.75070775, "num_input_tokens_seen": 122580580, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.78125, "step": 5705, "time_per_iteration": 2.6000053882598877 }, { "auxiliary_loss_clip": 0.01162435, "auxiliary_loss_mlp": 0.01033806, "balance_loss_clip": 1.01893437, "balance_loss_mlp": 1.04755688, "epoch": 0.34306327972343303, "flos": 22086822124800.0, "grad_norm": 1.9168929903437308, "language_loss": 0.79744482, "learning_rate": 2.9469767596952463e-06, "loss": 0.81940722, "num_input_tokens_seen": 122599810, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.79296875, "step": 5706, "time_per_iteration": 3.975858449935913 }, { "auxiliary_loss_clip": 0.01148679, "auxiliary_loss_mlp": 0.01035593, "balance_loss_clip": 1.02013791, "balance_loss_mlp": 1.0479219, "epoch": 0.343123402976101, "flos": 18442679616000.0, "grad_norm": 1.9127680242792542, "language_loss": 0.8278116, "learning_rate": 2.946643956869054e-06, "loss": 0.84965432, "num_input_tokens_seen": 122616035, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.828125, "step": 5707, "time_per_iteration": 2.5730526447296143 }, { "auxiliary_loss_clip": 0.01137718, "auxiliary_loss_mlp": 0.01035185, "balance_loss_clip": 1.02065325, "balance_loss_mlp": 1.04951358, "epoch": 0.34318352622876896, "flos": 17163685065600.0, "grad_norm": 1.5019579183880998, "language_loss": 0.75241375, "learning_rate": 2.9463111202595734e-06, "loss": 0.77414286, "num_input_tokens_seen": 122633785, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.79296875, "step": 5708, "time_per_iteration": 2.458855152130127 }, { "auxiliary_loss_clip": 0.01141808, "auxiliary_loss_mlp": 0.01037974, "balance_loss_clip": 1.02331138, "balance_loss_mlp": 1.04639745, "epoch": 0.34324364948143693, "flos": 26688164215680.0, "grad_norm": 1.567256915995297, "language_loss": 0.80857122, "learning_rate": 2.945978249878683e-06, "loss": 0.830369, "num_input_tokens_seen": 122652100, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 5709, "time_per_iteration": 2.689455509185791 }, { "auxiliary_loss_clip": 0.01146777, "auxiliary_loss_mlp": 0.01038884, "balance_loss_clip": 1.02266002, "balance_loss_mlp": 1.04834366, "epoch": 0.3433037727341049, "flos": 21251576194560.0, "grad_norm": 2.12946022061977, "language_loss": 0.79139316, "learning_rate": 2.9456453457382628e-06, "loss": 0.81324977, "num_input_tokens_seen": 122669720, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.80859375, "step": 5710, "time_per_iteration": 2.512974739074707 }, { "auxiliary_loss_clip": 0.01139187, "auxiliary_loss_mlp": 0.01039053, "balance_loss_clip": 1.02307951, "balance_loss_mlp": 1.04764867, "epoch": 0.34336389598677286, "flos": 20629423699200.0, "grad_norm": 1.7896697340081924, "language_loss": 0.70155358, "learning_rate": 2.9453124078501926e-06, "loss": 0.72333592, "num_input_tokens_seen": 122688715, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.82421875, "step": 5711, "time_per_iteration": 3.9619569778442383 }, { "auxiliary_loss_clip": 0.01153875, "auxiliary_loss_mlp": 0.0128657, "balance_loss_clip": 1.02416706, "balance_loss_mlp": 1.04753244, "epoch": 0.3434240192394409, "flos": 14538579402240.0, "grad_norm": 1.8281138782552464, "language_loss": 0.67569721, "learning_rate": 2.944979436226354e-06, "loss": 0.70010161, "num_input_tokens_seen": 122706970, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 5712, "time_per_iteration": 4.138516902923584 }, { "auxiliary_loss_clip": 0.01073189, "auxiliary_loss_mlp": 0.01001906, "balance_loss_clip": 0.99958146, "balance_loss_mlp": 1.02081585, "epoch": 0.34348414249210885, "flos": 58051538841600.0, "grad_norm": 0.8388132475083423, "language_loss": 0.58092785, "learning_rate": 2.94464643087863e-06, "loss": 0.60167885, "num_input_tokens_seen": 122758095, "router_z_loss_clip": 0.02319336, "router_z_loss_mlp": 0.2578125, "step": 5713, "time_per_iteration": 3.1968343257904053 }, { "auxiliary_loss_clip": 0.01128123, "auxiliary_loss_mlp": 0.01040585, "balance_loss_clip": 1.02625632, "balance_loss_mlp": 1.04780841, "epoch": 0.3435442657447768, "flos": 20704441253760.0, "grad_norm": 1.5311004877900196, "language_loss": 0.80374134, "learning_rate": 2.9443133918189054e-06, "loss": 0.82542843, "num_input_tokens_seen": 122777815, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8046875, "step": 5714, "time_per_iteration": 2.57857608795166 }, { "auxiliary_loss_clip": 0.01129873, "auxiliary_loss_mlp": 0.01039757, "balance_loss_clip": 1.02429521, "balance_loss_mlp": 1.05004382, "epoch": 0.3436043889974448, "flos": 22930256355840.0, "grad_norm": 1.669264353971917, "language_loss": 0.71674073, "learning_rate": 2.943980319059064e-06, "loss": 0.73843706, "num_input_tokens_seen": 122797555, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.796875, "step": 5715, "time_per_iteration": 4.015127420425415 }, { "auxiliary_loss_clip": 0.01054583, "auxiliary_loss_mlp": 0.01002569, "balance_loss_clip": 1.00036383, "balance_loss_mlp": 1.01962364, "epoch": 0.34366451225011274, "flos": 58403285752320.0, "grad_norm": 1.1929296079330658, "language_loss": 0.6587404, "learning_rate": 2.9436472126109943e-06, "loss": 0.67931187, "num_input_tokens_seen": 122863955, "router_z_loss_clip": 0.02209473, "router_z_loss_mlp": 0.2578125, "step": 5716, "time_per_iteration": 3.1890621185302734 }, { "auxiliary_loss_clip": 0.01135684, "auxiliary_loss_mlp": 0.01044823, "balance_loss_clip": 1.02967191, "balance_loss_mlp": 1.04874194, "epoch": 0.3437246355027807, "flos": 15596292216960.0, "grad_norm": 2.0018067125641563, "language_loss": 0.74174601, "learning_rate": 2.9433140724865824e-06, "loss": 0.76355112, "num_input_tokens_seen": 122883000, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.78125, "step": 5717, "time_per_iteration": 2.555699586868286 }, { "auxiliary_loss_clip": 0.01161531, "auxiliary_loss_mlp": 0.01037872, "balance_loss_clip": 1.02381158, "balance_loss_mlp": 1.04524338, "epoch": 0.34378475875544867, "flos": 27672260106240.0, "grad_norm": 1.737415130522252, "language_loss": 0.75223118, "learning_rate": 2.9429808986977175e-06, "loss": 0.77422523, "num_input_tokens_seen": 122903265, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8046875, "step": 5718, "time_per_iteration": 2.777985095977783 }, { "auxiliary_loss_clip": 0.01154513, "auxiliary_loss_mlp": 0.01048392, "balance_loss_clip": 1.03254271, "balance_loss_mlp": 1.04657221, "epoch": 0.34384488200811664, "flos": 31431496769280.0, "grad_norm": 3.5487084546727634, "language_loss": 0.63886189, "learning_rate": 2.9426476912562905e-06, "loss": 0.660891, "num_input_tokens_seen": 122923860, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.81640625, "step": 5719, "time_per_iteration": 2.690159797668457 }, { "auxiliary_loss_clip": 0.01165945, "auxiliary_loss_mlp": 0.0103703, "balance_loss_clip": 1.02031696, "balance_loss_mlp": 1.04746723, "epoch": 0.3439050052607846, "flos": 24899920594560.0, "grad_norm": 1.762001501611586, "language_loss": 0.73347497, "learning_rate": 2.9423144501741918e-06, "loss": 0.75550473, "num_input_tokens_seen": 122945305, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.828125, "step": 5720, "time_per_iteration": 2.6805033683776855 }, { "auxiliary_loss_clip": 0.0113807, "auxiliary_loss_mlp": 0.0104005, "balance_loss_clip": 1.02461243, "balance_loss_mlp": 1.04805219, "epoch": 0.34396512851345257, "flos": 18150079426560.0, "grad_norm": 1.6300856318445014, "language_loss": 0.74132949, "learning_rate": 2.9419811754633143e-06, "loss": 0.7631107, "num_input_tokens_seen": 122962535, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.80859375, "step": 5721, "time_per_iteration": 2.5645179748535156 }, { "auxiliary_loss_clip": 0.01149171, "auxiliary_loss_mlp": 0.01295238, "balance_loss_clip": 1.03252482, "balance_loss_mlp": 1.04908895, "epoch": 0.34402525176612053, "flos": 16034438315520.0, "grad_norm": 1.7510503938279534, "language_loss": 0.79931331, "learning_rate": 2.9416478671355516e-06, "loss": 0.82375741, "num_input_tokens_seen": 122979750, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8203125, "step": 5722, "time_per_iteration": 2.6317598819732666 }, { "auxiliary_loss_clip": 0.01163946, "auxiliary_loss_mlp": 0.0103473, "balance_loss_clip": 1.0201503, "balance_loss_mlp": 1.04874754, "epoch": 0.3440853750187885, "flos": 21178641628800.0, "grad_norm": 1.5333528827214247, "language_loss": 0.8160463, "learning_rate": 2.9413145252027985e-06, "loss": 0.83803308, "num_input_tokens_seen": 122998955, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.79296875, "step": 5723, "time_per_iteration": 2.668532371520996 }, { "auxiliary_loss_clip": 0.01155876, "auxiliary_loss_mlp": 0.01042251, "balance_loss_clip": 1.02696896, "balance_loss_mlp": 1.04759169, "epoch": 0.34414549827145646, "flos": 12677868092160.0, "grad_norm": 2.088549415814322, "language_loss": 0.80844516, "learning_rate": 2.940981149676952e-06, "loss": 0.83042639, "num_input_tokens_seen": 123016165, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 5724, "time_per_iteration": 2.6611318588256836 }, { "auxiliary_loss_clip": 0.0114783, "auxiliary_loss_mlp": 0.01035139, "balance_loss_clip": 1.01997566, "balance_loss_mlp": 1.04860854, "epoch": 0.3442056215241244, "flos": 31284514316160.0, "grad_norm": 2.006995931179103, "language_loss": 0.68778718, "learning_rate": 2.940647740569908e-06, "loss": 0.7096169, "num_input_tokens_seen": 123036900, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.81640625, "step": 5725, "time_per_iteration": 2.626910924911499 }, { "auxiliary_loss_clip": 0.01145238, "auxiliary_loss_mlp": 0.01042446, "balance_loss_clip": 1.02485037, "balance_loss_mlp": 1.04996109, "epoch": 0.34426574477679245, "flos": 23367289132800.0, "grad_norm": 1.3418048145035129, "language_loss": 0.69162774, "learning_rate": 2.9403142978935665e-06, "loss": 0.71350461, "num_input_tokens_seen": 123057480, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.859375, "step": 5726, "time_per_iteration": 2.752047538757324 }, { "auxiliary_loss_clip": 0.01151921, "auxiliary_loss_mlp": 0.01037908, "balance_loss_clip": 1.02301884, "balance_loss_mlp": 1.04540873, "epoch": 0.3443258680294604, "flos": 24535427333760.0, "grad_norm": 1.9774930809993594, "language_loss": 0.73212647, "learning_rate": 2.939980821659826e-06, "loss": 0.75402474, "num_input_tokens_seen": 123076890, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.796875, "step": 5727, "time_per_iteration": 2.60365891456604 }, { "auxiliary_loss_clip": 0.01156557, "auxiliary_loss_mlp": 0.01041554, "balance_loss_clip": 1.02621174, "balance_loss_mlp": 1.04954982, "epoch": 0.3443859912821284, "flos": 20230133137920.0, "grad_norm": 1.7394796395342274, "language_loss": 0.88268626, "learning_rate": 2.9396473118805886e-06, "loss": 0.90466738, "num_input_tokens_seen": 123092530, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.80078125, "step": 5728, "time_per_iteration": 2.5724987983703613 }, { "auxiliary_loss_clip": 0.01163733, "auxiliary_loss_mlp": 0.01041007, "balance_loss_clip": 1.02626121, "balance_loss_mlp": 1.0480653, "epoch": 0.34444611453479634, "flos": 24316515895680.0, "grad_norm": 2.027872605782545, "language_loss": 0.70394194, "learning_rate": 2.9393137685677555e-06, "loss": 0.7259894, "num_input_tokens_seen": 123110560, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 5729, "time_per_iteration": 2.585678815841675 }, { "auxiliary_loss_clip": 0.01135542, "auxiliary_loss_mlp": 0.0103436, "balance_loss_clip": 1.01868415, "balance_loss_mlp": 1.04771399, "epoch": 0.3445062377874643, "flos": 16983413683200.0, "grad_norm": 1.8085304182585016, "language_loss": 0.74066788, "learning_rate": 2.9389801917332294e-06, "loss": 0.76236689, "num_input_tokens_seen": 123128655, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.7890625, "step": 5730, "time_per_iteration": 2.7147979736328125 }, { "auxiliary_loss_clip": 0.01153872, "auxiliary_loss_mlp": 0.01045779, "balance_loss_clip": 1.03035343, "balance_loss_mlp": 1.04645014, "epoch": 0.3445663610401323, "flos": 20302708567680.0, "grad_norm": 2.6724998569192926, "language_loss": 0.793648, "learning_rate": 2.938646581388917e-06, "loss": 0.8156445, "num_input_tokens_seen": 123145130, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8046875, "step": 5731, "time_per_iteration": 2.569882392883301 }, { "auxiliary_loss_clip": 0.01137066, "auxiliary_loss_mlp": 0.01044016, "balance_loss_clip": 1.02851868, "balance_loss_mlp": 1.04751635, "epoch": 0.34462648429280024, "flos": 15888102307200.0, "grad_norm": 2.9022304367487846, "language_loss": 0.78199959, "learning_rate": 2.9383129375467214e-06, "loss": 0.80381042, "num_input_tokens_seen": 123162265, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8046875, "step": 5732, "time_per_iteration": 2.5348153114318848 }, { "auxiliary_loss_clip": 0.01071317, "auxiliary_loss_mlp": 0.01001467, "balance_loss_clip": 0.99952441, "balance_loss_mlp": 1.01865625, "epoch": 0.3446866075454682, "flos": 59311035285120.0, "grad_norm": 0.8630611832149734, "language_loss": 0.53456992, "learning_rate": 2.937979260218551e-06, "loss": 0.55529773, "num_input_tokens_seen": 123218620, "router_z_loss_clip": 0.01940918, "router_z_loss_mlp": 0.26171875, "step": 5733, "time_per_iteration": 3.2006800174713135 }, { "auxiliary_loss_clip": 0.01146584, "auxiliary_loss_mlp": 0.01044923, "balance_loss_clip": 1.02894902, "balance_loss_mlp": 1.04795063, "epoch": 0.34474673079813617, "flos": 22343799000960.0, "grad_norm": 1.6451874665048722, "language_loss": 0.8320936, "learning_rate": 2.9376455494163137e-06, "loss": 0.85400867, "num_input_tokens_seen": 123237325, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.80859375, "step": 5734, "time_per_iteration": 2.5576155185699463 }, { "auxiliary_loss_clip": 0.01145859, "auxiliary_loss_mlp": 0.01036637, "balance_loss_clip": 1.020818, "balance_loss_mlp": 1.04653525, "epoch": 0.34480685405080413, "flos": 27670141203840.0, "grad_norm": 1.7485490536825816, "language_loss": 0.92715251, "learning_rate": 2.9373118051519185e-06, "loss": 0.94897753, "num_input_tokens_seen": 123258650, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.81640625, "step": 5735, "time_per_iteration": 2.6045806407928467 }, { "auxiliary_loss_clip": 0.01167214, "auxiliary_loss_mlp": 0.01036731, "balance_loss_clip": 1.02031612, "balance_loss_mlp": 1.04931879, "epoch": 0.3448669773034721, "flos": 22456020067200.0, "grad_norm": 1.5730785824378808, "language_loss": 0.76738656, "learning_rate": 2.936978027437276e-06, "loss": 0.78942597, "num_input_tokens_seen": 123277155, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8203125, "step": 5736, "time_per_iteration": 2.558964252471924 }, { "auxiliary_loss_clip": 0.01149806, "auxiliary_loss_mlp": 0.01039378, "balance_loss_clip": 1.02379704, "balance_loss_mlp": 1.04939818, "epoch": 0.34492710055614006, "flos": 24936190352640.0, "grad_norm": 1.562852165554724, "language_loss": 0.78486413, "learning_rate": 2.9366442162842976e-06, "loss": 0.8067559, "num_input_tokens_seen": 123297640, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.82421875, "step": 5737, "time_per_iteration": 2.589397430419922 }, { "auxiliary_loss_clip": 0.01168826, "auxiliary_loss_mlp": 0.01041676, "balance_loss_clip": 1.0251658, "balance_loss_mlp": 1.04722834, "epoch": 0.34498722380880803, "flos": 20120821073280.0, "grad_norm": 2.7545345723910555, "language_loss": 0.7189858, "learning_rate": 2.936310371704897e-06, "loss": 0.74109083, "num_input_tokens_seen": 123314370, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.85546875, "step": 5738, "time_per_iteration": 2.537344217300415 }, { "auxiliary_loss_clip": 0.01157806, "auxiliary_loss_mlp": 0.01043317, "balance_loss_clip": 1.02690196, "balance_loss_mlp": 1.04735172, "epoch": 0.34504734706147605, "flos": 28438126917120.0, "grad_norm": 2.1540735049460653, "language_loss": 0.81825459, "learning_rate": 2.9359764937109877e-06, "loss": 0.84026587, "num_input_tokens_seen": 123336085, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8359375, "step": 5739, "time_per_iteration": 2.748176097869873 }, { "auxiliary_loss_clip": 0.01159874, "auxiliary_loss_mlp": 0.01046258, "balance_loss_clip": 1.03057003, "balance_loss_mlp": 1.05025184, "epoch": 0.345107470314144, "flos": 22674464628480.0, "grad_norm": 1.9317151021217458, "language_loss": 0.8277061, "learning_rate": 2.9356425823144847e-06, "loss": 0.84976745, "num_input_tokens_seen": 123354460, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.83203125, "step": 5740, "time_per_iteration": 2.642965793609619 }, { "auxiliary_loss_clip": 0.01138499, "auxiliary_loss_mlp": 0.01038465, "balance_loss_clip": 1.02172208, "balance_loss_mlp": 1.04834461, "epoch": 0.345167593566812, "flos": 20630716588800.0, "grad_norm": 2.422787355024102, "language_loss": 0.76771992, "learning_rate": 2.9353086375273047e-06, "loss": 0.78948957, "num_input_tokens_seen": 123373420, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.81640625, "step": 5741, "time_per_iteration": 2.6654560565948486 }, { "auxiliary_loss_clip": 0.01148562, "auxiliary_loss_mlp": 0.01040446, "balance_loss_clip": 1.02526426, "balance_loss_mlp": 1.04687476, "epoch": 0.34522771681947995, "flos": 26214358890240.0, "grad_norm": 1.4496353101971287, "language_loss": 0.77394176, "learning_rate": 2.9349746593613654e-06, "loss": 0.7958318, "num_input_tokens_seen": 123394730, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8359375, "step": 5742, "time_per_iteration": 2.61210560798645 }, { "auxiliary_loss_clip": 0.01137184, "auxiliary_loss_mlp": 0.01041432, "balance_loss_clip": 1.0259347, "balance_loss_mlp": 1.04653215, "epoch": 0.3452878400721479, "flos": 19062354072960.0, "grad_norm": 1.8832503574513404, "language_loss": 0.75980049, "learning_rate": 2.934640647828586e-06, "loss": 0.78158665, "num_input_tokens_seen": 123412895, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.81640625, "step": 5743, "time_per_iteration": 2.613985300064087 }, { "auxiliary_loss_clip": 0.01134261, "auxiliary_loss_mlp": 0.01038052, "balance_loss_clip": 1.02291822, "balance_loss_mlp": 1.04588938, "epoch": 0.3453479633248159, "flos": 27929739772800.0, "grad_norm": 1.5105173057291643, "language_loss": 0.70114118, "learning_rate": 2.934306602940885e-06, "loss": 0.72286427, "num_input_tokens_seen": 123432320, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.796875, "step": 5744, "time_per_iteration": 2.6428141593933105 }, { "auxiliary_loss_clip": 0.01126458, "auxiliary_loss_mlp": 0.01038005, "balance_loss_clip": 1.02196002, "balance_loss_mlp": 1.04708266, "epoch": 0.34540808657748384, "flos": 19606113135360.0, "grad_norm": 2.0063636330964023, "language_loss": 0.79139179, "learning_rate": 2.9339725247101855e-06, "loss": 0.81303644, "num_input_tokens_seen": 123450980, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.79296875, "step": 5745, "time_per_iteration": 2.582467555999756 }, { "auxiliary_loss_clip": 0.01166761, "auxiliary_loss_mlp": 0.01041329, "balance_loss_clip": 1.0247947, "balance_loss_mlp": 1.0472331, "epoch": 0.3454682098301518, "flos": 20411661496320.0, "grad_norm": 1.8309269306919342, "language_loss": 0.89020008, "learning_rate": 2.933638413148409e-06, "loss": 0.91228092, "num_input_tokens_seen": 123469365, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.84375, "step": 5746, "time_per_iteration": 2.571654796600342 }, { "auxiliary_loss_clip": 0.01139632, "auxiliary_loss_mlp": 0.01040847, "balance_loss_clip": 1.02437258, "balance_loss_mlp": 1.0467999, "epoch": 0.34552833308281977, "flos": 21325121291520.0, "grad_norm": 2.067387897588519, "language_loss": 0.63945711, "learning_rate": 2.9333042682674788e-06, "loss": 0.66126192, "num_input_tokens_seen": 123489425, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.83984375, "step": 5747, "time_per_iteration": 2.5917580127716064 }, { "auxiliary_loss_clip": 0.01146632, "auxiliary_loss_mlp": 0.01035789, "balance_loss_clip": 1.02134657, "balance_loss_mlp": 1.04746115, "epoch": 0.34558845633548774, "flos": 36243633824640.0, "grad_norm": 1.5736981129514547, "language_loss": 0.72696745, "learning_rate": 2.9329700900793207e-06, "loss": 0.74879169, "num_input_tokens_seen": 123509970, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8125, "step": 5748, "time_per_iteration": 4.148196697235107 }, { "auxiliary_loss_clip": 0.01162069, "auxiliary_loss_mlp": 0.01036202, "balance_loss_clip": 1.02202833, "balance_loss_mlp": 1.04778564, "epoch": 0.3456485795881557, "flos": 22450561200000.0, "grad_norm": 1.770247555593808, "language_loss": 0.75631869, "learning_rate": 2.9326358785958593e-06, "loss": 0.77830142, "num_input_tokens_seen": 123531055, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78125, "step": 5749, "time_per_iteration": 2.646400213241577 }, { "auxiliary_loss_clip": 0.01061019, "auxiliary_loss_mlp": 0.01005815, "balance_loss_clip": 1.00390768, "balance_loss_mlp": 1.01742363, "epoch": 0.34570870284082367, "flos": 62004299005440.0, "grad_norm": 0.8990601373760062, "language_loss": 0.62753862, "learning_rate": 2.9323016338290227e-06, "loss": 0.64820695, "num_input_tokens_seen": 123584720, "router_z_loss_clip": 0.01904297, "router_z_loss_mlp": 0.2578125, "step": 5750, "time_per_iteration": 3.1418423652648926 }, { "auxiliary_loss_clip": 0.01136365, "auxiliary_loss_mlp": 0.01035079, "balance_loss_clip": 1.02036905, "balance_loss_mlp": 1.04348719, "epoch": 0.34576882609349163, "flos": 22782196494720.0, "grad_norm": 1.6617631452495563, "language_loss": 0.80412346, "learning_rate": 2.931967355790739e-06, "loss": 0.82583791, "num_input_tokens_seen": 123604465, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75, "step": 5751, "time_per_iteration": 2.5495123863220215 }, { "auxiliary_loss_clip": 0.01123716, "auxiliary_loss_mlp": 0.01044103, "balance_loss_clip": 1.02916622, "balance_loss_mlp": 1.04513574, "epoch": 0.34582894934615965, "flos": 12348818576640.0, "grad_norm": 2.12824912334186, "language_loss": 0.83924389, "learning_rate": 2.931633044492937e-06, "loss": 0.86092204, "num_input_tokens_seen": 123622320, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.78515625, "step": 5752, "time_per_iteration": 2.5442252159118652 }, { "auxiliary_loss_clip": 0.01060173, "auxiliary_loss_mlp": 0.01254467, "balance_loss_clip": 1.00553787, "balance_loss_mlp": 1.01738405, "epoch": 0.3458890725988276, "flos": 70167691071360.0, "grad_norm": 0.7332040480466557, "language_loss": 0.63133901, "learning_rate": 2.931298699947549e-06, "loss": 0.65448546, "num_input_tokens_seen": 123678010, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.25390625, "step": 5753, "time_per_iteration": 4.409118890762329 }, { "auxiliary_loss_clip": 0.0114595, "auxiliary_loss_mlp": 0.0103868, "balance_loss_clip": 1.02318251, "balance_loss_mlp": 1.04804468, "epoch": 0.3459491958514956, "flos": 17092582093440.0, "grad_norm": 1.8580582271095618, "language_loss": 0.71292818, "learning_rate": 2.9309643221665054e-06, "loss": 0.73477441, "num_input_tokens_seen": 123696830, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.796875, "step": 5754, "time_per_iteration": 4.195930480957031 }, { "auxiliary_loss_clip": 0.01137658, "auxiliary_loss_mlp": 0.01032774, "balance_loss_clip": 1.01745534, "balance_loss_mlp": 1.04619765, "epoch": 0.34600931910416355, "flos": 16650952375680.0, "grad_norm": 13.647406574397179, "language_loss": 0.72228837, "learning_rate": 2.9306299111617402e-06, "loss": 0.74399269, "num_input_tokens_seen": 123714360, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.82421875, "step": 5755, "time_per_iteration": 2.521012306213379 }, { "auxiliary_loss_clip": 0.01132166, "auxiliary_loss_mlp": 0.01035573, "balance_loss_clip": 1.02096951, "balance_loss_mlp": 1.04561555, "epoch": 0.3460694423568315, "flos": 38546190334080.0, "grad_norm": 1.4747409593456038, "language_loss": 0.72296107, "learning_rate": 2.9302954669451875e-06, "loss": 0.74463844, "num_input_tokens_seen": 123739250, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 5756, "time_per_iteration": 2.722954034805298 }, { "auxiliary_loss_clip": 0.01051089, "auxiliary_loss_mlp": 0.01004542, "balance_loss_clip": 1.00238442, "balance_loss_mlp": 1.01686144, "epoch": 0.3461295656094995, "flos": 72081479704320.0, "grad_norm": 0.7107641230616856, "language_loss": 0.62604684, "learning_rate": 2.9299609895287817e-06, "loss": 0.64660311, "num_input_tokens_seen": 123802845, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.25390625, "step": 5757, "time_per_iteration": 5.370211839675903 }, { "auxiliary_loss_clip": 0.01068786, "auxiliary_loss_mlp": 0.01250826, "balance_loss_clip": 1.00217235, "balance_loss_mlp": 1.01678848, "epoch": 0.34618968886216744, "flos": 65460089571840.0, "grad_norm": 0.8199965156645648, "language_loss": 0.59193134, "learning_rate": 2.929626478924461e-06, "loss": 0.61512756, "num_input_tokens_seen": 123861805, "router_z_loss_clip": 0.01904297, "router_z_loss_mlp": 0.25585938, "step": 5758, "time_per_iteration": 3.1579649448394775 }, { "auxiliary_loss_clip": 0.01148144, "auxiliary_loss_mlp": 0.01039203, "balance_loss_clip": 1.02538037, "balance_loss_mlp": 1.05019939, "epoch": 0.3462498121148354, "flos": 23472542960640.0, "grad_norm": 1.6163903919254874, "language_loss": 0.7128545, "learning_rate": 2.9292919351441626e-06, "loss": 0.73472798, "num_input_tokens_seen": 123881820, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8046875, "step": 5759, "time_per_iteration": 2.7150230407714844 }, { "auxiliary_loss_clip": 0.01159216, "auxiliary_loss_mlp": 0.01044641, "balance_loss_clip": 1.02891684, "balance_loss_mlp": 1.04491711, "epoch": 0.3463099353675034, "flos": 24170790418560.0, "grad_norm": 2.5758742184240164, "language_loss": 0.83205259, "learning_rate": 2.928957358199825e-06, "loss": 0.85409117, "num_input_tokens_seen": 123903700, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.7890625, "step": 5760, "time_per_iteration": 2.6238532066345215 }, { "auxiliary_loss_clip": 0.01069749, "auxiliary_loss_mlp": 0.01004027, "balance_loss_clip": 1.0021081, "balance_loss_mlp": 1.01751876, "epoch": 0.34637005862017134, "flos": 63700609766400.0, "grad_norm": 0.8310553830207461, "language_loss": 0.56662685, "learning_rate": 2.9286227481033903e-06, "loss": 0.58736461, "num_input_tokens_seen": 123960075, "router_z_loss_clip": 0.01916504, "router_z_loss_mlp": 0.25390625, "step": 5761, "time_per_iteration": 3.230393171310425 }, { "auxiliary_loss_clip": 0.01131263, "auxiliary_loss_mlp": 0.0128938, "balance_loss_clip": 1.02755475, "balance_loss_mlp": 1.04490995, "epoch": 0.3464301818728393, "flos": 13145532192000.0, "grad_norm": 4.1224592583747235, "language_loss": 0.94951808, "learning_rate": 2.9282881048667972e-06, "loss": 0.97372448, "num_input_tokens_seen": 123975805, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.77734375, "step": 5762, "time_per_iteration": 2.6149182319641113 }, { "auxiliary_loss_clip": 0.01154743, "auxiliary_loss_mlp": 0.01036312, "balance_loss_clip": 1.02126169, "balance_loss_mlp": 1.04534793, "epoch": 0.34649030512550727, "flos": 29315173299840.0, "grad_norm": 1.5607082847205258, "language_loss": 0.69918168, "learning_rate": 2.927953428501989e-06, "loss": 0.72109228, "num_input_tokens_seen": 123997530, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.828125, "step": 5763, "time_per_iteration": 2.6397385597229004 }, { "auxiliary_loss_clip": 0.01141686, "auxiliary_loss_mlp": 0.01045687, "balance_loss_clip": 1.02905774, "balance_loss_mlp": 1.04914057, "epoch": 0.34655042837817523, "flos": 23730884553600.0, "grad_norm": 1.603827844417241, "language_loss": 0.8364352, "learning_rate": 2.9276187190209107e-06, "loss": 0.85830891, "num_input_tokens_seen": 124016375, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8359375, "step": 5764, "time_per_iteration": 2.6310720443725586 }, { "auxiliary_loss_clip": 0.0114456, "auxiliary_loss_mlp": 0.01035193, "balance_loss_clip": 1.02025044, "balance_loss_mlp": 1.04601324, "epoch": 0.34661055163084326, "flos": 22054215553920.0, "grad_norm": 2.183797019723981, "language_loss": 0.67485708, "learning_rate": 2.927283976435506e-06, "loss": 0.69665456, "num_input_tokens_seen": 124033975, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.80078125, "step": 5765, "time_per_iteration": 2.5507233142852783 }, { "auxiliary_loss_clip": 0.01147212, "auxiliary_loss_mlp": 0.0104164, "balance_loss_clip": 1.02660775, "balance_loss_mlp": 1.04972136, "epoch": 0.3466706748835112, "flos": 21799213925760.0, "grad_norm": 1.7677479379796306, "language_loss": 0.7717573, "learning_rate": 2.926949200757722e-06, "loss": 0.79364586, "num_input_tokens_seen": 124051930, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 5766, "time_per_iteration": 2.678443670272827 }, { "auxiliary_loss_clip": 0.01160734, "auxiliary_loss_mlp": 0.01034588, "balance_loss_clip": 1.02093303, "balance_loss_mlp": 1.04642963, "epoch": 0.3467307981361792, "flos": 19461680547840.0, "grad_norm": 1.468701679499564, "language_loss": 0.73620141, "learning_rate": 2.926614391999505e-06, "loss": 0.75815463, "num_input_tokens_seen": 124071220, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78515625, "step": 5767, "time_per_iteration": 2.60219407081604 }, { "auxiliary_loss_clip": 0.01146342, "auxiliary_loss_mlp": 0.01044932, "balance_loss_clip": 1.02916074, "balance_loss_mlp": 1.04891896, "epoch": 0.34679092138884715, "flos": 24827452905600.0, "grad_norm": 1.8793931186958914, "language_loss": 0.77715826, "learning_rate": 2.926279550172804e-06, "loss": 0.79907107, "num_input_tokens_seen": 124090140, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.79296875, "step": 5768, "time_per_iteration": 2.7397642135620117 }, { "auxiliary_loss_clip": 0.01050529, "auxiliary_loss_mlp": 0.01004031, "balance_loss_clip": 1.00201654, "balance_loss_mlp": 1.01690841, "epoch": 0.3468510446415151, "flos": 63236070149760.0, "grad_norm": 0.76614589001965, "language_loss": 0.57535481, "learning_rate": 2.9259446752895686e-06, "loss": 0.59590042, "num_input_tokens_seen": 124152025, "router_z_loss_clip": 0.0201416, "router_z_loss_mlp": 0.25195312, "step": 5769, "time_per_iteration": 3.2762341499328613 }, { "auxiliary_loss_clip": 0.0116211, "auxiliary_loss_mlp": 0.01038616, "balance_loss_clip": 1.021474, "balance_loss_mlp": 1.04856002, "epoch": 0.3469111678941831, "flos": 12120713256960.0, "grad_norm": 2.45184361692075, "language_loss": 0.85721958, "learning_rate": 2.9256097673617495e-06, "loss": 0.87922686, "num_input_tokens_seen": 124165795, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.87109375, "step": 5770, "time_per_iteration": 2.5591890811920166 }, { "auxiliary_loss_clip": 0.01058442, "auxiliary_loss_mlp": 0.01001855, "balance_loss_clip": 0.99987644, "balance_loss_mlp": 1.01501226, "epoch": 0.34697129114685105, "flos": 65934110378880.0, "grad_norm": 0.773413968151505, "language_loss": 0.59787178, "learning_rate": 2.9252748264012985e-06, "loss": 0.61847472, "num_input_tokens_seen": 124222925, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.25195312, "step": 5771, "time_per_iteration": 2.9816277027130127 }, { "auxiliary_loss_clip": 0.01136323, "auxiliary_loss_mlp": 0.01046164, "balance_loss_clip": 1.03122723, "balance_loss_mlp": 1.0480113, "epoch": 0.347031414399519, "flos": 34454205054720.0, "grad_norm": 1.8990554855615764, "language_loss": 0.71596354, "learning_rate": 2.9249398524201693e-06, "loss": 0.73778844, "num_input_tokens_seen": 124240915, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.79296875, "step": 5772, "time_per_iteration": 2.7284369468688965 }, { "auxiliary_loss_clip": 0.01154227, "auxiliary_loss_mlp": 0.01285058, "balance_loss_clip": 1.02212167, "balance_loss_mlp": 1.04657841, "epoch": 0.347091537652187, "flos": 26944135511040.0, "grad_norm": 1.5752515801343114, "language_loss": 0.76112831, "learning_rate": 2.9246048454303165e-06, "loss": 0.78552115, "num_input_tokens_seen": 124262770, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.80859375, "step": 5773, "time_per_iteration": 2.766697883605957 }, { "auxiliary_loss_clip": 0.01138149, "auxiliary_loss_mlp": 0.01039591, "balance_loss_clip": 1.02330732, "balance_loss_mlp": 1.04683113, "epoch": 0.34715166090485494, "flos": 21142228216320.0, "grad_norm": 2.6441315971300865, "language_loss": 0.70082998, "learning_rate": 2.9242698054436942e-06, "loss": 0.72260731, "num_input_tokens_seen": 124280950, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.82421875, "step": 5774, "time_per_iteration": 2.612830638885498 }, { "auxiliary_loss_clip": 0.01144652, "auxiliary_loss_mlp": 0.01285025, "balance_loss_clip": 1.02322221, "balance_loss_mlp": 1.04880512, "epoch": 0.3472117841575229, "flos": 23478001827840.0, "grad_norm": 1.527209484492484, "language_loss": 0.75812268, "learning_rate": 2.9239347324722605e-06, "loss": 0.78241944, "num_input_tokens_seen": 124299540, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78125, "step": 5775, "time_per_iteration": 2.730837821960449 }, { "auxiliary_loss_clip": 0.01150101, "auxiliary_loss_mlp": 0.01040311, "balance_loss_clip": 1.02397895, "balance_loss_mlp": 1.04821587, "epoch": 0.34727190741019087, "flos": 17492806408320.0, "grad_norm": 2.1722522924105845, "language_loss": 0.77555561, "learning_rate": 2.923599626527973e-06, "loss": 0.79745972, "num_input_tokens_seen": 124316285, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.83984375, "step": 5776, "time_per_iteration": 2.539893388748169 }, { "auxiliary_loss_clip": 0.01055239, "auxiliary_loss_mlp": 0.01001306, "balance_loss_clip": 0.99918425, "balance_loss_mlp": 1.01267886, "epoch": 0.34733203066285884, "flos": 65265491640960.0, "grad_norm": 0.8376912553774165, "language_loss": 0.63316292, "learning_rate": 2.9232644876227904e-06, "loss": 0.65372837, "num_input_tokens_seen": 124376650, "router_z_loss_clip": 0.02124023, "router_z_loss_mlp": 0.25, "step": 5777, "time_per_iteration": 3.2065482139587402 }, { "auxiliary_loss_clip": 0.01155395, "auxiliary_loss_mlp": 0.01038442, "balance_loss_clip": 1.02277827, "balance_loss_mlp": 1.0470829, "epoch": 0.3473921539155268, "flos": 28658726294400.0, "grad_norm": 2.0948779258064443, "language_loss": 0.64603424, "learning_rate": 2.9229293157686732e-06, "loss": 0.66797256, "num_input_tokens_seen": 124396475, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8125, "step": 5778, "time_per_iteration": 2.7390835285186768 }, { "auxiliary_loss_clip": 0.01147452, "auxiliary_loss_mlp": 0.01046281, "balance_loss_clip": 1.03098607, "balance_loss_mlp": 1.0469861, "epoch": 0.3474522771681948, "flos": 40836895355520.0, "grad_norm": 1.8416619692647884, "language_loss": 0.71491849, "learning_rate": 2.9225941109775825e-06, "loss": 0.73685586, "num_input_tokens_seen": 124416480, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.82421875, "step": 5779, "time_per_iteration": 2.7611446380615234 }, { "auxiliary_loss_clip": 0.01132099, "auxiliary_loss_mlp": 0.01045425, "balance_loss_clip": 1.03007102, "balance_loss_mlp": 1.04810846, "epoch": 0.3475124004208628, "flos": 24608577381120.0, "grad_norm": 1.8899738817468288, "language_loss": 0.62119317, "learning_rate": 2.9222588732614818e-06, "loss": 0.64296842, "num_input_tokens_seen": 124435950, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.83984375, "step": 5780, "time_per_iteration": 2.667006731033325 }, { "auxiliary_loss_clip": 0.01156331, "auxiliary_loss_mlp": 0.01041173, "balance_loss_clip": 1.02630758, "balance_loss_mlp": 1.0480603, "epoch": 0.34757252367353075, "flos": 22711309004160.0, "grad_norm": 1.650266676916419, "language_loss": 0.71884859, "learning_rate": 2.921923602632333e-06, "loss": 0.74082369, "num_input_tokens_seen": 124455410, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.81640625, "step": 5781, "time_per_iteration": 2.5763587951660156 }, { "auxiliary_loss_clip": 0.01174801, "auxiliary_loss_mlp": 0.01042667, "balance_loss_clip": 1.02682424, "balance_loss_mlp": 1.04786575, "epoch": 0.3476326469261987, "flos": 19828184970240.0, "grad_norm": 1.9118376059345288, "language_loss": 0.76669055, "learning_rate": 2.9215882991021036e-06, "loss": 0.78886521, "num_input_tokens_seen": 124474870, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.828125, "step": 5782, "time_per_iteration": 2.645547389984131 }, { "auxiliary_loss_clip": 0.01147939, "auxiliary_loss_mlp": 0.01037037, "balance_loss_clip": 1.0217545, "balance_loss_mlp": 1.04974532, "epoch": 0.3476927701788667, "flos": 19938107566080.0, "grad_norm": 1.8688721171373919, "language_loss": 0.62587661, "learning_rate": 2.9212529626827582e-06, "loss": 0.64772636, "num_input_tokens_seen": 124494105, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8046875, "step": 5783, "time_per_iteration": 2.559016704559326 }, { "auxiliary_loss_clip": 0.01159955, "auxiliary_loss_mlp": 0.01032161, "balance_loss_clip": 1.01815355, "balance_loss_mlp": 1.04731119, "epoch": 0.34775289343153465, "flos": 20735108490240.0, "grad_norm": 1.6937975975411785, "language_loss": 0.89141417, "learning_rate": 2.9209175933862636e-06, "loss": 0.91333538, "num_input_tokens_seen": 124512030, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.76953125, "step": 5784, "time_per_iteration": 2.5761866569519043 }, { "auxiliary_loss_clip": 0.01162781, "auxiliary_loss_mlp": 0.01037533, "balance_loss_clip": 1.02288246, "balance_loss_mlp": 1.04829359, "epoch": 0.3478130166842026, "flos": 19354846521600.0, "grad_norm": 1.6529628203894577, "language_loss": 0.81464612, "learning_rate": 2.92058219122459e-06, "loss": 0.83664924, "num_input_tokens_seen": 124530980, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78515625, "step": 5785, "time_per_iteration": 2.5675816535949707 }, { "auxiliary_loss_clip": 0.01148774, "auxiliary_loss_mlp": 0.01034237, "balance_loss_clip": 1.02012229, "balance_loss_mlp": 1.0520401, "epoch": 0.3478731399368706, "flos": 22051198811520.0, "grad_norm": 1.8308440804284787, "language_loss": 0.80890036, "learning_rate": 2.9202467562097052e-06, "loss": 0.83073044, "num_input_tokens_seen": 124549330, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78515625, "step": 5786, "time_per_iteration": 2.592848777770996 }, { "auxiliary_loss_clip": 0.01126402, "auxiliary_loss_mlp": 0.0128526, "balance_loss_clip": 1.02236819, "balance_loss_mlp": 1.04878807, "epoch": 0.34793326318953854, "flos": 18041449720320.0, "grad_norm": 1.602778686814133, "language_loss": 0.7496835, "learning_rate": 2.9199112883535813e-06, "loss": 0.77380013, "num_input_tokens_seen": 124567200, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.77734375, "step": 5787, "time_per_iteration": 2.5079779624938965 }, { "auxiliary_loss_clip": 0.01145218, "auxiliary_loss_mlp": 0.01282156, "balance_loss_clip": 1.01976109, "balance_loss_mlp": 1.04670084, "epoch": 0.3479933864422065, "flos": 29314670509440.0, "grad_norm": 1.659906044437864, "language_loss": 0.81495911, "learning_rate": 2.919575787668189e-06, "loss": 0.83923286, "num_input_tokens_seen": 124587025, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 5788, "time_per_iteration": 2.6770179271698 }, { "auxiliary_loss_clip": 0.0113262, "auxiliary_loss_mlp": 0.01037721, "balance_loss_clip": 1.02181864, "balance_loss_mlp": 1.04890692, "epoch": 0.3480535096948745, "flos": 20120713332480.0, "grad_norm": 1.8070620133158906, "language_loss": 0.85532737, "learning_rate": 2.919240254165503e-06, "loss": 0.87703073, "num_input_tokens_seen": 124605860, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8359375, "step": 5789, "time_per_iteration": 2.6003334522247314 }, { "auxiliary_loss_clip": 0.01141566, "auxiliary_loss_mlp": 0.01059211, "balance_loss_clip": 1.04404724, "balance_loss_mlp": 1.04969168, "epoch": 0.34811363294754244, "flos": 18548974938240.0, "grad_norm": 1.781155649259231, "language_loss": 0.84883189, "learning_rate": 2.918904687857497e-06, "loss": 0.87083972, "num_input_tokens_seen": 124624270, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.83203125, "step": 5790, "time_per_iteration": 4.01422381401062 }, { "auxiliary_loss_clip": 0.01139173, "auxiliary_loss_mlp": 0.01046202, "balance_loss_clip": 1.03105688, "balance_loss_mlp": 1.04954517, "epoch": 0.3481737562002104, "flos": 26870303105280.0, "grad_norm": 1.8770347501032731, "language_loss": 0.81850529, "learning_rate": 2.9185690887561463e-06, "loss": 0.84035903, "num_input_tokens_seen": 124644005, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8046875, "step": 5791, "time_per_iteration": 2.6401758193969727 }, { "auxiliary_loss_clip": 0.01138355, "auxiliary_loss_mlp": 0.01039809, "balance_loss_clip": 1.02428222, "balance_loss_mlp": 1.04709458, "epoch": 0.3482338794528784, "flos": 28908664104960.0, "grad_norm": 1.6054362172314, "language_loss": 0.77540505, "learning_rate": 2.918233456873428e-06, "loss": 0.79718673, "num_input_tokens_seen": 124663020, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.82421875, "step": 5792, "time_per_iteration": 2.6416873931884766 }, { "auxiliary_loss_clip": 0.01143752, "auxiliary_loss_mlp": 0.01035166, "balance_loss_clip": 1.0204016, "balance_loss_mlp": 1.04678047, "epoch": 0.3482940027055464, "flos": 22200767043840.0, "grad_norm": 1.4643228035254159, "language_loss": 0.81815916, "learning_rate": 2.9178977922213188e-06, "loss": 0.8399483, "num_input_tokens_seen": 124682975, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 5793, "time_per_iteration": 2.6788361072540283 }, { "auxiliary_loss_clip": 0.01138118, "auxiliary_loss_mlp": 0.01055222, "balance_loss_clip": 1.03970075, "balance_loss_mlp": 1.04836273, "epoch": 0.34835412595821436, "flos": 20302708567680.0, "grad_norm": 1.9137330153368297, "language_loss": 0.76101142, "learning_rate": 2.917562094811799e-06, "loss": 0.7829448, "num_input_tokens_seen": 124701340, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8125, "step": 5794, "time_per_iteration": 3.9285101890563965 }, { "auxiliary_loss_clip": 0.0113684, "auxiliary_loss_mlp": 0.01042936, "balance_loss_clip": 1.02779055, "balance_loss_mlp": 1.04881096, "epoch": 0.3484142492108823, "flos": 20449691020800.0, "grad_norm": 1.6591580370227192, "language_loss": 0.56876123, "learning_rate": 2.917226364656848e-06, "loss": 0.59055901, "num_input_tokens_seen": 124719165, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 5795, "time_per_iteration": 2.614464282989502 }, { "auxiliary_loss_clip": 0.01143425, "auxiliary_loss_mlp": 0.01035168, "balance_loss_clip": 1.02076149, "balance_loss_mlp": 1.04770422, "epoch": 0.3484743724635503, "flos": 24352929308160.0, "grad_norm": 1.7094821498109956, "language_loss": 0.82273686, "learning_rate": 2.9168906017684474e-06, "loss": 0.84452277, "num_input_tokens_seen": 124738670, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 5796, "time_per_iteration": 4.282655477523804 }, { "auxiliary_loss_clip": 0.01132529, "auxiliary_loss_mlp": 0.01035184, "balance_loss_clip": 1.02065229, "balance_loss_mlp": 1.04740119, "epoch": 0.34853449571621825, "flos": 24353001135360.0, "grad_norm": 2.099485796480769, "language_loss": 0.83431119, "learning_rate": 2.91655480615858e-06, "loss": 0.85598826, "num_input_tokens_seen": 124758760, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.76171875, "step": 5797, "time_per_iteration": 2.5489795207977295 }, { "auxiliary_loss_clip": 0.0112445, "auxiliary_loss_mlp": 0.01039195, "balance_loss_clip": 1.02456808, "balance_loss_mlp": 1.04718947, "epoch": 0.3485946189688862, "flos": 27267690245760.0, "grad_norm": 1.7701267953764193, "language_loss": 0.7318691, "learning_rate": 2.9162189778392286e-06, "loss": 0.75350559, "num_input_tokens_seen": 124777765, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 5798, "time_per_iteration": 4.169783115386963 }, { "auxiliary_loss_clip": 0.01128617, "auxiliary_loss_mlp": 0.01045696, "balance_loss_clip": 1.03094399, "balance_loss_mlp": 1.04772151, "epoch": 0.3486547422215542, "flos": 20156695781760.0, "grad_norm": 1.9408044541507532, "language_loss": 0.75888115, "learning_rate": 2.9158831168223797e-06, "loss": 0.78062427, "num_input_tokens_seen": 124796775, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 5799, "time_per_iteration": 2.5577213764190674 }, { "auxiliary_loss_clip": 0.01180088, "auxiliary_loss_mlp": 0.01038342, "balance_loss_clip": 1.02462077, "balance_loss_mlp": 1.04702163, "epoch": 0.34871486547422215, "flos": 20230348619520.0, "grad_norm": 1.5878775051805667, "language_loss": 0.75635487, "learning_rate": 2.915547223120018e-06, "loss": 0.77853918, "num_input_tokens_seen": 124815825, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7890625, "step": 5800, "time_per_iteration": 2.61702823638916 }, { "auxiliary_loss_clip": 0.01151786, "auxiliary_loss_mlp": 0.01291937, "balance_loss_clip": 1.02857816, "balance_loss_mlp": 1.05085826, "epoch": 0.3487749887268901, "flos": 44053234882560.0, "grad_norm": 1.6890209437024784, "language_loss": 0.66837209, "learning_rate": 2.9152112967441307e-06, "loss": 0.69280934, "num_input_tokens_seen": 124838420, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.828125, "step": 5801, "time_per_iteration": 2.854499101638794 }, { "auxiliary_loss_clip": 0.01146895, "auxiliary_loss_mlp": 0.01047416, "balance_loss_clip": 1.03225267, "balance_loss_mlp": 1.0492171, "epoch": 0.3488351119795581, "flos": 23295144666240.0, "grad_norm": 3.147993971815607, "language_loss": 0.76407015, "learning_rate": 2.9148753377067063e-06, "loss": 0.78601331, "num_input_tokens_seen": 124857320, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.796875, "step": 5802, "time_per_iteration": 2.582379102706909 }, { "auxiliary_loss_clip": 0.01174515, "auxiliary_loss_mlp": 0.0103917, "balance_loss_clip": 1.02466869, "balance_loss_mlp": 1.04507768, "epoch": 0.34889523523222604, "flos": 19934839428480.0, "grad_norm": 1.835638405131294, "language_loss": 0.7918303, "learning_rate": 2.9145393460197346e-06, "loss": 0.81396723, "num_input_tokens_seen": 124875685, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75390625, "step": 5803, "time_per_iteration": 2.7039384841918945 }, { "auxiliary_loss_clip": 0.01144645, "auxiliary_loss_mlp": 0.01290578, "balance_loss_clip": 1.02868617, "balance_loss_mlp": 1.04479325, "epoch": 0.348955358484894, "flos": 30446179816320.0, "grad_norm": 1.7516354759682398, "language_loss": 0.67728341, "learning_rate": 2.914203321695206e-06, "loss": 0.70163572, "num_input_tokens_seen": 124895960, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8203125, "step": 5804, "time_per_iteration": 2.709275960922241 }, { "auxiliary_loss_clip": 0.01139958, "auxiliary_loss_mlp": 0.01047078, "balance_loss_clip": 1.03233206, "balance_loss_mlp": 1.04722214, "epoch": 0.349015481737562, "flos": 17999972490240.0, "grad_norm": 1.841470485748705, "language_loss": 0.76128018, "learning_rate": 2.913867264745113e-06, "loss": 0.78315055, "num_input_tokens_seen": 124914140, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.75390625, "step": 5805, "time_per_iteration": 2.6327316761016846 }, { "auxiliary_loss_clip": 0.01152269, "auxiliary_loss_mlp": 0.01039202, "balance_loss_clip": 1.02396119, "balance_loss_mlp": 1.04667497, "epoch": 0.34907560499023, "flos": 27198490694400.0, "grad_norm": 2.8769017715437193, "language_loss": 0.66661334, "learning_rate": 2.913531175181448e-06, "loss": 0.68852806, "num_input_tokens_seen": 124934180, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78515625, "step": 5806, "time_per_iteration": 2.6809051036834717 }, { "auxiliary_loss_clip": 0.01138259, "auxiliary_loss_mlp": 0.01041935, "balance_loss_clip": 1.02691483, "balance_loss_mlp": 1.05015171, "epoch": 0.34913572824289796, "flos": 30226873328640.0, "grad_norm": 1.5635607704832433, "language_loss": 0.71798378, "learning_rate": 2.913195053016205e-06, "loss": 0.73978573, "num_input_tokens_seen": 124956060, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 5807, "time_per_iteration": 2.663007974624634 }, { "auxiliary_loss_clip": 0.01126262, "auxiliary_loss_mlp": 0.01037928, "balance_loss_clip": 1.02231145, "balance_loss_mlp": 1.04479551, "epoch": 0.3491958514955659, "flos": 29971907614080.0, "grad_norm": 1.8816089876114774, "language_loss": 0.73678517, "learning_rate": 2.9128588982613794e-06, "loss": 0.75842708, "num_input_tokens_seen": 124976070, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8125, "step": 5808, "time_per_iteration": 2.5671181678771973 }, { "auxiliary_loss_clip": 0.01124886, "auxiliary_loss_mlp": 0.01045975, "balance_loss_clip": 1.03153849, "balance_loss_mlp": 1.04885614, "epoch": 0.3492559747482339, "flos": 22783273902720.0, "grad_norm": 1.7291319108375993, "language_loss": 0.84713674, "learning_rate": 2.912522710928968e-06, "loss": 0.86884528, "num_input_tokens_seen": 124996995, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 5809, "time_per_iteration": 2.5587446689605713 }, { "auxiliary_loss_clip": 0.01138928, "auxiliary_loss_mlp": 0.01035271, "balance_loss_clip": 1.0217762, "balance_loss_mlp": 1.04711246, "epoch": 0.34931609800090185, "flos": 26068022881920.0, "grad_norm": 1.961134484336163, "language_loss": 0.80589283, "learning_rate": 2.912186491030968e-06, "loss": 0.82763481, "num_input_tokens_seen": 125015600, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 5810, "time_per_iteration": 2.6076581478118896 }, { "auxiliary_loss_clip": 0.01125268, "auxiliary_loss_mlp": 0.01044399, "balance_loss_clip": 1.03060031, "balance_loss_mlp": 1.04629314, "epoch": 0.3493762212535698, "flos": 29242023252480.0, "grad_norm": 1.9294323961316329, "language_loss": 0.75277579, "learning_rate": 2.911850238579379e-06, "loss": 0.77447248, "num_input_tokens_seen": 125035290, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7890625, "step": 5811, "time_per_iteration": 2.6379177570343018 }, { "auxiliary_loss_clip": 0.01145821, "auxiliary_loss_mlp": 0.0103627, "balance_loss_clip": 1.0216136, "balance_loss_mlp": 1.04677832, "epoch": 0.3494363445062378, "flos": 27126058919040.0, "grad_norm": 1.5679862089008823, "language_loss": 0.80065906, "learning_rate": 2.9115139535862003e-06, "loss": 0.82247996, "num_input_tokens_seen": 125057130, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 5812, "time_per_iteration": 2.6151938438415527 }, { "auxiliary_loss_clip": 0.01167842, "auxiliary_loss_mlp": 0.01035846, "balance_loss_clip": 1.02095711, "balance_loss_mlp": 1.04318202, "epoch": 0.34949646775890575, "flos": 12276207233280.0, "grad_norm": 1.842979307099755, "language_loss": 0.69427341, "learning_rate": 2.9111776360634334e-06, "loss": 0.71631026, "num_input_tokens_seen": 125073720, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.796875, "step": 5813, "time_per_iteration": 2.5992798805236816 }, { "auxiliary_loss_clip": 0.01139508, "auxiliary_loss_mlp": 0.01282004, "balance_loss_clip": 1.01984143, "balance_loss_mlp": 1.04494536, "epoch": 0.3495565910115737, "flos": 17165516659200.0, "grad_norm": 2.1077560110972757, "language_loss": 0.76130986, "learning_rate": 2.9108412860230806e-06, "loss": 0.78552496, "num_input_tokens_seen": 125090635, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.765625, "step": 5814, "time_per_iteration": 2.699633836746216 }, { "auxiliary_loss_clip": 0.01144338, "auxiliary_loss_mlp": 0.01288706, "balance_loss_clip": 1.02560782, "balance_loss_mlp": 1.04573059, "epoch": 0.3496167142642417, "flos": 26465661417600.0, "grad_norm": 1.8375616497283866, "language_loss": 0.84543753, "learning_rate": 2.910504903477145e-06, "loss": 0.86976796, "num_input_tokens_seen": 125110070, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8046875, "step": 5815, "time_per_iteration": 2.634552478790283 }, { "auxiliary_loss_clip": 0.01130514, "auxiliary_loss_mlp": 0.01031161, "balance_loss_clip": 1.01746964, "balance_loss_mlp": 1.04287243, "epoch": 0.34967683751690964, "flos": 17414843938560.0, "grad_norm": 1.939318504591813, "language_loss": 0.76997513, "learning_rate": 2.910168488437632e-06, "loss": 0.79159188, "num_input_tokens_seen": 125125730, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78515625, "step": 5816, "time_per_iteration": 2.616173267364502 }, { "auxiliary_loss_clip": 0.01137864, "auxiliary_loss_mlp": 0.01041811, "balance_loss_clip": 1.02697539, "balance_loss_mlp": 1.04778695, "epoch": 0.3497369607695776, "flos": 22600021691520.0, "grad_norm": 1.9236747233916762, "language_loss": 0.58564162, "learning_rate": 2.9098320409165462e-06, "loss": 0.60743833, "num_input_tokens_seen": 125146195, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 5817, "time_per_iteration": 2.6159961223602295 }, { "auxiliary_loss_clip": 0.01044387, "auxiliary_loss_mlp": 0.01004064, "balance_loss_clip": 1.00197744, "balance_loss_mlp": 1.01937413, "epoch": 0.34979708402224563, "flos": 68529374818560.0, "grad_norm": 0.8694257909469254, "language_loss": 0.59855956, "learning_rate": 2.9094955609258954e-06, "loss": 0.61904407, "num_input_tokens_seen": 125207790, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.25, "step": 5818, "time_per_iteration": 3.345113515853882 }, { "auxiliary_loss_clip": 0.01148501, "auxiliary_loss_mlp": 0.01037104, "balance_loss_clip": 1.02257276, "balance_loss_mlp": 1.04543781, "epoch": 0.3498572072749136, "flos": 18989634988800.0, "grad_norm": 1.6692992500847046, "language_loss": 0.83492815, "learning_rate": 2.909159048477688e-06, "loss": 0.85678422, "num_input_tokens_seen": 125226220, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 5819, "time_per_iteration": 2.6423094272613525 }, { "auxiliary_loss_clip": 0.01142867, "auxiliary_loss_mlp": 0.01034146, "balance_loss_clip": 1.01963854, "balance_loss_mlp": 1.04639542, "epoch": 0.34991733052758156, "flos": 27818883423360.0, "grad_norm": 1.6989033761474066, "language_loss": 0.70849001, "learning_rate": 2.9088225035839327e-06, "loss": 0.73026007, "num_input_tokens_seen": 125247485, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78515625, "step": 5820, "time_per_iteration": 2.7383038997650146 }, { "auxiliary_loss_clip": 0.01134908, "auxiliary_loss_mlp": 0.01034059, "balance_loss_clip": 1.02018905, "balance_loss_mlp": 1.04675126, "epoch": 0.3499774537802495, "flos": 33584197737600.0, "grad_norm": 1.6814405057140418, "language_loss": 0.70492709, "learning_rate": 2.9084859262566397e-06, "loss": 0.72661674, "num_input_tokens_seen": 125268625, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.79296875, "step": 5821, "time_per_iteration": 2.7366325855255127 }, { "auxiliary_loss_clip": 0.01140046, "auxiliary_loss_mlp": 0.0104402, "balance_loss_clip": 1.02786684, "balance_loss_mlp": 1.04727864, "epoch": 0.3500375770329175, "flos": 23476744851840.0, "grad_norm": 2.5191637970954575, "language_loss": 0.737679, "learning_rate": 2.9081493165078216e-06, "loss": 0.75951964, "num_input_tokens_seen": 125287530, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8359375, "step": 5822, "time_per_iteration": 2.6517584323883057 }, { "auxiliary_loss_clip": 0.01144312, "auxiliary_loss_mlp": 0.01035625, "balance_loss_clip": 1.01899529, "balance_loss_mlp": 1.04595792, "epoch": 0.35009770028558546, "flos": 19026048401280.0, "grad_norm": 1.641186041626141, "language_loss": 0.78341615, "learning_rate": 2.907812674349489e-06, "loss": 0.80521548, "num_input_tokens_seen": 125307020, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8046875, "step": 5823, "time_per_iteration": 2.5871715545654297 }, { "auxiliary_loss_clip": 0.0105053, "auxiliary_loss_mlp": 0.01007256, "balance_loss_clip": 1.0051465, "balance_loss_mlp": 1.01673949, "epoch": 0.3501578235382534, "flos": 68351868783360.0, "grad_norm": 0.7161991344940886, "language_loss": 0.59237218, "learning_rate": 2.907475999793659e-06, "loss": 0.61295003, "num_input_tokens_seen": 125370445, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.25, "step": 5824, "time_per_iteration": 3.2349367141723633 }, { "auxiliary_loss_clip": 0.01143206, "auxiliary_loss_mlp": 0.01039379, "balance_loss_clip": 1.02364373, "balance_loss_mlp": 1.04577553, "epoch": 0.3502179467909214, "flos": 21250893836160.0, "grad_norm": 2.030923333663862, "language_loss": 0.84965056, "learning_rate": 2.9071392928523433e-06, "loss": 0.87147641, "num_input_tokens_seen": 125388900, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.80078125, "step": 5825, "time_per_iteration": 2.6761670112609863 }, { "auxiliary_loss_clip": 0.01134297, "auxiliary_loss_mlp": 0.01285085, "balance_loss_clip": 1.02278042, "balance_loss_mlp": 1.04665959, "epoch": 0.35027807004358935, "flos": 11942955826560.0, "grad_norm": 2.749825811526179, "language_loss": 0.83178949, "learning_rate": 2.9068025535375603e-06, "loss": 0.85598326, "num_input_tokens_seen": 125402675, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7890625, "step": 5826, "time_per_iteration": 2.530545234680176 }, { "auxiliary_loss_clip": 0.01145981, "auxiliary_loss_mlp": 0.01042291, "balance_loss_clip": 1.02722287, "balance_loss_mlp": 1.04739475, "epoch": 0.3503381932962573, "flos": 21470918595840.0, "grad_norm": 1.6607320210913203, "language_loss": 0.8089757, "learning_rate": 2.9064657818613274e-06, "loss": 0.83085835, "num_input_tokens_seen": 125421360, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 5827, "time_per_iteration": 2.702723979949951 }, { "auxiliary_loss_clip": 0.01162464, "auxiliary_loss_mlp": 0.01036498, "balance_loss_clip": 1.02197814, "balance_loss_mlp": 1.04785347, "epoch": 0.3503983165489253, "flos": 21251109317760.0, "grad_norm": 2.2311231983368454, "language_loss": 0.70849311, "learning_rate": 2.906128977835661e-06, "loss": 0.7304827, "num_input_tokens_seen": 125440000, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 5828, "time_per_iteration": 2.598454236984253 }, { "auxiliary_loss_clip": 0.01138793, "auxiliary_loss_mlp": 0.01047428, "balance_loss_clip": 1.02992797, "balance_loss_mlp": 1.04779601, "epoch": 0.35045843980159325, "flos": 27815723026560.0, "grad_norm": 1.7163907770884979, "language_loss": 0.79638505, "learning_rate": 2.9057921414725838e-06, "loss": 0.81824726, "num_input_tokens_seen": 125460390, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.8203125, "step": 5829, "time_per_iteration": 2.715693712234497 }, { "auxiliary_loss_clip": 0.01157578, "auxiliary_loss_mlp": 0.01045278, "balance_loss_clip": 1.02874398, "balance_loss_mlp": 1.04681456, "epoch": 0.3505185630542612, "flos": 25921148169600.0, "grad_norm": 1.7001687508026033, "language_loss": 0.72412384, "learning_rate": 2.9054552727841136e-06, "loss": 0.7461524, "num_input_tokens_seen": 125478410, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8359375, "step": 5830, "time_per_iteration": 2.580390214920044 }, { "auxiliary_loss_clip": 0.01149852, "auxiliary_loss_mlp": 0.01035085, "balance_loss_clip": 1.01941502, "balance_loss_mlp": 1.04413843, "epoch": 0.35057868630692923, "flos": 20521763660160.0, "grad_norm": 2.124840775109101, "language_loss": 0.88326085, "learning_rate": 2.905118371782275e-06, "loss": 0.90511024, "num_input_tokens_seen": 125495975, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.78515625, "step": 5831, "time_per_iteration": 3.9727585315704346 }, { "auxiliary_loss_clip": 0.01161937, "auxiliary_loss_mlp": 0.01045667, "balance_loss_clip": 1.03014541, "balance_loss_mlp": 1.04575908, "epoch": 0.3506388095595972, "flos": 20448649526400.0, "grad_norm": 1.793515482457443, "language_loss": 0.78346866, "learning_rate": 2.9047814384790894e-06, "loss": 0.80554467, "num_input_tokens_seen": 125515035, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.80078125, "step": 5832, "time_per_iteration": 2.6434695720672607 }, { "auxiliary_loss_clip": 0.01153371, "auxiliary_loss_mlp": 0.01040391, "balance_loss_clip": 1.02436948, "balance_loss_mlp": 1.04549599, "epoch": 0.35069893281226516, "flos": 23109665811840.0, "grad_norm": 1.7326212280674507, "language_loss": 0.70706391, "learning_rate": 2.9044444728865814e-06, "loss": 0.72900152, "num_input_tokens_seen": 125535555, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.80859375, "step": 5833, "time_per_iteration": 2.607083320617676 }, { "auxiliary_loss_clip": 0.0112903, "auxiliary_loss_mlp": 0.01033192, "balance_loss_clip": 1.01858258, "balance_loss_mlp": 1.04413795, "epoch": 0.35075905606493313, "flos": 27271999877760.0, "grad_norm": 1.4568227952090642, "language_loss": 0.80804121, "learning_rate": 2.904107475016777e-06, "loss": 0.8296634, "num_input_tokens_seen": 125558195, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7578125, "step": 5834, "time_per_iteration": 2.6357083320617676 }, { "auxiliary_loss_clip": 0.01161679, "auxiliary_loss_mlp": 0.01039907, "balance_loss_clip": 1.02414751, "balance_loss_mlp": 1.04610515, "epoch": 0.3508191793176011, "flos": 19128608709120.0, "grad_norm": 1.9558079025391757, "language_loss": 0.8357085, "learning_rate": 2.903770444881702e-06, "loss": 0.85772443, "num_input_tokens_seen": 125575375, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.79296875, "step": 5835, "time_per_iteration": 2.5631563663482666 }, { "auxiliary_loss_clip": 0.01157039, "auxiliary_loss_mlp": 0.01040811, "balance_loss_clip": 1.02598143, "balance_loss_mlp": 1.04248452, "epoch": 0.35087930257026906, "flos": 25557588662400.0, "grad_norm": 1.4257602815000134, "language_loss": 0.76681322, "learning_rate": 2.903433382493386e-06, "loss": 0.78879172, "num_input_tokens_seen": 125596745, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78125, "step": 5836, "time_per_iteration": 4.054614782333374 }, { "auxiliary_loss_clip": 0.01154179, "auxiliary_loss_mlp": 0.01043679, "balance_loss_clip": 1.02828956, "balance_loss_mlp": 1.04837394, "epoch": 0.350939425822937, "flos": 18004246208640.0, "grad_norm": 1.9425805635402558, "language_loss": 0.77210641, "learning_rate": 2.903096287863855e-06, "loss": 0.79408503, "num_input_tokens_seen": 125613980, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.7890625, "step": 5837, "time_per_iteration": 4.075535774230957 }, { "auxiliary_loss_clip": 0.01132109, "auxiliary_loss_mlp": 0.01045364, "balance_loss_clip": 1.02973533, "balance_loss_mlp": 1.04391503, "epoch": 0.350999549075605, "flos": 22273198819200.0, "grad_norm": 2.394214894120978, "language_loss": 0.67785966, "learning_rate": 2.902759161005141e-06, "loss": 0.69963443, "num_input_tokens_seen": 125632100, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.7890625, "step": 5838, "time_per_iteration": 2.5564630031585693 }, { "auxiliary_loss_clip": 0.01122818, "auxiliary_loss_mlp": 0.01037106, "balance_loss_clip": 1.0209651, "balance_loss_mlp": 1.04371595, "epoch": 0.35105967232827295, "flos": 14392279307520.0, "grad_norm": 1.8826784334245499, "language_loss": 0.83134812, "learning_rate": 2.9024220019292752e-06, "loss": 0.85294735, "num_input_tokens_seen": 125649190, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.7890625, "step": 5839, "time_per_iteration": 2.6572959423065186 }, { "auxiliary_loss_clip": 0.0113574, "auxiliary_loss_mlp": 0.0104242, "balance_loss_clip": 1.02633834, "balance_loss_mlp": 1.04396665, "epoch": 0.3511197955809409, "flos": 25082346792960.0, "grad_norm": 2.078155703847341, "language_loss": 0.59149843, "learning_rate": 2.902084810648289e-06, "loss": 0.61328006, "num_input_tokens_seen": 125668680, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.82421875, "step": 5840, "time_per_iteration": 4.05876350402832 }, { "auxiliary_loss_clip": 0.01153945, "auxiliary_loss_mlp": 0.01041936, "balance_loss_clip": 1.02561593, "balance_loss_mlp": 1.04688251, "epoch": 0.3511799188336089, "flos": 25884160139520.0, "grad_norm": 2.7443655516493033, "language_loss": 0.8727141, "learning_rate": 2.901747587174216e-06, "loss": 0.89467293, "num_input_tokens_seen": 125686935, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8046875, "step": 5841, "time_per_iteration": 2.6056180000305176 }, { "auxiliary_loss_clip": 0.01148148, "auxiliary_loss_mlp": 0.01042115, "balance_loss_clip": 1.0245316, "balance_loss_mlp": 1.04605627, "epoch": 0.35124004208627685, "flos": 20083725302400.0, "grad_norm": 1.7317476500143305, "language_loss": 0.75168991, "learning_rate": 2.9014103315190916e-06, "loss": 0.77359247, "num_input_tokens_seen": 125707180, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.84375, "step": 5842, "time_per_iteration": 2.634665012359619 }, { "auxiliary_loss_clip": 0.01135426, "auxiliary_loss_mlp": 0.01044142, "balance_loss_clip": 1.02820981, "balance_loss_mlp": 1.04429305, "epoch": 0.3513001653389448, "flos": 17783431349760.0, "grad_norm": 1.8714893068587137, "language_loss": 0.68223369, "learning_rate": 2.9010730436949514e-06, "loss": 0.70402932, "num_input_tokens_seen": 125722780, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8203125, "step": 5843, "time_per_iteration": 2.501232624053955 }, { "auxiliary_loss_clip": 0.01124683, "auxiliary_loss_mlp": 0.01041434, "balance_loss_clip": 1.02567482, "balance_loss_mlp": 1.04612911, "epoch": 0.3513602885916128, "flos": 29387138198400.0, "grad_norm": 2.156812517302681, "language_loss": 0.65105152, "learning_rate": 2.900735723713832e-06, "loss": 0.67271262, "num_input_tokens_seen": 125742110, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.78515625, "step": 5844, "time_per_iteration": 2.6896872520446777 }, { "auxiliary_loss_clip": 0.01133757, "auxiliary_loss_mlp": 0.01045064, "balance_loss_clip": 1.02843475, "balance_loss_mlp": 1.04485846, "epoch": 0.3514204118442808, "flos": 16179876483840.0, "grad_norm": 2.020687730481144, "language_loss": 0.75172102, "learning_rate": 2.9003983715877713e-06, "loss": 0.77350914, "num_input_tokens_seen": 125759980, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.80078125, "step": 5845, "time_per_iteration": 2.5416676998138428 }, { "auxiliary_loss_clip": 0.01131836, "auxiliary_loss_mlp": 0.01036172, "balance_loss_clip": 1.02125859, "balance_loss_mlp": 1.04574323, "epoch": 0.35148053509694877, "flos": 23834665923840.0, "grad_norm": 2.8216828024615643, "language_loss": 0.73267847, "learning_rate": 2.9000609873288085e-06, "loss": 0.75435853, "num_input_tokens_seen": 125772660, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7734375, "step": 5846, "time_per_iteration": 2.670722246170044 }, { "auxiliary_loss_clip": 0.01154658, "auxiliary_loss_mlp": 0.01037924, "balance_loss_clip": 1.02259409, "balance_loss_mlp": 1.04689574, "epoch": 0.35154065834961673, "flos": 20991295267200.0, "grad_norm": 1.670346725703405, "language_loss": 0.75203127, "learning_rate": 2.8997235709489845e-06, "loss": 0.77395713, "num_input_tokens_seen": 125791935, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8046875, "step": 5847, "time_per_iteration": 2.5604500770568848 }, { "auxiliary_loss_clip": 0.01141829, "auxiliary_loss_mlp": 0.01035182, "balance_loss_clip": 1.01981556, "balance_loss_mlp": 1.04381824, "epoch": 0.3516007816022847, "flos": 33255471444480.0, "grad_norm": 1.8347434943277816, "language_loss": 0.72797686, "learning_rate": 2.8993861224603412e-06, "loss": 0.74974704, "num_input_tokens_seen": 125813455, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.80078125, "step": 5848, "time_per_iteration": 2.698700189590454 }, { "auxiliary_loss_clip": 0.01146386, "auxiliary_loss_mlp": 0.01043212, "balance_loss_clip": 1.02655268, "balance_loss_mlp": 1.04629135, "epoch": 0.35166090485495266, "flos": 11726953390080.0, "grad_norm": 2.3876019921027347, "language_loss": 0.89663565, "learning_rate": 2.8990486418749205e-06, "loss": 0.91853172, "num_input_tokens_seen": 125827660, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8203125, "step": 5849, "time_per_iteration": 2.5117111206054688 }, { "auxiliary_loss_clip": 0.01141123, "auxiliary_loss_mlp": 0.01035882, "balance_loss_clip": 1.02055168, "balance_loss_mlp": 1.04368103, "epoch": 0.3517210281076206, "flos": 22638446265600.0, "grad_norm": 5.727294312456022, "language_loss": 0.75030732, "learning_rate": 2.8987111292047663e-06, "loss": 0.77207744, "num_input_tokens_seen": 125846655, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.796875, "step": 5850, "time_per_iteration": 2.642728567123413 }, { "auxiliary_loss_clip": 0.0113065, "auxiliary_loss_mlp": 0.01038064, "balance_loss_clip": 1.02229905, "balance_loss_mlp": 1.04427779, "epoch": 0.3517811513602886, "flos": 21322750993920.0, "grad_norm": 1.5896307683472184, "language_loss": 0.75688833, "learning_rate": 2.898373584461924e-06, "loss": 0.77857542, "num_input_tokens_seen": 125866290, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.77734375, "step": 5851, "time_per_iteration": 2.7095556259155273 }, { "auxiliary_loss_clip": 0.01144973, "auxiliary_loss_mlp": 0.01036525, "balance_loss_clip": 1.02014601, "balance_loss_mlp": 1.04661119, "epoch": 0.35184127461295656, "flos": 21032880238080.0, "grad_norm": 2.0087500166561845, "language_loss": 0.87388134, "learning_rate": 2.8980360076584384e-06, "loss": 0.89569628, "num_input_tokens_seen": 125884620, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8046875, "step": 5852, "time_per_iteration": 2.627750873565674 }, { "auxiliary_loss_clip": 0.01121194, "auxiliary_loss_mlp": 0.01034365, "balance_loss_clip": 1.01959515, "balance_loss_mlp": 1.04319358, "epoch": 0.3519013978656245, "flos": 22455265881600.0, "grad_norm": 4.413094357911882, "language_loss": 0.68214786, "learning_rate": 2.8976983988063586e-06, "loss": 0.7037034, "num_input_tokens_seen": 125902430, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78125, "step": 5853, "time_per_iteration": 2.5274648666381836 }, { "auxiliary_loss_clip": 0.01143488, "auxiliary_loss_mlp": 0.0103895, "balance_loss_clip": 1.02370322, "balance_loss_mlp": 1.04453683, "epoch": 0.3519615211182925, "flos": 13115295918720.0, "grad_norm": 1.8605441904107007, "language_loss": 0.80661136, "learning_rate": 2.8973607579177317e-06, "loss": 0.82843572, "num_input_tokens_seen": 125920570, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80859375, "step": 5854, "time_per_iteration": 2.583040475845337 }, { "auxiliary_loss_clip": 0.01138343, "auxiliary_loss_mlp": 0.01031432, "balance_loss_clip": 1.01729333, "balance_loss_mlp": 1.04388213, "epoch": 0.35202164437096045, "flos": 19135144984320.0, "grad_norm": 1.562993286723334, "language_loss": 0.73034471, "learning_rate": 2.8970230850046076e-06, "loss": 0.75204247, "num_input_tokens_seen": 125939800, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 5855, "time_per_iteration": 2.6394596099853516 }, { "auxiliary_loss_clip": 0.01120965, "auxiliary_loss_mlp": 0.01036588, "balance_loss_clip": 1.02214611, "balance_loss_mlp": 1.0434047, "epoch": 0.3520817676236284, "flos": 26542187343360.0, "grad_norm": 1.986662313602211, "language_loss": 0.70569837, "learning_rate": 2.896685380079037e-06, "loss": 0.72727394, "num_input_tokens_seen": 125958720, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 5856, "time_per_iteration": 2.6128199100494385 }, { "auxiliary_loss_clip": 0.01134925, "auxiliary_loss_mlp": 0.01045971, "balance_loss_clip": 1.02978206, "balance_loss_mlp": 1.04490578, "epoch": 0.3521418908762964, "flos": 44893472803200.0, "grad_norm": 1.5904151558134665, "language_loss": 0.61197609, "learning_rate": 2.896347643153072e-06, "loss": 0.63378507, "num_input_tokens_seen": 125984310, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.80859375, "step": 5857, "time_per_iteration": 2.8087596893310547 }, { "auxiliary_loss_clip": 0.01131763, "auxiliary_loss_mlp": 0.01039917, "balance_loss_clip": 1.02446747, "balance_loss_mlp": 1.04354763, "epoch": 0.3522020141289644, "flos": 20187398931840.0, "grad_norm": 1.822142794618324, "language_loss": 0.73222339, "learning_rate": 2.896009874238765e-06, "loss": 0.75394017, "num_input_tokens_seen": 126002410, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.796875, "step": 5858, "time_per_iteration": 2.591140031814575 }, { "auxiliary_loss_clip": 0.01126733, "auxiliary_loss_mlp": 0.0104116, "balance_loss_clip": 1.02585316, "balance_loss_mlp": 1.04514492, "epoch": 0.35226213738163237, "flos": 27563917708800.0, "grad_norm": 1.4659215145559843, "language_loss": 0.76033121, "learning_rate": 2.8956720733481707e-06, "loss": 0.78201008, "num_input_tokens_seen": 126022490, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.81640625, "step": 5859, "time_per_iteration": 2.6407699584960938 }, { "auxiliary_loss_clip": 0.01147727, "auxiliary_loss_mlp": 0.01042487, "balance_loss_clip": 1.02569032, "balance_loss_mlp": 1.0464797, "epoch": 0.35232226063430033, "flos": 22966310632320.0, "grad_norm": 1.8603778410284377, "language_loss": 0.71890843, "learning_rate": 2.895334240493344e-06, "loss": 0.74081057, "num_input_tokens_seen": 126042895, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8359375, "step": 5860, "time_per_iteration": 2.5781123638153076 }, { "auxiliary_loss_clip": 0.01144594, "auxiliary_loss_mlp": 0.01039123, "balance_loss_clip": 1.02275586, "balance_loss_mlp": 1.04239964, "epoch": 0.3523823838869683, "flos": 19168290259200.0, "grad_norm": 2.074458802623332, "language_loss": 0.66067457, "learning_rate": 2.8949963756863414e-06, "loss": 0.68251175, "num_input_tokens_seen": 126060130, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84375, "step": 5861, "time_per_iteration": 2.573019504547119 }, { "auxiliary_loss_clip": 0.01139911, "auxiliary_loss_mlp": 0.01033569, "balance_loss_clip": 1.01869798, "balance_loss_mlp": 1.04378033, "epoch": 0.35244250713963626, "flos": 17930988420480.0, "grad_norm": 1.7678105864994575, "language_loss": 0.66840357, "learning_rate": 2.8946584789392197e-06, "loss": 0.69013834, "num_input_tokens_seen": 126077850, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78125, "step": 5862, "time_per_iteration": 2.6559958457946777 }, { "auxiliary_loss_clip": 0.01142366, "auxiliary_loss_mlp": 0.0104032, "balance_loss_clip": 1.02377427, "balance_loss_mlp": 1.04399812, "epoch": 0.35250263039230423, "flos": 21432529935360.0, "grad_norm": 13.884768253475102, "language_loss": 0.769261, "learning_rate": 2.894320550264039e-06, "loss": 0.79108787, "num_input_tokens_seen": 126095985, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8046875, "step": 5863, "time_per_iteration": 2.6593222618103027 }, { "auxiliary_loss_clip": 0.01132963, "auxiliary_loss_mlp": 0.01040086, "balance_loss_clip": 1.02518463, "balance_loss_mlp": 1.04443228, "epoch": 0.3525627536449722, "flos": 27416863428480.0, "grad_norm": 2.3933658117559364, "language_loss": 0.75004858, "learning_rate": 2.893982589672858e-06, "loss": 0.77177906, "num_input_tokens_seen": 126116070, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 5864, "time_per_iteration": 2.6205356121063232 }, { "auxiliary_loss_clip": 0.01132614, "auxiliary_loss_mlp": 0.0104466, "balance_loss_clip": 1.02979469, "balance_loss_mlp": 1.0442729, "epoch": 0.35262287689764016, "flos": 24789818430720.0, "grad_norm": 2.668958367401282, "language_loss": 0.79302359, "learning_rate": 2.893644597177738e-06, "loss": 0.81479633, "num_input_tokens_seen": 126135205, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 5865, "time_per_iteration": 2.711277961730957 }, { "auxiliary_loss_clip": 0.01137454, "auxiliary_loss_mlp": 0.01042371, "balance_loss_clip": 1.02618265, "balance_loss_mlp": 1.04679084, "epoch": 0.3526830001503081, "flos": 17821604528640.0, "grad_norm": 3.1632546893092695, "language_loss": 0.81862712, "learning_rate": 2.8933065727907417e-06, "loss": 0.84042537, "num_input_tokens_seen": 126151895, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.81640625, "step": 5866, "time_per_iteration": 2.528658866882324 }, { "auxiliary_loss_clip": 0.01165921, "auxiliary_loss_mlp": 0.01040983, "balance_loss_clip": 1.02348292, "balance_loss_mlp": 1.04442191, "epoch": 0.3527431234029761, "flos": 18078114528000.0, "grad_norm": 2.3241035761191804, "language_loss": 0.8354938, "learning_rate": 2.8929685165239308e-06, "loss": 0.85756278, "num_input_tokens_seen": 126168515, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.859375, "step": 5867, "time_per_iteration": 2.691286563873291 }, { "auxiliary_loss_clip": 0.01125914, "auxiliary_loss_mlp": 0.01045344, "balance_loss_clip": 1.02904773, "balance_loss_mlp": 1.04501605, "epoch": 0.35280324665564405, "flos": 19427350124160.0, "grad_norm": 2.812173243263995, "language_loss": 0.74481571, "learning_rate": 2.892630428389371e-06, "loss": 0.76652825, "num_input_tokens_seen": 126186460, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.80859375, "step": 5868, "time_per_iteration": 2.497302532196045 }, { "auxiliary_loss_clip": 0.01135072, "auxiliary_loss_mlp": 0.01038917, "balance_loss_clip": 1.02226317, "balance_loss_mlp": 1.04534531, "epoch": 0.352863369908312, "flos": 21504027957120.0, "grad_norm": 1.8501876182872325, "language_loss": 0.61569488, "learning_rate": 2.892292308399127e-06, "loss": 0.63743478, "num_input_tokens_seen": 126206170, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8125, "step": 5869, "time_per_iteration": 2.6062631607055664 }, { "auxiliary_loss_clip": 0.0113302, "auxiliary_loss_mlp": 0.01040335, "balance_loss_clip": 1.02518964, "balance_loss_mlp": 1.0437634, "epoch": 0.35292349316098, "flos": 22309504490880.0, "grad_norm": 2.2439884909043495, "language_loss": 0.74420613, "learning_rate": 2.8919541565652655e-06, "loss": 0.76593971, "num_input_tokens_seen": 126225605, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8046875, "step": 5870, "time_per_iteration": 2.522831439971924 }, { "auxiliary_loss_clip": 0.01129908, "auxiliary_loss_mlp": 0.01034867, "balance_loss_clip": 1.01926303, "balance_loss_mlp": 1.04235554, "epoch": 0.352983616413648, "flos": 33109745967360.0, "grad_norm": 1.418369979831397, "language_loss": 0.71759635, "learning_rate": 2.8916159728998555e-06, "loss": 0.73924416, "num_input_tokens_seen": 126250230, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.7890625, "step": 5871, "time_per_iteration": 2.7241873741149902 }, { "auxiliary_loss_clip": 0.01129443, "auxiliary_loss_mlp": 0.0103518, "balance_loss_clip": 1.02134609, "balance_loss_mlp": 1.04370117, "epoch": 0.35304373966631597, "flos": 18696603836160.0, "grad_norm": 1.8394877940469667, "language_loss": 0.73551142, "learning_rate": 2.8912777574149642e-06, "loss": 0.75715762, "num_input_tokens_seen": 126268315, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 5872, "time_per_iteration": 2.560135841369629 }, { "auxiliary_loss_clip": 0.0114023, "auxiliary_loss_mlp": 0.01288428, "balance_loss_clip": 1.02697825, "balance_loss_mlp": 1.04413056, "epoch": 0.35310386291898394, "flos": 23364954748800.0, "grad_norm": 1.7971572777767721, "language_loss": 0.82129705, "learning_rate": 2.8909395101226628e-06, "loss": 0.84558362, "num_input_tokens_seen": 126288390, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 5873, "time_per_iteration": 3.952329635620117 }, { "auxiliary_loss_clip": 0.01145153, "auxiliary_loss_mlp": 0.01041958, "balance_loss_clip": 1.02542353, "balance_loss_mlp": 1.04440904, "epoch": 0.3531639861716519, "flos": 24461954064000.0, "grad_norm": 1.7855721901661266, "language_loss": 0.65087628, "learning_rate": 2.8906012310350212e-06, "loss": 0.67274743, "num_input_tokens_seen": 126305750, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.828125, "step": 5874, "time_per_iteration": 2.7596611976623535 }, { "auxiliary_loss_clip": 0.01054653, "auxiliary_loss_mlp": 0.01017368, "balance_loss_clip": 1.01519871, "balance_loss_mlp": 1.02031183, "epoch": 0.35322410942431987, "flos": 70312446881280.0, "grad_norm": 0.765214969193602, "language_loss": 0.61530769, "learning_rate": 2.890262920164113e-06, "loss": 0.63602793, "num_input_tokens_seen": 126362495, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.25390625, "step": 5875, "time_per_iteration": 3.0720887184143066 }, { "auxiliary_loss_clip": 0.01169421, "auxiliary_loss_mlp": 0.01042283, "balance_loss_clip": 1.02836502, "balance_loss_mlp": 1.04594016, "epoch": 0.35328423267698783, "flos": 19820894509440.0, "grad_norm": 2.5243040822413385, "language_loss": 0.80015469, "learning_rate": 2.8899245775220113e-06, "loss": 0.82227165, "num_input_tokens_seen": 126378320, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 5876, "time_per_iteration": 2.6502788066864014 }, { "auxiliary_loss_clip": 0.01069686, "auxiliary_loss_mlp": 0.01259196, "balance_loss_clip": 1.0101912, "balance_loss_mlp": 1.01782167, "epoch": 0.3533443559296558, "flos": 60826356391680.0, "grad_norm": 0.6754316063969346, "language_loss": 0.56809038, "learning_rate": 2.8895862031207906e-06, "loss": 0.59137923, "num_input_tokens_seen": 126442735, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.25195312, "step": 5877, "time_per_iteration": 4.593461513519287 }, { "auxiliary_loss_clip": 0.01143271, "auxiliary_loss_mlp": 0.0103095, "balance_loss_clip": 1.01638246, "balance_loss_mlp": 1.04515123, "epoch": 0.35340447918232376, "flos": 24755775315840.0, "grad_norm": 1.4890023021834655, "language_loss": 0.71919465, "learning_rate": 2.889247796972527e-06, "loss": 0.74093682, "num_input_tokens_seen": 126463090, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.80078125, "step": 5878, "time_per_iteration": 4.216526031494141 }, { "auxiliary_loss_clip": 0.01139878, "auxiliary_loss_mlp": 0.01034981, "balance_loss_clip": 1.020455, "balance_loss_mlp": 1.04321957, "epoch": 0.3534646024349917, "flos": 21796304924160.0, "grad_norm": 1.7744259723232476, "language_loss": 0.78415549, "learning_rate": 2.8889093590892965e-06, "loss": 0.80590403, "num_input_tokens_seen": 126482105, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 5879, "time_per_iteration": 2.5955402851104736 }, { "auxiliary_loss_clip": 0.01155024, "auxiliary_loss_mlp": 0.01046444, "balance_loss_clip": 1.02889609, "balance_loss_mlp": 1.04584885, "epoch": 0.3535247256876597, "flos": 20012119539840.0, "grad_norm": 2.649300316568857, "language_loss": 0.63387918, "learning_rate": 2.8885708894831776e-06, "loss": 0.6558938, "num_input_tokens_seen": 126502125, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.828125, "step": 5880, "time_per_iteration": 2.7009084224700928 }, { "auxiliary_loss_clip": 0.01143226, "auxiliary_loss_mlp": 0.01032987, "balance_loss_clip": 1.01768017, "balance_loss_mlp": 1.04497397, "epoch": 0.35358484894032766, "flos": 18187929383040.0, "grad_norm": 2.527266415117604, "language_loss": 0.65949911, "learning_rate": 2.8882323881662496e-06, "loss": 0.68126118, "num_input_tokens_seen": 126521950, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8046875, "step": 5881, "time_per_iteration": 4.029440641403198 }, { "auxiliary_loss_clip": 0.01129243, "auxiliary_loss_mlp": 0.01028774, "balance_loss_clip": 1.01524997, "balance_loss_mlp": 1.04415059, "epoch": 0.3536449721929956, "flos": 22820369673600.0, "grad_norm": 1.4340039769356887, "language_loss": 0.75696516, "learning_rate": 2.887893855150592e-06, "loss": 0.77854538, "num_input_tokens_seen": 126542445, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 5882, "time_per_iteration": 2.5851457118988037 }, { "auxiliary_loss_clip": 0.01143385, "auxiliary_loss_mlp": 0.01036193, "balance_loss_clip": 1.02086294, "balance_loss_mlp": 1.04568624, "epoch": 0.3537050954456636, "flos": 26432336574720.0, "grad_norm": 2.070576511874696, "language_loss": 0.70223463, "learning_rate": 2.8875552904482874e-06, "loss": 0.72403038, "num_input_tokens_seen": 126560690, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.796875, "step": 5883, "time_per_iteration": 2.7146263122558594 }, { "auxiliary_loss_clip": 0.01155693, "auxiliary_loss_mlp": 0.01040696, "balance_loss_clip": 1.02545488, "balance_loss_mlp": 1.04689384, "epoch": 0.3537652186983316, "flos": 17197153562880.0, "grad_norm": 2.314002358797864, "language_loss": 0.78631365, "learning_rate": 2.8872166940714166e-06, "loss": 0.80827749, "num_input_tokens_seen": 126577620, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.82421875, "step": 5884, "time_per_iteration": 2.5769762992858887 }, { "auxiliary_loss_clip": 0.01140929, "auxiliary_loss_mlp": 0.01035794, "balance_loss_clip": 1.02214479, "balance_loss_mlp": 1.04490113, "epoch": 0.3538253419509996, "flos": 19536769929600.0, "grad_norm": 2.350710129203555, "language_loss": 0.7529794, "learning_rate": 2.886878066032065e-06, "loss": 0.77474666, "num_input_tokens_seen": 126596235, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 5885, "time_per_iteration": 2.6134724617004395 }, { "auxiliary_loss_clip": 0.01160477, "auxiliary_loss_mlp": 0.01040593, "balance_loss_clip": 1.02514327, "balance_loss_mlp": 1.04371023, "epoch": 0.35388546520366754, "flos": 12128578335360.0, "grad_norm": 2.3642223642766513, "language_loss": 0.83101165, "learning_rate": 2.8865394063423155e-06, "loss": 0.85302234, "num_input_tokens_seen": 126612830, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8125, "step": 5886, "time_per_iteration": 2.567223072052002 }, { "auxiliary_loss_clip": 0.01121591, "auxiliary_loss_mlp": 0.01034013, "balance_loss_clip": 1.01908851, "balance_loss_mlp": 1.04400241, "epoch": 0.3539455884563355, "flos": 19678149861120.0, "grad_norm": 2.319362126101506, "language_loss": 0.78487355, "learning_rate": 2.8862007150142557e-06, "loss": 0.80642962, "num_input_tokens_seen": 126630910, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.77734375, "step": 5887, "time_per_iteration": 2.6186418533325195 }, { "auxiliary_loss_clip": 0.01167439, "auxiliary_loss_mlp": 0.01041861, "balance_loss_clip": 1.0273416, "balance_loss_mlp": 1.04469848, "epoch": 0.35400571170900347, "flos": 18072045129600.0, "grad_norm": 2.1655246323816746, "language_loss": 0.65349996, "learning_rate": 2.885861992059972e-06, "loss": 0.6755929, "num_input_tokens_seen": 126648365, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 5888, "time_per_iteration": 2.6084976196289062 }, { "auxiliary_loss_clip": 0.01141566, "auxiliary_loss_mlp": 0.01036295, "balance_loss_clip": 1.02293181, "balance_loss_mlp": 1.04600024, "epoch": 0.35406583496167143, "flos": 26068058795520.0, "grad_norm": 1.6607605747807574, "language_loss": 0.76879632, "learning_rate": 2.8855232374915528e-06, "loss": 0.79057491, "num_input_tokens_seen": 126667500, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.77734375, "step": 5889, "time_per_iteration": 2.6763851642608643 }, { "auxiliary_loss_clip": 0.01152636, "auxiliary_loss_mlp": 0.01036416, "balance_loss_clip": 1.02215862, "balance_loss_mlp": 1.04847133, "epoch": 0.3541259582143394, "flos": 19792453916160.0, "grad_norm": 1.694465932030801, "language_loss": 0.80637115, "learning_rate": 2.885184451321087e-06, "loss": 0.82826173, "num_input_tokens_seen": 126686820, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 5890, "time_per_iteration": 2.5868754386901855 }, { "auxiliary_loss_clip": 0.01141361, "auxiliary_loss_mlp": 0.01036055, "balance_loss_clip": 1.0234009, "balance_loss_mlp": 1.04577959, "epoch": 0.35418608146700736, "flos": 24022084112640.0, "grad_norm": 1.715500071296389, "language_loss": 0.79875195, "learning_rate": 2.884845633560664e-06, "loss": 0.82052612, "num_input_tokens_seen": 126706965, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7734375, "step": 5891, "time_per_iteration": 2.65313720703125 }, { "auxiliary_loss_clip": 0.01153488, "auxiliary_loss_mlp": 0.0104285, "balance_loss_clip": 1.02782345, "balance_loss_mlp": 1.04743028, "epoch": 0.35424620471967533, "flos": 12385770693120.0, "grad_norm": 1.7909332212317544, "language_loss": 0.72760451, "learning_rate": 2.8845067842223776e-06, "loss": 0.74956787, "num_input_tokens_seen": 126724015, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.79296875, "step": 5892, "time_per_iteration": 2.5444414615631104 }, { "auxiliary_loss_clip": 0.01143313, "auxiliary_loss_mlp": 0.01042069, "balance_loss_clip": 1.02756715, "balance_loss_mlp": 1.046646, "epoch": 0.3543063279723433, "flos": 19673624747520.0, "grad_norm": 2.209373992412101, "language_loss": 0.67138326, "learning_rate": 2.884167903318319e-06, "loss": 0.69323707, "num_input_tokens_seen": 126737565, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 5893, "time_per_iteration": 2.587125301361084 }, { "auxiliary_loss_clip": 0.01159783, "auxiliary_loss_mlp": 0.01039459, "balance_loss_clip": 1.02427149, "balance_loss_mlp": 1.04618073, "epoch": 0.35436645122501126, "flos": 21909208348800.0, "grad_norm": 1.7477398402502031, "language_loss": 0.69832051, "learning_rate": 2.8838289908605822e-06, "loss": 0.72031295, "num_input_tokens_seen": 126756095, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.78125, "step": 5894, "time_per_iteration": 2.5728759765625 }, { "auxiliary_loss_clip": 0.01143217, "auxiliary_loss_mlp": 0.01284131, "balance_loss_clip": 1.02322292, "balance_loss_mlp": 1.04820013, "epoch": 0.3544265744776792, "flos": 21719527603200.0, "grad_norm": 1.791102073362088, "language_loss": 0.74580032, "learning_rate": 2.8834900468612624e-06, "loss": 0.77007377, "num_input_tokens_seen": 126775455, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.765625, "step": 5895, "time_per_iteration": 2.662323236465454 }, { "auxiliary_loss_clip": 0.01139518, "auxiliary_loss_mlp": 0.01035516, "balance_loss_clip": 1.02092505, "balance_loss_mlp": 1.04430151, "epoch": 0.3544866977303472, "flos": 21213223447680.0, "grad_norm": 2.1731982475067877, "language_loss": 0.83631974, "learning_rate": 2.883151071332455e-06, "loss": 0.85807002, "num_input_tokens_seen": 126792320, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7734375, "step": 5896, "time_per_iteration": 2.595767021179199 }, { "auxiliary_loss_clip": 0.0114048, "auxiliary_loss_mlp": 0.01297724, "balance_loss_clip": 1.03553915, "balance_loss_mlp": 1.0453198, "epoch": 0.35454682098301515, "flos": 29311402371840.0, "grad_norm": 1.5089002054873184, "language_loss": 0.69939637, "learning_rate": 2.8828120642862585e-06, "loss": 0.72377837, "num_input_tokens_seen": 126813680, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7734375, "step": 5897, "time_per_iteration": 2.6669094562530518 }, { "auxiliary_loss_clip": 0.01142053, "auxiliary_loss_mlp": 0.01038828, "balance_loss_clip": 1.02435577, "balance_loss_mlp": 1.04696929, "epoch": 0.3546069442356832, "flos": 24316587722880.0, "grad_norm": 1.5619746899067362, "language_loss": 0.82008231, "learning_rate": 2.882473025734769e-06, "loss": 0.84189111, "num_input_tokens_seen": 126834395, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 5898, "time_per_iteration": 2.65771484375 }, { "auxiliary_loss_clip": 0.01122174, "auxiliary_loss_mlp": 0.01041707, "balance_loss_clip": 1.02805185, "balance_loss_mlp": 1.04613042, "epoch": 0.35466706748835114, "flos": 22857285876480.0, "grad_norm": 1.9724749514045652, "language_loss": 0.74106383, "learning_rate": 2.8821339556900883e-06, "loss": 0.76270258, "num_input_tokens_seen": 126855145, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 5899, "time_per_iteration": 2.6211225986480713 }, { "auxiliary_loss_clip": 0.01125405, "auxiliary_loss_mlp": 0.01288263, "balance_loss_clip": 1.02630246, "balance_loss_mlp": 1.04655826, "epoch": 0.3547271907410191, "flos": 28330107742080.0, "grad_norm": 1.6931286777455161, "language_loss": 0.7943995, "learning_rate": 2.8817948541643153e-06, "loss": 0.81853622, "num_input_tokens_seen": 126873790, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 5900, "time_per_iteration": 2.709754467010498 }, { "auxiliary_loss_clip": 0.01142804, "auxiliary_loss_mlp": 0.010452, "balance_loss_clip": 1.02996492, "balance_loss_mlp": 1.04652309, "epoch": 0.35478731399368707, "flos": 23514092017920.0, "grad_norm": 2.247644347162471, "language_loss": 0.8114273, "learning_rate": 2.8814557211695523e-06, "loss": 0.83330739, "num_input_tokens_seen": 126892865, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78125, "step": 5901, "time_per_iteration": 2.577256441116333 }, { "auxiliary_loss_clip": 0.01136945, "auxiliary_loss_mlp": 0.01286148, "balance_loss_clip": 1.02354848, "balance_loss_mlp": 1.04618156, "epoch": 0.35484743724635504, "flos": 18624315715200.0, "grad_norm": 1.7563199501928495, "language_loss": 0.75721788, "learning_rate": 2.8811165567179025e-06, "loss": 0.78144884, "num_input_tokens_seen": 126911935, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.81640625, "step": 5902, "time_per_iteration": 2.5275402069091797 }, { "auxiliary_loss_clip": 0.01136169, "auxiliary_loss_mlp": 0.01037656, "balance_loss_clip": 1.02352333, "balance_loss_mlp": 1.04842019, "epoch": 0.354907560499023, "flos": 17384499924480.0, "grad_norm": 1.7647651523111358, "language_loss": 0.70637536, "learning_rate": 2.880777360821468e-06, "loss": 0.72811359, "num_input_tokens_seen": 126930040, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7890625, "step": 5903, "time_per_iteration": 2.499950408935547 }, { "auxiliary_loss_clip": 0.01136007, "auxiliary_loss_mlp": 0.01036777, "balance_loss_clip": 1.02149439, "balance_loss_mlp": 1.04514468, "epoch": 0.35496768375169097, "flos": 19208546426880.0, "grad_norm": 11.20232683510071, "language_loss": 0.73941529, "learning_rate": 2.8804381334923563e-06, "loss": 0.76114309, "num_input_tokens_seen": 126948390, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8203125, "step": 5904, "time_per_iteration": 2.5544962882995605 }, { "auxiliary_loss_clip": 0.01144614, "auxiliary_loss_mlp": 0.01288004, "balance_loss_clip": 1.02616191, "balance_loss_mlp": 1.04694772, "epoch": 0.35502780700435893, "flos": 18332792933760.0, "grad_norm": 2.1225519144581804, "language_loss": 0.7857585, "learning_rate": 2.8800988747426722e-06, "loss": 0.8100847, "num_input_tokens_seen": 126964905, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80078125, "step": 5905, "time_per_iteration": 2.523617744445801 }, { "auxiliary_loss_clip": 0.01144673, "auxiliary_loss_mlp": 0.01034586, "balance_loss_clip": 1.02145505, "balance_loss_mlp": 1.04532123, "epoch": 0.3550879302570269, "flos": 15448555578240.0, "grad_norm": 2.005211593336695, "language_loss": 0.72510672, "learning_rate": 2.8797595845845225e-06, "loss": 0.74689937, "num_input_tokens_seen": 126982000, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 5906, "time_per_iteration": 2.556774854660034 }, { "auxiliary_loss_clip": 0.0114998, "auxiliary_loss_mlp": 0.01035998, "balance_loss_clip": 1.01944029, "balance_loss_mlp": 1.04897749, "epoch": 0.35514805350969486, "flos": 21979197999360.0, "grad_norm": 1.848173153092742, "language_loss": 0.74670148, "learning_rate": 2.879420263030017e-06, "loss": 0.76856124, "num_input_tokens_seen": 126998390, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.828125, "step": 5907, "time_per_iteration": 2.5836989879608154 }, { "auxiliary_loss_clip": 0.0116853, "auxiliary_loss_mlp": 0.01032124, "balance_loss_clip": 1.01719856, "balance_loss_mlp": 1.0446229, "epoch": 0.3552081767623628, "flos": 29861949104640.0, "grad_norm": 1.8658509475341232, "language_loss": 0.75376976, "learning_rate": 2.8790809100912637e-06, "loss": 0.77577627, "num_input_tokens_seen": 127020220, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7890625, "step": 5908, "time_per_iteration": 2.7795281410217285 }, { "auxiliary_loss_clip": 0.01144096, "auxiliary_loss_mlp": 0.01038843, "balance_loss_clip": 1.02373958, "balance_loss_mlp": 1.04690027, "epoch": 0.3552683000150308, "flos": 26432264747520.0, "grad_norm": 1.8413527585034386, "language_loss": 0.68129396, "learning_rate": 2.8787415257803742e-06, "loss": 0.70312333, "num_input_tokens_seen": 127038585, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 5909, "time_per_iteration": 2.6186583042144775 }, { "auxiliary_loss_clip": 0.01131487, "auxiliary_loss_mlp": 0.01031392, "balance_loss_clip": 1.01761699, "balance_loss_mlp": 1.04753971, "epoch": 0.35532842326769876, "flos": 19785989468160.0, "grad_norm": 1.6969390517923568, "language_loss": 0.7825684, "learning_rate": 2.8784021101094605e-06, "loss": 0.80419719, "num_input_tokens_seen": 127056215, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 5910, "time_per_iteration": 2.6088273525238037 }, { "auxiliary_loss_clip": 0.01147064, "auxiliary_loss_mlp": 0.01041683, "balance_loss_clip": 1.0255481, "balance_loss_mlp": 1.04723001, "epoch": 0.3553885465203668, "flos": 17239277237760.0, "grad_norm": 1.6871051485839352, "language_loss": 0.70422155, "learning_rate": 2.878062663090635e-06, "loss": 0.72610903, "num_input_tokens_seen": 127075825, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8203125, "step": 5911, "time_per_iteration": 2.5416667461395264 }, { "auxiliary_loss_clip": 0.01131905, "auxiliary_loss_mlp": 0.01037507, "balance_loss_clip": 1.02241492, "balance_loss_mlp": 1.04625452, "epoch": 0.35544866977303474, "flos": 14934350430720.0, "grad_norm": 2.319319021944231, "language_loss": 0.86897242, "learning_rate": 2.8777231847360117e-06, "loss": 0.89066648, "num_input_tokens_seen": 127091205, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.765625, "step": 5912, "time_per_iteration": 2.572389841079712 }, { "auxiliary_loss_clip": 0.01142514, "auxiliary_loss_mlp": 0.01036122, "balance_loss_clip": 1.02165008, "balance_loss_mlp": 1.04711556, "epoch": 0.3555087930257027, "flos": 19756040503680.0, "grad_norm": 2.1113689367799444, "language_loss": 0.76988769, "learning_rate": 2.8773836750577053e-06, "loss": 0.79167402, "num_input_tokens_seen": 127109210, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 5913, "time_per_iteration": 2.5930697917938232 }, { "auxiliary_loss_clip": 0.01152059, "auxiliary_loss_mlp": 0.0103509, "balance_loss_clip": 1.02012968, "balance_loss_mlp": 1.04954362, "epoch": 0.3555689162783707, "flos": 21068252156160.0, "grad_norm": 1.36230770694894, "language_loss": 0.82489586, "learning_rate": 2.877044134067833e-06, "loss": 0.84676737, "num_input_tokens_seen": 127128400, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7578125, "step": 5914, "time_per_iteration": 4.037011623382568 }, { "auxiliary_loss_clip": 0.01126288, "auxiliary_loss_mlp": 0.01033364, "balance_loss_clip": 1.01846337, "balance_loss_mlp": 1.04765189, "epoch": 0.35562903953103864, "flos": 33069633454080.0, "grad_norm": 2.3795596516799944, "language_loss": 0.70369565, "learning_rate": 2.8767045617785108e-06, "loss": 0.72529221, "num_input_tokens_seen": 127149965, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.78515625, "step": 5915, "time_per_iteration": 2.6686298847198486 }, { "auxiliary_loss_clip": 0.01132343, "auxiliary_loss_mlp": 0.01040023, "balance_loss_clip": 1.0255754, "balance_loss_mlp": 1.04564261, "epoch": 0.3556891627837066, "flos": 20557853850240.0, "grad_norm": 3.1081640686823824, "language_loss": 0.76256377, "learning_rate": 2.8763649582018584e-06, "loss": 0.78428745, "num_input_tokens_seen": 127169865, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 5916, "time_per_iteration": 2.5588314533233643 }, { "auxiliary_loss_clip": 0.01152922, "auxiliary_loss_mlp": 0.0103546, "balance_loss_clip": 1.02133393, "balance_loss_mlp": 1.04786062, "epoch": 0.35574928603637457, "flos": 20703327932160.0, "grad_norm": 1.7632090079570992, "language_loss": 0.88266444, "learning_rate": 2.876025323349995e-06, "loss": 0.90454823, "num_input_tokens_seen": 127188075, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78125, "step": 5917, "time_per_iteration": 2.5986592769622803 }, { "auxiliary_loss_clip": 0.01168373, "auxiliary_loss_mlp": 0.01030449, "balance_loss_clip": 1.01566744, "balance_loss_mlp": 1.04559529, "epoch": 0.35580940928904253, "flos": 15194595444480.0, "grad_norm": 2.1928156069996603, "language_loss": 0.74506819, "learning_rate": 2.875685657235041e-06, "loss": 0.76705635, "num_input_tokens_seen": 127206065, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.77734375, "step": 5918, "time_per_iteration": 2.6520447731018066 }, { "auxiliary_loss_clip": 0.01050778, "auxiliary_loss_mlp": 0.01016416, "balance_loss_clip": 1.01399601, "balance_loss_mlp": 1.01637793, "epoch": 0.3558695325417105, "flos": 58639145431680.0, "grad_norm": 0.9596839917120329, "language_loss": 0.63965422, "learning_rate": 2.8753459598691183e-06, "loss": 0.66032612, "num_input_tokens_seen": 127257885, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.25390625, "step": 5919, "time_per_iteration": 4.2828369140625 }, { "auxiliary_loss_clip": 0.01136865, "auxiliary_loss_mlp": 0.01036126, "balance_loss_clip": 1.0204258, "balance_loss_mlp": 1.04630864, "epoch": 0.35592965579437846, "flos": 22018233104640.0, "grad_norm": 1.986738407666702, "language_loss": 0.73752832, "learning_rate": 2.8750062312643495e-06, "loss": 0.75925827, "num_input_tokens_seen": 127275550, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8125, "step": 5920, "time_per_iteration": 4.129518985748291 }, { "auxiliary_loss_clip": 0.01147395, "auxiliary_loss_mlp": 0.01033079, "balance_loss_clip": 1.01787961, "balance_loss_mlp": 1.04251289, "epoch": 0.35598977904704643, "flos": 23367684182400.0, "grad_norm": 2.0246261787327864, "language_loss": 0.77285552, "learning_rate": 2.8746664714328603e-06, "loss": 0.79466033, "num_input_tokens_seen": 127295110, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.77734375, "step": 5921, "time_per_iteration": 2.6066174507141113 }, { "auxiliary_loss_clip": 0.0114222, "auxiliary_loss_mlp": 0.01030623, "balance_loss_clip": 1.01630616, "balance_loss_mlp": 1.04514599, "epoch": 0.3560499022997144, "flos": 17785334770560.0, "grad_norm": 1.9181668120439268, "language_loss": 0.67543757, "learning_rate": 2.8743266803867743e-06, "loss": 0.69716603, "num_input_tokens_seen": 127312865, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7890625, "step": 5922, "time_per_iteration": 2.613008499145508 }, { "auxiliary_loss_clip": 0.01135941, "auxiliary_loss_mlp": 0.01035662, "balance_loss_clip": 1.02075505, "balance_loss_mlp": 1.04697633, "epoch": 0.35611002555238236, "flos": 20740459616640.0, "grad_norm": 1.9592749389030202, "language_loss": 0.78430927, "learning_rate": 2.8739868581382175e-06, "loss": 0.80602533, "num_input_tokens_seen": 127331710, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.796875, "step": 5923, "time_per_iteration": 4.13240909576416 }, { "auxiliary_loss_clip": 0.01142562, "auxiliary_loss_mlp": 0.0103684, "balance_loss_clip": 1.02276707, "balance_loss_mlp": 1.0466336, "epoch": 0.3561701488050504, "flos": 19462219251840.0, "grad_norm": 1.7224553726836975, "language_loss": 0.8503052, "learning_rate": 2.873647004699318e-06, "loss": 0.87209916, "num_input_tokens_seen": 127350950, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 5924, "time_per_iteration": 2.6638998985290527 }, { "auxiliary_loss_clip": 0.011415, "auxiliary_loss_mlp": 0.0103728, "balance_loss_clip": 1.02209258, "balance_loss_mlp": 1.04557562, "epoch": 0.35623027205771834, "flos": 30774942023040.0, "grad_norm": 2.8805403431312215, "language_loss": 0.77655005, "learning_rate": 2.8733071200822046e-06, "loss": 0.79833782, "num_input_tokens_seen": 127369385, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.77734375, "step": 5925, "time_per_iteration": 2.789048910140991 }, { "auxiliary_loss_clip": 0.01165383, "auxiliary_loss_mlp": 0.01039523, "balance_loss_clip": 1.02498579, "balance_loss_mlp": 1.04241323, "epoch": 0.3562903953103863, "flos": 16981079299200.0, "grad_norm": 1.7798298795837593, "language_loss": 0.75859678, "learning_rate": 2.8729672042990068e-06, "loss": 0.78064585, "num_input_tokens_seen": 127386965, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78515625, "step": 5926, "time_per_iteration": 2.607316255569458 }, { "auxiliary_loss_clip": 0.01152944, "auxiliary_loss_mlp": 0.01282427, "balance_loss_clip": 1.01986456, "balance_loss_mlp": 1.04523754, "epoch": 0.3563505185630543, "flos": 23839837482240.0, "grad_norm": 2.096599965813888, "language_loss": 0.696145, "learning_rate": 2.872627257361855e-06, "loss": 0.72049868, "num_input_tokens_seen": 127406075, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80859375, "step": 5927, "time_per_iteration": 2.674968719482422 }, { "auxiliary_loss_clip": 0.01121109, "auxiliary_loss_mlp": 0.01034901, "balance_loss_clip": 1.0209415, "balance_loss_mlp": 1.04459214, "epoch": 0.35641064181572224, "flos": 22273450214400.0, "grad_norm": 2.208056815225765, "language_loss": 0.7972703, "learning_rate": 2.8722872792828803e-06, "loss": 0.81883037, "num_input_tokens_seen": 127425350, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 5928, "time_per_iteration": 2.6151742935180664 }, { "auxiliary_loss_clip": 0.01123496, "auxiliary_loss_mlp": 0.01035599, "balance_loss_clip": 1.02137733, "balance_loss_mlp": 1.04475307, "epoch": 0.3564707650683902, "flos": 23001251587200.0, "grad_norm": 5.289208317392441, "language_loss": 0.81872535, "learning_rate": 2.8719472700742167e-06, "loss": 0.8403163, "num_input_tokens_seen": 127446335, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 5929, "time_per_iteration": 2.6117305755615234 }, { "auxiliary_loss_clip": 0.01126086, "auxiliary_loss_mlp": 0.01034542, "balance_loss_clip": 1.02123785, "balance_loss_mlp": 1.04165494, "epoch": 0.35653088832105817, "flos": 14684268965760.0, "grad_norm": 2.2806681821877866, "language_loss": 0.70176727, "learning_rate": 2.871607229747998e-06, "loss": 0.72337353, "num_input_tokens_seen": 127462795, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 5930, "time_per_iteration": 2.55086612701416 }, { "auxiliary_loss_clip": 0.011365, "auxiliary_loss_mlp": 0.01044358, "balance_loss_clip": 1.02883649, "balance_loss_mlp": 1.04654741, "epoch": 0.35659101157372614, "flos": 23477068074240.0, "grad_norm": 1.7839341954492813, "language_loss": 0.6756978, "learning_rate": 2.8712671583163596e-06, "loss": 0.69750643, "num_input_tokens_seen": 127482675, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8125, "step": 5931, "time_per_iteration": 2.649885654449463 }, { "auxiliary_loss_clip": 0.0113004, "auxiliary_loss_mlp": 0.01034591, "balance_loss_clip": 1.020262, "balance_loss_mlp": 1.04432249, "epoch": 0.3566511348263941, "flos": 26578672583040.0, "grad_norm": 1.5668929837734349, "language_loss": 0.6784867, "learning_rate": 2.870927055791437e-06, "loss": 0.70013297, "num_input_tokens_seen": 127502275, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.76953125, "step": 5932, "time_per_iteration": 2.6094889640808105 }, { "auxiliary_loss_clip": 0.01155463, "auxiliary_loss_mlp": 0.01030819, "balance_loss_clip": 1.01661563, "balance_loss_mlp": 1.0439961, "epoch": 0.35671125807906207, "flos": 13115008609920.0, "grad_norm": 2.205209930991918, "language_loss": 0.78727674, "learning_rate": 2.8705869221853684e-06, "loss": 0.80913949, "num_input_tokens_seen": 127520195, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7578125, "step": 5933, "time_per_iteration": 2.610605001449585 }, { "auxiliary_loss_clip": 0.01131312, "auxiliary_loss_mlp": 0.01043692, "balance_loss_clip": 1.02962565, "balance_loss_mlp": 1.04322755, "epoch": 0.35677138133173003, "flos": 33000577557120.0, "grad_norm": 1.3918376819262668, "language_loss": 0.6966275, "learning_rate": 2.8702467575102914e-06, "loss": 0.71837759, "num_input_tokens_seen": 127544495, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 5934, "time_per_iteration": 2.658763885498047 }, { "auxiliary_loss_clip": 0.01148929, "auxiliary_loss_mlp": 0.01289933, "balance_loss_clip": 1.0262773, "balance_loss_mlp": 1.04670548, "epoch": 0.356831504584398, "flos": 20777842696320.0, "grad_norm": 3.6161364025891376, "language_loss": 0.70554757, "learning_rate": 2.869906561778347e-06, "loss": 0.72993624, "num_input_tokens_seen": 127563810, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84375, "step": 5935, "time_per_iteration": 2.5821592807769775 }, { "auxiliary_loss_clip": 0.01151094, "auxiliary_loss_mlp": 0.0103825, "balance_loss_clip": 1.02356362, "balance_loss_mlp": 1.04551733, "epoch": 0.35689162783706596, "flos": 12165566365440.0, "grad_norm": 1.8942964834409266, "language_loss": 0.78222883, "learning_rate": 2.869566335001674e-06, "loss": 0.80412233, "num_input_tokens_seen": 127579065, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 5936, "time_per_iteration": 2.57431960105896 }, { "auxiliary_loss_clip": 0.01131093, "auxiliary_loss_mlp": 0.01288626, "balance_loss_clip": 1.02667522, "balance_loss_mlp": 1.04489064, "epoch": 0.356951751089734, "flos": 23841489507840.0, "grad_norm": 1.89754336222418, "language_loss": 0.64367479, "learning_rate": 2.8692260771924167e-06, "loss": 0.66787195, "num_input_tokens_seen": 127599105, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 5937, "time_per_iteration": 2.6211767196655273 }, { "auxiliary_loss_clip": 0.01124657, "auxiliary_loss_mlp": 0.01286042, "balance_loss_clip": 1.02396595, "balance_loss_mlp": 1.0448693, "epoch": 0.35701187434240195, "flos": 11722176881280.0, "grad_norm": 2.235050266179075, "language_loss": 0.78545874, "learning_rate": 2.868885788362715e-06, "loss": 0.80956566, "num_input_tokens_seen": 127614940, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 5938, "time_per_iteration": 2.515789747238159 }, { "auxiliary_loss_clip": 0.01151023, "auxiliary_loss_mlp": 0.0104654, "balance_loss_clip": 1.03249693, "balance_loss_mlp": 1.04605436, "epoch": 0.3570719975950699, "flos": 24898879100160.0, "grad_norm": 1.7272307924834167, "language_loss": 0.80228043, "learning_rate": 2.868545468524716e-06, "loss": 0.82425606, "num_input_tokens_seen": 127634960, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 5939, "time_per_iteration": 2.5972485542297363 }, { "auxiliary_loss_clip": 0.01147023, "auxiliary_loss_mlp": 0.0103731, "balance_loss_clip": 1.02251601, "balance_loss_mlp": 1.04454064, "epoch": 0.3571321208477379, "flos": 25994836920960.0, "grad_norm": 1.7379604253444105, "language_loss": 0.7907511, "learning_rate": 2.8682051176905624e-06, "loss": 0.81259447, "num_input_tokens_seen": 127654545, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.84375, "step": 5940, "time_per_iteration": 2.6717207431793213 }, { "auxiliary_loss_clip": 0.01141708, "auxiliary_loss_mlp": 0.01034438, "balance_loss_clip": 1.01889873, "balance_loss_mlp": 1.04444766, "epoch": 0.35719224410040584, "flos": 14501663199360.0, "grad_norm": 1.8255841763097627, "language_loss": 0.72001171, "learning_rate": 2.867864735872402e-06, "loss": 0.74177313, "num_input_tokens_seen": 127672320, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.796875, "step": 5941, "time_per_iteration": 2.6665775775909424 }, { "auxiliary_loss_clip": 0.01134515, "auxiliary_loss_mlp": 0.01037986, "balance_loss_clip": 1.02226257, "balance_loss_mlp": 1.04611731, "epoch": 0.3572523673530738, "flos": 31175453646720.0, "grad_norm": 2.1308394455701243, "language_loss": 0.64393783, "learning_rate": 2.8675243230823815e-06, "loss": 0.66566277, "num_input_tokens_seen": 127693315, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.796875, "step": 5942, "time_per_iteration": 2.8308026790618896 }, { "auxiliary_loss_clip": 0.01129949, "auxiliary_loss_mlp": 0.01038121, "balance_loss_clip": 1.02326202, "balance_loss_mlp": 1.04364407, "epoch": 0.3573124906057418, "flos": 15851976203520.0, "grad_norm": 1.9484566982426246, "language_loss": 0.73895425, "learning_rate": 2.86718387933265e-06, "loss": 0.76063496, "num_input_tokens_seen": 127711570, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 5943, "time_per_iteration": 2.547349691390991 }, { "auxiliary_loss_clip": 0.01040015, "auxiliary_loss_mlp": 0.01003295, "balance_loss_clip": 1.00088727, "balance_loss_mlp": 1.01437807, "epoch": 0.35737261385840974, "flos": 60822729118080.0, "grad_norm": 0.7873187054386961, "language_loss": 0.60778606, "learning_rate": 2.8668434046353557e-06, "loss": 0.62821913, "num_input_tokens_seen": 127772475, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.2578125, "step": 5944, "time_per_iteration": 3.2404630184173584 }, { "auxiliary_loss_clip": 0.01137988, "auxiliary_loss_mlp": 0.01034001, "balance_loss_clip": 1.01958227, "balance_loss_mlp": 1.04325175, "epoch": 0.3574327371110777, "flos": 18843765857280.0, "grad_norm": 1.8304669757839571, "language_loss": 0.72876459, "learning_rate": 2.86650289900265e-06, "loss": 0.75048447, "num_input_tokens_seen": 127790940, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76953125, "step": 5945, "time_per_iteration": 2.5466465950012207 }, { "auxiliary_loss_clip": 0.01136677, "auxiliary_loss_mlp": 0.01284188, "balance_loss_clip": 1.02229333, "balance_loss_mlp": 1.04174113, "epoch": 0.35749286036374567, "flos": 23549679417600.0, "grad_norm": 1.7963762028978982, "language_loss": 0.81105024, "learning_rate": 2.8661623624466856e-06, "loss": 0.8352589, "num_input_tokens_seen": 127808275, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 5946, "time_per_iteration": 2.618328332901001 }, { "auxiliary_loss_clip": 0.01123917, "auxiliary_loss_mlp": 0.01046042, "balance_loss_clip": 1.03153419, "balance_loss_mlp": 1.0454998, "epoch": 0.35755298361641363, "flos": 21105491581440.0, "grad_norm": 1.6678973507041508, "language_loss": 0.6891495, "learning_rate": 2.8658217949796133e-06, "loss": 0.71084905, "num_input_tokens_seen": 127828840, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 5947, "time_per_iteration": 2.6457228660583496 }, { "auxiliary_loss_clip": 0.01136213, "auxiliary_loss_mlp": 0.01039932, "balance_loss_clip": 1.02584147, "balance_loss_mlp": 1.04319274, "epoch": 0.3576131068690816, "flos": 19245031666560.0, "grad_norm": 1.774121147501329, "language_loss": 0.75545263, "learning_rate": 2.8654811966135893e-06, "loss": 0.77721405, "num_input_tokens_seen": 127846240, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 5948, "time_per_iteration": 2.5802195072174072 }, { "auxiliary_loss_clip": 0.01162417, "auxiliary_loss_mlp": 0.01042355, "balance_loss_clip": 1.02815104, "balance_loss_mlp": 1.04051948, "epoch": 0.35767323012174956, "flos": 28654703971200.0, "grad_norm": 2.1273437951689655, "language_loss": 0.70417166, "learning_rate": 2.865140567360767e-06, "loss": 0.72621942, "num_input_tokens_seen": 127866880, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 5949, "time_per_iteration": 2.661468029022217 }, { "auxiliary_loss_clip": 0.01137841, "auxiliary_loss_mlp": 0.01036212, "balance_loss_clip": 1.02227092, "balance_loss_mlp": 1.0427295, "epoch": 0.35773335337441753, "flos": 17085363459840.0, "grad_norm": 1.7970737782324884, "language_loss": 0.77721417, "learning_rate": 2.864799907233304e-06, "loss": 0.79895473, "num_input_tokens_seen": 127883560, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 5950, "time_per_iteration": 2.5335371494293213 }, { "auxiliary_loss_clip": 0.0113113, "auxiliary_loss_mlp": 0.01034219, "balance_loss_clip": 1.0195806, "balance_loss_mlp": 1.04332352, "epoch": 0.35779347662708555, "flos": 15888605097600.0, "grad_norm": 1.856919370075927, "language_loss": 0.72864175, "learning_rate": 2.8644592162433565e-06, "loss": 0.75029522, "num_input_tokens_seen": 127902330, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 5951, "time_per_iteration": 2.5356431007385254 }, { "auxiliary_loss_clip": 0.01135128, "auxiliary_loss_mlp": 0.01041449, "balance_loss_clip": 1.02616668, "balance_loss_mlp": 1.04429138, "epoch": 0.3578535998797535, "flos": 28658834035200.0, "grad_norm": 2.107222628394467, "language_loss": 0.70225698, "learning_rate": 2.864118494403083e-06, "loss": 0.72402275, "num_input_tokens_seen": 127922325, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8203125, "step": 5952, "time_per_iteration": 2.6131060123443604 }, { "auxiliary_loss_clip": 0.01154839, "auxiliary_loss_mlp": 0.01034062, "balance_loss_clip": 1.02072835, "balance_loss_mlp": 1.04335904, "epoch": 0.3579137231324215, "flos": 37852432076160.0, "grad_norm": 1.7412229280880733, "language_loss": 0.69262993, "learning_rate": 2.863777741724643e-06, "loss": 0.71451902, "num_input_tokens_seen": 127942635, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7578125, "step": 5953, "time_per_iteration": 2.7164344787597656 }, { "auxiliary_loss_clip": 0.01138058, "auxiliary_loss_mlp": 0.01032248, "balance_loss_clip": 1.01839554, "balance_loss_mlp": 1.04293478, "epoch": 0.35797384638508944, "flos": 22346851656960.0, "grad_norm": 1.4652590801077792, "language_loss": 0.66715515, "learning_rate": 2.863436958220198e-06, "loss": 0.68885821, "num_input_tokens_seen": 127962520, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 5954, "time_per_iteration": 2.5603418350219727 }, { "auxiliary_loss_clip": 0.01138115, "auxiliary_loss_mlp": 0.01035319, "balance_loss_clip": 1.0218724, "balance_loss_mlp": 1.04448032, "epoch": 0.3580339696377574, "flos": 13589711775360.0, "grad_norm": 1.939855769733154, "language_loss": 0.74621612, "learning_rate": 2.8630961439019087e-06, "loss": 0.76795042, "num_input_tokens_seen": 127981180, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.765625, "step": 5955, "time_per_iteration": 4.092323541641235 }, { "auxiliary_loss_clip": 0.01125948, "auxiliary_loss_mlp": 0.01035162, "balance_loss_clip": 1.02194142, "balance_loss_mlp": 1.0433718, "epoch": 0.3580940928904254, "flos": 23768231719680.0, "grad_norm": 1.702064152123165, "language_loss": 0.75676167, "learning_rate": 2.8627552987819382e-06, "loss": 0.77837276, "num_input_tokens_seen": 127999725, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 5956, "time_per_iteration": 2.5871171951293945 }, { "auxiliary_loss_clip": 0.01119108, "auxiliary_loss_mlp": 0.01035456, "balance_loss_clip": 1.02158558, "balance_loss_mlp": 1.04512, "epoch": 0.35815421614309334, "flos": 19463871277440.0, "grad_norm": 1.7164070327501917, "language_loss": 0.73372495, "learning_rate": 2.86241442287245e-06, "loss": 0.7552706, "num_input_tokens_seen": 128018885, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73828125, "step": 5957, "time_per_iteration": 2.509120225906372 }, { "auxiliary_loss_clip": 0.01140339, "auxiliary_loss_mlp": 0.01032631, "balance_loss_clip": 1.0189873, "balance_loss_mlp": 1.04619646, "epoch": 0.3582143393957613, "flos": 23368186972800.0, "grad_norm": 1.6645126413611264, "language_loss": 0.71294296, "learning_rate": 2.86207351618561e-06, "loss": 0.73467267, "num_input_tokens_seen": 128037875, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 5958, "time_per_iteration": 2.573920488357544 }, { "auxiliary_loss_clip": 0.01143947, "auxiliary_loss_mlp": 0.0103028, "balance_loss_clip": 1.01736927, "balance_loss_mlp": 1.04335248, "epoch": 0.35827446264842927, "flos": 26323275905280.0, "grad_norm": 2.3532069372193445, "language_loss": 0.8853628, "learning_rate": 2.8617325787335833e-06, "loss": 0.90710503, "num_input_tokens_seen": 128056045, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.74609375, "step": 5959, "time_per_iteration": 2.581857204437256 }, { "auxiliary_loss_clip": 0.01122212, "auxiliary_loss_mlp": 0.01034638, "balance_loss_clip": 1.02041674, "balance_loss_mlp": 1.04625487, "epoch": 0.35833458590109724, "flos": 30446610779520.0, "grad_norm": 1.5223783045539312, "language_loss": 0.58229125, "learning_rate": 2.861391610528538e-06, "loss": 0.60385978, "num_input_tokens_seen": 128077815, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7578125, "step": 5960, "time_per_iteration": 2.6333913803100586 }, { "auxiliary_loss_clip": 0.01122958, "auxiliary_loss_mlp": 0.01034775, "balance_loss_clip": 1.01921189, "balance_loss_mlp": 1.04511392, "epoch": 0.3583947091537652, "flos": 14829886702080.0, "grad_norm": 2.197986044234729, "language_loss": 0.76696169, "learning_rate": 2.8610506115826415e-06, "loss": 0.78853899, "num_input_tokens_seen": 128095460, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.77734375, "step": 5961, "time_per_iteration": 5.492109537124634 }, { "auxiliary_loss_clip": 0.01149634, "auxiliary_loss_mlp": 0.01030126, "balance_loss_clip": 1.01616645, "balance_loss_mlp": 1.04710674, "epoch": 0.35845483240643317, "flos": 34240644743040.0, "grad_norm": 1.6644039206233867, "language_loss": 0.69949412, "learning_rate": 2.8607095819080633e-06, "loss": 0.72129178, "num_input_tokens_seen": 128118605, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.76171875, "step": 5962, "time_per_iteration": 2.7050864696502686 }, { "auxiliary_loss_clip": 0.01140794, "auxiliary_loss_mlp": 0.01034159, "balance_loss_clip": 1.02197623, "balance_loss_mlp": 1.04793167, "epoch": 0.35851495565910113, "flos": 20960089326720.0, "grad_norm": 1.5946182185545963, "language_loss": 0.74356389, "learning_rate": 2.8603685215169745e-06, "loss": 0.76531339, "num_input_tokens_seen": 128139205, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.75, "step": 5963, "time_per_iteration": 2.591942310333252 }, { "auxiliary_loss_clip": 0.01124174, "auxiliary_loss_mlp": 0.01036552, "balance_loss_clip": 1.02269983, "balance_loss_mlp": 1.0481528, "epoch": 0.35857507891176915, "flos": 22309863626880.0, "grad_norm": 1.6059165405086016, "language_loss": 0.78679478, "learning_rate": 2.8600274304215458e-06, "loss": 0.80840206, "num_input_tokens_seen": 128158765, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 5964, "time_per_iteration": 2.536857843399048 }, { "auxiliary_loss_clip": 0.0114369, "auxiliary_loss_mlp": 0.01033095, "balance_loss_clip": 1.01838422, "balance_loss_mlp": 1.04623139, "epoch": 0.3586352021644371, "flos": 23367863750400.0, "grad_norm": 1.744440777370954, "language_loss": 0.66577995, "learning_rate": 2.859686308633951e-06, "loss": 0.6875478, "num_input_tokens_seen": 128177850, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.796875, "step": 5965, "time_per_iteration": 3.9980385303497314 }, { "auxiliary_loss_clip": 0.01121466, "auxiliary_loss_mlp": 0.01283235, "balance_loss_clip": 1.02154982, "balance_loss_mlp": 1.04520071, "epoch": 0.3586953254171051, "flos": 27849227437440.0, "grad_norm": 1.4982894627805168, "language_loss": 0.79064977, "learning_rate": 2.8593451561663634e-06, "loss": 0.81469673, "num_input_tokens_seen": 128196925, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.765625, "step": 5966, "time_per_iteration": 2.610853672027588 }, { "auxiliary_loss_clip": 0.01161048, "auxiliary_loss_mlp": 0.01040033, "balance_loss_clip": 1.02476263, "balance_loss_mlp": 1.04701626, "epoch": 0.35875544866977305, "flos": 19500500171520.0, "grad_norm": 1.9898832777725808, "language_loss": 0.91063291, "learning_rate": 2.859003973030957e-06, "loss": 0.93264377, "num_input_tokens_seen": 128213955, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.77734375, "step": 5967, "time_per_iteration": 2.6860811710357666 }, { "auxiliary_loss_clip": 0.01154859, "auxiliary_loss_mlp": 0.01035041, "balance_loss_clip": 1.01969314, "balance_loss_mlp": 1.04828775, "epoch": 0.358815571922441, "flos": 21471134077440.0, "grad_norm": 1.768201330813149, "language_loss": 0.80539936, "learning_rate": 2.858662759239909e-06, "loss": 0.8272984, "num_input_tokens_seen": 128232980, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.796875, "step": 5968, "time_per_iteration": 2.708812952041626 }, { "auxiliary_loss_clip": 0.01146115, "auxiliary_loss_mlp": 0.01051784, "balance_loss_clip": 1.03636992, "balance_loss_mlp": 1.04707468, "epoch": 0.358875695175109, "flos": 21835411856640.0, "grad_norm": 2.206008785975816, "language_loss": 0.83805734, "learning_rate": 2.858321514805395e-06, "loss": 0.86003631, "num_input_tokens_seen": 128252795, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8125, "step": 5969, "time_per_iteration": 2.6197128295898438 }, { "auxiliary_loss_clip": 0.01149474, "auxiliary_loss_mlp": 0.01027434, "balance_loss_clip": 1.01449966, "balance_loss_mlp": 1.04731965, "epoch": 0.35893581842777694, "flos": 32011633330560.0, "grad_norm": 1.8009744411288016, "language_loss": 0.71732664, "learning_rate": 2.8579802397395953e-06, "loss": 0.73909569, "num_input_tokens_seen": 128273115, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7578125, "step": 5970, "time_per_iteration": 2.7053284645080566 }, { "auxiliary_loss_clip": 0.01141636, "auxiliary_loss_mlp": 0.01033344, "balance_loss_clip": 1.01961768, "balance_loss_mlp": 1.04789722, "epoch": 0.3589959416804449, "flos": 20485817124480.0, "grad_norm": 1.894363818130778, "language_loss": 0.79921424, "learning_rate": 2.857638934054687e-06, "loss": 0.8209641, "num_input_tokens_seen": 128292220, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76171875, "step": 5971, "time_per_iteration": 2.615565061569214 }, { "auxiliary_loss_clip": 0.01159257, "auxiliary_loss_mlp": 0.01035523, "balance_loss_clip": 1.02074099, "balance_loss_mlp": 1.0443697, "epoch": 0.3590560649331129, "flos": 16180666583040.0, "grad_norm": 2.0900506782675743, "language_loss": 0.78503776, "learning_rate": 2.8572975977628517e-06, "loss": 0.80698556, "num_input_tokens_seen": 128310305, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.79296875, "step": 5972, "time_per_iteration": 2.623823642730713 }, { "auxiliary_loss_clip": 0.01149351, "auxiliary_loss_mlp": 0.01031384, "balance_loss_clip": 1.01688838, "balance_loss_mlp": 1.04417777, "epoch": 0.35911618818578084, "flos": 20375391738240.0, "grad_norm": 1.7266668006977202, "language_loss": 0.81184947, "learning_rate": 2.8569562308762697e-06, "loss": 0.83365679, "num_input_tokens_seen": 128328305, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78515625, "step": 5973, "time_per_iteration": 2.627638578414917 }, { "auxiliary_loss_clip": 0.01058636, "auxiliary_loss_mlp": 0.00999964, "balance_loss_clip": 0.99783051, "balance_loss_mlp": 1.01471829, "epoch": 0.3591763114384488, "flos": 41236691685120.0, "grad_norm": 0.911436135171295, "language_loss": 0.56656122, "learning_rate": 2.8566148334071245e-06, "loss": 0.58714724, "num_input_tokens_seen": 128378380, "router_z_loss_clip": 0.0213623, "router_z_loss_mlp": 0.25585938, "step": 5974, "time_per_iteration": 3.0222880840301514 }, { "auxiliary_loss_clip": 0.01133122, "auxiliary_loss_mlp": 0.01036398, "balance_loss_clip": 1.02237868, "balance_loss_mlp": 1.04574609, "epoch": 0.35923643469111677, "flos": 18695454600960.0, "grad_norm": 1.8010373214871107, "language_loss": 0.68951523, "learning_rate": 2.8562734053675997e-06, "loss": 0.71121049, "num_input_tokens_seen": 128394315, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78515625, "step": 5975, "time_per_iteration": 2.50459623336792 }, { "auxiliary_loss_clip": 0.01121525, "auxiliary_loss_mlp": 0.01033339, "balance_loss_clip": 1.02002287, "balance_loss_mlp": 1.04586291, "epoch": 0.35929655794378473, "flos": 25009950931200.0, "grad_norm": 1.5812561727480645, "language_loss": 0.80003905, "learning_rate": 2.8559319467698794e-06, "loss": 0.82158768, "num_input_tokens_seen": 128414515, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 5976, "time_per_iteration": 2.6505115032196045 }, { "auxiliary_loss_clip": 0.01168061, "auxiliary_loss_mlp": 0.01036103, "balance_loss_clip": 1.02170849, "balance_loss_mlp": 1.04628539, "epoch": 0.35935668119645275, "flos": 14975576265600.0, "grad_norm": 1.7360562334522822, "language_loss": 0.78960156, "learning_rate": 2.855590457626149e-06, "loss": 0.81164318, "num_input_tokens_seen": 128430615, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 5977, "time_per_iteration": 2.513921022415161 }, { "auxiliary_loss_clip": 0.01145772, "auxiliary_loss_mlp": 0.01033337, "balance_loss_clip": 1.02019405, "balance_loss_mlp": 1.0435729, "epoch": 0.3594168044491207, "flos": 21178138838400.0, "grad_norm": 1.746588137588913, "language_loss": 0.80142599, "learning_rate": 2.855248937948597e-06, "loss": 0.82321709, "num_input_tokens_seen": 128449480, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7578125, "step": 5978, "time_per_iteration": 2.5738131999969482 }, { "auxiliary_loss_clip": 0.01122835, "auxiliary_loss_mlp": 0.01280541, "balance_loss_clip": 1.01860344, "balance_loss_mlp": 1.04502225, "epoch": 0.3594769277017887, "flos": 27672152365440.0, "grad_norm": 1.7779271648867463, "language_loss": 0.67663974, "learning_rate": 2.8549073877494096e-06, "loss": 0.70067352, "num_input_tokens_seen": 128471465, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.77734375, "step": 5979, "time_per_iteration": 2.565878391265869 }, { "auxiliary_loss_clip": 0.01139996, "auxiliary_loss_mlp": 0.01037271, "balance_loss_clip": 1.02299571, "balance_loss_mlp": 1.04507864, "epoch": 0.35953705095445665, "flos": 23002328995200.0, "grad_norm": 2.05738481339836, "language_loss": 0.67367816, "learning_rate": 2.8545658070407773e-06, "loss": 0.69545078, "num_input_tokens_seen": 128490645, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76953125, "step": 5980, "time_per_iteration": 2.6134085655212402 }, { "auxiliary_loss_clip": 0.01143301, "auxiliary_loss_mlp": 0.01041095, "balance_loss_clip": 1.02533555, "balance_loss_mlp": 1.04394865, "epoch": 0.3595971742071246, "flos": 25513992529920.0, "grad_norm": 1.9381808518086583, "language_loss": 0.70957291, "learning_rate": 2.8542241958348894e-06, "loss": 0.73141682, "num_input_tokens_seen": 128510225, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.81640625, "step": 5981, "time_per_iteration": 2.598248243331909 }, { "auxiliary_loss_clip": 0.01143359, "auxiliary_loss_mlp": 0.01042254, "balance_loss_clip": 1.02619612, "balance_loss_mlp": 1.04678082, "epoch": 0.3596572974597926, "flos": 29862559635840.0, "grad_norm": 1.8115905743417149, "language_loss": 0.71363473, "learning_rate": 2.8538825541439367e-06, "loss": 0.73549092, "num_input_tokens_seen": 128530195, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.7890625, "step": 5982, "time_per_iteration": 2.638885021209717 }, { "auxiliary_loss_clip": 0.01129242, "auxiliary_loss_mlp": 0.01040078, "balance_loss_clip": 1.02661371, "balance_loss_mlp": 1.04560912, "epoch": 0.35971742071246054, "flos": 23112538899840.0, "grad_norm": 1.848636646067643, "language_loss": 0.7575686, "learning_rate": 2.8535408819801127e-06, "loss": 0.77926177, "num_input_tokens_seen": 128549990, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 5983, "time_per_iteration": 2.585498332977295 }, { "auxiliary_loss_clip": 0.01127825, "auxiliary_loss_mlp": 0.01047115, "balance_loss_clip": 1.03114676, "balance_loss_mlp": 1.04598737, "epoch": 0.3597775439651285, "flos": 16725359399040.0, "grad_norm": 2.091950991695041, "language_loss": 0.76487654, "learning_rate": 2.85319917935561e-06, "loss": 0.78662592, "num_input_tokens_seen": 128567925, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.81640625, "step": 5984, "time_per_iteration": 2.511698007583618 }, { "auxiliary_loss_clip": 0.01128951, "auxiliary_loss_mlp": 0.01283753, "balance_loss_clip": 1.0227983, "balance_loss_mlp": 1.0445354, "epoch": 0.3598376672177965, "flos": 19719483436800.0, "grad_norm": 2.237794191447875, "language_loss": 0.86679417, "learning_rate": 2.8528574462826234e-06, "loss": 0.89092118, "num_input_tokens_seen": 128585655, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 5985, "time_per_iteration": 2.6352169513702393 }, { "auxiliary_loss_clip": 0.011289, "auxiliary_loss_mlp": 0.01037292, "balance_loss_clip": 1.02180719, "balance_loss_mlp": 1.04438174, "epoch": 0.35989779047046444, "flos": 17311529445120.0, "grad_norm": 1.387143339902108, "language_loss": 0.7275331, "learning_rate": 2.852515682773348e-06, "loss": 0.74919504, "num_input_tokens_seen": 128604820, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.7578125, "step": 5986, "time_per_iteration": 2.528484344482422 }, { "auxiliary_loss_clip": 0.01153779, "auxiliary_loss_mlp": 0.01038674, "balance_loss_clip": 1.02380919, "balance_loss_mlp": 1.04494262, "epoch": 0.3599579137231324, "flos": 22711237176960.0, "grad_norm": 1.9301688663119305, "language_loss": 0.73555136, "learning_rate": 2.8521738888399815e-06, "loss": 0.75747585, "num_input_tokens_seen": 128623070, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 5987, "time_per_iteration": 2.596172332763672 }, { "auxiliary_loss_clip": 0.01128936, "auxiliary_loss_mlp": 0.01039089, "balance_loss_clip": 1.02390778, "balance_loss_mlp": 1.04818726, "epoch": 0.36001803697580037, "flos": 20959873845120.0, "grad_norm": 1.7898781203502443, "language_loss": 0.69365925, "learning_rate": 2.8518320644947204e-06, "loss": 0.71533948, "num_input_tokens_seen": 128642430, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80859375, "step": 5988, "time_per_iteration": 2.558506965637207 }, { "auxiliary_loss_clip": 0.01136249, "auxiliary_loss_mlp": 0.01038984, "balance_loss_clip": 1.02414823, "balance_loss_mlp": 1.04600239, "epoch": 0.36007816022846834, "flos": 20485565729280.0, "grad_norm": 1.7565095688596248, "language_loss": 0.73038667, "learning_rate": 2.851490209749764e-06, "loss": 0.75213903, "num_input_tokens_seen": 128661285, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 5989, "time_per_iteration": 2.5596933364868164 }, { "auxiliary_loss_clip": 0.01148494, "auxiliary_loss_mlp": 0.01033289, "balance_loss_clip": 1.01889491, "balance_loss_mlp": 1.04448783, "epoch": 0.36013828348113636, "flos": 28001237794560.0, "grad_norm": 1.4785305398250532, "language_loss": 0.79864377, "learning_rate": 2.8511483246173126e-06, "loss": 0.82046157, "num_input_tokens_seen": 128682210, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.76953125, "step": 5990, "time_per_iteration": 2.6413748264312744 }, { "auxiliary_loss_clip": 0.01131258, "auxiliary_loss_mlp": 0.01034382, "balance_loss_clip": 1.01985073, "balance_loss_mlp": 1.04363465, "epoch": 0.3601984067338043, "flos": 20082181017600.0, "grad_norm": 2.922876367115755, "language_loss": 0.83833051, "learning_rate": 2.8508064091095664e-06, "loss": 0.85998696, "num_input_tokens_seen": 128700445, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 5991, "time_per_iteration": 2.598116159439087 }, { "auxiliary_loss_clip": 0.01145385, "auxiliary_loss_mlp": 0.01041609, "balance_loss_clip": 1.0271666, "balance_loss_mlp": 1.04535985, "epoch": 0.3602585299864723, "flos": 18617599872000.0, "grad_norm": 1.9714380742311892, "language_loss": 0.75736326, "learning_rate": 2.8504644632387286e-06, "loss": 0.77923322, "num_input_tokens_seen": 128716855, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.81640625, "step": 5992, "time_per_iteration": 2.527916431427002 }, { "auxiliary_loss_clip": 0.01123523, "auxiliary_loss_mlp": 0.01044423, "balance_loss_clip": 1.02911687, "balance_loss_mlp": 1.04626966, "epoch": 0.36031865323914025, "flos": 19573003774080.0, "grad_norm": 2.3676937609697926, "language_loss": 0.77542603, "learning_rate": 2.850122487017002e-06, "loss": 0.79710543, "num_input_tokens_seen": 128735835, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7734375, "step": 5993, "time_per_iteration": 2.5641634464263916 }, { "auxiliary_loss_clip": 0.01146141, "auxiliary_loss_mlp": 0.01046272, "balance_loss_clip": 1.03076911, "balance_loss_mlp": 1.04772842, "epoch": 0.3603787764918082, "flos": 17490615678720.0, "grad_norm": 1.8898317627294723, "language_loss": 0.7438668, "learning_rate": 2.84978048045659e-06, "loss": 0.76579094, "num_input_tokens_seen": 128752465, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8046875, "step": 5994, "time_per_iteration": 2.535998582839966 }, { "auxiliary_loss_clip": 0.01164759, "auxiliary_loss_mlp": 0.01038957, "balance_loss_clip": 1.02453303, "balance_loss_mlp": 1.04732168, "epoch": 0.3604388997444762, "flos": 15523393564800.0, "grad_norm": 1.830545526221023, "language_loss": 0.6862973, "learning_rate": 2.8494384435696987e-06, "loss": 0.70833451, "num_input_tokens_seen": 128770865, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.82421875, "step": 5995, "time_per_iteration": 2.6565542221069336 }, { "auxiliary_loss_clip": 0.01144221, "auxiliary_loss_mlp": 0.01043972, "balance_loss_clip": 1.02842677, "balance_loss_mlp": 1.04469609, "epoch": 0.36049902299714415, "flos": 17310883000320.0, "grad_norm": 1.9292824814848928, "language_loss": 0.82507527, "learning_rate": 2.849096376368534e-06, "loss": 0.84695721, "num_input_tokens_seen": 128789730, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8203125, "step": 5996, "time_per_iteration": 2.5896148681640625 }, { "auxiliary_loss_clip": 0.01124372, "auxiliary_loss_mlp": 0.01037755, "balance_loss_clip": 1.02306235, "balance_loss_mlp": 1.04612565, "epoch": 0.3605591462498121, "flos": 17056025026560.0, "grad_norm": 1.9545451553307471, "language_loss": 0.7359798, "learning_rate": 2.8487542788653044e-06, "loss": 0.75760108, "num_input_tokens_seen": 128806610, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 5997, "time_per_iteration": 4.026880979537964 }, { "auxiliary_loss_clip": 0.01133021, "auxiliary_loss_mlp": 0.01037372, "balance_loss_clip": 1.02297115, "balance_loss_mlp": 1.04656112, "epoch": 0.3606192695024801, "flos": 16836862193280.0, "grad_norm": 4.182696503643641, "language_loss": 0.69172943, "learning_rate": 2.848412151072218e-06, "loss": 0.71343338, "num_input_tokens_seen": 128824830, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 5998, "time_per_iteration": 2.6251282691955566 }, { "auxiliary_loss_clip": 0.01161299, "auxiliary_loss_mlp": 0.01038087, "balance_loss_clip": 1.02312636, "balance_loss_mlp": 1.04618669, "epoch": 0.36067939275514804, "flos": 12129655743360.0, "grad_norm": 2.1879819831442946, "language_loss": 0.76441014, "learning_rate": 2.8480699930014834e-06, "loss": 0.78640401, "num_input_tokens_seen": 128838170, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7890625, "step": 5999, "time_per_iteration": 2.6071507930755615 }, { "auxiliary_loss_clip": 0.01153405, "auxiliary_loss_mlp": 0.01040851, "balance_loss_clip": 1.02611637, "balance_loss_mlp": 1.04682767, "epoch": 0.360739516007816, "flos": 18041449720320.0, "grad_norm": 1.7111124201952501, "language_loss": 0.78334403, "learning_rate": 2.847727804665313e-06, "loss": 0.80528653, "num_input_tokens_seen": 128855625, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.79296875, "step": 6000, "time_per_iteration": 2.6447131633758545 }, { "auxiliary_loss_clip": 0.01152094, "auxiliary_loss_mlp": 0.01036824, "balance_loss_clip": 1.02289438, "balance_loss_mlp": 1.04575968, "epoch": 0.360799639260484, "flos": 18549800951040.0, "grad_norm": 1.7197218156597378, "language_loss": 0.7813623, "learning_rate": 2.8473855860759175e-06, "loss": 0.8032515, "num_input_tokens_seen": 128873540, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.79296875, "step": 6001, "time_per_iteration": 2.5772290229797363 }, { "auxiliary_loss_clip": 0.0112284, "auxiliary_loss_mlp": 0.01277402, "balance_loss_clip": 1.01605296, "balance_loss_mlp": 1.04635811, "epoch": 0.36085976251315194, "flos": 19682028529920.0, "grad_norm": 1.8810540103151185, "language_loss": 0.8363623, "learning_rate": 2.847043337245511e-06, "loss": 0.86036468, "num_input_tokens_seen": 128889925, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 6002, "time_per_iteration": 4.0365989208221436 }, { "auxiliary_loss_clip": 0.01136985, "auxiliary_loss_mlp": 0.0102763, "balance_loss_clip": 1.01420128, "balance_loss_mlp": 1.04559362, "epoch": 0.3609198857658199, "flos": 24198943703040.0, "grad_norm": 1.9208342498644688, "language_loss": 0.90804291, "learning_rate": 2.8467010581863058e-06, "loss": 0.92968905, "num_input_tokens_seen": 128906890, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73828125, "step": 6003, "time_per_iteration": 4.115615606307983 }, { "auxiliary_loss_clip": 0.01072215, "auxiliary_loss_mlp": 0.01015335, "balance_loss_clip": 1.01308239, "balance_loss_mlp": 1.02064228, "epoch": 0.3609800090184879, "flos": 57115995160320.0, "grad_norm": 0.9750920464790794, "language_loss": 0.53285551, "learning_rate": 2.8463587489105175e-06, "loss": 0.55373102, "num_input_tokens_seen": 128965940, "router_z_loss_clip": 0.02258301, "router_z_loss_mlp": 0.25390625, "step": 6004, "time_per_iteration": 3.0685906410217285 }, { "auxiliary_loss_clip": 0.01141949, "auxiliary_loss_mlp": 0.0128329, "balance_loss_clip": 1.01943564, "balance_loss_mlp": 1.04409957, "epoch": 0.3610401322711559, "flos": 20811239366400.0, "grad_norm": 1.6008063948897162, "language_loss": 0.77601981, "learning_rate": 2.846016409430363e-06, "loss": 0.80027217, "num_input_tokens_seen": 128985835, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8046875, "step": 6005, "time_per_iteration": 2.6231162548065186 }, { "auxiliary_loss_clip": 0.01132059, "auxiliary_loss_mlp": 0.01039237, "balance_loss_clip": 1.02507472, "balance_loss_mlp": 1.0455811, "epoch": 0.36110025552382385, "flos": 13699167494400.0, "grad_norm": 1.9808537918437525, "language_loss": 0.79190207, "learning_rate": 2.8456740397580586e-06, "loss": 0.81361496, "num_input_tokens_seen": 129003120, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 6006, "time_per_iteration": 4.070719003677368 }, { "auxiliary_loss_clip": 0.01127378, "auxiliary_loss_mlp": 0.0103732, "balance_loss_clip": 1.02079749, "balance_loss_mlp": 1.04737294, "epoch": 0.3611603787764918, "flos": 22455014486400.0, "grad_norm": 1.755316017422702, "language_loss": 0.84222776, "learning_rate": 2.845331639905824e-06, "loss": 0.86387473, "num_input_tokens_seen": 129021645, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.80078125, "step": 6007, "time_per_iteration": 2.592278003692627 }, { "auxiliary_loss_clip": 0.011482, "auxiliary_loss_mlp": 0.01035437, "balance_loss_clip": 1.01892614, "balance_loss_mlp": 1.04614782, "epoch": 0.3612205020291598, "flos": 20886651970560.0, "grad_norm": 2.3172783114134123, "language_loss": 0.72526491, "learning_rate": 2.844989209885877e-06, "loss": 0.74710125, "num_input_tokens_seen": 129038375, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.84375, "step": 6008, "time_per_iteration": 2.636749029159546 }, { "auxiliary_loss_clip": 0.01131624, "auxiliary_loss_mlp": 0.01034663, "balance_loss_clip": 1.01999998, "balance_loss_mlp": 1.04410625, "epoch": 0.36128062528182775, "flos": 15741981780480.0, "grad_norm": 2.103130859553253, "language_loss": 0.82798213, "learning_rate": 2.844646749710439e-06, "loss": 0.84964502, "num_input_tokens_seen": 129056235, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78515625, "step": 6009, "time_per_iteration": 2.600473403930664 }, { "auxiliary_loss_clip": 0.01152072, "auxiliary_loss_mlp": 0.01041559, "balance_loss_clip": 1.02633047, "balance_loss_mlp": 1.04622841, "epoch": 0.3613407485344957, "flos": 16764502245120.0, "grad_norm": 1.7300018014716334, "language_loss": 0.76460755, "learning_rate": 2.844304259391731e-06, "loss": 0.78654391, "num_input_tokens_seen": 129072405, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7890625, "step": 6010, "time_per_iteration": 2.601632595062256 }, { "auxiliary_loss_clip": 0.01139424, "auxiliary_loss_mlp": 0.01033127, "balance_loss_clip": 1.01863074, "balance_loss_mlp": 1.04434419, "epoch": 0.3614008717871637, "flos": 20371189847040.0, "grad_norm": 1.7996817938057934, "language_loss": 0.83229846, "learning_rate": 2.8439617389419757e-06, "loss": 0.85402405, "num_input_tokens_seen": 129090225, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 6011, "time_per_iteration": 2.5764079093933105 }, { "auxiliary_loss_clip": 0.01131119, "auxiliary_loss_mlp": 0.01038645, "balance_loss_clip": 1.02271855, "balance_loss_mlp": 1.04765534, "epoch": 0.36146099503983165, "flos": 22776665800320.0, "grad_norm": 2.065962621968393, "language_loss": 0.62300169, "learning_rate": 2.843619188373397e-06, "loss": 0.64469934, "num_input_tokens_seen": 129107685, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8359375, "step": 6012, "time_per_iteration": 2.585068464279175 }, { "auxiliary_loss_clip": 0.01135734, "auxiliary_loss_mlp": 0.01035914, "balance_loss_clip": 1.02175248, "balance_loss_mlp": 1.04284382, "epoch": 0.3615211182924996, "flos": 22996654646400.0, "grad_norm": 1.778430425919972, "language_loss": 0.83842218, "learning_rate": 2.843276607698219e-06, "loss": 0.86013865, "num_input_tokens_seen": 129125315, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75390625, "step": 6013, "time_per_iteration": 2.5548675060272217 }, { "auxiliary_loss_clip": 0.01124442, "auxiliary_loss_mlp": 0.01041739, "balance_loss_clip": 1.02597904, "balance_loss_mlp": 1.04581559, "epoch": 0.3615812415451676, "flos": 16648079287680.0, "grad_norm": 1.7282117370010222, "language_loss": 0.91325396, "learning_rate": 2.8429339969286687e-06, "loss": 0.93491578, "num_input_tokens_seen": 129141600, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.78515625, "step": 6014, "time_per_iteration": 2.5592904090881348 }, { "auxiliary_loss_clip": 0.01120978, "auxiliary_loss_mlp": 0.01035338, "balance_loss_clip": 1.02089024, "balance_loss_mlp": 1.0434835, "epoch": 0.36164136479783554, "flos": 21320093387520.0, "grad_norm": 1.7126345723306378, "language_loss": 0.73642516, "learning_rate": 2.8425913560769725e-06, "loss": 0.75798833, "num_input_tokens_seen": 129160665, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 6015, "time_per_iteration": 2.5427188873291016 }, { "auxiliary_loss_clip": 0.01180486, "auxiliary_loss_mlp": 0.01036313, "balance_loss_clip": 1.02132869, "balance_loss_mlp": 1.04729116, "epoch": 0.3617014880505035, "flos": 24169569356160.0, "grad_norm": 2.4412473670465795, "language_loss": 0.64688045, "learning_rate": 2.8422486851553577e-06, "loss": 0.66904843, "num_input_tokens_seen": 129179220, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 6016, "time_per_iteration": 2.6552693843841553 }, { "auxiliary_loss_clip": 0.01133088, "auxiliary_loss_mlp": 0.01038253, "balance_loss_clip": 1.02156341, "balance_loss_mlp": 1.04514647, "epoch": 0.3617616113031715, "flos": 39014824101120.0, "grad_norm": 1.6457440473720104, "language_loss": 0.71403658, "learning_rate": 2.8419059841760545e-06, "loss": 0.73574996, "num_input_tokens_seen": 129200385, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.7890625, "step": 6017, "time_per_iteration": 2.6669185161590576 }, { "auxiliary_loss_clip": 0.01135208, "auxiliary_loss_mlp": 0.01039345, "balance_loss_clip": 1.02375209, "balance_loss_mlp": 1.04462254, "epoch": 0.3618217345558395, "flos": 12130840892160.0, "grad_norm": 1.7365742946043399, "language_loss": 0.73805517, "learning_rate": 2.8415632531512916e-06, "loss": 0.75980067, "num_input_tokens_seen": 129217395, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.81640625, "step": 6018, "time_per_iteration": 2.616217613220215 }, { "auxiliary_loss_clip": 0.01144486, "auxiliary_loss_mlp": 0.01031668, "balance_loss_clip": 1.0172677, "balance_loss_mlp": 1.04247475, "epoch": 0.36188185780850746, "flos": 24935005203840.0, "grad_norm": 1.990535637800626, "language_loss": 0.68847036, "learning_rate": 2.841220492093301e-06, "loss": 0.7102319, "num_input_tokens_seen": 129238940, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75, "step": 6019, "time_per_iteration": 2.644057273864746 }, { "auxiliary_loss_clip": 0.01143715, "auxiliary_loss_mlp": 0.0103669, "balance_loss_clip": 1.02101362, "balance_loss_mlp": 1.04566121, "epoch": 0.3619419810611754, "flos": 20958832350720.0, "grad_norm": 1.7906295158838488, "language_loss": 0.76410639, "learning_rate": 2.840877701014316e-06, "loss": 0.78591049, "num_input_tokens_seen": 129258240, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8046875, "step": 6020, "time_per_iteration": 2.65213680267334 }, { "auxiliary_loss_clip": 0.01136674, "auxiliary_loss_mlp": 0.01044138, "balance_loss_clip": 1.02788937, "balance_loss_mlp": 1.04751968, "epoch": 0.3620021043138434, "flos": 22528882805760.0, "grad_norm": 1.8333693252107897, "language_loss": 0.73628354, "learning_rate": 2.840534879926567e-06, "loss": 0.75809169, "num_input_tokens_seen": 129279040, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.80078125, "step": 6021, "time_per_iteration": 2.6032307147979736 }, { "auxiliary_loss_clip": 0.01148751, "auxiliary_loss_mlp": 0.01035919, "balance_loss_clip": 1.02213264, "balance_loss_mlp": 1.04391658, "epoch": 0.36206222756651135, "flos": 15596687266560.0, "grad_norm": 1.7250710977553119, "language_loss": 0.81126648, "learning_rate": 2.8401920288422915e-06, "loss": 0.83311319, "num_input_tokens_seen": 129295415, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.77734375, "step": 6022, "time_per_iteration": 2.618227958679199 }, { "auxiliary_loss_clip": 0.01130199, "auxiliary_loss_mlp": 0.01039644, "balance_loss_clip": 1.02573848, "balance_loss_mlp": 1.04648519, "epoch": 0.3621223508191793, "flos": 23587170238080.0, "grad_norm": 1.7229544371062604, "language_loss": 0.81250763, "learning_rate": 2.8398491477737235e-06, "loss": 0.8342061, "num_input_tokens_seen": 129312620, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 6023, "time_per_iteration": 2.6865432262420654 }, { "auxiliary_loss_clip": 0.01141485, "auxiliary_loss_mlp": 0.01036245, "balance_loss_clip": 1.02108145, "balance_loss_mlp": 1.04394484, "epoch": 0.3621824740718473, "flos": 22309899540480.0, "grad_norm": 1.6696527572430595, "language_loss": 0.79567075, "learning_rate": 2.8395062367330997e-06, "loss": 0.81744802, "num_input_tokens_seen": 129331825, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.796875, "step": 6024, "time_per_iteration": 2.6032915115356445 }, { "auxiliary_loss_clip": 0.01134638, "auxiliary_loss_mlp": 0.01031365, "balance_loss_clip": 1.01795936, "balance_loss_mlp": 1.04251301, "epoch": 0.36224259732451525, "flos": 16763640318720.0, "grad_norm": 2.063715150002523, "language_loss": 0.74563551, "learning_rate": 2.839163295732658e-06, "loss": 0.76729548, "num_input_tokens_seen": 129350400, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73828125, "step": 6025, "time_per_iteration": 2.574662446975708 }, { "auxiliary_loss_clip": 0.01127164, "auxiliary_loss_mlp": 0.01283132, "balance_loss_clip": 1.0221715, "balance_loss_mlp": 1.04235375, "epoch": 0.3623027205771832, "flos": 23149742411520.0, "grad_norm": 1.8492406840632616, "language_loss": 0.7204237, "learning_rate": 2.8388203247846365e-06, "loss": 0.74452668, "num_input_tokens_seen": 129371155, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 6026, "time_per_iteration": 2.5695769786834717 }, { "auxiliary_loss_clip": 0.01136133, "auxiliary_loss_mlp": 0.01044041, "balance_loss_clip": 1.02819753, "balance_loss_mlp": 1.04551649, "epoch": 0.3623628438298512, "flos": 28549162834560.0, "grad_norm": 2.141053528404469, "language_loss": 0.78946227, "learning_rate": 2.8384773239012757e-06, "loss": 0.81126398, "num_input_tokens_seen": 129391230, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8203125, "step": 6027, "time_per_iteration": 2.6013729572296143 }, { "auxiliary_loss_clip": 0.01140301, "auxiliary_loss_mlp": 0.01040677, "balance_loss_clip": 1.02525091, "balance_loss_mlp": 1.04344606, "epoch": 0.36242296708251914, "flos": 25484941405440.0, "grad_norm": 2.143585834470522, "language_loss": 0.67631614, "learning_rate": 2.838134293094815e-06, "loss": 0.69812596, "num_input_tokens_seen": 129410065, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.7890625, "step": 6028, "time_per_iteration": 2.569890022277832 }, { "auxiliary_loss_clip": 0.01147051, "auxiliary_loss_mlp": 0.01033949, "balance_loss_clip": 1.02007294, "balance_loss_mlp": 1.04402649, "epoch": 0.3624830903351871, "flos": 16290373697280.0, "grad_norm": 2.7934992757458756, "language_loss": 0.8568812, "learning_rate": 2.8377912323774986e-06, "loss": 0.8786912, "num_input_tokens_seen": 129428655, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 6029, "time_per_iteration": 2.549084424972534 }, { "auxiliary_loss_clip": 0.01145496, "auxiliary_loss_mlp": 0.01035392, "balance_loss_clip": 1.02155185, "balance_loss_mlp": 1.04347229, "epoch": 0.36254321358785513, "flos": 18296307694080.0, "grad_norm": 1.6145854973627352, "language_loss": 0.72562438, "learning_rate": 2.8374481417615675e-06, "loss": 0.7474333, "num_input_tokens_seen": 129447845, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 6030, "time_per_iteration": 2.636721134185791 }, { "auxiliary_loss_clip": 0.01142558, "auxiliary_loss_mlp": 0.01042102, "balance_loss_clip": 1.02544236, "balance_loss_mlp": 1.04266143, "epoch": 0.3626033368405231, "flos": 14865294533760.0, "grad_norm": 2.0438806431451777, "language_loss": 0.74420875, "learning_rate": 2.8371050212592664e-06, "loss": 0.76605535, "num_input_tokens_seen": 129463275, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8203125, "step": 6031, "time_per_iteration": 2.5649657249450684 }, { "auxiliary_loss_clip": 0.01145499, "auxiliary_loss_mlp": 0.01032257, "balance_loss_clip": 1.01787448, "balance_loss_mlp": 1.04304624, "epoch": 0.36266346009319106, "flos": 22306595489280.0, "grad_norm": 1.5325824405429234, "language_loss": 0.73182821, "learning_rate": 2.8367618708828413e-06, "loss": 0.75360572, "num_input_tokens_seen": 129483205, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7578125, "step": 6032, "time_per_iteration": 2.605257511138916 }, { "auxiliary_loss_clip": 0.0115691, "auxiliary_loss_mlp": 0.01041983, "balance_loss_clip": 1.02757072, "balance_loss_mlp": 1.04354072, "epoch": 0.362723583345859, "flos": 18222331633920.0, "grad_norm": 1.714305770945228, "language_loss": 0.77419662, "learning_rate": 2.836418690644536e-06, "loss": 0.79618555, "num_input_tokens_seen": 129499885, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 6033, "time_per_iteration": 2.6891543865203857 }, { "auxiliary_loss_clip": 0.01077159, "auxiliary_loss_mlp": 0.010083, "balance_loss_clip": 1.00635707, "balance_loss_mlp": 1.01649284, "epoch": 0.362783706598527, "flos": 68499174458880.0, "grad_norm": 0.7742878546411087, "language_loss": 0.64741278, "learning_rate": 2.8360754805566004e-06, "loss": 0.66826737, "num_input_tokens_seen": 129561885, "router_z_loss_clip": 0.01940918, "router_z_loss_mlp": 0.24902344, "step": 6034, "time_per_iteration": 3.3053438663482666 }, { "auxiliary_loss_clip": 0.01127963, "auxiliary_loss_mlp": 0.01035312, "balance_loss_clip": 1.02069688, "balance_loss_mlp": 1.04269624, "epoch": 0.36284382985119495, "flos": 26576589594240.0, "grad_norm": 1.648377304006154, "language_loss": 0.89781141, "learning_rate": 2.835732240631281e-06, "loss": 0.9194442, "num_input_tokens_seen": 129582325, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.76171875, "step": 6035, "time_per_iteration": 2.698869466781616 }, { "auxiliary_loss_clip": 0.01140183, "auxiliary_loss_mlp": 0.01040384, "balance_loss_clip": 1.02486324, "balance_loss_mlp": 1.04220653, "epoch": 0.3629039531038629, "flos": 20156767608960.0, "grad_norm": 1.860545315569077, "language_loss": 0.74034059, "learning_rate": 2.8353889708808274e-06, "loss": 0.76214623, "num_input_tokens_seen": 129600350, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.80078125, "step": 6036, "time_per_iteration": 2.6426496505737305 }, { "auxiliary_loss_clip": 0.01142436, "auxiliary_loss_mlp": 0.01034309, "balance_loss_clip": 1.01937163, "balance_loss_mlp": 1.04350996, "epoch": 0.3629640763565309, "flos": 18625716345600.0, "grad_norm": 1.789923093048975, "language_loss": 0.75877756, "learning_rate": 2.835045671317491e-06, "loss": 0.780545, "num_input_tokens_seen": 129618425, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8125, "step": 6037, "time_per_iteration": 2.6445462703704834 }, { "auxiliary_loss_clip": 0.01126347, "auxiliary_loss_mlp": 0.01046548, "balance_loss_clip": 1.03128946, "balance_loss_mlp": 1.04302382, "epoch": 0.36302419960919885, "flos": 19571459489280.0, "grad_norm": 1.456289154523596, "language_loss": 0.78695947, "learning_rate": 2.834702341953522e-06, "loss": 0.8086884, "num_input_tokens_seen": 129636750, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7421875, "step": 6038, "time_per_iteration": 4.043387413024902 }, { "auxiliary_loss_clip": 0.01051556, "auxiliary_loss_mlp": 0.01006037, "balance_loss_clip": 1.00408161, "balance_loss_mlp": 1.01717484, "epoch": 0.3630843228618668, "flos": 63797606444160.0, "grad_norm": 0.8284058088691394, "language_loss": 0.63412797, "learning_rate": 2.8343589828011737e-06, "loss": 0.65470392, "num_input_tokens_seen": 129699030, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.25390625, "step": 6039, "time_per_iteration": 3.180142402648926 }, { "auxiliary_loss_clip": 0.01138667, "auxiliary_loss_mlp": 0.01037851, "balance_loss_clip": 1.02365947, "balance_loss_mlp": 1.04394519, "epoch": 0.3631444461145348, "flos": 21835160461440.0, "grad_norm": 1.903072331408725, "language_loss": 0.70776367, "learning_rate": 2.8340155938726993e-06, "loss": 0.72952878, "num_input_tokens_seen": 129717135, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 6040, "time_per_iteration": 2.572064161300659 }, { "auxiliary_loss_clip": 0.01132413, "auxiliary_loss_mlp": 0.01033755, "balance_loss_clip": 1.0193547, "balance_loss_mlp": 1.04423487, "epoch": 0.36320456936720275, "flos": 21722041555200.0, "grad_norm": 2.0656410702477173, "language_loss": 0.81180644, "learning_rate": 2.833672175180354e-06, "loss": 0.8334682, "num_input_tokens_seen": 129735940, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.796875, "step": 6041, "time_per_iteration": 2.6519064903259277 }, { "auxiliary_loss_clip": 0.0113754, "auxiliary_loss_mlp": 0.01033931, "balance_loss_clip": 1.0180645, "balance_loss_mlp": 1.041399, "epoch": 0.3632646926198707, "flos": 17019072910080.0, "grad_norm": 3.2865160667591584, "language_loss": 0.83958733, "learning_rate": 2.8333287267363934e-06, "loss": 0.86130202, "num_input_tokens_seen": 129752790, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.78125, "step": 6042, "time_per_iteration": 2.5369935035705566 }, { "auxiliary_loss_clip": 0.01137975, "auxiliary_loss_mlp": 0.01040639, "balance_loss_clip": 1.02626848, "balance_loss_mlp": 1.04420018, "epoch": 0.36332481587253873, "flos": 23331163029120.0, "grad_norm": 1.5591978268824664, "language_loss": 0.78432202, "learning_rate": 2.832985248553074e-06, "loss": 0.80610824, "num_input_tokens_seen": 129773655, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7578125, "step": 6043, "time_per_iteration": 2.73264741897583 }, { "auxiliary_loss_clip": 0.01144927, "auxiliary_loss_mlp": 0.01034864, "balance_loss_clip": 1.01962924, "balance_loss_mlp": 1.04341662, "epoch": 0.3633849391252067, "flos": 10743539857920.0, "grad_norm": 1.8141336496312692, "language_loss": 0.65699959, "learning_rate": 2.8326417406426536e-06, "loss": 0.67879748, "num_input_tokens_seen": 129791605, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.75, "step": 6044, "time_per_iteration": 5.57329797744751 }, { "auxiliary_loss_clip": 0.01138251, "auxiliary_loss_mlp": 0.01028884, "balance_loss_clip": 1.01419115, "balance_loss_mlp": 1.04432631, "epoch": 0.36344506237787466, "flos": 25849147357440.0, "grad_norm": 1.5821674525138842, "language_loss": 0.8097468, "learning_rate": 2.8322982030173908e-06, "loss": 0.83141816, "num_input_tokens_seen": 129811075, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7578125, "step": 6045, "time_per_iteration": 2.6790292263031006 }, { "auxiliary_loss_clip": 0.01149364, "auxiliary_loss_mlp": 0.01038694, "balance_loss_clip": 1.02395415, "balance_loss_mlp": 1.04453516, "epoch": 0.3635051856305426, "flos": 30154046503680.0, "grad_norm": 1.8871288527397552, "language_loss": 0.64809263, "learning_rate": 2.8319546356895467e-06, "loss": 0.66997325, "num_input_tokens_seen": 129833755, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78125, "step": 6046, "time_per_iteration": 2.756725311279297 }, { "auxiliary_loss_clip": 0.01147638, "auxiliary_loss_mlp": 0.01037934, "balance_loss_clip": 1.02393317, "balance_loss_mlp": 1.04443645, "epoch": 0.3635653088832106, "flos": 22198396746240.0, "grad_norm": 1.492811595700444, "language_loss": 0.77637613, "learning_rate": 2.831611038671382e-06, "loss": 0.79823184, "num_input_tokens_seen": 129854475, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.76953125, "step": 6047, "time_per_iteration": 2.6261355876922607 }, { "auxiliary_loss_clip": 0.01137889, "auxiliary_loss_mlp": 0.01041449, "balance_loss_clip": 1.02466452, "balance_loss_mlp": 1.04445291, "epoch": 0.36362543213587856, "flos": 24787053083520.0, "grad_norm": 1.811185755352866, "language_loss": 0.79737264, "learning_rate": 2.8312674119751585e-06, "loss": 0.81916606, "num_input_tokens_seen": 129873530, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.84375, "step": 6048, "time_per_iteration": 4.213382005691528 }, { "auxiliary_loss_clip": 0.01051083, "auxiliary_loss_mlp": 0.01001223, "balance_loss_clip": 0.99916023, "balance_loss_mlp": 1.01724088, "epoch": 0.3636855553885465, "flos": 62526369231360.0, "grad_norm": 0.7561321140234355, "language_loss": 0.52597237, "learning_rate": 2.8309237556131385e-06, "loss": 0.54649544, "num_input_tokens_seen": 129940400, "router_z_loss_clip": 0.02062988, "router_z_loss_mlp": 0.25195312, "step": 6049, "time_per_iteration": 3.2807979583740234 }, { "auxiliary_loss_clip": 0.01149531, "auxiliary_loss_mlp": 0.01037564, "balance_loss_clip": 1.02215004, "balance_loss_mlp": 1.04405522, "epoch": 0.3637456786412145, "flos": 24060652341120.0, "grad_norm": 1.7480602994060788, "language_loss": 0.86011362, "learning_rate": 2.8305800695975873e-06, "loss": 0.88198459, "num_input_tokens_seen": 129958635, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.78125, "step": 6050, "time_per_iteration": 2.6653988361358643 }, { "auxiliary_loss_clip": 0.01129915, "auxiliary_loss_mlp": 0.01282879, "balance_loss_clip": 1.02179909, "balance_loss_mlp": 1.04462337, "epoch": 0.36380580189388245, "flos": 16691495852160.0, "grad_norm": 1.7620745359441334, "language_loss": 0.78177774, "learning_rate": 2.8302363539407703e-06, "loss": 0.8059057, "num_input_tokens_seen": 129977685, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 6051, "time_per_iteration": 2.6517140865325928 }, { "auxiliary_loss_clip": 0.01130157, "auxiliary_loss_mlp": 0.01035935, "balance_loss_clip": 1.02210665, "balance_loss_mlp": 1.04348719, "epoch": 0.3638659251465504, "flos": 25114091437440.0, "grad_norm": 3.7999151580121566, "language_loss": 0.82780659, "learning_rate": 2.829892608654953e-06, "loss": 0.84946752, "num_input_tokens_seen": 129997530, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.77734375, "step": 6052, "time_per_iteration": 2.570828676223755 }, { "auxiliary_loss_clip": 0.01142982, "auxiliary_loss_mlp": 0.01037506, "balance_loss_clip": 1.02395833, "balance_loss_mlp": 1.0423919, "epoch": 0.3639260483992184, "flos": 23003011353600.0, "grad_norm": 1.6411681668346974, "language_loss": 0.72334409, "learning_rate": 2.829548833752404e-06, "loss": 0.74514902, "num_input_tokens_seen": 130017955, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 6053, "time_per_iteration": 2.6671528816223145 }, { "auxiliary_loss_clip": 0.01087193, "auxiliary_loss_mlp": 0.01002349, "balance_loss_clip": 1.0002507, "balance_loss_mlp": 1.01717639, "epoch": 0.36398617165188635, "flos": 70716011160960.0, "grad_norm": 0.7651736030627533, "language_loss": 0.61263424, "learning_rate": 2.8292050292453904e-06, "loss": 0.63352966, "num_input_tokens_seen": 130074275, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.25, "step": 6054, "time_per_iteration": 3.2782163619995117 }, { "auxiliary_loss_clip": 0.01141563, "auxiliary_loss_mlp": 0.01039002, "balance_loss_clip": 1.02328444, "balance_loss_mlp": 1.04419482, "epoch": 0.3640462949045543, "flos": 22235456603520.0, "grad_norm": 1.8226732996389343, "language_loss": 0.75649202, "learning_rate": 2.828861195146182e-06, "loss": 0.77829766, "num_input_tokens_seen": 130091375, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.79296875, "step": 6055, "time_per_iteration": 2.6045377254486084 }, { "auxiliary_loss_clip": 0.01121561, "auxiliary_loss_mlp": 0.01039083, "balance_loss_clip": 1.02399123, "balance_loss_mlp": 1.04456973, "epoch": 0.3641064181572223, "flos": 21543529939200.0, "grad_norm": 2.118532571841897, "language_loss": 0.74853075, "learning_rate": 2.82851733146705e-06, "loss": 0.77013707, "num_input_tokens_seen": 130111595, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.76953125, "step": 6056, "time_per_iteration": 2.608207941055298 }, { "auxiliary_loss_clip": 0.01140109, "auxiliary_loss_mlp": 0.01039595, "balance_loss_clip": 1.02502775, "balance_loss_mlp": 1.04576135, "epoch": 0.3641665414098903, "flos": 22273306560000.0, "grad_norm": 2.277277758783967, "language_loss": 0.79722482, "learning_rate": 2.8281734382202657e-06, "loss": 0.81902182, "num_input_tokens_seen": 130131440, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.765625, "step": 6057, "time_per_iteration": 2.584653377532959 }, { "auxiliary_loss_clip": 0.01139145, "auxiliary_loss_mlp": 0.01031298, "balance_loss_clip": 1.0171597, "balance_loss_mlp": 1.04504395, "epoch": 0.36422666466255826, "flos": 28329676778880.0, "grad_norm": 1.9623361057934463, "language_loss": 0.80211484, "learning_rate": 2.8278295154181017e-06, "loss": 0.82381928, "num_input_tokens_seen": 130151375, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 6058, "time_per_iteration": 2.6483051776885986 }, { "auxiliary_loss_clip": 0.01137404, "auxiliary_loss_mlp": 0.01279733, "balance_loss_clip": 1.01775503, "balance_loss_mlp": 1.04228747, "epoch": 0.36428678791522623, "flos": 24170503109760.0, "grad_norm": 1.8325982564465848, "language_loss": 0.84978139, "learning_rate": 2.8274855630728316e-06, "loss": 0.87395275, "num_input_tokens_seen": 130169960, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 6059, "time_per_iteration": 2.597874164581299 }, { "auxiliary_loss_clip": 0.01149524, "auxiliary_loss_mlp": 0.01038326, "balance_loss_clip": 1.02310288, "balance_loss_mlp": 1.04369402, "epoch": 0.3643469111678942, "flos": 22528451842560.0, "grad_norm": 1.5605446260019864, "language_loss": 0.87875903, "learning_rate": 2.82714158119673e-06, "loss": 0.90063751, "num_input_tokens_seen": 130189800, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7890625, "step": 6060, "time_per_iteration": 2.614321231842041 }, { "auxiliary_loss_clip": 0.01133274, "auxiliary_loss_mlp": 0.01037448, "balance_loss_clip": 1.02150977, "balance_loss_mlp": 1.04414129, "epoch": 0.36440703442056216, "flos": 19426595938560.0, "grad_norm": 2.1805990285835564, "language_loss": 0.66780317, "learning_rate": 2.826797569802074e-06, "loss": 0.68951035, "num_input_tokens_seen": 130206370, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8046875, "step": 6061, "time_per_iteration": 2.566739559173584 }, { "auxiliary_loss_clip": 0.01143005, "auxiliary_loss_mlp": 0.01034069, "balance_loss_clip": 1.01863146, "balance_loss_mlp": 1.04509807, "epoch": 0.3644671576732301, "flos": 18040515966720.0, "grad_norm": 3.4823622657421636, "language_loss": 0.75397015, "learning_rate": 2.826453528901139e-06, "loss": 0.77574086, "num_input_tokens_seen": 130224445, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8046875, "step": 6062, "time_per_iteration": 2.8226478099823 }, { "auxiliary_loss_clip": 0.01137589, "auxiliary_loss_mlp": 0.01034873, "balance_loss_clip": 1.01845789, "balance_loss_mlp": 1.04290438, "epoch": 0.3645272809258981, "flos": 21542811667200.0, "grad_norm": 1.5691498194462012, "language_loss": 0.72379774, "learning_rate": 2.826109458506203e-06, "loss": 0.74552238, "num_input_tokens_seen": 130245380, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.765625, "step": 6063, "time_per_iteration": 2.6093199253082275 }, { "auxiliary_loss_clip": 0.01126625, "auxiliary_loss_mlp": 0.01036024, "balance_loss_clip": 1.02124763, "balance_loss_mlp": 1.04134035, "epoch": 0.36458740417856605, "flos": 22746860490240.0, "grad_norm": 1.972112927670907, "language_loss": 0.67679352, "learning_rate": 2.825765358629546e-06, "loss": 0.69842005, "num_input_tokens_seen": 130265575, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76171875, "step": 6064, "time_per_iteration": 2.679175853729248 }, { "auxiliary_loss_clip": 0.01140111, "auxiliary_loss_mlp": 0.01051932, "balance_loss_clip": 1.03646421, "balance_loss_mlp": 1.04440331, "epoch": 0.364647527431234, "flos": 26140670138880.0, "grad_norm": 1.9725774145536201, "language_loss": 0.74393761, "learning_rate": 2.825421229283447e-06, "loss": 0.76585805, "num_input_tokens_seen": 130286195, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.78125, "step": 6065, "time_per_iteration": 2.6051199436187744 }, { "auxiliary_loss_clip": 0.01141275, "auxiliary_loss_mlp": 0.01042113, "balance_loss_clip": 1.02565062, "balance_loss_mlp": 1.04253221, "epoch": 0.364707650683902, "flos": 31029907737600.0, "grad_norm": 1.8423712109365906, "language_loss": 0.7484622, "learning_rate": 2.825077070480188e-06, "loss": 0.77029604, "num_input_tokens_seen": 130306095, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8125, "step": 6066, "time_per_iteration": 2.7011613845825195 }, { "auxiliary_loss_clip": 0.01126173, "auxiliary_loss_mlp": 0.01033885, "balance_loss_clip": 1.02046204, "balance_loss_mlp": 1.04221296, "epoch": 0.36476777393656995, "flos": 19572896033280.0, "grad_norm": 2.015985997487453, "language_loss": 0.76577091, "learning_rate": 2.8247328822320505e-06, "loss": 0.78737146, "num_input_tokens_seen": 130324685, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 6067, "time_per_iteration": 2.523832082748413 }, { "auxiliary_loss_clip": 0.01116898, "auxiliary_loss_mlp": 0.01036707, "balance_loss_clip": 1.02346337, "balance_loss_mlp": 1.04183912, "epoch": 0.3648278971892379, "flos": 17748849530880.0, "grad_norm": 1.9241252671267326, "language_loss": 0.70790768, "learning_rate": 2.8243886645513176e-06, "loss": 0.72944367, "num_input_tokens_seen": 130343855, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 6068, "time_per_iteration": 2.5662434101104736 }, { "auxiliary_loss_clip": 0.01129899, "auxiliary_loss_mlp": 0.01037135, "balance_loss_clip": 1.02223408, "balance_loss_mlp": 1.04180467, "epoch": 0.3648880204419059, "flos": 17931167988480.0, "grad_norm": 1.7877359152840422, "language_loss": 0.73775029, "learning_rate": 2.8240444174502747e-06, "loss": 0.75942057, "num_input_tokens_seen": 130362320, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 6069, "time_per_iteration": 2.5422208309173584 }, { "auxiliary_loss_clip": 0.0113796, "auxiliary_loss_mlp": 0.01037095, "balance_loss_clip": 1.02196777, "balance_loss_mlp": 1.04158044, "epoch": 0.3649481436945739, "flos": 22638266697600.0, "grad_norm": 1.8212372859179995, "language_loss": 0.66258109, "learning_rate": 2.8237001409412055e-06, "loss": 0.68433166, "num_input_tokens_seen": 130383165, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 6070, "time_per_iteration": 2.6352486610412598 }, { "auxiliary_loss_clip": 0.01126928, "auxiliary_loss_mlp": 0.01034408, "balance_loss_clip": 1.02047229, "balance_loss_mlp": 1.04228187, "epoch": 0.36500826694724187, "flos": 21579656042880.0, "grad_norm": 1.5771066330266827, "language_loss": 0.74012196, "learning_rate": 2.8233558350363974e-06, "loss": 0.76173532, "num_input_tokens_seen": 130402425, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75390625, "step": 6071, "time_per_iteration": 2.6061806678771973 }, { "auxiliary_loss_clip": 0.01128304, "auxiliary_loss_mlp": 0.01031814, "balance_loss_clip": 1.01734829, "balance_loss_mlp": 1.04276729, "epoch": 0.36506839019990983, "flos": 13772533023360.0, "grad_norm": 1.8540360911156675, "language_loss": 0.8826443, "learning_rate": 2.823011499748137e-06, "loss": 0.9042455, "num_input_tokens_seen": 130419440, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 6072, "time_per_iteration": 2.5518808364868164 }, { "auxiliary_loss_clip": 0.01128709, "auxiliary_loss_mlp": 0.01035766, "balance_loss_clip": 1.02190781, "balance_loss_mlp": 1.04298949, "epoch": 0.3651285134525778, "flos": 17274972378240.0, "grad_norm": 2.3869738208168148, "language_loss": 0.72222412, "learning_rate": 2.8226671350887136e-06, "loss": 0.74386889, "num_input_tokens_seen": 130438495, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76953125, "step": 6073, "time_per_iteration": 2.7025809288024902 }, { "auxiliary_loss_clip": 0.01165396, "auxiliary_loss_mlp": 0.01038244, "balance_loss_clip": 1.02305675, "balance_loss_mlp": 1.04210854, "epoch": 0.36518863670524576, "flos": 21907987286400.0, "grad_norm": 2.322840595422035, "language_loss": 0.673141, "learning_rate": 2.8223227410704163e-06, "loss": 0.69517744, "num_input_tokens_seen": 130455575, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78515625, "step": 6074, "time_per_iteration": 2.6221301555633545 }, { "auxiliary_loss_clip": 0.01143714, "auxiliary_loss_mlp": 0.01035501, "balance_loss_clip": 1.02126145, "balance_loss_mlp": 1.04071784, "epoch": 0.3652487599579137, "flos": 27122180250240.0, "grad_norm": 1.460008032180422, "language_loss": 0.72533435, "learning_rate": 2.8219783177055355e-06, "loss": 0.74712646, "num_input_tokens_seen": 130476385, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76171875, "step": 6075, "time_per_iteration": 2.6348204612731934 }, { "auxiliary_loss_clip": 0.01152014, "auxiliary_loss_mlp": 0.01043517, "balance_loss_clip": 1.02792478, "balance_loss_mlp": 1.04427028, "epoch": 0.3653088832105817, "flos": 19755573626880.0, "grad_norm": 2.122998869761784, "language_loss": 0.89853394, "learning_rate": 2.821633865006363e-06, "loss": 0.92048931, "num_input_tokens_seen": 130493630, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8125, "step": 6076, "time_per_iteration": 2.5828592777252197 }, { "auxiliary_loss_clip": 0.01127601, "auxiliary_loss_mlp": 0.01039693, "balance_loss_clip": 1.02497053, "balance_loss_mlp": 1.04264247, "epoch": 0.36536900646324966, "flos": 13115008609920.0, "grad_norm": 1.9455333726212714, "language_loss": 0.69899505, "learning_rate": 2.8212893829851914e-06, "loss": 0.72066808, "num_input_tokens_seen": 130510735, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.76171875, "step": 6077, "time_per_iteration": 2.577286958694458 }, { "auxiliary_loss_clip": 0.01042787, "auxiliary_loss_mlp": 0.01004211, "balance_loss_clip": 1.00220871, "balance_loss_mlp": 1.0171535, "epoch": 0.3654291297159176, "flos": 71100472383360.0, "grad_norm": 0.7441006876614369, "language_loss": 0.61812061, "learning_rate": 2.8209448716543145e-06, "loss": 0.63859063, "num_input_tokens_seen": 130577050, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.2578125, "step": 6078, "time_per_iteration": 3.2345054149627686 }, { "auxiliary_loss_clip": 0.01136063, "auxiliary_loss_mlp": 0.01039818, "balance_loss_clip": 1.02569127, "balance_loss_mlp": 1.04137707, "epoch": 0.3654892529685856, "flos": 23617478338560.0, "grad_norm": 2.4813754203310925, "language_loss": 0.78247547, "learning_rate": 2.8206003310260265e-06, "loss": 0.80423427, "num_input_tokens_seen": 130593780, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76953125, "step": 6079, "time_per_iteration": 2.6368911266326904 }, { "auxiliary_loss_clip": 0.01130666, "auxiliary_loss_mlp": 0.01037642, "balance_loss_clip": 1.02235973, "balance_loss_mlp": 1.04338956, "epoch": 0.36554937622125355, "flos": 43470799850880.0, "grad_norm": 1.987165025896167, "language_loss": 0.62823546, "learning_rate": 2.820255761112624e-06, "loss": 0.64991844, "num_input_tokens_seen": 130615510, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78125, "step": 6080, "time_per_iteration": 4.059189796447754 }, { "auxiliary_loss_clip": 0.01150189, "auxiliary_loss_mlp": 0.01038004, "balance_loss_clip": 1.02243495, "balance_loss_mlp": 1.04448283, "epoch": 0.3656094994739215, "flos": 23294641875840.0, "grad_norm": 3.1231689577720716, "language_loss": 0.67630696, "learning_rate": 2.819911161926403e-06, "loss": 0.6981889, "num_input_tokens_seen": 130635410, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.7890625, "step": 6081, "time_per_iteration": 2.5898513793945312 }, { "auxiliary_loss_clip": 0.01142672, "auxiliary_loss_mlp": 0.01288225, "balance_loss_clip": 1.02592897, "balance_loss_mlp": 1.04263866, "epoch": 0.3656696227265895, "flos": 24571984400640.0, "grad_norm": 1.8088842117216697, "language_loss": 0.75190032, "learning_rate": 2.8195665334796617e-06, "loss": 0.77620935, "num_input_tokens_seen": 130657725, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 6082, "time_per_iteration": 2.665844202041626 }, { "auxiliary_loss_clip": 0.01169066, "auxiliary_loss_mlp": 0.01276796, "balance_loss_clip": 1.01583767, "balance_loss_mlp": 1.04549408, "epoch": 0.3657297459792575, "flos": 27928375056000.0, "grad_norm": 1.846993801742794, "language_loss": 0.83017236, "learning_rate": 2.8192218757846993e-06, "loss": 0.85463095, "num_input_tokens_seen": 130678360, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.79296875, "step": 6083, "time_per_iteration": 2.6599526405334473 }, { "auxiliary_loss_clip": 0.01062514, "auxiliary_loss_mlp": 0.01003008, "balance_loss_clip": 1.00110042, "balance_loss_mlp": 1.01946664, "epoch": 0.36578986923192547, "flos": 67392622126080.0, "grad_norm": 0.8130390901585586, "language_loss": 0.59267449, "learning_rate": 2.8188771888538148e-06, "loss": 0.61332977, "num_input_tokens_seen": 130742110, "router_z_loss_clip": 0.01904297, "router_z_loss_mlp": 0.25195312, "step": 6084, "time_per_iteration": 3.3197500705718994 }, { "auxiliary_loss_clip": 0.01148798, "auxiliary_loss_mlp": 0.01037943, "balance_loss_clip": 1.02312577, "balance_loss_mlp": 1.04421079, "epoch": 0.36584999248459343, "flos": 20227511445120.0, "grad_norm": 1.671635550673729, "language_loss": 0.72356975, "learning_rate": 2.8185324726993102e-06, "loss": 0.74543715, "num_input_tokens_seen": 130759870, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 6085, "time_per_iteration": 3.9671995639801025 }, { "auxiliary_loss_clip": 0.01151403, "auxiliary_loss_mlp": 0.01036002, "balance_loss_clip": 1.0223887, "balance_loss_mlp": 1.04688573, "epoch": 0.3659101157372614, "flos": 19062461813760.0, "grad_norm": 1.7009812156349562, "language_loss": 0.78400826, "learning_rate": 2.8181877273334875e-06, "loss": 0.80588222, "num_input_tokens_seen": 130778510, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.77734375, "step": 6086, "time_per_iteration": 4.077924728393555 }, { "auxiliary_loss_clip": 0.01126676, "auxiliary_loss_mlp": 0.01031658, "balance_loss_clip": 1.01736474, "balance_loss_mlp": 1.04330468, "epoch": 0.36597023898992936, "flos": 30810708990720.0, "grad_norm": 2.2986696383780845, "language_loss": 0.76463246, "learning_rate": 2.8178429527686484e-06, "loss": 0.78621578, "num_input_tokens_seen": 130798535, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7421875, "step": 6087, "time_per_iteration": 2.6748666763305664 }, { "auxiliary_loss_clip": 0.01160008, "auxiliary_loss_mlp": 0.0103389, "balance_loss_clip": 1.01796913, "balance_loss_mlp": 1.04436398, "epoch": 0.36603036224259733, "flos": 20521799573760.0, "grad_norm": 2.4214042318575397, "language_loss": 0.69994932, "learning_rate": 2.817498149017099e-06, "loss": 0.7218883, "num_input_tokens_seen": 130816655, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.796875, "step": 6088, "time_per_iteration": 2.615241765975952 }, { "auxiliary_loss_clip": 0.0113921, "auxiliary_loss_mlp": 0.0103047, "balance_loss_clip": 1.01426995, "balance_loss_mlp": 1.04617822, "epoch": 0.3660904854952653, "flos": 38329397798400.0, "grad_norm": 1.50138847312179, "language_loss": 0.79723221, "learning_rate": 2.8171533160911432e-06, "loss": 0.81892896, "num_input_tokens_seen": 130841225, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.83984375, "step": 6089, "time_per_iteration": 2.806868076324463 }, { "auxiliary_loss_clip": 0.01148694, "auxiliary_loss_mlp": 0.01033137, "balance_loss_clip": 1.0189507, "balance_loss_mlp": 1.04558074, "epoch": 0.36615060874793326, "flos": 21835555511040.0, "grad_norm": 1.8392155983137692, "language_loss": 0.71261507, "learning_rate": 2.8168084540030873e-06, "loss": 0.73443329, "num_input_tokens_seen": 130861050, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 6090, "time_per_iteration": 4.156564712524414 }, { "auxiliary_loss_clip": 0.01139708, "auxiliary_loss_mlp": 0.01040872, "balance_loss_clip": 1.02733016, "balance_loss_mlp": 1.04790807, "epoch": 0.3662107320006012, "flos": 16581537342720.0, "grad_norm": 1.859698413215784, "language_loss": 0.74266934, "learning_rate": 2.8164635627652394e-06, "loss": 0.76447511, "num_input_tokens_seen": 130879775, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 6091, "time_per_iteration": 2.5690319538116455 }, { "auxiliary_loss_clip": 0.01169684, "auxiliary_loss_mlp": 0.01035841, "balance_loss_clip": 1.02058792, "balance_loss_mlp": 1.04717088, "epoch": 0.3662708552532692, "flos": 20958365473920.0, "grad_norm": 1.813948745454936, "language_loss": 0.72821701, "learning_rate": 2.8161186423899067e-06, "loss": 0.75027221, "num_input_tokens_seen": 130898070, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.77734375, "step": 6092, "time_per_iteration": 2.6575026512145996 }, { "auxiliary_loss_clip": 0.01133713, "auxiliary_loss_mlp": 0.01044113, "balance_loss_clip": 1.02854466, "balance_loss_mlp": 1.04558313, "epoch": 0.36633097850593715, "flos": 21902707987200.0, "grad_norm": 2.410780034328422, "language_loss": 0.77850229, "learning_rate": 2.8157736928893995e-06, "loss": 0.80028051, "num_input_tokens_seen": 130915250, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.79296875, "step": 6093, "time_per_iteration": 2.561143636703491 }, { "auxiliary_loss_clip": 0.01131066, "auxiliary_loss_mlp": 0.01032905, "balance_loss_clip": 1.01830196, "balance_loss_mlp": 1.04376471, "epoch": 0.3663911017586051, "flos": 32854133808000.0, "grad_norm": 1.7959591537647814, "language_loss": 0.74167955, "learning_rate": 2.815428714276027e-06, "loss": 0.76331925, "num_input_tokens_seen": 130936995, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 6094, "time_per_iteration": 2.7276031970977783 }, { "auxiliary_loss_clip": 0.0112659, "auxiliary_loss_mlp": 0.01049009, "balance_loss_clip": 1.03396487, "balance_loss_mlp": 1.04688132, "epoch": 0.3664512250112731, "flos": 27271748482560.0, "grad_norm": 1.8252672241366348, "language_loss": 0.79245442, "learning_rate": 2.8150837065621016e-06, "loss": 0.81421041, "num_input_tokens_seen": 130957970, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 6095, "time_per_iteration": 2.6501524448394775 }, { "auxiliary_loss_clip": 0.01146083, "auxiliary_loss_mlp": 0.01282779, "balance_loss_clip": 1.01904535, "balance_loss_mlp": 1.04653943, "epoch": 0.3665113482639411, "flos": 17784436930560.0, "grad_norm": 2.370763497216547, "language_loss": 0.73790407, "learning_rate": 2.8147386697599346e-06, "loss": 0.76219261, "num_input_tokens_seen": 130974915, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8125, "step": 6096, "time_per_iteration": 2.6636345386505127 }, { "auxiliary_loss_clip": 0.01151538, "auxiliary_loss_mlp": 0.01031946, "balance_loss_clip": 1.01745009, "balance_loss_mlp": 1.04509294, "epoch": 0.36657147151660907, "flos": 27854614477440.0, "grad_norm": 1.9953266059014727, "language_loss": 0.66325605, "learning_rate": 2.8143936038818412e-06, "loss": 0.6850909, "num_input_tokens_seen": 130995745, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80078125, "step": 6097, "time_per_iteration": 2.792349100112915 }, { "auxiliary_loss_clip": 0.01172888, "auxiliary_loss_mlp": 0.01034591, "balance_loss_clip": 1.01968944, "balance_loss_mlp": 1.0485276, "epoch": 0.36663159476927704, "flos": 25374013228800.0, "grad_norm": 1.7085000294354193, "language_loss": 0.77603436, "learning_rate": 2.8140485089401344e-06, "loss": 0.79810917, "num_input_tokens_seen": 131015545, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 6098, "time_per_iteration": 2.6255505084991455 }, { "auxiliary_loss_clip": 0.01133164, "auxiliary_loss_mlp": 0.01037894, "balance_loss_clip": 1.02263486, "balance_loss_mlp": 1.0475831, "epoch": 0.366691718021945, "flos": 21357225072000.0, "grad_norm": 1.8121141592101528, "language_loss": 0.73752761, "learning_rate": 2.8137033849471305e-06, "loss": 0.75923824, "num_input_tokens_seen": 131033990, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.76953125, "step": 6099, "time_per_iteration": 2.556360960006714 }, { "auxiliary_loss_clip": 0.01139544, "auxiliary_loss_mlp": 0.01045528, "balance_loss_clip": 1.03095484, "balance_loss_mlp": 1.04582143, "epoch": 0.36675184127461297, "flos": 16800376953600.0, "grad_norm": 1.7468980337344948, "language_loss": 0.84557533, "learning_rate": 2.8133582319151456e-06, "loss": 0.8674261, "num_input_tokens_seen": 131050710, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7578125, "step": 6100, "time_per_iteration": 2.5025877952575684 }, { "auxiliary_loss_clip": 0.01154772, "auxiliary_loss_mlp": 0.01034812, "balance_loss_clip": 1.01973248, "balance_loss_mlp": 1.04575288, "epoch": 0.36681196452728093, "flos": 21906514828800.0, "grad_norm": 1.816393805464467, "language_loss": 0.70528871, "learning_rate": 2.8130130498564975e-06, "loss": 0.72718453, "num_input_tokens_seen": 131071435, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 6101, "time_per_iteration": 2.650362968444824 }, { "auxiliary_loss_clip": 0.01146338, "auxiliary_loss_mlp": 0.01045246, "balance_loss_clip": 1.0289259, "balance_loss_mlp": 1.04780245, "epoch": 0.3668720877799489, "flos": 17712436118400.0, "grad_norm": 2.23848949726863, "language_loss": 0.7555685, "learning_rate": 2.8126678387835057e-06, "loss": 0.77748442, "num_input_tokens_seen": 131088775, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8046875, "step": 6102, "time_per_iteration": 2.554495096206665 }, { "auxiliary_loss_clip": 0.01140017, "auxiliary_loss_mlp": 0.01037678, "balance_loss_clip": 1.02109563, "balance_loss_mlp": 1.04833376, "epoch": 0.36693221103261686, "flos": 47045455499520.0, "grad_norm": 1.4903785894936443, "language_loss": 0.6991424, "learning_rate": 2.812322598708489e-06, "loss": 0.72091937, "num_input_tokens_seen": 131112800, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.828125, "step": 6103, "time_per_iteration": 2.8323259353637695 }, { "auxiliary_loss_clip": 0.01162004, "auxiliary_loss_mlp": 0.01036803, "balance_loss_clip": 1.02190208, "balance_loss_mlp": 1.04693305, "epoch": 0.3669923342852848, "flos": 15960929132160.0, "grad_norm": 1.8306114747703848, "language_loss": 0.71698254, "learning_rate": 2.811977329643768e-06, "loss": 0.73897064, "num_input_tokens_seen": 131131150, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 6104, "time_per_iteration": 2.556342601776123 }, { "auxiliary_loss_clip": 0.01135222, "auxiliary_loss_mlp": 0.01033438, "balance_loss_clip": 1.01785731, "balance_loss_mlp": 1.04659891, "epoch": 0.3670524575379528, "flos": 19974485064960.0, "grad_norm": 1.9589267734104667, "language_loss": 0.8186363, "learning_rate": 2.8116320316016646e-06, "loss": 0.84032285, "num_input_tokens_seen": 131150365, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.796875, "step": 6105, "time_per_iteration": 2.640929698944092 }, { "auxiliary_loss_clip": 0.01131223, "auxiliary_loss_mlp": 0.01038399, "balance_loss_clip": 1.02238882, "balance_loss_mlp": 1.04813623, "epoch": 0.36711258079062076, "flos": 25702955003520.0, "grad_norm": 1.9962122664924573, "language_loss": 0.80936503, "learning_rate": 2.8112867045945016e-06, "loss": 0.8310613, "num_input_tokens_seen": 131169310, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83203125, "step": 6106, "time_per_iteration": 2.6406474113464355 }, { "auxiliary_loss_clip": 0.01073729, "auxiliary_loss_mlp": 0.01252596, "balance_loss_clip": 1.00408649, "balance_loss_mlp": 1.02167869, "epoch": 0.3671727040432887, "flos": 60772743342720.0, "grad_norm": 0.6898025000622491, "language_loss": 0.59214705, "learning_rate": 2.8109413486346044e-06, "loss": 0.61541033, "num_input_tokens_seen": 131232900, "router_z_loss_clip": 0.02026367, "router_z_loss_mlp": 0.24804688, "step": 6107, "time_per_iteration": 3.2373452186584473 }, { "auxiliary_loss_clip": 0.01140922, "auxiliary_loss_mlp": 0.01035744, "balance_loss_clip": 1.02081871, "balance_loss_mlp": 1.04646909, "epoch": 0.3672328272959567, "flos": 18661303745280.0, "grad_norm": 1.7884721339465157, "language_loss": 0.7463131, "learning_rate": 2.810595963734295e-06, "loss": 0.76807976, "num_input_tokens_seen": 131250920, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7734375, "step": 6108, "time_per_iteration": 2.5830295085906982 }, { "auxiliary_loss_clip": 0.01135957, "auxiliary_loss_mlp": 0.01036574, "balance_loss_clip": 1.02187002, "balance_loss_mlp": 1.04562926, "epoch": 0.3672929505486247, "flos": 15049049535360.0, "grad_norm": 2.0910488050722136, "language_loss": 0.73454618, "learning_rate": 2.810250549905901e-06, "loss": 0.75627154, "num_input_tokens_seen": 131267910, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 6109, "time_per_iteration": 2.609670877456665 }, { "auxiliary_loss_clip": 0.01168779, "auxiliary_loss_mlp": 0.01041784, "balance_loss_clip": 1.02702558, "balance_loss_mlp": 1.0445025, "epoch": 0.3673530738012927, "flos": 20589347099520.0, "grad_norm": 1.733986844874767, "language_loss": 0.52586877, "learning_rate": 2.80990510716175e-06, "loss": 0.54797441, "num_input_tokens_seen": 131287150, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7890625, "step": 6110, "time_per_iteration": 2.59749698638916 }, { "auxiliary_loss_clip": 0.01138078, "auxiliary_loss_mlp": 0.01044353, "balance_loss_clip": 1.02864099, "balance_loss_mlp": 1.05109239, "epoch": 0.36741319705396064, "flos": 21689830033920.0, "grad_norm": 1.5444648378928512, "language_loss": 0.8093009, "learning_rate": 2.8095596355141676e-06, "loss": 0.83112526, "num_input_tokens_seen": 131308225, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.78125, "step": 6111, "time_per_iteration": 2.590794324874878 }, { "auxiliary_loss_clip": 0.01160272, "auxiliary_loss_mlp": 0.01051161, "balance_loss_clip": 1.036093, "balance_loss_mlp": 1.04764438, "epoch": 0.3674733203066286, "flos": 29862200499840.0, "grad_norm": 1.4567914371564776, "language_loss": 0.72279072, "learning_rate": 2.809214134975485e-06, "loss": 0.74490511, "num_input_tokens_seen": 131332115, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7734375, "step": 6112, "time_per_iteration": 2.6731457710266113 }, { "auxiliary_loss_clip": 0.01133108, "auxiliary_loss_mlp": 0.01049813, "balance_loss_clip": 1.03592491, "balance_loss_mlp": 1.04577339, "epoch": 0.36753344355929657, "flos": 18257021193600.0, "grad_norm": 1.5900421192443497, "language_loss": 0.85140556, "learning_rate": 2.8088686055580315e-06, "loss": 0.87323475, "num_input_tokens_seen": 131351885, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78515625, "step": 6113, "time_per_iteration": 2.5519979000091553 }, { "auxiliary_loss_clip": 0.01138521, "auxiliary_loss_mlp": 0.01046236, "balance_loss_clip": 1.02957082, "balance_loss_mlp": 1.04745293, "epoch": 0.36759356681196453, "flos": 25301150490240.0, "grad_norm": 2.092218085919537, "language_loss": 0.78590739, "learning_rate": 2.8085230472741377e-06, "loss": 0.80775499, "num_input_tokens_seen": 131370245, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8203125, "step": 6114, "time_per_iteration": 2.6218650341033936 }, { "auxiliary_loss_clip": 0.01139806, "auxiliary_loss_mlp": 0.01053625, "balance_loss_clip": 1.03554142, "balance_loss_mlp": 1.04777598, "epoch": 0.3676536900646325, "flos": 21032952065280.0, "grad_norm": 1.7890672120823914, "language_loss": 0.66911507, "learning_rate": 2.808177460136137e-06, "loss": 0.69104934, "num_input_tokens_seen": 131388115, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.828125, "step": 6115, "time_per_iteration": 2.5440335273742676 }, { "auxiliary_loss_clip": 0.01138143, "auxiliary_loss_mlp": 0.0103681, "balance_loss_clip": 1.02169466, "balance_loss_mlp": 1.04452729, "epoch": 0.36771381331730046, "flos": 16288506190080.0, "grad_norm": 2.5108196896567545, "language_loss": 0.76531816, "learning_rate": 2.807831844156361e-06, "loss": 0.78706777, "num_input_tokens_seen": 131404595, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7578125, "step": 6116, "time_per_iteration": 2.595574378967285 }, { "auxiliary_loss_clip": 0.01159446, "auxiliary_loss_mlp": 0.01045446, "balance_loss_clip": 1.03073621, "balance_loss_mlp": 1.04410315, "epoch": 0.36777393656996843, "flos": 22309971367680.0, "grad_norm": 1.7628018540991526, "language_loss": 0.63133967, "learning_rate": 2.8074861993471444e-06, "loss": 0.65338856, "num_input_tokens_seen": 131423760, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80078125, "step": 6117, "time_per_iteration": 2.5999491214752197 }, { "auxiliary_loss_clip": 0.01143159, "auxiliary_loss_mlp": 0.01040825, "balance_loss_clip": 1.02557778, "balance_loss_mlp": 1.04702699, "epoch": 0.3678340598226364, "flos": 26834069260800.0, "grad_norm": 1.93697434714919, "language_loss": 0.72733974, "learning_rate": 2.807140525720822e-06, "loss": 0.7491796, "num_input_tokens_seen": 131444955, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78125, "step": 6118, "time_per_iteration": 2.726116895675659 }, { "auxiliary_loss_clip": 0.0114121, "auxiliary_loss_mlp": 0.01045634, "balance_loss_clip": 1.02893305, "balance_loss_mlp": 1.04853392, "epoch": 0.36789418307530436, "flos": 21761723105280.0, "grad_norm": 1.6616655349465812, "language_loss": 0.7268995, "learning_rate": 2.8067948232897314e-06, "loss": 0.74876797, "num_input_tokens_seen": 131465720, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.83984375, "step": 6119, "time_per_iteration": 2.58845591545105 }, { "auxiliary_loss_clip": 0.01144822, "auxiliary_loss_mlp": 0.01040931, "balance_loss_clip": 1.02539778, "balance_loss_mlp": 1.04804564, "epoch": 0.3679543063279723, "flos": 15924192497280.0, "grad_norm": 1.802893705031386, "language_loss": 0.80261326, "learning_rate": 2.806449092066209e-06, "loss": 0.82447076, "num_input_tokens_seen": 131483080, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.7890625, "step": 6120, "time_per_iteration": 2.696018695831299 }, { "auxiliary_loss_clip": 0.01152631, "auxiliary_loss_mlp": 0.01043961, "balance_loss_clip": 1.02839231, "balance_loss_mlp": 1.04668641, "epoch": 0.3680144295806403, "flos": 24275541456000.0, "grad_norm": 1.8250301650383953, "language_loss": 0.645491, "learning_rate": 2.8061033320625923e-06, "loss": 0.66745698, "num_input_tokens_seen": 131502545, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.7890625, "step": 6121, "time_per_iteration": 4.01778769493103 }, { "auxiliary_loss_clip": 0.0113806, "auxiliary_loss_mlp": 0.01043352, "balance_loss_clip": 1.02745545, "balance_loss_mlp": 1.04845643, "epoch": 0.36807455283330826, "flos": 26104148985600.0, "grad_norm": 1.973062527206988, "language_loss": 0.71397483, "learning_rate": 2.8057575432912215e-06, "loss": 0.73578894, "num_input_tokens_seen": 131522155, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.80859375, "step": 6122, "time_per_iteration": 2.606692314147949 }, { "auxiliary_loss_clip": 0.01142278, "auxiliary_loss_mlp": 0.01037565, "balance_loss_clip": 1.02213383, "balance_loss_mlp": 1.04910207, "epoch": 0.3681346760859763, "flos": 24644990793600.0, "grad_norm": 1.896900785474294, "language_loss": 0.69359708, "learning_rate": 2.805411725764436e-06, "loss": 0.71539545, "num_input_tokens_seen": 131543865, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.75, "step": 6123, "time_per_iteration": 2.668187379837036 }, { "auxiliary_loss_clip": 0.01143155, "auxiliary_loss_mlp": 0.01039596, "balance_loss_clip": 1.0226686, "balance_loss_mlp": 1.04931116, "epoch": 0.36819479933864424, "flos": 23878369797120.0, "grad_norm": 2.297788795644305, "language_loss": 0.73636949, "learning_rate": 2.805065879494579e-06, "loss": 0.75819701, "num_input_tokens_seen": 131562155, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.84765625, "step": 6124, "time_per_iteration": 2.5898826122283936 }, { "auxiliary_loss_clip": 0.01147039, "auxiliary_loss_mlp": 0.01039292, "balance_loss_clip": 1.02150607, "balance_loss_mlp": 1.04837132, "epoch": 0.3682549225913122, "flos": 25553997302400.0, "grad_norm": 2.316252620239076, "language_loss": 0.74682128, "learning_rate": 2.804720004493991e-06, "loss": 0.76868463, "num_input_tokens_seen": 131581695, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.8046875, "step": 6125, "time_per_iteration": 2.6601011753082275 }, { "auxiliary_loss_clip": 0.01139967, "auxiliary_loss_mlp": 0.01046888, "balance_loss_clip": 1.02943552, "balance_loss_mlp": 1.04848123, "epoch": 0.36831504584398017, "flos": 16946605221120.0, "grad_norm": 1.995690838275656, "language_loss": 0.78239572, "learning_rate": 2.804374100775016e-06, "loss": 0.80426425, "num_input_tokens_seen": 131599465, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.828125, "step": 6126, "time_per_iteration": 2.5980546474456787 }, { "auxiliary_loss_clip": 0.01139978, "auxiliary_loss_mlp": 0.01044622, "balance_loss_clip": 1.0266335, "balance_loss_mlp": 1.0480814, "epoch": 0.36837516909664814, "flos": 19865065259520.0, "grad_norm": 2.17227585769894, "language_loss": 0.65837473, "learning_rate": 2.8040281683499985e-06, "loss": 0.68022072, "num_input_tokens_seen": 131618330, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.83203125, "step": 6127, "time_per_iteration": 5.49338436126709 }, { "auxiliary_loss_clip": 0.01148382, "auxiliary_loss_mlp": 0.01041495, "balance_loss_clip": 1.02503181, "balance_loss_mlp": 1.04894781, "epoch": 0.3684352923493161, "flos": 37626984362880.0, "grad_norm": 1.8740733958630258, "language_loss": 0.70420349, "learning_rate": 2.8036822072312835e-06, "loss": 0.72610229, "num_input_tokens_seen": 131638960, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8203125, "step": 6128, "time_per_iteration": 2.7749738693237305 }, { "auxiliary_loss_clip": 0.01146694, "auxiliary_loss_mlp": 0.01040467, "balance_loss_clip": 1.0240283, "balance_loss_mlp": 1.04890549, "epoch": 0.36849541560198407, "flos": 14465501182080.0, "grad_norm": 2.000872826535079, "language_loss": 0.75407529, "learning_rate": 2.803336217431218e-06, "loss": 0.77594686, "num_input_tokens_seen": 131657440, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.80078125, "step": 6129, "time_per_iteration": 2.583519697189331 }, { "auxiliary_loss_clip": 0.0113565, "auxiliary_loss_mlp": 0.01042193, "balance_loss_clip": 1.02586102, "balance_loss_mlp": 1.04685116, "epoch": 0.36855553885465203, "flos": 25770753924480.0, "grad_norm": 1.616617181175182, "language_loss": 0.84514391, "learning_rate": 2.80299019896215e-06, "loss": 0.86692238, "num_input_tokens_seen": 131678035, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.796875, "step": 6130, "time_per_iteration": 2.7065000534057617 }, { "auxiliary_loss_clip": 0.01064739, "auxiliary_loss_mlp": 0.01025944, "balance_loss_clip": 1.02396512, "balance_loss_mlp": 1.02145851, "epoch": 0.36861566210732, "flos": 65049417377280.0, "grad_norm": 0.8166776312111522, "language_loss": 0.60304165, "learning_rate": 2.8026441518364262e-06, "loss": 0.62394857, "num_input_tokens_seen": 131742470, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.25390625, "step": 6131, "time_per_iteration": 5.429697751998901 }, { "auxiliary_loss_clip": 0.01152604, "auxiliary_loss_mlp": 0.01033016, "balance_loss_clip": 1.01706564, "balance_loss_mlp": 1.04712248, "epoch": 0.36867578535998796, "flos": 30954495133440.0, "grad_norm": 1.6919703454058188, "language_loss": 0.73025644, "learning_rate": 2.8022980760663977e-06, "loss": 0.75211263, "num_input_tokens_seen": 131764570, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.78515625, "step": 6132, "time_per_iteration": 2.6934115886688232 }, { "auxiliary_loss_clip": 0.01141296, "auxiliary_loss_mlp": 0.01038366, "balance_loss_clip": 1.02135515, "balance_loss_mlp": 1.04802299, "epoch": 0.3687359086126559, "flos": 28837956182400.0, "grad_norm": 1.7595152672872552, "language_loss": 0.73860306, "learning_rate": 2.8019519716644147e-06, "loss": 0.76039964, "num_input_tokens_seen": 131785720, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.84375, "step": 6133, "time_per_iteration": 2.676039457321167 }, { "auxiliary_loss_clip": 0.01168579, "auxiliary_loss_mlp": 0.01039753, "balance_loss_clip": 1.02414823, "balance_loss_mlp": 1.04768634, "epoch": 0.3687960318653239, "flos": 21396798881280.0, "grad_norm": 1.826082867547427, "language_loss": 0.71406806, "learning_rate": 2.801605838642829e-06, "loss": 0.7361514, "num_input_tokens_seen": 131804430, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.7578125, "step": 6134, "time_per_iteration": 2.6787545680999756 }, { "auxiliary_loss_clip": 0.01151342, "auxiliary_loss_mlp": 0.01033589, "balance_loss_clip": 1.01717401, "balance_loss_mlp": 1.0455848, "epoch": 0.36885615511799186, "flos": 20266043760000.0, "grad_norm": 1.6441911080631906, "language_loss": 0.75306308, "learning_rate": 2.8012596770139933e-06, "loss": 0.77491248, "num_input_tokens_seen": 131822060, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.7890625, "step": 6135, "time_per_iteration": 2.5979983806610107 }, { "auxiliary_loss_clip": 0.01071333, "auxiliary_loss_mlp": 0.01003638, "balance_loss_clip": 1.00161123, "balance_loss_mlp": 1.01906872, "epoch": 0.3689162783706599, "flos": 63088836301440.0, "grad_norm": 0.8094038222913316, "language_loss": 0.58770692, "learning_rate": 2.80091348679026e-06, "loss": 0.60845661, "num_input_tokens_seen": 131880715, "router_z_loss_clip": 0.02026367, "router_z_loss_mlp": 0.25585938, "step": 6136, "time_per_iteration": 3.164961576461792 }, { "auxiliary_loss_clip": 0.01144859, "auxiliary_loss_mlp": 0.01036737, "balance_loss_clip": 1.02057803, "balance_loss_mlp": 1.04828644, "epoch": 0.36897640162332784, "flos": 10961984419200.0, "grad_norm": 2.1647689915637964, "language_loss": 0.78783607, "learning_rate": 2.800567267983985e-06, "loss": 0.80965209, "num_input_tokens_seen": 131895850, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.78515625, "step": 6137, "time_per_iteration": 2.549027919769287 }, { "auxiliary_loss_clip": 0.01137342, "auxiliary_loss_mlp": 0.01042336, "balance_loss_clip": 1.02593255, "balance_loss_mlp": 1.04853582, "epoch": 0.3690365248759958, "flos": 20704297599360.0, "grad_norm": 1.859777676571835, "language_loss": 0.7394098, "learning_rate": 2.8002210206075233e-06, "loss": 0.76120663, "num_input_tokens_seen": 131915775, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.80078125, "step": 6138, "time_per_iteration": 2.6884100437164307 }, { "auxiliary_loss_clip": 0.01168619, "auxiliary_loss_mlp": 0.01041305, "balance_loss_clip": 1.02519953, "balance_loss_mlp": 1.04826772, "epoch": 0.3690966481286638, "flos": 31826369957760.0, "grad_norm": 1.8540075586908351, "language_loss": 0.64972103, "learning_rate": 2.7998747446732315e-06, "loss": 0.67182028, "num_input_tokens_seen": 131935715, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84375, "step": 6139, "time_per_iteration": 2.756687879562378 }, { "auxiliary_loss_clip": 0.01150303, "auxiliary_loss_mlp": 0.01042412, "balance_loss_clip": 1.02615166, "balance_loss_mlp": 1.04652834, "epoch": 0.36915677138133174, "flos": 13114936782720.0, "grad_norm": 1.9875672974009788, "language_loss": 0.71392512, "learning_rate": 2.7995284401934677e-06, "loss": 0.73585224, "num_input_tokens_seen": 131954120, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.7734375, "step": 6140, "time_per_iteration": 2.6667556762695312 }, { "auxiliary_loss_clip": 0.01069169, "auxiliary_loss_mlp": 0.01001706, "balance_loss_clip": 0.99973929, "balance_loss_mlp": 1.01727235, "epoch": 0.3692168946339997, "flos": 68686879956480.0, "grad_norm": 0.7441322503793929, "language_loss": 0.59316015, "learning_rate": 2.7991821071805906e-06, "loss": 0.61386889, "num_input_tokens_seen": 132017485, "router_z_loss_clip": 0.01965332, "router_z_loss_mlp": 0.25585938, "step": 6141, "time_per_iteration": 3.267411231994629 }, { "auxiliary_loss_clip": 0.01134825, "auxiliary_loss_mlp": 0.01040414, "balance_loss_clip": 1.02412963, "balance_loss_mlp": 1.0456053, "epoch": 0.36927701788666767, "flos": 22017873968640.0, "grad_norm": 1.775305626144706, "language_loss": 0.75013781, "learning_rate": 2.7988357456469605e-06, "loss": 0.77189028, "num_input_tokens_seen": 132036760, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8046875, "step": 6142, "time_per_iteration": 2.60174822807312 }, { "auxiliary_loss_clip": 0.0113179, "auxiliary_loss_mlp": 0.010331, "balance_loss_clip": 1.01843774, "balance_loss_mlp": 1.04513884, "epoch": 0.36933714113933563, "flos": 21835591424640.0, "grad_norm": 1.75450225298686, "language_loss": 0.7706303, "learning_rate": 2.7984893556049365e-06, "loss": 0.79227924, "num_input_tokens_seen": 132056935, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.77734375, "step": 6143, "time_per_iteration": 2.7192931175231934 }, { "auxiliary_loss_clip": 0.011314, "auxiliary_loss_mlp": 0.01033249, "balance_loss_clip": 1.01877761, "balance_loss_mlp": 1.04511118, "epoch": 0.3693972643920036, "flos": 23691705793920.0, "grad_norm": 1.4643664943195227, "language_loss": 0.81957114, "learning_rate": 2.7981429370668815e-06, "loss": 0.84121764, "num_input_tokens_seen": 132077285, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 6144, "time_per_iteration": 2.5496251583099365 }, { "auxiliary_loss_clip": 0.01134505, "auxiliary_loss_mlp": 0.01044453, "balance_loss_clip": 1.02910495, "balance_loss_mlp": 1.04564083, "epoch": 0.36945738764467156, "flos": 22856747172480.0, "grad_norm": 2.5176191796649836, "language_loss": 0.76856816, "learning_rate": 2.797796490045158e-06, "loss": 0.79035771, "num_input_tokens_seen": 132095520, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.80078125, "step": 6145, "time_per_iteration": 2.6224007606506348 }, { "auxiliary_loss_clip": 0.01136243, "auxiliary_loss_mlp": 0.01030047, "balance_loss_clip": 1.01533079, "balance_loss_mlp": 1.04680085, "epoch": 0.36951751089733953, "flos": 16615939593600.0, "grad_norm": 2.0623324654622825, "language_loss": 0.77240568, "learning_rate": 2.7974500145521304e-06, "loss": 0.79406857, "num_input_tokens_seen": 132112810, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8046875, "step": 6146, "time_per_iteration": 2.514068841934204 }, { "auxiliary_loss_clip": 0.01145709, "auxiliary_loss_mlp": 0.01042076, "balance_loss_clip": 1.02562499, "balance_loss_mlp": 1.04637003, "epoch": 0.3695776341500075, "flos": 18914545607040.0, "grad_norm": 3.785387482375476, "language_loss": 0.8064847, "learning_rate": 2.7971035106001636e-06, "loss": 0.82836258, "num_input_tokens_seen": 132131615, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8125, "step": 6147, "time_per_iteration": 2.6347572803497314 }, { "auxiliary_loss_clip": 0.01150953, "auxiliary_loss_mlp": 0.01287577, "balance_loss_clip": 1.02584839, "balance_loss_mlp": 1.04350567, "epoch": 0.36963775740267546, "flos": 20808474019200.0, "grad_norm": 1.70735968875912, "language_loss": 0.83038169, "learning_rate": 2.796756978201622e-06, "loss": 0.85476696, "num_input_tokens_seen": 132149585, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80859375, "step": 6148, "time_per_iteration": 2.5772087574005127 }, { "auxiliary_loss_clip": 0.01130659, "auxiliary_loss_mlp": 0.01038876, "balance_loss_clip": 1.02378416, "balance_loss_mlp": 1.04499054, "epoch": 0.3696978806553435, "flos": 26061881656320.0, "grad_norm": 1.9985199565347256, "language_loss": 0.74469328, "learning_rate": 2.7964104173688735e-06, "loss": 0.7663886, "num_input_tokens_seen": 132165555, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.765625, "step": 6149, "time_per_iteration": 2.63565731048584 }, { "auxiliary_loss_clip": 0.01128841, "auxiliary_loss_mlp": 0.01041722, "balance_loss_clip": 1.02423453, "balance_loss_mlp": 1.04705572, "epoch": 0.36975800390801145, "flos": 26833925606400.0, "grad_norm": 5.233337093783357, "language_loss": 0.70848733, "learning_rate": 2.796063828114286e-06, "loss": 0.73019296, "num_input_tokens_seen": 132185100, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.81640625, "step": 6150, "time_per_iteration": 2.627173662185669 }, { "auxiliary_loss_clip": 0.01142698, "auxiliary_loss_mlp": 0.01041341, "balance_loss_clip": 1.02547431, "balance_loss_mlp": 1.04618168, "epoch": 0.3698181271606794, "flos": 21142623265920.0, "grad_norm": 1.6207900677312765, "language_loss": 0.81746173, "learning_rate": 2.795717210450228e-06, "loss": 0.83930218, "num_input_tokens_seen": 132203930, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.7890625, "step": 6151, "time_per_iteration": 2.6673171520233154 }, { "auxiliary_loss_clip": 0.01056823, "auxiliary_loss_mlp": 0.01005244, "balance_loss_clip": 1.00328922, "balance_loss_mlp": 1.01314354, "epoch": 0.3698782504133474, "flos": 66742639568640.0, "grad_norm": 0.7841119563693775, "language_loss": 0.63091242, "learning_rate": 2.7953705643890705e-06, "loss": 0.65153313, "num_input_tokens_seen": 132263845, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.2578125, "step": 6152, "time_per_iteration": 3.3290038108825684 }, { "auxiliary_loss_clip": 0.01158036, "auxiliary_loss_mlp": 0.0103687, "balance_loss_clip": 1.02195644, "balance_loss_mlp": 1.04556584, "epoch": 0.36993837366601534, "flos": 24311523905280.0, "grad_norm": 1.9599193479149246, "language_loss": 0.70171523, "learning_rate": 2.7950238899431827e-06, "loss": 0.72366428, "num_input_tokens_seen": 132282350, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.765625, "step": 6153, "time_per_iteration": 2.6805782318115234 }, { "auxiliary_loss_clip": 0.01136075, "auxiliary_loss_mlp": 0.01038569, "balance_loss_clip": 1.02235639, "balance_loss_mlp": 1.0456121, "epoch": 0.3699984969186833, "flos": 24349194293760.0, "grad_norm": 1.5829982182952873, "language_loss": 0.72232389, "learning_rate": 2.7946771871249374e-06, "loss": 0.74407029, "num_input_tokens_seen": 132301930, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8203125, "step": 6154, "time_per_iteration": 2.703831672668457 }, { "auxiliary_loss_clip": 0.01143311, "auxiliary_loss_mlp": 0.0103056, "balance_loss_clip": 1.01608205, "balance_loss_mlp": 1.04608774, "epoch": 0.37005862017135127, "flos": 19829154637440.0, "grad_norm": 1.7434420330967615, "language_loss": 0.68101656, "learning_rate": 2.794330455946707e-06, "loss": 0.70275527, "num_input_tokens_seen": 132320915, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.79296875, "step": 6155, "time_per_iteration": 2.6588082313537598 }, { "auxiliary_loss_clip": 0.011321, "auxiliary_loss_mlp": 0.01032793, "balance_loss_clip": 1.01753402, "balance_loss_mlp": 1.04431558, "epoch": 0.37011874342401924, "flos": 19573793873280.0, "grad_norm": 1.785372169275544, "language_loss": 0.67577255, "learning_rate": 2.7939836964208665e-06, "loss": 0.69742149, "num_input_tokens_seen": 132340415, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7890625, "step": 6156, "time_per_iteration": 2.613462448120117 }, { "auxiliary_loss_clip": 0.01140989, "auxiliary_loss_mlp": 0.01036327, "balance_loss_clip": 1.02208138, "balance_loss_mlp": 1.0455358, "epoch": 0.3701788666766872, "flos": 20374350243840.0, "grad_norm": 1.75237258919158, "language_loss": 0.82149076, "learning_rate": 2.7936369085597895e-06, "loss": 0.84326398, "num_input_tokens_seen": 132358600, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76953125, "step": 6157, "time_per_iteration": 2.5891308784484863 }, { "auxiliary_loss_clip": 0.0116307, "auxiliary_loss_mlp": 0.01039876, "balance_loss_clip": 1.02346063, "balance_loss_mlp": 1.04566002, "epoch": 0.37023898992935517, "flos": 15340931452800.0, "grad_norm": 2.265171294932713, "language_loss": 0.76675063, "learning_rate": 2.793290092375853e-06, "loss": 0.78878009, "num_input_tokens_seen": 132373160, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8203125, "step": 6158, "time_per_iteration": 2.5904877185821533 }, { "auxiliary_loss_clip": 0.01126327, "auxiliary_loss_mlp": 0.01038732, "balance_loss_clip": 1.02272177, "balance_loss_mlp": 1.04485404, "epoch": 0.37029911318202313, "flos": 19573937527680.0, "grad_norm": 2.283833623480583, "language_loss": 0.68813062, "learning_rate": 2.7929432478814346e-06, "loss": 0.70978123, "num_input_tokens_seen": 132392345, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8125, "step": 6159, "time_per_iteration": 2.597818613052368 }, { "auxiliary_loss_clip": 0.01138812, "auxiliary_loss_mlp": 0.01040969, "balance_loss_clip": 1.02740335, "balance_loss_mlp": 1.04357028, "epoch": 0.3703592364346911, "flos": 26213353309440.0, "grad_norm": 1.7782940165803798, "language_loss": 0.70845675, "learning_rate": 2.7925963750889108e-06, "loss": 0.73025453, "num_input_tokens_seen": 132412620, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7734375, "step": 6160, "time_per_iteration": 2.6835219860076904 }, { "auxiliary_loss_clip": 0.01135823, "auxiliary_loss_mlp": 0.01035427, "balance_loss_clip": 1.02213502, "balance_loss_mlp": 1.04326582, "epoch": 0.37041935968735906, "flos": 20048317470720.0, "grad_norm": 1.5927316766666533, "language_loss": 0.79113162, "learning_rate": 2.792249474010661e-06, "loss": 0.81284404, "num_input_tokens_seen": 132431570, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 6161, "time_per_iteration": 2.696340322494507 }, { "auxiliary_loss_clip": 0.0112449, "auxiliary_loss_mlp": 0.01038817, "balance_loss_clip": 1.02315855, "balance_loss_mlp": 1.0450635, "epoch": 0.3704794829400271, "flos": 24133802388480.0, "grad_norm": 1.859334088027227, "language_loss": 0.79222792, "learning_rate": 2.791902544659065e-06, "loss": 0.81386089, "num_input_tokens_seen": 132451525, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.79296875, "step": 6162, "time_per_iteration": 2.687537431716919 }, { "auxiliary_loss_clip": 0.01140415, "auxiliary_loss_mlp": 0.0128541, "balance_loss_clip": 1.02364302, "balance_loss_mlp": 1.04427862, "epoch": 0.37053960619269505, "flos": 14866874732160.0, "grad_norm": 1.9754633915320938, "language_loss": 0.79529774, "learning_rate": 2.7915555870465047e-06, "loss": 0.81955594, "num_input_tokens_seen": 132469875, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78515625, "step": 6163, "time_per_iteration": 3.9481759071350098 }, { "auxiliary_loss_clip": 0.0113316, "auxiliary_loss_mlp": 0.01040222, "balance_loss_clip": 1.02467132, "balance_loss_mlp": 1.04433084, "epoch": 0.370599729445363, "flos": 21361498790400.0, "grad_norm": 1.667207280619308, "language_loss": 0.67842597, "learning_rate": 2.791208601185362e-06, "loss": 0.70015979, "num_input_tokens_seen": 132488360, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.80078125, "step": 6164, "time_per_iteration": 2.587026834487915 }, { "auxiliary_loss_clip": 0.01145271, "auxiliary_loss_mlp": 0.01039714, "balance_loss_clip": 1.0233469, "balance_loss_mlp": 1.04733765, "epoch": 0.370659852698031, "flos": 26829041356800.0, "grad_norm": 2.044418271357112, "language_loss": 0.83479917, "learning_rate": 2.7908615870880185e-06, "loss": 0.85664904, "num_input_tokens_seen": 132508630, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.80078125, "step": 6165, "time_per_iteration": 2.665289878845215 }, { "auxiliary_loss_clip": 0.01148181, "auxiliary_loss_mlp": 0.01041918, "balance_loss_clip": 1.02386987, "balance_loss_mlp": 1.04716969, "epoch": 0.37071997595069894, "flos": 19099018880640.0, "grad_norm": 2.0686884791054307, "language_loss": 0.6939851, "learning_rate": 2.7905145447668605e-06, "loss": 0.71588612, "num_input_tokens_seen": 132527465, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.828125, "step": 6166, "time_per_iteration": 2.5525143146514893 }, { "auxiliary_loss_clip": 0.01067592, "auxiliary_loss_mlp": 0.01002852, "balance_loss_clip": 1.00114775, "balance_loss_mlp": 1.01592791, "epoch": 0.3707800992033669, "flos": 52178384920320.0, "grad_norm": 0.7932923508981365, "language_loss": 0.56919122, "learning_rate": 2.790167474234271e-06, "loss": 0.58989567, "num_input_tokens_seen": 132579940, "router_z_loss_clip": 0.01708984, "router_z_loss_mlp": 0.25390625, "step": 6167, "time_per_iteration": 3.112931251525879 }, { "auxiliary_loss_clip": 0.01138063, "auxiliary_loss_mlp": 0.01038358, "balance_loss_clip": 1.02466106, "balance_loss_mlp": 1.04396856, "epoch": 0.3708402224560349, "flos": 19901837808000.0, "grad_norm": 1.855833464336349, "language_loss": 0.75172246, "learning_rate": 2.7898203755026377e-06, "loss": 0.77348667, "num_input_tokens_seen": 132598390, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76171875, "step": 6168, "time_per_iteration": 4.1331658363342285 }, { "auxiliary_loss_clip": 0.01130357, "auxiliary_loss_mlp": 0.01285868, "balance_loss_clip": 1.02423668, "balance_loss_mlp": 1.04313588, "epoch": 0.37090034570870284, "flos": 20007630339840.0, "grad_norm": 1.6353979808075356, "language_loss": 0.73601937, "learning_rate": 2.7894732485843465e-06, "loss": 0.76018161, "num_input_tokens_seen": 132616920, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78125, "step": 6169, "time_per_iteration": 4.0501344203948975 }, { "auxiliary_loss_clip": 0.0112006, "auxiliary_loss_mlp": 0.01038562, "balance_loss_clip": 1.02424467, "balance_loss_mlp": 1.04410839, "epoch": 0.3709604689613708, "flos": 24134700228480.0, "grad_norm": 1.6447583570215487, "language_loss": 0.79210567, "learning_rate": 2.7891260934917854e-06, "loss": 0.81369185, "num_input_tokens_seen": 132637660, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7578125, "step": 6170, "time_per_iteration": 2.6129345893859863 }, { "auxiliary_loss_clip": 0.0113375, "auxiliary_loss_mlp": 0.01042338, "balance_loss_clip": 1.02612591, "balance_loss_mlp": 1.04518354, "epoch": 0.37102059221403877, "flos": 23876071326720.0, "grad_norm": 1.7647202472099126, "language_loss": 0.76370901, "learning_rate": 2.7887789102373444e-06, "loss": 0.78546989, "num_input_tokens_seen": 132657635, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.796875, "step": 6171, "time_per_iteration": 2.6496598720550537 }, { "auxiliary_loss_clip": 0.01141973, "auxiliary_loss_mlp": 0.01033984, "balance_loss_clip": 1.0185107, "balance_loss_mlp": 1.04626799, "epoch": 0.37108071546670673, "flos": 14501268149760.0, "grad_norm": 1.9290810387077968, "language_loss": 0.80065572, "learning_rate": 2.7884316988334125e-06, "loss": 0.82241529, "num_input_tokens_seen": 132674455, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.77734375, "step": 6172, "time_per_iteration": 4.093313455581665 }, { "auxiliary_loss_clip": 0.01132721, "auxiliary_loss_mlp": 0.01041831, "balance_loss_clip": 1.02517796, "balance_loss_mlp": 1.04389548, "epoch": 0.3711408387193747, "flos": 34562619279360.0, "grad_norm": 2.0093056607738564, "language_loss": 0.59241211, "learning_rate": 2.7880844592923815e-06, "loss": 0.61415762, "num_input_tokens_seen": 132695140, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.80078125, "step": 6173, "time_per_iteration": 2.725369691848755 }, { "auxiliary_loss_clip": 0.01139465, "auxiliary_loss_mlp": 0.01035604, "balance_loss_clip": 1.02043414, "balance_loss_mlp": 1.04352582, "epoch": 0.37120096197204266, "flos": 17310703432320.0, "grad_norm": 1.8412808544883719, "language_loss": 0.79819667, "learning_rate": 2.787737191626644e-06, "loss": 0.81994724, "num_input_tokens_seen": 132712470, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78515625, "step": 6174, "time_per_iteration": 2.491762161254883 }, { "auxiliary_loss_clip": 0.01119004, "auxiliary_loss_mlp": 0.01036245, "balance_loss_clip": 1.02232718, "balance_loss_mlp": 1.04377782, "epoch": 0.37126108522471063, "flos": 30664049760000.0, "grad_norm": 2.0118090924989382, "language_loss": 0.79742301, "learning_rate": 2.787389895848591e-06, "loss": 0.81897551, "num_input_tokens_seen": 132732945, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75390625, "step": 6175, "time_per_iteration": 2.688150405883789 }, { "auxiliary_loss_clip": 0.01131461, "auxiliary_loss_mlp": 0.01043387, "balance_loss_clip": 1.02873039, "balance_loss_mlp": 1.04554367, "epoch": 0.37132120847737865, "flos": 25155640494720.0, "grad_norm": 2.7369457014006393, "language_loss": 0.88438427, "learning_rate": 2.78704257197062e-06, "loss": 0.90613276, "num_input_tokens_seen": 132752470, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.76953125, "step": 6176, "time_per_iteration": 2.570006847381592 }, { "auxiliary_loss_clip": 0.01134358, "auxiliary_loss_mlp": 0.01038392, "balance_loss_clip": 1.02392054, "balance_loss_mlp": 1.0467577, "epoch": 0.3713813317300466, "flos": 21213474842880.0, "grad_norm": 1.5033102265942477, "language_loss": 0.73327756, "learning_rate": 2.7866952200051224e-06, "loss": 0.755005, "num_input_tokens_seen": 132771485, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 6177, "time_per_iteration": 2.635984420776367 }, { "auxiliary_loss_clip": 0.0114748, "auxiliary_loss_mlp": 0.01041213, "balance_loss_clip": 1.02624035, "balance_loss_mlp": 1.04415715, "epoch": 0.3714414549827146, "flos": 21616644072960.0, "grad_norm": 2.006988395184643, "language_loss": 0.7505005, "learning_rate": 2.7863478399644973e-06, "loss": 0.77238739, "num_input_tokens_seen": 132791465, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.76953125, "step": 6178, "time_per_iteration": 2.567317485809326 }, { "auxiliary_loss_clip": 0.01168983, "auxiliary_loss_mlp": 0.01036811, "balance_loss_clip": 1.02220786, "balance_loss_mlp": 1.04732478, "epoch": 0.37150157823538255, "flos": 19972294335360.0, "grad_norm": 1.7383745094857141, "language_loss": 0.71821046, "learning_rate": 2.786000431861139e-06, "loss": 0.74026847, "num_input_tokens_seen": 132810160, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 6179, "time_per_iteration": 2.605436325073242 }, { "auxiliary_loss_clip": 0.01123567, "auxiliary_loss_mlp": 0.01036701, "balance_loss_clip": 1.02125108, "balance_loss_mlp": 1.04367769, "epoch": 0.3715617014880505, "flos": 24860562266880.0, "grad_norm": 1.7247693892888767, "language_loss": 0.69902837, "learning_rate": 2.7856529957074484e-06, "loss": 0.720631, "num_input_tokens_seen": 132831265, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.796875, "step": 6180, "time_per_iteration": 2.547395944595337 }, { "auxiliary_loss_clip": 0.01137333, "auxiliary_loss_mlp": 0.0103555, "balance_loss_clip": 1.02048206, "balance_loss_mlp": 1.04328167, "epoch": 0.3716218247407185, "flos": 20449080489600.0, "grad_norm": 1.6144881004109808, "language_loss": 0.77710837, "learning_rate": 2.7853055315158233e-06, "loss": 0.79883718, "num_input_tokens_seen": 132850005, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.76171875, "step": 6181, "time_per_iteration": 2.6259772777557373 }, { "auxiliary_loss_clip": 0.01128812, "auxiliary_loss_mlp": 0.01033947, "balance_loss_clip": 1.01881313, "balance_loss_mlp": 1.04375172, "epoch": 0.37168194799338644, "flos": 24133479166080.0, "grad_norm": 1.8123767238887378, "language_loss": 0.78221011, "learning_rate": 2.7849580392986633e-06, "loss": 0.80383766, "num_input_tokens_seen": 132865790, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.76171875, "step": 6182, "time_per_iteration": 2.681098699569702 }, { "auxiliary_loss_clip": 0.01059679, "auxiliary_loss_mlp": 0.01000831, "balance_loss_clip": 0.99919742, "balance_loss_mlp": 1.01680779, "epoch": 0.3717420712460544, "flos": 67408926900480.0, "grad_norm": 0.7814220101186674, "language_loss": 0.57515174, "learning_rate": 2.7846105190683705e-06, "loss": 0.59575683, "num_input_tokens_seen": 132921775, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.25390625, "step": 6183, "time_per_iteration": 3.2263219356536865 }, { "auxiliary_loss_clip": 0.01135648, "auxiliary_loss_mlp": 0.01288793, "balance_loss_clip": 1.02495801, "balance_loss_mlp": 1.04423881, "epoch": 0.37180219449872237, "flos": 22376908362240.0, "grad_norm": 1.6560049403487314, "language_loss": 0.77011263, "learning_rate": 2.7842629708373466e-06, "loss": 0.79435706, "num_input_tokens_seen": 132941060, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.828125, "step": 6184, "time_per_iteration": 2.6819584369659424 }, { "auxiliary_loss_clip": 0.01145909, "auxiliary_loss_mlp": 0.01037058, "balance_loss_clip": 1.02243721, "balance_loss_mlp": 1.04361475, "epoch": 0.37186231775139034, "flos": 21869885934720.0, "grad_norm": 1.8433667834425174, "language_loss": 0.72176456, "learning_rate": 2.7839153946179943e-06, "loss": 0.74359417, "num_input_tokens_seen": 132961850, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75390625, "step": 6185, "time_per_iteration": 2.6219708919525146 }, { "auxiliary_loss_clip": 0.01155353, "auxiliary_loss_mlp": 0.01029053, "balance_loss_clip": 1.01429498, "balance_loss_mlp": 1.04412317, "epoch": 0.3719224410040583, "flos": 22415225195520.0, "grad_norm": 1.6208058452923342, "language_loss": 0.76877797, "learning_rate": 2.783567790422718e-06, "loss": 0.79062206, "num_input_tokens_seen": 132981625, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7578125, "step": 6186, "time_per_iteration": 2.6066596508026123 }, { "auxiliary_loss_clip": 0.0113632, "auxiliary_loss_mlp": 0.01038889, "balance_loss_clip": 1.02216434, "balance_loss_mlp": 1.04531193, "epoch": 0.37198256425672627, "flos": 25151223121920.0, "grad_norm": 1.647306074852359, "language_loss": 0.83185887, "learning_rate": 2.7832201582639227e-06, "loss": 0.85361099, "num_input_tokens_seen": 133001225, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8203125, "step": 6187, "time_per_iteration": 2.6254260540008545 }, { "auxiliary_loss_clip": 0.01130083, "auxiliary_loss_mlp": 0.0104016, "balance_loss_clip": 1.0249908, "balance_loss_mlp": 1.04340136, "epoch": 0.37204268750939423, "flos": 21138313633920.0, "grad_norm": 1.668214908468038, "language_loss": 0.849491, "learning_rate": 2.782872498154015e-06, "loss": 0.87119341, "num_input_tokens_seen": 133018820, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.77734375, "step": 6188, "time_per_iteration": 2.583993911743164 }, { "auxiliary_loss_clip": 0.01141563, "auxiliary_loss_mlp": 0.01033943, "balance_loss_clip": 1.01838601, "balance_loss_mlp": 1.04554105, "epoch": 0.37210281076206225, "flos": 21506829217920.0, "grad_norm": 1.7305143489419323, "language_loss": 0.6503824, "learning_rate": 2.782524810105401e-06, "loss": 0.67213744, "num_input_tokens_seen": 133040205, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.77734375, "step": 6189, "time_per_iteration": 2.572695732116699 }, { "auxiliary_loss_clip": 0.01142579, "auxiliary_loss_mlp": 0.01036233, "balance_loss_clip": 1.02009189, "balance_loss_mlp": 1.04655242, "epoch": 0.3721629340147302, "flos": 17347835116800.0, "grad_norm": 1.851339448817938, "language_loss": 0.84177643, "learning_rate": 2.78217709413049e-06, "loss": 0.86356455, "num_input_tokens_seen": 133058095, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.78125, "step": 6190, "time_per_iteration": 2.603109121322632 }, { "auxiliary_loss_clip": 0.01144293, "auxiliary_loss_mlp": 0.0103977, "balance_loss_clip": 1.02448726, "balance_loss_mlp": 1.04456484, "epoch": 0.3722230572673982, "flos": 16432400073600.0, "grad_norm": 3.741205292088572, "language_loss": 0.87966943, "learning_rate": 2.781829350241691e-06, "loss": 0.90151, "num_input_tokens_seen": 133071530, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.81640625, "step": 6191, "time_per_iteration": 2.542799949645996 }, { "auxiliary_loss_clip": 0.01160199, "auxiliary_loss_mlp": 0.01037116, "balance_loss_clip": 1.01961637, "balance_loss_mlp": 1.04247093, "epoch": 0.37228318052006615, "flos": 22674716023680.0, "grad_norm": 1.4873046730294854, "language_loss": 0.73576689, "learning_rate": 2.7814815784514125e-06, "loss": 0.75774002, "num_input_tokens_seen": 133091410, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.81640625, "step": 6192, "time_per_iteration": 2.6534080505371094 }, { "auxiliary_loss_clip": 0.01120763, "auxiliary_loss_mlp": 0.01033713, "balance_loss_clip": 1.01947951, "balance_loss_mlp": 1.04308152, "epoch": 0.3723433037727341, "flos": 25265491263360.0, "grad_norm": 2.8376608411235233, "language_loss": 0.79947078, "learning_rate": 2.7811337787720674e-06, "loss": 0.82101554, "num_input_tokens_seen": 133110365, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 6193, "time_per_iteration": 2.593183994293213 }, { "auxiliary_loss_clip": 0.01148771, "auxiliary_loss_mlp": 0.01034178, "balance_loss_clip": 1.01898468, "balance_loss_mlp": 1.04285944, "epoch": 0.3724034270254021, "flos": 10524664333440.0, "grad_norm": 1.7316655200007207, "language_loss": 0.84019119, "learning_rate": 2.7807859512160663e-06, "loss": 0.86202067, "num_input_tokens_seen": 133128255, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.796875, "step": 6194, "time_per_iteration": 2.6522903442382812 }, { "auxiliary_loss_clip": 0.01138132, "auxiliary_loss_mlp": 0.01036492, "balance_loss_clip": 1.02156711, "balance_loss_mlp": 1.04197836, "epoch": 0.37246355027807004, "flos": 20266223328000.0, "grad_norm": 2.2062200611165563, "language_loss": 0.77093601, "learning_rate": 2.7804380957958238e-06, "loss": 0.79268229, "num_input_tokens_seen": 133143975, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.78515625, "step": 6195, "time_per_iteration": 2.5616567134857178 }, { "auxiliary_loss_clip": 0.01139274, "auxiliary_loss_mlp": 0.01034228, "balance_loss_clip": 1.01939249, "balance_loss_mlp": 1.04451323, "epoch": 0.372523673530738, "flos": 19500571998720.0, "grad_norm": 1.7934561865884828, "language_loss": 0.79573798, "learning_rate": 2.780090212523753e-06, "loss": 0.81747299, "num_input_tokens_seen": 133162935, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.765625, "step": 6196, "time_per_iteration": 2.7005186080932617 }, { "auxiliary_loss_clip": 0.01132622, "auxiliary_loss_mlp": 0.01036315, "balance_loss_clip": 1.0217005, "balance_loss_mlp": 1.04589653, "epoch": 0.372583796783406, "flos": 16764250849920.0, "grad_norm": 2.1152440348270005, "language_loss": 0.83799648, "learning_rate": 2.779742301412269e-06, "loss": 0.85968584, "num_input_tokens_seen": 133181180, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.77734375, "step": 6197, "time_per_iteration": 2.7541494369506836 }, { "auxiliary_loss_clip": 0.01118053, "auxiliary_loss_mlp": 0.0103787, "balance_loss_clip": 1.0226171, "balance_loss_mlp": 1.04195285, "epoch": 0.37264392003607394, "flos": 22637979388800.0, "grad_norm": 2.047909525944646, "language_loss": 0.64506209, "learning_rate": 2.7793943624737884e-06, "loss": 0.66662133, "num_input_tokens_seen": 133199615, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.76171875, "step": 6198, "time_per_iteration": 2.693347692489624 }, { "auxiliary_loss_clip": 0.01125576, "auxiliary_loss_mlp": 0.01044359, "balance_loss_clip": 1.03051925, "balance_loss_mlp": 1.04108584, "epoch": 0.3727040432887419, "flos": 19973120348160.0, "grad_norm": 1.4478235470378096, "language_loss": 0.73743355, "learning_rate": 2.7790463957207275e-06, "loss": 0.75913292, "num_input_tokens_seen": 133219650, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 6199, "time_per_iteration": 2.616208076477051 }, { "auxiliary_loss_clip": 0.01137928, "auxiliary_loss_mlp": 0.01040522, "balance_loss_clip": 1.02644944, "balance_loss_mlp": 1.04321861, "epoch": 0.37276416654140987, "flos": 63899122279680.0, "grad_norm": 1.8016392415586833, "language_loss": 0.80716336, "learning_rate": 2.7786984011655045e-06, "loss": 0.82894778, "num_input_tokens_seen": 133245675, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76953125, "step": 6200, "time_per_iteration": 2.951310157775879 }, { "auxiliary_loss_clip": 0.01146317, "auxiliary_loss_mlp": 0.01039536, "balance_loss_clip": 1.0248611, "balance_loss_mlp": 1.04296708, "epoch": 0.37282428979407783, "flos": 39785970211200.0, "grad_norm": 2.031494517602155, "language_loss": 0.60019994, "learning_rate": 2.7783503788205383e-06, "loss": 0.62205839, "num_input_tokens_seen": 133266905, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 6201, "time_per_iteration": 2.920058250427246 }, { "auxiliary_loss_clip": 0.01133204, "auxiliary_loss_mlp": 0.01037771, "balance_loss_clip": 1.02232766, "balance_loss_mlp": 1.04520285, "epoch": 0.37288441304674586, "flos": 22709046447360.0, "grad_norm": 1.8535921424514135, "language_loss": 0.7258445, "learning_rate": 2.7780023286982502e-06, "loss": 0.74755424, "num_input_tokens_seen": 133286865, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.796875, "step": 6202, "time_per_iteration": 2.7215449810028076 }, { "auxiliary_loss_clip": 0.01137789, "auxiliary_loss_mlp": 0.01035291, "balance_loss_clip": 1.02079558, "balance_loss_mlp": 1.0432725, "epoch": 0.3729445362994138, "flos": 18770292587520.0, "grad_norm": 1.8363895327643418, "language_loss": 0.73807943, "learning_rate": 2.77765425081106e-06, "loss": 0.75981027, "num_input_tokens_seen": 133305295, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.76953125, "step": 6203, "time_per_iteration": 2.7226462364196777 }, { "auxiliary_loss_clip": 0.01133384, "auxiliary_loss_mlp": 0.01034035, "balance_loss_clip": 1.02117252, "balance_loss_mlp": 1.04142547, "epoch": 0.3730046595520818, "flos": 22456199635200.0, "grad_norm": 1.6275947913119972, "language_loss": 0.82193565, "learning_rate": 2.7773061451713893e-06, "loss": 0.84360987, "num_input_tokens_seen": 133324625, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.74609375, "step": 6204, "time_per_iteration": 3.9789371490478516 }, { "auxiliary_loss_clip": 0.01132007, "auxiliary_loss_mlp": 0.01042077, "balance_loss_clip": 1.02666318, "balance_loss_mlp": 1.04352474, "epoch": 0.37306478280474975, "flos": 24316372241280.0, "grad_norm": 1.7608394920540074, "language_loss": 0.75041705, "learning_rate": 2.776958011791662e-06, "loss": 0.77215791, "num_input_tokens_seen": 133344625, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.796875, "step": 6205, "time_per_iteration": 2.5667855739593506 }, { "auxiliary_loss_clip": 0.01130022, "auxiliary_loss_mlp": 0.01038342, "balance_loss_clip": 1.02286911, "balance_loss_mlp": 1.04408979, "epoch": 0.3731249060574177, "flos": 15815167741440.0, "grad_norm": 1.935295238752582, "language_loss": 0.78286189, "learning_rate": 2.776609850684302e-06, "loss": 0.80454552, "num_input_tokens_seen": 133363605, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.765625, "step": 6206, "time_per_iteration": 2.561289072036743 }, { "auxiliary_loss_clip": 0.01122942, "auxiliary_loss_mlp": 0.01033969, "balance_loss_clip": 1.01887107, "balance_loss_mlp": 1.04457891, "epoch": 0.3731850293100857, "flos": 19828077229440.0, "grad_norm": 1.909087922285246, "language_loss": 0.93470371, "learning_rate": 2.7762616618617346e-06, "loss": 0.95627284, "num_input_tokens_seen": 133379405, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.78125, "step": 6207, "time_per_iteration": 2.5079562664031982 }, { "auxiliary_loss_clip": 0.01149836, "auxiliary_loss_mlp": 0.01029527, "balance_loss_clip": 1.01566255, "balance_loss_mlp": 1.04328096, "epoch": 0.37324515256275365, "flos": 19062354072960.0, "grad_norm": 2.043869286056352, "language_loss": 0.82717729, "learning_rate": 2.7759134453363847e-06, "loss": 0.84897095, "num_input_tokens_seen": 133397585, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.79296875, "step": 6208, "time_per_iteration": 2.6088664531707764 }, { "auxiliary_loss_clip": 0.01131592, "auxiliary_loss_mlp": 0.01034706, "balance_loss_clip": 1.0187676, "balance_loss_mlp": 1.04399061, "epoch": 0.3733052758154216, "flos": 20704333512960.0, "grad_norm": 2.165826446885849, "language_loss": 0.72794855, "learning_rate": 2.7755652011206798e-06, "loss": 0.7496115, "num_input_tokens_seen": 133415365, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.78515625, "step": 6209, "time_per_iteration": 2.544647216796875 }, { "auxiliary_loss_clip": 0.01150305, "auxiliary_loss_mlp": 0.01038408, "balance_loss_clip": 1.02231443, "balance_loss_mlp": 1.04551864, "epoch": 0.3733653990680896, "flos": 20193504243840.0, "grad_norm": 2.1525013424589057, "language_loss": 0.69884479, "learning_rate": 2.7752169292270485e-06, "loss": 0.72073191, "num_input_tokens_seen": 133435700, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.77734375, "step": 6210, "time_per_iteration": 5.44855523109436 }, { "auxiliary_loss_clip": 0.01159517, "auxiliary_loss_mlp": 0.01030699, "balance_loss_clip": 1.015154, "balance_loss_mlp": 1.04301691, "epoch": 0.37342552232075754, "flos": 20339660684160.0, "grad_norm": 1.8589443959225083, "language_loss": 0.77696627, "learning_rate": 2.7748686296679184e-06, "loss": 0.79886842, "num_input_tokens_seen": 133455180, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.80078125, "step": 6211, "time_per_iteration": 2.7150115966796875 }, { "auxiliary_loss_clip": 0.01133099, "auxiliary_loss_mlp": 0.01034686, "balance_loss_clip": 1.01977909, "balance_loss_mlp": 1.04526806, "epoch": 0.3734856455734255, "flos": 35517879527040.0, "grad_norm": 1.448387381284619, "language_loss": 0.73046434, "learning_rate": 2.7745203024557207e-06, "loss": 0.75214219, "num_input_tokens_seen": 133476715, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7890625, "step": 6212, "time_per_iteration": 2.7031524181365967 }, { "auxiliary_loss_clip": 0.01141777, "auxiliary_loss_mlp": 0.01044645, "balance_loss_clip": 1.02875471, "balance_loss_mlp": 1.04803944, "epoch": 0.37354576882609347, "flos": 21142300043520.0, "grad_norm": 2.103972167151884, "language_loss": 0.81600678, "learning_rate": 2.7741719476028855e-06, "loss": 0.83787107, "num_input_tokens_seen": 133494550, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84765625, "step": 6213, "time_per_iteration": 2.638601541519165 }, { "auxiliary_loss_clip": 0.01151203, "auxiliary_loss_mlp": 0.01040548, "balance_loss_clip": 1.02516961, "balance_loss_mlp": 1.04577208, "epoch": 0.37360589207876144, "flos": 21506793304320.0, "grad_norm": 2.0598531191915517, "language_loss": 0.78237379, "learning_rate": 2.773823565121844e-06, "loss": 0.80429125, "num_input_tokens_seen": 133512640, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.78515625, "step": 6214, "time_per_iteration": 4.138205289840698 }, { "auxiliary_loss_clip": 0.011378, "auxiliary_loss_mlp": 0.01036044, "balance_loss_clip": 1.02174473, "balance_loss_mlp": 1.04332185, "epoch": 0.37366601533142946, "flos": 38435800861440.0, "grad_norm": 1.745479428103802, "language_loss": 0.84879398, "learning_rate": 2.7734751550250306e-06, "loss": 0.87053245, "num_input_tokens_seen": 133535540, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76953125, "step": 6215, "time_per_iteration": 2.721630334854126 }, { "auxiliary_loss_clip": 0.01151528, "auxiliary_loss_mlp": 0.01038766, "balance_loss_clip": 1.02304268, "balance_loss_mlp": 1.04428315, "epoch": 0.3737261385840974, "flos": 18441171244800.0, "grad_norm": 1.83790354689903, "language_loss": 0.6797694, "learning_rate": 2.773126717324879e-06, "loss": 0.70167238, "num_input_tokens_seen": 133555795, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.80859375, "step": 6216, "time_per_iteration": 2.6650876998901367 }, { "auxiliary_loss_clip": 0.01142111, "auxiliary_loss_mlp": 0.01038852, "balance_loss_clip": 1.02315211, "balance_loss_mlp": 1.04477811, "epoch": 0.3737862618367654, "flos": 22929861306240.0, "grad_norm": 2.224247305744204, "language_loss": 0.65563393, "learning_rate": 2.7727782520338227e-06, "loss": 0.6774435, "num_input_tokens_seen": 133575905, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.79296875, "step": 6217, "time_per_iteration": 2.562471628189087 }, { "auxiliary_loss_clip": 0.01142841, "auxiliary_loss_mlp": 0.01039175, "balance_loss_clip": 1.02344537, "balance_loss_mlp": 1.04562068, "epoch": 0.37384638508943335, "flos": 15409664127360.0, "grad_norm": 1.8806890984649085, "language_loss": 0.80672121, "learning_rate": 2.772429759164299e-06, "loss": 0.8285414, "num_input_tokens_seen": 133592585, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.79296875, "step": 6218, "time_per_iteration": 2.6592135429382324 }, { "auxiliary_loss_clip": 0.01127619, "auxiliary_loss_mlp": 0.01033678, "balance_loss_clip": 1.01958752, "balance_loss_mlp": 1.04464769, "epoch": 0.3739065083421013, "flos": 24280820755200.0, "grad_norm": 1.3293429779187496, "language_loss": 0.78780228, "learning_rate": 2.7720812387287444e-06, "loss": 0.80941522, "num_input_tokens_seen": 133615070, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 6219, "time_per_iteration": 2.5926077365875244 }, { "auxiliary_loss_clip": 0.01139005, "auxiliary_loss_mlp": 0.01038847, "balance_loss_clip": 1.02380335, "balance_loss_mlp": 1.04581666, "epoch": 0.3739666315947693, "flos": 23002831785600.0, "grad_norm": 1.7516178097565105, "language_loss": 0.77023649, "learning_rate": 2.771732690739596e-06, "loss": 0.79201508, "num_input_tokens_seen": 133633490, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7578125, "step": 6220, "time_per_iteration": 2.649794578552246 }, { "auxiliary_loss_clip": 0.01149334, "auxiliary_loss_mlp": 0.01040041, "balance_loss_clip": 1.02468705, "balance_loss_mlp": 1.04368806, "epoch": 0.37402675484743725, "flos": 19391116279680.0, "grad_norm": 1.5580369418850446, "language_loss": 0.82436007, "learning_rate": 2.771384115209293e-06, "loss": 0.84625375, "num_input_tokens_seen": 133653425, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.7890625, "step": 6221, "time_per_iteration": 2.591097593307495 }, { "auxiliary_loss_clip": 0.01139249, "auxiliary_loss_mlp": 0.01038592, "balance_loss_clip": 1.02408481, "balance_loss_mlp": 1.04446197, "epoch": 0.3740868781001052, "flos": 17126158331520.0, "grad_norm": 3.1222934392390846, "language_loss": 0.76315635, "learning_rate": 2.771035512150275e-06, "loss": 0.78493476, "num_input_tokens_seen": 133670220, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.76953125, "step": 6222, "time_per_iteration": 2.5922303199768066 }, { "auxiliary_loss_clip": 0.01150372, "auxiliary_loss_mlp": 0.01278618, "balance_loss_clip": 1.01654363, "balance_loss_mlp": 1.04653847, "epoch": 0.3741470013527732, "flos": 20043505048320.0, "grad_norm": 1.583620787099272, "language_loss": 0.70449811, "learning_rate": 2.770686881574983e-06, "loss": 0.72878802, "num_input_tokens_seen": 133688910, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7734375, "step": 6223, "time_per_iteration": 2.5861806869506836 }, { "auxiliary_loss_clip": 0.01152012, "auxiliary_loss_mlp": 0.01036529, "balance_loss_clip": 1.02226532, "balance_loss_mlp": 1.04793274, "epoch": 0.37420712460544114, "flos": 36897279569280.0, "grad_norm": 1.8779607845029196, "language_loss": 0.68887109, "learning_rate": 2.770338223495859e-06, "loss": 0.71075648, "num_input_tokens_seen": 133708690, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 6224, "time_per_iteration": 2.753830909729004 }, { "auxiliary_loss_clip": 0.01148566, "auxiliary_loss_mlp": 0.01033259, "balance_loss_clip": 1.0189364, "balance_loss_mlp": 1.04704642, "epoch": 0.3742672478581091, "flos": 22201198007040.0, "grad_norm": 1.6208173113442355, "language_loss": 0.6991896, "learning_rate": 2.7699895379253447e-06, "loss": 0.72100788, "num_input_tokens_seen": 133728095, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.74609375, "step": 6225, "time_per_iteration": 2.7707672119140625 }, { "auxiliary_loss_clip": 0.0114993, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.02160287, "balance_loss_mlp": 1.04675221, "epoch": 0.3743273711107771, "flos": 24681547860480.0, "grad_norm": 2.013614182755666, "language_loss": 0.79432464, "learning_rate": 2.7696408248758846e-06, "loss": 0.81619185, "num_input_tokens_seen": 133745590, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7578125, "step": 6226, "time_per_iteration": 2.618724822998047 }, { "auxiliary_loss_clip": 0.01141627, "auxiliary_loss_mlp": 0.01035547, "balance_loss_clip": 1.02011502, "balance_loss_mlp": 1.04578602, "epoch": 0.37438749436344504, "flos": 24459619680000.0, "grad_norm": 1.7990240611941963, "language_loss": 0.68078279, "learning_rate": 2.7692920843599238e-06, "loss": 0.70255446, "num_input_tokens_seen": 133766155, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.78125, "step": 6227, "time_per_iteration": 2.763576030731201 }, { "auxiliary_loss_clip": 0.01148531, "auxiliary_loss_mlp": 0.01036308, "balance_loss_clip": 1.02192497, "balance_loss_mlp": 1.04587626, "epoch": 0.374447617616113, "flos": 21798747048960.0, "grad_norm": 1.6697149232888033, "language_loss": 0.82958233, "learning_rate": 2.7689433163899073e-06, "loss": 0.85143077, "num_input_tokens_seen": 133783185, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7578125, "step": 6228, "time_per_iteration": 2.5859577655792236 }, { "auxiliary_loss_clip": 0.01148578, "auxiliary_loss_mlp": 0.01039949, "balance_loss_clip": 1.02466083, "balance_loss_mlp": 1.046628, "epoch": 0.374507740868781, "flos": 17968191932160.0, "grad_norm": 1.6529452942055645, "language_loss": 0.74696767, "learning_rate": 2.7685945209782816e-06, "loss": 0.76885301, "num_input_tokens_seen": 133800975, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.75, "step": 6229, "time_per_iteration": 2.647447347640991 }, { "auxiliary_loss_clip": 0.01150677, "auxiliary_loss_mlp": 0.0104029, "balance_loss_clip": 1.02445269, "balance_loss_mlp": 1.04387653, "epoch": 0.374567864121449, "flos": 16105828596480.0, "grad_norm": 1.9071226374126222, "language_loss": 0.83491158, "learning_rate": 2.7682456981374946e-06, "loss": 0.8568213, "num_input_tokens_seen": 133818020, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.796875, "step": 6230, "time_per_iteration": 2.5447139739990234 }, { "auxiliary_loss_clip": 0.01135995, "auxiliary_loss_mlp": 0.01041935, "balance_loss_clip": 1.0261879, "balance_loss_mlp": 1.0485487, "epoch": 0.37462798737411696, "flos": 25773160135680.0, "grad_norm": 1.7442679508554968, "language_loss": 0.73670483, "learning_rate": 2.7678968478799943e-06, "loss": 0.75848407, "num_input_tokens_seen": 133840690, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.78515625, "step": 6231, "time_per_iteration": 2.6356470584869385 }, { "auxiliary_loss_clip": 0.01134315, "auxiliary_loss_mlp": 0.01045258, "balance_loss_clip": 1.02959371, "balance_loss_mlp": 1.0457809, "epoch": 0.3746881106267849, "flos": 16654507822080.0, "grad_norm": 2.002930998131264, "language_loss": 0.73335803, "learning_rate": 2.767547970218231e-06, "loss": 0.75515378, "num_input_tokens_seen": 133858350, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.79296875, "step": 6232, "time_per_iteration": 2.5096275806427 }, { "auxiliary_loss_clip": 0.01142758, "auxiliary_loss_mlp": 0.01038027, "balance_loss_clip": 1.02263713, "balance_loss_mlp": 1.0459733, "epoch": 0.3747482338794529, "flos": 26177981391360.0, "grad_norm": 1.671666886266478, "language_loss": 0.77168524, "learning_rate": 2.767199065164655e-06, "loss": 0.79349315, "num_input_tokens_seen": 133879775, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.7890625, "step": 6233, "time_per_iteration": 2.665130376815796 }, { "auxiliary_loss_clip": 0.01143729, "auxiliary_loss_mlp": 0.01042639, "balance_loss_clip": 1.02810776, "balance_loss_mlp": 1.04676402, "epoch": 0.37480835713212085, "flos": 12021061950720.0, "grad_norm": 3.140995114437122, "language_loss": 0.69227725, "learning_rate": 2.7668501327317184e-06, "loss": 0.71414089, "num_input_tokens_seen": 133898295, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.79296875, "step": 6234, "time_per_iteration": 2.4962260723114014 }, { "auxiliary_loss_clip": 0.01133219, "auxiliary_loss_mlp": 0.01042741, "balance_loss_clip": 1.02840018, "balance_loss_mlp": 1.04726994, "epoch": 0.3748684803847888, "flos": 19679263182720.0, "grad_norm": 2.1492112248737008, "language_loss": 0.82981288, "learning_rate": 2.7665011729318727e-06, "loss": 0.85157245, "num_input_tokens_seen": 133915230, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7734375, "step": 6235, "time_per_iteration": 2.6132969856262207 }, { "auxiliary_loss_clip": 0.01129685, "auxiliary_loss_mlp": 0.01032645, "balance_loss_clip": 1.0179342, "balance_loss_mlp": 1.04901505, "epoch": 0.3749286036374568, "flos": 20521189042560.0, "grad_norm": 1.7839346415258548, "language_loss": 0.7801708, "learning_rate": 2.7661521857775715e-06, "loss": 0.80179417, "num_input_tokens_seen": 133934110, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80859375, "step": 6236, "time_per_iteration": 2.485774517059326 }, { "auxiliary_loss_clip": 0.01135711, "auxiliary_loss_mlp": 0.01044724, "balance_loss_clip": 1.02809405, "balance_loss_mlp": 1.04449368, "epoch": 0.37498872689012475, "flos": 20704620821760.0, "grad_norm": 2.391048606243492, "language_loss": 0.73119527, "learning_rate": 2.76580317128127e-06, "loss": 0.7529996, "num_input_tokens_seen": 133952395, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8203125, "step": 6237, "time_per_iteration": 2.6092681884765625 }, { "auxiliary_loss_clip": 0.01142186, "auxiliary_loss_mlp": 0.01286236, "balance_loss_clip": 1.02362585, "balance_loss_mlp": 1.04272354, "epoch": 0.3750488501427927, "flos": 21574843620480.0, "grad_norm": 2.0236973335326724, "language_loss": 0.93103349, "learning_rate": 2.765454129455423e-06, "loss": 0.95531768, "num_input_tokens_seen": 133969635, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8125, "step": 6238, "time_per_iteration": 2.549215078353882 }, { "auxiliary_loss_clip": 0.01126666, "auxiliary_loss_mlp": 0.01036025, "balance_loss_clip": 1.0204618, "balance_loss_mlp": 1.04665804, "epoch": 0.3751089733954607, "flos": 15923869274880.0, "grad_norm": 1.905379551244482, "language_loss": 0.71313405, "learning_rate": 2.765105060312487e-06, "loss": 0.734761, "num_input_tokens_seen": 133987215, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.80078125, "step": 6239, "time_per_iteration": 2.6942548751831055 }, { "auxiliary_loss_clip": 0.01182222, "auxiliary_loss_mlp": 0.01038471, "balance_loss_clip": 1.02349794, "balance_loss_mlp": 1.04811382, "epoch": 0.37516909664812864, "flos": 36284644177920.0, "grad_norm": 1.6581935804221979, "language_loss": 0.65454853, "learning_rate": 2.76475596386492e-06, "loss": 0.67675543, "num_input_tokens_seen": 134009250, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 6240, "time_per_iteration": 2.7764482498168945 }, { "auxiliary_loss_clip": 0.01152598, "auxiliary_loss_mlp": 0.01284737, "balance_loss_clip": 1.02356529, "balance_loss_mlp": 1.04546905, "epoch": 0.3752292199007966, "flos": 13515915283200.0, "grad_norm": 1.68293212930274, "language_loss": 0.75680017, "learning_rate": 2.764406840125179e-06, "loss": 0.78117359, "num_input_tokens_seen": 134026875, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.8046875, "step": 6241, "time_per_iteration": 2.6402502059936523 }, { "auxiliary_loss_clip": 0.01154026, "auxiliary_loss_mlp": 0.01040315, "balance_loss_clip": 1.02441859, "balance_loss_mlp": 1.04672551, "epoch": 0.3752893431534646, "flos": 27198095644800.0, "grad_norm": 1.9215340950172146, "language_loss": 0.83465606, "learning_rate": 2.7640576891057246e-06, "loss": 0.85659945, "num_input_tokens_seen": 134047185, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8046875, "step": 6242, "time_per_iteration": 2.641693592071533 }, { "auxiliary_loss_clip": 0.01142195, "auxiliary_loss_mlp": 0.01040191, "balance_loss_clip": 1.02611256, "balance_loss_mlp": 1.04444814, "epoch": 0.3753494664061326, "flos": 30007674581760.0, "grad_norm": 2.4808650266512915, "language_loss": 0.68212992, "learning_rate": 2.763708510819017e-06, "loss": 0.70395374, "num_input_tokens_seen": 134067330, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 6243, "time_per_iteration": 2.6615874767303467 }, { "auxiliary_loss_clip": 0.01177697, "auxiliary_loss_mlp": 0.01287378, "balance_loss_clip": 1.02421856, "balance_loss_mlp": 1.04582167, "epoch": 0.37540958965880056, "flos": 24461954064000.0, "grad_norm": 1.7507755876475941, "language_loss": 0.83796865, "learning_rate": 2.7633593052775174e-06, "loss": 0.86261934, "num_input_tokens_seen": 134085525, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.78515625, "step": 6244, "time_per_iteration": 2.712876081466675 }, { "auxiliary_loss_clip": 0.01156282, "auxiliary_loss_mlp": 0.01038869, "balance_loss_clip": 1.0239203, "balance_loss_mlp": 1.04442298, "epoch": 0.3754697129114685, "flos": 16508387295360.0, "grad_norm": 5.726876402166819, "language_loss": 0.82914531, "learning_rate": 2.763010072493687e-06, "loss": 0.85109675, "num_input_tokens_seen": 134101855, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.76171875, "step": 6245, "time_per_iteration": 4.076855897903442 }, { "auxiliary_loss_clip": 0.01141428, "auxiliary_loss_mlp": 0.01038752, "balance_loss_clip": 1.02361202, "balance_loss_mlp": 1.04373884, "epoch": 0.3755298361641365, "flos": 19390900798080.0, "grad_norm": 1.9708216562095735, "language_loss": 0.6359632, "learning_rate": 2.76266081247999e-06, "loss": 0.65776503, "num_input_tokens_seen": 134119360, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.796875, "step": 6246, "time_per_iteration": 2.5782723426818848 }, { "auxiliary_loss_clip": 0.01145063, "auxiliary_loss_mlp": 0.01039677, "balance_loss_clip": 1.0244, "balance_loss_mlp": 1.04478884, "epoch": 0.37558995941680445, "flos": 14720395069440.0, "grad_norm": 1.7249948072469252, "language_loss": 0.74767482, "learning_rate": 2.7623115252488905e-06, "loss": 0.76952219, "num_input_tokens_seen": 134137475, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 6247, "time_per_iteration": 2.539544105529785 }, { "auxiliary_loss_clip": 0.01132777, "auxiliary_loss_mlp": 0.0104248, "balance_loss_clip": 1.02710211, "balance_loss_mlp": 1.04376256, "epoch": 0.3756500826694724, "flos": 21689901861120.0, "grad_norm": 10.715932282488845, "language_loss": 0.55192363, "learning_rate": 2.7619622108128534e-06, "loss": 0.57367617, "num_input_tokens_seen": 134154580, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.80078125, "step": 6248, "time_per_iteration": 2.632949113845825 }, { "auxiliary_loss_clip": 0.01131965, "auxiliary_loss_mlp": 0.01041015, "balance_loss_clip": 1.02616131, "balance_loss_mlp": 1.04456055, "epoch": 0.3757102059221404, "flos": 26505666190080.0, "grad_norm": 1.8409372227991307, "language_loss": 0.84534049, "learning_rate": 2.7616128691843452e-06, "loss": 0.86707026, "num_input_tokens_seen": 134174285, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78515625, "step": 6249, "time_per_iteration": 2.6520073413848877 }, { "auxiliary_loss_clip": 0.01142907, "auxiliary_loss_mlp": 0.01033345, "balance_loss_clip": 1.01909375, "balance_loss_mlp": 1.04639387, "epoch": 0.37577032917480835, "flos": 37338083274240.0, "grad_norm": 1.5430973049443113, "language_loss": 0.67555743, "learning_rate": 2.761263500375832e-06, "loss": 0.69731998, "num_input_tokens_seen": 134195940, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 6250, "time_per_iteration": 2.742234468460083 }, { "auxiliary_loss_clip": 0.011415, "auxiliary_loss_mlp": 0.01042101, "balance_loss_clip": 1.0279566, "balance_loss_mlp": 1.04493213, "epoch": 0.3758304524274763, "flos": 21908597817600.0, "grad_norm": 3.631082558637562, "language_loss": 0.77564108, "learning_rate": 2.760914104399784e-06, "loss": 0.79747713, "num_input_tokens_seen": 134212235, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7890625, "step": 6251, "time_per_iteration": 2.5931472778320312 }, { "auxiliary_loss_clip": 0.01132933, "auxiliary_loss_mlp": 0.01036083, "balance_loss_clip": 1.02189744, "balance_loss_mlp": 1.04493964, "epoch": 0.3758905756801443, "flos": 36569343375360.0, "grad_norm": 2.4177014621537087, "language_loss": 0.58251166, "learning_rate": 2.7605646812686687e-06, "loss": 0.60420179, "num_input_tokens_seen": 134233810, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7890625, "step": 6252, "time_per_iteration": 5.561855316162109 }, { "auxiliary_loss_clip": 0.01145048, "auxiliary_loss_mlp": 0.01039303, "balance_loss_clip": 1.02356148, "balance_loss_mlp": 1.04521692, "epoch": 0.37595069893281224, "flos": 24528783317760.0, "grad_norm": 1.9821650185036013, "language_loss": 0.89688945, "learning_rate": 2.7602152309949552e-06, "loss": 0.91873288, "num_input_tokens_seen": 134252020, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8203125, "step": 6253, "time_per_iteration": 2.606444835662842 }, { "auxiliary_loss_clip": 0.01150004, "auxiliary_loss_mlp": 0.01032383, "balance_loss_clip": 1.01831043, "balance_loss_mlp": 1.04633152, "epoch": 0.3760108221854802, "flos": 16435021766400.0, "grad_norm": 1.8471587914580214, "language_loss": 0.76658553, "learning_rate": 2.7598657535911166e-06, "loss": 0.78840935, "num_input_tokens_seen": 134269495, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76953125, "step": 6254, "time_per_iteration": 2.5888822078704834 }, { "auxiliary_loss_clip": 0.01154662, "auxiliary_loss_mlp": 0.01043651, "balance_loss_clip": 1.0275811, "balance_loss_mlp": 1.04683626, "epoch": 0.37607094543814823, "flos": 13771742924160.0, "grad_norm": 2.1229833780870075, "language_loss": 0.61783314, "learning_rate": 2.759516249069623e-06, "loss": 0.63981628, "num_input_tokens_seen": 134287035, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8125, "step": 6255, "time_per_iteration": 2.5490164756774902 }, { "auxiliary_loss_clip": 0.01135522, "auxiliary_loss_mlp": 0.01030874, "balance_loss_clip": 1.01485252, "balance_loss_mlp": 1.04427481, "epoch": 0.3761310686908162, "flos": 19857918453120.0, "grad_norm": 3.254506355678766, "language_loss": 0.7403419, "learning_rate": 2.7591667174429487e-06, "loss": 0.76200593, "num_input_tokens_seen": 134304840, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.82421875, "step": 6256, "time_per_iteration": 4.211950302124023 }, { "auxiliary_loss_clip": 0.01136188, "auxiliary_loss_mlp": 0.01040887, "balance_loss_clip": 1.02443647, "balance_loss_mlp": 1.04647112, "epoch": 0.37619119194348416, "flos": 12750802657920.0, "grad_norm": 2.263190870806062, "language_loss": 0.70480156, "learning_rate": 2.758817158723568e-06, "loss": 0.72657228, "num_input_tokens_seen": 134323180, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.80859375, "step": 6257, "time_per_iteration": 2.574183940887451 }, { "auxiliary_loss_clip": 0.01141421, "auxiliary_loss_mlp": 0.0102876, "balance_loss_clip": 1.0142405, "balance_loss_mlp": 1.04482436, "epoch": 0.3762513151961521, "flos": 17530548624000.0, "grad_norm": 1.654639607665357, "language_loss": 0.84343636, "learning_rate": 2.7584675729239537e-06, "loss": 0.86513817, "num_input_tokens_seen": 134341390, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78515625, "step": 6258, "time_per_iteration": 2.5746653079986572 }, { "auxiliary_loss_clip": 0.01159318, "auxiliary_loss_mlp": 0.01039527, "balance_loss_clip": 1.0251981, "balance_loss_mlp": 1.04503131, "epoch": 0.3763114384488201, "flos": 23617406511360.0, "grad_norm": 1.6056230750594402, "language_loss": 0.80538297, "learning_rate": 2.7581179600565833e-06, "loss": 0.82737142, "num_input_tokens_seen": 134360425, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 6259, "time_per_iteration": 2.6429333686828613 }, { "auxiliary_loss_clip": 0.01162822, "auxiliary_loss_mlp": 0.01042936, "balance_loss_clip": 1.02666366, "balance_loss_mlp": 1.04417324, "epoch": 0.37637156170148806, "flos": 25406978935680.0, "grad_norm": 3.1164882061504833, "language_loss": 0.71110624, "learning_rate": 2.7577683201339324e-06, "loss": 0.73316383, "num_input_tokens_seen": 134379775, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.83203125, "step": 6260, "time_per_iteration": 2.680126667022705 }, { "auxiliary_loss_clip": 0.01142853, "auxiliary_loss_mlp": 0.01037381, "balance_loss_clip": 1.02137113, "balance_loss_mlp": 1.04373455, "epoch": 0.376431684954156, "flos": 23440906056960.0, "grad_norm": 1.6766420509999886, "language_loss": 0.78377664, "learning_rate": 2.75741865316848e-06, "loss": 0.80557901, "num_input_tokens_seen": 134400315, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.81640625, "step": 6261, "time_per_iteration": 2.6428816318511963 }, { "auxiliary_loss_clip": 0.011467, "auxiliary_loss_mlp": 0.01039378, "balance_loss_clip": 1.02390432, "balance_loss_mlp": 1.04560125, "epoch": 0.376491808206824, "flos": 34204482725760.0, "grad_norm": 2.180574042313593, "language_loss": 0.80570489, "learning_rate": 2.757068959172704e-06, "loss": 0.82756567, "num_input_tokens_seen": 134422875, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.83203125, "step": 6262, "time_per_iteration": 2.6960651874542236 }, { "auxiliary_loss_clip": 0.01131743, "auxiliary_loss_mlp": 0.01036037, "balance_loss_clip": 1.02108836, "balance_loss_mlp": 1.04408288, "epoch": 0.37655193145949195, "flos": 35185669614720.0, "grad_norm": 1.6825235325803083, "language_loss": 0.80879188, "learning_rate": 2.7567192381590837e-06, "loss": 0.83046967, "num_input_tokens_seen": 134443025, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.78515625, "step": 6263, "time_per_iteration": 2.7411420345306396 }, { "auxiliary_loss_clip": 0.01125524, "auxiliary_loss_mlp": 0.01040168, "balance_loss_clip": 1.02530241, "balance_loss_mlp": 1.04524922, "epoch": 0.3766120547121599, "flos": 16761844638720.0, "grad_norm": 1.7751918338342878, "language_loss": 0.79802692, "learning_rate": 2.756369490140101e-06, "loss": 0.81968379, "num_input_tokens_seen": 134460945, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 6264, "time_per_iteration": 2.5563995838165283 }, { "auxiliary_loss_clip": 0.01120189, "auxiliary_loss_mlp": 0.010428, "balance_loss_clip": 1.02789903, "balance_loss_mlp": 1.0414784, "epoch": 0.3766721779648279, "flos": 23550361776000.0, "grad_norm": 2.3743852909808894, "language_loss": 0.73682594, "learning_rate": 2.756019715128236e-06, "loss": 0.75845575, "num_input_tokens_seen": 134480440, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 6265, "time_per_iteration": 2.7169625759124756 }, { "auxiliary_loss_clip": 0.0113191, "auxiliary_loss_mlp": 0.01037444, "balance_loss_clip": 1.02360392, "balance_loss_mlp": 1.04592931, "epoch": 0.37673230121749585, "flos": 29129191655040.0, "grad_norm": 1.5734443564084026, "language_loss": 0.68690914, "learning_rate": 2.755669913135973e-06, "loss": 0.70860273, "num_input_tokens_seen": 134501110, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 6266, "time_per_iteration": 2.623112678527832 }, { "auxiliary_loss_clip": 0.01137176, "auxiliary_loss_mlp": 0.01038163, "balance_loss_clip": 1.02262974, "balance_loss_mlp": 1.04367757, "epoch": 0.3767924244701638, "flos": 28146783703680.0, "grad_norm": 2.206157092932477, "language_loss": 0.62394905, "learning_rate": 2.755320084175794e-06, "loss": 0.64570236, "num_input_tokens_seen": 134522460, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.84765625, "step": 6267, "time_per_iteration": 2.609358072280884 }, { "auxiliary_loss_clip": 0.0105082, "auxiliary_loss_mlp": 0.0100653, "balance_loss_clip": 1.00487256, "balance_loss_mlp": 1.01615989, "epoch": 0.37685254772283183, "flos": 60797197526400.0, "grad_norm": 0.7147210716663036, "language_loss": 0.5887146, "learning_rate": 2.7549702282601847e-06, "loss": 0.6092881, "num_input_tokens_seen": 134589545, "router_z_loss_clip": 0.01660156, "router_z_loss_mlp": 0.25390625, "step": 6268, "time_per_iteration": 3.2589478492736816 }, { "auxiliary_loss_clip": 0.01126171, "auxiliary_loss_mlp": 0.01038875, "balance_loss_clip": 1.02294874, "balance_loss_mlp": 1.04506302, "epoch": 0.3769126709754998, "flos": 26032543223040.0, "grad_norm": 1.4615903635361494, "language_loss": 0.64841771, "learning_rate": 2.7546203454016294e-06, "loss": 0.67006814, "num_input_tokens_seen": 134610550, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8125, "step": 6269, "time_per_iteration": 2.6068310737609863 }, { "auxiliary_loss_clip": 0.01149413, "auxiliary_loss_mlp": 0.01037552, "balance_loss_clip": 1.02160239, "balance_loss_mlp": 1.04525459, "epoch": 0.37697279422816776, "flos": 23579879777280.0, "grad_norm": 1.6286709394777465, "language_loss": 0.70631975, "learning_rate": 2.7542704356126154e-06, "loss": 0.72818947, "num_input_tokens_seen": 134630485, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.77734375, "step": 6270, "time_per_iteration": 2.599175214767456 }, { "auxiliary_loss_clip": 0.01041729, "auxiliary_loss_mlp": 0.01001131, "balance_loss_clip": 0.99952143, "balance_loss_mlp": 1.01623225, "epoch": 0.3770329174808357, "flos": 64745935367040.0, "grad_norm": 0.7525849167854248, "language_loss": 0.560848, "learning_rate": 2.7539204989056295e-06, "loss": 0.5812766, "num_input_tokens_seen": 134693510, "router_z_loss_clip": 0.01611328, "router_z_loss_mlp": 0.25585938, "step": 6271, "time_per_iteration": 3.13539981842041 }, { "auxiliary_loss_clip": 0.01129578, "auxiliary_loss_mlp": 0.01036853, "balance_loss_clip": 1.02091455, "balance_loss_mlp": 1.04310918, "epoch": 0.3770930407335037, "flos": 21835304115840.0, "grad_norm": 1.8133465985866, "language_loss": 0.79743975, "learning_rate": 2.753570535293161e-06, "loss": 0.81910408, "num_input_tokens_seen": 134713115, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.77734375, "step": 6272, "time_per_iteration": 2.625572443008423 }, { "auxiliary_loss_clip": 0.01129071, "auxiliary_loss_mlp": 0.01031579, "balance_loss_clip": 1.01828718, "balance_loss_mlp": 1.04351425, "epoch": 0.37715316398617166, "flos": 22747901984640.0, "grad_norm": 1.8568743862311003, "language_loss": 0.74005663, "learning_rate": 2.753220544787698e-06, "loss": 0.7616632, "num_input_tokens_seen": 134732635, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.765625, "step": 6273, "time_per_iteration": 2.565763473510742 }, { "auxiliary_loss_clip": 0.01133902, "auxiliary_loss_mlp": 0.01043362, "balance_loss_clip": 1.02801955, "balance_loss_mlp": 1.04587471, "epoch": 0.3772132872388396, "flos": 18914581520640.0, "grad_norm": 1.609292539987886, "language_loss": 0.7180441, "learning_rate": 2.7528705274017315e-06, "loss": 0.73981678, "num_input_tokens_seen": 134750695, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.7890625, "step": 6274, "time_per_iteration": 2.589343547821045 }, { "auxiliary_loss_clip": 0.0114601, "auxiliary_loss_mlp": 0.01031145, "balance_loss_clip": 1.01709664, "balance_loss_mlp": 1.04727674, "epoch": 0.3772734104915076, "flos": 17346219004800.0, "grad_norm": 1.724667505413484, "language_loss": 0.83620554, "learning_rate": 2.752520483147752e-06, "loss": 0.85797703, "num_input_tokens_seen": 134768935, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80859375, "step": 6275, "time_per_iteration": 2.5484211444854736 }, { "auxiliary_loss_clip": 0.01154777, "auxiliary_loss_mlp": 0.01028145, "balance_loss_clip": 1.01512182, "balance_loss_mlp": 1.04350197, "epoch": 0.37733353374417555, "flos": 32342370785280.0, "grad_norm": 1.4019881681734931, "language_loss": 0.75238609, "learning_rate": 2.7521704120382523e-06, "loss": 0.77421534, "num_input_tokens_seen": 134791260, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7578125, "step": 6276, "time_per_iteration": 2.723653793334961 }, { "auxiliary_loss_clip": 0.01143467, "auxiliary_loss_mlp": 0.01033527, "balance_loss_clip": 1.01794636, "balance_loss_mlp": 1.04561746, "epoch": 0.3773936569968435, "flos": 23360681030400.0, "grad_norm": 3.5548197955960847, "language_loss": 0.85628033, "learning_rate": 2.7518203140857255e-06, "loss": 0.87805027, "num_input_tokens_seen": 134808350, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.80078125, "step": 6277, "time_per_iteration": 2.5174381732940674 }, { "auxiliary_loss_clip": 0.01132017, "auxiliary_loss_mlp": 0.01034556, "balance_loss_clip": 1.0205127, "balance_loss_mlp": 1.04719877, "epoch": 0.3774537802495115, "flos": 21466788531840.0, "grad_norm": 1.6120929807809368, "language_loss": 0.77927172, "learning_rate": 2.7514701893026656e-06, "loss": 0.80093747, "num_input_tokens_seen": 134826005, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 6278, "time_per_iteration": 2.6045117378234863 }, { "auxiliary_loss_clip": 0.01148076, "auxiliary_loss_mlp": 0.01039151, "balance_loss_clip": 1.02303398, "balance_loss_mlp": 1.04799676, "epoch": 0.37751390350217945, "flos": 24973717086720.0, "grad_norm": 2.9408159875092994, "language_loss": 0.83113897, "learning_rate": 2.751120037701568e-06, "loss": 0.85301125, "num_input_tokens_seen": 134844995, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8203125, "step": 6279, "time_per_iteration": 2.5679845809936523 }, { "auxiliary_loss_clip": 0.01142847, "auxiliary_loss_mlp": 0.01035859, "balance_loss_clip": 1.02173865, "balance_loss_mlp": 1.04630888, "epoch": 0.3775740267548474, "flos": 27819098904960.0, "grad_norm": 2.1907928975145685, "language_loss": 0.74710906, "learning_rate": 2.7507698592949276e-06, "loss": 0.7688961, "num_input_tokens_seen": 134865285, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7890625, "step": 6280, "time_per_iteration": 2.659062147140503 }, { "auxiliary_loss_clip": 0.01124025, "auxiliary_loss_mlp": 0.01038341, "balance_loss_clip": 1.02507925, "balance_loss_mlp": 1.04773045, "epoch": 0.3776341500075154, "flos": 22565224391040.0, "grad_norm": 1.9494499448853575, "language_loss": 0.76133835, "learning_rate": 2.750419654095243e-06, "loss": 0.78296202, "num_input_tokens_seen": 134886535, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.765625, "step": 6281, "time_per_iteration": 2.5337088108062744 }, { "auxiliary_loss_clip": 0.01142525, "auxiliary_loss_mlp": 0.01034511, "balance_loss_clip": 1.01934111, "balance_loss_mlp": 1.04644203, "epoch": 0.3776942732601834, "flos": 23077238808960.0, "grad_norm": 1.4402557710808546, "language_loss": 0.84298801, "learning_rate": 2.75006942211501e-06, "loss": 0.86475837, "num_input_tokens_seen": 134907435, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.78125, "step": 6282, "time_per_iteration": 2.6274452209472656 }, { "auxiliary_loss_clip": 0.01131835, "auxiliary_loss_mlp": 0.01033357, "balance_loss_clip": 1.01924253, "balance_loss_mlp": 1.0463264, "epoch": 0.37775439651285136, "flos": 21724411852800.0, "grad_norm": 1.6218831155322113, "language_loss": 0.69882554, "learning_rate": 2.74971916336673e-06, "loss": 0.72047752, "num_input_tokens_seen": 134925360, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 6283, "time_per_iteration": 2.591991424560547 }, { "auxiliary_loss_clip": 0.01168481, "auxiliary_loss_mlp": 0.0103972, "balance_loss_clip": 1.02486622, "balance_loss_mlp": 1.04666066, "epoch": 0.37781451976551933, "flos": 23987753688960.0, "grad_norm": 1.794838275777396, "language_loss": 0.76237702, "learning_rate": 2.7493688778629012e-06, "loss": 0.78445899, "num_input_tokens_seen": 134944205, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.765625, "step": 6284, "time_per_iteration": 2.7073681354522705 }, { "auxiliary_loss_clip": 0.01133581, "auxiliary_loss_mlp": 0.01037921, "balance_loss_clip": 1.02195847, "balance_loss_mlp": 1.04969049, "epoch": 0.3778746430181873, "flos": 13727967223680.0, "grad_norm": 1.9874129103808997, "language_loss": 0.84607381, "learning_rate": 2.7490185656160244e-06, "loss": 0.86778879, "num_input_tokens_seen": 134960255, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.83984375, "step": 6285, "time_per_iteration": 2.50454044342041 }, { "auxiliary_loss_clip": 0.01146667, "auxiliary_loss_mlp": 0.01037279, "balance_loss_clip": 1.02126932, "balance_loss_mlp": 1.04769254, "epoch": 0.37793476627085526, "flos": 19460495399040.0, "grad_norm": 2.0104920517563234, "language_loss": 0.84379959, "learning_rate": 2.7486682266386025e-06, "loss": 0.86563909, "num_input_tokens_seen": 134978605, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8125, "step": 6286, "time_per_iteration": 2.566216230392456 }, { "auxiliary_loss_clip": 0.01135216, "auxiliary_loss_mlp": 0.01040276, "balance_loss_clip": 1.0250411, "balance_loss_mlp": 1.04749835, "epoch": 0.3779948895235232, "flos": 10707018704640.0, "grad_norm": 1.876145532373858, "language_loss": 0.81946635, "learning_rate": 2.748317860943137e-06, "loss": 0.84122121, "num_input_tokens_seen": 134995020, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78515625, "step": 6287, "time_per_iteration": 3.8871474266052246 }, { "auxiliary_loss_clip": 0.01134209, "auxiliary_loss_mlp": 0.01041225, "balance_loss_clip": 1.02647901, "balance_loss_mlp": 1.04664421, "epoch": 0.3780550127761912, "flos": 22310007281280.0, "grad_norm": 2.737630406775811, "language_loss": 0.72868955, "learning_rate": 2.747967468542132e-06, "loss": 0.75044388, "num_input_tokens_seen": 135012620, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78515625, "step": 6288, "time_per_iteration": 2.5597407817840576 }, { "auxiliary_loss_clip": 0.01137523, "auxiliary_loss_mlp": 0.01039237, "balance_loss_clip": 1.02417529, "balance_loss_mlp": 1.05001855, "epoch": 0.37811513602885916, "flos": 28950644125440.0, "grad_norm": 1.6403983920501635, "language_loss": 0.75185639, "learning_rate": 2.7476170494480915e-06, "loss": 0.77362394, "num_input_tokens_seen": 135033365, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.78515625, "step": 6289, "time_per_iteration": 2.696906089782715 }, { "auxiliary_loss_clip": 0.01133444, "auxiliary_loss_mlp": 0.01040673, "balance_loss_clip": 1.02686846, "balance_loss_mlp": 1.04701352, "epoch": 0.3781752592815271, "flos": 23112933949440.0, "grad_norm": 2.444369615045153, "language_loss": 0.7417919, "learning_rate": 2.7472666036735225e-06, "loss": 0.76353312, "num_input_tokens_seen": 135052185, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 6290, "time_per_iteration": 2.6351780891418457 }, { "auxiliary_loss_clip": 0.01158024, "auxiliary_loss_mlp": 0.01044217, "balance_loss_clip": 1.02758694, "balance_loss_mlp": 1.04772246, "epoch": 0.3782353825341951, "flos": 19755932762880.0, "grad_norm": 2.652493603602375, "language_loss": 0.78696489, "learning_rate": 2.74691613123093e-06, "loss": 0.80898732, "num_input_tokens_seen": 135070425, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8359375, "step": 6291, "time_per_iteration": 2.6989126205444336 }, { "auxiliary_loss_clip": 0.01155508, "auxiliary_loss_mlp": 0.01040766, "balance_loss_clip": 1.02451801, "balance_loss_mlp": 1.04699731, "epoch": 0.37829550578686305, "flos": 22050839675520.0, "grad_norm": 1.8371703489470612, "language_loss": 0.76146781, "learning_rate": 2.746565632132822e-06, "loss": 0.78343058, "num_input_tokens_seen": 135090525, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.82421875, "step": 6292, "time_per_iteration": 2.583754062652588 }, { "auxiliary_loss_clip": 0.01136745, "auxiliary_loss_mlp": 0.0104888, "balance_loss_clip": 1.03282249, "balance_loss_mlp": 1.04718041, "epoch": 0.378355629039531, "flos": 16470357770880.0, "grad_norm": 1.900148112016601, "language_loss": 0.69669867, "learning_rate": 2.746215106391707e-06, "loss": 0.71855485, "num_input_tokens_seen": 135109575, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8046875, "step": 6293, "time_per_iteration": 3.987938165664673 }, { "auxiliary_loss_clip": 0.01123104, "auxiliary_loss_mlp": 0.01041346, "balance_loss_clip": 1.02595603, "balance_loss_mlp": 1.04552317, "epoch": 0.378415752292199, "flos": 19974844200960.0, "grad_norm": 1.8194677866161133, "language_loss": 0.71279722, "learning_rate": 2.745864554020095e-06, "loss": 0.73444176, "num_input_tokens_seen": 135127000, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.77734375, "step": 6294, "time_per_iteration": 4.089040040969849 }, { "auxiliary_loss_clip": 0.01150751, "auxiliary_loss_mlp": 0.01039393, "balance_loss_clip": 1.02354372, "balance_loss_mlp": 1.04956961, "epoch": 0.378475875544867, "flos": 14647388676480.0, "grad_norm": 2.0354711913695533, "language_loss": 0.82722223, "learning_rate": 2.7455139750304947e-06, "loss": 0.84912372, "num_input_tokens_seen": 135145285, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83203125, "step": 6295, "time_per_iteration": 2.633563756942749 }, { "auxiliary_loss_clip": 0.01138662, "auxiliary_loss_mlp": 0.01041002, "balance_loss_clip": 1.02613091, "balance_loss_mlp": 1.04890752, "epoch": 0.37853599879753497, "flos": 26650996617600.0, "grad_norm": 1.7584720927033857, "language_loss": 0.71619487, "learning_rate": 2.7451633694354194e-06, "loss": 0.73799157, "num_input_tokens_seen": 135165240, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 6296, "time_per_iteration": 2.6315038204193115 }, { "auxiliary_loss_clip": 0.01123623, "auxiliary_loss_mlp": 0.01046738, "balance_loss_clip": 1.03158617, "balance_loss_mlp": 1.04764128, "epoch": 0.37859612205020293, "flos": 17311960408320.0, "grad_norm": 1.641543817613873, "language_loss": 0.77052903, "learning_rate": 2.7448127372473793e-06, "loss": 0.79223263, "num_input_tokens_seen": 135184045, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7578125, "step": 6297, "time_per_iteration": 3.998704195022583 }, { "auxiliary_loss_clip": 0.01145495, "auxiliary_loss_mlp": 0.01040117, "balance_loss_clip": 1.02582967, "balance_loss_mlp": 1.04728889, "epoch": 0.3786562453028709, "flos": 18220392299520.0, "grad_norm": 3.623716282137106, "language_loss": 0.78870678, "learning_rate": 2.7444620784788887e-06, "loss": 0.81056297, "num_input_tokens_seen": 135202365, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 6298, "time_per_iteration": 2.52264142036438 }, { "auxiliary_loss_clip": 0.01160149, "auxiliary_loss_mlp": 0.01041602, "balance_loss_clip": 1.0269866, "balance_loss_mlp": 1.04709828, "epoch": 0.37871636855553886, "flos": 21214875473280.0, "grad_norm": 1.4479004919774554, "language_loss": 0.84011632, "learning_rate": 2.744111393142462e-06, "loss": 0.86213386, "num_input_tokens_seen": 135220955, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.76953125, "step": 6299, "time_per_iteration": 2.619232416152954 }, { "auxiliary_loss_clip": 0.01156039, "auxiliary_loss_mlp": 0.01033942, "balance_loss_clip": 1.01942837, "balance_loss_mlp": 1.04879677, "epoch": 0.3787764918082068, "flos": 20952727038720.0, "grad_norm": 2.0708598882088505, "language_loss": 0.75992846, "learning_rate": 2.743760681250613e-06, "loss": 0.78182828, "num_input_tokens_seen": 135239715, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 6300, "time_per_iteration": 2.537423849105835 }, { "auxiliary_loss_clip": 0.01148505, "auxiliary_loss_mlp": 0.01039365, "balance_loss_clip": 1.02273488, "balance_loss_mlp": 1.04783309, "epoch": 0.3788366150608748, "flos": 17308009912320.0, "grad_norm": 3.6826117842078987, "language_loss": 0.82029831, "learning_rate": 2.743409942815859e-06, "loss": 0.84217703, "num_input_tokens_seen": 135257035, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.828125, "step": 6301, "time_per_iteration": 2.6510589122772217 }, { "auxiliary_loss_clip": 0.01132671, "auxiliary_loss_mlp": 0.01041892, "balance_loss_clip": 1.02712226, "balance_loss_mlp": 1.04431677, "epoch": 0.37889673831354276, "flos": 24311092942080.0, "grad_norm": 2.2443712732520615, "language_loss": 0.68110347, "learning_rate": 2.743059177850716e-06, "loss": 0.70284909, "num_input_tokens_seen": 135275720, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.79296875, "step": 6302, "time_per_iteration": 2.5789222717285156 }, { "auxiliary_loss_clip": 0.01160746, "auxiliary_loss_mlp": 0.01042465, "balance_loss_clip": 1.02880323, "balance_loss_mlp": 1.0480355, "epoch": 0.3789568615662107, "flos": 26683603188480.0, "grad_norm": 1.990853535854019, "language_loss": 0.68219346, "learning_rate": 2.7427083863677035e-06, "loss": 0.7042256, "num_input_tokens_seen": 135294140, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76953125, "step": 6303, "time_per_iteration": 2.690908670425415 }, { "auxiliary_loss_clip": 0.01122651, "auxiliary_loss_mlp": 0.01032291, "balance_loss_clip": 1.01800346, "balance_loss_mlp": 1.04464555, "epoch": 0.3790169848188787, "flos": 23585194990080.0, "grad_norm": 1.4659871063978243, "language_loss": 0.77221149, "learning_rate": 2.742357568379338e-06, "loss": 0.7937609, "num_input_tokens_seen": 135314845, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 6304, "time_per_iteration": 2.603346824645996 }, { "auxiliary_loss_clip": 0.01133097, "auxiliary_loss_mlp": 0.01040718, "balance_loss_clip": 1.02452946, "balance_loss_mlp": 1.05015624, "epoch": 0.37907710807154665, "flos": 18437436230400.0, "grad_norm": 2.0934776527142125, "language_loss": 0.804057, "learning_rate": 2.7420067238981405e-06, "loss": 0.82579517, "num_input_tokens_seen": 135333055, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.828125, "step": 6305, "time_per_iteration": 2.666517972946167 }, { "auxiliary_loss_clip": 0.01051743, "auxiliary_loss_mlp": 0.01001608, "balance_loss_clip": 0.99961704, "balance_loss_mlp": 1.01789379, "epoch": 0.3791372313242146, "flos": 50107165954560.0, "grad_norm": 0.9725624982832292, "language_loss": 0.64514494, "learning_rate": 2.741655852936632e-06, "loss": 0.6656785, "num_input_tokens_seen": 135387865, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.25, "step": 6306, "time_per_iteration": 3.058983087539673 }, { "auxiliary_loss_clip": 0.01147605, "auxiliary_loss_mlp": 0.01040148, "balance_loss_clip": 1.0247221, "balance_loss_mlp": 1.04944384, "epoch": 0.3791973545768826, "flos": 24316551809280.0, "grad_norm": 1.7001391160626096, "language_loss": 0.73372519, "learning_rate": 2.741304955507334e-06, "loss": 0.75560272, "num_input_tokens_seen": 135409095, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8046875, "step": 6307, "time_per_iteration": 2.6346912384033203 }, { "auxiliary_loss_clip": 0.01145302, "auxiliary_loss_mlp": 0.01039423, "balance_loss_clip": 1.0241468, "balance_loss_mlp": 1.04652047, "epoch": 0.3792574778295506, "flos": 21579907438080.0, "grad_norm": 1.5275229909603194, "language_loss": 0.78138447, "learning_rate": 2.7409540316227686e-06, "loss": 0.80323172, "num_input_tokens_seen": 135429585, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.80859375, "step": 6308, "time_per_iteration": 2.6020379066467285 }, { "auxiliary_loss_clip": 0.01151718, "auxiliary_loss_mlp": 0.01039293, "balance_loss_clip": 1.02398026, "balance_loss_mlp": 1.04503655, "epoch": 0.37931760108221857, "flos": 22272731942400.0, "grad_norm": 2.22366666002282, "language_loss": 0.73072219, "learning_rate": 2.7406030812954596e-06, "loss": 0.75263226, "num_input_tokens_seen": 135446320, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.796875, "step": 6309, "time_per_iteration": 2.6089463233947754 }, { "auxiliary_loss_clip": 0.01159836, "auxiliary_loss_mlp": 0.01036004, "balance_loss_clip": 1.02116227, "balance_loss_mlp": 1.04606915, "epoch": 0.37937772433488653, "flos": 19682998197120.0, "grad_norm": 1.4482362988033781, "language_loss": 0.78491008, "learning_rate": 2.740252104537932e-06, "loss": 0.80686843, "num_input_tokens_seen": 135465720, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78125, "step": 6310, "time_per_iteration": 2.745304584503174 }, { "auxiliary_loss_clip": 0.01132775, "auxiliary_loss_mlp": 0.01285302, "balance_loss_clip": 1.02417159, "balance_loss_mlp": 1.04547441, "epoch": 0.3794378475875545, "flos": 19099378016640.0, "grad_norm": 1.7208057202831397, "language_loss": 0.75704134, "learning_rate": 2.7399011013627112e-06, "loss": 0.78122211, "num_input_tokens_seen": 135485155, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78515625, "step": 6311, "time_per_iteration": 2.588958740234375 }, { "auxiliary_loss_clip": 0.01142084, "auxiliary_loss_mlp": 0.0103266, "balance_loss_clip": 1.01813996, "balance_loss_mlp": 1.04584861, "epoch": 0.37949797084022246, "flos": 20339660684160.0, "grad_norm": 1.7283032963491964, "language_loss": 0.70890391, "learning_rate": 2.7395500717823233e-06, "loss": 0.73065132, "num_input_tokens_seen": 135502675, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78125, "step": 6312, "time_per_iteration": 2.6557631492614746 }, { "auxiliary_loss_clip": 0.01067195, "auxiliary_loss_mlp": 0.01003026, "balance_loss_clip": 1.0012145, "balance_loss_mlp": 1.01577377, "epoch": 0.37955809409289043, "flos": 63972203477760.0, "grad_norm": 0.7787308501321109, "language_loss": 0.56078881, "learning_rate": 2.739199015809296e-06, "loss": 0.58149111, "num_input_tokens_seen": 135562005, "router_z_loss_clip": 0.01806641, "router_z_loss_mlp": 0.25, "step": 6313, "time_per_iteration": 3.1071767807006836 }, { "auxiliary_loss_clip": 0.01151254, "auxiliary_loss_mlp": 0.01039369, "balance_loss_clip": 1.0244205, "balance_loss_mlp": 1.04619825, "epoch": 0.3796182173455584, "flos": 31540665179520.0, "grad_norm": 1.7353990738948797, "language_loss": 0.71568334, "learning_rate": 2.738847933456156e-06, "loss": 0.7375896, "num_input_tokens_seen": 135582600, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.78125, "step": 6314, "time_per_iteration": 2.682750701904297 }, { "auxiliary_loss_clip": 0.01138937, "auxiliary_loss_mlp": 0.01036687, "balance_loss_clip": 1.02140999, "balance_loss_mlp": 1.04748893, "epoch": 0.37967834059822636, "flos": 12130804978560.0, "grad_norm": 2.452851458472667, "language_loss": 0.73106229, "learning_rate": 2.738496824735435e-06, "loss": 0.75281852, "num_input_tokens_seen": 135600280, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.828125, "step": 6315, "time_per_iteration": 2.4879283905029297 }, { "auxiliary_loss_clip": 0.01130946, "auxiliary_loss_mlp": 0.01036095, "balance_loss_clip": 1.02155149, "balance_loss_mlp": 1.04474926, "epoch": 0.3797384638508943, "flos": 39348578298240.0, "grad_norm": 2.351024599825698, "language_loss": 0.70429641, "learning_rate": 2.738145689659661e-06, "loss": 0.72596681, "num_input_tokens_seen": 135621560, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.76953125, "step": 6316, "time_per_iteration": 2.7224812507629395 }, { "auxiliary_loss_clip": 0.01150789, "auxiliary_loss_mlp": 0.01034859, "balance_loss_clip": 1.02149582, "balance_loss_mlp": 1.04709315, "epoch": 0.3797985871035623, "flos": 34054016653440.0, "grad_norm": 1.7961612470893968, "language_loss": 0.6538713, "learning_rate": 2.737794528241367e-06, "loss": 0.67572778, "num_input_tokens_seen": 135641745, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.76953125, "step": 6317, "time_per_iteration": 2.6840944290161133 }, { "auxiliary_loss_clip": 0.01130518, "auxiliary_loss_mlp": 0.01033198, "balance_loss_clip": 1.01975679, "balance_loss_mlp": 1.04438663, "epoch": 0.37985871035623026, "flos": 23222174186880.0, "grad_norm": 2.759514440860021, "language_loss": 0.85361886, "learning_rate": 2.737443340493084e-06, "loss": 0.875256, "num_input_tokens_seen": 135660650, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7734375, "step": 6318, "time_per_iteration": 2.6305928230285645 }, { "auxiliary_loss_clip": 0.01145298, "auxiliary_loss_mlp": 0.01038598, "balance_loss_clip": 1.02315998, "balance_loss_mlp": 1.04677534, "epoch": 0.3799188336088982, "flos": 18114958903680.0, "grad_norm": 2.386560928932603, "language_loss": 0.76124787, "learning_rate": 2.737092126427345e-06, "loss": 0.78308678, "num_input_tokens_seen": 135679980, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.80859375, "step": 6319, "time_per_iteration": 2.5984678268432617 }, { "auxiliary_loss_clip": 0.01125719, "auxiliary_loss_mlp": 0.01036842, "balance_loss_clip": 1.02241755, "balance_loss_mlp": 1.04702628, "epoch": 0.3799789568615662, "flos": 21871897096320.0, "grad_norm": 1.4808367696577505, "language_loss": 0.64147735, "learning_rate": 2.736740886056684e-06, "loss": 0.66310298, "num_input_tokens_seen": 135699400, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 6320, "time_per_iteration": 2.5915844440460205 }, { "auxiliary_loss_clip": 0.01158782, "auxiliary_loss_mlp": 0.01035644, "balance_loss_clip": 1.02067101, "balance_loss_mlp": 1.04532671, "epoch": 0.3800390801142342, "flos": 32962943082240.0, "grad_norm": 1.9574462244756852, "language_loss": 0.71120453, "learning_rate": 2.7363896193936356e-06, "loss": 0.73314875, "num_input_tokens_seen": 135723455, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.78125, "step": 6321, "time_per_iteration": 2.7579219341278076 }, { "auxiliary_loss_clip": 0.01153925, "auxiliary_loss_mlp": 0.0103509, "balance_loss_clip": 1.02096391, "balance_loss_mlp": 1.04596984, "epoch": 0.38009920336690217, "flos": 26907075653760.0, "grad_norm": 1.6240249965915052, "language_loss": 0.74385816, "learning_rate": 2.7360383264507364e-06, "loss": 0.76574832, "num_input_tokens_seen": 135744335, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.8125, "step": 6322, "time_per_iteration": 2.6151280403137207 }, { "auxiliary_loss_clip": 0.0113224, "auxiliary_loss_mlp": 0.01039151, "balance_loss_clip": 1.02396321, "balance_loss_mlp": 1.04492378, "epoch": 0.38015932661957014, "flos": 22488913946880.0, "grad_norm": 2.289264763125929, "language_loss": 0.8456223, "learning_rate": 2.735687007240522e-06, "loss": 0.86733615, "num_input_tokens_seen": 135761440, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.78125, "step": 6323, "time_per_iteration": 2.5985116958618164 }, { "auxiliary_loss_clip": 0.01143766, "auxiliary_loss_mlp": 0.0103544, "balance_loss_clip": 1.01954889, "balance_loss_mlp": 1.04605436, "epoch": 0.3802194498722381, "flos": 21980993679360.0, "grad_norm": 5.945520727160387, "language_loss": 0.72910082, "learning_rate": 2.735335661775531e-06, "loss": 0.75089288, "num_input_tokens_seen": 135779955, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.796875, "step": 6324, "time_per_iteration": 2.559178590774536 }, { "auxiliary_loss_clip": 0.0115357, "auxiliary_loss_mlp": 0.0103459, "balance_loss_clip": 1.01962292, "balance_loss_mlp": 1.04697442, "epoch": 0.38027957312490607, "flos": 21324869896320.0, "grad_norm": 1.7688857548535293, "language_loss": 0.84320056, "learning_rate": 2.734984290068302e-06, "loss": 0.86508209, "num_input_tokens_seen": 135799840, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 6325, "time_per_iteration": 2.5955307483673096 }, { "auxiliary_loss_clip": 0.01140529, "auxiliary_loss_mlp": 0.01031794, "balance_loss_clip": 1.01772761, "balance_loss_mlp": 1.04510903, "epoch": 0.38033969637757403, "flos": 16691244456960.0, "grad_norm": 2.4548374756537075, "language_loss": 0.79611611, "learning_rate": 2.734632892131374e-06, "loss": 0.81783938, "num_input_tokens_seen": 135817880, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.77734375, "step": 6326, "time_per_iteration": 2.5153253078460693 }, { "auxiliary_loss_clip": 0.0112883, "auxiliary_loss_mlp": 0.01036172, "balance_loss_clip": 1.02237952, "balance_loss_mlp": 1.04262912, "epoch": 0.380399819630242, "flos": 36210847685760.0, "grad_norm": 2.270481580318478, "language_loss": 0.73043597, "learning_rate": 2.734281467977288e-06, "loss": 0.75208592, "num_input_tokens_seen": 135838940, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7734375, "step": 6327, "time_per_iteration": 2.648698091506958 }, { "auxiliary_loss_clip": 0.01121943, "auxiliary_loss_mlp": 0.01277829, "balance_loss_clip": 1.01621103, "balance_loss_mlp": 1.04466295, "epoch": 0.38045994288290996, "flos": 21288851533440.0, "grad_norm": 1.510587442524201, "language_loss": 0.83089149, "learning_rate": 2.733930017618585e-06, "loss": 0.85488927, "num_input_tokens_seen": 135858325, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 6328, "time_per_iteration": 4.020441770553589 }, { "auxiliary_loss_clip": 0.01129163, "auxiliary_loss_mlp": 0.01030126, "balance_loss_clip": 1.01579118, "balance_loss_mlp": 1.04339135, "epoch": 0.38052006613557793, "flos": 20922885815040.0, "grad_norm": 1.4003983999841345, "language_loss": 0.61142814, "learning_rate": 2.733578541067808e-06, "loss": 0.633021, "num_input_tokens_seen": 135878430, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.76953125, "step": 6329, "time_per_iteration": 2.5655853748321533 }, { "auxiliary_loss_clip": 0.01142702, "auxiliary_loss_mlp": 0.01033959, "balance_loss_clip": 1.01968932, "balance_loss_mlp": 1.04557157, "epoch": 0.3805801893882459, "flos": 20990720649600.0, "grad_norm": 1.8400549226743752, "language_loss": 0.5620873, "learning_rate": 2.733227038337499e-06, "loss": 0.58385384, "num_input_tokens_seen": 135894755, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 6330, "time_per_iteration": 2.5357418060302734 }, { "auxiliary_loss_clip": 0.01146215, "auxiliary_loss_mlp": 0.0103141, "balance_loss_clip": 1.0180707, "balance_loss_mlp": 1.04503918, "epoch": 0.38064031264091386, "flos": 25558594243200.0, "grad_norm": 1.5058848945502565, "language_loss": 0.65904319, "learning_rate": 2.7328755094402036e-06, "loss": 0.68081945, "num_input_tokens_seen": 135918275, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 6331, "time_per_iteration": 2.6230974197387695 }, { "auxiliary_loss_clip": 0.01133146, "auxiliary_loss_mlp": 0.01036556, "balance_loss_clip": 1.02151787, "balance_loss_mlp": 1.04569602, "epoch": 0.3807004358935818, "flos": 15085857997440.0, "grad_norm": 1.8018597679077144, "language_loss": 0.75573087, "learning_rate": 2.732523954388466e-06, "loss": 0.77742785, "num_input_tokens_seen": 135937430, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.78125, "step": 6332, "time_per_iteration": 2.53395676612854 }, { "auxiliary_loss_clip": 0.01170019, "auxiliary_loss_mlp": 0.01284877, "balance_loss_clip": 1.02252078, "balance_loss_mlp": 1.04406583, "epoch": 0.3807605591462498, "flos": 16399398453120.0, "grad_norm": 2.050195778090314, "language_loss": 0.81300688, "learning_rate": 2.732172373194834e-06, "loss": 0.83755583, "num_input_tokens_seen": 135954210, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.80859375, "step": 6333, "time_per_iteration": 2.59071946144104 }, { "auxiliary_loss_clip": 0.01130646, "auxiliary_loss_mlp": 0.0103188, "balance_loss_clip": 1.01743221, "balance_loss_mlp": 1.04386866, "epoch": 0.3808206823989178, "flos": 29057083102080.0, "grad_norm": 1.9543369143632432, "language_loss": 0.86240554, "learning_rate": 2.731820765871853e-06, "loss": 0.88403082, "num_input_tokens_seen": 135974425, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 6334, "time_per_iteration": 2.7066268920898438 }, { "auxiliary_loss_clip": 0.01131725, "auxiliary_loss_mlp": 0.01036204, "balance_loss_clip": 1.02131462, "balance_loss_mlp": 1.04367924, "epoch": 0.3808808056515858, "flos": 15705855676800.0, "grad_norm": 1.8072837020836425, "language_loss": 0.7896052, "learning_rate": 2.7314691324320705e-06, "loss": 0.81128448, "num_input_tokens_seen": 135991985, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 6335, "time_per_iteration": 3.965275287628174 }, { "auxiliary_loss_clip": 0.01133616, "auxiliary_loss_mlp": 0.01037991, "balance_loss_clip": 1.02233911, "balance_loss_mlp": 1.04424143, "epoch": 0.38094092890425374, "flos": 20704584908160.0, "grad_norm": 2.59766672169301, "language_loss": 0.73259366, "learning_rate": 2.7311174728880364e-06, "loss": 0.75430965, "num_input_tokens_seen": 136010015, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8046875, "step": 6336, "time_per_iteration": 4.186552047729492 }, { "auxiliary_loss_clip": 0.01129824, "auxiliary_loss_mlp": 0.01032469, "balance_loss_clip": 1.01853323, "balance_loss_mlp": 1.04309869, "epoch": 0.3810010521569217, "flos": 20667956014080.0, "grad_norm": 2.0520881873961705, "language_loss": 0.6935209, "learning_rate": 2.730765787252301e-06, "loss": 0.7151438, "num_input_tokens_seen": 136028440, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.77734375, "step": 6337, "time_per_iteration": 2.6709539890289307 }, { "auxiliary_loss_clip": 0.0114235, "auxiliary_loss_mlp": 0.01032714, "balance_loss_clip": 1.01776481, "balance_loss_mlp": 1.0453974, "epoch": 0.38106117540958967, "flos": 31827626933760.0, "grad_norm": 1.8818411022960035, "language_loss": 0.6324321, "learning_rate": 2.7304140755374137e-06, "loss": 0.65418279, "num_input_tokens_seen": 136048360, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7890625, "step": 6338, "time_per_iteration": 2.6661579608917236 }, { "auxiliary_loss_clip": 0.01141627, "auxiliary_loss_mlp": 0.01039365, "balance_loss_clip": 1.02391529, "balance_loss_mlp": 1.0436362, "epoch": 0.38112129866225763, "flos": 16902757693440.0, "grad_norm": 1.6938543999238402, "language_loss": 0.69441122, "learning_rate": 2.7300623377559273e-06, "loss": 0.71622109, "num_input_tokens_seen": 136065500, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.80078125, "step": 6339, "time_per_iteration": 4.807001829147339 }, { "auxiliary_loss_clip": 0.01143044, "auxiliary_loss_mlp": 0.01039696, "balance_loss_clip": 1.0253675, "balance_loss_mlp": 1.045632, "epoch": 0.3811814219149256, "flos": 20887226588160.0, "grad_norm": 2.5425220995663067, "language_loss": 0.67554975, "learning_rate": 2.729710573920394e-06, "loss": 0.69737715, "num_input_tokens_seen": 136084060, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.796875, "step": 6340, "time_per_iteration": 2.6528561115264893 }, { "auxiliary_loss_clip": 0.01151579, "auxiliary_loss_mlp": 0.01040909, "balance_loss_clip": 1.02486372, "balance_loss_mlp": 1.04376674, "epoch": 0.38124154516759357, "flos": 16690813493760.0, "grad_norm": 1.9747152818485394, "language_loss": 0.89526951, "learning_rate": 2.729358784043367e-06, "loss": 0.91719443, "num_input_tokens_seen": 136102310, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8125, "step": 6341, "time_per_iteration": 2.5844268798828125 }, { "auxiliary_loss_clip": 0.01135287, "auxiliary_loss_mlp": 0.01040704, "balance_loss_clip": 1.02434218, "balance_loss_mlp": 1.04472661, "epoch": 0.38130166842026153, "flos": 19681956702720.0, "grad_norm": 1.5531093355837424, "language_loss": 0.74970627, "learning_rate": 2.7290069681374018e-06, "loss": 0.7714662, "num_input_tokens_seen": 136120725, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.81640625, "step": 6342, "time_per_iteration": 2.617748260498047 }, { "auxiliary_loss_clip": 0.01141238, "auxiliary_loss_mlp": 0.01037336, "balance_loss_clip": 1.02229142, "balance_loss_mlp": 1.04368639, "epoch": 0.3813617916729295, "flos": 22198432659840.0, "grad_norm": 1.6388935552372867, "language_loss": 0.83554852, "learning_rate": 2.7286551262150522e-06, "loss": 0.85733426, "num_input_tokens_seen": 136139105, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 6343, "time_per_iteration": 2.534748077392578 }, { "auxiliary_loss_clip": 0.01166223, "auxiliary_loss_mlp": 0.01037045, "balance_loss_clip": 1.02288866, "balance_loss_mlp": 1.0416038, "epoch": 0.38142191492559746, "flos": 19096899978240.0, "grad_norm": 2.9546658799029357, "language_loss": 0.76860678, "learning_rate": 2.7283032582888763e-06, "loss": 0.79063952, "num_input_tokens_seen": 136158265, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.796875, "step": 6344, "time_per_iteration": 2.6417739391326904 }, { "auxiliary_loss_clip": 0.01150575, "auxiliary_loss_mlp": 0.01037895, "balance_loss_clip": 1.02240396, "balance_loss_mlp": 1.04450941, "epoch": 0.3814820381782654, "flos": 24097748112000.0, "grad_norm": 2.200215465191671, "language_loss": 0.72855771, "learning_rate": 2.7279513643714304e-06, "loss": 0.75044245, "num_input_tokens_seen": 136176100, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.79296875, "step": 6345, "time_per_iteration": 2.5784411430358887 }, { "auxiliary_loss_clip": 0.01147556, "auxiliary_loss_mlp": 0.01280322, "balance_loss_clip": 1.01936913, "balance_loss_mlp": 1.0427494, "epoch": 0.3815421614309334, "flos": 15778502933760.0, "grad_norm": 2.2882182671430913, "language_loss": 0.6945892, "learning_rate": 2.727599444475272e-06, "loss": 0.71886802, "num_input_tokens_seen": 136195125, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 6346, "time_per_iteration": 2.5997447967529297 }, { "auxiliary_loss_clip": 0.01132848, "auxiliary_loss_mlp": 0.01034904, "balance_loss_clip": 1.02081347, "balance_loss_mlp": 1.04389191, "epoch": 0.38160228468360136, "flos": 19899754819200.0, "grad_norm": 1.7683066095088382, "language_loss": 0.75078821, "learning_rate": 2.7272474986129622e-06, "loss": 0.77246571, "num_input_tokens_seen": 136213885, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 6347, "time_per_iteration": 2.558389663696289 }, { "auxiliary_loss_clip": 0.01131576, "auxiliary_loss_mlp": 0.01037816, "balance_loss_clip": 1.02419066, "balance_loss_mlp": 1.04357672, "epoch": 0.3816624079362694, "flos": 19281050029440.0, "grad_norm": 2.1201012711019005, "language_loss": 0.74421561, "learning_rate": 2.7268955267970594e-06, "loss": 0.76590955, "num_input_tokens_seen": 136232700, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.79296875, "step": 6348, "time_per_iteration": 2.5824496746063232 }, { "auxiliary_loss_clip": 0.01139778, "auxiliary_loss_mlp": 0.01033578, "balance_loss_clip": 1.01904082, "balance_loss_mlp": 1.04328036, "epoch": 0.38172253118893734, "flos": 21177564220800.0, "grad_norm": 1.849593149310156, "language_loss": 0.87221551, "learning_rate": 2.726543529040125e-06, "loss": 0.89394909, "num_input_tokens_seen": 136248975, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 6349, "time_per_iteration": 2.6218559741973877 }, { "auxiliary_loss_clip": 0.01131336, "auxiliary_loss_mlp": 0.01038323, "balance_loss_clip": 1.02385056, "balance_loss_mlp": 1.04313374, "epoch": 0.3817826544416053, "flos": 17529219820800.0, "grad_norm": 1.6462072145469702, "language_loss": 0.7617287, "learning_rate": 2.7261915053547216e-06, "loss": 0.78342533, "num_input_tokens_seen": 136266710, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.79296875, "step": 6350, "time_per_iteration": 2.5835447311401367 }, { "auxiliary_loss_clip": 0.01148475, "auxiliary_loss_mlp": 0.01033648, "balance_loss_clip": 1.01742411, "balance_loss_mlp": 1.04257154, "epoch": 0.38184277769427327, "flos": 16326535714560.0, "grad_norm": 2.3947198794007916, "language_loss": 0.75910723, "learning_rate": 2.7258394557534103e-06, "loss": 0.78092849, "num_input_tokens_seen": 136284445, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.79296875, "step": 6351, "time_per_iteration": 2.57297945022583 }, { "auxiliary_loss_clip": 0.01135208, "auxiliary_loss_mlp": 0.01036157, "balance_loss_clip": 1.02005148, "balance_loss_mlp": 1.04457116, "epoch": 0.38190290094694124, "flos": 30443450382720.0, "grad_norm": 1.797043980294844, "language_loss": 0.74071002, "learning_rate": 2.725487380248756e-06, "loss": 0.76242363, "num_input_tokens_seen": 136305730, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.81640625, "step": 6352, "time_per_iteration": 2.618471384048462 }, { "auxiliary_loss_clip": 0.01143977, "auxiliary_loss_mlp": 0.01032382, "balance_loss_clip": 1.01847076, "balance_loss_mlp": 1.04014874, "epoch": 0.3819630241996092, "flos": 14209924936320.0, "grad_norm": 1.6876567020602447, "language_loss": 0.64475894, "learning_rate": 2.7251352788533237e-06, "loss": 0.6665225, "num_input_tokens_seen": 136323850, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76953125, "step": 6353, "time_per_iteration": 2.566870927810669 }, { "auxiliary_loss_clip": 0.01121347, "auxiliary_loss_mlp": 0.01032899, "balance_loss_clip": 1.01841545, "balance_loss_mlp": 1.04267263, "epoch": 0.38202314745227717, "flos": 25009699536000.0, "grad_norm": 1.6848725358107195, "language_loss": 0.83184212, "learning_rate": 2.7247831515796786e-06, "loss": 0.85338455, "num_input_tokens_seen": 136344880, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 6354, "time_per_iteration": 2.576026678085327 }, { "auxiliary_loss_clip": 0.01121067, "auxiliary_loss_mlp": 0.01033928, "balance_loss_clip": 1.0204215, "balance_loss_mlp": 1.04413295, "epoch": 0.38208327070494513, "flos": 20814507504000.0, "grad_norm": 1.6996315256769567, "language_loss": 0.80056262, "learning_rate": 2.7244309984403865e-06, "loss": 0.82211256, "num_input_tokens_seen": 136366060, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76953125, "step": 6355, "time_per_iteration": 2.5856919288635254 }, { "auxiliary_loss_clip": 0.01150572, "auxiliary_loss_mlp": 0.01036221, "balance_loss_clip": 1.02177906, "balance_loss_mlp": 1.04498267, "epoch": 0.3821433939576131, "flos": 22637727993600.0, "grad_norm": 1.8623475081659728, "language_loss": 0.75173551, "learning_rate": 2.7240788194480163e-06, "loss": 0.77360344, "num_input_tokens_seen": 136385625, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 6356, "time_per_iteration": 2.5581798553466797 }, { "auxiliary_loss_clip": 0.01131866, "auxiliary_loss_mlp": 0.01287318, "balance_loss_clip": 1.02539313, "balance_loss_mlp": 1.04436755, "epoch": 0.38220351721028106, "flos": 26869872142080.0, "grad_norm": 2.244842134688014, "language_loss": 0.81266123, "learning_rate": 2.7237266146151357e-06, "loss": 0.83685303, "num_input_tokens_seen": 136405750, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78515625, "step": 6357, "time_per_iteration": 2.6122653484344482 }, { "auxiliary_loss_clip": 0.01144668, "auxiliary_loss_mlp": 0.01043548, "balance_loss_clip": 1.02726364, "balance_loss_mlp": 1.04726553, "epoch": 0.38226364046294903, "flos": 23367468700800.0, "grad_norm": 1.7162937031471064, "language_loss": 0.77506572, "learning_rate": 2.7233743839543135e-06, "loss": 0.7969479, "num_input_tokens_seen": 136426085, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.80078125, "step": 6358, "time_per_iteration": 2.7168779373168945 }, { "auxiliary_loss_clip": 0.01141352, "auxiliary_loss_mlp": 0.01040061, "balance_loss_clip": 1.02484989, "balance_loss_mlp": 1.04369044, "epoch": 0.382323763715617, "flos": 19646225648640.0, "grad_norm": 2.0649976670716765, "language_loss": 0.79122788, "learning_rate": 2.7230221274781204e-06, "loss": 0.81304204, "num_input_tokens_seen": 136442670, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80078125, "step": 6359, "time_per_iteration": 2.6357364654541016 }, { "auxiliary_loss_clip": 0.01167223, "auxiliary_loss_mlp": 0.01038611, "balance_loss_clip": 1.02361465, "balance_loss_mlp": 1.04400718, "epoch": 0.38238388696828496, "flos": 54124741232640.0, "grad_norm": 2.1907180589617514, "language_loss": 0.69382578, "learning_rate": 2.722669845199127e-06, "loss": 0.71588409, "num_input_tokens_seen": 136465730, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.78125, "step": 6360, "time_per_iteration": 2.8971235752105713 }, { "auxiliary_loss_clip": 0.01140309, "auxiliary_loss_mlp": 0.01032829, "balance_loss_clip": 1.01743317, "balance_loss_mlp": 1.04462516, "epoch": 0.382444010220953, "flos": 24936190352640.0, "grad_norm": 1.493381607749724, "language_loss": 0.78780288, "learning_rate": 2.7223175371299062e-06, "loss": 0.80953431, "num_input_tokens_seen": 136487215, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.77734375, "step": 6361, "time_per_iteration": 2.7204267978668213 }, { "auxiliary_loss_clip": 0.0113658, "auxiliary_loss_mlp": 0.01038648, "balance_loss_clip": 1.02455139, "balance_loss_mlp": 1.04329491, "epoch": 0.38250413347362094, "flos": 42337351209600.0, "grad_norm": 1.440359718186609, "language_loss": 0.65476412, "learning_rate": 2.72196520328303e-06, "loss": 0.67651641, "num_input_tokens_seen": 136510365, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 6362, "time_per_iteration": 2.7280538082122803 }, { "auxiliary_loss_clip": 0.01119951, "auxiliary_loss_mlp": 0.0103583, "balance_loss_clip": 1.02010632, "balance_loss_mlp": 1.04150772, "epoch": 0.3825642567262889, "flos": 16289224462080.0, "grad_norm": 2.265325176877709, "language_loss": 0.81595123, "learning_rate": 2.7216128436710737e-06, "loss": 0.83750904, "num_input_tokens_seen": 136527100, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.78515625, "step": 6363, "time_per_iteration": 2.577402114868164 }, { "auxiliary_loss_clip": 0.01156916, "auxiliary_loss_mlp": 0.01043114, "balance_loss_clip": 1.02828383, "balance_loss_mlp": 1.04522574, "epoch": 0.3826243799789569, "flos": 45654778586880.0, "grad_norm": 1.654653775382962, "language_loss": 0.58754051, "learning_rate": 2.7212604583066107e-06, "loss": 0.60954082, "num_input_tokens_seen": 136550870, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76171875, "step": 6364, "time_per_iteration": 2.7935168743133545 }, { "auxiliary_loss_clip": 0.01143186, "auxiliary_loss_mlp": 0.01036052, "balance_loss_clip": 1.02082324, "balance_loss_mlp": 1.04498816, "epoch": 0.38268450323162484, "flos": 25301581453440.0, "grad_norm": 2.769134159793766, "language_loss": 0.68742073, "learning_rate": 2.7209080472022174e-06, "loss": 0.70921314, "num_input_tokens_seen": 136569895, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8046875, "step": 6365, "time_per_iteration": 2.6174418926239014 }, { "auxiliary_loss_clip": 0.01132737, "auxiliary_loss_mlp": 0.01036253, "balance_loss_clip": 1.02080333, "balance_loss_mlp": 1.04202938, "epoch": 0.3827446264842928, "flos": 21836022387840.0, "grad_norm": 1.659293406441215, "language_loss": 0.7324664, "learning_rate": 2.72055561037047e-06, "loss": 0.75415635, "num_input_tokens_seen": 136588585, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8125, "step": 6366, "time_per_iteration": 2.6172780990600586 }, { "auxiliary_loss_clip": 0.01140782, "auxiliary_loss_mlp": 0.01038187, "balance_loss_clip": 1.0226301, "balance_loss_mlp": 1.04398131, "epoch": 0.38280474973696077, "flos": 25734591907200.0, "grad_norm": 2.201578354152726, "language_loss": 0.68398482, "learning_rate": 2.720203147823947e-06, "loss": 0.70577455, "num_input_tokens_seen": 136606640, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.7890625, "step": 6367, "time_per_iteration": 2.620682716369629 }, { "auxiliary_loss_clip": 0.01127844, "auxiliary_loss_mlp": 0.01033738, "balance_loss_clip": 1.01875925, "balance_loss_mlp": 1.04284334, "epoch": 0.38286487298962874, "flos": 24895934184960.0, "grad_norm": 1.8667643900684598, "language_loss": 0.64100277, "learning_rate": 2.719850659575225e-06, "loss": 0.66261858, "num_input_tokens_seen": 136624940, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7578125, "step": 6368, "time_per_iteration": 2.613980770111084 }, { "auxiliary_loss_clip": 0.01123138, "auxiliary_loss_mlp": 0.0103429, "balance_loss_clip": 1.01957989, "balance_loss_mlp": 1.04420471, "epoch": 0.3829249962422967, "flos": 28543703967360.0, "grad_norm": 1.2835210840825468, "language_loss": 0.68329656, "learning_rate": 2.7194981456368857e-06, "loss": 0.70487082, "num_input_tokens_seen": 136645540, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 6369, "time_per_iteration": 2.573307752609253 }, { "auxiliary_loss_clip": 0.01155564, "auxiliary_loss_mlp": 0.01043607, "balance_loss_clip": 1.02832484, "balance_loss_mlp": 1.04262602, "epoch": 0.38298511949496467, "flos": 21471205904640.0, "grad_norm": 1.7872981143936508, "language_loss": 0.78467554, "learning_rate": 2.719145606021508e-06, "loss": 0.80666721, "num_input_tokens_seen": 136664530, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7734375, "step": 6370, "time_per_iteration": 3.9523491859436035 }, { "auxiliary_loss_clip": 0.01131481, "auxiliary_loss_mlp": 0.01041502, "balance_loss_clip": 1.02648735, "balance_loss_mlp": 1.04484057, "epoch": 0.38304524274763263, "flos": 31679998035840.0, "grad_norm": 2.35453841699749, "language_loss": 0.63984996, "learning_rate": 2.7187930407416738e-06, "loss": 0.66157979, "num_input_tokens_seen": 136682315, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7734375, "step": 6371, "time_per_iteration": 2.574016809463501 }, { "auxiliary_loss_clip": 0.01152052, "auxiliary_loss_mlp": 0.01039713, "balance_loss_clip": 1.02342856, "balance_loss_mlp": 1.04379046, "epoch": 0.3831053660003006, "flos": 25076816098560.0, "grad_norm": 1.8779836804333876, "language_loss": 0.72707313, "learning_rate": 2.7184404498099644e-06, "loss": 0.74899077, "num_input_tokens_seen": 136701185, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.81640625, "step": 6372, "time_per_iteration": 2.6493542194366455 }, { "auxiliary_loss_clip": 0.01132183, "auxiliary_loss_mlp": 0.01037866, "balance_loss_clip": 1.02249992, "balance_loss_mlp": 1.04332042, "epoch": 0.38316548925296856, "flos": 23259018562560.0, "grad_norm": 1.7104155971400172, "language_loss": 0.84330297, "learning_rate": 2.7180878332389638e-06, "loss": 0.86500347, "num_input_tokens_seen": 136721265, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.796875, "step": 6373, "time_per_iteration": 2.541349411010742 }, { "auxiliary_loss_clip": 0.01162542, "auxiliary_loss_mlp": 0.01040155, "balance_loss_clip": 1.02466929, "balance_loss_mlp": 1.04556334, "epoch": 0.3832256125056366, "flos": 34423465991040.0, "grad_norm": 1.868073333018617, "language_loss": 0.74909592, "learning_rate": 2.7177351910412553e-06, "loss": 0.77112287, "num_input_tokens_seen": 136741885, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8125, "step": 6374, "time_per_iteration": 2.735656499862671 }, { "auxiliary_loss_clip": 0.01133701, "auxiliary_loss_mlp": 0.01038621, "balance_loss_clip": 1.02350521, "balance_loss_mlp": 1.04384363, "epoch": 0.38328573575830455, "flos": 21762764599680.0, "grad_norm": 7.5221855929960455, "language_loss": 0.75964099, "learning_rate": 2.717382523229424e-06, "loss": 0.7813642, "num_input_tokens_seen": 136760905, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 6375, "time_per_iteration": 2.5550835132598877 }, { "auxiliary_loss_clip": 0.01133161, "auxiliary_loss_mlp": 0.01037342, "balance_loss_clip": 1.02304268, "balance_loss_mlp": 1.04462194, "epoch": 0.3833458590109725, "flos": 17380010724480.0, "grad_norm": 2.228191490317787, "language_loss": 0.72353637, "learning_rate": 2.7170298298160558e-06, "loss": 0.7452414, "num_input_tokens_seen": 136777240, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.80078125, "step": 6376, "time_per_iteration": 3.991205930709839 }, { "auxiliary_loss_clip": 0.01129763, "auxiliary_loss_mlp": 0.01037128, "balance_loss_clip": 1.02116561, "balance_loss_mlp": 1.04375219, "epoch": 0.3834059822636405, "flos": 29424557191680.0, "grad_norm": 1.5809055060473478, "language_loss": 0.67876369, "learning_rate": 2.7166771108137373e-06, "loss": 0.70043254, "num_input_tokens_seen": 136801040, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.76953125, "step": 6377, "time_per_iteration": 4.184863567352295 }, { "auxiliary_loss_clip": 0.01152021, "auxiliary_loss_mlp": 0.01041173, "balance_loss_clip": 1.02456152, "balance_loss_mlp": 1.04521561, "epoch": 0.38346610551630844, "flos": 21470739027840.0, "grad_norm": 3.385892617040118, "language_loss": 0.73018241, "learning_rate": 2.7163243662350574e-06, "loss": 0.75211442, "num_input_tokens_seen": 136819495, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8046875, "step": 6378, "time_per_iteration": 2.6539835929870605 }, { "auxiliary_loss_clip": 0.01160294, "auxiliary_loss_mlp": 0.0103859, "balance_loss_clip": 1.02378464, "balance_loss_mlp": 1.04368305, "epoch": 0.3835262287689764, "flos": 27561224188800.0, "grad_norm": 1.6547972660927897, "language_loss": 0.69097012, "learning_rate": 2.7159715960926025e-06, "loss": 0.71295893, "num_input_tokens_seen": 136838840, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 6379, "time_per_iteration": 2.6236789226531982 }, { "auxiliary_loss_clip": 0.01131167, "auxiliary_loss_mlp": 0.010327, "balance_loss_clip": 1.01747108, "balance_loss_mlp": 1.04472721, "epoch": 0.3835863520216444, "flos": 15523716787200.0, "grad_norm": 1.8131290067552144, "language_loss": 0.83157206, "learning_rate": 2.715618800398963e-06, "loss": 0.85321069, "num_input_tokens_seen": 136854425, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7734375, "step": 6380, "time_per_iteration": 2.587313413619995 }, { "auxiliary_loss_clip": 0.01120547, "auxiliary_loss_mlp": 0.01032436, "balance_loss_clip": 1.0175705, "balance_loss_mlp": 1.04484248, "epoch": 0.38364647527431234, "flos": 21904934630400.0, "grad_norm": 1.533415494379132, "language_loss": 0.8096332, "learning_rate": 2.7152659791667296e-06, "loss": 0.83116293, "num_input_tokens_seen": 136874355, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7578125, "step": 6381, "time_per_iteration": 4.239211559295654 }, { "auxiliary_loss_clip": 0.01052217, "auxiliary_loss_mlp": 0.01001748, "balance_loss_clip": 0.99979305, "balance_loss_mlp": 1.02717888, "epoch": 0.3837065985269803, "flos": 65534927558400.0, "grad_norm": 0.8001528266940094, "language_loss": 0.60561514, "learning_rate": 2.7149131324084925e-06, "loss": 0.62615478, "num_input_tokens_seen": 136937475, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.25, "step": 6382, "time_per_iteration": 3.106457471847534 }, { "auxiliary_loss_clip": 0.01123528, "auxiliary_loss_mlp": 0.01032959, "balance_loss_clip": 1.01778364, "balance_loss_mlp": 1.04281521, "epoch": 0.38376672177964827, "flos": 28256598558720.0, "grad_norm": 5.911281622110996, "language_loss": 0.66810048, "learning_rate": 2.714560260136846e-06, "loss": 0.68966532, "num_input_tokens_seen": 136955805, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80859375, "step": 6383, "time_per_iteration": 2.6833600997924805 }, { "auxiliary_loss_clip": 0.01151377, "auxiliary_loss_mlp": 0.01035194, "balance_loss_clip": 1.02037001, "balance_loss_mlp": 1.04538465, "epoch": 0.38382684503231623, "flos": 20631363033600.0, "grad_norm": 2.5254378458833506, "language_loss": 0.75094026, "learning_rate": 2.714207362364381e-06, "loss": 0.77280599, "num_input_tokens_seen": 136975240, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.79296875, "step": 6384, "time_per_iteration": 2.5721075534820557 }, { "auxiliary_loss_clip": 0.01131298, "auxiliary_loss_mlp": 0.01037004, "balance_loss_clip": 1.02219796, "balance_loss_mlp": 1.04499996, "epoch": 0.3838869682849842, "flos": 19605825826560.0, "grad_norm": 1.389058659165176, "language_loss": 0.7619046, "learning_rate": 2.7138544391036925e-06, "loss": 0.78358757, "num_input_tokens_seen": 136994985, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 6385, "time_per_iteration": 2.616507053375244 }, { "auxiliary_loss_clip": 0.01058389, "auxiliary_loss_mlp": 0.01002332, "balance_loss_clip": 1.00043702, "balance_loss_mlp": 1.02477932, "epoch": 0.38394709153765216, "flos": 56556110891520.0, "grad_norm": 0.9035733647218185, "language_loss": 0.66917181, "learning_rate": 2.7135014903673748e-06, "loss": 0.68977904, "num_input_tokens_seen": 137046290, "router_z_loss_clip": 0.0189209, "router_z_loss_mlp": 0.24707031, "step": 6386, "time_per_iteration": 3.024085760116577 }, { "auxiliary_loss_clip": 0.01147954, "auxiliary_loss_mlp": 0.0103425, "balance_loss_clip": 1.02035642, "balance_loss_mlp": 1.04507065, "epoch": 0.3840072147903202, "flos": 15888748752000.0, "grad_norm": 2.041512274315279, "language_loss": 0.72775841, "learning_rate": 2.713148516168025e-06, "loss": 0.7495805, "num_input_tokens_seen": 137064725, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 6387, "time_per_iteration": 2.6557157039642334 }, { "auxiliary_loss_clip": 0.01120574, "auxiliary_loss_mlp": 0.0103737, "balance_loss_clip": 1.02312481, "balance_loss_mlp": 1.04527974, "epoch": 0.38406733804298815, "flos": 28218030330240.0, "grad_norm": 1.5454946734959294, "language_loss": 0.8121599, "learning_rate": 2.712795516518239e-06, "loss": 0.83373934, "num_input_tokens_seen": 137086030, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75390625, "step": 6388, "time_per_iteration": 2.5681090354919434 }, { "auxiliary_loss_clip": 0.01135011, "auxiliary_loss_mlp": 0.01033405, "balance_loss_clip": 1.02017879, "balance_loss_mlp": 1.04123878, "epoch": 0.3841274612956561, "flos": 18223588609920.0, "grad_norm": 1.7576127147694895, "language_loss": 0.76298517, "learning_rate": 2.7124424914306143e-06, "loss": 0.78466928, "num_input_tokens_seen": 137105400, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75390625, "step": 6389, "time_per_iteration": 2.6339802742004395 }, { "auxiliary_loss_clip": 0.01144276, "auxiliary_loss_mlp": 0.01043941, "balance_loss_clip": 1.02803838, "balance_loss_mlp": 1.04615927, "epoch": 0.3841875845483241, "flos": 19792884879360.0, "grad_norm": 1.5442827921821665, "language_loss": 0.76172489, "learning_rate": 2.71208944091775e-06, "loss": 0.78360707, "num_input_tokens_seen": 137124985, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8046875, "step": 6390, "time_per_iteration": 2.5682754516601562 }, { "auxiliary_loss_clip": 0.01160483, "auxiliary_loss_mlp": 0.01043998, "balance_loss_clip": 1.02852499, "balance_loss_mlp": 1.04376304, "epoch": 0.38424770780099204, "flos": 29898829393920.0, "grad_norm": 1.7190987578129286, "language_loss": 0.69636893, "learning_rate": 2.7117363649922453e-06, "loss": 0.71841371, "num_input_tokens_seen": 137146745, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8125, "step": 6391, "time_per_iteration": 2.726961135864258 }, { "auxiliary_loss_clip": 0.01131204, "auxiliary_loss_mlp": 0.01036841, "balance_loss_clip": 1.02216661, "balance_loss_mlp": 1.04351628, "epoch": 0.38430783105366, "flos": 20813717404800.0, "grad_norm": 1.9025040215990683, "language_loss": 0.83949578, "learning_rate": 2.7113832636667e-06, "loss": 0.86117625, "num_input_tokens_seen": 137163195, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 6392, "time_per_iteration": 2.599543333053589 }, { "auxiliary_loss_clip": 0.01131985, "auxiliary_loss_mlp": 0.01034213, "balance_loss_clip": 1.02025962, "balance_loss_mlp": 1.04468155, "epoch": 0.384367954306328, "flos": 10998577399680.0, "grad_norm": 2.26759519132598, "language_loss": 0.60674787, "learning_rate": 2.7110301369537168e-06, "loss": 0.6284098, "num_input_tokens_seen": 137179330, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 6393, "time_per_iteration": 2.5429141521453857 }, { "auxiliary_loss_clip": 0.01154329, "auxiliary_loss_mlp": 0.01034389, "balance_loss_clip": 1.01805687, "balance_loss_mlp": 1.04430437, "epoch": 0.38442807755899594, "flos": 25338030779520.0, "grad_norm": 1.6832257429816286, "language_loss": 0.71102226, "learning_rate": 2.7106769848658965e-06, "loss": 0.73290938, "num_input_tokens_seen": 137198655, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8359375, "step": 6394, "time_per_iteration": 2.7090606689453125 }, { "auxiliary_loss_clip": 0.01137058, "auxiliary_loss_mlp": 0.01035887, "balance_loss_clip": 1.02017498, "balance_loss_mlp": 1.04564214, "epoch": 0.3844882008116639, "flos": 21069760527360.0, "grad_norm": 2.2691913526873217, "language_loss": 0.81154215, "learning_rate": 2.710323807415843e-06, "loss": 0.83327162, "num_input_tokens_seen": 137217120, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.82421875, "step": 6395, "time_per_iteration": 2.5928280353546143 }, { "auxiliary_loss_clip": 0.01135668, "auxiliary_loss_mlp": 0.01042798, "balance_loss_clip": 1.0284332, "balance_loss_mlp": 1.04663014, "epoch": 0.38454832406433187, "flos": 17963235855360.0, "grad_norm": 2.0157990176664304, "language_loss": 0.70855427, "learning_rate": 2.7099706046161593e-06, "loss": 0.73033893, "num_input_tokens_seen": 137234410, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.796875, "step": 6396, "time_per_iteration": 2.5796666145324707 }, { "auxiliary_loss_clip": 0.01053374, "auxiliary_loss_mlp": 0.01009831, "balance_loss_clip": 1.00795901, "balance_loss_mlp": 1.02038407, "epoch": 0.38460844731699984, "flos": 67924999555200.0, "grad_norm": 0.8879006870617273, "language_loss": 0.59714657, "learning_rate": 2.7096173764794514e-06, "loss": 0.6177786, "num_input_tokens_seen": 137294940, "router_z_loss_clip": 0.01867676, "router_z_loss_mlp": 0.24414062, "step": 6397, "time_per_iteration": 3.218963146209717 }, { "auxiliary_loss_clip": 0.01123358, "auxiliary_loss_mlp": 0.01032113, "balance_loss_clip": 1.01700974, "balance_loss_mlp": 1.04581368, "epoch": 0.3846685705696678, "flos": 25849075530240.0, "grad_norm": 1.6996578247831644, "language_loss": 0.84602284, "learning_rate": 2.7092641230183243e-06, "loss": 0.86757755, "num_input_tokens_seen": 137315035, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7734375, "step": 6398, "time_per_iteration": 2.7209880352020264 }, { "auxiliary_loss_clip": 0.01121949, "auxiliary_loss_mlp": 0.01029192, "balance_loss_clip": 1.01512563, "balance_loss_mlp": 1.04365432, "epoch": 0.38472869382233577, "flos": 16290194129280.0, "grad_norm": 2.111713825390849, "language_loss": 0.7979176, "learning_rate": 2.7089108442453854e-06, "loss": 0.81942904, "num_input_tokens_seen": 137333155, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78515625, "step": 6399, "time_per_iteration": 2.5027670860290527 }, { "auxiliary_loss_clip": 0.01125511, "auxiliary_loss_mlp": 0.01280745, "balance_loss_clip": 1.01835322, "balance_loss_mlp": 1.04558647, "epoch": 0.38478881707500373, "flos": 19353122668800.0, "grad_norm": 1.9337988429795705, "language_loss": 0.66825259, "learning_rate": 2.7085575401732423e-06, "loss": 0.69231516, "num_input_tokens_seen": 137351515, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80078125, "step": 6400, "time_per_iteration": 2.5982329845428467 }, { "auxiliary_loss_clip": 0.01145932, "auxiliary_loss_mlp": 0.01043016, "balance_loss_clip": 1.02807951, "balance_loss_mlp": 1.04750299, "epoch": 0.38484894032767175, "flos": 24860849575680.0, "grad_norm": 1.7727024182641506, "language_loss": 0.73495728, "learning_rate": 2.708204210814503e-06, "loss": 0.75684679, "num_input_tokens_seen": 137371255, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8046875, "step": 6401, "time_per_iteration": 2.5731852054595947 }, { "auxiliary_loss_clip": 0.01151302, "auxiliary_loss_mlp": 0.01036365, "balance_loss_clip": 1.02124977, "balance_loss_mlp": 1.04499269, "epoch": 0.3849090635803397, "flos": 14501806853760.0, "grad_norm": 3.0209384831666477, "language_loss": 0.7225095, "learning_rate": 2.707850856181777e-06, "loss": 0.7443862, "num_input_tokens_seen": 137388980, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.796875, "step": 6402, "time_per_iteration": 2.6586217880249023 }, { "auxiliary_loss_clip": 0.01119409, "auxiliary_loss_mlp": 0.01032751, "balance_loss_clip": 1.01805234, "balance_loss_mlp": 1.04306889, "epoch": 0.3849691868330077, "flos": 18515865576960.0, "grad_norm": 1.8332334426595314, "language_loss": 0.82709408, "learning_rate": 2.707497476287675e-06, "loss": 0.84861565, "num_input_tokens_seen": 137406885, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.76171875, "step": 6403, "time_per_iteration": 2.4928290843963623 }, { "auxiliary_loss_clip": 0.01158382, "auxiliary_loss_mlp": 0.01034131, "balance_loss_clip": 1.01858032, "balance_loss_mlp": 1.0455358, "epoch": 0.38502931008567565, "flos": 21616392677760.0, "grad_norm": 2.150141753480042, "language_loss": 0.83010674, "learning_rate": 2.7071440711448077e-06, "loss": 0.85203195, "num_input_tokens_seen": 137425535, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.76953125, "step": 6404, "time_per_iteration": 2.6193363666534424 }, { "auxiliary_loss_clip": 0.01142448, "auxiliary_loss_mlp": 0.01035214, "balance_loss_clip": 1.02133203, "balance_loss_mlp": 1.04362202, "epoch": 0.3850894333383436, "flos": 25415346804480.0, "grad_norm": 1.477456780579932, "language_loss": 0.69268185, "learning_rate": 2.7067906407657877e-06, "loss": 0.71445847, "num_input_tokens_seen": 137447700, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8046875, "step": 6405, "time_per_iteration": 2.7459075450897217 }, { "auxiliary_loss_clip": 0.01137802, "auxiliary_loss_mlp": 0.01034534, "balance_loss_clip": 1.02095628, "balance_loss_mlp": 1.04456317, "epoch": 0.3851495565910116, "flos": 20227870581120.0, "grad_norm": 2.077483405221418, "language_loss": 0.79164529, "learning_rate": 2.706437185163228e-06, "loss": 0.81336862, "num_input_tokens_seen": 137462245, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75390625, "step": 6406, "time_per_iteration": 2.603403091430664 }, { "auxiliary_loss_clip": 0.01135293, "auxiliary_loss_mlp": 0.01037688, "balance_loss_clip": 1.02221489, "balance_loss_mlp": 1.04747367, "epoch": 0.38520967984367954, "flos": 16508459122560.0, "grad_norm": 2.5813916319554644, "language_loss": 0.84062076, "learning_rate": 2.7060837043497416e-06, "loss": 0.86235058, "num_input_tokens_seen": 137476455, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.7890625, "step": 6407, "time_per_iteration": 2.510725498199463 }, { "auxiliary_loss_clip": 0.01052092, "auxiliary_loss_mlp": 0.01001529, "balance_loss_clip": 0.99952585, "balance_loss_mlp": 1.01920831, "epoch": 0.3852698030963475, "flos": 61313772971520.0, "grad_norm": 0.83132327585077, "language_loss": 0.64883745, "learning_rate": 2.7057301983379452e-06, "loss": 0.66937363, "num_input_tokens_seen": 137539845, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.24414062, "step": 6408, "time_per_iteration": 3.2436320781707764 }, { "auxiliary_loss_clip": 0.01140406, "auxiliary_loss_mlp": 0.01038726, "balance_loss_clip": 1.02279985, "balance_loss_mlp": 1.04309487, "epoch": 0.3853299263490155, "flos": 22893016930560.0, "grad_norm": 1.7915571573352145, "language_loss": 0.73758006, "learning_rate": 2.705376667140452e-06, "loss": 0.7593714, "num_input_tokens_seen": 137559880, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.79296875, "step": 6409, "time_per_iteration": 2.6282591819763184 }, { "auxiliary_loss_clip": 0.01167219, "auxiliary_loss_mlp": 0.01046953, "balance_loss_clip": 1.0300734, "balance_loss_mlp": 1.04636097, "epoch": 0.38539004960168344, "flos": 20047491457920.0, "grad_norm": 2.168368957909465, "language_loss": 0.70367682, "learning_rate": 2.705023110769881e-06, "loss": 0.72581851, "num_input_tokens_seen": 137578225, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8515625, "step": 6410, "time_per_iteration": 2.556264638900757 }, { "auxiliary_loss_clip": 0.01077054, "auxiliary_loss_mlp": 0.01249401, "balance_loss_clip": 1.00123441, "balance_loss_mlp": 1.01763344, "epoch": 0.3854501728543514, "flos": 68730691570560.0, "grad_norm": 0.7000743968030717, "language_loss": 0.60315269, "learning_rate": 2.7046695292388485e-06, "loss": 0.62641728, "num_input_tokens_seen": 137645770, "router_z_loss_clip": 0.01647949, "router_z_loss_mlp": 0.24414062, "step": 6411, "time_per_iteration": 4.750910520553589 }, { "auxiliary_loss_clip": 0.0115654, "auxiliary_loss_mlp": 0.01030477, "balance_loss_clip": 1.01648188, "balance_loss_mlp": 1.04301429, "epoch": 0.38551029610701937, "flos": 20485027025280.0, "grad_norm": 2.2397697805844077, "language_loss": 0.77667332, "learning_rate": 2.7043159225599727e-06, "loss": 0.79854345, "num_input_tokens_seen": 137664090, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 6412, "time_per_iteration": 2.6530187129974365 }, { "auxiliary_loss_clip": 0.01151814, "auxiliary_loss_mlp": 0.01037754, "balance_loss_clip": 1.02113652, "balance_loss_mlp": 1.04472184, "epoch": 0.38557041935968733, "flos": 23471788775040.0, "grad_norm": 1.7396711971170813, "language_loss": 0.78026545, "learning_rate": 2.703962290745874e-06, "loss": 0.8021611, "num_input_tokens_seen": 137683190, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8046875, "step": 6413, "time_per_iteration": 2.582798957824707 }, { "auxiliary_loss_clip": 0.01041661, "auxiliary_loss_mlp": 0.01002348, "balance_loss_clip": 1.0005126, "balance_loss_mlp": 1.01725817, "epoch": 0.38563054261235535, "flos": 63966636869760.0, "grad_norm": 0.8015595224111998, "language_loss": 0.6128217, "learning_rate": 2.703608633809171e-06, "loss": 0.6332618, "num_input_tokens_seen": 137737315, "router_z_loss_clip": 0.01831055, "router_z_loss_mlp": 0.24414062, "step": 6414, "time_per_iteration": 3.0466110706329346 }, { "auxiliary_loss_clip": 0.01153187, "auxiliary_loss_mlp": 0.01040614, "balance_loss_clip": 1.02552176, "balance_loss_mlp": 1.04569101, "epoch": 0.3856906658650233, "flos": 23987789602560.0, "grad_norm": 2.2737096959376095, "language_loss": 0.77485049, "learning_rate": 2.7032549517624865e-06, "loss": 0.79678845, "num_input_tokens_seen": 137753535, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 6415, "time_per_iteration": 2.6114144325256348 }, { "auxiliary_loss_clip": 0.01135733, "auxiliary_loss_mlp": 0.01035109, "balance_loss_clip": 1.0214777, "balance_loss_mlp": 1.0449934, "epoch": 0.3857507891176913, "flos": 25007436979200.0, "grad_norm": 1.6083827971271212, "language_loss": 0.79753965, "learning_rate": 2.702901244618442e-06, "loss": 0.81924808, "num_input_tokens_seen": 137773405, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 6416, "time_per_iteration": 2.8238627910614014 }, { "auxiliary_loss_clip": 0.01130981, "auxiliary_loss_mlp": 0.01283424, "balance_loss_clip": 1.02266967, "balance_loss_mlp": 1.04431307, "epoch": 0.38581091237035925, "flos": 21536778182400.0, "grad_norm": 1.6424545404284479, "language_loss": 0.79028392, "learning_rate": 2.7025475123896597e-06, "loss": 0.81442797, "num_input_tokens_seen": 137790810, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7734375, "step": 6417, "time_per_iteration": 3.994013547897339 }, { "auxiliary_loss_clip": 0.01136364, "auxiliary_loss_mlp": 0.0103741, "balance_loss_clip": 1.02385592, "balance_loss_mlp": 1.04065871, "epoch": 0.3858710356230272, "flos": 17383889393280.0, "grad_norm": 2.6604410390953532, "language_loss": 0.79765379, "learning_rate": 2.702193755088764e-06, "loss": 0.81939161, "num_input_tokens_seen": 137810265, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.78125, "step": 6418, "time_per_iteration": 2.5470097064971924 }, { "auxiliary_loss_clip": 0.01136531, "auxiliary_loss_mlp": 0.01033849, "balance_loss_clip": 1.02041984, "balance_loss_mlp": 1.04291868, "epoch": 0.3859311588756952, "flos": 20339588856960.0, "grad_norm": 1.8021231680196166, "language_loss": 0.79392457, "learning_rate": 2.701839972728379e-06, "loss": 0.81562829, "num_input_tokens_seen": 137828580, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 6419, "time_per_iteration": 4.09609580039978 }, { "auxiliary_loss_clip": 0.01135807, "auxiliary_loss_mlp": 0.01033391, "balance_loss_clip": 1.01804876, "balance_loss_mlp": 1.04316652, "epoch": 0.38599128212836314, "flos": 26321157002880.0, "grad_norm": 2.2001772592363658, "language_loss": 0.67529404, "learning_rate": 2.7014861653211314e-06, "loss": 0.69698608, "num_input_tokens_seen": 137846145, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.75, "step": 6420, "time_per_iteration": 2.599510669708252 }, { "auxiliary_loss_clip": 0.01146416, "auxiliary_loss_mlp": 0.01040456, "balance_loss_clip": 1.0272181, "balance_loss_mlp": 1.04525423, "epoch": 0.3860514053810311, "flos": 13553837066880.0, "grad_norm": 2.0151541358121983, "language_loss": 0.81018692, "learning_rate": 2.701132332879646e-06, "loss": 0.83205563, "num_input_tokens_seen": 137863705, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 6421, "time_per_iteration": 2.624969244003296 }, { "auxiliary_loss_clip": 0.01138263, "auxiliary_loss_mlp": 0.01032827, "balance_loss_clip": 1.01832497, "balance_loss_mlp": 1.04274666, "epoch": 0.3861115286336991, "flos": 20954271323520.0, "grad_norm": 2.0021924803273174, "language_loss": 0.71621072, "learning_rate": 2.700778475416552e-06, "loss": 0.73792166, "num_input_tokens_seen": 137880285, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78125, "step": 6422, "time_per_iteration": 2.5887022018432617 }, { "auxiliary_loss_clip": 0.01136319, "auxiliary_loss_mlp": 0.01039383, "balance_loss_clip": 1.02537012, "balance_loss_mlp": 1.0443579, "epoch": 0.38617165188636704, "flos": 16362697731840.0, "grad_norm": 1.7266860108152637, "language_loss": 0.66178942, "learning_rate": 2.7004245929444776e-06, "loss": 0.68354636, "num_input_tokens_seen": 137898335, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.74609375, "step": 6423, "time_per_iteration": 4.048479795455933 }, { "auxiliary_loss_clip": 0.0113228, "auxiliary_loss_mlp": 0.0104204, "balance_loss_clip": 1.02827144, "balance_loss_mlp": 1.04555464, "epoch": 0.386231775139035, "flos": 34787276893440.0, "grad_norm": 1.7695093210027053, "language_loss": 0.69413739, "learning_rate": 2.7000706854760504e-06, "loss": 0.71588057, "num_input_tokens_seen": 137918605, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.77734375, "step": 6424, "time_per_iteration": 2.7366783618927 }, { "auxiliary_loss_clip": 0.01146828, "auxiliary_loss_mlp": 0.01038359, "balance_loss_clip": 1.02348733, "balance_loss_mlp": 1.04510617, "epoch": 0.38629189839170297, "flos": 21726171619200.0, "grad_norm": 1.4788748987391767, "language_loss": 0.72455788, "learning_rate": 2.699716753023901e-06, "loss": 0.74640971, "num_input_tokens_seen": 137938245, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.75, "step": 6425, "time_per_iteration": 2.611098051071167 }, { "auxiliary_loss_clip": 0.01148395, "auxiliary_loss_mlp": 0.012865, "balance_loss_clip": 1.02506828, "balance_loss_mlp": 1.04233718, "epoch": 0.38635202164437094, "flos": 27923634460800.0, "grad_norm": 2.77340022157422, "language_loss": 0.80728632, "learning_rate": 2.69936279560066e-06, "loss": 0.8316353, "num_input_tokens_seen": 137956770, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 6426, "time_per_iteration": 2.729641914367676 }, { "auxiliary_loss_clip": 0.01133115, "auxiliary_loss_mlp": 0.01035621, "balance_loss_clip": 1.02125001, "balance_loss_mlp": 1.04660654, "epoch": 0.38641214489703896, "flos": 23586631534080.0, "grad_norm": 2.0851491023015782, "language_loss": 0.74871767, "learning_rate": 2.699008813218961e-06, "loss": 0.77040499, "num_input_tokens_seen": 137977040, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.77734375, "step": 6427, "time_per_iteration": 2.6384732723236084 }, { "auxiliary_loss_clip": 0.01156688, "auxiliary_loss_mlp": 0.01039676, "balance_loss_clip": 1.02537084, "balance_loss_mlp": 1.04363692, "epoch": 0.3864722681497069, "flos": 12641239198080.0, "grad_norm": 1.7762545524325095, "language_loss": 0.69951087, "learning_rate": 2.698654805891435e-06, "loss": 0.72147453, "num_input_tokens_seen": 137993545, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 6428, "time_per_iteration": 2.6337380409240723 }, { "auxiliary_loss_clip": 0.01139745, "auxiliary_loss_mlp": 0.01040958, "balance_loss_clip": 1.02705812, "balance_loss_mlp": 1.04374039, "epoch": 0.3865323914023749, "flos": 17598922162560.0, "grad_norm": 2.0459198299042147, "language_loss": 0.84405529, "learning_rate": 2.6983007736307158e-06, "loss": 0.86586237, "num_input_tokens_seen": 138010140, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 6429, "time_per_iteration": 2.5657191276550293 }, { "auxiliary_loss_clip": 0.01142683, "auxiliary_loss_mlp": 0.0103947, "balance_loss_clip": 1.0254209, "balance_loss_mlp": 1.04641676, "epoch": 0.38659251465504285, "flos": 18478949374080.0, "grad_norm": 1.8454786399714662, "language_loss": 0.80605215, "learning_rate": 2.6979467164494387e-06, "loss": 0.82787371, "num_input_tokens_seen": 138028880, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 6430, "time_per_iteration": 2.627987861633301 }, { "auxiliary_loss_clip": 0.01069273, "auxiliary_loss_mlp": 0.01001707, "balance_loss_clip": 1.00003767, "balance_loss_mlp": 1.01838827, "epoch": 0.3866526379077108, "flos": 64165726978560.0, "grad_norm": 0.7111276826700039, "language_loss": 0.58810353, "learning_rate": 2.697592634360238e-06, "loss": 0.60881329, "num_input_tokens_seen": 138098090, "router_z_loss_clip": 0.01672363, "router_z_loss_mlp": 0.24609375, "step": 6431, "time_per_iteration": 3.258878707885742 }, { "auxiliary_loss_clip": 0.01170208, "auxiliary_loss_mlp": 0.0103995, "balance_loss_clip": 1.02429736, "balance_loss_mlp": 1.04455543, "epoch": 0.3867127611603788, "flos": 14388292897920.0, "grad_norm": 1.915757905652992, "language_loss": 0.79352891, "learning_rate": 2.6972385273757513e-06, "loss": 0.81563044, "num_input_tokens_seen": 138114735, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.80859375, "step": 6432, "time_per_iteration": 2.647702217102051 }, { "auxiliary_loss_clip": 0.01133602, "auxiliary_loss_mlp": 0.01043744, "balance_loss_clip": 1.02827048, "balance_loss_mlp": 1.04256308, "epoch": 0.38677288441304675, "flos": 20010754823040.0, "grad_norm": 1.9394868202840418, "language_loss": 0.80552793, "learning_rate": 2.6968843955086155e-06, "loss": 0.82730138, "num_input_tokens_seen": 138130480, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8203125, "step": 6433, "time_per_iteration": 2.630296230316162 }, { "auxiliary_loss_clip": 0.01142688, "auxiliary_loss_mlp": 0.01035264, "balance_loss_clip": 1.01976657, "balance_loss_mlp": 1.04532576, "epoch": 0.3868330076657147, "flos": 22236893147520.0, "grad_norm": 1.5802173018273178, "language_loss": 0.70213997, "learning_rate": 2.696530238771467e-06, "loss": 0.72391951, "num_input_tokens_seen": 138150640, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.796875, "step": 6434, "time_per_iteration": 2.56555438041687 }, { "auxiliary_loss_clip": 0.01152853, "auxiliary_loss_mlp": 0.01037617, "balance_loss_clip": 1.02242923, "balance_loss_mlp": 1.04286456, "epoch": 0.3868931309183827, "flos": 16727442387840.0, "grad_norm": 1.728214783460934, "language_loss": 0.77218509, "learning_rate": 2.696176057176947e-06, "loss": 0.79408979, "num_input_tokens_seen": 138169700, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83203125, "step": 6435, "time_per_iteration": 2.600309133529663 }, { "auxiliary_loss_clip": 0.01128795, "auxiliary_loss_mlp": 0.01033607, "balance_loss_clip": 1.01920092, "balance_loss_mlp": 1.04330862, "epoch": 0.38695325417105064, "flos": 22674716023680.0, "grad_norm": 2.016655309863449, "language_loss": 0.79717267, "learning_rate": 2.6958218507376936e-06, "loss": 0.81879663, "num_input_tokens_seen": 138185835, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 6436, "time_per_iteration": 2.5889811515808105 }, { "auxiliary_loss_clip": 0.01137778, "auxiliary_loss_mlp": 0.01032359, "balance_loss_clip": 1.018507, "balance_loss_mlp": 1.04336882, "epoch": 0.3870133774237186, "flos": 23112036109440.0, "grad_norm": 2.4258909971065634, "language_loss": 0.76191515, "learning_rate": 2.6954676194663486e-06, "loss": 0.78361654, "num_input_tokens_seen": 138204080, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 6437, "time_per_iteration": 2.62860107421875 }, { "auxiliary_loss_clip": 0.011409, "auxiliary_loss_mlp": 0.01043051, "balance_loss_clip": 1.02916956, "balance_loss_mlp": 1.04514885, "epoch": 0.3870735006763866, "flos": 17675699483520.0, "grad_norm": 2.1235095483900728, "language_loss": 0.81820506, "learning_rate": 2.6951133633755538e-06, "loss": 0.84004456, "num_input_tokens_seen": 138220710, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 6438, "time_per_iteration": 2.6226966381073 }, { "auxiliary_loss_clip": 0.01141558, "auxiliary_loss_mlp": 0.01282868, "balance_loss_clip": 1.02090991, "balance_loss_mlp": 1.04428279, "epoch": 0.38713362392905454, "flos": 23295791111040.0, "grad_norm": 1.6300869759387169, "language_loss": 0.755077, "learning_rate": 2.6947590824779502e-06, "loss": 0.77932131, "num_input_tokens_seen": 138241720, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.79296875, "step": 6439, "time_per_iteration": 2.6519346237182617 }, { "auxiliary_loss_clip": 0.01118939, "auxiliary_loss_mlp": 0.01036103, "balance_loss_clip": 1.02195859, "balance_loss_mlp": 1.04373169, "epoch": 0.38719374718172256, "flos": 21031192298880.0, "grad_norm": 1.40739768120992, "language_loss": 0.73532426, "learning_rate": 2.694404776786182e-06, "loss": 0.75687474, "num_input_tokens_seen": 138261885, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75390625, "step": 6440, "time_per_iteration": 2.654313325881958 }, { "auxiliary_loss_clip": 0.01132025, "auxiliary_loss_mlp": 0.01039096, "balance_loss_clip": 1.02409387, "balance_loss_mlp": 1.04297137, "epoch": 0.3872538704343905, "flos": 19609776322560.0, "grad_norm": 1.8724751580341201, "language_loss": 0.81623268, "learning_rate": 2.6940504463128933e-06, "loss": 0.83794391, "num_input_tokens_seen": 138280255, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 6441, "time_per_iteration": 2.6628470420837402 }, { "auxiliary_loss_clip": 0.0112268, "auxiliary_loss_mlp": 0.01044613, "balance_loss_clip": 1.0306772, "balance_loss_mlp": 1.04489231, "epoch": 0.3873139936870585, "flos": 17530045833600.0, "grad_norm": 2.38391548518033, "language_loss": 0.80352223, "learning_rate": 2.6936960910707307e-06, "loss": 0.82519513, "num_input_tokens_seen": 138296675, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.77734375, "step": 6442, "time_per_iteration": 2.5878663063049316 }, { "auxiliary_loss_clip": 0.0114807, "auxiliary_loss_mlp": 0.010347, "balance_loss_clip": 1.01997733, "balance_loss_mlp": 1.04294944, "epoch": 0.38737411693972645, "flos": 17786555832960.0, "grad_norm": 1.6254948038652852, "language_loss": 0.83511329, "learning_rate": 2.693341711072338e-06, "loss": 0.85694098, "num_input_tokens_seen": 138314985, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78515625, "step": 6443, "time_per_iteration": 2.6225483417510986 }, { "auxiliary_loss_clip": 0.01062575, "auxiliary_loss_mlp": 0.01002824, "balance_loss_clip": 1.00119138, "balance_loss_mlp": 1.02110076, "epoch": 0.3874342401923944, "flos": 58304637048960.0, "grad_norm": 0.7715415814837344, "language_loss": 0.50215411, "learning_rate": 2.6929873063303634e-06, "loss": 0.52280807, "num_input_tokens_seen": 138373275, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.24414062, "step": 6444, "time_per_iteration": 3.1387405395507812 }, { "auxiliary_loss_clip": 0.01145736, "auxiliary_loss_mlp": 0.01281814, "balance_loss_clip": 1.02114165, "balance_loss_mlp": 1.04461575, "epoch": 0.3874943634450624, "flos": 17711933328000.0, "grad_norm": 1.4651180945946214, "language_loss": 0.78625047, "learning_rate": 2.6926328768574545e-06, "loss": 0.81052601, "num_input_tokens_seen": 138391145, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 6445, "time_per_iteration": 2.536451578140259 }, { "auxiliary_loss_clip": 0.01130281, "auxiliary_loss_mlp": 0.01036444, "balance_loss_clip": 1.02244925, "balance_loss_mlp": 1.04557312, "epoch": 0.38755448669773035, "flos": 19244852098560.0, "grad_norm": 4.910135153430654, "language_loss": 0.80772185, "learning_rate": 2.6922784226662595e-06, "loss": 0.82938904, "num_input_tokens_seen": 138409875, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 6446, "time_per_iteration": 2.546633720397949 }, { "auxiliary_loss_clip": 0.01139081, "auxiliary_loss_mlp": 0.0103968, "balance_loss_clip": 1.02611363, "balance_loss_mlp": 1.04435372, "epoch": 0.3876146099503983, "flos": 20594267262720.0, "grad_norm": 1.6467594152282976, "language_loss": 0.76980937, "learning_rate": 2.6919239437694288e-06, "loss": 0.79159695, "num_input_tokens_seen": 138428965, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76953125, "step": 6447, "time_per_iteration": 2.5545170307159424 }, { "auxiliary_loss_clip": 0.01139066, "auxiliary_loss_mlp": 0.0103512, "balance_loss_clip": 1.02155375, "balance_loss_mlp": 1.04497468, "epoch": 0.3876747332030663, "flos": 19281121856640.0, "grad_norm": 1.8152006886301117, "language_loss": 0.75959188, "learning_rate": 2.691569440179612e-06, "loss": 0.78133374, "num_input_tokens_seen": 138448090, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76171875, "step": 6448, "time_per_iteration": 2.587822675704956 }, { "auxiliary_loss_clip": 0.01135977, "auxiliary_loss_mlp": 0.01034767, "balance_loss_clip": 1.0204978, "balance_loss_mlp": 1.04348302, "epoch": 0.38773485645573424, "flos": 18945895201920.0, "grad_norm": 1.5787296005936657, "language_loss": 0.75835788, "learning_rate": 2.691214911909461e-06, "loss": 0.7800653, "num_input_tokens_seen": 138466105, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.74609375, "step": 6449, "time_per_iteration": 2.624051094055176 }, { "auxiliary_loss_clip": 0.01148908, "auxiliary_loss_mlp": 0.01283308, "balance_loss_clip": 1.02145672, "balance_loss_mlp": 1.04263103, "epoch": 0.3877949797084022, "flos": 23071348978560.0, "grad_norm": 1.6606689303439879, "language_loss": 0.77996135, "learning_rate": 2.690860358971628e-06, "loss": 0.8042835, "num_input_tokens_seen": 138485160, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 6450, "time_per_iteration": 2.5757360458374023 }, { "auxiliary_loss_clip": 0.01124269, "auxiliary_loss_mlp": 0.01035685, "balance_loss_clip": 1.02127218, "balance_loss_mlp": 1.04277062, "epoch": 0.3878551029610702, "flos": 29095543589760.0, "grad_norm": 2.7905120002824786, "language_loss": 0.77839386, "learning_rate": 2.690505781378766e-06, "loss": 0.7999934, "num_input_tokens_seen": 138504135, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8125, "step": 6451, "time_per_iteration": 2.5690605640411377 }, { "auxiliary_loss_clip": 0.01116607, "auxiliary_loss_mlp": 0.01030422, "balance_loss_clip": 1.01698685, "balance_loss_mlp": 1.04256523, "epoch": 0.38791522621373814, "flos": 20996394998400.0, "grad_norm": 1.581214823389433, "language_loss": 0.76547062, "learning_rate": 2.6901511791435286e-06, "loss": 0.78694093, "num_input_tokens_seen": 138523955, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 6452, "time_per_iteration": 2.757071018218994 }, { "auxiliary_loss_clip": 0.01140621, "auxiliary_loss_mlp": 0.01042254, "balance_loss_clip": 1.02885461, "balance_loss_mlp": 1.04618263, "epoch": 0.3879753494664061, "flos": 15486836497920.0, "grad_norm": 1.6227624291648273, "language_loss": 0.7952112, "learning_rate": 2.689796552278571e-06, "loss": 0.81703991, "num_input_tokens_seen": 138541655, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.76171875, "step": 6453, "time_per_iteration": 3.9889254570007324 }, { "auxiliary_loss_clip": 0.01136722, "auxiliary_loss_mlp": 0.01037487, "balance_loss_clip": 1.0226692, "balance_loss_mlp": 1.04656219, "epoch": 0.3880354727190741, "flos": 22053964158720.0, "grad_norm": 1.61989223672938, "language_loss": 0.71253788, "learning_rate": 2.689441900796549e-06, "loss": 0.73427993, "num_input_tokens_seen": 138560860, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 6454, "time_per_iteration": 2.5591983795166016 }, { "auxiliary_loss_clip": 0.01124746, "auxiliary_loss_mlp": 0.01039251, "balance_loss_clip": 1.02423048, "balance_loss_mlp": 1.04482329, "epoch": 0.3880955959717421, "flos": 20340307128960.0, "grad_norm": 1.607067987679653, "language_loss": 0.77276033, "learning_rate": 2.689087224710119e-06, "loss": 0.79440033, "num_input_tokens_seen": 138580200, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 6455, "time_per_iteration": 2.5955350399017334 }, { "auxiliary_loss_clip": 0.0111935, "auxiliary_loss_mlp": 0.01037039, "balance_loss_clip": 1.02363992, "balance_loss_mlp": 1.04375648, "epoch": 0.38815571922441006, "flos": 23075407215360.0, "grad_norm": 1.5317563561489018, "language_loss": 0.75971943, "learning_rate": 2.688732524031938e-06, "loss": 0.78128332, "num_input_tokens_seen": 138598315, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7578125, "step": 6456, "time_per_iteration": 2.5813910961151123 }, { "auxiliary_loss_clip": 0.01168295, "auxiliary_loss_mlp": 0.0103419, "balance_loss_clip": 1.02019525, "balance_loss_mlp": 1.04504967, "epoch": 0.388215842477078, "flos": 20776944856320.0, "grad_norm": 2.2563177653631747, "language_loss": 0.60897374, "learning_rate": 2.688377798774665e-06, "loss": 0.63099861, "num_input_tokens_seen": 138615695, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78515625, "step": 6457, "time_per_iteration": 2.703528881072998 }, { "auxiliary_loss_clip": 0.01121655, "auxiliary_loss_mlp": 0.01036459, "balance_loss_clip": 1.02033043, "balance_loss_mlp": 1.04246521, "epoch": 0.388275965729746, "flos": 20448182649600.0, "grad_norm": 2.2573800846922905, "language_loss": 0.79990137, "learning_rate": 2.688023048950959e-06, "loss": 0.82148254, "num_input_tokens_seen": 138633180, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.79296875, "step": 6458, "time_per_iteration": 2.510838270187378 }, { "auxiliary_loss_clip": 0.01157109, "auxiliary_loss_mlp": 0.01035538, "balance_loss_clip": 1.02155459, "balance_loss_mlp": 1.04297256, "epoch": 0.38833608898241395, "flos": 27892392606720.0, "grad_norm": 2.31326968572752, "language_loss": 0.81323123, "learning_rate": 2.6876682745734807e-06, "loss": 0.83515775, "num_input_tokens_seen": 138654785, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 6459, "time_per_iteration": 4.06947135925293 }, { "auxiliary_loss_clip": 0.01135609, "auxiliary_loss_mlp": 0.01032512, "balance_loss_clip": 1.01954818, "balance_loss_mlp": 1.04334044, "epoch": 0.3883962122350819, "flos": 18076390675200.0, "grad_norm": 1.682959504834385, "language_loss": 0.62059802, "learning_rate": 2.6873134756548902e-06, "loss": 0.64227927, "num_input_tokens_seen": 138673330, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7421875, "step": 6460, "time_per_iteration": 4.231162786483765 }, { "auxiliary_loss_clip": 0.01128667, "auxiliary_loss_mlp": 0.01034303, "balance_loss_clip": 1.02127957, "balance_loss_mlp": 1.04350543, "epoch": 0.3884563354877499, "flos": 23622254847360.0, "grad_norm": 1.7614605612442404, "language_loss": 0.86162424, "learning_rate": 2.6869586522078494e-06, "loss": 0.88325393, "num_input_tokens_seen": 138694185, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.76171875, "step": 6461, "time_per_iteration": 2.6493759155273438 }, { "auxiliary_loss_clip": 0.01130428, "auxiliary_loss_mlp": 0.01035399, "balance_loss_clip": 1.02204776, "balance_loss_mlp": 1.04335809, "epoch": 0.38851645874041785, "flos": 27453528236160.0, "grad_norm": 2.2921601304337598, "language_loss": 0.71134353, "learning_rate": 2.686603804245022e-06, "loss": 0.73300183, "num_input_tokens_seen": 138714625, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.78125, "step": 6462, "time_per_iteration": 2.6642205715179443 }, { "auxiliary_loss_clip": 0.01147949, "auxiliary_loss_mlp": 0.01036296, "balance_loss_clip": 1.02212811, "balance_loss_mlp": 1.04269409, "epoch": 0.3885765819930858, "flos": 25228072270080.0, "grad_norm": 1.9417228078078277, "language_loss": 0.75826925, "learning_rate": 2.6862489317790708e-06, "loss": 0.78011173, "num_input_tokens_seen": 138733585, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78125, "step": 6463, "time_per_iteration": 2.5681207180023193 }, { "auxiliary_loss_clip": 0.0115218, "auxiliary_loss_mlp": 0.01041558, "balance_loss_clip": 1.02626348, "balance_loss_mlp": 1.045771, "epoch": 0.3886367052457538, "flos": 16946605221120.0, "grad_norm": 2.9124542336675248, "language_loss": 0.70799947, "learning_rate": 2.6858940348226606e-06, "loss": 0.7299369, "num_input_tokens_seen": 138752335, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.796875, "step": 6464, "time_per_iteration": 4.067925214767456 }, { "auxiliary_loss_clip": 0.01138391, "auxiliary_loss_mlp": 0.01029802, "balance_loss_clip": 1.01555026, "balance_loss_mlp": 1.04426312, "epoch": 0.38869682849842174, "flos": 27154140376320.0, "grad_norm": 1.8676302287576634, "language_loss": 0.69741011, "learning_rate": 2.685539113388456e-06, "loss": 0.71909207, "num_input_tokens_seen": 138768450, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 6465, "time_per_iteration": 2.691229820251465 }, { "auxiliary_loss_clip": 0.0114791, "auxiliary_loss_mlp": 0.01038149, "balance_loss_clip": 1.02313495, "balance_loss_mlp": 1.04343367, "epoch": 0.3887569517510897, "flos": 21063619301760.0, "grad_norm": 2.016150117316542, "language_loss": 0.781057, "learning_rate": 2.6851841674891242e-06, "loss": 0.8029176, "num_input_tokens_seen": 138786775, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.77734375, "step": 6466, "time_per_iteration": 2.5261728763580322 }, { "auxiliary_loss_clip": 0.01149347, "auxiliary_loss_mlp": 0.01034667, "balance_loss_clip": 1.02016473, "balance_loss_mlp": 1.04362774, "epoch": 0.38881707500375773, "flos": 29497384016640.0, "grad_norm": 1.5424142985044655, "language_loss": 0.69461036, "learning_rate": 2.6848291971373325e-06, "loss": 0.71645051, "num_input_tokens_seen": 138810100, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 6467, "time_per_iteration": 2.666586399078369 }, { "auxiliary_loss_clip": 0.01149248, "auxiliary_loss_mlp": 0.01038805, "balance_loss_clip": 1.02432108, "balance_loss_mlp": 1.04393613, "epoch": 0.3888771982564257, "flos": 17488281294720.0, "grad_norm": 2.41559254207118, "language_loss": 0.83207583, "learning_rate": 2.684474202345748e-06, "loss": 0.8539564, "num_input_tokens_seen": 138825140, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 6468, "time_per_iteration": 2.5287365913391113 }, { "auxiliary_loss_clip": 0.01140327, "auxiliary_loss_mlp": 0.01029179, "balance_loss_clip": 1.01605988, "balance_loss_mlp": 1.04518783, "epoch": 0.38893732150909366, "flos": 21942425450880.0, "grad_norm": 1.7434717497038543, "language_loss": 0.84484863, "learning_rate": 2.6841191831270394e-06, "loss": 0.86654365, "num_input_tokens_seen": 138844115, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7734375, "step": 6469, "time_per_iteration": 2.6519381999969482 }, { "auxiliary_loss_clip": 0.01147053, "auxiliary_loss_mlp": 0.01029909, "balance_loss_clip": 1.01548481, "balance_loss_mlp": 1.04412198, "epoch": 0.3889974447617616, "flos": 24276367468800.0, "grad_norm": 1.682655612923269, "language_loss": 0.74457777, "learning_rate": 2.683764139493878e-06, "loss": 0.76634735, "num_input_tokens_seen": 138860860, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 6470, "time_per_iteration": 2.5902767181396484 }, { "auxiliary_loss_clip": 0.01146719, "auxiliary_loss_mlp": 0.01281227, "balance_loss_clip": 1.01914489, "balance_loss_mlp": 1.04264557, "epoch": 0.3890575680144296, "flos": 25667116208640.0, "grad_norm": 1.531648283359575, "language_loss": 0.74798888, "learning_rate": 2.683409071458932e-06, "loss": 0.77226835, "num_input_tokens_seen": 138881910, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.76953125, "step": 6471, "time_per_iteration": 2.722127914428711 }, { "auxiliary_loss_clip": 0.01157594, "auxiliary_loss_mlp": 0.01040482, "balance_loss_clip": 1.02608192, "balance_loss_mlp": 1.04516959, "epoch": 0.38911769126709755, "flos": 22855274714880.0, "grad_norm": 2.682679016494055, "language_loss": 0.67997372, "learning_rate": 2.6830539790348755e-06, "loss": 0.70195454, "num_input_tokens_seen": 138900975, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 6472, "time_per_iteration": 2.6105246543884277 }, { "auxiliary_loss_clip": 0.01147402, "auxiliary_loss_mlp": 0.01042022, "balance_loss_clip": 1.02816391, "balance_loss_mlp": 1.04373026, "epoch": 0.3891778145197655, "flos": 25447522412160.0, "grad_norm": 1.6578548306442675, "language_loss": 0.76253992, "learning_rate": 2.6826988622343783e-06, "loss": 0.7844342, "num_input_tokens_seen": 138920795, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76953125, "step": 6473, "time_per_iteration": 2.6619601249694824 }, { "auxiliary_loss_clip": 0.01140369, "auxiliary_loss_mlp": 0.01041144, "balance_loss_clip": 1.02593327, "balance_loss_mlp": 1.04542971, "epoch": 0.3892379377724335, "flos": 14027965614720.0, "grad_norm": 2.030039219231303, "language_loss": 0.70835876, "learning_rate": 2.6823437210701155e-06, "loss": 0.73017389, "num_input_tokens_seen": 138938770, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.77734375, "step": 6474, "time_per_iteration": 2.5114803314208984 }, { "auxiliary_loss_clip": 0.01121329, "auxiliary_loss_mlp": 0.01038103, "balance_loss_clip": 1.02457321, "balance_loss_mlp": 1.04349399, "epoch": 0.38929806102510145, "flos": 20157449967360.0, "grad_norm": 1.463962697092352, "language_loss": 0.68768227, "learning_rate": 2.68198855555476e-06, "loss": 0.70927668, "num_input_tokens_seen": 138958880, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.77734375, "step": 6475, "time_per_iteration": 2.5627663135528564 }, { "auxiliary_loss_clip": 0.01145592, "auxiliary_loss_mlp": 0.01044408, "balance_loss_clip": 1.02925682, "balance_loss_mlp": 1.04642868, "epoch": 0.3893581842777694, "flos": 22163958581760.0, "grad_norm": 1.8124644093215552, "language_loss": 0.75829041, "learning_rate": 2.6816333657009876e-06, "loss": 0.78019035, "num_input_tokens_seen": 138977240, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 6476, "time_per_iteration": 2.5924341678619385 }, { "auxiliary_loss_clip": 0.0106307, "auxiliary_loss_mlp": 0.01007604, "balance_loss_clip": 1.00541019, "balance_loss_mlp": 1.02003765, "epoch": 0.3894183075304374, "flos": 67301877392640.0, "grad_norm": 0.7767069260552005, "language_loss": 0.5816834, "learning_rate": 2.6812781515214742e-06, "loss": 0.60239017, "num_input_tokens_seen": 139039035, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.24902344, "step": 6477, "time_per_iteration": 3.184058666229248 }, { "auxiliary_loss_clip": 0.01157938, "auxiliary_loss_mlp": 0.01037579, "balance_loss_clip": 1.02200985, "balance_loss_mlp": 1.04379082, "epoch": 0.38947843078310534, "flos": 18547502480640.0, "grad_norm": 2.271096153784549, "language_loss": 0.77838194, "learning_rate": 2.680922913028895e-06, "loss": 0.80033708, "num_input_tokens_seen": 139055560, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.7890625, "step": 6478, "time_per_iteration": 2.641718864440918 }, { "auxiliary_loss_clip": 0.01128115, "auxiliary_loss_mlp": 0.01037506, "balance_loss_clip": 1.0233202, "balance_loss_mlp": 1.04215717, "epoch": 0.3895385540357733, "flos": 14605875532800.0, "grad_norm": 2.2067597829233168, "language_loss": 0.82504475, "learning_rate": 2.680567650235929e-06, "loss": 0.84670103, "num_input_tokens_seen": 139071865, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76953125, "step": 6479, "time_per_iteration": 2.4961957931518555 }, { "auxiliary_loss_clip": 0.01119426, "auxiliary_loss_mlp": 0.01035008, "balance_loss_clip": 1.01992178, "balance_loss_mlp": 1.04341578, "epoch": 0.38959867728844133, "flos": 19975203336960.0, "grad_norm": 1.6200732115086809, "language_loss": 0.79714137, "learning_rate": 2.680212363155254e-06, "loss": 0.81868571, "num_input_tokens_seen": 139089640, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7578125, "step": 6480, "time_per_iteration": 2.599588632583618 }, { "auxiliary_loss_clip": 0.01134681, "auxiliary_loss_mlp": 0.01029691, "balance_loss_clip": 1.01500988, "balance_loss_mlp": 1.04227912, "epoch": 0.3896588005411093, "flos": 22672130244480.0, "grad_norm": 1.7311051629744585, "language_loss": 0.82978398, "learning_rate": 2.6798570517995505e-06, "loss": 0.85142767, "num_input_tokens_seen": 139109365, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.74609375, "step": 6481, "time_per_iteration": 2.5527665615081787 }, { "auxiliary_loss_clip": 0.01137786, "auxiliary_loss_mlp": 0.01034267, "balance_loss_clip": 1.02126169, "balance_loss_mlp": 1.0454731, "epoch": 0.38971892379377726, "flos": 20996035862400.0, "grad_norm": 1.598631811111613, "language_loss": 0.75196755, "learning_rate": 2.679501716181497e-06, "loss": 0.77368808, "num_input_tokens_seen": 139128260, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7421875, "step": 6482, "time_per_iteration": 2.7022764682769775 }, { "auxiliary_loss_clip": 0.01147625, "auxiliary_loss_mlp": 0.01034172, "balance_loss_clip": 1.02030218, "balance_loss_mlp": 1.04343748, "epoch": 0.3897790470464452, "flos": 22528487756160.0, "grad_norm": 1.8511526522783501, "language_loss": 0.78592902, "learning_rate": 2.6791463563137752e-06, "loss": 0.80774701, "num_input_tokens_seen": 139147315, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.77734375, "step": 6483, "time_per_iteration": 2.6267142295837402 }, { "auxiliary_loss_clip": 0.01128129, "auxiliary_loss_mlp": 0.0103002, "balance_loss_clip": 1.014153, "balance_loss_mlp": 1.04193306, "epoch": 0.3898391702991132, "flos": 26209905603840.0, "grad_norm": 1.4328698584850512, "language_loss": 0.79698849, "learning_rate": 2.6787909722090667e-06, "loss": 0.81856996, "num_input_tokens_seen": 139167270, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.7734375, "step": 6484, "time_per_iteration": 2.7298920154571533 }, { "auxiliary_loss_clip": 0.01131133, "auxiliary_loss_mlp": 0.01043393, "balance_loss_clip": 1.02811086, "balance_loss_mlp": 1.04608786, "epoch": 0.38989929355178116, "flos": 21065558636160.0, "grad_norm": 1.6070121809690927, "language_loss": 0.77480698, "learning_rate": 2.6784355638800545e-06, "loss": 0.79655224, "num_input_tokens_seen": 139185970, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7578125, "step": 6485, "time_per_iteration": 2.6667137145996094 }, { "auxiliary_loss_clip": 0.01158928, "auxiliary_loss_mlp": 0.01038389, "balance_loss_clip": 1.02363086, "balance_loss_mlp": 1.04441094, "epoch": 0.3899594168044491, "flos": 25484115392640.0, "grad_norm": 2.4291977379346474, "language_loss": 0.85210329, "learning_rate": 2.6780801313394225e-06, "loss": 0.87407649, "num_input_tokens_seen": 139203730, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 6486, "time_per_iteration": 2.718534469604492 }, { "auxiliary_loss_clip": 0.01141374, "auxiliary_loss_mlp": 0.01031926, "balance_loss_clip": 1.01738286, "balance_loss_mlp": 1.04285085, "epoch": 0.3900195400571171, "flos": 31139363456640.0, "grad_norm": 1.6924623682281146, "language_loss": 0.85042733, "learning_rate": 2.677724674599854e-06, "loss": 0.87216032, "num_input_tokens_seen": 139222560, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8046875, "step": 6487, "time_per_iteration": 2.744678497314453 }, { "auxiliary_loss_clip": 0.01137804, "auxiliary_loss_mlp": 0.01033799, "balance_loss_clip": 1.01848042, "balance_loss_mlp": 1.04326701, "epoch": 0.39007966330978505, "flos": 20229917656320.0, "grad_norm": 1.4673583685435378, "language_loss": 0.72803599, "learning_rate": 2.6773691936740357e-06, "loss": 0.74975204, "num_input_tokens_seen": 139242165, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.765625, "step": 6488, "time_per_iteration": 2.6490602493286133 }, { "auxiliary_loss_clip": 0.01150567, "auxiliary_loss_mlp": 0.01036183, "balance_loss_clip": 1.02137756, "balance_loss_mlp": 1.04564977, "epoch": 0.390139786562453, "flos": 22528739151360.0, "grad_norm": 1.677057381686733, "language_loss": 0.68365443, "learning_rate": 2.677013688574654e-06, "loss": 0.70552194, "num_input_tokens_seen": 139262525, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78515625, "step": 6489, "time_per_iteration": 2.674210548400879 }, { "auxiliary_loss_clip": 0.01135142, "auxiliary_loss_mlp": 0.01038271, "balance_loss_clip": 1.02382934, "balance_loss_mlp": 1.04320884, "epoch": 0.390199909815121, "flos": 26432911192320.0, "grad_norm": 1.5587700588719295, "language_loss": 0.80674553, "learning_rate": 2.6766581593143937e-06, "loss": 0.82847965, "num_input_tokens_seen": 139282835, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7421875, "step": 6490, "time_per_iteration": 2.593637704849243 }, { "auxiliary_loss_clip": 0.01131563, "auxiliary_loss_mlp": 0.01036623, "balance_loss_clip": 1.02192497, "balance_loss_mlp": 1.04430079, "epoch": 0.39026003306778895, "flos": 17274577328640.0, "grad_norm": 2.367343988382057, "language_loss": 0.89454246, "learning_rate": 2.6763026059059455e-06, "loss": 0.91622424, "num_input_tokens_seen": 139299490, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78125, "step": 6491, "time_per_iteration": 2.5292000770568848 }, { "auxiliary_loss_clip": 0.01148621, "auxiliary_loss_mlp": 0.01037503, "balance_loss_clip": 1.02318609, "balance_loss_mlp": 1.04319835, "epoch": 0.3903201563204569, "flos": 24532841554560.0, "grad_norm": 1.9873986488854616, "language_loss": 0.7858997, "learning_rate": 2.675947028361996e-06, "loss": 0.80776095, "num_input_tokens_seen": 139317865, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 6492, "time_per_iteration": 2.5972516536712646 }, { "auxiliary_loss_clip": 0.01140145, "auxiliary_loss_mlp": 0.01040546, "balance_loss_clip": 1.02664042, "balance_loss_mlp": 1.04423165, "epoch": 0.39038027957312493, "flos": 23767944410880.0, "grad_norm": 1.8505347032252266, "language_loss": 0.74642503, "learning_rate": 2.6755914266952365e-06, "loss": 0.76823187, "num_input_tokens_seen": 139339840, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 6493, "time_per_iteration": 2.660146951675415 }, { "auxiliary_loss_clip": 0.01162364, "auxiliary_loss_mlp": 0.01035609, "balance_loss_clip": 1.02058864, "balance_loss_mlp": 1.04448652, "epoch": 0.3904404028257929, "flos": 14100612871680.0, "grad_norm": 2.002497370772619, "language_loss": 0.7561276, "learning_rate": 2.675235800918357e-06, "loss": 0.77810729, "num_input_tokens_seen": 139357555, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 6494, "time_per_iteration": 2.6519665718078613 }, { "auxiliary_loss_clip": 0.01138983, "auxiliary_loss_mlp": 0.01047412, "balance_loss_clip": 1.03165233, "balance_loss_mlp": 1.04472971, "epoch": 0.39050052607846086, "flos": 16910048154240.0, "grad_norm": 1.9704929521828856, "language_loss": 0.74235123, "learning_rate": 2.6748801510440484e-06, "loss": 0.76421517, "num_input_tokens_seen": 139374455, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.85546875, "step": 6495, "time_per_iteration": 3.8935742378234863 }, { "auxiliary_loss_clip": 0.01123712, "auxiliary_loss_mlp": 0.01042135, "balance_loss_clip": 1.02526677, "balance_loss_mlp": 1.04471135, "epoch": 0.39056064933112883, "flos": 25915761129600.0, "grad_norm": 1.5934872192711278, "language_loss": 0.68102252, "learning_rate": 2.674524477085003e-06, "loss": 0.70268101, "num_input_tokens_seen": 139394770, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.7890625, "step": 6496, "time_per_iteration": 2.567063093185425 }, { "auxiliary_loss_clip": 0.01071442, "auxiliary_loss_mlp": 0.01001554, "balance_loss_clip": 0.99996805, "balance_loss_mlp": 1.01976645, "epoch": 0.3906207725837968, "flos": 60028421713920.0, "grad_norm": 0.6719973480030217, "language_loss": 0.5398218, "learning_rate": 2.674168779053914e-06, "loss": 0.56055176, "num_input_tokens_seen": 139454760, "router_z_loss_clip": 0.01586914, "router_z_loss_mlp": 0.25, "step": 6497, "time_per_iteration": 3.282309055328369 }, { "auxiliary_loss_clip": 0.01148944, "auxiliary_loss_mlp": 0.012864, "balance_loss_clip": 1.02511489, "balance_loss_mlp": 1.04488206, "epoch": 0.39068089583646476, "flos": 21068683119360.0, "grad_norm": 2.1479106754445634, "language_loss": 0.69063449, "learning_rate": 2.6738130569634763e-06, "loss": 0.71498799, "num_input_tokens_seen": 139472645, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7734375, "step": 6498, "time_per_iteration": 2.684455156326294 }, { "auxiliary_loss_clip": 0.01070316, "auxiliary_loss_mlp": 0.01001138, "balance_loss_clip": 0.99954051, "balance_loss_mlp": 1.01824141, "epoch": 0.3907410190891327, "flos": 70445677403520.0, "grad_norm": 0.7349692123501369, "language_loss": 0.51791656, "learning_rate": 2.673457310826383e-06, "loss": 0.53863108, "num_input_tokens_seen": 139536730, "router_z_loss_clip": 0.01599121, "router_z_loss_mlp": 0.25, "step": 6499, "time_per_iteration": 3.2472522258758545 }, { "auxiliary_loss_clip": 0.01131911, "auxiliary_loss_mlp": 0.01046758, "balance_loss_clip": 1.03011632, "balance_loss_mlp": 1.04269481, "epoch": 0.3908011423418007, "flos": 27962454084480.0, "grad_norm": 2.597334050480561, "language_loss": 0.73829204, "learning_rate": 2.673101540655331e-06, "loss": 0.76007873, "num_input_tokens_seen": 139557540, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8046875, "step": 6500, "time_per_iteration": 4.0095837116241455 }, { "auxiliary_loss_clip": 0.0113445, "auxiliary_loss_mlp": 0.01037482, "balance_loss_clip": 1.02351046, "balance_loss_mlp": 1.04574239, "epoch": 0.39086126559446865, "flos": 24462097718400.0, "grad_norm": 3.8306797447420298, "language_loss": 0.68666315, "learning_rate": 2.6727457464630166e-06, "loss": 0.70838249, "num_input_tokens_seen": 139576875, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.796875, "step": 6501, "time_per_iteration": 2.6495447158813477 }, { "auxiliary_loss_clip": 0.01131301, "auxiliary_loss_mlp": 0.01041744, "balance_loss_clip": 1.02787948, "balance_loss_mlp": 1.04427218, "epoch": 0.3909213888471366, "flos": 16941541403520.0, "grad_norm": 1.8908516884093824, "language_loss": 0.79098117, "learning_rate": 2.6723899282621363e-06, "loss": 0.8127116, "num_input_tokens_seen": 139594295, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 6502, "time_per_iteration": 4.1926233768463135 }, { "auxiliary_loss_clip": 0.01138293, "auxiliary_loss_mlp": 0.01037592, "balance_loss_clip": 1.02397275, "balance_loss_mlp": 1.04565716, "epoch": 0.3909815120998046, "flos": 29278400751360.0, "grad_norm": 2.2354141419560367, "language_loss": 0.79990089, "learning_rate": 2.6720340860653894e-06, "loss": 0.82165974, "num_input_tokens_seen": 139614080, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 6503, "time_per_iteration": 2.6738638877868652 }, { "auxiliary_loss_clip": 0.01134175, "auxiliary_loss_mlp": 0.01030629, "balance_loss_clip": 1.01754618, "balance_loss_mlp": 1.04200673, "epoch": 0.39104163535247255, "flos": 18951246328320.0, "grad_norm": 1.6244183807690915, "language_loss": 0.71347463, "learning_rate": 2.671678219885475e-06, "loss": 0.73512268, "num_input_tokens_seen": 139632755, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.74609375, "step": 6504, "time_per_iteration": 2.645963668823242 }, { "auxiliary_loss_clip": 0.01117256, "auxiliary_loss_mlp": 0.01038017, "balance_loss_clip": 1.02377152, "balance_loss_mlp": 1.04026222, "epoch": 0.3911017586051405, "flos": 26323347732480.0, "grad_norm": 1.4220613037354266, "language_loss": 0.83064008, "learning_rate": 2.6713223297350926e-06, "loss": 0.85219288, "num_input_tokens_seen": 139654205, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76953125, "step": 6505, "time_per_iteration": 2.6324477195739746 }, { "auxiliary_loss_clip": 0.01117525, "auxiliary_loss_mlp": 0.01037295, "balance_loss_clip": 1.02210152, "balance_loss_mlp": 1.04242969, "epoch": 0.3911618818578085, "flos": 21835770992640.0, "grad_norm": 1.8959661934283212, "language_loss": 0.70869958, "learning_rate": 2.6709664156269426e-06, "loss": 0.73024774, "num_input_tokens_seen": 139673595, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.75, "step": 6506, "time_per_iteration": 4.773964881896973 }, { "auxiliary_loss_clip": 0.01133549, "auxiliary_loss_mlp": 0.01044373, "balance_loss_clip": 1.03080153, "balance_loss_mlp": 1.04250097, "epoch": 0.3912220051104765, "flos": 16359680989440.0, "grad_norm": 1.815470472619368, "language_loss": 0.75001895, "learning_rate": 2.670610477573727e-06, "loss": 0.77179813, "num_input_tokens_seen": 139690565, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 6507, "time_per_iteration": 2.6009349822998047 }, { "auxiliary_loss_clip": 0.01041873, "auxiliary_loss_mlp": 0.0102539, "balance_loss_clip": 1.02388799, "balance_loss_mlp": 1.01695085, "epoch": 0.39128212836314447, "flos": 71050986420480.0, "grad_norm": 0.7653592755140859, "language_loss": 0.56538689, "learning_rate": 2.670254515588149e-06, "loss": 0.58605951, "num_input_tokens_seen": 139749420, "router_z_loss_clip": 0.01501465, "router_z_loss_mlp": 0.24902344, "step": 6508, "time_per_iteration": 3.3347220420837402 }, { "auxiliary_loss_clip": 0.01128496, "auxiliary_loss_mlp": 0.01040714, "balance_loss_clip": 1.02720177, "balance_loss_mlp": 1.04296124, "epoch": 0.39134225161581243, "flos": 20331975173760.0, "grad_norm": 1.845895467268416, "language_loss": 0.76778936, "learning_rate": 2.6698985296829115e-06, "loss": 0.78948146, "num_input_tokens_seen": 139766265, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.765625, "step": 6509, "time_per_iteration": 2.6344244480133057 }, { "auxiliary_loss_clip": 0.01145318, "auxiliary_loss_mlp": 0.01043364, "balance_loss_clip": 1.02699661, "balance_loss_mlp": 1.04107273, "epoch": 0.3914023748684804, "flos": 17018390551680.0, "grad_norm": 3.127503386257895, "language_loss": 0.82934242, "learning_rate": 2.6695425198707187e-06, "loss": 0.85122919, "num_input_tokens_seen": 139782400, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.7734375, "step": 6510, "time_per_iteration": 2.598832845687866 }, { "auxiliary_loss_clip": 0.01128481, "auxiliary_loss_mlp": 0.0103477, "balance_loss_clip": 1.02066767, "balance_loss_mlp": 1.04201329, "epoch": 0.39146249812114836, "flos": 18405224709120.0, "grad_norm": 1.97962470002395, "language_loss": 0.76822835, "learning_rate": 2.669186486164276e-06, "loss": 0.78986084, "num_input_tokens_seen": 139801435, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.77734375, "step": 6511, "time_per_iteration": 2.656114101409912 }, { "auxiliary_loss_clip": 0.01060405, "auxiliary_loss_mlp": 0.01010944, "balance_loss_clip": 1.00940609, "balance_loss_mlp": 1.01769853, "epoch": 0.3915226213738163, "flos": 67637355442560.0, "grad_norm": 0.8129757177082146, "language_loss": 0.57771045, "learning_rate": 2.6688304285762878e-06, "loss": 0.5984239, "num_input_tokens_seen": 139869700, "router_z_loss_clip": 0.01538086, "router_z_loss_mlp": 0.24707031, "step": 6512, "time_per_iteration": 3.286747455596924 }, { "auxiliary_loss_clip": 0.01119861, "auxiliary_loss_mlp": 0.01033347, "balance_loss_clip": 1.01817727, "balance_loss_mlp": 1.0430491, "epoch": 0.3915827446264843, "flos": 26359330181760.0, "grad_norm": 1.5750995735218627, "language_loss": 0.69739664, "learning_rate": 2.6684743471194627e-06, "loss": 0.71892869, "num_input_tokens_seen": 139890140, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.765625, "step": 6513, "time_per_iteration": 2.62007737159729 }, { "auxiliary_loss_clip": 0.01157563, "auxiliary_loss_mlp": 0.01036148, "balance_loss_clip": 1.02168226, "balance_loss_mlp": 1.041466, "epoch": 0.39164286787915226, "flos": 21943897908480.0, "grad_norm": 2.9841067025927863, "language_loss": 0.76171529, "learning_rate": 2.668118241806508e-06, "loss": 0.78365237, "num_input_tokens_seen": 139908020, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80078125, "step": 6514, "time_per_iteration": 2.6150243282318115 }, { "auxiliary_loss_clip": 0.01147041, "auxiliary_loss_mlp": 0.0103392, "balance_loss_clip": 1.02050889, "balance_loss_mlp": 1.04217541, "epoch": 0.3917029911318202, "flos": 16399829416320.0, "grad_norm": 1.8577061033529243, "language_loss": 0.77029097, "learning_rate": 2.6677621126501316e-06, "loss": 0.79210061, "num_input_tokens_seen": 139926180, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7734375, "step": 6515, "time_per_iteration": 2.6378281116485596 }, { "auxiliary_loss_clip": 0.01142691, "auxiliary_loss_mlp": 0.01034482, "balance_loss_clip": 1.02201927, "balance_loss_mlp": 1.04156101, "epoch": 0.3917631143844882, "flos": 26211701283840.0, "grad_norm": 1.3378040553213633, "language_loss": 0.79714787, "learning_rate": 2.667405959663043e-06, "loss": 0.81891954, "num_input_tokens_seen": 139947420, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.73828125, "step": 6516, "time_per_iteration": 2.644451856613159 }, { "auxiliary_loss_clip": 0.01156023, "auxiliary_loss_mlp": 0.01031496, "balance_loss_clip": 1.01740551, "balance_loss_mlp": 1.04251575, "epoch": 0.39182323763715615, "flos": 18548364407040.0, "grad_norm": 2.398563859551232, "language_loss": 0.70447057, "learning_rate": 2.667049782857952e-06, "loss": 0.72634578, "num_input_tokens_seen": 139965800, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 6517, "time_per_iteration": 2.6371514797210693 }, { "auxiliary_loss_clip": 0.011446, "auxiliary_loss_mlp": 0.01038378, "balance_loss_clip": 1.02410853, "balance_loss_mlp": 1.04083681, "epoch": 0.3918833608898241, "flos": 34313543395200.0, "grad_norm": 1.5252987769374335, "language_loss": 0.7227186, "learning_rate": 2.666693582247571e-06, "loss": 0.74454838, "num_input_tokens_seen": 139988140, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76953125, "step": 6518, "time_per_iteration": 2.772566556930542 }, { "auxiliary_loss_clip": 0.0112898, "auxiliary_loss_mlp": 0.0103072, "balance_loss_clip": 1.01621258, "balance_loss_mlp": 1.04175818, "epoch": 0.3919434841424921, "flos": 36939582812160.0, "grad_norm": 1.8625163788484331, "language_loss": 0.61174554, "learning_rate": 2.66633735784461e-06, "loss": 0.6333425, "num_input_tokens_seen": 140010060, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 6519, "time_per_iteration": 2.69323992729187 }, { "auxiliary_loss_clip": 0.01125219, "auxiliary_loss_mlp": 0.01040071, "balance_loss_clip": 1.02517033, "balance_loss_mlp": 1.04336119, "epoch": 0.3920036073951601, "flos": 23508956373120.0, "grad_norm": 4.980364509382068, "language_loss": 0.6691758, "learning_rate": 2.665981109661784e-06, "loss": 0.69082868, "num_input_tokens_seen": 140029400, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8203125, "step": 6520, "time_per_iteration": 2.5717737674713135 }, { "auxiliary_loss_clip": 0.01117693, "auxiliary_loss_mlp": 0.01032825, "balance_loss_clip": 1.01871061, "balance_loss_mlp": 1.04204512, "epoch": 0.39206373064782807, "flos": 18406086635520.0, "grad_norm": 1.7525229570828997, "language_loss": 0.78886729, "learning_rate": 2.6656248377118043e-06, "loss": 0.81037241, "num_input_tokens_seen": 140048940, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 6521, "time_per_iteration": 2.5423123836517334 }, { "auxiliary_loss_clip": 0.01133179, "auxiliary_loss_mlp": 0.01037277, "balance_loss_clip": 1.02099264, "balance_loss_mlp": 1.04364884, "epoch": 0.39212385390049603, "flos": 12313051608960.0, "grad_norm": 2.9537192760943336, "language_loss": 0.69977295, "learning_rate": 2.6652685420073867e-06, "loss": 0.72147751, "num_input_tokens_seen": 140066380, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8046875, "step": 6522, "time_per_iteration": 2.5613105297088623 }, { "auxiliary_loss_clip": 0.01140452, "auxiliary_loss_mlp": 0.01031233, "balance_loss_clip": 1.01744652, "balance_loss_mlp": 1.04272676, "epoch": 0.392183977153164, "flos": 19719160214400.0, "grad_norm": 1.7959931376005667, "language_loss": 0.76770341, "learning_rate": 2.664912222561246e-06, "loss": 0.78942025, "num_input_tokens_seen": 140085275, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.796875, "step": 6523, "time_per_iteration": 2.633857011795044 }, { "auxiliary_loss_clip": 0.01137555, "auxiliary_loss_mlp": 0.0103867, "balance_loss_clip": 1.02427518, "balance_loss_mlp": 1.0411973, "epoch": 0.39224410040583196, "flos": 33144902403840.0, "grad_norm": 2.707471304229439, "language_loss": 0.61986756, "learning_rate": 2.664555879386098e-06, "loss": 0.64162982, "num_input_tokens_seen": 140105105, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 6524, "time_per_iteration": 2.7232463359832764 }, { "auxiliary_loss_clip": 0.01118642, "auxiliary_loss_mlp": 0.01035482, "balance_loss_clip": 1.02061629, "balance_loss_mlp": 1.04034054, "epoch": 0.39230422365849993, "flos": 27782434097280.0, "grad_norm": 1.6363548906238612, "language_loss": 0.74174488, "learning_rate": 2.6641995124946606e-06, "loss": 0.76328611, "num_input_tokens_seen": 140125645, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78125, "step": 6525, "time_per_iteration": 2.632330894470215 }, { "auxiliary_loss_clip": 0.01136825, "auxiliary_loss_mlp": 0.01038222, "balance_loss_clip": 1.02460873, "balance_loss_mlp": 1.04068041, "epoch": 0.3923643469111679, "flos": 17931634865280.0, "grad_norm": 2.302008833433815, "language_loss": 0.81085259, "learning_rate": 2.6638431218996517e-06, "loss": 0.8326031, "num_input_tokens_seen": 140141925, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78515625, "step": 6526, "time_per_iteration": 2.6883788108825684 }, { "auxiliary_loss_clip": 0.01127346, "auxiliary_loss_mlp": 0.0104008, "balance_loss_clip": 1.02606118, "balance_loss_mlp": 1.04195392, "epoch": 0.39242447016383586, "flos": 24059539019520.0, "grad_norm": 1.9877151098425307, "language_loss": 0.70017672, "learning_rate": 2.6634867076137886e-06, "loss": 0.72185099, "num_input_tokens_seen": 140160965, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 6527, "time_per_iteration": 2.5524423122406006 }, { "auxiliary_loss_clip": 0.0114525, "auxiliary_loss_mlp": 0.01033936, "balance_loss_clip": 1.01986933, "balance_loss_mlp": 1.04218817, "epoch": 0.3924845934165038, "flos": 10664069016960.0, "grad_norm": 2.5966617878229536, "language_loss": 0.82634151, "learning_rate": 2.663130269649792e-06, "loss": 0.84813333, "num_input_tokens_seen": 140177780, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 6528, "time_per_iteration": 2.6250381469726562 }, { "auxiliary_loss_clip": 0.01132846, "auxiliary_loss_mlp": 0.01033648, "balance_loss_clip": 1.01976061, "balance_loss_mlp": 1.04135501, "epoch": 0.3925447166691718, "flos": 31245910174080.0, "grad_norm": 1.6441871817872113, "language_loss": 0.68508488, "learning_rate": 2.6627738080203817e-06, "loss": 0.7067498, "num_input_tokens_seen": 140201660, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73828125, "step": 6529, "time_per_iteration": 2.699458599090576 }, { "auxiliary_loss_clip": 0.01138899, "auxiliary_loss_mlp": 0.0103797, "balance_loss_clip": 1.02391481, "balance_loss_mlp": 1.04260111, "epoch": 0.39260483992183975, "flos": 29415040087680.0, "grad_norm": 3.461669054535572, "language_loss": 0.80513918, "learning_rate": 2.662417322738279e-06, "loss": 0.82690787, "num_input_tokens_seen": 140218585, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 6530, "time_per_iteration": 2.7007815837860107 }, { "auxiliary_loss_clip": 0.01125097, "auxiliary_loss_mlp": 0.0103361, "balance_loss_clip": 1.01960301, "balance_loss_mlp": 1.04052949, "epoch": 0.3926649631745077, "flos": 22857788666880.0, "grad_norm": 1.38595759248606, "language_loss": 0.75759095, "learning_rate": 2.6620608138162055e-06, "loss": 0.77917802, "num_input_tokens_seen": 140239905, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 6531, "time_per_iteration": 2.6103506088256836 }, { "auxiliary_loss_clip": 0.01048562, "auxiliary_loss_mlp": 0.01000466, "balance_loss_clip": 0.99876136, "balance_loss_mlp": 1.02360606, "epoch": 0.3927250864271757, "flos": 63893881872000.0, "grad_norm": 0.8115350484747827, "language_loss": 0.60295689, "learning_rate": 2.6617042812668857e-06, "loss": 0.62344718, "num_input_tokens_seen": 140293820, "router_z_loss_clip": 0.01708984, "router_z_loss_mlp": 0.25, "step": 6532, "time_per_iteration": 3.082409381866455 }, { "auxiliary_loss_clip": 0.01064495, "auxiliary_loss_mlp": 0.0100098, "balance_loss_clip": 0.99929965, "balance_loss_mlp": 1.02253687, "epoch": 0.3927852096798437, "flos": 68909741890560.0, "grad_norm": 0.7698946320344481, "language_loss": 0.55535907, "learning_rate": 2.661347725103041e-06, "loss": 0.5760138, "num_input_tokens_seen": 140360420, "router_z_loss_clip": 0.0168457, "router_z_loss_mlp": 0.24902344, "step": 6533, "time_per_iteration": 3.3034582138061523 }, { "auxiliary_loss_clip": 0.0114108, "auxiliary_loss_mlp": 0.01040774, "balance_loss_clip": 1.02655292, "balance_loss_mlp": 1.04407084, "epoch": 0.39284533293251167, "flos": 29715972232320.0, "grad_norm": 1.8760100737356697, "language_loss": 0.76294065, "learning_rate": 2.6609911453373978e-06, "loss": 0.78475916, "num_input_tokens_seen": 140381950, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.79296875, "step": 6534, "time_per_iteration": 2.6632230281829834 }, { "auxiliary_loss_clip": 0.01134179, "auxiliary_loss_mlp": 0.0103727, "balance_loss_clip": 1.02259517, "balance_loss_mlp": 1.04478002, "epoch": 0.39290545618517964, "flos": 18552027594240.0, "grad_norm": 2.0448195632783097, "language_loss": 0.77937818, "learning_rate": 2.660634541982681e-06, "loss": 0.80109274, "num_input_tokens_seen": 140399410, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 6535, "time_per_iteration": 2.6117985248565674 }, { "auxiliary_loss_clip": 0.01144985, "auxiliary_loss_mlp": 0.01035284, "balance_loss_clip": 1.02224815, "balance_loss_mlp": 1.04311776, "epoch": 0.3929655794378476, "flos": 26249479413120.0, "grad_norm": 1.7458233617261454, "language_loss": 0.69035572, "learning_rate": 2.6602779150516163e-06, "loss": 0.71215844, "num_input_tokens_seen": 140419055, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 6536, "time_per_iteration": 4.052106142044067 }, { "auxiliary_loss_clip": 0.0113434, "auxiliary_loss_mlp": 0.01039853, "balance_loss_clip": 1.02694285, "balance_loss_mlp": 1.04256022, "epoch": 0.39302570269051557, "flos": 29277933874560.0, "grad_norm": 1.7616294740229117, "language_loss": 0.68822944, "learning_rate": 2.6599212645569316e-06, "loss": 0.70997143, "num_input_tokens_seen": 140438800, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 6537, "time_per_iteration": 2.6408376693725586 }, { "auxiliary_loss_clip": 0.01130066, "auxiliary_loss_mlp": 0.01040341, "balance_loss_clip": 1.02599406, "balance_loss_mlp": 1.04367054, "epoch": 0.39308582594318353, "flos": 17347440067200.0, "grad_norm": 2.1345950072017628, "language_loss": 0.78916281, "learning_rate": 2.6595645905113546e-06, "loss": 0.81086689, "num_input_tokens_seen": 140456880, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7734375, "step": 6538, "time_per_iteration": 2.5647237300872803 }, { "auxiliary_loss_clip": 0.01080485, "auxiliary_loss_mlp": 0.01004931, "balance_loss_clip": 1.00330973, "balance_loss_mlp": 1.01985407, "epoch": 0.3931459491958515, "flos": 61007094650880.0, "grad_norm": 0.8016355649900386, "language_loss": 0.61934453, "learning_rate": 2.659207892927614e-06, "loss": 0.64019871, "num_input_tokens_seen": 140507510, "router_z_loss_clip": 0.01623535, "router_z_loss_mlp": 0.24804688, "step": 6539, "time_per_iteration": 2.9757447242736816 }, { "auxiliary_loss_clip": 0.01134921, "auxiliary_loss_mlp": 0.01035492, "balance_loss_clip": 1.02176535, "balance_loss_mlp": 1.04089785, "epoch": 0.39320607244851946, "flos": 39016009249920.0, "grad_norm": 2.6561657417511575, "language_loss": 0.68140984, "learning_rate": 2.658851171818439e-06, "loss": 0.70311391, "num_input_tokens_seen": 140528740, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.765625, "step": 6540, "time_per_iteration": 2.7416751384735107 }, { "auxiliary_loss_clip": 0.01135572, "auxiliary_loss_mlp": 0.01034329, "balance_loss_clip": 1.02035737, "balance_loss_mlp": 1.04215598, "epoch": 0.3932661957011874, "flos": 24679752180480.0, "grad_norm": 1.8305973654940002, "language_loss": 0.72636974, "learning_rate": 2.65849442719656e-06, "loss": 0.74806869, "num_input_tokens_seen": 140547560, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 6541, "time_per_iteration": 2.5455095767974854 }, { "auxiliary_loss_clip": 0.01042148, "auxiliary_loss_mlp": 0.0100496, "balance_loss_clip": 1.00332725, "balance_loss_mlp": 1.01752329, "epoch": 0.3933263189538554, "flos": 70096552185600.0, "grad_norm": 0.8277131144842812, "language_loss": 0.60371506, "learning_rate": 2.65813765907471e-06, "loss": 0.62418616, "num_input_tokens_seen": 140601175, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.24609375, "step": 6542, "time_per_iteration": 4.329163551330566 }, { "auxiliary_loss_clip": 0.0113025, "auxiliary_loss_mlp": 0.01040541, "balance_loss_clip": 1.02693903, "balance_loss_mlp": 1.04455006, "epoch": 0.39338644220652336, "flos": 22929071207040.0, "grad_norm": 1.504394535727662, "language_loss": 0.82049245, "learning_rate": 2.657780867465619e-06, "loss": 0.8422004, "num_input_tokens_seen": 140622200, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 6543, "time_per_iteration": 4.16791296005249 }, { "auxiliary_loss_clip": 0.01133636, "auxiliary_loss_mlp": 0.01035657, "balance_loss_clip": 1.02135181, "balance_loss_mlp": 1.04003239, "epoch": 0.3934465654591913, "flos": 30848163897600.0, "grad_norm": 1.5144639964023856, "language_loss": 0.69057775, "learning_rate": 2.6574240523820214e-06, "loss": 0.71227074, "num_input_tokens_seen": 140643125, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7578125, "step": 6544, "time_per_iteration": 2.677025318145752 }, { "auxiliary_loss_clip": 0.01141451, "auxiliary_loss_mlp": 0.01042134, "balance_loss_clip": 1.02661324, "balance_loss_mlp": 1.04311204, "epoch": 0.3935066887118593, "flos": 29236528471680.0, "grad_norm": 1.9075439216433625, "language_loss": 0.75501108, "learning_rate": 2.6570672138366503e-06, "loss": 0.77684695, "num_input_tokens_seen": 140662500, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8046875, "step": 6545, "time_per_iteration": 2.6208889484405518 }, { "auxiliary_loss_clip": 0.01141446, "auxiliary_loss_mlp": 0.01033048, "balance_loss_clip": 1.0212822, "balance_loss_mlp": 1.04386508, "epoch": 0.3935668119645273, "flos": 19135288638720.0, "grad_norm": 2.0656235680148556, "language_loss": 0.74639791, "learning_rate": 2.65671035184224e-06, "loss": 0.76814282, "num_input_tokens_seen": 140681960, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.7109375, "step": 6546, "time_per_iteration": 2.7031850814819336 }, { "auxiliary_loss_clip": 0.01143524, "auxiliary_loss_mlp": 0.01035188, "balance_loss_clip": 1.02122283, "balance_loss_mlp": 1.04295516, "epoch": 0.3936269352171953, "flos": 18516116972160.0, "grad_norm": 1.8268258217397033, "language_loss": 0.81176567, "learning_rate": 2.656353466411527e-06, "loss": 0.83355278, "num_input_tokens_seen": 140699170, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.82421875, "step": 6547, "time_per_iteration": 2.5808496475219727 }, { "auxiliary_loss_clip": 0.01126427, "auxiliary_loss_mlp": 0.0127583, "balance_loss_clip": 1.01592612, "balance_loss_mlp": 1.04076004, "epoch": 0.39368705846986324, "flos": 15632813370240.0, "grad_norm": 1.831380111927623, "language_loss": 0.84279788, "learning_rate": 2.6559965575572475e-06, "loss": 0.86682045, "num_input_tokens_seen": 140714920, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.765625, "step": 6548, "time_per_iteration": 4.149337530136108 }, { "auxiliary_loss_clip": 0.01122998, "auxiliary_loss_mlp": 0.01274821, "balance_loss_clip": 1.01553881, "balance_loss_mlp": 1.04060221, "epoch": 0.3937471817225312, "flos": 21325839563520.0, "grad_norm": 1.6369801332578962, "language_loss": 0.72632343, "learning_rate": 2.6556396252921375e-06, "loss": 0.7503016, "num_input_tokens_seen": 140734595, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.734375, "step": 6549, "time_per_iteration": 2.5227460861206055 }, { "auxiliary_loss_clip": 0.01166562, "auxiliary_loss_mlp": 0.01031756, "balance_loss_clip": 1.01805902, "balance_loss_mlp": 1.04516494, "epoch": 0.39380730497519917, "flos": 20776693461120.0, "grad_norm": 1.7887671002902201, "language_loss": 0.77559519, "learning_rate": 2.6552826696289363e-06, "loss": 0.79757839, "num_input_tokens_seen": 140754050, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 6550, "time_per_iteration": 2.63698673248291 }, { "auxiliary_loss_clip": 0.01142899, "auxiliary_loss_mlp": 0.01028569, "balance_loss_clip": 1.01515782, "balance_loss_mlp": 1.04105616, "epoch": 0.39386742822786713, "flos": 21609784575360.0, "grad_norm": 1.6368566275110972, "language_loss": 0.81087762, "learning_rate": 2.6549256905803815e-06, "loss": 0.83259225, "num_input_tokens_seen": 140771440, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 6551, "time_per_iteration": 2.615797758102417 }, { "auxiliary_loss_clip": 0.01128548, "auxiliary_loss_mlp": 0.01034924, "balance_loss_clip": 1.02048206, "balance_loss_mlp": 1.04167581, "epoch": 0.3939275514805351, "flos": 12414642249600.0, "grad_norm": 2.496919296708741, "language_loss": 0.80116963, "learning_rate": 2.654568688159214e-06, "loss": 0.82280433, "num_input_tokens_seen": 140786715, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 6552, "time_per_iteration": 2.5248608589172363 }, { "auxiliary_loss_clip": 0.01121845, "auxiliary_loss_mlp": 0.01035951, "balance_loss_clip": 1.02242041, "balance_loss_mlp": 1.04338658, "epoch": 0.39398767473320306, "flos": 18552027594240.0, "grad_norm": 1.7143793365386726, "language_loss": 0.70995235, "learning_rate": 2.6542116623781736e-06, "loss": 0.73153031, "num_input_tokens_seen": 140804950, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7890625, "step": 6553, "time_per_iteration": 2.5389888286590576 }, { "auxiliary_loss_clip": 0.0112595, "auxiliary_loss_mlp": 0.01284221, "balance_loss_clip": 1.02329338, "balance_loss_mlp": 1.04108286, "epoch": 0.39404779798587103, "flos": 29308888419840.0, "grad_norm": 1.5526948526948423, "language_loss": 0.63788962, "learning_rate": 2.6538546132500023e-06, "loss": 0.6619913, "num_input_tokens_seen": 140822800, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.76171875, "step": 6554, "time_per_iteration": 2.6557414531707764 }, { "auxiliary_loss_clip": 0.0113627, "auxiliary_loss_mlp": 0.01037194, "balance_loss_clip": 1.02381277, "balance_loss_mlp": 1.04252458, "epoch": 0.394107921238539, "flos": 34897055834880.0, "grad_norm": 1.8352284588810412, "language_loss": 0.7897203, "learning_rate": 2.6534975407874417e-06, "loss": 0.81145495, "num_input_tokens_seen": 140842940, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75390625, "step": 6555, "time_per_iteration": 2.7790770530700684 }, { "auxiliary_loss_clip": 0.01137966, "auxiliary_loss_mlp": 0.01036036, "balance_loss_clip": 1.02083707, "balance_loss_mlp": 1.04167366, "epoch": 0.39416804449120696, "flos": 25081413039360.0, "grad_norm": 2.1528175743637084, "language_loss": 0.7192511, "learning_rate": 2.653140445003234e-06, "loss": 0.74099112, "num_input_tokens_seen": 140863060, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78125, "step": 6556, "time_per_iteration": 2.5891611576080322 }, { "auxiliary_loss_clip": 0.01116972, "auxiliary_loss_mlp": 0.010316, "balance_loss_clip": 1.01750398, "balance_loss_mlp": 1.03933287, "epoch": 0.3942281677438749, "flos": 32306639731200.0, "grad_norm": 1.729929000604327, "language_loss": 0.83537948, "learning_rate": 2.652783325910125e-06, "loss": 0.85686517, "num_input_tokens_seen": 140883795, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.77734375, "step": 6557, "time_per_iteration": 2.6403777599334717 }, { "auxiliary_loss_clip": 0.01127033, "auxiliary_loss_mlp": 0.01031036, "balance_loss_clip": 1.01740432, "balance_loss_mlp": 1.04151416, "epoch": 0.3942882909965429, "flos": 24936621315840.0, "grad_norm": 2.2081120736870594, "language_loss": 0.80176258, "learning_rate": 2.652426183520857e-06, "loss": 0.82334328, "num_input_tokens_seen": 140903055, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 6558, "time_per_iteration": 2.6823275089263916 }, { "auxiliary_loss_clip": 0.01119127, "auxiliary_loss_mlp": 0.01033937, "balance_loss_clip": 1.02044868, "balance_loss_mlp": 1.04326487, "epoch": 0.39434841424921085, "flos": 11874797769600.0, "grad_norm": 1.6988141956793181, "language_loss": 0.71123075, "learning_rate": 2.652069017848178e-06, "loss": 0.73276138, "num_input_tokens_seen": 140920685, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 6559, "time_per_iteration": 2.621055841445923 }, { "auxiliary_loss_clip": 0.01139398, "auxiliary_loss_mlp": 0.01039332, "balance_loss_clip": 1.02367413, "balance_loss_mlp": 1.04101098, "epoch": 0.3944085375018789, "flos": 16361620323840.0, "grad_norm": 1.9507947564157535, "language_loss": 0.80077457, "learning_rate": 2.651711828904833e-06, "loss": 0.82256186, "num_input_tokens_seen": 140937320, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8046875, "step": 6560, "time_per_iteration": 2.560454845428467 }, { "auxiliary_loss_clip": 0.01139615, "auxiliary_loss_mlp": 0.01035592, "balance_loss_clip": 1.02045858, "balance_loss_mlp": 1.04278016, "epoch": 0.39446866075454684, "flos": 10633365866880.0, "grad_norm": 2.125760498738521, "language_loss": 0.82539135, "learning_rate": 2.6513546167035687e-06, "loss": 0.84714341, "num_input_tokens_seen": 140954855, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.78515625, "step": 6561, "time_per_iteration": 2.597105026245117 }, { "auxiliary_loss_clip": 0.01127471, "auxiliary_loss_mlp": 0.0104118, "balance_loss_clip": 1.02724457, "balance_loss_mlp": 1.04182017, "epoch": 0.3945287840072148, "flos": 18187498419840.0, "grad_norm": 3.9121633168481607, "language_loss": 0.80366653, "learning_rate": 2.6509973812571336e-06, "loss": 0.82535315, "num_input_tokens_seen": 140973250, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 6562, "time_per_iteration": 2.5677154064178467 }, { "auxiliary_loss_clip": 0.0113401, "auxiliary_loss_mlp": 0.01033461, "balance_loss_clip": 1.01929319, "balance_loss_mlp": 1.04203606, "epoch": 0.39458890725988277, "flos": 23039891642880.0, "grad_norm": 1.4542780156957982, "language_loss": 0.81409013, "learning_rate": 2.6506401225782763e-06, "loss": 0.83576483, "num_input_tokens_seen": 140993050, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.73828125, "step": 6563, "time_per_iteration": 2.6574771404266357 }, { "auxiliary_loss_clip": 0.0114497, "auxiliary_loss_mlp": 0.01034959, "balance_loss_clip": 1.02012968, "balance_loss_mlp": 1.04094636, "epoch": 0.39464903051255074, "flos": 17159052211200.0, "grad_norm": 1.8795017311254314, "language_loss": 0.69701898, "learning_rate": 2.650282840679747e-06, "loss": 0.71881825, "num_input_tokens_seen": 141010815, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.765625, "step": 6564, "time_per_iteration": 2.4974005222320557 }, { "auxiliary_loss_clip": 0.01129621, "auxiliary_loss_mlp": 0.01034735, "balance_loss_clip": 1.02085268, "balance_loss_mlp": 1.04260683, "epoch": 0.3947091537652187, "flos": 15889000147200.0, "grad_norm": 2.135875050963518, "language_loss": 0.82774055, "learning_rate": 2.6499255355742966e-06, "loss": 0.84938413, "num_input_tokens_seen": 141028720, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 6565, "time_per_iteration": 2.6017024517059326 }, { "auxiliary_loss_clip": 0.01146311, "auxiliary_loss_mlp": 0.01031573, "balance_loss_clip": 1.01797175, "balance_loss_mlp": 1.04201198, "epoch": 0.39476927701788667, "flos": 18545491319040.0, "grad_norm": 1.714180680210522, "language_loss": 0.83151901, "learning_rate": 2.649568207274674e-06, "loss": 0.85329789, "num_input_tokens_seen": 141046025, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7734375, "step": 6566, "time_per_iteration": 2.4892544746398926 }, { "auxiliary_loss_clip": 0.0112471, "auxiliary_loss_mlp": 0.01035294, "balance_loss_clip": 1.02030325, "balance_loss_mlp": 1.04307699, "epoch": 0.39482940027055463, "flos": 22275712771200.0, "grad_norm": 1.7313340826977515, "language_loss": 0.76925147, "learning_rate": 2.649210855793634e-06, "loss": 0.79085147, "num_input_tokens_seen": 141066865, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.81640625, "step": 6567, "time_per_iteration": 2.6314966678619385 }, { "auxiliary_loss_clip": 0.01122275, "auxiliary_loss_mlp": 0.01037802, "balance_loss_clip": 1.02403319, "balance_loss_mlp": 1.04089618, "epoch": 0.3948895235232226, "flos": 14757634494720.0, "grad_norm": 1.757598966626229, "language_loss": 0.80277663, "learning_rate": 2.648853481143928e-06, "loss": 0.82437736, "num_input_tokens_seen": 141084210, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.72265625, "step": 6568, "time_per_iteration": 2.4957528114318848 }, { "auxiliary_loss_clip": 0.01146027, "auxiliary_loss_mlp": 0.01036758, "balance_loss_clip": 1.02216661, "balance_loss_mlp": 1.04205418, "epoch": 0.39494964677589056, "flos": 22565763095040.0, "grad_norm": 3.159617667791372, "language_loss": 0.85088992, "learning_rate": 2.648496083338311e-06, "loss": 0.87271774, "num_input_tokens_seen": 141103895, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 6569, "time_per_iteration": 2.677447557449341 }, { "auxiliary_loss_clip": 0.01164322, "auxiliary_loss_mlp": 0.01036022, "balance_loss_clip": 1.02188969, "balance_loss_mlp": 1.04419899, "epoch": 0.3950097700285585, "flos": 22963186149120.0, "grad_norm": 1.6462851681717192, "language_loss": 0.74512559, "learning_rate": 2.648138662389537e-06, "loss": 0.76712906, "num_input_tokens_seen": 141124000, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75390625, "step": 6570, "time_per_iteration": 2.584989070892334 }, { "auxiliary_loss_clip": 0.01148476, "auxiliary_loss_mlp": 0.01035469, "balance_loss_clip": 1.02070451, "balance_loss_mlp": 1.04339206, "epoch": 0.3950698932812265, "flos": 20595236929920.0, "grad_norm": 1.5639435331009122, "language_loss": 0.7974267, "learning_rate": 2.6477812183103606e-06, "loss": 0.8192662, "num_input_tokens_seen": 141142535, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78125, "step": 6571, "time_per_iteration": 2.6319808959960938 }, { "auxiliary_loss_clip": 0.0113801, "auxiliary_loss_mlp": 0.01039978, "balance_loss_clip": 1.02609563, "balance_loss_mlp": 1.04363465, "epoch": 0.39513001653389446, "flos": 20375786787840.0, "grad_norm": 1.5670552032110223, "language_loss": 0.77646708, "learning_rate": 2.647423751113539e-06, "loss": 0.79824698, "num_input_tokens_seen": 141161575, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 6572, "time_per_iteration": 2.5793070793151855 }, { "auxiliary_loss_clip": 0.01149875, "auxiliary_loss_mlp": 0.01040105, "balance_loss_clip": 1.02520394, "balance_loss_mlp": 1.04300618, "epoch": 0.3951901397865625, "flos": 26463650256000.0, "grad_norm": 1.6622318640881084, "language_loss": 0.74842489, "learning_rate": 2.6470662608118294e-06, "loss": 0.77032471, "num_input_tokens_seen": 141181150, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8046875, "step": 6573, "time_per_iteration": 2.654165029525757 }, { "auxiliary_loss_clip": 0.01126871, "auxiliary_loss_mlp": 0.01033666, "balance_loss_clip": 1.02077329, "balance_loss_mlp": 1.04175222, "epoch": 0.39525026303923044, "flos": 43838345767680.0, "grad_norm": 4.546024769457097, "language_loss": 0.67845982, "learning_rate": 2.64670874741799e-06, "loss": 0.70006514, "num_input_tokens_seen": 141206310, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.76171875, "step": 6574, "time_per_iteration": 2.748828649520874 }, { "auxiliary_loss_clip": 0.01167374, "auxiliary_loss_mlp": 0.01038254, "balance_loss_clip": 1.02321577, "balance_loss_mlp": 1.04251671, "epoch": 0.3953103862918984, "flos": 18040803275520.0, "grad_norm": 2.2294053664103215, "language_loss": 0.71472669, "learning_rate": 2.6463512109447776e-06, "loss": 0.73678303, "num_input_tokens_seen": 141223925, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 6575, "time_per_iteration": 2.6052513122558594 }, { "auxiliary_loss_clip": 0.01142331, "auxiliary_loss_mlp": 0.01042848, "balance_loss_clip": 1.02820945, "balance_loss_mlp": 1.04340136, "epoch": 0.3953705095445664, "flos": 16976015481600.0, "grad_norm": 2.180067490430404, "language_loss": 0.73535043, "learning_rate": 2.645993651404954e-06, "loss": 0.75720227, "num_input_tokens_seen": 141239010, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 6576, "time_per_iteration": 2.5900566577911377 }, { "auxiliary_loss_clip": 0.01119591, "auxiliary_loss_mlp": 0.0103756, "balance_loss_clip": 1.02423847, "balance_loss_mlp": 1.04142094, "epoch": 0.39543063279723434, "flos": 17411144837760.0, "grad_norm": 2.6256739655677364, "language_loss": 0.83638382, "learning_rate": 2.6456360688112785e-06, "loss": 0.85795534, "num_input_tokens_seen": 141252255, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.78125, "step": 6577, "time_per_iteration": 3.9620649814605713 }, { "auxiliary_loss_clip": 0.01137761, "auxiliary_loss_mlp": 0.01037387, "balance_loss_clip": 1.02373755, "balance_loss_mlp": 1.04307127, "epoch": 0.3954907560499023, "flos": 22784207656320.0, "grad_norm": 2.7215975114771056, "language_loss": 0.90824926, "learning_rate": 2.6452784631765117e-06, "loss": 0.93000072, "num_input_tokens_seen": 141269325, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 6578, "time_per_iteration": 2.6464836597442627 }, { "auxiliary_loss_clip": 0.01141826, "auxiliary_loss_mlp": 0.01038153, "balance_loss_clip": 1.02190495, "balance_loss_mlp": 1.04265261, "epoch": 0.39555087930257027, "flos": 21944400698880.0, "grad_norm": 1.8473061226089205, "language_loss": 0.7807411, "learning_rate": 2.6449208345134174e-06, "loss": 0.8025409, "num_input_tokens_seen": 141288505, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8125, "step": 6579, "time_per_iteration": 2.5380380153656006 }, { "auxiliary_loss_clip": 0.01159159, "auxiliary_loss_mlp": 0.01031477, "balance_loss_clip": 1.01608682, "balance_loss_mlp": 1.04220808, "epoch": 0.39561100255523823, "flos": 20404622430720.0, "grad_norm": 1.99938035766093, "language_loss": 0.68429118, "learning_rate": 2.6445631828347566e-06, "loss": 0.7061975, "num_input_tokens_seen": 141303680, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8125, "step": 6580, "time_per_iteration": 2.6663267612457275 }, { "auxiliary_loss_clip": 0.0112802, "auxiliary_loss_mlp": 0.01033837, "balance_loss_clip": 1.01977587, "balance_loss_mlp": 1.04218531, "epoch": 0.3956711258079062, "flos": 27964572986880.0, "grad_norm": 1.954040494240247, "language_loss": 0.590868, "learning_rate": 2.644205508153295e-06, "loss": 0.6124866, "num_input_tokens_seen": 141324090, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76953125, "step": 6581, "time_per_iteration": 2.5903751850128174 }, { "auxiliary_loss_clip": 0.01142743, "auxiliary_loss_mlp": 0.01043081, "balance_loss_clip": 1.02790546, "balance_loss_mlp": 1.04523134, "epoch": 0.39573124906057416, "flos": 14428297670400.0, "grad_norm": 1.8142302072645493, "language_loss": 0.6927892, "learning_rate": 2.6438478104817953e-06, "loss": 0.71464741, "num_input_tokens_seen": 141342235, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.796875, "step": 6582, "time_per_iteration": 2.61354398727417 }, { "auxiliary_loss_clip": 0.01124807, "auxiliary_loss_mlp": 0.01283047, "balance_loss_clip": 1.02099442, "balance_loss_mlp": 1.04469109, "epoch": 0.39579137231324213, "flos": 18733699607040.0, "grad_norm": 2.1191715285243773, "language_loss": 0.76110244, "learning_rate": 2.643490089833023e-06, "loss": 0.78518093, "num_input_tokens_seen": 141361195, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.80078125, "step": 6583, "time_per_iteration": 3.9356515407562256 }, { "auxiliary_loss_clip": 0.01158777, "auxiliary_loss_mlp": 0.01282094, "balance_loss_clip": 1.02061725, "balance_loss_mlp": 1.04387832, "epoch": 0.3958514955659101, "flos": 17676417755520.0, "grad_norm": 1.930416562282844, "language_loss": 0.65884137, "learning_rate": 2.6431323462197453e-06, "loss": 0.68325013, "num_input_tokens_seen": 141378275, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 6584, "time_per_iteration": 2.556593894958496 }, { "auxiliary_loss_clip": 0.01151634, "auxiliary_loss_mlp": 0.01039127, "balance_loss_clip": 1.02368891, "balance_loss_mlp": 1.04297161, "epoch": 0.39591161881857806, "flos": 29309103901440.0, "grad_norm": 2.129668798855553, "language_loss": 0.7247746, "learning_rate": 2.642774579654728e-06, "loss": 0.74668223, "num_input_tokens_seen": 141396960, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8203125, "step": 6585, "time_per_iteration": 4.1811933517456055 }, { "auxiliary_loss_clip": 0.01148796, "auxiliary_loss_mlp": 0.01034424, "balance_loss_clip": 1.01997602, "balance_loss_mlp": 1.0426811, "epoch": 0.3959717420712461, "flos": 25771831332480.0, "grad_norm": 2.1990706284040793, "language_loss": 0.73345482, "learning_rate": 2.6424167901507393e-06, "loss": 0.75528705, "num_input_tokens_seen": 141417320, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.79296875, "step": 6586, "time_per_iteration": 2.5997989177703857 }, { "auxiliary_loss_clip": 0.01142424, "auxiliary_loss_mlp": 0.01035433, "balance_loss_clip": 1.02096081, "balance_loss_mlp": 1.04582, "epoch": 0.39603186532391405, "flos": 20923783655040.0, "grad_norm": 2.1551939313858868, "language_loss": 0.71556127, "learning_rate": 2.6420589777205483e-06, "loss": 0.73733985, "num_input_tokens_seen": 141435985, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 6587, "time_per_iteration": 2.589514970779419 }, { "auxiliary_loss_clip": 0.01085603, "auxiliary_loss_mlp": 0.010086, "balance_loss_clip": 1.00684774, "balance_loss_mlp": 1.0253737, "epoch": 0.396091988576582, "flos": 54880986176640.0, "grad_norm": 0.8909554827293494, "language_loss": 0.61143047, "learning_rate": 2.641701142376924e-06, "loss": 0.6323725, "num_input_tokens_seen": 141486075, "router_z_loss_clip": 0.01757812, "router_z_loss_mlp": 0.24414062, "step": 6588, "time_per_iteration": 3.043123483657837 }, { "auxiliary_loss_clip": 0.01141174, "auxiliary_loss_mlp": 0.01285418, "balance_loss_clip": 1.02352929, "balance_loss_mlp": 1.04442084, "epoch": 0.39615211182925, "flos": 20702896968960.0, "grad_norm": 1.8440323554450289, "language_loss": 0.81431526, "learning_rate": 2.6413432841326364e-06, "loss": 0.8385812, "num_input_tokens_seen": 141505280, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.79296875, "step": 6589, "time_per_iteration": 4.866927623748779 }, { "auxiliary_loss_clip": 0.01143769, "auxiliary_loss_mlp": 0.0103535, "balance_loss_clip": 1.02010357, "balance_loss_mlp": 1.04579377, "epoch": 0.39621223508191794, "flos": 20994312009600.0, "grad_norm": 2.5885588190694775, "language_loss": 0.70138818, "learning_rate": 2.6409854030004564e-06, "loss": 0.7231794, "num_input_tokens_seen": 141523930, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80078125, "step": 6590, "time_per_iteration": 2.6474130153656006 }, { "auxiliary_loss_clip": 0.01134136, "auxiliary_loss_mlp": 0.01046029, "balance_loss_clip": 1.03133655, "balance_loss_mlp": 1.04560196, "epoch": 0.3962723583345859, "flos": 23368833417600.0, "grad_norm": 1.7504711352666016, "language_loss": 0.75698668, "learning_rate": 2.640627498993157e-06, "loss": 0.77878833, "num_input_tokens_seen": 141541320, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.79296875, "step": 6591, "time_per_iteration": 2.6798439025878906 }, { "auxiliary_loss_clip": 0.01187242, "auxiliary_loss_mlp": 0.0104388, "balance_loss_clip": 1.029742, "balance_loss_mlp": 1.04523039, "epoch": 0.39633248158725387, "flos": 25115599808640.0, "grad_norm": 1.7215908428891746, "language_loss": 0.78373861, "learning_rate": 2.6402695721235094e-06, "loss": 0.80604982, "num_input_tokens_seen": 141561880, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.796875, "step": 6592, "time_per_iteration": 2.6799368858337402 }, { "auxiliary_loss_clip": 0.01162772, "auxiliary_loss_mlp": 0.0103297, "balance_loss_clip": 1.01868296, "balance_loss_mlp": 1.04279041, "epoch": 0.39639260483992184, "flos": 39787622236800.0, "grad_norm": 2.095633379193704, "language_loss": 0.69534719, "learning_rate": 2.6399116224042875e-06, "loss": 0.71730465, "num_input_tokens_seen": 141586460, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75390625, "step": 6593, "time_per_iteration": 2.803831100463867 }, { "auxiliary_loss_clip": 0.01180247, "auxiliary_loss_mlp": 0.01040092, "balance_loss_clip": 1.02504802, "balance_loss_mlp": 1.04509163, "epoch": 0.3964527280925898, "flos": 17347045017600.0, "grad_norm": 1.6784839299600844, "language_loss": 0.77529037, "learning_rate": 2.6395536498482666e-06, "loss": 0.79749376, "num_input_tokens_seen": 141605955, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 6594, "time_per_iteration": 2.556246757507324 }, { "auxiliary_loss_clip": 0.01071977, "auxiliary_loss_mlp": 0.01005551, "balance_loss_clip": 1.00382257, "balance_loss_mlp": 1.0208199, "epoch": 0.39651285134525777, "flos": 71717848369920.0, "grad_norm": 0.9533418438031867, "language_loss": 0.63063598, "learning_rate": 2.6391956544682205e-06, "loss": 0.65141118, "num_input_tokens_seen": 141673140, "router_z_loss_clip": 0.01733398, "router_z_loss_mlp": 0.24414062, "step": 6595, "time_per_iteration": 3.2234692573547363 }, { "auxiliary_loss_clip": 0.01155876, "auxiliary_loss_mlp": 0.01045206, "balance_loss_clip": 1.02973843, "balance_loss_mlp": 1.04630291, "epoch": 0.39657297459792573, "flos": 25775710001280.0, "grad_norm": 1.7883334275035971, "language_loss": 0.62898487, "learning_rate": 2.6388376362769258e-06, "loss": 0.65099561, "num_input_tokens_seen": 141692955, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.83203125, "step": 6596, "time_per_iteration": 2.667708158493042 }, { "auxiliary_loss_clip": 0.01130229, "auxiliary_loss_mlp": 0.01040597, "balance_loss_clip": 1.02617335, "balance_loss_mlp": 1.04422545, "epoch": 0.3966330978505937, "flos": 20266115587200.0, "grad_norm": 1.8115959201492866, "language_loss": 0.7882334, "learning_rate": 2.638479595287159e-06, "loss": 0.80994165, "num_input_tokens_seen": 141710680, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 6597, "time_per_iteration": 2.577346086502075 }, { "auxiliary_loss_clip": 0.01153757, "auxiliary_loss_mlp": 0.01290514, "balance_loss_clip": 1.02809823, "balance_loss_mlp": 1.04608917, "epoch": 0.39669322110326166, "flos": 20631183465600.0, "grad_norm": 1.9870371782174636, "language_loss": 0.67515767, "learning_rate": 2.638121531511698e-06, "loss": 0.69960034, "num_input_tokens_seen": 141729860, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.80859375, "step": 6598, "time_per_iteration": 2.6157991886138916 }, { "auxiliary_loss_clip": 0.01123074, "auxiliary_loss_mlp": 0.01043168, "balance_loss_clip": 1.0291791, "balance_loss_mlp": 1.04334641, "epoch": 0.3967533443559297, "flos": 21726063878400.0, "grad_norm": 1.765127274439681, "language_loss": 0.79060471, "learning_rate": 2.637763444963321e-06, "loss": 0.81226718, "num_input_tokens_seen": 141749060, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.796875, "step": 6599, "time_per_iteration": 2.5627565383911133 }, { "auxiliary_loss_clip": 0.01151542, "auxiliary_loss_mlp": 0.0103705, "balance_loss_clip": 1.02077794, "balance_loss_mlp": 1.04485202, "epoch": 0.39681346760859765, "flos": 25484151306240.0, "grad_norm": 1.64816171476934, "language_loss": 0.72817838, "learning_rate": 2.637405335654807e-06, "loss": 0.75006425, "num_input_tokens_seen": 141769860, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.796875, "step": 6600, "time_per_iteration": 2.724771499633789 }, { "auxiliary_loss_clip": 0.01147673, "auxiliary_loss_mlp": 0.01036022, "balance_loss_clip": 1.02174687, "balance_loss_mlp": 1.04186368, "epoch": 0.3968735908612656, "flos": 20959586536320.0, "grad_norm": 1.9636342727172065, "language_loss": 0.85549515, "learning_rate": 2.6370472035989367e-06, "loss": 0.87733209, "num_input_tokens_seen": 141788465, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 6601, "time_per_iteration": 2.595310688018799 }, { "auxiliary_loss_clip": 0.01157472, "auxiliary_loss_mlp": 0.01043915, "balance_loss_clip": 1.02736902, "balance_loss_mlp": 1.04545701, "epoch": 0.3969337141139336, "flos": 10707090531840.0, "grad_norm": 2.635808716595601, "language_loss": 0.70216417, "learning_rate": 2.6366890488084897e-06, "loss": 0.72417808, "num_input_tokens_seen": 141804955, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8515625, "step": 6602, "time_per_iteration": 2.655592918395996 }, { "auxiliary_loss_clip": 0.01122747, "auxiliary_loss_mlp": 0.01045532, "balance_loss_clip": 1.03040433, "balance_loss_mlp": 1.04310894, "epoch": 0.39699383736660154, "flos": 17593714690560.0, "grad_norm": 2.7324472535388415, "language_loss": 0.83186847, "learning_rate": 2.636330871296249e-06, "loss": 0.85355127, "num_input_tokens_seen": 141820025, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.796875, "step": 6603, "time_per_iteration": 2.5464236736297607 }, { "auxiliary_loss_clip": 0.01139257, "auxiliary_loss_mlp": 0.0103923, "balance_loss_clip": 1.02480602, "balance_loss_mlp": 1.04307365, "epoch": 0.3970539606192695, "flos": 17785945301760.0, "grad_norm": 1.5879861001510713, "language_loss": 0.73129368, "learning_rate": 2.635972671074996e-06, "loss": 0.75307858, "num_input_tokens_seen": 141838735, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 6604, "time_per_iteration": 2.600064516067505 }, { "auxiliary_loss_clip": 0.01137052, "auxiliary_loss_mlp": 0.01041443, "balance_loss_clip": 1.0268991, "balance_loss_mlp": 1.04345393, "epoch": 0.3971140838719375, "flos": 24789495208320.0, "grad_norm": 1.4926765670961657, "language_loss": 0.82238209, "learning_rate": 2.6356144481575144e-06, "loss": 0.84416699, "num_input_tokens_seen": 141858090, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7578125, "step": 6605, "time_per_iteration": 2.607724189758301 }, { "auxiliary_loss_clip": 0.01119792, "auxiliary_loss_mlp": 0.01036031, "balance_loss_clip": 1.02278066, "balance_loss_mlp": 1.04276252, "epoch": 0.39717420712460544, "flos": 24243581329920.0, "grad_norm": 3.557109532275828, "language_loss": 0.73791784, "learning_rate": 2.6352562025565885e-06, "loss": 0.75947601, "num_input_tokens_seen": 141877540, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.76953125, "step": 6606, "time_per_iteration": 2.619875907897949 }, { "auxiliary_loss_clip": 0.01152021, "auxiliary_loss_mlp": 0.01039694, "balance_loss_clip": 1.02487683, "balance_loss_mlp": 1.04519796, "epoch": 0.3972343303772734, "flos": 25884698843520.0, "grad_norm": 2.1926316151989442, "language_loss": 0.74177617, "learning_rate": 2.634897934285002e-06, "loss": 0.76369333, "num_input_tokens_seen": 141897315, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 6607, "time_per_iteration": 2.615831136703491 }, { "auxiliary_loss_clip": 0.01135488, "auxiliary_loss_mlp": 0.01040785, "balance_loss_clip": 1.02609277, "balance_loss_mlp": 1.04505086, "epoch": 0.39729445362994137, "flos": 45623716300800.0, "grad_norm": 1.5046276355811492, "language_loss": 0.683761, "learning_rate": 2.6345396433555415e-06, "loss": 0.70552367, "num_input_tokens_seen": 141919580, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 6608, "time_per_iteration": 2.8012335300445557 }, { "auxiliary_loss_clip": 0.01151859, "auxiliary_loss_mlp": 0.01046758, "balance_loss_clip": 1.03015184, "balance_loss_mlp": 1.04353464, "epoch": 0.39735457688260933, "flos": 20193971120640.0, "grad_norm": 1.7291724308339103, "language_loss": 0.74347115, "learning_rate": 2.6341813297809937e-06, "loss": 0.76545739, "num_input_tokens_seen": 141937045, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8125, "step": 6609, "time_per_iteration": 2.5729563236236572 }, { "auxiliary_loss_clip": 0.01140556, "auxiliary_loss_mlp": 0.01037137, "balance_loss_clip": 1.02268279, "balance_loss_mlp": 1.04333687, "epoch": 0.3974147001352773, "flos": 23331163029120.0, "grad_norm": 1.4814454841521183, "language_loss": 0.71567488, "learning_rate": 2.633822993574145e-06, "loss": 0.73745179, "num_input_tokens_seen": 141956695, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 6610, "time_per_iteration": 2.671050786972046 }, { "auxiliary_loss_clip": 0.01126527, "auxiliary_loss_mlp": 0.01031708, "balance_loss_clip": 1.01731956, "balance_loss_mlp": 1.04175079, "epoch": 0.39747482338794526, "flos": 21688644885120.0, "grad_norm": 1.557350037011665, "language_loss": 0.78629309, "learning_rate": 2.633464634747785e-06, "loss": 0.80787539, "num_input_tokens_seen": 141975935, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 6611, "time_per_iteration": 2.603386640548706 }, { "auxiliary_loss_clip": 0.01131563, "auxiliary_loss_mlp": 0.01036859, "balance_loss_clip": 1.02239895, "balance_loss_mlp": 1.04458869, "epoch": 0.3975349466406133, "flos": 30988717816320.0, "grad_norm": 1.743438412863049, "language_loss": 0.7914772, "learning_rate": 2.6331062533147002e-06, "loss": 0.81316143, "num_input_tokens_seen": 141995750, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 6612, "time_per_iteration": 2.6611886024475098 }, { "auxiliary_loss_clip": 0.01071662, "auxiliary_loss_mlp": 0.01001841, "balance_loss_clip": 1.00020742, "balance_loss_mlp": 1.02047992, "epoch": 0.39759506989328125, "flos": 63683948833920.0, "grad_norm": 0.8389679685444932, "language_loss": 0.64915025, "learning_rate": 2.632747849287683e-06, "loss": 0.66988528, "num_input_tokens_seen": 142057655, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.25, "step": 6613, "time_per_iteration": 3.1027371883392334 }, { "auxiliary_loss_clip": 0.01141326, "auxiliary_loss_mlp": 0.01034179, "balance_loss_clip": 1.01924229, "balance_loss_mlp": 1.04474175, "epoch": 0.3976551931459492, "flos": 23695835857920.0, "grad_norm": 2.2975718735390043, "language_loss": 0.71298313, "learning_rate": 2.632389422679523e-06, "loss": 0.73473817, "num_input_tokens_seen": 142076020, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.78515625, "step": 6614, "time_per_iteration": 2.5939574241638184 }, { "auxiliary_loss_clip": 0.01129972, "auxiliary_loss_mlp": 0.010313, "balance_loss_clip": 1.01659596, "balance_loss_mlp": 1.04257452, "epoch": 0.3977153163986172, "flos": 15669657745920.0, "grad_norm": 1.803444012340031, "language_loss": 0.81313008, "learning_rate": 2.63203097350301e-06, "loss": 0.83474284, "num_input_tokens_seen": 142093790, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78515625, "step": 6615, "time_per_iteration": 2.5370407104492188 }, { "auxiliary_loss_clip": 0.0114703, "auxiliary_loss_mlp": 0.01031954, "balance_loss_clip": 1.01827526, "balance_loss_mlp": 1.04223025, "epoch": 0.39777543965128515, "flos": 14064702249600.0, "grad_norm": 2.3089850128762937, "language_loss": 0.66956532, "learning_rate": 2.631672501770938e-06, "loss": 0.69135517, "num_input_tokens_seen": 142110545, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 6616, "time_per_iteration": 2.5425853729248047 }, { "auxiliary_loss_clip": 0.01125539, "auxiliary_loss_mlp": 0.0103915, "balance_loss_clip": 1.02427232, "balance_loss_mlp": 1.04388118, "epoch": 0.3978355629039531, "flos": 23367468700800.0, "grad_norm": 2.06183292661349, "language_loss": 0.83497733, "learning_rate": 2.631314007496099e-06, "loss": 0.85662425, "num_input_tokens_seen": 142128695, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 6617, "time_per_iteration": 2.6288869380950928 }, { "auxiliary_loss_clip": 0.01117654, "auxiliary_loss_mlp": 0.01039861, "balance_loss_clip": 1.02619338, "balance_loss_mlp": 1.04321003, "epoch": 0.3978956861566211, "flos": 19062785036160.0, "grad_norm": 1.6722213024898513, "language_loss": 0.72274166, "learning_rate": 2.6309554906912873e-06, "loss": 0.74431682, "num_input_tokens_seen": 142148375, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 6618, "time_per_iteration": 2.4911258220672607 }, { "auxiliary_loss_clip": 0.0113955, "auxiliary_loss_mlp": 0.01039398, "balance_loss_clip": 1.02231526, "balance_loss_mlp": 1.0435605, "epoch": 0.39795580940928904, "flos": 30227699341440.0, "grad_norm": 1.7959421365773882, "language_loss": 0.65688795, "learning_rate": 2.6305969513692965e-06, "loss": 0.67867744, "num_input_tokens_seen": 142169735, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.78125, "step": 6619, "time_per_iteration": 3.9956212043762207 }, { "auxiliary_loss_clip": 0.01043779, "auxiliary_loss_mlp": 0.01007803, "balance_loss_clip": 1.00613403, "balance_loss_mlp": 1.01878119, "epoch": 0.398015932661957, "flos": 69847224906240.0, "grad_norm": 0.7671024487529271, "language_loss": 0.58243388, "learning_rate": 2.630238389542924e-06, "loss": 0.60294968, "num_input_tokens_seen": 142229520, "router_z_loss_clip": 0.01672363, "router_z_loss_mlp": 0.25, "step": 6620, "time_per_iteration": 3.1034140586853027 }, { "auxiliary_loss_clip": 0.0113837, "auxiliary_loss_mlp": 0.01036687, "balance_loss_clip": 1.02389622, "balance_loss_mlp": 1.04382944, "epoch": 0.39807605591462497, "flos": 20157773189760.0, "grad_norm": 1.6877073433845684, "language_loss": 0.79469228, "learning_rate": 2.629879805224964e-06, "loss": 0.81644285, "num_input_tokens_seen": 142247660, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.76953125, "step": 6621, "time_per_iteration": 2.5727429389953613 }, { "auxiliary_loss_clip": 0.01137928, "auxiliary_loss_mlp": 0.01031224, "balance_loss_clip": 1.01729441, "balance_loss_mlp": 1.04320705, "epoch": 0.39813617916729294, "flos": 21141761339520.0, "grad_norm": 1.8163053996909317, "language_loss": 0.77887392, "learning_rate": 2.629521198428213e-06, "loss": 0.80056542, "num_input_tokens_seen": 142266990, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76953125, "step": 6622, "time_per_iteration": 2.677943706512451 }, { "auxiliary_loss_clip": 0.01136599, "auxiliary_loss_mlp": 0.01037154, "balance_loss_clip": 1.02243745, "balance_loss_mlp": 1.04081011, "epoch": 0.3981963024199609, "flos": 18988485753600.0, "grad_norm": 1.6897812113287953, "language_loss": 0.75064665, "learning_rate": 2.6291625691654702e-06, "loss": 0.77238417, "num_input_tokens_seen": 142287170, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78125, "step": 6623, "time_per_iteration": 2.6121151447296143 }, { "auxiliary_loss_clip": 0.01146053, "auxiliary_loss_mlp": 0.01036794, "balance_loss_clip": 1.02238727, "balance_loss_mlp": 1.04225743, "epoch": 0.39825642567262887, "flos": 16575108808320.0, "grad_norm": 1.7406595935494376, "language_loss": 0.79541719, "learning_rate": 2.6288039174495334e-06, "loss": 0.8172456, "num_input_tokens_seen": 142305405, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76953125, "step": 6624, "time_per_iteration": 2.6688027381896973 }, { "auxiliary_loss_clip": 0.01125635, "auxiliary_loss_mlp": 0.01043265, "balance_loss_clip": 1.02670145, "balance_loss_mlp": 1.04360318, "epoch": 0.39831654892529683, "flos": 22199833290240.0, "grad_norm": 2.235034439497679, "language_loss": 0.83436823, "learning_rate": 2.6284452432932034e-06, "loss": 0.85605723, "num_input_tokens_seen": 142322710, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8203125, "step": 6625, "time_per_iteration": 3.9231655597686768 }, { "auxiliary_loss_clip": 0.01127106, "auxiliary_loss_mlp": 0.01039478, "balance_loss_clip": 1.0245533, "balance_loss_mlp": 1.04111123, "epoch": 0.39837667217796485, "flos": 10487963612160.0, "grad_norm": 1.944083771503959, "language_loss": 0.85969412, "learning_rate": 2.6280865467092787e-06, "loss": 0.88135993, "num_input_tokens_seen": 142338535, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.76953125, "step": 6626, "time_per_iteration": 3.9721949100494385 }, { "auxiliary_loss_clip": 0.01165493, "auxiliary_loss_mlp": 0.01285224, "balance_loss_clip": 1.02292335, "balance_loss_mlp": 1.04332221, "epoch": 0.3984367954306328, "flos": 17965282930560.0, "grad_norm": 2.359795773302978, "language_loss": 0.83062005, "learning_rate": 2.6277278277105604e-06, "loss": 0.85512722, "num_input_tokens_seen": 142354570, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7734375, "step": 6627, "time_per_iteration": 2.652549982070923 }, { "auxiliary_loss_clip": 0.01144956, "auxiliary_loss_mlp": 0.01285698, "balance_loss_clip": 1.02512205, "balance_loss_mlp": 1.04166913, "epoch": 0.3984969186833008, "flos": 22711057608960.0, "grad_norm": 1.4664681164450442, "language_loss": 0.82728195, "learning_rate": 2.627369086309851e-06, "loss": 0.85158849, "num_input_tokens_seen": 142374395, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76171875, "step": 6628, "time_per_iteration": 2.5619571208953857 }, { "auxiliary_loss_clip": 0.01147913, "auxiliary_loss_mlp": 0.01046301, "balance_loss_clip": 1.03219295, "balance_loss_mlp": 1.04265714, "epoch": 0.39855704193596875, "flos": 23405785534080.0, "grad_norm": 1.9295864465313024, "language_loss": 0.70777357, "learning_rate": 2.6270103225199524e-06, "loss": 0.7297157, "num_input_tokens_seen": 142396040, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 6629, "time_per_iteration": 2.701364278793335 }, { "auxiliary_loss_clip": 0.01145007, "auxiliary_loss_mlp": 0.01032763, "balance_loss_clip": 1.01869631, "balance_loss_mlp": 1.04485369, "epoch": 0.3986171651886367, "flos": 21251935330560.0, "grad_norm": 1.8660178013945596, "language_loss": 0.8046478, "learning_rate": 2.626651536353668e-06, "loss": 0.82642549, "num_input_tokens_seen": 142415495, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 6630, "time_per_iteration": 4.112753391265869 }, { "auxiliary_loss_clip": 0.01165867, "auxiliary_loss_mlp": 0.01030492, "balance_loss_clip": 1.01630044, "balance_loss_mlp": 1.04271603, "epoch": 0.3986772884413047, "flos": 12458705258880.0, "grad_norm": 1.859891533064425, "language_loss": 0.74995029, "learning_rate": 2.6262927278238032e-06, "loss": 0.77191389, "num_input_tokens_seen": 142431865, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 6631, "time_per_iteration": 2.670609712600708 }, { "auxiliary_loss_clip": 0.01148455, "auxiliary_loss_mlp": 0.01037078, "balance_loss_clip": 1.02227855, "balance_loss_mlp": 1.04340625, "epoch": 0.39873741169397264, "flos": 19646117907840.0, "grad_norm": 1.7736443083441482, "language_loss": 0.72293878, "learning_rate": 2.6259338969431613e-06, "loss": 0.74479407, "num_input_tokens_seen": 142450595, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78515625, "step": 6632, "time_per_iteration": 2.6536331176757812 }, { "auxiliary_loss_clip": 0.01160814, "auxiliary_loss_mlp": 0.01281571, "balance_loss_clip": 1.02117157, "balance_loss_mlp": 1.04015791, "epoch": 0.3987975349466406, "flos": 21684766216320.0, "grad_norm": 1.8928579215113508, "language_loss": 0.74790788, "learning_rate": 2.6255750437245487e-06, "loss": 0.77233177, "num_input_tokens_seen": 142466650, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7578125, "step": 6633, "time_per_iteration": 2.5659260749816895 }, { "auxiliary_loss_clip": 0.01138425, "auxiliary_loss_mlp": 0.01029209, "balance_loss_clip": 1.01555943, "balance_loss_mlp": 1.0427264, "epoch": 0.3988576581993086, "flos": 23914064937600.0, "grad_norm": 1.7422798200968654, "language_loss": 0.81453514, "learning_rate": 2.625216168180772e-06, "loss": 0.83621144, "num_input_tokens_seen": 142486165, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7734375, "step": 6634, "time_per_iteration": 2.623911142349243 }, { "auxiliary_loss_clip": 0.01140587, "auxiliary_loss_mlp": 0.01029891, "balance_loss_clip": 1.01522207, "balance_loss_mlp": 1.04444981, "epoch": 0.39891778145197654, "flos": 18149899858560.0, "grad_norm": 1.8357419177644771, "language_loss": 0.74609935, "learning_rate": 2.624857270324639e-06, "loss": 0.76780415, "num_input_tokens_seen": 142505035, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 6635, "time_per_iteration": 2.5903003215789795 }, { "auxiliary_loss_clip": 0.01154194, "auxiliary_loss_mlp": 0.0103902, "balance_loss_clip": 1.02304649, "balance_loss_mlp": 1.04417634, "epoch": 0.3989779047046445, "flos": 22595281096320.0, "grad_norm": 2.1845642634328315, "language_loss": 0.66184068, "learning_rate": 2.6244983501689574e-06, "loss": 0.6837728, "num_input_tokens_seen": 142521870, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.828125, "step": 6636, "time_per_iteration": 2.69952392578125 }, { "auxiliary_loss_clip": 0.01129355, "auxiliary_loss_mlp": 0.01037583, "balance_loss_clip": 1.02283716, "balance_loss_mlp": 1.04419303, "epoch": 0.39903802795731247, "flos": 18077216688000.0, "grad_norm": 1.6649584877449042, "language_loss": 0.81806076, "learning_rate": 2.6241394077265352e-06, "loss": 0.83973014, "num_input_tokens_seen": 142540455, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.76171875, "step": 6637, "time_per_iteration": 2.5655441284179688 }, { "auxiliary_loss_clip": 0.01066997, "auxiliary_loss_mlp": 0.01002611, "balance_loss_clip": 1.00088298, "balance_loss_mlp": 1.01599312, "epoch": 0.39909815120998043, "flos": 70441367771520.0, "grad_norm": 0.723041748940864, "language_loss": 0.53210407, "learning_rate": 2.6237804430101835e-06, "loss": 0.55280018, "num_input_tokens_seen": 142599665, "router_z_loss_clip": 0.01733398, "router_z_loss_mlp": 0.24414062, "step": 6638, "time_per_iteration": 3.248227596282959 }, { "auxiliary_loss_clip": 0.01138214, "auxiliary_loss_mlp": 0.01282491, "balance_loss_clip": 1.02147281, "balance_loss_mlp": 1.04334772, "epoch": 0.39915827446264845, "flos": 18549262247040.0, "grad_norm": 2.6364160901203713, "language_loss": 0.75362527, "learning_rate": 2.623421456032712e-06, "loss": 0.77783233, "num_input_tokens_seen": 142618845, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7734375, "step": 6639, "time_per_iteration": 2.5983006954193115 }, { "auxiliary_loss_clip": 0.0116209, "auxiliary_loss_mlp": 0.01033279, "balance_loss_clip": 1.01982665, "balance_loss_mlp": 1.04367018, "epoch": 0.3992183977153164, "flos": 29897249195520.0, "grad_norm": 1.6121057931278593, "language_loss": 0.76377642, "learning_rate": 2.6230624468069326e-06, "loss": 0.78573012, "num_input_tokens_seen": 142640885, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 6640, "time_per_iteration": 2.727919578552246 }, { "auxiliary_loss_clip": 0.01147013, "auxiliary_loss_mlp": 0.01033829, "balance_loss_clip": 1.01989388, "balance_loss_mlp": 1.04229152, "epoch": 0.3992785209679844, "flos": 22565080736640.0, "grad_norm": 1.94074641861393, "language_loss": 0.81511104, "learning_rate": 2.6227034153456573e-06, "loss": 0.83691943, "num_input_tokens_seen": 142659340, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 6641, "time_per_iteration": 2.650902271270752 }, { "auxiliary_loss_clip": 0.01123553, "auxiliary_loss_mlp": 0.01033278, "balance_loss_clip": 1.0186336, "balance_loss_mlp": 1.04558706, "epoch": 0.39933864422065235, "flos": 19682674974720.0, "grad_norm": 2.036714536236183, "language_loss": 0.76835465, "learning_rate": 2.6223443616616985e-06, "loss": 0.78992295, "num_input_tokens_seen": 142677085, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 6642, "time_per_iteration": 2.571730136871338 }, { "auxiliary_loss_clip": 0.01137098, "auxiliary_loss_mlp": 0.01034348, "balance_loss_clip": 1.02007222, "balance_loss_mlp": 1.0423913, "epoch": 0.3993987674733203, "flos": 23038491012480.0, "grad_norm": 2.397497851142217, "language_loss": 0.72492468, "learning_rate": 2.621985285767871e-06, "loss": 0.74663913, "num_input_tokens_seen": 142694595, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76953125, "step": 6643, "time_per_iteration": 2.6187336444854736 }, { "auxiliary_loss_clip": 0.01128561, "auxiliary_loss_mlp": 0.01031413, "balance_loss_clip": 1.01754344, "balance_loss_mlp": 1.04254913, "epoch": 0.3994588907259883, "flos": 19390828970880.0, "grad_norm": 1.5522595916289357, "language_loss": 0.66385055, "learning_rate": 2.621626187676988e-06, "loss": 0.68545026, "num_input_tokens_seen": 142714175, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 6644, "time_per_iteration": 2.5828166007995605 }, { "auxiliary_loss_clip": 0.0111913, "auxiliary_loss_mlp": 0.01038796, "balance_loss_clip": 1.0243119, "balance_loss_mlp": 1.04195213, "epoch": 0.39951901397865625, "flos": 13734395758080.0, "grad_norm": 2.0380919622100278, "language_loss": 0.7816931, "learning_rate": 2.6212670674018657e-06, "loss": 0.80327231, "num_input_tokens_seen": 142730955, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 6645, "time_per_iteration": 2.5598599910736084 }, { "auxiliary_loss_clip": 0.0113042, "auxiliary_loss_mlp": 0.01033785, "balance_loss_clip": 1.01947987, "balance_loss_mlp": 1.04352129, "epoch": 0.3995791372313242, "flos": 23586451966080.0, "grad_norm": 2.28870344407959, "language_loss": 0.69638461, "learning_rate": 2.6209079249553195e-06, "loss": 0.71802664, "num_input_tokens_seen": 142751200, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 6646, "time_per_iteration": 2.596465826034546 }, { "auxiliary_loss_clip": 0.01154474, "auxiliary_loss_mlp": 0.01037054, "balance_loss_clip": 1.02301168, "balance_loss_mlp": 1.04077053, "epoch": 0.3996392604839922, "flos": 21355896268800.0, "grad_norm": 1.8081983021368457, "language_loss": 0.71901095, "learning_rate": 2.6205487603501672e-06, "loss": 0.74092621, "num_input_tokens_seen": 142770170, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 6647, "time_per_iteration": 2.6624066829681396 }, { "auxiliary_loss_clip": 0.01140465, "auxiliary_loss_mlp": 0.01036291, "balance_loss_clip": 1.02343464, "balance_loss_mlp": 1.04108, "epoch": 0.39969938373666014, "flos": 26032255914240.0, "grad_norm": 1.5191206440545262, "language_loss": 0.74469745, "learning_rate": 2.6201895735992255e-06, "loss": 0.76646501, "num_input_tokens_seen": 142792680, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 6648, "time_per_iteration": 2.641111135482788 }, { "auxiliary_loss_clip": 0.01119396, "auxiliary_loss_mlp": 0.010351, "balance_loss_clip": 1.020998, "balance_loss_mlp": 1.04221547, "epoch": 0.3997595069893281, "flos": 20116367786880.0, "grad_norm": 2.2192164937668397, "language_loss": 0.66246301, "learning_rate": 2.6198303647153133e-06, "loss": 0.684008, "num_input_tokens_seen": 142810510, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76953125, "step": 6649, "time_per_iteration": 2.5723702907562256 }, { "auxiliary_loss_clip": 0.01148314, "auxiliary_loss_mlp": 0.01040619, "balance_loss_clip": 1.02616465, "balance_loss_mlp": 1.04434657, "epoch": 0.39981963024199607, "flos": 27783403764480.0, "grad_norm": 1.516193371910558, "language_loss": 0.750741, "learning_rate": 2.61947113371125e-06, "loss": 0.77263033, "num_input_tokens_seen": 142832455, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76953125, "step": 6650, "time_per_iteration": 2.6793019771575928 }, { "auxiliary_loss_clip": 0.01118329, "auxiliary_loss_mlp": 0.01041227, "balance_loss_clip": 1.02733898, "balance_loss_mlp": 1.04133773, "epoch": 0.39987975349466404, "flos": 21944436612480.0, "grad_norm": 1.5540379192732408, "language_loss": 0.72077388, "learning_rate": 2.6191118805998547e-06, "loss": 0.74236947, "num_input_tokens_seen": 142852590, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76953125, "step": 6651, "time_per_iteration": 2.6273133754730225 }, { "auxiliary_loss_clip": 0.01119002, "auxiliary_loss_mlp": 0.01036412, "balance_loss_clip": 1.02296567, "balance_loss_mlp": 1.04160488, "epoch": 0.39993987674733206, "flos": 20704405340160.0, "grad_norm": 2.692957391110787, "language_loss": 0.72969377, "learning_rate": 2.6187526053939497e-06, "loss": 0.75124788, "num_input_tokens_seen": 142870595, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7734375, "step": 6652, "time_per_iteration": 2.5658648014068604 }, { "auxiliary_loss_clip": 0.01067139, "auxiliary_loss_mlp": 0.01018999, "balance_loss_clip": 1.01746166, "balance_loss_mlp": 1.01599038, "epoch": 0.4, "flos": 61525429862400.0, "grad_norm": 0.8335495207485979, "language_loss": 0.60627532, "learning_rate": 2.6183933081063556e-06, "loss": 0.62713671, "num_input_tokens_seen": 142925805, "router_z_loss_clip": 0.01531982, "router_z_loss_mlp": 0.24414062, "step": 6653, "time_per_iteration": 3.1095986366271973 }, { "auxiliary_loss_clip": 0.01144251, "auxiliary_loss_mlp": 0.01037157, "balance_loss_clip": 1.02363837, "balance_loss_mlp": 1.0447638, "epoch": 0.400060123252668, "flos": 14502309644160.0, "grad_norm": 1.9627214717040171, "language_loss": 0.66558003, "learning_rate": 2.6180339887498946e-06, "loss": 0.68739414, "num_input_tokens_seen": 142943145, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 6654, "time_per_iteration": 2.627134323120117 }, { "auxiliary_loss_clip": 0.01127242, "auxiliary_loss_mlp": 0.01037104, "balance_loss_clip": 1.02507019, "balance_loss_mlp": 1.04145002, "epoch": 0.40012024650533595, "flos": 19093308618240.0, "grad_norm": 2.9940444042854772, "language_loss": 0.89892381, "learning_rate": 2.617674647337391e-06, "loss": 0.92056727, "num_input_tokens_seen": 142956925, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.76953125, "step": 6655, "time_per_iteration": 2.5692551136016846 }, { "auxiliary_loss_clip": 0.01135894, "auxiliary_loss_mlp": 0.01030052, "balance_loss_clip": 1.01779091, "balance_loss_mlp": 1.04489863, "epoch": 0.4001803697580039, "flos": 29351012094720.0, "grad_norm": 1.6972679021056511, "language_loss": 0.7332164, "learning_rate": 2.6173152838816673e-06, "loss": 0.75487578, "num_input_tokens_seen": 142978040, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.73046875, "step": 6656, "time_per_iteration": 2.61673903465271 }, { "auxiliary_loss_clip": 0.01118953, "auxiliary_loss_mlp": 0.01046013, "balance_loss_clip": 1.03161275, "balance_loss_mlp": 1.0420146, "epoch": 0.4002404930106719, "flos": 20920048640640.0, "grad_norm": 1.5551951502406849, "language_loss": 0.7304529, "learning_rate": 2.6169558983955496e-06, "loss": 0.75210261, "num_input_tokens_seen": 142998390, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 6657, "time_per_iteration": 2.683406352996826 }, { "auxiliary_loss_clip": 0.01121682, "auxiliary_loss_mlp": 0.01041577, "balance_loss_clip": 1.02643764, "balance_loss_mlp": 1.04381728, "epoch": 0.40030061626333985, "flos": 28405735827840.0, "grad_norm": 1.4554291181915002, "language_loss": 0.79326606, "learning_rate": 2.6165964908918624e-06, "loss": 0.81489873, "num_input_tokens_seen": 143021505, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.77734375, "step": 6658, "time_per_iteration": 2.6687307357788086 }, { "auxiliary_loss_clip": 0.01128913, "auxiliary_loss_mlp": 0.01040688, "balance_loss_clip": 1.02675223, "balance_loss_mlp": 1.04183984, "epoch": 0.4003607395160078, "flos": 25921615046400.0, "grad_norm": 1.7823922653340756, "language_loss": 0.7725203, "learning_rate": 2.6162370613834333e-06, "loss": 0.79421628, "num_input_tokens_seen": 143041375, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 6659, "time_per_iteration": 2.7092385292053223 }, { "auxiliary_loss_clip": 0.01111069, "auxiliary_loss_mlp": 0.01029007, "balance_loss_clip": 1.01690078, "balance_loss_mlp": 1.04050493, "epoch": 0.4004208627686758, "flos": 20768648814720.0, "grad_norm": 2.6408762693114545, "language_loss": 0.72571003, "learning_rate": 2.6158776098830884e-06, "loss": 0.74711078, "num_input_tokens_seen": 143058725, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.70703125, "step": 6660, "time_per_iteration": 3.962313652038574 }, { "auxiliary_loss_clip": 0.01136286, "auxiliary_loss_mlp": 0.010346, "balance_loss_clip": 1.02177906, "balance_loss_mlp": 1.04262459, "epoch": 0.40048098602134374, "flos": 24681224638080.0, "grad_norm": 1.9286964613792905, "language_loss": 0.71337974, "learning_rate": 2.6155181364036556e-06, "loss": 0.73508853, "num_input_tokens_seen": 143076995, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7578125, "step": 6661, "time_per_iteration": 2.639124631881714 }, { "auxiliary_loss_clip": 0.01122529, "auxiliary_loss_mlp": 0.01041858, "balance_loss_clip": 1.02761281, "balance_loss_mlp": 1.04477143, "epoch": 0.4005411092740117, "flos": 23185688947200.0, "grad_norm": 1.6332895089126138, "language_loss": 0.75423265, "learning_rate": 2.615158640957964e-06, "loss": 0.77587652, "num_input_tokens_seen": 143096780, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 6662, "time_per_iteration": 2.6567130088806152 }, { "auxiliary_loss_clip": 0.01129997, "auxiliary_loss_mlp": 0.01031389, "balance_loss_clip": 1.01674998, "balance_loss_mlp": 1.04121733, "epoch": 0.4006012325266797, "flos": 17522324409600.0, "grad_norm": 2.015738128231264, "language_loss": 0.65974784, "learning_rate": 2.614799123558842e-06, "loss": 0.68136168, "num_input_tokens_seen": 143112590, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 6663, "time_per_iteration": 2.5601227283477783 }, { "auxiliary_loss_clip": 0.01128025, "auxiliary_loss_mlp": 0.01030667, "balance_loss_clip": 1.0174768, "balance_loss_mlp": 1.04179955, "epoch": 0.40066135577934764, "flos": 19857200181120.0, "grad_norm": 2.2829796994336164, "language_loss": 0.86014152, "learning_rate": 2.6144395842191227e-06, "loss": 0.88172841, "num_input_tokens_seen": 143130220, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.76953125, "step": 6664, "time_per_iteration": 2.5630977153778076 }, { "auxiliary_loss_clip": 0.01129337, "auxiliary_loss_mlp": 0.01031091, "balance_loss_clip": 1.01755524, "balance_loss_mlp": 1.04304445, "epoch": 0.40072147903201566, "flos": 18150007599360.0, "grad_norm": 2.056510829865611, "language_loss": 0.84989488, "learning_rate": 2.6140800229516337e-06, "loss": 0.87149912, "num_input_tokens_seen": 143147160, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7734375, "step": 6665, "time_per_iteration": 2.5477075576782227 }, { "auxiliary_loss_clip": 0.01132181, "auxiliary_loss_mlp": 0.01033188, "balance_loss_clip": 1.01913357, "balance_loss_mlp": 1.04396343, "epoch": 0.4007816022846836, "flos": 18661267831680.0, "grad_norm": 1.6372607975201319, "language_loss": 0.78306842, "learning_rate": 2.613720439769208e-06, "loss": 0.80472213, "num_input_tokens_seen": 143164605, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 6666, "time_per_iteration": 3.915327787399292 }, { "auxiliary_loss_clip": 0.01130252, "auxiliary_loss_mlp": 0.01030852, "balance_loss_clip": 1.016469, "balance_loss_mlp": 1.04367065, "epoch": 0.4008417255373516, "flos": 25703170485120.0, "grad_norm": 1.8707468688495923, "language_loss": 0.73188818, "learning_rate": 2.6133608346846794e-06, "loss": 0.75349927, "num_input_tokens_seen": 143183965, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.77734375, "step": 6667, "time_per_iteration": 4.330399513244629 }, { "auxiliary_loss_clip": 0.01132291, "auxiliary_loss_mlp": 0.01044487, "balance_loss_clip": 1.02888298, "balance_loss_mlp": 1.04401195, "epoch": 0.40090184879001955, "flos": 22858614679680.0, "grad_norm": 1.3617809104187186, "language_loss": 0.75967121, "learning_rate": 2.61300120771088e-06, "loss": 0.78143895, "num_input_tokens_seen": 143204965, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.7890625, "step": 6668, "time_per_iteration": 2.5507569313049316 }, { "auxiliary_loss_clip": 0.01149286, "auxiliary_loss_mlp": 0.01035387, "balance_loss_clip": 1.02087975, "balance_loss_mlp": 1.04710436, "epoch": 0.4009619720426875, "flos": 29059848449280.0, "grad_norm": 1.7815252266141697, "language_loss": 0.81833899, "learning_rate": 2.6126415588606443e-06, "loss": 0.84018576, "num_input_tokens_seen": 143225015, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7578125, "step": 6669, "time_per_iteration": 2.7315754890441895 }, { "auxiliary_loss_clip": 0.01061696, "auxiliary_loss_mlp": 0.01006527, "balance_loss_clip": 1.00503659, "balance_loss_mlp": 1.01942253, "epoch": 0.4010220952953555, "flos": 66059763131520.0, "grad_norm": 0.7020726639856898, "language_loss": 0.53332877, "learning_rate": 2.6122818881468072e-06, "loss": 0.55401099, "num_input_tokens_seen": 143294925, "router_z_loss_clip": 0.01489258, "router_z_loss_mlp": 0.2421875, "step": 6670, "time_per_iteration": 3.3086729049682617 }, { "auxiliary_loss_clip": 0.01155675, "auxiliary_loss_mlp": 0.01033786, "balance_loss_clip": 1.01958847, "balance_loss_mlp": 1.04358256, "epoch": 0.40108221854802345, "flos": 29642822184960.0, "grad_norm": 1.5717651507142418, "language_loss": 0.88766336, "learning_rate": 2.6119221955822044e-06, "loss": 0.90955794, "num_input_tokens_seen": 143314170, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 6671, "time_per_iteration": 4.09544038772583 }, { "auxiliary_loss_clip": 0.01129011, "auxiliary_loss_mlp": 0.01031562, "balance_loss_clip": 1.01672602, "balance_loss_mlp": 1.04261863, "epoch": 0.4011423418006914, "flos": 19929560129280.0, "grad_norm": 2.0022559289067727, "language_loss": 0.78650516, "learning_rate": 2.611562481179673e-06, "loss": 0.80811083, "num_input_tokens_seen": 143330050, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 6672, "time_per_iteration": 2.5406036376953125 }, { "auxiliary_loss_clip": 0.0114217, "auxiliary_loss_mlp": 0.01030897, "balance_loss_clip": 1.01752734, "balance_loss_mlp": 1.04108286, "epoch": 0.4012024650533594, "flos": 20084299920000.0, "grad_norm": 2.066763145405157, "language_loss": 0.62947649, "learning_rate": 2.611202744952049e-06, "loss": 0.65120721, "num_input_tokens_seen": 143348650, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 6673, "time_per_iteration": 2.5909602642059326 }, { "auxiliary_loss_clip": 0.01137544, "auxiliary_loss_mlp": 0.01277728, "balance_loss_clip": 1.01759851, "balance_loss_mlp": 1.04315662, "epoch": 0.40126258830602735, "flos": 21695719864320.0, "grad_norm": 1.4389311618645588, "language_loss": 0.80087227, "learning_rate": 2.610842986912172e-06, "loss": 0.82502496, "num_input_tokens_seen": 143370275, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.765625, "step": 6674, "time_per_iteration": 2.6508219242095947 }, { "auxiliary_loss_clip": 0.01158967, "auxiliary_loss_mlp": 0.01034438, "balance_loss_clip": 1.02059233, "balance_loss_mlp": 1.04377937, "epoch": 0.4013227115586953, "flos": 12020379592320.0, "grad_norm": 3.3425792507180256, "language_loss": 0.82373202, "learning_rate": 2.61048320707288e-06, "loss": 0.84566605, "num_input_tokens_seen": 143385390, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.796875, "step": 6675, "time_per_iteration": 2.615662097930908 }, { "auxiliary_loss_clip": 0.01128141, "auxiliary_loss_mlp": 0.01036665, "balance_loss_clip": 1.02157342, "balance_loss_mlp": 1.04299021, "epoch": 0.4013828348113633, "flos": 25447522412160.0, "grad_norm": 1.71965384131624, "language_loss": 0.93439746, "learning_rate": 2.6101234054470118e-06, "loss": 0.95604551, "num_input_tokens_seen": 143404215, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7578125, "step": 6676, "time_per_iteration": 2.6153817176818848 }, { "auxiliary_loss_clip": 0.01135357, "auxiliary_loss_mlp": 0.01040751, "balance_loss_clip": 1.02472973, "balance_loss_mlp": 1.04422402, "epoch": 0.40144295806403124, "flos": 18582946225920.0, "grad_norm": 1.7352406214883516, "language_loss": 0.79074323, "learning_rate": 2.6097635820474095e-06, "loss": 0.81250429, "num_input_tokens_seen": 143422245, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.82421875, "step": 6677, "time_per_iteration": 2.5031161308288574 }, { "auxiliary_loss_clip": 0.01140067, "auxiliary_loss_mlp": 0.01034904, "balance_loss_clip": 1.0209682, "balance_loss_mlp": 1.04586077, "epoch": 0.4015030813166992, "flos": 22930220442240.0, "grad_norm": 1.9682641834149095, "language_loss": 0.8369295, "learning_rate": 2.609403736886913e-06, "loss": 0.85867918, "num_input_tokens_seen": 143443130, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 6678, "time_per_iteration": 2.5970749855041504 }, { "auxiliary_loss_clip": 0.01117632, "auxiliary_loss_mlp": 0.01036919, "balance_loss_clip": 1.02325141, "balance_loss_mlp": 1.04244304, "epoch": 0.4015632045693672, "flos": 20595057361920.0, "grad_norm": 2.856640982442274, "language_loss": 0.63635951, "learning_rate": 2.6090438699783655e-06, "loss": 0.65790498, "num_input_tokens_seen": 143461385, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75390625, "step": 6679, "time_per_iteration": 2.5562751293182373 }, { "auxiliary_loss_clip": 0.01136582, "auxiliary_loss_mlp": 0.01028892, "balance_loss_clip": 1.01569533, "balance_loss_mlp": 1.04364061, "epoch": 0.4016233278220352, "flos": 23438930808960.0, "grad_norm": 1.5048210303922493, "language_loss": 0.78470081, "learning_rate": 2.608683981334608e-06, "loss": 0.8063556, "num_input_tokens_seen": 143481750, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75, "step": 6680, "time_per_iteration": 2.577716588973999 }, { "auxiliary_loss_clip": 0.01154835, "auxiliary_loss_mlp": 0.01294991, "balance_loss_clip": 1.03285944, "balance_loss_mlp": 1.04682767, "epoch": 0.40168345107470316, "flos": 21431057477760.0, "grad_norm": 1.8856836889287671, "language_loss": 0.75933093, "learning_rate": 2.6083240709684856e-06, "loss": 0.78382921, "num_input_tokens_seen": 143501540, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8125, "step": 6681, "time_per_iteration": 2.6160268783569336 }, { "auxiliary_loss_clip": 0.01158285, "auxiliary_loss_mlp": 0.0103199, "balance_loss_clip": 1.01839375, "balance_loss_mlp": 1.04472399, "epoch": 0.4017435743273711, "flos": 22857214049280.0, "grad_norm": 2.129469479122103, "language_loss": 0.63707525, "learning_rate": 2.6079641388928417e-06, "loss": 0.65897799, "num_input_tokens_seen": 143520530, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.77734375, "step": 6682, "time_per_iteration": 2.5821821689605713 }, { "auxiliary_loss_clip": 0.01159197, "auxiliary_loss_mlp": 0.01035868, "balance_loss_clip": 1.02204585, "balance_loss_mlp": 1.04484355, "epoch": 0.4018036975800391, "flos": 28622312881920.0, "grad_norm": 1.7309841210948678, "language_loss": 0.73136687, "learning_rate": 2.6076041851205214e-06, "loss": 0.75331748, "num_input_tokens_seen": 143540210, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78515625, "step": 6683, "time_per_iteration": 2.667983055114746 }, { "auxiliary_loss_clip": 0.01128899, "auxiliary_loss_mlp": 0.01042906, "balance_loss_clip": 1.02944136, "balance_loss_mlp": 1.04317236, "epoch": 0.40186382083270705, "flos": 26651212099200.0, "grad_norm": 1.4312383189291062, "language_loss": 0.72898185, "learning_rate": 2.6072442096643707e-06, "loss": 0.75069994, "num_input_tokens_seen": 143560940, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76953125, "step": 6684, "time_per_iteration": 2.575432300567627 }, { "auxiliary_loss_clip": 0.01054011, "auxiliary_loss_mlp": 0.01004842, "balance_loss_clip": 1.00312519, "balance_loss_mlp": 1.02041888, "epoch": 0.401923944085375, "flos": 59259969123840.0, "grad_norm": 0.79994371110896, "language_loss": 0.60351658, "learning_rate": 2.606884212537236e-06, "loss": 0.6241051, "num_input_tokens_seen": 143624015, "router_z_loss_clip": 0.01721191, "router_z_loss_mlp": 0.24414062, "step": 6685, "time_per_iteration": 3.3124163150787354 }, { "auxiliary_loss_clip": 0.01133979, "auxiliary_loss_mlp": 0.01036044, "balance_loss_clip": 1.02198315, "balance_loss_mlp": 1.04464769, "epoch": 0.401984067338043, "flos": 16982803152000.0, "grad_norm": 1.683836230923671, "language_loss": 0.69853276, "learning_rate": 2.6065241937519653e-06, "loss": 0.72023296, "num_input_tokens_seen": 143642750, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80078125, "step": 6686, "time_per_iteration": 2.5668020248413086 }, { "auxiliary_loss_clip": 0.01138474, "auxiliary_loss_mlp": 0.01032976, "balance_loss_clip": 1.01935661, "balance_loss_mlp": 1.04411829, "epoch": 0.40204419059071095, "flos": 24972496024320.0, "grad_norm": 2.2833330409689645, "language_loss": 0.74731016, "learning_rate": 2.6061641533214062e-06, "loss": 0.76902473, "num_input_tokens_seen": 143664515, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 6687, "time_per_iteration": 2.6769776344299316 }, { "auxiliary_loss_clip": 0.01142032, "auxiliary_loss_mlp": 0.01038803, "balance_loss_clip": 1.02461147, "balance_loss_mlp": 1.04611647, "epoch": 0.4021043138433789, "flos": 23477463123840.0, "grad_norm": 1.7506406815719653, "language_loss": 0.7073015, "learning_rate": 2.6058040912584075e-06, "loss": 0.72910988, "num_input_tokens_seen": 143683135, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78125, "step": 6688, "time_per_iteration": 2.5316689014434814 }, { "auxiliary_loss_clip": 0.01142412, "auxiliary_loss_mlp": 0.01041095, "balance_loss_clip": 1.02645028, "balance_loss_mlp": 1.04379082, "epoch": 0.4021644370960469, "flos": 25995806588160.0, "grad_norm": 2.9428066074769004, "language_loss": 0.64251179, "learning_rate": 2.605444007575819e-06, "loss": 0.66434687, "num_input_tokens_seen": 143703985, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 6689, "time_per_iteration": 2.599358320236206 }, { "auxiliary_loss_clip": 0.01133307, "auxiliary_loss_mlp": 0.01031544, "balance_loss_clip": 1.01730418, "balance_loss_mlp": 1.04416621, "epoch": 0.40222456034871484, "flos": 13587987922560.0, "grad_norm": 2.792879441020896, "language_loss": 0.73567951, "learning_rate": 2.605083902286491e-06, "loss": 0.75732803, "num_input_tokens_seen": 143719245, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 6690, "time_per_iteration": 2.5434112548828125 }, { "auxiliary_loss_clip": 0.01146198, "auxiliary_loss_mlp": 0.01035994, "balance_loss_clip": 1.02147472, "balance_loss_mlp": 1.04262137, "epoch": 0.4022846836013828, "flos": 24278019494400.0, "grad_norm": 1.5507859721382065, "language_loss": 0.74608546, "learning_rate": 2.6047237754032755e-06, "loss": 0.76790738, "num_input_tokens_seen": 143739575, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.765625, "step": 6691, "time_per_iteration": 2.6977334022521973 }, { "auxiliary_loss_clip": 0.01142098, "auxiliary_loss_mlp": 0.01040165, "balance_loss_clip": 1.02590728, "balance_loss_mlp": 1.0452466, "epoch": 0.40234480685405083, "flos": 20151596050560.0, "grad_norm": 1.56911001895633, "language_loss": 0.7242462, "learning_rate": 2.6043636269390245e-06, "loss": 0.74606884, "num_input_tokens_seen": 143758515, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 6692, "time_per_iteration": 2.5748987197875977 }, { "auxiliary_loss_clip": 0.01156978, "auxiliary_loss_mlp": 0.01040693, "balance_loss_clip": 1.02705526, "balance_loss_mlp": 1.04287863, "epoch": 0.4024049301067188, "flos": 22930220442240.0, "grad_norm": 1.983754261982233, "language_loss": 0.84328806, "learning_rate": 2.6040034569065893e-06, "loss": 0.86526477, "num_input_tokens_seen": 143776770, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 6693, "time_per_iteration": 2.636993646621704 }, { "auxiliary_loss_clip": 0.01151154, "auxiliary_loss_mlp": 0.01041899, "balance_loss_clip": 1.02821362, "balance_loss_mlp": 1.04597545, "epoch": 0.40246505335938676, "flos": 36028421487360.0, "grad_norm": 1.674513337752113, "language_loss": 0.70716441, "learning_rate": 2.6036432653188254e-06, "loss": 0.72909498, "num_input_tokens_seen": 143798450, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 6694, "time_per_iteration": 2.6821630001068115 }, { "auxiliary_loss_clip": 0.01137057, "auxiliary_loss_mlp": 0.01039663, "balance_loss_clip": 1.02507782, "balance_loss_mlp": 1.04217386, "epoch": 0.4025251766120547, "flos": 20594303176320.0, "grad_norm": 2.1625642401735834, "language_loss": 0.68035424, "learning_rate": 2.603283052188585e-06, "loss": 0.70212144, "num_input_tokens_seen": 143816995, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 6695, "time_per_iteration": 2.5086708068847656 }, { "auxiliary_loss_clip": 0.01135853, "auxiliary_loss_mlp": 0.01032212, "balance_loss_clip": 1.01868176, "balance_loss_mlp": 1.0410589, "epoch": 0.4025852998647227, "flos": 64523932381440.0, "grad_norm": 2.370353546660614, "language_loss": 0.79318088, "learning_rate": 2.602922817528725e-06, "loss": 0.81486154, "num_input_tokens_seen": 143842090, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76953125, "step": 6696, "time_per_iteration": 2.9719130992889404 }, { "auxiliary_loss_clip": 0.01140484, "auxiliary_loss_mlp": 0.01039824, "balance_loss_clip": 1.02476132, "balance_loss_mlp": 1.04438925, "epoch": 0.40264542311739066, "flos": 20886292834560.0, "grad_norm": 2.242950600760952, "language_loss": 0.71244085, "learning_rate": 2.6025625613521005e-06, "loss": 0.73424393, "num_input_tokens_seen": 143860800, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.78125, "step": 6697, "time_per_iteration": 2.579944133758545 }, { "auxiliary_loss_clip": 0.01129662, "auxiliary_loss_mlp": 0.01038864, "balance_loss_clip": 1.02511907, "balance_loss_mlp": 1.04446661, "epoch": 0.4027055463700586, "flos": 26250197685120.0, "grad_norm": 6.113860534617246, "language_loss": 0.61620927, "learning_rate": 2.602202283671568e-06, "loss": 0.63789451, "num_input_tokens_seen": 143878950, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.76171875, "step": 6698, "time_per_iteration": 2.7016866207122803 }, { "auxiliary_loss_clip": 0.01130412, "auxiliary_loss_mlp": 0.0103646, "balance_loss_clip": 1.02246439, "balance_loss_mlp": 1.04362535, "epoch": 0.4027656696227266, "flos": 20631398947200.0, "grad_norm": 1.8338262903363076, "language_loss": 0.76697588, "learning_rate": 2.601841984499985e-06, "loss": 0.78864455, "num_input_tokens_seen": 143898385, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 6699, "time_per_iteration": 2.6144845485687256 }, { "auxiliary_loss_clip": 0.01144677, "auxiliary_loss_mlp": 0.01030456, "balance_loss_clip": 1.01639557, "balance_loss_mlp": 1.04317033, "epoch": 0.40282579287539455, "flos": 22346277039360.0, "grad_norm": 1.5902269093467511, "language_loss": 0.79819143, "learning_rate": 2.6014816638502094e-06, "loss": 0.81994283, "num_input_tokens_seen": 143918795, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 6700, "time_per_iteration": 2.6963512897491455 }, { "auxiliary_loss_clip": 0.01141654, "auxiliary_loss_mlp": 0.01040446, "balance_loss_clip": 1.02506804, "balance_loss_mlp": 1.04185927, "epoch": 0.4028859161280625, "flos": 29274988959360.0, "grad_norm": 1.7724478261996721, "language_loss": 0.74807322, "learning_rate": 2.601121321735101e-06, "loss": 0.76989424, "num_input_tokens_seen": 143938245, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8203125, "step": 6701, "time_per_iteration": 2.6571621894836426 }, { "auxiliary_loss_clip": 0.01138952, "auxiliary_loss_mlp": 0.01036198, "balance_loss_clip": 1.02250695, "balance_loss_mlp": 1.04322183, "epoch": 0.4029460393807305, "flos": 28622312881920.0, "grad_norm": 1.568104117813557, "language_loss": 0.65579689, "learning_rate": 2.6007609581675183e-06, "loss": 0.67754835, "num_input_tokens_seen": 143960995, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.77734375, "step": 6702, "time_per_iteration": 4.078309774398804 }, { "auxiliary_loss_clip": 0.01139786, "auxiliary_loss_mlp": 0.01041671, "balance_loss_clip": 1.02646565, "balance_loss_mlp": 1.04252493, "epoch": 0.40300616263339845, "flos": 22601925112320.0, "grad_norm": 1.4875485670532762, "language_loss": 0.65793717, "learning_rate": 2.600400573160323e-06, "loss": 0.67975175, "num_input_tokens_seen": 143979910, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.796875, "step": 6703, "time_per_iteration": 2.6026320457458496 }, { "auxiliary_loss_clip": 0.0118084, "auxiliary_loss_mlp": 0.01035541, "balance_loss_clip": 1.02097321, "balance_loss_mlp": 1.04626322, "epoch": 0.4030662858860664, "flos": 25520313323520.0, "grad_norm": 1.901021054649363, "language_loss": 0.82136422, "learning_rate": 2.6000401667263755e-06, "loss": 0.84352803, "num_input_tokens_seen": 144000095, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80859375, "step": 6704, "time_per_iteration": 2.656102418899536 }, { "auxiliary_loss_clip": 0.01140664, "auxiliary_loss_mlp": 0.01041126, "balance_loss_clip": 1.0268327, "balance_loss_mlp": 1.04272437, "epoch": 0.40312640913873443, "flos": 23586703361280.0, "grad_norm": 1.6247087468761414, "language_loss": 0.7305876, "learning_rate": 2.5996797388785373e-06, "loss": 0.75240546, "num_input_tokens_seen": 144019695, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.80078125, "step": 6705, "time_per_iteration": 2.6132678985595703 }, { "auxiliary_loss_clip": 0.01127042, "auxiliary_loss_mlp": 0.01037811, "balance_loss_clip": 1.02394652, "balance_loss_mlp": 1.04172277, "epoch": 0.4031865323914024, "flos": 20011042131840.0, "grad_norm": 1.8100549824139038, "language_loss": 0.66375834, "learning_rate": 2.5993192896296727e-06, "loss": 0.68540686, "num_input_tokens_seen": 144038525, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 6706, "time_per_iteration": 2.555389881134033 }, { "auxiliary_loss_clip": 0.0112987, "auxiliary_loss_mlp": 0.01043783, "balance_loss_clip": 1.02920341, "balance_loss_mlp": 1.04307222, "epoch": 0.40324665564407036, "flos": 21871430219520.0, "grad_norm": 1.547712607787018, "language_loss": 0.71424955, "learning_rate": 2.5989588189926433e-06, "loss": 0.73598611, "num_input_tokens_seen": 144059485, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.77734375, "step": 6707, "time_per_iteration": 4.0249059200286865 }, { "auxiliary_loss_clip": 0.01132429, "auxiliary_loss_mlp": 0.01035296, "balance_loss_clip": 1.0206691, "balance_loss_mlp": 1.04044318, "epoch": 0.4033067788967383, "flos": 23878728933120.0, "grad_norm": 1.6700852535821276, "language_loss": 0.79886377, "learning_rate": 2.598598326980315e-06, "loss": 0.82054102, "num_input_tokens_seen": 144080265, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7421875, "step": 6708, "time_per_iteration": 2.637397289276123 }, { "auxiliary_loss_clip": 0.01157782, "auxiliary_loss_mlp": 0.01038165, "balance_loss_clip": 1.02332306, "balance_loss_mlp": 1.0405407, "epoch": 0.4033669021494063, "flos": 17419907756160.0, "grad_norm": 2.02370180249845, "language_loss": 0.83017671, "learning_rate": 2.5982378136055525e-06, "loss": 0.85213614, "num_input_tokens_seen": 144098040, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.81640625, "step": 6709, "time_per_iteration": 4.087666034698486 }, { "auxiliary_loss_clip": 0.01133577, "auxiliary_loss_mlp": 0.01039287, "balance_loss_clip": 1.02426696, "balance_loss_mlp": 1.04462612, "epoch": 0.40342702540207426, "flos": 29600554855680.0, "grad_norm": 1.9938007274307896, "language_loss": 0.70970505, "learning_rate": 2.597877278881221e-06, "loss": 0.73143375, "num_input_tokens_seen": 144118265, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 6710, "time_per_iteration": 2.637842893600464 }, { "auxiliary_loss_clip": 0.01138582, "auxiliary_loss_mlp": 0.0103852, "balance_loss_clip": 1.0233742, "balance_loss_mlp": 1.04213083, "epoch": 0.4034871486547422, "flos": 11284605400320.0, "grad_norm": 1.9846785199108519, "language_loss": 0.84586877, "learning_rate": 2.5975167228201875e-06, "loss": 0.86763978, "num_input_tokens_seen": 144133865, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.78125, "step": 6711, "time_per_iteration": 2.552435874938965 }, { "auxiliary_loss_clip": 0.01131691, "auxiliary_loss_mlp": 0.01034187, "balance_loss_clip": 1.01950073, "balance_loss_mlp": 1.04400468, "epoch": 0.4035472719074102, "flos": 15552839738880.0, "grad_norm": 2.003922192874981, "language_loss": 0.76278085, "learning_rate": 2.5971561454353185e-06, "loss": 0.78443962, "num_input_tokens_seen": 144150125, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78515625, "step": 6712, "time_per_iteration": 2.5890679359436035 }, { "auxiliary_loss_clip": 0.01143236, "auxiliary_loss_mlp": 0.01038569, "balance_loss_clip": 1.02372789, "balance_loss_mlp": 1.04619503, "epoch": 0.40360739516007815, "flos": 24674365140480.0, "grad_norm": 1.9034738679775012, "language_loss": 0.7874651, "learning_rate": 2.596795546739483e-06, "loss": 0.8092832, "num_input_tokens_seen": 144169295, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 6713, "time_per_iteration": 4.81911563873291 }, { "auxiliary_loss_clip": 0.01149585, "auxiliary_loss_mlp": 0.01036063, "balance_loss_clip": 1.02116787, "balance_loss_mlp": 1.04236925, "epoch": 0.4036675184127461, "flos": 17304095329920.0, "grad_norm": 1.7697903907181576, "language_loss": 0.8782652, "learning_rate": 2.59643492674555e-06, "loss": 0.90012169, "num_input_tokens_seen": 144185790, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 6714, "time_per_iteration": 2.5888962745666504 }, { "auxiliary_loss_clip": 0.01131178, "auxiliary_loss_mlp": 0.01043963, "balance_loss_clip": 1.02920532, "balance_loss_mlp": 1.04347444, "epoch": 0.4037276416654141, "flos": 19864023765120.0, "grad_norm": 1.5209097005618697, "language_loss": 0.68827164, "learning_rate": 2.596074285466388e-06, "loss": 0.71002305, "num_input_tokens_seen": 144205190, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7890625, "step": 6715, "time_per_iteration": 2.6922125816345215 }, { "auxiliary_loss_clip": 0.01159904, "auxiliary_loss_mlp": 0.0103575, "balance_loss_clip": 1.02060461, "balance_loss_mlp": 1.04288912, "epoch": 0.40378776491808205, "flos": 18296271780480.0, "grad_norm": 2.1737802461292812, "language_loss": 0.77149081, "learning_rate": 2.595713622914869e-06, "loss": 0.79344738, "num_input_tokens_seen": 144222705, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8125, "step": 6716, "time_per_iteration": 2.572908401489258 }, { "auxiliary_loss_clip": 0.01154523, "auxiliary_loss_mlp": 0.01038974, "balance_loss_clip": 1.02487159, "balance_loss_mlp": 1.04233563, "epoch": 0.40384788817075, "flos": 15049372757760.0, "grad_norm": 2.2451333706606658, "language_loss": 0.76069278, "learning_rate": 2.595352939103862e-06, "loss": 0.78262776, "num_input_tokens_seen": 144239545, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 6717, "time_per_iteration": 2.6032278537750244 }, { "auxiliary_loss_clip": 0.01064757, "auxiliary_loss_mlp": 0.01003608, "balance_loss_clip": 1.0019989, "balance_loss_mlp": 1.02228236, "epoch": 0.40390801142341803, "flos": 61929927895680.0, "grad_norm": 0.9205815675447362, "language_loss": 0.60664225, "learning_rate": 2.594992234046241e-06, "loss": 0.62732589, "num_input_tokens_seen": 144288145, "router_z_loss_clip": 0.01611328, "router_z_loss_mlp": 0.24511719, "step": 6718, "time_per_iteration": 3.0344889163970947 }, { "auxiliary_loss_clip": 0.01127808, "auxiliary_loss_mlp": 0.01039836, "balance_loss_clip": 1.02339721, "balance_loss_mlp": 1.04468608, "epoch": 0.403968134676086, "flos": 22738779930240.0, "grad_norm": 1.8961582928882021, "language_loss": 0.74655503, "learning_rate": 2.594631507754877e-06, "loss": 0.76823145, "num_input_tokens_seen": 144302315, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.83203125, "step": 6719, "time_per_iteration": 2.546370506286621 }, { "auxiliary_loss_clip": 0.01132484, "auxiliary_loss_mlp": 0.01043974, "balance_loss_clip": 1.02795219, "balance_loss_mlp": 1.04393983, "epoch": 0.40402825792875396, "flos": 19784409269760.0, "grad_norm": 2.6876473476104823, "language_loss": 0.81433618, "learning_rate": 2.594270760242644e-06, "loss": 0.8361007, "num_input_tokens_seen": 144318990, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.796875, "step": 6720, "time_per_iteration": 2.527057409286499 }, { "auxiliary_loss_clip": 0.01139342, "auxiliary_loss_mlp": 0.01032369, "balance_loss_clip": 1.01715851, "balance_loss_mlp": 1.04173541, "epoch": 0.40408838118142193, "flos": 19609273532160.0, "grad_norm": 1.83451520132566, "language_loss": 0.76672554, "learning_rate": 2.593909991522417e-06, "loss": 0.78844267, "num_input_tokens_seen": 144335765, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8046875, "step": 6721, "time_per_iteration": 2.517249822616577 }, { "auxiliary_loss_clip": 0.01145085, "auxiliary_loss_mlp": 0.01029574, "balance_loss_clip": 1.016312, "balance_loss_mlp": 1.04349351, "epoch": 0.4041485044340899, "flos": 24426043441920.0, "grad_norm": 1.6961762572463828, "language_loss": 0.7267822, "learning_rate": 2.5935492016070697e-06, "loss": 0.74852884, "num_input_tokens_seen": 144355825, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 6722, "time_per_iteration": 2.6292495727539062 }, { "auxiliary_loss_clip": 0.01129392, "auxiliary_loss_mlp": 0.01031926, "balance_loss_clip": 1.01769865, "balance_loss_mlp": 1.04250574, "epoch": 0.40420862768675786, "flos": 16760192613120.0, "grad_norm": 2.452069469544493, "language_loss": 0.65901405, "learning_rate": 2.593188390509478e-06, "loss": 0.68062717, "num_input_tokens_seen": 144374320, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 6723, "time_per_iteration": 2.592967987060547 }, { "auxiliary_loss_clip": 0.0113488, "auxiliary_loss_mlp": 0.01046853, "balance_loss_clip": 1.03117657, "balance_loss_mlp": 1.04436541, "epoch": 0.4042687509394258, "flos": 22491571553280.0, "grad_norm": 2.1727967498498253, "language_loss": 0.73638809, "learning_rate": 2.5928275582425184e-06, "loss": 0.75820541, "num_input_tokens_seen": 144394325, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8125, "step": 6724, "time_per_iteration": 2.59952712059021 }, { "auxiliary_loss_clip": 0.01133668, "auxiliary_loss_mlp": 0.01035585, "balance_loss_clip": 1.02130389, "balance_loss_mlp": 1.04063153, "epoch": 0.4043288741920938, "flos": 30336149479680.0, "grad_norm": 1.75707260373574, "language_loss": 0.74586034, "learning_rate": 2.5924667048190687e-06, "loss": 0.76755285, "num_input_tokens_seen": 144412765, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75, "step": 6725, "time_per_iteration": 2.615785837173462 }, { "auxiliary_loss_clip": 0.01126791, "auxiliary_loss_mlp": 0.01035998, "balance_loss_clip": 1.02053058, "balance_loss_mlp": 1.04092669, "epoch": 0.40438899744476176, "flos": 46348321363200.0, "grad_norm": 2.7787474970012163, "language_loss": 0.76665699, "learning_rate": 2.5921058302520066e-06, "loss": 0.7882849, "num_input_tokens_seen": 144435400, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.76953125, "step": 6726, "time_per_iteration": 2.793092727661133 }, { "auxiliary_loss_clip": 0.01141254, "auxiliary_loss_mlp": 0.01284917, "balance_loss_clip": 1.02197015, "balance_loss_mlp": 1.04273772, "epoch": 0.4044491206974297, "flos": 13333524998400.0, "grad_norm": 2.5183754313690976, "language_loss": 0.81470025, "learning_rate": 2.5917449345542093e-06, "loss": 0.83896196, "num_input_tokens_seen": 144452925, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8046875, "step": 6727, "time_per_iteration": 2.55264949798584 }, { "auxiliary_loss_clip": 0.01120499, "auxiliary_loss_mlp": 0.01036562, "balance_loss_clip": 1.02179217, "balance_loss_mlp": 1.04227769, "epoch": 0.4045092439500977, "flos": 12093745121280.0, "grad_norm": 1.992886202839806, "language_loss": 0.84958196, "learning_rate": 2.5913840177385588e-06, "loss": 0.87115264, "num_input_tokens_seen": 144470195, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78125, "step": 6728, "time_per_iteration": 2.596712827682495 }, { "auxiliary_loss_clip": 0.01148812, "auxiliary_loss_mlp": 0.01037916, "balance_loss_clip": 1.02293098, "balance_loss_mlp": 1.04363298, "epoch": 0.40456936720276565, "flos": 21179683123200.0, "grad_norm": 1.7699509103622726, "language_loss": 0.81528431, "learning_rate": 2.5910230798179325e-06, "loss": 0.83715153, "num_input_tokens_seen": 144490320, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.78515625, "step": 6729, "time_per_iteration": 2.568725109100342 }, { "auxiliary_loss_clip": 0.01136081, "auxiliary_loss_mlp": 0.01040392, "balance_loss_clip": 1.02596807, "balance_loss_mlp": 1.04164267, "epoch": 0.4046294904554336, "flos": 23915286000000.0, "grad_norm": 2.063545459717171, "language_loss": 0.73475659, "learning_rate": 2.590662120805214e-06, "loss": 0.75652134, "num_input_tokens_seen": 144508990, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76953125, "step": 6730, "time_per_iteration": 2.6083333492279053 }, { "auxiliary_loss_clip": 0.01127632, "auxiliary_loss_mlp": 0.01037616, "balance_loss_clip": 1.02275014, "balance_loss_mlp": 1.04159379, "epoch": 0.4046896137081016, "flos": 38071235773440.0, "grad_norm": 1.834273354713488, "language_loss": 0.67363143, "learning_rate": 2.5903011407132834e-06, "loss": 0.69528389, "num_input_tokens_seen": 144529550, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76953125, "step": 6731, "time_per_iteration": 2.6738736629486084 }, { "auxiliary_loss_clip": 0.01118826, "auxiliary_loss_mlp": 0.01037927, "balance_loss_clip": 1.02453995, "balance_loss_mlp": 1.04126012, "epoch": 0.4047497369607696, "flos": 23617262856960.0, "grad_norm": 1.9548978262937553, "language_loss": 0.73925799, "learning_rate": 2.589940139555023e-06, "loss": 0.76082551, "num_input_tokens_seen": 144549310, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7734375, "step": 6732, "time_per_iteration": 2.5941739082336426 }, { "auxiliary_loss_clip": 0.01134882, "auxiliary_loss_mlp": 0.01274392, "balance_loss_clip": 1.01477146, "balance_loss_mlp": 1.04155803, "epoch": 0.40480986021343757, "flos": 12823593569280.0, "grad_norm": 1.5842842456347366, "language_loss": 0.77137935, "learning_rate": 2.589579117343317e-06, "loss": 0.79547203, "num_input_tokens_seen": 144567430, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75390625, "step": 6733, "time_per_iteration": 2.5103139877319336 }, { "auxiliary_loss_clip": 0.01139681, "auxiliary_loss_mlp": 0.01043817, "balance_loss_clip": 1.02840352, "balance_loss_mlp": 1.04052651, "epoch": 0.40486998346610553, "flos": 23768770423680.0, "grad_norm": 1.7512784184156678, "language_loss": 0.76557028, "learning_rate": 2.5892180740910487e-06, "loss": 0.78740525, "num_input_tokens_seen": 144585975, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8125, "step": 6734, "time_per_iteration": 2.620997667312622 }, { "auxiliary_loss_clip": 0.01141697, "auxiliary_loss_mlp": 0.01033263, "balance_loss_clip": 1.01955366, "balance_loss_mlp": 1.04205096, "epoch": 0.4049301067187735, "flos": 22856818999680.0, "grad_norm": 2.157111451734415, "language_loss": 0.65208095, "learning_rate": 2.5888570098111028e-06, "loss": 0.67383057, "num_input_tokens_seen": 144605225, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.81640625, "step": 6735, "time_per_iteration": 2.4858689308166504 }, { "auxiliary_loss_clip": 0.01143426, "auxiliary_loss_mlp": 0.01037733, "balance_loss_clip": 1.02414298, "balance_loss_mlp": 1.04034734, "epoch": 0.40499022997144146, "flos": 22783992174720.0, "grad_norm": 2.2405522998995813, "language_loss": 0.83258849, "learning_rate": 2.5884959245163656e-06, "loss": 0.85440004, "num_input_tokens_seen": 144624145, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76171875, "step": 6736, "time_per_iteration": 2.5945043563842773 }, { "auxiliary_loss_clip": 0.01149887, "auxiliary_loss_mlp": 0.01286854, "balance_loss_clip": 1.02465725, "balance_loss_mlp": 1.04299116, "epoch": 0.4050503532241094, "flos": 23039352938880.0, "grad_norm": 1.7672488651938771, "language_loss": 0.75053954, "learning_rate": 2.588134818219722e-06, "loss": 0.77490693, "num_input_tokens_seen": 144644470, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8046875, "step": 6737, "time_per_iteration": 2.5870344638824463 }, { "auxiliary_loss_clip": 0.01126887, "auxiliary_loss_mlp": 0.0103937, "balance_loss_clip": 1.02545214, "balance_loss_mlp": 1.04073453, "epoch": 0.4051104764767774, "flos": 16647756065280.0, "grad_norm": 2.7030025947959735, "language_loss": 0.71542555, "learning_rate": 2.5877736909340597e-06, "loss": 0.73708814, "num_input_tokens_seen": 144661055, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 6738, "time_per_iteration": 2.598255157470703 }, { "auxiliary_loss_clip": 0.01143342, "auxiliary_loss_mlp": 0.0103139, "balance_loss_clip": 1.01862907, "balance_loss_mlp": 1.04004383, "epoch": 0.40517059972944536, "flos": 16358962717440.0, "grad_norm": 2.3080585334735284, "language_loss": 0.74760795, "learning_rate": 2.587412542672267e-06, "loss": 0.7693553, "num_input_tokens_seen": 144677935, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.765625, "step": 6739, "time_per_iteration": 2.562167167663574 }, { "auxiliary_loss_clip": 0.01149572, "auxiliary_loss_mlp": 0.01036305, "balance_loss_clip": 1.02196383, "balance_loss_mlp": 1.04354823, "epoch": 0.4052307229821133, "flos": 28803374363520.0, "grad_norm": 1.7210699287395215, "language_loss": 0.73647404, "learning_rate": 2.587051373447231e-06, "loss": 0.75833285, "num_input_tokens_seen": 144697725, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7890625, "step": 6740, "time_per_iteration": 2.785843849182129 }, { "auxiliary_loss_clip": 0.0112689, "auxiliary_loss_mlp": 0.01031237, "balance_loss_clip": 1.01731944, "balance_loss_mlp": 1.0419594, "epoch": 0.4052908462347813, "flos": 21397876289280.0, "grad_norm": 1.585689910328474, "language_loss": 0.77442062, "learning_rate": 2.586690183271842e-06, "loss": 0.79600191, "num_input_tokens_seen": 144718805, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 6741, "time_per_iteration": 2.572761058807373 }, { "auxiliary_loss_clip": 0.01121486, "auxiliary_loss_mlp": 0.01039609, "balance_loss_clip": 1.02435017, "balance_loss_mlp": 1.04197037, "epoch": 0.40535096948744925, "flos": 22419067950720.0, "grad_norm": 1.649172073308047, "language_loss": 0.71061265, "learning_rate": 2.5863289721589887e-06, "loss": 0.73222357, "num_input_tokens_seen": 144737105, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.796875, "step": 6742, "time_per_iteration": 2.567412853240967 }, { "auxiliary_loss_clip": 0.01121711, "auxiliary_loss_mlp": 0.0103256, "balance_loss_clip": 1.01767707, "balance_loss_mlp": 1.04264116, "epoch": 0.4054110927401172, "flos": 17010776868480.0, "grad_norm": 1.9691946991952243, "language_loss": 0.7279278, "learning_rate": 2.585967740121564e-06, "loss": 0.74947053, "num_input_tokens_seen": 144751350, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 6743, "time_per_iteration": 4.036841869354248 }, { "auxiliary_loss_clip": 0.01141087, "auxiliary_loss_mlp": 0.01034902, "balance_loss_clip": 1.02000046, "balance_loss_mlp": 1.04242206, "epoch": 0.4054712159927852, "flos": 21614848392960.0, "grad_norm": 5.881760431932247, "language_loss": 0.70383573, "learning_rate": 2.5856064871724565e-06, "loss": 0.72559559, "num_input_tokens_seen": 144770030, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 6744, "time_per_iteration": 2.5902884006500244 }, { "auxiliary_loss_clip": 0.0112648, "auxiliary_loss_mlp": 0.01037964, "balance_loss_clip": 1.02468967, "balance_loss_mlp": 1.04148102, "epoch": 0.4055313392454532, "flos": 25812554376960.0, "grad_norm": 1.7559341508854474, "language_loss": 0.7968235, "learning_rate": 2.58524521332456e-06, "loss": 0.81846792, "num_input_tokens_seen": 144790965, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.76171875, "step": 6745, "time_per_iteration": 2.683846950531006 }, { "auxiliary_loss_clip": 0.01137781, "auxiliary_loss_mlp": 0.01036564, "balance_loss_clip": 1.02146673, "balance_loss_mlp": 1.0437088, "epoch": 0.40559146249812117, "flos": 14137098111360.0, "grad_norm": 1.8250034664232684, "language_loss": 0.66960096, "learning_rate": 2.5848839185907673e-06, "loss": 0.69134438, "num_input_tokens_seen": 144807755, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.76171875, "step": 6746, "time_per_iteration": 2.529102087020874 }, { "auxiliary_loss_clip": 0.011446, "auxiliary_loss_mlp": 0.01032083, "balance_loss_clip": 1.01871979, "balance_loss_mlp": 1.04142189, "epoch": 0.40565158575078913, "flos": 41355481962240.0, "grad_norm": 1.4196608499763612, "language_loss": 0.56593484, "learning_rate": 2.584522602983973e-06, "loss": 0.58770168, "num_input_tokens_seen": 144832405, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 6747, "time_per_iteration": 2.79481840133667 }, { "auxiliary_loss_clip": 0.01149325, "auxiliary_loss_mlp": 0.01046052, "balance_loss_clip": 1.03255737, "balance_loss_mlp": 1.04439664, "epoch": 0.4057117090034571, "flos": 28544529980160.0, "grad_norm": 1.833310646213998, "language_loss": 0.84280467, "learning_rate": 2.58416126651707e-06, "loss": 0.86475837, "num_input_tokens_seen": 144853890, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78125, "step": 6748, "time_per_iteration": 2.6752588748931885 }, { "auxiliary_loss_clip": 0.01140662, "auxiliary_loss_mlp": 0.01037147, "balance_loss_clip": 1.02250755, "balance_loss_mlp": 1.04318428, "epoch": 0.40577183225612506, "flos": 18004066640640.0, "grad_norm": 1.6888038903061289, "language_loss": 0.80988944, "learning_rate": 2.5837999092029535e-06, "loss": 0.83166748, "num_input_tokens_seen": 144871395, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 6749, "time_per_iteration": 3.9383773803710938 }, { "auxiliary_loss_clip": 0.01131983, "auxiliary_loss_mlp": 0.01289274, "balance_loss_clip": 1.02812648, "balance_loss_mlp": 1.04476547, "epoch": 0.40583195550879303, "flos": 19536734016000.0, "grad_norm": 1.6478247625004259, "language_loss": 0.75543159, "learning_rate": 2.5834385310545208e-06, "loss": 0.77964413, "num_input_tokens_seen": 144890975, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78125, "step": 6750, "time_per_iteration": 4.069165945053101 }, { "auxiliary_loss_clip": 0.01141467, "auxiliary_loss_mlp": 0.01039643, "balance_loss_clip": 1.02426505, "balance_loss_mlp": 1.04224682, "epoch": 0.405892078761461, "flos": 22309468577280.0, "grad_norm": 2.4344551045659624, "language_loss": 0.73483074, "learning_rate": 2.583077132084667e-06, "loss": 0.75664186, "num_input_tokens_seen": 144908170, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.81640625, "step": 6751, "time_per_iteration": 2.617098331451416 }, { "auxiliary_loss_clip": 0.01160203, "auxiliary_loss_mlp": 0.01037595, "balance_loss_clip": 1.02295578, "balance_loss_mlp": 1.04575932, "epoch": 0.40595220201412896, "flos": 25484402701440.0, "grad_norm": 2.7012873253612963, "language_loss": 0.66751409, "learning_rate": 2.5827157123062906e-06, "loss": 0.68949211, "num_input_tokens_seen": 144928020, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 6752, "time_per_iteration": 2.5917489528656006 }, { "auxiliary_loss_clip": 0.01164457, "auxiliary_loss_mlp": 0.01040305, "balance_loss_clip": 1.02430689, "balance_loss_mlp": 1.04528379, "epoch": 0.4060123252667969, "flos": 49856004103680.0, "grad_norm": 1.6584177716840671, "language_loss": 0.70628601, "learning_rate": 2.5823542717322895e-06, "loss": 0.72833359, "num_input_tokens_seen": 144951240, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8359375, "step": 6753, "time_per_iteration": 2.890263795852661 }, { "auxiliary_loss_clip": 0.01143247, "auxiliary_loss_mlp": 0.01039561, "balance_loss_clip": 1.02476716, "balance_loss_mlp": 1.04411674, "epoch": 0.4060724485194649, "flos": 21135476459520.0, "grad_norm": 2.1441147196701307, "language_loss": 0.72234625, "learning_rate": 2.5819928103755625e-06, "loss": 0.74417436, "num_input_tokens_seen": 144969100, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 6754, "time_per_iteration": 4.169386625289917 }, { "auxiliary_loss_clip": 0.01150786, "auxiliary_loss_mlp": 0.01041249, "balance_loss_clip": 1.02681851, "balance_loss_mlp": 1.04359603, "epoch": 0.40613257177213286, "flos": 21758059918080.0, "grad_norm": 3.940561776513752, "language_loss": 0.82995492, "learning_rate": 2.581631328249009e-06, "loss": 0.85187525, "num_input_tokens_seen": 144987065, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80078125, "step": 6755, "time_per_iteration": 2.597613573074341 }, { "auxiliary_loss_clip": 0.01163343, "auxiliary_loss_mlp": 0.01042467, "balance_loss_clip": 1.02734542, "balance_loss_mlp": 1.04491329, "epoch": 0.4061926950248008, "flos": 25555074710400.0, "grad_norm": 1.573374813012882, "language_loss": 0.70643145, "learning_rate": 2.5812698253655293e-06, "loss": 0.72848952, "num_input_tokens_seen": 145007310, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.82421875, "step": 6756, "time_per_iteration": 2.6659507751464844 }, { "auxiliary_loss_clip": 0.01170128, "auxiliary_loss_mlp": 0.0128928, "balance_loss_clip": 1.02681613, "balance_loss_mlp": 1.04244375, "epoch": 0.4062528182774688, "flos": 23695799944320.0, "grad_norm": 1.8458343282913021, "language_loss": 0.79174101, "learning_rate": 2.580908301738025e-06, "loss": 0.81633508, "num_input_tokens_seen": 145026210, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 6757, "time_per_iteration": 2.599719762802124 }, { "auxiliary_loss_clip": 0.01139553, "auxiliary_loss_mlp": 0.01033979, "balance_loss_clip": 1.0199182, "balance_loss_mlp": 1.04290295, "epoch": 0.4063129415301368, "flos": 21726027964800.0, "grad_norm": 2.28140348582624, "language_loss": 0.78596056, "learning_rate": 2.5805467573793977e-06, "loss": 0.80769593, "num_input_tokens_seen": 145045475, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 6758, "time_per_iteration": 2.56714129447937 }, { "auxiliary_loss_clip": 0.0115051, "auxiliary_loss_mlp": 0.01033404, "balance_loss_clip": 1.01886678, "balance_loss_mlp": 1.04462838, "epoch": 0.40637306478280477, "flos": 12787575206400.0, "grad_norm": 1.8054082289890574, "language_loss": 0.8917101, "learning_rate": 2.5801851923025495e-06, "loss": 0.9135493, "num_input_tokens_seen": 145062260, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.79296875, "step": 6759, "time_per_iteration": 2.5430033206939697 }, { "auxiliary_loss_clip": 0.01148655, "auxiliary_loss_mlp": 0.01034575, "balance_loss_clip": 1.02090168, "balance_loss_mlp": 1.04315007, "epoch": 0.40643318803547274, "flos": 24024490323840.0, "grad_norm": 2.1176743498360353, "language_loss": 0.64522105, "learning_rate": 2.579823606520385e-06, "loss": 0.6670534, "num_input_tokens_seen": 145082470, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78515625, "step": 6760, "time_per_iteration": 2.6276612281799316 }, { "auxiliary_loss_clip": 0.01130352, "auxiliary_loss_mlp": 0.01035686, "balance_loss_clip": 1.02070689, "balance_loss_mlp": 1.0418328, "epoch": 0.4064933112881407, "flos": 25592421876480.0, "grad_norm": 1.4223832654090482, "language_loss": 0.74859953, "learning_rate": 2.5794620000458065e-06, "loss": 0.77025992, "num_input_tokens_seen": 145105685, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.796875, "step": 6761, "time_per_iteration": 2.611107349395752 }, { "auxiliary_loss_clip": 0.01135183, "auxiliary_loss_mlp": 0.01037882, "balance_loss_clip": 1.02364826, "balance_loss_mlp": 1.04244316, "epoch": 0.40655343454080867, "flos": 22054323294720.0, "grad_norm": 1.3530840492615843, "language_loss": 0.70057207, "learning_rate": 2.5791003728917204e-06, "loss": 0.72230273, "num_input_tokens_seen": 145125590, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.74609375, "step": 6762, "time_per_iteration": 2.5990726947784424 }, { "auxiliary_loss_clip": 0.01131021, "auxiliary_loss_mlp": 0.0103586, "balance_loss_clip": 1.02210355, "balance_loss_mlp": 1.04408622, "epoch": 0.40661355779347663, "flos": 26468893641600.0, "grad_norm": 1.6266756107537512, "language_loss": 0.7305994, "learning_rate": 2.578738725071032e-06, "loss": 0.7522682, "num_input_tokens_seen": 145146810, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.77734375, "step": 6763, "time_per_iteration": 2.56296443939209 }, { "auxiliary_loss_clip": 0.01151246, "auxiliary_loss_mlp": 0.01033841, "balance_loss_clip": 1.01897573, "balance_loss_mlp": 1.04297352, "epoch": 0.4066736810461446, "flos": 13261129136640.0, "grad_norm": 2.589394717407972, "language_loss": 0.68931913, "learning_rate": 2.578377056596646e-06, "loss": 0.71116996, "num_input_tokens_seen": 145163130, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 6764, "time_per_iteration": 2.5997114181518555 }, { "auxiliary_loss_clip": 0.01152306, "auxiliary_loss_mlp": 0.01038952, "balance_loss_clip": 1.02325177, "balance_loss_mlp": 1.04412746, "epoch": 0.40673380429881256, "flos": 28803625758720.0, "grad_norm": 3.2115811580481703, "language_loss": 0.91021478, "learning_rate": 2.5780153674814714e-06, "loss": 0.93212724, "num_input_tokens_seen": 145181420, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8125, "step": 6765, "time_per_iteration": 2.587859630584717 }, { "auxiliary_loss_clip": 0.01133131, "auxiliary_loss_mlp": 0.01041426, "balance_loss_clip": 1.02559519, "balance_loss_mlp": 1.04208076, "epoch": 0.4067939275514805, "flos": 12495334152960.0, "grad_norm": 2.9563063412444555, "language_loss": 0.78927833, "learning_rate": 2.5776536577384148e-06, "loss": 0.81102389, "num_input_tokens_seen": 145198545, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8203125, "step": 6766, "time_per_iteration": 2.6646289825439453 }, { "auxiliary_loss_clip": 0.01122594, "auxiliary_loss_mlp": 0.01041702, "balance_loss_clip": 1.02662182, "balance_loss_mlp": 1.04108095, "epoch": 0.4068540508041485, "flos": 18770508069120.0, "grad_norm": 2.01808846994017, "language_loss": 0.76794684, "learning_rate": 2.5772919273803855e-06, "loss": 0.78958976, "num_input_tokens_seen": 145215835, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.81640625, "step": 6767, "time_per_iteration": 2.48974609375 }, { "auxiliary_loss_clip": 0.01128171, "auxiliary_loss_mlp": 0.01036093, "balance_loss_clip": 1.02195501, "balance_loss_mlp": 1.04224026, "epoch": 0.40691417405681646, "flos": 28512821249280.0, "grad_norm": 1.6758124196962776, "language_loss": 0.77243114, "learning_rate": 2.576930176420292e-06, "loss": 0.79407382, "num_input_tokens_seen": 145236555, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7734375, "step": 6768, "time_per_iteration": 2.627436876296997 }, { "auxiliary_loss_clip": 0.01120082, "auxiliary_loss_mlp": 0.01031285, "balance_loss_clip": 1.01754582, "balance_loss_mlp": 1.0418334, "epoch": 0.4069742973094844, "flos": 20814040627200.0, "grad_norm": 2.3437968488378353, "language_loss": 0.86976594, "learning_rate": 2.5765684048710452e-06, "loss": 0.89127958, "num_input_tokens_seen": 145254595, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.78125, "step": 6769, "time_per_iteration": 2.4817562103271484 }, { "auxiliary_loss_clip": 0.01140566, "auxiliary_loss_mlp": 0.01037798, "balance_loss_clip": 1.024387, "balance_loss_mlp": 1.04597187, "epoch": 0.4070344205621524, "flos": 21470272151040.0, "grad_norm": 1.9113648273885353, "language_loss": 0.81061721, "learning_rate": 2.5762066127455544e-06, "loss": 0.8324008, "num_input_tokens_seen": 145274005, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.76953125, "step": 6770, "time_per_iteration": 2.592862844467163 }, { "auxiliary_loss_clip": 0.01123778, "auxiliary_loss_mlp": 0.01033068, "balance_loss_clip": 1.01809549, "balance_loss_mlp": 1.04249406, "epoch": 0.4070945438148204, "flos": 26830046937600.0, "grad_norm": 1.6704957479208484, "language_loss": 0.79933119, "learning_rate": 2.5758448000567324e-06, "loss": 0.82089961, "num_input_tokens_seen": 145294850, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8125, "step": 6771, "time_per_iteration": 2.516646146774292 }, { "auxiliary_loss_clip": 0.01147097, "auxiliary_loss_mlp": 0.01041945, "balance_loss_clip": 1.02741349, "balance_loss_mlp": 1.04227853, "epoch": 0.4071546670674884, "flos": 26354158623360.0, "grad_norm": 1.34445073542083, "language_loss": 0.75948703, "learning_rate": 2.57548296681749e-06, "loss": 0.78137743, "num_input_tokens_seen": 145317050, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78125, "step": 6772, "time_per_iteration": 2.679145574569702 }, { "auxiliary_loss_clip": 0.01138501, "auxiliary_loss_mlp": 0.01040513, "balance_loss_clip": 1.02546287, "balance_loss_mlp": 1.04071331, "epoch": 0.40721479032015634, "flos": 17895401020800.0, "grad_norm": 1.6678302978997366, "language_loss": 0.81237268, "learning_rate": 2.5751211130407414e-06, "loss": 0.83416283, "num_input_tokens_seen": 145334480, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 6773, "time_per_iteration": 2.5735268592834473 }, { "auxiliary_loss_clip": 0.01064788, "auxiliary_loss_mlp": 0.0101627, "balance_loss_clip": 1.01466036, "balance_loss_mlp": 1.02151597, "epoch": 0.4072749135728243, "flos": 49854570537600.0, "grad_norm": 0.857263562943693, "language_loss": 0.64370453, "learning_rate": 2.5747592387393993e-06, "loss": 0.66451514, "num_input_tokens_seen": 145388695, "router_z_loss_clip": 0.01611328, "router_z_loss_mlp": 0.25, "step": 6774, "time_per_iteration": 3.069500684738159 }, { "auxiliary_loss_clip": 0.01157531, "auxiliary_loss_mlp": 0.01035359, "balance_loss_clip": 1.02098262, "balance_loss_mlp": 1.04115224, "epoch": 0.40733503682549227, "flos": 27563630400000.0, "grad_norm": 2.441236069750448, "language_loss": 0.72493851, "learning_rate": 2.574397343926379e-06, "loss": 0.74686742, "num_input_tokens_seen": 145408240, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8125, "step": 6775, "time_per_iteration": 2.6577181816101074 }, { "auxiliary_loss_clip": 0.01139702, "auxiliary_loss_mlp": 0.01291416, "balance_loss_clip": 1.0294199, "balance_loss_mlp": 1.04070449, "epoch": 0.40739516007816023, "flos": 22126970551680.0, "grad_norm": 1.416148000120943, "language_loss": 0.78038776, "learning_rate": 2.5740354286145936e-06, "loss": 0.80469894, "num_input_tokens_seen": 145428395, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8046875, "step": 6776, "time_per_iteration": 2.6659414768218994 }, { "auxiliary_loss_clip": 0.01133848, "auxiliary_loss_mlp": 0.01041179, "balance_loss_clip": 1.02652788, "balance_loss_mlp": 1.04292166, "epoch": 0.4074552833308282, "flos": 23842243693440.0, "grad_norm": 2.3450234903517035, "language_loss": 0.79433024, "learning_rate": 2.5736734928169616e-06, "loss": 0.81608045, "num_input_tokens_seen": 145448290, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8203125, "step": 6777, "time_per_iteration": 2.695725440979004 }, { "auxiliary_loss_clip": 0.01044817, "auxiliary_loss_mlp": 0.01004925, "balance_loss_clip": 1.00304151, "balance_loss_mlp": 1.01980662, "epoch": 0.40751540658349616, "flos": 58000008781440.0, "grad_norm": 0.6772642405346082, "language_loss": 0.53194964, "learning_rate": 2.5733115365463976e-06, "loss": 0.55244708, "num_input_tokens_seen": 145509785, "router_z_loss_clip": 0.01879883, "router_z_loss_mlp": 0.25, "step": 6778, "time_per_iteration": 3.1571905612945557 }, { "auxiliary_loss_clip": 0.0112337, "auxiliary_loss_mlp": 0.01292178, "balance_loss_clip": 1.02954698, "balance_loss_mlp": 1.04419899, "epoch": 0.40757552983616413, "flos": 21214659991680.0, "grad_norm": 2.051166878990422, "language_loss": 0.82098091, "learning_rate": 2.5729495598158205e-06, "loss": 0.84513646, "num_input_tokens_seen": 145528620, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.79296875, "step": 6779, "time_per_iteration": 2.644275188446045 }, { "auxiliary_loss_clip": 0.01129262, "auxiliary_loss_mlp": 0.01036497, "balance_loss_clip": 1.02136946, "balance_loss_mlp": 1.04116964, "epoch": 0.4076356530888321, "flos": 26833530556800.0, "grad_norm": 1.597236739108823, "language_loss": 0.76178563, "learning_rate": 2.572587562638147e-06, "loss": 0.78344321, "num_input_tokens_seen": 145547775, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7890625, "step": 6780, "time_per_iteration": 2.5719082355499268 }, { "auxiliary_loss_clip": 0.01126033, "auxiliary_loss_mlp": 0.01036951, "balance_loss_clip": 1.02308679, "balance_loss_mlp": 1.04072857, "epoch": 0.40769577634150006, "flos": 12203021272320.0, "grad_norm": 1.9533214085399648, "language_loss": 0.66033417, "learning_rate": 2.572225545026296e-06, "loss": 0.68196404, "num_input_tokens_seen": 145564465, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 6781, "time_per_iteration": 2.623166084289551 }, { "auxiliary_loss_clip": 0.01138222, "auxiliary_loss_mlp": 0.01036648, "balance_loss_clip": 1.02110887, "balance_loss_mlp": 1.04108298, "epoch": 0.407755899594168, "flos": 33655264796160.0, "grad_norm": 1.5873984087380553, "language_loss": 0.71312356, "learning_rate": 2.5718635069931875e-06, "loss": 0.73487222, "num_input_tokens_seen": 145585965, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.79296875, "step": 6782, "time_per_iteration": 2.651956558227539 }, { "auxiliary_loss_clip": 0.01131254, "auxiliary_loss_mlp": 0.01037195, "balance_loss_clip": 1.02255583, "balance_loss_mlp": 1.04224443, "epoch": 0.407816022846836, "flos": 20157342226560.0, "grad_norm": 1.6909134734769846, "language_loss": 0.82240105, "learning_rate": 2.571501448551741e-06, "loss": 0.84408551, "num_input_tokens_seen": 145605000, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80078125, "step": 6783, "time_per_iteration": 2.616713285446167 }, { "auxiliary_loss_clip": 0.01130167, "auxiliary_loss_mlp": 0.01037435, "balance_loss_clip": 1.02216387, "balance_loss_mlp": 1.04077315, "epoch": 0.40787614609950396, "flos": 21178821196800.0, "grad_norm": 1.666350441485261, "language_loss": 0.80447578, "learning_rate": 2.5711393697148787e-06, "loss": 0.82615179, "num_input_tokens_seen": 145623740, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8046875, "step": 6784, "time_per_iteration": 2.5252294540405273 }, { "auxiliary_loss_clip": 0.01152566, "auxiliary_loss_mlp": 0.01044726, "balance_loss_clip": 1.02816749, "balance_loss_mlp": 1.04174495, "epoch": 0.407936269352172, "flos": 20520650338560.0, "grad_norm": 1.8635324234205284, "language_loss": 0.65946794, "learning_rate": 2.5707772704955214e-06, "loss": 0.68144083, "num_input_tokens_seen": 145643515, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.83984375, "step": 6785, "time_per_iteration": 3.9761552810668945 }, { "auxiliary_loss_clip": 0.01129, "auxiliary_loss_mlp": 0.01034076, "balance_loss_clip": 1.01920414, "balance_loss_mlp": 1.04053438, "epoch": 0.40799639260483994, "flos": 20118809911680.0, "grad_norm": 2.028214193708974, "language_loss": 0.79249692, "learning_rate": 2.570415150906591e-06, "loss": 0.81412768, "num_input_tokens_seen": 145660890, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.79296875, "step": 6786, "time_per_iteration": 2.5763020515441895 }, { "auxiliary_loss_clip": 0.01120351, "auxiliary_loss_mlp": 0.01043303, "balance_loss_clip": 1.02953446, "balance_loss_mlp": 1.04179573, "epoch": 0.4080565158575079, "flos": 20997328752000.0, "grad_norm": 1.5619875990198204, "language_loss": 0.81261778, "learning_rate": 2.570053010961011e-06, "loss": 0.83425432, "num_input_tokens_seen": 145680070, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7890625, "step": 6787, "time_per_iteration": 2.646467924118042 }, { "auxiliary_loss_clip": 0.011265, "auxiliary_loss_mlp": 0.01037735, "balance_loss_clip": 1.02309585, "balance_loss_mlp": 1.04042554, "epoch": 0.40811663911017587, "flos": 19317714837120.0, "grad_norm": 1.8663654185353578, "language_loss": 0.67467999, "learning_rate": 2.5696908506717054e-06, "loss": 0.69632232, "num_input_tokens_seen": 145698010, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 6788, "time_per_iteration": 2.561959981918335 }, { "auxiliary_loss_clip": 0.01136233, "auxiliary_loss_mlp": 0.01041091, "balance_loss_clip": 1.02589178, "balance_loss_mlp": 1.03910565, "epoch": 0.40817676236284384, "flos": 40625382119040.0, "grad_norm": 2.34750955690009, "language_loss": 0.66071355, "learning_rate": 2.5693286700515993e-06, "loss": 0.68248683, "num_input_tokens_seen": 145722215, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.79296875, "step": 6789, "time_per_iteration": 2.7835419178009033 }, { "auxiliary_loss_clip": 0.01155792, "auxiliary_loss_mlp": 0.01041693, "balance_loss_clip": 1.02679753, "balance_loss_mlp": 1.04125273, "epoch": 0.4082368856155118, "flos": 20522086882560.0, "grad_norm": 1.8245674884586995, "language_loss": 0.60513031, "learning_rate": 2.568966469113617e-06, "loss": 0.62710518, "num_input_tokens_seen": 145741090, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 6790, "time_per_iteration": 3.9505999088287354 }, { "auxiliary_loss_clip": 0.01129909, "auxiliary_loss_mlp": 0.01041754, "balance_loss_clip": 1.02666235, "balance_loss_mlp": 1.04331851, "epoch": 0.40829700886817977, "flos": 11427745098240.0, "grad_norm": 2.712096339848335, "language_loss": 0.69299942, "learning_rate": 2.568604247870685e-06, "loss": 0.71471602, "num_input_tokens_seen": 145754985, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7734375, "step": 6791, "time_per_iteration": 2.475620985031128 }, { "auxiliary_loss_clip": 0.01053453, "auxiliary_loss_mlp": 0.01003626, "balance_loss_clip": 1.00168276, "balance_loss_mlp": 1.01946425, "epoch": 0.40835713212084773, "flos": 67330677121920.0, "grad_norm": 0.7480574213001316, "language_loss": 0.59653258, "learning_rate": 2.5682420063357308e-06, "loss": 0.61710334, "num_input_tokens_seen": 145815260, "router_z_loss_clip": 0.01940918, "router_z_loss_mlp": 0.25, "step": 6792, "time_per_iteration": 4.671952486038208 }, { "auxiliary_loss_clip": 0.01127995, "auxiliary_loss_mlp": 0.01041106, "balance_loss_clip": 1.02498889, "balance_loss_mlp": 1.04600811, "epoch": 0.4084172553735157, "flos": 21762010414080.0, "grad_norm": 1.654684155509727, "language_loss": 0.79894483, "learning_rate": 2.5678797445216798e-06, "loss": 0.82063586, "num_input_tokens_seen": 145832665, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8203125, "step": 6793, "time_per_iteration": 2.598710775375366 }, { "auxiliary_loss_clip": 0.01133155, "auxiliary_loss_mlp": 0.01037233, "balance_loss_clip": 1.02175975, "balance_loss_mlp": 1.04394996, "epoch": 0.40847737862618366, "flos": 27417258478080.0, "grad_norm": 1.7776906241208164, "language_loss": 0.84699512, "learning_rate": 2.5675174624414626e-06, "loss": 0.86869901, "num_input_tokens_seen": 145850240, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.80078125, "step": 6794, "time_per_iteration": 2.6357150077819824 }, { "auxiliary_loss_clip": 0.01149686, "auxiliary_loss_mlp": 0.0104033, "balance_loss_clip": 1.02519011, "balance_loss_mlp": 1.04323578, "epoch": 0.4085375018788516, "flos": 18587255857920.0, "grad_norm": 3.2256932100650997, "language_loss": 0.7968843, "learning_rate": 2.5671551601080057e-06, "loss": 0.81878442, "num_input_tokens_seen": 145869545, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.796875, "step": 6795, "time_per_iteration": 4.585557460784912 }, { "auxiliary_loss_clip": 0.01135399, "auxiliary_loss_mlp": 0.01037888, "balance_loss_clip": 1.02203298, "balance_loss_mlp": 1.04283929, "epoch": 0.4085976251315196, "flos": 15411783029760.0, "grad_norm": 2.54493591919223, "language_loss": 0.69724751, "learning_rate": 2.5667928375342414e-06, "loss": 0.71898043, "num_input_tokens_seen": 145884025, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8359375, "step": 6796, "time_per_iteration": 2.5036346912384033 }, { "auxiliary_loss_clip": 0.01130681, "auxiliary_loss_mlp": 0.01289195, "balance_loss_clip": 1.02815235, "balance_loss_mlp": 1.04298234, "epoch": 0.40865774838418756, "flos": 21252222639360.0, "grad_norm": 1.7492329419930397, "language_loss": 0.77822292, "learning_rate": 2.5664304947330985e-06, "loss": 0.80242169, "num_input_tokens_seen": 145903210, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78515625, "step": 6797, "time_per_iteration": 2.6753811836242676 }, { "auxiliary_loss_clip": 0.01127823, "auxiliary_loss_mlp": 0.01048692, "balance_loss_clip": 1.03380251, "balance_loss_mlp": 1.03967571, "epoch": 0.4087178716368556, "flos": 13772245714560.0, "grad_norm": 1.9475793838751632, "language_loss": 0.85419428, "learning_rate": 2.5660681317175076e-06, "loss": 0.8759594, "num_input_tokens_seen": 145920985, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 6798, "time_per_iteration": 2.569181442260742 }, { "auxiliary_loss_clip": 0.01118703, "auxiliary_loss_mlp": 0.01035571, "balance_loss_clip": 1.02235651, "balance_loss_mlp": 1.04160702, "epoch": 0.40877799488952354, "flos": 23621752056960.0, "grad_norm": 1.577616363043668, "language_loss": 0.8412801, "learning_rate": 2.5657057485004016e-06, "loss": 0.86282289, "num_input_tokens_seen": 145940350, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7734375, "step": 6799, "time_per_iteration": 2.5971884727478027 }, { "auxiliary_loss_clip": 0.01151296, "auxiliary_loss_mlp": 0.01042942, "balance_loss_clip": 1.02757573, "balance_loss_mlp": 1.04471207, "epoch": 0.4088381181421915, "flos": 20918791664640.0, "grad_norm": 2.2035469020624787, "language_loss": 0.83432472, "learning_rate": 2.565343345094712e-06, "loss": 0.85626709, "num_input_tokens_seen": 145957460, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.796875, "step": 6800, "time_per_iteration": 2.6061410903930664 }, { "auxiliary_loss_clip": 0.01131623, "auxiliary_loss_mlp": 0.0104442, "balance_loss_clip": 1.02928019, "balance_loss_mlp": 1.04170084, "epoch": 0.4088982413948595, "flos": 13297578462720.0, "grad_norm": 2.246580286034629, "language_loss": 0.74197519, "learning_rate": 2.5649809215133737e-06, "loss": 0.76373559, "num_input_tokens_seen": 145975285, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 6801, "time_per_iteration": 2.5915043354034424 }, { "auxiliary_loss_clip": 0.01131289, "auxiliary_loss_mlp": 0.01039433, "balance_loss_clip": 1.02348316, "balance_loss_mlp": 1.04214251, "epoch": 0.40895836464752744, "flos": 15267673664640.0, "grad_norm": 2.040139732367122, "language_loss": 0.8019017, "learning_rate": 2.5646184777693193e-06, "loss": 0.82360888, "num_input_tokens_seen": 145989150, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8046875, "step": 6802, "time_per_iteration": 2.5578715801239014 }, { "auxiliary_loss_clip": 0.01133927, "auxiliary_loss_mlp": 0.01043947, "balance_loss_clip": 1.02783, "balance_loss_mlp": 1.04288852, "epoch": 0.4090184879001954, "flos": 14501411804160.0, "grad_norm": 2.3501132218406044, "language_loss": 0.76393044, "learning_rate": 2.5642560138754833e-06, "loss": 0.78570914, "num_input_tokens_seen": 146006980, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8203125, "step": 6803, "time_per_iteration": 2.6078860759735107 }, { "auxiliary_loss_clip": 0.0116706, "auxiliary_loss_mlp": 0.01286239, "balance_loss_clip": 1.02398121, "balance_loss_mlp": 1.04311323, "epoch": 0.40907861115286337, "flos": 13881593692800.0, "grad_norm": 1.8952155469687588, "language_loss": 0.78203249, "learning_rate": 2.5638935298448017e-06, "loss": 0.8065654, "num_input_tokens_seen": 146025125, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.796875, "step": 6804, "time_per_iteration": 2.621551275253296 }, { "auxiliary_loss_clip": 0.01127484, "auxiliary_loss_mlp": 0.01044059, "balance_loss_clip": 1.02936602, "balance_loss_mlp": 1.04313326, "epoch": 0.40913873440553133, "flos": 28037615293440.0, "grad_norm": 1.7327523464479548, "language_loss": 0.75525498, "learning_rate": 2.5635310256902106e-06, "loss": 0.77697039, "num_input_tokens_seen": 146044990, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.84375, "step": 6805, "time_per_iteration": 2.6472601890563965 }, { "auxiliary_loss_clip": 0.01163226, "auxiliary_loss_mlp": 0.01043149, "balance_loss_clip": 1.02861714, "balance_loss_mlp": 1.04624581, "epoch": 0.4091988576581993, "flos": 21618188357760.0, "grad_norm": 2.209378559415558, "language_loss": 0.79634297, "learning_rate": 2.563168501424647e-06, "loss": 0.81840676, "num_input_tokens_seen": 146066045, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8125, "step": 6806, "time_per_iteration": 2.6375200748443604 }, { "auxiliary_loss_clip": 0.01132739, "auxiliary_loss_mlp": 0.0103648, "balance_loss_clip": 1.02080441, "balance_loss_mlp": 1.0423317, "epoch": 0.40925898091086726, "flos": 25224085860480.0, "grad_norm": 1.8928865555759766, "language_loss": 0.71849436, "learning_rate": 2.562805957061048e-06, "loss": 0.74018657, "num_input_tokens_seen": 146086280, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.81640625, "step": 6807, "time_per_iteration": 2.7081313133239746 }, { "auxiliary_loss_clip": 0.0106187, "auxiliary_loss_mlp": 0.0101191, "balance_loss_clip": 1.01019335, "balance_loss_mlp": 1.01832151, "epoch": 0.40931910416353523, "flos": 68930568800640.0, "grad_norm": 0.8214667608487065, "language_loss": 0.58771837, "learning_rate": 2.5624433926123524e-06, "loss": 0.60845613, "num_input_tokens_seen": 146148840, "router_z_loss_clip": 0.01721191, "router_z_loss_mlp": 0.25, "step": 6808, "time_per_iteration": 3.3429667949676514 }, { "auxiliary_loss_clip": 0.01139207, "auxiliary_loss_mlp": 0.01049261, "balance_loss_clip": 1.03477728, "balance_loss_mlp": 1.04241168, "epoch": 0.4093792274162032, "flos": 20189553747840.0, "grad_norm": 1.80094001237253, "language_loss": 0.54349786, "learning_rate": 2.5620808080914985e-06, "loss": 0.56538254, "num_input_tokens_seen": 146166195, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 6809, "time_per_iteration": 2.706425905227661 }, { "auxiliary_loss_clip": 0.01140745, "auxiliary_loss_mlp": 0.0103048, "balance_loss_clip": 1.01641893, "balance_loss_mlp": 1.04369283, "epoch": 0.40943935066887116, "flos": 25228754628480.0, "grad_norm": 1.7964530561646008, "language_loss": 0.8303138, "learning_rate": 2.5617182035114262e-06, "loss": 0.85202605, "num_input_tokens_seen": 146185045, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 6810, "time_per_iteration": 2.597094774246216 }, { "auxiliary_loss_clip": 0.01144091, "auxiliary_loss_mlp": 0.01038989, "balance_loss_clip": 1.02284837, "balance_loss_mlp": 1.04525363, "epoch": 0.4094994739215392, "flos": 23255319461760.0, "grad_norm": 2.106506427691619, "language_loss": 0.77536815, "learning_rate": 2.5613555788850768e-06, "loss": 0.79719895, "num_input_tokens_seen": 146204655, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.80859375, "step": 6811, "time_per_iteration": 2.6360058784484863 }, { "auxiliary_loss_clip": 0.011238, "auxiliary_loss_mlp": 0.01038554, "balance_loss_clip": 1.02211475, "balance_loss_mlp": 1.04270589, "epoch": 0.40955959717420715, "flos": 17382165540480.0, "grad_norm": 1.5438918933045378, "language_loss": 0.70199549, "learning_rate": 2.5609929342253905e-06, "loss": 0.72361898, "num_input_tokens_seen": 146222000, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8125, "step": 6812, "time_per_iteration": 2.536985158920288 }, { "auxiliary_loss_clip": 0.01121193, "auxiliary_loss_mlp": 0.01039314, "balance_loss_clip": 1.02363849, "balance_loss_mlp": 1.04232574, "epoch": 0.4096197204268751, "flos": 25082418620160.0, "grad_norm": 1.4895482084679452, "language_loss": 0.66594535, "learning_rate": 2.5606302695453093e-06, "loss": 0.68755043, "num_input_tokens_seen": 146242630, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.7890625, "step": 6813, "time_per_iteration": 2.604926586151123 }, { "auxiliary_loss_clip": 0.01131651, "auxiliary_loss_mlp": 0.0128583, "balance_loss_clip": 1.02298284, "balance_loss_mlp": 1.04297543, "epoch": 0.4096798436795431, "flos": 30586769648640.0, "grad_norm": 2.0022945155109917, "language_loss": 0.74213231, "learning_rate": 2.5602675848577763e-06, "loss": 0.76630712, "num_input_tokens_seen": 146263070, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.796875, "step": 6814, "time_per_iteration": 2.6610119342803955 }, { "auxiliary_loss_clip": 0.01161408, "auxiliary_loss_mlp": 0.01033113, "balance_loss_clip": 1.01748466, "balance_loss_mlp": 1.04323912, "epoch": 0.40973996693221104, "flos": 24133622820480.0, "grad_norm": 1.8515758598127547, "language_loss": 0.66064, "learning_rate": 2.5599048801757345e-06, "loss": 0.68258524, "num_input_tokens_seen": 146282890, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 6815, "time_per_iteration": 2.604726552963257 }, { "auxiliary_loss_clip": 0.01130594, "auxiliary_loss_mlp": 0.01041624, "balance_loss_clip": 1.02678275, "balance_loss_mlp": 1.04601693, "epoch": 0.409800090184879, "flos": 23988974751360.0, "grad_norm": 1.647889341680279, "language_loss": 0.76872545, "learning_rate": 2.559542155512127e-06, "loss": 0.79044759, "num_input_tokens_seen": 146301755, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7578125, "step": 6816, "time_per_iteration": 2.5965123176574707 }, { "auxiliary_loss_clip": 0.01141465, "auxiliary_loss_mlp": 0.01041101, "balance_loss_clip": 1.02555633, "balance_loss_mlp": 1.04376662, "epoch": 0.40986021343754697, "flos": 16143678552960.0, "grad_norm": 3.270894633189341, "language_loss": 0.81696999, "learning_rate": 2.5591794108798996e-06, "loss": 0.8387956, "num_input_tokens_seen": 146316835, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.796875, "step": 6817, "time_per_iteration": 2.5122389793395996 }, { "auxiliary_loss_clip": 0.01144498, "auxiliary_loss_mlp": 0.01038626, "balance_loss_clip": 1.02187717, "balance_loss_mlp": 1.04372501, "epoch": 0.40992033669021494, "flos": 24790824011520.0, "grad_norm": 1.5810547681743272, "language_loss": 0.79627913, "learning_rate": 2.5588166462919977e-06, "loss": 0.81811035, "num_input_tokens_seen": 146336650, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.828125, "step": 6818, "time_per_iteration": 2.6668484210968018 }, { "auxiliary_loss_clip": 0.01146767, "auxiliary_loss_mlp": 0.01038755, "balance_loss_clip": 1.02415776, "balance_loss_mlp": 1.04187489, "epoch": 0.4099804599428829, "flos": 29641888431360.0, "grad_norm": 1.6319520414581248, "language_loss": 0.65817451, "learning_rate": 2.558453861761367e-06, "loss": 0.68002975, "num_input_tokens_seen": 146357640, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 6819, "time_per_iteration": 2.6225998401641846 }, { "auxiliary_loss_clip": 0.01146076, "auxiliary_loss_mlp": 0.01043486, "balance_loss_clip": 1.0273807, "balance_loss_mlp": 1.04408336, "epoch": 0.41004058319555087, "flos": 24826590979200.0, "grad_norm": 1.560335866637888, "language_loss": 0.8532024, "learning_rate": 2.5580910573009544e-06, "loss": 0.87509799, "num_input_tokens_seen": 146379325, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.83984375, "step": 6820, "time_per_iteration": 2.6581239700317383 }, { "auxiliary_loss_clip": 0.01129686, "auxiliary_loss_mlp": 0.01033729, "balance_loss_clip": 1.01888752, "balance_loss_mlp": 1.04302728, "epoch": 0.41010070644821883, "flos": 25737464995200.0, "grad_norm": 1.6829010664192652, "language_loss": 0.70931864, "learning_rate": 2.5577282329237072e-06, "loss": 0.73095274, "num_input_tokens_seen": 146398635, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.77734375, "step": 6821, "time_per_iteration": 2.6242873668670654 }, { "auxiliary_loss_clip": 0.0113197, "auxiliary_loss_mlp": 0.01037432, "balance_loss_clip": 1.02226877, "balance_loss_mlp": 1.04293633, "epoch": 0.4101608297008868, "flos": 22346061557760.0, "grad_norm": 1.8692519422345648, "language_loss": 0.74355352, "learning_rate": 2.5573653886425745e-06, "loss": 0.76524752, "num_input_tokens_seen": 146417585, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.796875, "step": 6822, "time_per_iteration": 2.645904064178467 }, { "auxiliary_loss_clip": 0.01141241, "auxiliary_loss_mlp": 0.01034545, "balance_loss_clip": 1.01973867, "balance_loss_mlp": 1.04368937, "epoch": 0.41022095295355476, "flos": 21945083057280.0, "grad_norm": 1.9972008220604722, "language_loss": 0.75492442, "learning_rate": 2.5570025244705044e-06, "loss": 0.77668226, "num_input_tokens_seen": 146437035, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80078125, "step": 6823, "time_per_iteration": 2.5549216270446777 }, { "auxiliary_loss_clip": 0.01163916, "auxiliary_loss_mlp": 0.01042515, "balance_loss_clip": 1.02544427, "balance_loss_mlp": 1.04372382, "epoch": 0.4102810762062228, "flos": 27450511493760.0, "grad_norm": 1.9866627756860407, "language_loss": 0.72695953, "learning_rate": 2.5566396404204473e-06, "loss": 0.7490238, "num_input_tokens_seen": 146457370, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.84765625, "step": 6824, "time_per_iteration": 2.674579381942749 }, { "auxiliary_loss_clip": 0.01150495, "auxiliary_loss_mlp": 0.01036268, "balance_loss_clip": 1.02094984, "balance_loss_mlp": 1.04094982, "epoch": 0.41034119945889075, "flos": 24499265316480.0, "grad_norm": 1.6825643435607602, "language_loss": 0.71718132, "learning_rate": 2.556276736505353e-06, "loss": 0.73904896, "num_input_tokens_seen": 146478105, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 6825, "time_per_iteration": 2.5640103816986084 }, { "auxiliary_loss_clip": 0.0114935, "auxiliary_loss_mlp": 0.01043769, "balance_loss_clip": 1.02927947, "balance_loss_mlp": 1.04355431, "epoch": 0.4104013227115587, "flos": 24352641999360.0, "grad_norm": 2.208935004066578, "language_loss": 0.84936976, "learning_rate": 2.555913812738173e-06, "loss": 0.87130105, "num_input_tokens_seen": 146497835, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78515625, "step": 6826, "time_per_iteration": 3.980738639831543 }, { "auxiliary_loss_clip": 0.01137441, "auxiliary_loss_mlp": 0.01285126, "balance_loss_clip": 1.02241015, "balance_loss_mlp": 1.04273212, "epoch": 0.4104614459642267, "flos": 23729340268800.0, "grad_norm": 1.8006596980066953, "language_loss": 0.65985322, "learning_rate": 2.555550869131859e-06, "loss": 0.68407893, "num_input_tokens_seen": 146517735, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.76953125, "step": 6827, "time_per_iteration": 2.653700113296509 }, { "auxiliary_loss_clip": 0.01138341, "auxiliary_loss_mlp": 0.01035003, "balance_loss_clip": 1.02026916, "balance_loss_mlp": 1.04226995, "epoch": 0.41052156921689464, "flos": 22127976132480.0, "grad_norm": 1.4024963469541656, "language_loss": 0.71921879, "learning_rate": 2.555187905699364e-06, "loss": 0.74095225, "num_input_tokens_seen": 146537640, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78125, "step": 6828, "time_per_iteration": 2.561556100845337 }, { "auxiliary_loss_clip": 0.01145341, "auxiliary_loss_mlp": 0.01040727, "balance_loss_clip": 1.02476501, "balance_loss_mlp": 1.04463005, "epoch": 0.4105816924695626, "flos": 20084371747200.0, "grad_norm": 1.919823352080711, "language_loss": 0.83369994, "learning_rate": 2.5548249224536404e-06, "loss": 0.85556066, "num_input_tokens_seen": 146554695, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.828125, "step": 6829, "time_per_iteration": 2.6312990188598633 }, { "auxiliary_loss_clip": 0.01130553, "auxiliary_loss_mlp": 0.01035547, "balance_loss_clip": 1.02059841, "balance_loss_mlp": 1.04286742, "epoch": 0.4106418157222306, "flos": 18076785724800.0, "grad_norm": 1.548837258942628, "language_loss": 0.89766097, "learning_rate": 2.5544619194076425e-06, "loss": 0.91932195, "num_input_tokens_seen": 146573740, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7890625, "step": 6830, "time_per_iteration": 2.5459635257720947 }, { "auxiliary_loss_clip": 0.01150129, "auxiliary_loss_mlp": 0.01031927, "balance_loss_clip": 1.01657295, "balance_loss_mlp": 1.04423296, "epoch": 0.41070193897489854, "flos": 21647850013440.0, "grad_norm": 2.0089716393022483, "language_loss": 0.65106004, "learning_rate": 2.5540988965743252e-06, "loss": 0.67288065, "num_input_tokens_seen": 146592885, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.7890625, "step": 6831, "time_per_iteration": 2.670435667037964 }, { "auxiliary_loss_clip": 0.01120657, "auxiliary_loss_mlp": 0.01035919, "balance_loss_clip": 1.02160239, "balance_loss_mlp": 1.04326594, "epoch": 0.4107620622275665, "flos": 26648195356800.0, "grad_norm": 2.1207129714390134, "language_loss": 0.6936444, "learning_rate": 2.553735853966643e-06, "loss": 0.7152102, "num_input_tokens_seen": 146611995, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7734375, "step": 6832, "time_per_iteration": 4.049876928329468 }, { "auxiliary_loss_clip": 0.0115527, "auxiliary_loss_mlp": 0.01039767, "balance_loss_clip": 1.02530706, "balance_loss_mlp": 1.04167593, "epoch": 0.41082218548023447, "flos": 18734310138240.0, "grad_norm": 2.2122077350747267, "language_loss": 0.73524487, "learning_rate": 2.553372791597553e-06, "loss": 0.75719523, "num_input_tokens_seen": 146628045, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 6833, "time_per_iteration": 2.5802576541900635 }, { "auxiliary_loss_clip": 0.01161215, "auxiliary_loss_mlp": 0.01039699, "balance_loss_clip": 1.02332008, "balance_loss_mlp": 1.04313827, "epoch": 0.41088230873290243, "flos": 22893771116160.0, "grad_norm": 1.7771002010189947, "language_loss": 0.72600698, "learning_rate": 2.553009709480011e-06, "loss": 0.74801612, "num_input_tokens_seen": 146648355, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.828125, "step": 6834, "time_per_iteration": 4.189646244049072 }, { "auxiliary_loss_clip": 0.01134383, "auxiliary_loss_mlp": 0.01044222, "balance_loss_clip": 1.02893972, "balance_loss_mlp": 1.04274857, "epoch": 0.4109424319855704, "flos": 24276978000000.0, "grad_norm": 2.4437567877351194, "language_loss": 0.71228427, "learning_rate": 2.5526466076269756e-06, "loss": 0.7340703, "num_input_tokens_seen": 146668370, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.828125, "step": 6835, "time_per_iteration": 2.5848188400268555 }, { "auxiliary_loss_clip": 0.01132635, "auxiliary_loss_mlp": 0.01042467, "balance_loss_clip": 1.02571821, "balance_loss_mlp": 1.04280937, "epoch": 0.41100255523823837, "flos": 12969139478400.0, "grad_norm": 2.2881614820272316, "language_loss": 0.87074101, "learning_rate": 2.5522834860514044e-06, "loss": 0.892492, "num_input_tokens_seen": 146686665, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.80859375, "step": 6836, "time_per_iteration": 2.6352615356445312 }, { "auxiliary_loss_clip": 0.01130993, "auxiliary_loss_mlp": 0.01039006, "balance_loss_clip": 1.02384281, "balance_loss_mlp": 1.0439049, "epoch": 0.4110626784909064, "flos": 23145648261120.0, "grad_norm": 1.998347810804074, "language_loss": 0.68913651, "learning_rate": 2.5519203447662554e-06, "loss": 0.71083647, "num_input_tokens_seen": 146706570, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.78515625, "step": 6837, "time_per_iteration": 4.781635761260986 }, { "auxiliary_loss_clip": 0.01123509, "auxiliary_loss_mlp": 0.0103966, "balance_loss_clip": 1.02369761, "balance_loss_mlp": 1.0423522, "epoch": 0.41112280174357435, "flos": 22747399194240.0, "grad_norm": 1.8343363876913237, "language_loss": 0.75334156, "learning_rate": 2.5515571837844902e-06, "loss": 0.77497321, "num_input_tokens_seen": 146723425, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8125, "step": 6838, "time_per_iteration": 2.5767202377319336 }, { "auxiliary_loss_clip": 0.0114749, "auxiliary_loss_mlp": 0.0103503, "balance_loss_clip": 1.01918697, "balance_loss_mlp": 1.04229999, "epoch": 0.4111829249962423, "flos": 21102403011840.0, "grad_norm": 2.2470740067773636, "language_loss": 0.8266986, "learning_rate": 2.5511940031190663e-06, "loss": 0.84852374, "num_input_tokens_seen": 146741640, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.78515625, "step": 6839, "time_per_iteration": 2.6320598125457764 }, { "auxiliary_loss_clip": 0.0111822, "auxiliary_loss_mlp": 0.01035103, "balance_loss_clip": 1.02008295, "balance_loss_mlp": 1.03992772, "epoch": 0.4112430482489103, "flos": 21505787723520.0, "grad_norm": 1.8287856667576288, "language_loss": 0.80122542, "learning_rate": 2.550830802782948e-06, "loss": 0.82275867, "num_input_tokens_seen": 146759195, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.78125, "step": 6840, "time_per_iteration": 2.5510001182556152 }, { "auxiliary_loss_clip": 0.01126313, "auxiliary_loss_mlp": 0.01032705, "balance_loss_clip": 1.01856089, "balance_loss_mlp": 1.04095459, "epoch": 0.41130317150157825, "flos": 19570022945280.0, "grad_norm": 1.8118867320781245, "language_loss": 0.67696601, "learning_rate": 2.5504675827890945e-06, "loss": 0.69855618, "num_input_tokens_seen": 146774990, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76953125, "step": 6841, "time_per_iteration": 2.607684373855591 }, { "auxiliary_loss_clip": 0.01055023, "auxiliary_loss_mlp": 0.01010117, "balance_loss_clip": 1.0081023, "balance_loss_mlp": 1.0207057, "epoch": 0.4113632947542462, "flos": 62383157706240.0, "grad_norm": 0.7766275275146494, "language_loss": 0.59656221, "learning_rate": 2.5501043431504683e-06, "loss": 0.61721361, "num_input_tokens_seen": 146839610, "router_z_loss_clip": 0.0201416, "router_z_loss_mlp": 0.25390625, "step": 6842, "time_per_iteration": 3.2414090633392334 }, { "auxiliary_loss_clip": 0.01127807, "auxiliary_loss_mlp": 0.01037675, "balance_loss_clip": 1.02386415, "balance_loss_mlp": 1.04246736, "epoch": 0.4114234180069142, "flos": 13918617636480.0, "grad_norm": 1.9817161251506192, "language_loss": 0.69729102, "learning_rate": 2.5497410838800337e-06, "loss": 0.7189458, "num_input_tokens_seen": 146857360, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 6843, "time_per_iteration": 2.581495523452759 }, { "auxiliary_loss_clip": 0.01070461, "auxiliary_loss_mlp": 0.01006489, "balance_loss_clip": 1.00449848, "balance_loss_mlp": 1.01904678, "epoch": 0.41148354125958214, "flos": 64928505219840.0, "grad_norm": 0.7215781619223957, "language_loss": 0.53648829, "learning_rate": 2.5493778049907537e-06, "loss": 0.55725777, "num_input_tokens_seen": 146917055, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.25, "step": 6844, "time_per_iteration": 3.2227444648742676 }, { "auxiliary_loss_clip": 0.01127457, "auxiliary_loss_mlp": 0.01035698, "balance_loss_clip": 1.02177382, "balance_loss_mlp": 1.04222727, "epoch": 0.4115436645122501, "flos": 18728779443840.0, "grad_norm": 2.1020077872365133, "language_loss": 0.65983182, "learning_rate": 2.549014506495594e-06, "loss": 0.68146336, "num_input_tokens_seen": 146935215, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 6845, "time_per_iteration": 2.5427985191345215 }, { "auxiliary_loss_clip": 0.01119472, "auxiliary_loss_mlp": 0.01035504, "balance_loss_clip": 1.02164626, "balance_loss_mlp": 1.04356384, "epoch": 0.41160378776491807, "flos": 16252918790400.0, "grad_norm": 1.977636638061609, "language_loss": 0.69958121, "learning_rate": 2.5486511884075184e-06, "loss": 0.72113097, "num_input_tokens_seen": 146951970, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 6846, "time_per_iteration": 2.5877857208251953 }, { "auxiliary_loss_clip": 0.01133824, "auxiliary_loss_mlp": 0.0104087, "balance_loss_clip": 1.02564728, "balance_loss_mlp": 1.0382216, "epoch": 0.41166391101758604, "flos": 27970031854080.0, "grad_norm": 1.8589906833092056, "language_loss": 0.64488673, "learning_rate": 2.5482878507394924e-06, "loss": 0.66663373, "num_input_tokens_seen": 146975615, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7734375, "step": 6847, "time_per_iteration": 2.6662726402282715 }, { "auxiliary_loss_clip": 0.01136713, "auxiliary_loss_mlp": 0.01041839, "balance_loss_clip": 1.02700949, "balance_loss_mlp": 1.0400908, "epoch": 0.411724034270254, "flos": 34131296764800.0, "grad_norm": 1.5484176122277116, "language_loss": 0.70384467, "learning_rate": 2.547924493504484e-06, "loss": 0.72563016, "num_input_tokens_seen": 146998855, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78515625, "step": 6848, "time_per_iteration": 2.771559476852417 }, { "auxiliary_loss_clip": 0.01060105, "auxiliary_loss_mlp": 0.01260754, "balance_loss_clip": 1.01269364, "balance_loss_mlp": 1.01708508, "epoch": 0.41178415752292197, "flos": 67923670752000.0, "grad_norm": 0.8955388885993706, "language_loss": 0.56259358, "learning_rate": 2.5475611167154595e-06, "loss": 0.58580214, "num_input_tokens_seen": 147062710, "router_z_loss_clip": 0.01745605, "router_z_loss_mlp": 0.25195312, "step": 6849, "time_per_iteration": 3.2585792541503906 }, { "auxiliary_loss_clip": 0.01136896, "auxiliary_loss_mlp": 0.01039217, "balance_loss_clip": 1.024459, "balance_loss_mlp": 1.04314291, "epoch": 0.41184428077558993, "flos": 34313938444800.0, "grad_norm": 1.7553460704359154, "language_loss": 0.75899971, "learning_rate": 2.5471977203853874e-06, "loss": 0.78076088, "num_input_tokens_seen": 147086075, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7578125, "step": 6850, "time_per_iteration": 2.666966676712036 }, { "auxiliary_loss_clip": 0.01152814, "auxiliary_loss_mlp": 0.0103798, "balance_loss_clip": 1.02402687, "balance_loss_mlp": 1.04084134, "epoch": 0.41190440402825795, "flos": 35444118948480.0, "grad_norm": 1.529933543301259, "language_loss": 0.67806125, "learning_rate": 2.5468343045272363e-06, "loss": 0.69996917, "num_input_tokens_seen": 147107590, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 6851, "time_per_iteration": 2.791872501373291 }, { "auxiliary_loss_clip": 0.01151734, "auxiliary_loss_mlp": 0.01045338, "balance_loss_clip": 1.02879179, "balance_loss_mlp": 1.04244936, "epoch": 0.4119645272809259, "flos": 20849879422080.0, "grad_norm": 2.405874897079605, "language_loss": 0.78581595, "learning_rate": 2.546470869153975e-06, "loss": 0.8077867, "num_input_tokens_seen": 147123715, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.82421875, "step": 6852, "time_per_iteration": 2.5692973136901855 }, { "auxiliary_loss_clip": 0.01124991, "auxiliary_loss_mlp": 0.01040957, "balance_loss_clip": 1.0246973, "balance_loss_mlp": 1.04262114, "epoch": 0.4120246505335939, "flos": 27562050201600.0, "grad_norm": 2.835351091465837, "language_loss": 0.77196783, "learning_rate": 2.546107414278575e-06, "loss": 0.79362732, "num_input_tokens_seen": 147144290, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.82421875, "step": 6853, "time_per_iteration": 2.6423397064208984 }, { "auxiliary_loss_clip": 0.01135474, "auxiliary_loss_mlp": 0.01041656, "balance_loss_clip": 1.02445436, "balance_loss_mlp": 1.04313886, "epoch": 0.41208477378626185, "flos": 37815444046080.0, "grad_norm": 1.8193012551650405, "language_loss": 0.65515399, "learning_rate": 2.545743939914005e-06, "loss": 0.67692536, "num_input_tokens_seen": 147166340, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.83203125, "step": 6854, "time_per_iteration": 2.671515941619873 }, { "auxiliary_loss_clip": 0.01136495, "auxiliary_loss_mlp": 0.01035831, "balance_loss_clip": 1.02144289, "balance_loss_mlp": 1.04087639, "epoch": 0.4121448970389298, "flos": 23440762402560.0, "grad_norm": 1.8513149837020755, "language_loss": 0.8377738, "learning_rate": 2.5453804460732385e-06, "loss": 0.85949707, "num_input_tokens_seen": 147184025, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.77734375, "step": 6855, "time_per_iteration": 2.6334052085876465 }, { "auxiliary_loss_clip": 0.01129183, "auxiliary_loss_mlp": 0.01041808, "balance_loss_clip": 1.02745557, "balance_loss_mlp": 1.04283154, "epoch": 0.4122050202915978, "flos": 21325300859520.0, "grad_norm": 5.163013639274777, "language_loss": 0.78797156, "learning_rate": 2.5450169327692463e-06, "loss": 0.80968142, "num_input_tokens_seen": 147202730, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7734375, "step": 6856, "time_per_iteration": 2.527170419692993 }, { "auxiliary_loss_clip": 0.01149133, "auxiliary_loss_mlp": 0.01034431, "balance_loss_clip": 1.02007794, "balance_loss_mlp": 1.04295397, "epoch": 0.41226514354426574, "flos": 17306286059520.0, "grad_norm": 2.7375894188078953, "language_loss": 0.79827893, "learning_rate": 2.5446534000150017e-06, "loss": 0.82011461, "num_input_tokens_seen": 147215315, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 6857, "time_per_iteration": 2.534217596054077 }, { "auxiliary_loss_clip": 0.01128806, "auxiliary_loss_mlp": 0.01285426, "balance_loss_clip": 1.02285981, "balance_loss_mlp": 1.0396769, "epoch": 0.4123252667969337, "flos": 17638855107840.0, "grad_norm": 2.303364147456553, "language_loss": 0.70468044, "learning_rate": 2.5442898478234787e-06, "loss": 0.72882283, "num_input_tokens_seen": 147233330, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8046875, "step": 6858, "time_per_iteration": 2.4684720039367676 }, { "auxiliary_loss_clip": 0.01128504, "auxiliary_loss_mlp": 0.01040532, "balance_loss_clip": 1.02516544, "balance_loss_mlp": 1.04158998, "epoch": 0.4123853900496017, "flos": 46424811375360.0, "grad_norm": 1.6129979976191129, "language_loss": 0.59661794, "learning_rate": 2.543926276207651e-06, "loss": 0.61830831, "num_input_tokens_seen": 147257780, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.78125, "step": 6859, "time_per_iteration": 2.7572200298309326 }, { "auxiliary_loss_clip": 0.01120867, "auxiliary_loss_mlp": 0.01041096, "balance_loss_clip": 1.02645707, "balance_loss_mlp": 1.04077005, "epoch": 0.41244551330226964, "flos": 17675160779520.0, "grad_norm": 2.114190467031847, "language_loss": 0.73310089, "learning_rate": 2.543562685180494e-06, "loss": 0.75472057, "num_input_tokens_seen": 147276055, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80078125, "step": 6860, "time_per_iteration": 2.4656732082366943 }, { "auxiliary_loss_clip": 0.01172135, "auxiliary_loss_mlp": 0.01034178, "balance_loss_clip": 1.02006316, "balance_loss_mlp": 1.04025483, "epoch": 0.4125056365549376, "flos": 18693730748160.0, "grad_norm": 1.5863761233313607, "language_loss": 0.73538667, "learning_rate": 2.543199074754982e-06, "loss": 0.75744975, "num_input_tokens_seen": 147293200, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.77734375, "step": 6861, "time_per_iteration": 2.630174398422241 }, { "auxiliary_loss_clip": 0.01134971, "auxiliary_loss_mlp": 0.01032429, "balance_loss_clip": 1.01891637, "balance_loss_mlp": 1.04093444, "epoch": 0.41256575980760557, "flos": 17895293280000.0, "grad_norm": 1.910303895656862, "language_loss": 0.79631585, "learning_rate": 2.542835444944093e-06, "loss": 0.81798983, "num_input_tokens_seen": 147310640, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76171875, "step": 6862, "time_per_iteration": 2.647711992263794 }, { "auxiliary_loss_clip": 0.011394, "auxiliary_loss_mlp": 0.01038531, "balance_loss_clip": 1.02391648, "balance_loss_mlp": 1.04215014, "epoch": 0.41262588306027354, "flos": 21981316901760.0, "grad_norm": 1.6531574277458636, "language_loss": 0.75654912, "learning_rate": 2.5424717957608034e-06, "loss": 0.77832842, "num_input_tokens_seen": 147329435, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 6863, "time_per_iteration": 2.598609685897827 }, { "auxiliary_loss_clip": 0.01049547, "auxiliary_loss_mlp": 0.01004342, "balance_loss_clip": 1.00266123, "balance_loss_mlp": 1.01539779, "epoch": 0.41268600631294156, "flos": 53350006740480.0, "grad_norm": 0.6944056828115127, "language_loss": 0.5271076, "learning_rate": 2.5421081272180904e-06, "loss": 0.54764652, "num_input_tokens_seen": 147385805, "router_z_loss_clip": 0.0168457, "router_z_loss_mlp": 0.25, "step": 6864, "time_per_iteration": 3.120161533355713 }, { "auxiliary_loss_clip": 0.01122747, "auxiliary_loss_mlp": 0.01035572, "balance_loss_clip": 1.02034903, "balance_loss_mlp": 1.0405252, "epoch": 0.4127461295656095, "flos": 24385356311040.0, "grad_norm": 1.8063526889156618, "language_loss": 0.72607094, "learning_rate": 2.541744439328933e-06, "loss": 0.74765414, "num_input_tokens_seen": 147405160, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.82421875, "step": 6865, "time_per_iteration": 2.573002815246582 }, { "auxiliary_loss_clip": 0.01040598, "auxiliary_loss_mlp": 0.01001886, "balance_loss_clip": 1.00015736, "balance_loss_mlp": 1.01546431, "epoch": 0.4128062528182775, "flos": 71705242696320.0, "grad_norm": 0.9283881231582833, "language_loss": 0.66534877, "learning_rate": 2.5413807321063097e-06, "loss": 0.68577361, "num_input_tokens_seen": 147460245, "router_z_loss_clip": 0.01733398, "router_z_loss_mlp": 0.25, "step": 6866, "time_per_iteration": 3.0429890155792236 }, { "auxiliary_loss_clip": 0.01119357, "auxiliary_loss_mlp": 0.01039001, "balance_loss_clip": 1.02430224, "balance_loss_mlp": 1.04130542, "epoch": 0.41286637607094545, "flos": 17849111368320.0, "grad_norm": 2.3159355343775303, "language_loss": 0.80925071, "learning_rate": 2.5410170055632016e-06, "loss": 0.83083427, "num_input_tokens_seen": 147476200, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 6867, "time_per_iteration": 2.4703354835510254 }, { "auxiliary_loss_clip": 0.01143124, "auxiliary_loss_mlp": 0.01038322, "balance_loss_clip": 1.02314639, "balance_loss_mlp": 1.04342043, "epoch": 0.4129264993236134, "flos": 25549544016000.0, "grad_norm": 2.0501370367082226, "language_loss": 0.77693188, "learning_rate": 2.5406532597125873e-06, "loss": 0.79874635, "num_input_tokens_seen": 147494315, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8203125, "step": 6868, "time_per_iteration": 4.0138328075408936 }, { "auxiliary_loss_clip": 0.0104819, "auxiliary_loss_mlp": 0.01004573, "balance_loss_clip": 1.00285661, "balance_loss_mlp": 1.01462865, "epoch": 0.4129866225762814, "flos": 65414446364160.0, "grad_norm": 0.8434945454214847, "language_loss": 0.57782876, "learning_rate": 2.5402894945674492e-06, "loss": 0.59835637, "num_input_tokens_seen": 147543665, "router_z_loss_clip": 0.01721191, "router_z_loss_mlp": 0.25, "step": 6869, "time_per_iteration": 2.901312828063965 }, { "auxiliary_loss_clip": 0.01127603, "auxiliary_loss_mlp": 0.01030732, "balance_loss_clip": 1.0161761, "balance_loss_mlp": 1.04155838, "epoch": 0.41304674582894935, "flos": 28876991287680.0, "grad_norm": 2.19367777978846, "language_loss": 0.75469291, "learning_rate": 2.539925710140769e-06, "loss": 0.77627623, "num_input_tokens_seen": 147564870, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7734375, "step": 6870, "time_per_iteration": 2.6563615798950195 }, { "auxiliary_loss_clip": 0.01149285, "auxiliary_loss_mlp": 0.01039596, "balance_loss_clip": 1.02377677, "balance_loss_mlp": 1.04226851, "epoch": 0.4131068690816173, "flos": 22891975436160.0, "grad_norm": 1.652039837693007, "language_loss": 0.8335557, "learning_rate": 2.539561906445528e-06, "loss": 0.85544449, "num_input_tokens_seen": 147584840, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8046875, "step": 6871, "time_per_iteration": 2.6027143001556396 }, { "auxiliary_loss_clip": 0.01152157, "auxiliary_loss_mlp": 0.01042319, "balance_loss_clip": 1.02700043, "balance_loss_mlp": 1.04409111, "epoch": 0.4131669923342853, "flos": 26065185707520.0, "grad_norm": 2.0398563591496033, "language_loss": 0.68567204, "learning_rate": 2.5391980834947122e-06, "loss": 0.70761681, "num_input_tokens_seen": 147604635, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.81640625, "step": 6872, "time_per_iteration": 2.5723683834075928 }, { "auxiliary_loss_clip": 0.01142124, "auxiliary_loss_mlp": 0.01032251, "balance_loss_clip": 1.01795745, "balance_loss_mlp": 1.04372108, "epoch": 0.41322711558695324, "flos": 19244564789760.0, "grad_norm": 1.9583598035776855, "language_loss": 0.75404274, "learning_rate": 2.538834241301303e-06, "loss": 0.77578652, "num_input_tokens_seen": 147620700, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 6873, "time_per_iteration": 2.5340306758880615 }, { "auxiliary_loss_clip": 0.0114124, "auxiliary_loss_mlp": 0.01038532, "balance_loss_clip": 1.02346349, "balance_loss_mlp": 1.04204512, "epoch": 0.4132872388396212, "flos": 22674464628480.0, "grad_norm": 2.107337631489841, "language_loss": 0.83146, "learning_rate": 2.5384703798782852e-06, "loss": 0.85325772, "num_input_tokens_seen": 147639490, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 6874, "time_per_iteration": 3.9496123790740967 }, { "auxiliary_loss_clip": 0.01150833, "auxiliary_loss_mlp": 0.01035355, "balance_loss_clip": 1.01960754, "balance_loss_mlp": 1.0426209, "epoch": 0.4133473620922892, "flos": 20150195420160.0, "grad_norm": 2.4885363918857393, "language_loss": 0.7147994, "learning_rate": 2.538106499238646e-06, "loss": 0.73666126, "num_input_tokens_seen": 147657205, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8125, "step": 6875, "time_per_iteration": 2.63173770904541 }, { "auxiliary_loss_clip": 0.01133341, "auxiliary_loss_mlp": 0.01040038, "balance_loss_clip": 1.02618003, "balance_loss_mlp": 1.04100871, "epoch": 0.41340748534495714, "flos": 24242755317120.0, "grad_norm": 1.515281593648586, "language_loss": 0.82649314, "learning_rate": 2.537742599395369e-06, "loss": 0.84822696, "num_input_tokens_seen": 147677005, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 6876, "time_per_iteration": 4.205667734146118 }, { "auxiliary_loss_clip": 0.01134331, "auxiliary_loss_mlp": 0.01038707, "balance_loss_clip": 1.0223279, "balance_loss_mlp": 1.04188895, "epoch": 0.41346760859762516, "flos": 14392171566720.0, "grad_norm": 2.5689236640948048, "language_loss": 0.65369022, "learning_rate": 2.5373786803614423e-06, "loss": 0.67542058, "num_input_tokens_seen": 147693435, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.83203125, "step": 6877, "time_per_iteration": 2.4884893894195557 }, { "auxiliary_loss_clip": 0.01128847, "auxiliary_loss_mlp": 0.0103032, "balance_loss_clip": 1.01582432, "balance_loss_mlp": 1.04231918, "epoch": 0.4135277318502931, "flos": 22492002516480.0, "grad_norm": 1.7783695107588173, "language_loss": 0.76252961, "learning_rate": 2.5370147421498523e-06, "loss": 0.78412127, "num_input_tokens_seen": 147714000, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 6878, "time_per_iteration": 4.281364679336548 }, { "auxiliary_loss_clip": 0.01130397, "auxiliary_loss_mlp": 0.01282102, "balance_loss_clip": 1.02055275, "balance_loss_mlp": 1.04234767, "epoch": 0.4135878551029611, "flos": 22418744728320.0, "grad_norm": 1.825348553111093, "language_loss": 0.80152667, "learning_rate": 2.536650784773588e-06, "loss": 0.82565165, "num_input_tokens_seen": 147731010, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 6879, "time_per_iteration": 2.6551644802093506 }, { "auxiliary_loss_clip": 0.01119463, "auxiliary_loss_mlp": 0.01035532, "balance_loss_clip": 1.02014232, "balance_loss_mlp": 1.04120409, "epoch": 0.41364797835562905, "flos": 27053232094080.0, "grad_norm": 1.7130623412057708, "language_loss": 0.84763765, "learning_rate": 2.5362868082456376e-06, "loss": 0.86918759, "num_input_tokens_seen": 147750880, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.78125, "step": 6880, "time_per_iteration": 2.621415615081787 }, { "auxiliary_loss_clip": 0.01138424, "auxiliary_loss_mlp": 0.01030445, "balance_loss_clip": 1.0158658, "balance_loss_mlp": 1.04048741, "epoch": 0.413708101608297, "flos": 22967603521920.0, "grad_norm": 1.9220633733419332, "language_loss": 0.70238209, "learning_rate": 2.53592281257899e-06, "loss": 0.72407079, "num_input_tokens_seen": 147771360, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8046875, "step": 6881, "time_per_iteration": 2.6256375312805176 }, { "auxiliary_loss_clip": 0.0111831, "auxiliary_loss_mlp": 0.01043115, "balance_loss_clip": 1.02923274, "balance_loss_mlp": 1.0422796, "epoch": 0.413768224860965, "flos": 13333991875200.0, "grad_norm": 1.7485901872396656, "language_loss": 0.81331551, "learning_rate": 2.5355587977866364e-06, "loss": 0.83492982, "num_input_tokens_seen": 147787440, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 6882, "time_per_iteration": 2.5334267616271973 }, { "auxiliary_loss_clip": 0.01143396, "auxiliary_loss_mlp": 0.01043572, "balance_loss_clip": 1.02640533, "balance_loss_mlp": 1.04364443, "epoch": 0.41382834811363295, "flos": 20813968800000.0, "grad_norm": 2.119201106553004, "language_loss": 0.69574738, "learning_rate": 2.5351947638815665e-06, "loss": 0.71761703, "num_input_tokens_seen": 147805720, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8203125, "step": 6883, "time_per_iteration": 2.639211416244507 }, { "auxiliary_loss_clip": 0.01127919, "auxiliary_loss_mlp": 0.01037875, "balance_loss_clip": 1.02390981, "balance_loss_mlp": 1.04094553, "epoch": 0.4138884713663009, "flos": 20667130001280.0, "grad_norm": 1.8687338582177448, "language_loss": 0.75583965, "learning_rate": 2.5348307108767724e-06, "loss": 0.77749759, "num_input_tokens_seen": 147824605, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 6884, "time_per_iteration": 2.5953102111816406 }, { "auxiliary_loss_clip": 0.01149708, "auxiliary_loss_mlp": 0.01040882, "balance_loss_clip": 1.02590954, "balance_loss_mlp": 1.04171586, "epoch": 0.4139485946189689, "flos": 29056616225280.0, "grad_norm": 1.5523444632984027, "language_loss": 0.75983495, "learning_rate": 2.534466638785246e-06, "loss": 0.7817409, "num_input_tokens_seen": 147845445, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80859375, "step": 6885, "time_per_iteration": 2.733811378479004 }, { "auxiliary_loss_clip": 0.0112217, "auxiliary_loss_mlp": 0.01043437, "balance_loss_clip": 1.02848828, "balance_loss_mlp": 1.04132509, "epoch": 0.41400871787163684, "flos": 24425720219520.0, "grad_norm": 2.1071675376598087, "language_loss": 0.69957352, "learning_rate": 2.5341025476199795e-06, "loss": 0.72122961, "num_input_tokens_seen": 147865580, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8046875, "step": 6886, "time_per_iteration": 2.5652687549591064 }, { "auxiliary_loss_clip": 0.01123667, "auxiliary_loss_mlp": 0.01287753, "balance_loss_clip": 1.02625012, "balance_loss_mlp": 1.04361522, "epoch": 0.4140688411243048, "flos": 19464050845440.0, "grad_norm": 12.1580801553479, "language_loss": 0.74817824, "learning_rate": 2.5337384373939677e-06, "loss": 0.77229249, "num_input_tokens_seen": 147885230, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.80078125, "step": 6887, "time_per_iteration": 2.6258316040039062 }, { "auxiliary_loss_clip": 0.01127794, "auxiliary_loss_mlp": 0.01035131, "balance_loss_clip": 1.02015257, "balance_loss_mlp": 1.04031479, "epoch": 0.4141289643769728, "flos": 19313656600320.0, "grad_norm": 1.8573851319108599, "language_loss": 0.70110857, "learning_rate": 2.5333743081202034e-06, "loss": 0.72273779, "num_input_tokens_seen": 147903035, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.78515625, "step": 6888, "time_per_iteration": 2.500589609146118 }, { "auxiliary_loss_clip": 0.01117879, "auxiliary_loss_mlp": 0.01040873, "balance_loss_clip": 1.02610338, "balance_loss_mlp": 1.03992522, "epoch": 0.41418908762964074, "flos": 16726903683840.0, "grad_norm": 2.031993573822504, "language_loss": 0.76177943, "learning_rate": 2.5330101598116823e-06, "loss": 0.78336698, "num_input_tokens_seen": 147918745, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78125, "step": 6889, "time_per_iteration": 2.5900516510009766 }, { "auxiliary_loss_clip": 0.01148609, "auxiliary_loss_mlp": 0.01285523, "balance_loss_clip": 1.02397001, "balance_loss_mlp": 1.04440737, "epoch": 0.41424921088230876, "flos": 25296840858240.0, "grad_norm": 1.5750608396419317, "language_loss": 0.80124557, "learning_rate": 2.5326459924814007e-06, "loss": 0.82558692, "num_input_tokens_seen": 147938265, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 6890, "time_per_iteration": 2.601994514465332 }, { "auxiliary_loss_clip": 0.01118846, "auxiliary_loss_mlp": 0.01044871, "balance_loss_clip": 1.029863, "balance_loss_mlp": 1.04172826, "epoch": 0.4143093341349767, "flos": 20960520289920.0, "grad_norm": 1.466097048555693, "language_loss": 0.74285096, "learning_rate": 2.532281806142352e-06, "loss": 0.7644881, "num_input_tokens_seen": 147957320, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.76953125, "step": 6891, "time_per_iteration": 2.616112470626831 }, { "auxiliary_loss_clip": 0.01159844, "auxiliary_loss_mlp": 0.0104258, "balance_loss_clip": 1.02583122, "balance_loss_mlp": 1.04340315, "epoch": 0.4143694573876447, "flos": 22017694400640.0, "grad_norm": 2.8942260112263987, "language_loss": 0.84242857, "learning_rate": 2.531917600807536e-06, "loss": 0.86445284, "num_input_tokens_seen": 147977045, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8046875, "step": 6892, "time_per_iteration": 2.623122215270996 }, { "auxiliary_loss_clip": 0.01146515, "auxiliary_loss_mlp": 0.01281617, "balance_loss_clip": 1.02045846, "balance_loss_mlp": 1.04271841, "epoch": 0.41442958064031266, "flos": 35697396723840.0, "grad_norm": 2.114385091304138, "language_loss": 0.70668429, "learning_rate": 2.5315533764899487e-06, "loss": 0.73096561, "num_input_tokens_seen": 147996905, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 6893, "time_per_iteration": 2.807677984237671 }, { "auxiliary_loss_clip": 0.01117889, "auxiliary_loss_mlp": 0.0103547, "balance_loss_clip": 1.02109349, "balance_loss_mlp": 1.0404923, "epoch": 0.4144897038929806, "flos": 28293766156800.0, "grad_norm": 1.4197466813836763, "language_loss": 0.72598207, "learning_rate": 2.5311891332025886e-06, "loss": 0.74751562, "num_input_tokens_seen": 148017875, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 6894, "time_per_iteration": 2.592533588409424 }, { "auxiliary_loss_clip": 0.01130627, "auxiliary_loss_mlp": 0.01036577, "balance_loss_clip": 1.02192605, "balance_loss_mlp": 1.04129672, "epoch": 0.4145498271456486, "flos": 11648093080320.0, "grad_norm": 2.4387791417956444, "language_loss": 0.61693609, "learning_rate": 2.530824870958455e-06, "loss": 0.63860822, "num_input_tokens_seen": 148032300, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 6895, "time_per_iteration": 2.6098792552948 }, { "auxiliary_loss_clip": 0.01145095, "auxiliary_loss_mlp": 0.01034505, "balance_loss_clip": 1.02005708, "balance_loss_mlp": 1.04115915, "epoch": 0.41460995039831655, "flos": 27235622378880.0, "grad_norm": 2.5637430821463747, "language_loss": 0.70552438, "learning_rate": 2.5304605897705465e-06, "loss": 0.72732043, "num_input_tokens_seen": 148053260, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76953125, "step": 6896, "time_per_iteration": 2.6297266483306885 }, { "auxiliary_loss_clip": 0.01120882, "auxiliary_loss_mlp": 0.01043329, "balance_loss_clip": 1.02760518, "balance_loss_mlp": 1.04108787, "epoch": 0.4146700736509845, "flos": 25922369232000.0, "grad_norm": 1.6938371238228025, "language_loss": 0.7283417, "learning_rate": 2.530096289651865e-06, "loss": 0.74998373, "num_input_tokens_seen": 148072965, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.796875, "step": 6897, "time_per_iteration": 2.594688892364502 }, { "auxiliary_loss_clip": 0.01129282, "auxiliary_loss_mlp": 0.0104401, "balance_loss_clip": 1.029109, "balance_loss_mlp": 1.04311526, "epoch": 0.4147301969036525, "flos": 26833243248000.0, "grad_norm": 1.831748780668358, "language_loss": 0.84466588, "learning_rate": 2.5297319706154095e-06, "loss": 0.86639881, "num_input_tokens_seen": 148093240, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 6898, "time_per_iteration": 2.6029348373413086 }, { "auxiliary_loss_clip": 0.01155264, "auxiliary_loss_mlp": 0.01037521, "balance_loss_clip": 1.02244079, "balance_loss_mlp": 1.03972304, "epoch": 0.41479032015632045, "flos": 20298291194880.0, "grad_norm": 1.613399797039445, "language_loss": 0.74350512, "learning_rate": 2.5293676326741838e-06, "loss": 0.76543295, "num_input_tokens_seen": 148110925, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.80078125, "step": 6899, "time_per_iteration": 2.6287200450897217 }, { "auxiliary_loss_clip": 0.01151662, "auxiliary_loss_mlp": 0.0104366, "balance_loss_clip": 1.02705383, "balance_loss_mlp": 1.04207182, "epoch": 0.4148504434089884, "flos": 25264988472960.0, "grad_norm": 2.1447049255523116, "language_loss": 0.7550562, "learning_rate": 2.529003275841188e-06, "loss": 0.77700943, "num_input_tokens_seen": 148130670, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.828125, "step": 6900, "time_per_iteration": 2.6402688026428223 }, { "auxiliary_loss_clip": 0.011279, "auxiliary_loss_mlp": 0.0103756, "balance_loss_clip": 1.02147889, "balance_loss_mlp": 1.04099882, "epoch": 0.4149105666616564, "flos": 12822300679680.0, "grad_norm": 2.036199708786435, "language_loss": 0.8052882, "learning_rate": 2.5286389001294265e-06, "loss": 0.82694286, "num_input_tokens_seen": 148148350, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.77734375, "step": 6901, "time_per_iteration": 2.5954020023345947 }, { "auxiliary_loss_clip": 0.01145366, "auxiliary_loss_mlp": 0.0103476, "balance_loss_clip": 1.01973343, "balance_loss_mlp": 1.04094112, "epoch": 0.41497068991432434, "flos": 16763891713920.0, "grad_norm": 2.108050735433857, "language_loss": 0.69453979, "learning_rate": 2.5282745055519027e-06, "loss": 0.71634108, "num_input_tokens_seen": 148167550, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.77734375, "step": 6902, "time_per_iteration": 2.641136407852173 }, { "auxiliary_loss_clip": 0.01135547, "auxiliary_loss_mlp": 0.01286378, "balance_loss_clip": 1.02499652, "balance_loss_mlp": 1.04185498, "epoch": 0.4150308131669923, "flos": 18000906243840.0, "grad_norm": 1.7199804747347542, "language_loss": 0.84030354, "learning_rate": 2.5279100921216197e-06, "loss": 0.86452281, "num_input_tokens_seen": 148184740, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76171875, "step": 6903, "time_per_iteration": 2.6258559226989746 }, { "auxiliary_loss_clip": 0.01123302, "auxiliary_loss_mlp": 0.0104078, "balance_loss_clip": 1.02417374, "balance_loss_mlp": 1.04130769, "epoch": 0.41509093641966033, "flos": 30044770352640.0, "grad_norm": 2.439317757781939, "language_loss": 0.6791271, "learning_rate": 2.5275456598515846e-06, "loss": 0.70076793, "num_input_tokens_seen": 148204605, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8203125, "step": 6904, "time_per_iteration": 2.7052998542785645 }, { "auxiliary_loss_clip": 0.01121568, "auxiliary_loss_mlp": 0.01285482, "balance_loss_clip": 1.02300429, "balance_loss_mlp": 1.04100561, "epoch": 0.4151510596723283, "flos": 24279994742400.0, "grad_norm": 1.588312222711701, "language_loss": 0.77517128, "learning_rate": 2.5271812087548014e-06, "loss": 0.79924178, "num_input_tokens_seen": 148224675, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8046875, "step": 6905, "time_per_iteration": 2.6352131366729736 }, { "auxiliary_loss_clip": 0.01123509, "auxiliary_loss_mlp": 0.0103498, "balance_loss_clip": 1.01970959, "balance_loss_mlp": 1.04314995, "epoch": 0.41521118292499626, "flos": 23476206147840.0, "grad_norm": 1.5382193912591178, "language_loss": 0.68164533, "learning_rate": 2.526816738844277e-06, "loss": 0.70323026, "num_input_tokens_seen": 148243375, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8046875, "step": 6906, "time_per_iteration": 2.6470091342926025 }, { "auxiliary_loss_clip": 0.01136091, "auxiliary_loss_mlp": 0.0103495, "balance_loss_clip": 1.02050829, "balance_loss_mlp": 1.04154956, "epoch": 0.4152713061776642, "flos": 22125498094080.0, "grad_norm": 3.0670736499775417, "language_loss": 0.67383039, "learning_rate": 2.5264522501330186e-06, "loss": 0.69554079, "num_input_tokens_seen": 148261140, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76953125, "step": 6907, "time_per_iteration": 2.5141239166259766 }, { "auxiliary_loss_clip": 0.01151313, "auxiliary_loss_mlp": 0.01035478, "balance_loss_clip": 1.02147114, "balance_loss_mlp": 1.04522359, "epoch": 0.4153314294303322, "flos": 21251396626560.0, "grad_norm": 1.8750772915995044, "language_loss": 0.77031815, "learning_rate": 2.5260877426340326e-06, "loss": 0.79218602, "num_input_tokens_seen": 148279655, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.79296875, "step": 6908, "time_per_iteration": 2.682401657104492 }, { "auxiliary_loss_clip": 0.01060547, "auxiliary_loss_mlp": 0.01016481, "balance_loss_clip": 1.01475286, "balance_loss_mlp": 1.01721919, "epoch": 0.41539155268300015, "flos": 57920681594880.0, "grad_norm": 0.7665937521293749, "language_loss": 0.64773631, "learning_rate": 2.5257232163603297e-06, "loss": 0.6685065, "num_input_tokens_seen": 148339005, "router_z_loss_clip": 0.01733398, "router_z_loss_mlp": 0.25195312, "step": 6909, "time_per_iteration": 3.152317762374878 }, { "auxiliary_loss_clip": 0.01147002, "auxiliary_loss_mlp": 0.01037157, "balance_loss_clip": 1.02263689, "balance_loss_mlp": 1.04257965, "epoch": 0.4154516759356681, "flos": 21903677654400.0, "grad_norm": 1.6143712581032978, "language_loss": 0.87194091, "learning_rate": 2.5253586713249164e-06, "loss": 0.8937825, "num_input_tokens_seen": 148358715, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7734375, "step": 6910, "time_per_iteration": 3.927109718322754 }, { "auxiliary_loss_clip": 0.01050981, "auxiliary_loss_mlp": 0.01001579, "balance_loss_clip": 0.99987471, "balance_loss_mlp": 1.01681125, "epoch": 0.4155117991883361, "flos": 67833677226240.0, "grad_norm": 0.7970160706371858, "language_loss": 0.62153983, "learning_rate": 2.524994107540804e-06, "loss": 0.64206541, "num_input_tokens_seen": 148417280, "router_z_loss_clip": 0.01708984, "router_z_loss_mlp": 0.25, "step": 6911, "time_per_iteration": 3.154212474822998 }, { "auxiliary_loss_clip": 0.01128236, "auxiliary_loss_mlp": 0.01037769, "balance_loss_clip": 1.02255774, "balance_loss_mlp": 1.04426026, "epoch": 0.41557192244100405, "flos": 14282679934080.0, "grad_norm": 6.485450341424141, "language_loss": 0.87917536, "learning_rate": 2.5246295250210024e-06, "loss": 0.90083545, "num_input_tokens_seen": 148432610, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83984375, "step": 6912, "time_per_iteration": 2.479609489440918 }, { "auxiliary_loss_clip": 0.01130248, "auxiliary_loss_mlp": 0.01035649, "balance_loss_clip": 1.02139771, "balance_loss_mlp": 1.04246616, "epoch": 0.415632045693672, "flos": 24461954064000.0, "grad_norm": 2.3749306247932296, "language_loss": 0.62761569, "learning_rate": 2.5242649237785224e-06, "loss": 0.64927465, "num_input_tokens_seen": 148451510, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 6913, "time_per_iteration": 2.6262996196746826 }, { "auxiliary_loss_clip": 0.01136413, "auxiliary_loss_mlp": 0.01283369, "balance_loss_clip": 1.02221358, "balance_loss_mlp": 1.04160535, "epoch": 0.41569216894634, "flos": 20115290378880.0, "grad_norm": 2.2291986518128484, "language_loss": 0.75736141, "learning_rate": 2.5239003038263764e-06, "loss": 0.78155923, "num_input_tokens_seen": 148469945, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76953125, "step": 6914, "time_per_iteration": 2.5894672870635986 }, { "auxiliary_loss_clip": 0.01158165, "auxiliary_loss_mlp": 0.01043053, "balance_loss_clip": 1.02792501, "balance_loss_mlp": 1.04161656, "epoch": 0.41575229219900794, "flos": 23798827128960.0, "grad_norm": 2.5748924726963454, "language_loss": 0.8792206, "learning_rate": 2.523535665177575e-06, "loss": 0.90123284, "num_input_tokens_seen": 148486655, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.80859375, "step": 6915, "time_per_iteration": 3.9963176250457764 }, { "auxiliary_loss_clip": 0.0114137, "auxiliary_loss_mlp": 0.0104307, "balance_loss_clip": 1.0280025, "balance_loss_mlp": 1.04419947, "epoch": 0.4158124154516759, "flos": 23108229267840.0, "grad_norm": 3.745127869105526, "language_loss": 0.71940809, "learning_rate": 2.5231710078451333e-06, "loss": 0.74125248, "num_input_tokens_seen": 148505035, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 6916, "time_per_iteration": 2.659730911254883 }, { "auxiliary_loss_clip": 0.01125162, "auxiliary_loss_mlp": 0.01033651, "balance_loss_clip": 1.01820195, "balance_loss_mlp": 1.04400396, "epoch": 0.41587253870434393, "flos": 24242970798720.0, "grad_norm": 1.5555978106283408, "language_loss": 0.71725535, "learning_rate": 2.522806331842064e-06, "loss": 0.73884344, "num_input_tokens_seen": 148525575, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8125, "step": 6917, "time_per_iteration": 4.085587739944458 }, { "auxiliary_loss_clip": 0.01136908, "auxiliary_loss_mlp": 0.0103682, "balance_loss_clip": 1.02203798, "balance_loss_mlp": 1.04050672, "epoch": 0.4159326619570119, "flos": 23881602021120.0, "grad_norm": 1.8114875568983888, "language_loss": 0.80976164, "learning_rate": 2.522441637181381e-06, "loss": 0.83149892, "num_input_tokens_seen": 148547270, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78125, "step": 6918, "time_per_iteration": 2.6693432331085205 }, { "auxiliary_loss_clip": 0.01134412, "auxiliary_loss_mlp": 0.01040038, "balance_loss_clip": 1.02423131, "balance_loss_mlp": 1.04436827, "epoch": 0.41599278520967986, "flos": 40626531354240.0, "grad_norm": 1.3716183432571507, "language_loss": 0.70231879, "learning_rate": 2.5220769238761008e-06, "loss": 0.72406328, "num_input_tokens_seen": 148572100, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8125, "step": 6919, "time_per_iteration": 2.8159852027893066 }, { "auxiliary_loss_clip": 0.0114351, "auxiliary_loss_mlp": 0.0103772, "balance_loss_clip": 1.02165031, "balance_loss_mlp": 1.04510033, "epoch": 0.4160529084623478, "flos": 18222942165120.0, "grad_norm": 1.909494906729072, "language_loss": 0.80990475, "learning_rate": 2.5217121919392378e-06, "loss": 0.83171707, "num_input_tokens_seen": 148591245, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.80859375, "step": 6920, "time_per_iteration": 4.193775177001953 }, { "auxiliary_loss_clip": 0.01143527, "auxiliary_loss_mlp": 0.01040557, "balance_loss_clip": 1.02367711, "balance_loss_mlp": 1.04372919, "epoch": 0.4161130317150158, "flos": 13661963982720.0, "grad_norm": 3.5426186566942492, "language_loss": 0.65598428, "learning_rate": 2.521347441383808e-06, "loss": 0.67782509, "num_input_tokens_seen": 148607980, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8203125, "step": 6921, "time_per_iteration": 2.545020580291748 }, { "auxiliary_loss_clip": 0.01140183, "auxiliary_loss_mlp": 0.01041978, "balance_loss_clip": 1.0263381, "balance_loss_mlp": 1.04243469, "epoch": 0.41617315496768376, "flos": 16178511767040.0, "grad_norm": 2.2610884906142235, "language_loss": 0.80672121, "learning_rate": 2.5209826722228293e-06, "loss": 0.82854283, "num_input_tokens_seen": 148624490, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.796875, "step": 6922, "time_per_iteration": 2.522273540496826 }, { "auxiliary_loss_clip": 0.01141773, "auxiliary_loss_mlp": 0.01035571, "balance_loss_clip": 1.02137351, "balance_loss_mlp": 1.04260349, "epoch": 0.4162332782203517, "flos": 26213317395840.0, "grad_norm": 1.7366315330718132, "language_loss": 0.67527187, "learning_rate": 2.5206178844693195e-06, "loss": 0.69704533, "num_input_tokens_seen": 148646490, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8125, "step": 6923, "time_per_iteration": 2.5886454582214355 }, { "auxiliary_loss_clip": 0.01141606, "auxiliary_loss_mlp": 0.01046716, "balance_loss_clip": 1.03021765, "balance_loss_mlp": 1.04280519, "epoch": 0.4162934014730197, "flos": 28183987215360.0, "grad_norm": 1.7894375102629987, "language_loss": 0.75577945, "learning_rate": 2.5202530781362966e-06, "loss": 0.77766263, "num_input_tokens_seen": 148668580, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8046875, "step": 6924, "time_per_iteration": 2.5969951152801514 }, { "auxiliary_loss_clip": 0.01124668, "auxiliary_loss_mlp": 0.01036863, "balance_loss_clip": 1.02202129, "balance_loss_mlp": 1.04297996, "epoch": 0.41635352472568765, "flos": 19865316654720.0, "grad_norm": 1.6296547979337577, "language_loss": 0.7282089, "learning_rate": 2.51988825323678e-06, "loss": 0.74982429, "num_input_tokens_seen": 148688410, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.81640625, "step": 6925, "time_per_iteration": 2.5730462074279785 }, { "auxiliary_loss_clip": 0.01138437, "auxiliary_loss_mlp": 0.01037266, "balance_loss_clip": 1.02316999, "balance_loss_mlp": 1.04047918, "epoch": 0.4164136479783556, "flos": 14935356011520.0, "grad_norm": 1.8711244786402568, "language_loss": 0.83206356, "learning_rate": 2.5195234097837883e-06, "loss": 0.85382062, "num_input_tokens_seen": 148704855, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8046875, "step": 6926, "time_per_iteration": 2.5637032985687256 }, { "auxiliary_loss_clip": 0.01136886, "auxiliary_loss_mlp": 0.01036376, "balance_loss_clip": 1.02295864, "balance_loss_mlp": 1.04268742, "epoch": 0.4164737712310236, "flos": 21757593041280.0, "grad_norm": 1.6773700448413216, "language_loss": 0.86019552, "learning_rate": 2.5191585477903423e-06, "loss": 0.88192815, "num_input_tokens_seen": 148723065, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7578125, "step": 6927, "time_per_iteration": 2.6075472831726074 }, { "auxiliary_loss_clip": 0.011295, "auxiliary_loss_mlp": 0.01041162, "balance_loss_clip": 1.02501535, "balance_loss_mlp": 1.04141521, "epoch": 0.41653389448369155, "flos": 20740136394240.0, "grad_norm": 2.1348863741948234, "language_loss": 0.7171967, "learning_rate": 2.5187936672694636e-06, "loss": 0.7389034, "num_input_tokens_seen": 148741780, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.79296875, "step": 6928, "time_per_iteration": 2.5723187923431396 }, { "auxiliary_loss_clip": 0.01136113, "auxiliary_loss_mlp": 0.01042422, "balance_loss_clip": 1.02701974, "balance_loss_mlp": 1.03879929, "epoch": 0.4165940177363595, "flos": 24972891073920.0, "grad_norm": 1.879633155526927, "language_loss": 0.77949262, "learning_rate": 2.5184287682341733e-06, "loss": 0.80127794, "num_input_tokens_seen": 148759795, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.796875, "step": 6929, "time_per_iteration": 2.615018129348755 }, { "auxiliary_loss_clip": 0.0112214, "auxiliary_loss_mlp": 0.010356, "balance_loss_clip": 1.02134275, "balance_loss_mlp": 1.04205608, "epoch": 0.41665414098902753, "flos": 20521727746560.0, "grad_norm": 1.7758200417767014, "language_loss": 0.70929939, "learning_rate": 2.5180638506974935e-06, "loss": 0.7308768, "num_input_tokens_seen": 148778680, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.80078125, "step": 6930, "time_per_iteration": 2.624145746231079 }, { "auxiliary_loss_clip": 0.01139104, "auxiliary_loss_mlp": 0.01037107, "balance_loss_clip": 1.02255774, "balance_loss_mlp": 1.0412066, "epoch": 0.4167142642416955, "flos": 19682926369920.0, "grad_norm": 2.008961467028293, "language_loss": 0.80682647, "learning_rate": 2.517698914672448e-06, "loss": 0.8285886, "num_input_tokens_seen": 148796470, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.80078125, "step": 6931, "time_per_iteration": 2.5138180255889893 }, { "auxiliary_loss_clip": 0.01155198, "auxiliary_loss_mlp": 0.01039265, "balance_loss_clip": 1.02419686, "balance_loss_mlp": 1.04076004, "epoch": 0.41677438749436346, "flos": 23763742519680.0, "grad_norm": 3.0942617451062904, "language_loss": 0.79083157, "learning_rate": 2.5173339601720595e-06, "loss": 0.81277621, "num_input_tokens_seen": 148815300, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 6932, "time_per_iteration": 2.572833776473999 }, { "auxiliary_loss_clip": 0.01141427, "auxiliary_loss_mlp": 0.0128789, "balance_loss_clip": 1.02508771, "balance_loss_mlp": 1.04310739, "epoch": 0.41683451074703143, "flos": 30410053712640.0, "grad_norm": 2.585038522183345, "language_loss": 0.81147766, "learning_rate": 2.516968987209353e-06, "loss": 0.83577085, "num_input_tokens_seen": 148834315, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8046875, "step": 6933, "time_per_iteration": 2.607691526412964 }, { "auxiliary_loss_clip": 0.01124932, "auxiliary_loss_mlp": 0.01037864, "balance_loss_clip": 1.02179503, "balance_loss_mlp": 1.04212141, "epoch": 0.4168946339996994, "flos": 21506757390720.0, "grad_norm": 1.6843895059946747, "language_loss": 0.7645238, "learning_rate": 2.516603995797353e-06, "loss": 0.78615177, "num_input_tokens_seen": 148852420, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.828125, "step": 6934, "time_per_iteration": 2.5791709423065186 }, { "auxiliary_loss_clip": 0.01132928, "auxiliary_loss_mlp": 0.01033715, "balance_loss_clip": 1.01867056, "balance_loss_mlp": 1.0434947, "epoch": 0.41695475725236736, "flos": 17638675539840.0, "grad_norm": 1.7899467939008342, "language_loss": 0.670928, "learning_rate": 2.5162389859490856e-06, "loss": 0.69259447, "num_input_tokens_seen": 148869305, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 6935, "time_per_iteration": 2.45216703414917 }, { "auxiliary_loss_clip": 0.01141229, "auxiliary_loss_mlp": 0.01042852, "balance_loss_clip": 1.02750969, "balance_loss_mlp": 1.04344845, "epoch": 0.4170148805050353, "flos": 15668903560320.0, "grad_norm": 2.2914882318805785, "language_loss": 0.7296977, "learning_rate": 2.5158739576775766e-06, "loss": 0.75153852, "num_input_tokens_seen": 148886395, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.80078125, "step": 6936, "time_per_iteration": 2.555551767349243 }, { "auxiliary_loss_clip": 0.0113021, "auxiliary_loss_mlp": 0.01037803, "balance_loss_clip": 1.02337873, "balance_loss_mlp": 1.04220927, "epoch": 0.4170750037577033, "flos": 15159151699200.0, "grad_norm": 3.7650070378418516, "language_loss": 0.74425936, "learning_rate": 2.5155089109958526e-06, "loss": 0.76593947, "num_input_tokens_seen": 148905235, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 6937, "time_per_iteration": 2.487894058227539 }, { "auxiliary_loss_clip": 0.01139006, "auxiliary_loss_mlp": 0.01037583, "balance_loss_clip": 1.02348101, "balance_loss_mlp": 1.04271996, "epoch": 0.41713512701037125, "flos": 28768289754240.0, "grad_norm": 1.4452030044706032, "language_loss": 0.84026343, "learning_rate": 2.5151438459169424e-06, "loss": 0.86202931, "num_input_tokens_seen": 148928130, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78515625, "step": 6938, "time_per_iteration": 2.7070703506469727 }, { "auxiliary_loss_clip": 0.01144885, "auxiliary_loss_mlp": 0.01037535, "balance_loss_clip": 1.02144217, "balance_loss_mlp": 1.04369068, "epoch": 0.4171952502630392, "flos": 28256993608320.0, "grad_norm": 1.808807312030379, "language_loss": 0.74243701, "learning_rate": 2.514778762453873e-06, "loss": 0.76426119, "num_input_tokens_seen": 148948790, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.83203125, "step": 6939, "time_per_iteration": 2.6068103313446045 }, { "auxiliary_loss_clip": 0.01131787, "auxiliary_loss_mlp": 0.01036038, "balance_loss_clip": 1.02039742, "balance_loss_mlp": 1.04294991, "epoch": 0.4172553735157072, "flos": 24571697091840.0, "grad_norm": 1.6036485044647566, "language_loss": 0.74833465, "learning_rate": 2.5144136606196732e-06, "loss": 0.77001286, "num_input_tokens_seen": 148967690, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.80078125, "step": 6940, "time_per_iteration": 2.6177423000335693 }, { "auxiliary_loss_clip": 0.0112251, "auxiliary_loss_mlp": 0.01040151, "balance_loss_clip": 1.02601838, "balance_loss_mlp": 1.04269862, "epoch": 0.41731549676837515, "flos": 27045797978880.0, "grad_norm": 1.9516043525102191, "language_loss": 0.7172786, "learning_rate": 2.5140485404273737e-06, "loss": 0.73890513, "num_input_tokens_seen": 148987150, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.796875, "step": 6941, "time_per_iteration": 2.5176594257354736 }, { "auxiliary_loss_clip": 0.01134256, "auxiliary_loss_mlp": 0.01044661, "balance_loss_clip": 1.02940273, "balance_loss_mlp": 1.0433985, "epoch": 0.4173756200210431, "flos": 28394063907840.0, "grad_norm": 1.8770962459614078, "language_loss": 0.73859471, "learning_rate": 2.5136834018900038e-06, "loss": 0.7603839, "num_input_tokens_seen": 149004895, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.81640625, "step": 6942, "time_per_iteration": 2.713306188583374 }, { "auxiliary_loss_clip": 0.01156269, "auxiliary_loss_mlp": 0.01040897, "balance_loss_clip": 1.02506638, "balance_loss_mlp": 1.04539084, "epoch": 0.41743574327371114, "flos": 22521556431360.0, "grad_norm": 4.743297672405828, "language_loss": 0.72668695, "learning_rate": 2.513318245020595e-06, "loss": 0.7486586, "num_input_tokens_seen": 149020970, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8359375, "step": 6943, "time_per_iteration": 2.600116729736328 }, { "auxiliary_loss_clip": 0.01153466, "auxiliary_loss_mlp": 0.01278183, "balance_loss_clip": 1.01513624, "balance_loss_mlp": 1.04455376, "epoch": 0.4174958665263791, "flos": 30113431200000.0, "grad_norm": 1.8576401587685214, "language_loss": 0.63820314, "learning_rate": 2.5129530698321775e-06, "loss": 0.66251963, "num_input_tokens_seen": 149041795, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8203125, "step": 6944, "time_per_iteration": 2.683666944503784 }, { "auxiliary_loss_clip": 0.01140703, "auxiliary_loss_mlp": 0.01037889, "balance_loss_clip": 1.02307105, "balance_loss_mlp": 1.04303551, "epoch": 0.41755598977904707, "flos": 25263444188160.0, "grad_norm": 1.5703436392163714, "language_loss": 0.69619733, "learning_rate": 2.5125878763377857e-06, "loss": 0.71798325, "num_input_tokens_seen": 149063700, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 6945, "time_per_iteration": 2.613779067993164 }, { "auxiliary_loss_clip": 0.01148378, "auxiliary_loss_mlp": 0.01044811, "balance_loss_clip": 1.02812791, "balance_loss_mlp": 1.04092491, "epoch": 0.41761611303171503, "flos": 19828580019840.0, "grad_norm": 1.831510188168924, "language_loss": 0.80330062, "learning_rate": 2.5122226645504506e-06, "loss": 0.82523251, "num_input_tokens_seen": 149082410, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.80859375, "step": 6946, "time_per_iteration": 2.615833044052124 }, { "auxiliary_loss_clip": 0.01133679, "auxiliary_loss_mlp": 0.01033223, "balance_loss_clip": 1.01834583, "balance_loss_mlp": 1.04468179, "epoch": 0.417676236284383, "flos": 15523249910400.0, "grad_norm": 2.1599674920126213, "language_loss": 0.77231228, "learning_rate": 2.5118574344832056e-06, "loss": 0.79398131, "num_input_tokens_seen": 149098745, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 6947, "time_per_iteration": 2.5063366889953613 }, { "auxiliary_loss_clip": 0.0112397, "auxiliary_loss_mlp": 0.01037341, "balance_loss_clip": 1.02158117, "balance_loss_mlp": 1.04291296, "epoch": 0.41773635953705096, "flos": 32524473761280.0, "grad_norm": 1.6996178538613573, "language_loss": 0.71104515, "learning_rate": 2.5114921861490865e-06, "loss": 0.73265827, "num_input_tokens_seen": 149122255, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8125, "step": 6948, "time_per_iteration": 2.6434202194213867 }, { "auxiliary_loss_clip": 0.01126082, "auxiliary_loss_mlp": 0.01035395, "balance_loss_clip": 1.02058911, "balance_loss_mlp": 1.03927493, "epoch": 0.4177964827897189, "flos": 23440941970560.0, "grad_norm": 1.4559051338893305, "language_loss": 0.76752889, "learning_rate": 2.511126919561126e-06, "loss": 0.78914368, "num_input_tokens_seen": 149142845, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78125, "step": 6949, "time_per_iteration": 2.6261379718780518 }, { "auxiliary_loss_clip": 0.01143712, "auxiliary_loss_mlp": 0.01039436, "balance_loss_clip": 1.02330697, "balance_loss_mlp": 1.04325926, "epoch": 0.4178566060423869, "flos": 22748907565440.0, "grad_norm": 1.6305395974691965, "language_loss": 0.81777596, "learning_rate": 2.5107616347323617e-06, "loss": 0.83960748, "num_input_tokens_seen": 149163375, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.82421875, "step": 6950, "time_per_iteration": 2.6399927139282227 }, { "auxiliary_loss_clip": 0.01137521, "auxiliary_loss_mlp": 0.01283516, "balance_loss_clip": 1.02314734, "balance_loss_mlp": 1.04329014, "epoch": 0.41791672929505486, "flos": 26032794618240.0, "grad_norm": 1.343903088335148, "language_loss": 0.76002222, "learning_rate": 2.510396331675828e-06, "loss": 0.78423262, "num_input_tokens_seen": 149185610, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76171875, "step": 6951, "time_per_iteration": 3.9802839756011963 }, { "auxiliary_loss_clip": 0.01158969, "auxiliary_loss_mlp": 0.01036312, "balance_loss_clip": 1.02038586, "balance_loss_mlp": 1.04083812, "epoch": 0.4179768525477228, "flos": 19568694142080.0, "grad_norm": 1.9040420324266456, "language_loss": 0.73154092, "learning_rate": 2.5100310104045613e-06, "loss": 0.75349379, "num_input_tokens_seen": 149203990, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.828125, "step": 6952, "time_per_iteration": 2.551304578781128 }, { "auxiliary_loss_clip": 0.01131608, "auxiliary_loss_mlp": 0.0103504, "balance_loss_clip": 1.02102661, "balance_loss_mlp": 1.04282379, "epoch": 0.4180369758003908, "flos": 17783826399360.0, "grad_norm": 2.669880375803928, "language_loss": 0.71675062, "learning_rate": 2.5096656709316008e-06, "loss": 0.73841709, "num_input_tokens_seen": 149221385, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 6953, "time_per_iteration": 2.5621421337127686 }, { "auxiliary_loss_clip": 0.01128069, "auxiliary_loss_mlp": 0.01038973, "balance_loss_clip": 1.02301073, "balance_loss_mlp": 1.04223144, "epoch": 0.41809709905305875, "flos": 20960663944320.0, "grad_norm": 1.532936106891746, "language_loss": 0.76012117, "learning_rate": 2.509300313269983e-06, "loss": 0.78179157, "num_input_tokens_seen": 149241175, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.76953125, "step": 6954, "time_per_iteration": 2.5124056339263916 }, { "auxiliary_loss_clip": 0.01157055, "auxiliary_loss_mlp": 0.01043558, "balance_loss_clip": 1.02690494, "balance_loss_mlp": 1.04060209, "epoch": 0.4181572223057267, "flos": 22017622573440.0, "grad_norm": 2.3929184116396414, "language_loss": 0.84023619, "learning_rate": 2.5089349374327472e-06, "loss": 0.86224234, "num_input_tokens_seen": 149259115, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.81640625, "step": 6955, "time_per_iteration": 2.6460158824920654 }, { "auxiliary_loss_clip": 0.01128602, "auxiliary_loss_mlp": 0.01037657, "balance_loss_clip": 1.02347112, "balance_loss_mlp": 1.04223418, "epoch": 0.4182173455583947, "flos": 26245528917120.0, "grad_norm": 1.582696947884266, "language_loss": 0.83459669, "learning_rate": 2.5085695434329327e-06, "loss": 0.85625929, "num_input_tokens_seen": 149278705, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 6956, "time_per_iteration": 2.6155471801757812 }, { "auxiliary_loss_clip": 0.01150717, "auxiliary_loss_mlp": 0.01042698, "balance_loss_clip": 1.02627134, "balance_loss_mlp": 1.04268909, "epoch": 0.4182774688110627, "flos": 14791605782400.0, "grad_norm": 1.9456713453272083, "language_loss": 0.71146441, "learning_rate": 2.5082041312835792e-06, "loss": 0.73339856, "num_input_tokens_seen": 149294040, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.81640625, "step": 6957, "time_per_iteration": 3.9566829204559326 }, { "auxiliary_loss_clip": 0.01149193, "auxiliary_loss_mlp": 0.01042001, "balance_loss_clip": 1.02819085, "balance_loss_mlp": 1.04281807, "epoch": 0.41833759206373067, "flos": 20412020632320.0, "grad_norm": 1.800291623334606, "language_loss": 0.75251776, "learning_rate": 2.507838700997728e-06, "loss": 0.77442968, "num_input_tokens_seen": 149310385, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.796875, "step": 6958, "time_per_iteration": 2.616117477416992 }, { "auxiliary_loss_clip": 0.01156256, "auxiliary_loss_mlp": 0.01284011, "balance_loss_clip": 1.02153468, "balance_loss_mlp": 1.04112899, "epoch": 0.41839771531639863, "flos": 23656333875840.0, "grad_norm": 1.7305955889452616, "language_loss": 0.76791215, "learning_rate": 2.5074732525884186e-06, "loss": 0.79231489, "num_input_tokens_seen": 149328235, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.79296875, "step": 6959, "time_per_iteration": 4.101940631866455 }, { "auxiliary_loss_clip": 0.01148634, "auxiliary_loss_mlp": 0.01283517, "balance_loss_clip": 1.02025867, "balance_loss_mlp": 1.04225135, "epoch": 0.4184578385690666, "flos": 19754137082880.0, "grad_norm": 1.768417902187136, "language_loss": 0.76702911, "learning_rate": 2.5071077860686954e-06, "loss": 0.7913506, "num_input_tokens_seen": 149347465, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.796875, "step": 6960, "time_per_iteration": 2.6470088958740234 }, { "auxiliary_loss_clip": 0.01134398, "auxiliary_loss_mlp": 0.01034097, "balance_loss_clip": 1.02048349, "balance_loss_mlp": 1.04121208, "epoch": 0.41851796182173456, "flos": 20193396503040.0, "grad_norm": 1.8975070063139807, "language_loss": 0.75592142, "learning_rate": 2.5067423014515995e-06, "loss": 0.77760637, "num_input_tokens_seen": 149366685, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75390625, "step": 6961, "time_per_iteration": 3.9591996669769287 }, { "auxiliary_loss_clip": 0.0106815, "auxiliary_loss_mlp": 0.01009508, "balance_loss_clip": 1.00795794, "balance_loss_mlp": 1.01664853, "epoch": 0.41857808507440253, "flos": 59018794231680.0, "grad_norm": 0.7836287473482966, "language_loss": 0.54663885, "learning_rate": 2.5063767987501745e-06, "loss": 0.56741542, "num_input_tokens_seen": 149422925, "router_z_loss_clip": 0.01550293, "router_z_loss_mlp": 0.24609375, "step": 6962, "time_per_iteration": 3.1340079307556152 }, { "auxiliary_loss_clip": 0.01132363, "auxiliary_loss_mlp": 0.0103664, "balance_loss_clip": 1.02226353, "balance_loss_mlp": 1.04225945, "epoch": 0.4186382083270705, "flos": 18478805719680.0, "grad_norm": 1.828179017772221, "language_loss": 0.84786904, "learning_rate": 2.506011277977464e-06, "loss": 0.86955905, "num_input_tokens_seen": 149440820, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80859375, "step": 6963, "time_per_iteration": 2.553110361099243 }, { "auxiliary_loss_clip": 0.01140523, "auxiliary_loss_mlp": 0.01034148, "balance_loss_clip": 1.01941419, "balance_loss_mlp": 1.04360676, "epoch": 0.41869833157973846, "flos": 21578758202880.0, "grad_norm": 3.692747762604183, "language_loss": 0.69862747, "learning_rate": 2.5056457391465123e-06, "loss": 0.72037417, "num_input_tokens_seen": 149461060, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7890625, "step": 6964, "time_per_iteration": 2.5300943851470947 }, { "auxiliary_loss_clip": 0.01121889, "auxiliary_loss_mlp": 0.01040927, "balance_loss_clip": 1.02671766, "balance_loss_mlp": 1.04395103, "epoch": 0.4187584548324064, "flos": 35517412650240.0, "grad_norm": 1.5790371625702218, "language_loss": 0.70751417, "learning_rate": 2.505280182270365e-06, "loss": 0.72914237, "num_input_tokens_seen": 149483115, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 6965, "time_per_iteration": 2.637078046798706 }, { "auxiliary_loss_clip": 0.01155415, "auxiliary_loss_mlp": 0.01034149, "balance_loss_clip": 1.02035666, "balance_loss_mlp": 1.04358149, "epoch": 0.4188185780850744, "flos": 18655880791680.0, "grad_norm": 1.5737945466524894, "language_loss": 0.71939743, "learning_rate": 2.504914607362068e-06, "loss": 0.74129307, "num_input_tokens_seen": 149501495, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.76171875, "step": 6966, "time_per_iteration": 2.562119483947754 }, { "auxiliary_loss_clip": 0.01135586, "auxiliary_loss_mlp": 0.01039085, "balance_loss_clip": 1.02295578, "balance_loss_mlp": 1.04335403, "epoch": 0.41887870133774235, "flos": 40333428374400.0, "grad_norm": 2.8327227930313765, "language_loss": 0.70820242, "learning_rate": 2.5045490144346673e-06, "loss": 0.72994912, "num_input_tokens_seen": 149523170, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.83203125, "step": 6967, "time_per_iteration": 2.6474945545196533 }, { "auxiliary_loss_clip": 0.01050733, "auxiliary_loss_mlp": 0.01003284, "balance_loss_clip": 1.0017935, "balance_loss_mlp": 1.01706874, "epoch": 0.4189388245904103, "flos": 61371336516480.0, "grad_norm": 0.6733797501343372, "language_loss": 0.46195936, "learning_rate": 2.5041834035012103e-06, "loss": 0.48249954, "num_input_tokens_seen": 149583955, "router_z_loss_clip": 0.01489258, "router_z_loss_mlp": 0.24804688, "step": 6968, "time_per_iteration": 3.0979692935943604 }, { "auxiliary_loss_clip": 0.01136207, "auxiliary_loss_mlp": 0.01041672, "balance_loss_clip": 1.0254004, "balance_loss_mlp": 1.04383492, "epoch": 0.4189989478430783, "flos": 28215624119040.0, "grad_norm": 1.6939526218409353, "language_loss": 0.74896705, "learning_rate": 2.503817774574744e-06, "loss": 0.77074587, "num_input_tokens_seen": 149604440, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8359375, "step": 6969, "time_per_iteration": 2.5684144496917725 }, { "auxiliary_loss_clip": 0.01137069, "auxiliary_loss_mlp": 0.01036626, "balance_loss_clip": 1.0226723, "balance_loss_mlp": 1.04107356, "epoch": 0.4190590710957463, "flos": 24565879088640.0, "grad_norm": 2.2402555594982023, "language_loss": 0.80608594, "learning_rate": 2.503452127668318e-06, "loss": 0.82782292, "num_input_tokens_seen": 149623745, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7890625, "step": 6970, "time_per_iteration": 2.6582632064819336 }, { "auxiliary_loss_clip": 0.01129494, "auxiliary_loss_mlp": 0.01036521, "balance_loss_clip": 1.02153683, "balance_loss_mlp": 1.04271865, "epoch": 0.41911919434841427, "flos": 21726027964800.0, "grad_norm": 1.9309732189447382, "language_loss": 0.82901061, "learning_rate": 2.50308646279498e-06, "loss": 0.85067081, "num_input_tokens_seen": 149643025, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.78125, "step": 6971, "time_per_iteration": 2.4826784133911133 }, { "auxiliary_loss_clip": 0.01139049, "auxiliary_loss_mlp": 0.01037912, "balance_loss_clip": 1.02209258, "balance_loss_mlp": 1.04405069, "epoch": 0.41917931760108224, "flos": 17601543855360.0, "grad_norm": 1.6645956885011945, "language_loss": 0.74829698, "learning_rate": 2.50272077996778e-06, "loss": 0.7700665, "num_input_tokens_seen": 149660695, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.76953125, "step": 6972, "time_per_iteration": 2.6121747493743896 }, { "auxiliary_loss_clip": 0.01137527, "auxiliary_loss_mlp": 0.0103338, "balance_loss_clip": 1.01939714, "balance_loss_mlp": 1.04106998, "epoch": 0.4192394408537502, "flos": 37816701022080.0, "grad_norm": 2.051021750151958, "language_loss": 0.72688091, "learning_rate": 2.5023550791997695e-06, "loss": 0.74859005, "num_input_tokens_seen": 149682040, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78515625, "step": 6973, "time_per_iteration": 2.6538188457489014 }, { "auxiliary_loss_clip": 0.01157747, "auxiliary_loss_mlp": 0.0104047, "balance_loss_clip": 1.02492499, "balance_loss_mlp": 1.04628658, "epoch": 0.41929956410641817, "flos": 23107726477440.0, "grad_norm": 5.814341107513382, "language_loss": 0.74761081, "learning_rate": 2.5019893605039976e-06, "loss": 0.769593, "num_input_tokens_seen": 149700855, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.84375, "step": 6974, "time_per_iteration": 2.6841681003570557 }, { "auxiliary_loss_clip": 0.01130011, "auxiliary_loss_mlp": 0.01037297, "balance_loss_clip": 1.02306914, "balance_loss_mlp": 1.04171693, "epoch": 0.41935968735908613, "flos": 22524537260160.0, "grad_norm": 1.7685486772365377, "language_loss": 0.7264719, "learning_rate": 2.501623623893517e-06, "loss": 0.74814498, "num_input_tokens_seen": 149717360, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 6975, "time_per_iteration": 2.7287721633911133 }, { "auxiliary_loss_clip": 0.01130886, "auxiliary_loss_mlp": 0.01036248, "balance_loss_clip": 1.02208543, "balance_loss_mlp": 1.0435245, "epoch": 0.4194198106117541, "flos": 26870446759680.0, "grad_norm": 1.5920495266835089, "language_loss": 0.80809009, "learning_rate": 2.5012578693813796e-06, "loss": 0.82976139, "num_input_tokens_seen": 149738975, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78125, "step": 6976, "time_per_iteration": 2.6371536254882812 }, { "auxiliary_loss_clip": 0.01139664, "auxiliary_loss_mlp": 0.01029806, "balance_loss_clip": 1.01614451, "balance_loss_mlp": 1.04175425, "epoch": 0.41947993386442206, "flos": 19902412425600.0, "grad_norm": 1.8290676846904366, "language_loss": 0.67932832, "learning_rate": 2.5008920969806386e-06, "loss": 0.70102298, "num_input_tokens_seen": 149757055, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.796875, "step": 6977, "time_per_iteration": 2.696687936782837 }, { "auxiliary_loss_clip": 0.01132158, "auxiliary_loss_mlp": 0.01035447, "balance_loss_clip": 1.02117729, "balance_loss_mlp": 1.04310679, "epoch": 0.41954005711709, "flos": 17383889393280.0, "grad_norm": 2.494629684507247, "language_loss": 0.80987638, "learning_rate": 2.5005263067043464e-06, "loss": 0.83155239, "num_input_tokens_seen": 149772885, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 6978, "time_per_iteration": 2.5777995586395264 }, { "auxiliary_loss_clip": 0.01122799, "auxiliary_loss_mlp": 0.01039647, "balance_loss_clip": 1.02431631, "balance_loss_mlp": 1.04147899, "epoch": 0.419600180369758, "flos": 25003306915200.0, "grad_norm": 1.7570556578592953, "language_loss": 0.82539392, "learning_rate": 2.500160498565558e-06, "loss": 0.84701836, "num_input_tokens_seen": 149791515, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8125, "step": 6979, "time_per_iteration": 2.5209286212921143 }, { "auxiliary_loss_clip": 0.01120152, "auxiliary_loss_mlp": 0.01040076, "balance_loss_clip": 1.0256815, "balance_loss_mlp": 1.04157662, "epoch": 0.41966030362242596, "flos": 17383781652480.0, "grad_norm": 1.8770257839097244, "language_loss": 0.83412063, "learning_rate": 2.499794672577329e-06, "loss": 0.8557229, "num_input_tokens_seen": 149807250, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78515625, "step": 6980, "time_per_iteration": 2.4874918460845947 }, { "auxiliary_loss_clip": 0.01124137, "auxiliary_loss_mlp": 0.01040702, "balance_loss_clip": 1.02618241, "balance_loss_mlp": 1.0420289, "epoch": 0.4197204268750939, "flos": 22156165330560.0, "grad_norm": 2.762713064528605, "language_loss": 0.79089129, "learning_rate": 2.4994288287527126e-06, "loss": 0.8125397, "num_input_tokens_seen": 149821640, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 6981, "time_per_iteration": 2.536954402923584 }, { "auxiliary_loss_clip": 0.01143571, "auxiliary_loss_mlp": 0.0103705, "balance_loss_clip": 1.02284622, "balance_loss_mlp": 1.04037905, "epoch": 0.4197805501277619, "flos": 22084128604800.0, "grad_norm": 1.765672089785092, "language_loss": 0.8416428, "learning_rate": 2.499062967104766e-06, "loss": 0.86344904, "num_input_tokens_seen": 149840545, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76171875, "step": 6982, "time_per_iteration": 2.589002847671509 }, { "auxiliary_loss_clip": 0.01132001, "auxiliary_loss_mlp": 0.01039387, "balance_loss_clip": 1.024212, "balance_loss_mlp": 1.04089439, "epoch": 0.4198406733804299, "flos": 26432192920320.0, "grad_norm": 1.9021777731446328, "language_loss": 0.56470579, "learning_rate": 2.498697087646546e-06, "loss": 0.5864197, "num_input_tokens_seen": 149860375, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 6983, "time_per_iteration": 2.7546803951263428 }, { "auxiliary_loss_clip": 0.01129428, "auxiliary_loss_mlp": 0.0103709, "balance_loss_clip": 1.02340448, "balance_loss_mlp": 1.04226768, "epoch": 0.4199007966330979, "flos": 12531029293440.0, "grad_norm": 2.145388284004318, "language_loss": 0.82000363, "learning_rate": 2.49833119039111e-06, "loss": 0.84166873, "num_input_tokens_seen": 149877850, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78515625, "step": 6984, "time_per_iteration": 2.556835412979126 }, { "auxiliary_loss_clip": 0.01128621, "auxiliary_loss_mlp": 0.01032741, "balance_loss_clip": 1.01845384, "balance_loss_mlp": 1.04134226, "epoch": 0.41996091988576584, "flos": 21762944167680.0, "grad_norm": 1.785168277615042, "language_loss": 0.7869277, "learning_rate": 2.497965275351516e-06, "loss": 0.8085413, "num_input_tokens_seen": 149896110, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 6985, "time_per_iteration": 2.5450546741485596 }, { "auxiliary_loss_clip": 0.01124917, "auxiliary_loss_mlp": 0.01041353, "balance_loss_clip": 1.0259397, "balance_loss_mlp": 1.0410583, "epoch": 0.4200210431384338, "flos": 26541935948160.0, "grad_norm": 1.9268257527956036, "language_loss": 0.78103125, "learning_rate": 2.4975993425408216e-06, "loss": 0.8026939, "num_input_tokens_seen": 149916495, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 6986, "time_per_iteration": 2.5905516147613525 }, { "auxiliary_loss_clip": 0.01135776, "auxiliary_loss_mlp": 0.01032115, "balance_loss_clip": 1.01907921, "balance_loss_mlp": 1.04197526, "epoch": 0.42008116639110177, "flos": 26795824254720.0, "grad_norm": 1.564615707851138, "language_loss": 0.7207154, "learning_rate": 2.497233391972087e-06, "loss": 0.74239433, "num_input_tokens_seen": 149936445, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7578125, "step": 6987, "time_per_iteration": 2.666193962097168 }, { "auxiliary_loss_clip": 0.01141432, "auxiliary_loss_mlp": 0.01288711, "balance_loss_clip": 1.02707195, "balance_loss_mlp": 1.04348421, "epoch": 0.42014128964376973, "flos": 32087333243520.0, "grad_norm": 1.455835084259497, "language_loss": 0.74714392, "learning_rate": 2.496867423658371e-06, "loss": 0.77144539, "num_input_tokens_seen": 149959430, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.796875, "step": 6988, "time_per_iteration": 2.6401872634887695 }, { "auxiliary_loss_clip": 0.01132498, "auxiliary_loss_mlp": 0.01042471, "balance_loss_clip": 1.02692628, "balance_loss_mlp": 1.041188, "epoch": 0.4202014128964377, "flos": 26467133875200.0, "grad_norm": 1.4858834257823514, "language_loss": 0.73847109, "learning_rate": 2.496501437612735e-06, "loss": 0.76022083, "num_input_tokens_seen": 149980365, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.82421875, "step": 6989, "time_per_iteration": 2.632721424102783 }, { "auxiliary_loss_clip": 0.01140424, "auxiliary_loss_mlp": 0.01033922, "balance_loss_clip": 1.01909852, "balance_loss_mlp": 1.041731, "epoch": 0.42026153614910566, "flos": 13401216178560.0, "grad_norm": 3.8530236990880615, "language_loss": 0.70252615, "learning_rate": 2.4961354338482406e-06, "loss": 0.72426963, "num_input_tokens_seen": 149997375, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 6990, "time_per_iteration": 2.5930325984954834 }, { "auxiliary_loss_clip": 0.01048962, "auxiliary_loss_mlp": 0.01260322, "balance_loss_clip": 1.01236248, "balance_loss_mlp": 1.01508856, "epoch": 0.42032165940177363, "flos": 60250457635200.0, "grad_norm": 0.8239797015720598, "language_loss": 0.60490787, "learning_rate": 2.4957694123779477e-06, "loss": 0.62800074, "num_input_tokens_seen": 150051230, "router_z_loss_clip": 0.01757812, "router_z_loss_mlp": 0.24707031, "step": 6991, "time_per_iteration": 3.0628461837768555 }, { "auxiliary_loss_clip": 0.01125415, "auxiliary_loss_mlp": 0.0104206, "balance_loss_clip": 1.02727199, "balance_loss_mlp": 1.04404581, "epoch": 0.4203817826544416, "flos": 24535211852160.0, "grad_norm": 1.6605240753775743, "language_loss": 0.82721859, "learning_rate": 2.4954033732149208e-06, "loss": 0.84889334, "num_input_tokens_seen": 150071135, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.81640625, "step": 6992, "time_per_iteration": 2.6232523918151855 }, { "auxiliary_loss_clip": 0.0103889, "auxiliary_loss_mlp": 0.01007999, "balance_loss_clip": 1.00631773, "balance_loss_mlp": 1.01427293, "epoch": 0.42044190590710956, "flos": 58820781530880.0, "grad_norm": 0.8556436155388878, "language_loss": 0.65508306, "learning_rate": 2.495037316372221e-06, "loss": 0.67555189, "num_input_tokens_seen": 150125220, "router_z_loss_clip": 0.0168457, "router_z_loss_mlp": 0.24609375, "step": 6993, "time_per_iteration": 4.528273344039917 }, { "auxiliary_loss_clip": 0.0112787, "auxiliary_loss_mlp": 0.01036438, "balance_loss_clip": 1.0202378, "balance_loss_mlp": 1.0430913, "epoch": 0.4205020291597775, "flos": 16436063260800.0, "grad_norm": 1.9449649678472125, "language_loss": 0.83487594, "learning_rate": 2.4946712418629133e-06, "loss": 0.85651904, "num_input_tokens_seen": 150142300, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84765625, "step": 6994, "time_per_iteration": 2.535367250442505 }, { "auxiliary_loss_clip": 0.01129769, "auxiliary_loss_mlp": 0.01280537, "balance_loss_clip": 1.01912713, "balance_loss_mlp": 1.04295945, "epoch": 0.4205621524124455, "flos": 18405655672320.0, "grad_norm": 1.6209501776180184, "language_loss": 0.78023851, "learning_rate": 2.4943051497000616e-06, "loss": 0.80434155, "num_input_tokens_seen": 150161345, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 6995, "time_per_iteration": 2.507094144821167 }, { "auxiliary_loss_clip": 0.0114254, "auxiliary_loss_mlp": 0.01031482, "balance_loss_clip": 1.01767755, "balance_loss_mlp": 1.03954434, "epoch": 0.4206222756651135, "flos": 25520097841920.0, "grad_norm": 1.405326131011131, "language_loss": 0.79995984, "learning_rate": 2.4939390398967303e-06, "loss": 0.82170004, "num_input_tokens_seen": 150182420, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7578125, "step": 6996, "time_per_iteration": 2.6316730976104736 }, { "auxiliary_loss_clip": 0.01141699, "auxiliary_loss_mlp": 0.01281212, "balance_loss_clip": 1.02014589, "balance_loss_mlp": 1.0430522, "epoch": 0.4206823989177815, "flos": 15304338472320.0, "grad_norm": 2.183045642233539, "language_loss": 0.75829512, "learning_rate": 2.493572912465985e-06, "loss": 0.78252423, "num_input_tokens_seen": 150200175, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80859375, "step": 6997, "time_per_iteration": 2.506594657897949 }, { "auxiliary_loss_clip": 0.01119967, "auxiliary_loss_mlp": 0.01040358, "balance_loss_clip": 1.02519441, "balance_loss_mlp": 1.03897738, "epoch": 0.42074252217044944, "flos": 15554096714880.0, "grad_norm": 1.833769686149925, "language_loss": 0.75619608, "learning_rate": 2.493206767420892e-06, "loss": 0.77779937, "num_input_tokens_seen": 150217100, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8125, "step": 6998, "time_per_iteration": 3.9364213943481445 }, { "auxiliary_loss_clip": 0.01124644, "auxiliary_loss_mlp": 0.01041097, "balance_loss_clip": 1.02589762, "balance_loss_mlp": 1.04264534, "epoch": 0.4208026454231174, "flos": 26145877610880.0, "grad_norm": 2.870821414553318, "language_loss": 0.75844628, "learning_rate": 2.492840604774519e-06, "loss": 0.78010368, "num_input_tokens_seen": 150239830, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 6999, "time_per_iteration": 2.657228469848633 }, { "auxiliary_loss_clip": 0.01148062, "auxiliary_loss_mlp": 0.01038918, "balance_loss_clip": 1.02361119, "balance_loss_mlp": 1.04218698, "epoch": 0.42086276867578537, "flos": 23550110380800.0, "grad_norm": 1.7722604408429226, "language_loss": 0.64505303, "learning_rate": 2.492474424539932e-06, "loss": 0.66692281, "num_input_tokens_seen": 150260690, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.7890625, "step": 7000, "time_per_iteration": 2.5529122352600098 }, { "auxiliary_loss_clip": 0.01149673, "auxiliary_loss_mlp": 0.01038365, "balance_loss_clip": 1.02371371, "balance_loss_mlp": 1.04327703, "epoch": 0.42092289192845334, "flos": 18113414618880.0, "grad_norm": 1.7403936715765, "language_loss": 0.76268643, "learning_rate": 2.4921082267301994e-06, "loss": 0.78456688, "num_input_tokens_seen": 150279885, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80078125, "step": 7001, "time_per_iteration": 4.0278284549713135 }, { "auxiliary_loss_clip": 0.01149319, "auxiliary_loss_mlp": 0.01282196, "balance_loss_clip": 1.02084017, "balance_loss_mlp": 1.04376268, "epoch": 0.4209830151811213, "flos": 20006588845440.0, "grad_norm": 1.597523227277563, "language_loss": 0.86589205, "learning_rate": 2.49174201135839e-06, "loss": 0.89020723, "num_input_tokens_seen": 150297390, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 7002, "time_per_iteration": 2.5680994987487793 }, { "auxiliary_loss_clip": 0.01146023, "auxiliary_loss_mlp": 0.0103254, "balance_loss_clip": 1.01859808, "balance_loss_mlp": 1.04124582, "epoch": 0.42104313843378927, "flos": 21978946604160.0, "grad_norm": 2.009371625316479, "language_loss": 0.67897511, "learning_rate": 2.491375778437573e-06, "loss": 0.70076072, "num_input_tokens_seen": 150317390, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.77734375, "step": 7003, "time_per_iteration": 4.14063835144043 }, { "auxiliary_loss_clip": 0.01123615, "auxiliary_loss_mlp": 0.01037732, "balance_loss_clip": 1.02225828, "balance_loss_mlp": 1.04218423, "epoch": 0.42110326168645723, "flos": 25443966965760.0, "grad_norm": 2.004922107430954, "language_loss": 0.76995701, "learning_rate": 2.491009527980819e-06, "loss": 0.79157048, "num_input_tokens_seen": 150337455, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8125, "step": 7004, "time_per_iteration": 2.6397616863250732 }, { "auxiliary_loss_clip": 0.01120076, "auxiliary_loss_mlp": 0.0103484, "balance_loss_clip": 1.01971185, "balance_loss_mlp": 1.04264295, "epoch": 0.4211633849391252, "flos": 17822574195840.0, "grad_norm": 4.092572538319433, "language_loss": 0.68270743, "learning_rate": 2.4906432600011983e-06, "loss": 0.70425659, "num_input_tokens_seen": 150355385, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7734375, "step": 7005, "time_per_iteration": 2.510422468185425 }, { "auxiliary_loss_clip": 0.0113117, "auxiliary_loss_mlp": 0.01036703, "balance_loss_clip": 1.02235007, "balance_loss_mlp": 1.04163158, "epoch": 0.42122350819179316, "flos": 16282436791680.0, "grad_norm": 2.018885578377803, "language_loss": 0.7232163, "learning_rate": 2.4902769745117805e-06, "loss": 0.74489498, "num_input_tokens_seen": 150371750, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80859375, "step": 7006, "time_per_iteration": 2.600931167602539 }, { "auxiliary_loss_clip": 0.01121538, "auxiliary_loss_mlp": 0.0103431, "balance_loss_clip": 1.019629, "balance_loss_mlp": 1.04087996, "epoch": 0.4212836314444611, "flos": 19645866512640.0, "grad_norm": 1.80873340482446, "language_loss": 0.70575023, "learning_rate": 2.4899106715256394e-06, "loss": 0.72730875, "num_input_tokens_seen": 150389955, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 7007, "time_per_iteration": 2.495321273803711 }, { "auxiliary_loss_clip": 0.01138406, "auxiliary_loss_mlp": 0.0128456, "balance_loss_clip": 1.02341771, "balance_loss_mlp": 1.04145384, "epoch": 0.4213437546971291, "flos": 18369026778240.0, "grad_norm": 1.880139152884928, "language_loss": 0.82686853, "learning_rate": 2.4895443510558467e-06, "loss": 0.85109818, "num_input_tokens_seen": 150405780, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.79296875, "step": 7008, "time_per_iteration": 2.5768396854400635 }, { "auxiliary_loss_clip": 0.01147103, "auxiliary_loss_mlp": 0.01038944, "balance_loss_clip": 1.02393579, "balance_loss_mlp": 1.04057026, "epoch": 0.42140387794979706, "flos": 27704507541120.0, "grad_norm": 1.9622860090978256, "language_loss": 0.71984142, "learning_rate": 2.489178013115475e-06, "loss": 0.74170196, "num_input_tokens_seen": 150425615, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 7009, "time_per_iteration": 2.640272617340088 }, { "auxiliary_loss_clip": 0.01123224, "auxiliary_loss_mlp": 0.01035107, "balance_loss_clip": 1.02042031, "balance_loss_mlp": 1.04216933, "epoch": 0.4214640012024651, "flos": 28147071012480.0, "grad_norm": 2.4719436194626874, "language_loss": 0.65271592, "learning_rate": 2.4888116577175987e-06, "loss": 0.67429918, "num_input_tokens_seen": 150445765, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 7010, "time_per_iteration": 2.623430013656616 }, { "auxiliary_loss_clip": 0.01145492, "auxiliary_loss_mlp": 0.01032395, "balance_loss_clip": 1.01799488, "balance_loss_mlp": 1.04127514, "epoch": 0.42152412445513304, "flos": 22997265177600.0, "grad_norm": 1.8424481309147456, "language_loss": 0.72530234, "learning_rate": 2.4884452848752918e-06, "loss": 0.74708116, "num_input_tokens_seen": 150464405, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 7011, "time_per_iteration": 2.567823886871338 }, { "auxiliary_loss_clip": 0.01051779, "auxiliary_loss_mlp": 0.01014554, "balance_loss_clip": 1.01263428, "balance_loss_mlp": 1.01782775, "epoch": 0.421584247707801, "flos": 63749592938880.0, "grad_norm": 0.8074905078524709, "language_loss": 0.54379708, "learning_rate": 2.4880788946016287e-06, "loss": 0.5644604, "num_input_tokens_seen": 150520430, "router_z_loss_clip": 0.01916504, "router_z_loss_mlp": 0.24804688, "step": 7012, "time_per_iteration": 3.1065447330474854 }, { "auxiliary_loss_clip": 0.01131831, "auxiliary_loss_mlp": 0.01033599, "balance_loss_clip": 1.01892447, "balance_loss_mlp": 1.04249287, "epoch": 0.421644370960469, "flos": 24314612474880.0, "grad_norm": 1.471528235383808, "language_loss": 0.78518915, "learning_rate": 2.4877124869096855e-06, "loss": 0.80684346, "num_input_tokens_seen": 150542610, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 7013, "time_per_iteration": 2.644136428833008 }, { "auxiliary_loss_clip": 0.01123764, "auxiliary_loss_mlp": 0.01038002, "balance_loss_clip": 1.0232197, "balance_loss_mlp": 1.04202127, "epoch": 0.42170449421313694, "flos": 23440690575360.0, "grad_norm": 1.774831251130204, "language_loss": 0.81468791, "learning_rate": 2.487346061812538e-06, "loss": 0.83630556, "num_input_tokens_seen": 150560970, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 7014, "time_per_iteration": 2.557535171508789 }, { "auxiliary_loss_clip": 0.01130918, "auxiliary_loss_mlp": 0.01038854, "balance_loss_clip": 1.02450109, "balance_loss_mlp": 1.04146302, "epoch": 0.4217646174658049, "flos": 23695476721920.0, "grad_norm": 1.7259659180195486, "language_loss": 0.77918029, "learning_rate": 2.4869796193232633e-06, "loss": 0.80087805, "num_input_tokens_seen": 150582615, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8046875, "step": 7015, "time_per_iteration": 2.631810426712036 }, { "auxiliary_loss_clip": 0.01122386, "auxiliary_loss_mlp": 0.01041468, "balance_loss_clip": 1.02643561, "balance_loss_mlp": 1.04192328, "epoch": 0.42182474071847287, "flos": 24971562270720.0, "grad_norm": 1.4092189658347634, "language_loss": 0.81996918, "learning_rate": 2.4866131594549385e-06, "loss": 0.84160775, "num_input_tokens_seen": 150603640, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 7016, "time_per_iteration": 2.584244966506958 }, { "auxiliary_loss_clip": 0.01151057, "auxiliary_loss_mlp": 0.01036554, "balance_loss_clip": 1.02214146, "balance_loss_mlp": 1.04382372, "epoch": 0.42188486397114083, "flos": 22856639431680.0, "grad_norm": 2.140817396288244, "language_loss": 0.68131185, "learning_rate": 2.4862466822206425e-06, "loss": 0.70318794, "num_input_tokens_seen": 150622490, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 7017, "time_per_iteration": 2.579974412918091 }, { "auxiliary_loss_clip": 0.0105185, "auxiliary_loss_mlp": 0.0100603, "balance_loss_clip": 1.00445676, "balance_loss_mlp": 1.01823211, "epoch": 0.4219449872238088, "flos": 66975700965120.0, "grad_norm": 0.8263335878382757, "language_loss": 0.59507239, "learning_rate": 2.485880187633452e-06, "loss": 0.61565113, "num_input_tokens_seen": 150689545, "router_z_loss_clip": 0.01574707, "router_z_loss_mlp": 0.24804688, "step": 7018, "time_per_iteration": 3.179547071456909 }, { "auxiliary_loss_clip": 0.0113412, "auxiliary_loss_mlp": 0.01038512, "balance_loss_clip": 1.02362275, "balance_loss_mlp": 1.04220104, "epoch": 0.42200511047647676, "flos": 13115367745920.0, "grad_norm": 1.8967960881043098, "language_loss": 0.750727, "learning_rate": 2.4855136757064487e-06, "loss": 0.77245331, "num_input_tokens_seen": 150707610, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.828125, "step": 7019, "time_per_iteration": 2.5896942615509033 }, { "auxiliary_loss_clip": 0.01140406, "auxiliary_loss_mlp": 0.01039359, "balance_loss_clip": 1.02383161, "balance_loss_mlp": 1.0420568, "epoch": 0.42206523372914473, "flos": 13991193066240.0, "grad_norm": 1.9103384065332478, "language_loss": 0.68880868, "learning_rate": 2.4851471464527097e-06, "loss": 0.71060628, "num_input_tokens_seen": 150724530, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.80078125, "step": 7020, "time_per_iteration": 2.503117561340332 }, { "auxiliary_loss_clip": 0.01137252, "auxiliary_loss_mlp": 0.01281596, "balance_loss_clip": 1.02142155, "balance_loss_mlp": 1.04337025, "epoch": 0.4221253569818127, "flos": 21942317710080.0, "grad_norm": 1.773468164991369, "language_loss": 0.80938828, "learning_rate": 2.4847805998853184e-06, "loss": 0.8335768, "num_input_tokens_seen": 150742870, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.76171875, "step": 7021, "time_per_iteration": 2.5945732593536377 }, { "auxiliary_loss_clip": 0.01167325, "auxiliary_loss_mlp": 0.01032801, "balance_loss_clip": 1.01872802, "balance_loss_mlp": 1.04213071, "epoch": 0.42218548023448066, "flos": 32192587071360.0, "grad_norm": 1.8242170095944117, "language_loss": 0.69957036, "learning_rate": 2.484414036017354e-06, "loss": 0.72157162, "num_input_tokens_seen": 150765500, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8046875, "step": 7022, "time_per_iteration": 2.6483800411224365 }, { "auxiliary_loss_clip": 0.01126581, "auxiliary_loss_mlp": 0.01032343, "balance_loss_clip": 1.01975489, "balance_loss_mlp": 1.0430969, "epoch": 0.4222456034871487, "flos": 30118961894400.0, "grad_norm": 1.8231057604331042, "language_loss": 0.67536092, "learning_rate": 2.4840474548618986e-06, "loss": 0.6969502, "num_input_tokens_seen": 150784945, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.74609375, "step": 7023, "time_per_iteration": 2.5907535552978516 }, { "auxiliary_loss_clip": 0.01138106, "auxiliary_loss_mlp": 0.0104304, "balance_loss_clip": 1.02809167, "balance_loss_mlp": 1.0437429, "epoch": 0.42230572673981664, "flos": 22127904305280.0, "grad_norm": 1.60051812596651, "language_loss": 0.68969262, "learning_rate": 2.4836808564320347e-06, "loss": 0.7115041, "num_input_tokens_seen": 150803120, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.765625, "step": 7024, "time_per_iteration": 2.5331552028656006 }, { "auxiliary_loss_clip": 0.01122552, "auxiliary_loss_mlp": 0.01042546, "balance_loss_clip": 1.02776384, "balance_loss_mlp": 1.04293585, "epoch": 0.4223658499924846, "flos": 22055077480320.0, "grad_norm": 1.5689339432933285, "language_loss": 0.76975811, "learning_rate": 2.4833142407408455e-06, "loss": 0.79140902, "num_input_tokens_seen": 150823135, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 7025, "time_per_iteration": 2.595828056335449 }, { "auxiliary_loss_clip": 0.0113662, "auxiliary_loss_mlp": 0.01037976, "balance_loss_clip": 1.02383149, "balance_loss_mlp": 1.04188704, "epoch": 0.4224259732451526, "flos": 20410727742720.0, "grad_norm": 1.8229522002046215, "language_loss": 0.79750752, "learning_rate": 2.4829476078014143e-06, "loss": 0.8192535, "num_input_tokens_seen": 150842070, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 7026, "time_per_iteration": 2.546520233154297 }, { "auxiliary_loss_clip": 0.01043945, "auxiliary_loss_mlp": 0.01005964, "balance_loss_clip": 1.00409222, "balance_loss_mlp": 1.01910853, "epoch": 0.42248609649782054, "flos": 62846655828480.0, "grad_norm": 0.7431853224788328, "language_loss": 0.61878562, "learning_rate": 2.4825809576268247e-06, "loss": 0.63928467, "num_input_tokens_seen": 150907450, "router_z_loss_clip": 0.01867676, "router_z_loss_mlp": 0.24804688, "step": 7027, "time_per_iteration": 3.2324280738830566 }, { "auxiliary_loss_clip": 0.01121658, "auxiliary_loss_mlp": 0.01043284, "balance_loss_clip": 1.02886558, "balance_loss_mlp": 1.04321361, "epoch": 0.4225462197504885, "flos": 26249946289920.0, "grad_norm": 2.0391737187776457, "language_loss": 0.70901883, "learning_rate": 2.4822142902301622e-06, "loss": 0.73066825, "num_input_tokens_seen": 150928040, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 7028, "time_per_iteration": 2.5668888092041016 }, { "auxiliary_loss_clip": 0.01142745, "auxiliary_loss_mlp": 0.01043106, "balance_loss_clip": 1.02832365, "balance_loss_mlp": 1.04335237, "epoch": 0.42260634300315647, "flos": 20521943228160.0, "grad_norm": 1.8030482502651728, "language_loss": 0.82383418, "learning_rate": 2.481847605624512e-06, "loss": 0.84569263, "num_input_tokens_seen": 150945760, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 7029, "time_per_iteration": 2.540419101715088 }, { "auxiliary_loss_clip": 0.01164373, "auxiliary_loss_mlp": 0.01044882, "balance_loss_clip": 1.02952218, "balance_loss_mlp": 1.04620731, "epoch": 0.42266646625582444, "flos": 24316731377280.0, "grad_norm": 2.3649510702444614, "language_loss": 0.74964046, "learning_rate": 2.481480903822961e-06, "loss": 0.77173305, "num_input_tokens_seen": 150965665, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 7030, "time_per_iteration": 2.5714268684387207 }, { "auxiliary_loss_clip": 0.01123825, "auxiliary_loss_mlp": 0.01037152, "balance_loss_clip": 1.02101111, "balance_loss_mlp": 1.04271507, "epoch": 0.4227265895084924, "flos": 24204151175040.0, "grad_norm": 1.8172568873623507, "language_loss": 0.86229974, "learning_rate": 2.4811141848385944e-06, "loss": 0.88390946, "num_input_tokens_seen": 150982260, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8125, "step": 7031, "time_per_iteration": 2.4990077018737793 }, { "auxiliary_loss_clip": 0.01132083, "auxiliary_loss_mlp": 0.01041217, "balance_loss_clip": 1.02566051, "balance_loss_mlp": 1.04324734, "epoch": 0.42278671276116037, "flos": 16909760845440.0, "grad_norm": 2.090846487215271, "language_loss": 0.73288441, "learning_rate": 2.4807474486844996e-06, "loss": 0.75461745, "num_input_tokens_seen": 150999990, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.796875, "step": 7032, "time_per_iteration": 2.5457494258880615 }, { "auxiliary_loss_clip": 0.01142443, "auxiliary_loss_mlp": 0.01041297, "balance_loss_clip": 1.02659822, "balance_loss_mlp": 1.0422256, "epoch": 0.42284683601382833, "flos": 25411073086080.0, "grad_norm": 1.608818061913164, "language_loss": 0.70122695, "learning_rate": 2.480380695373766e-06, "loss": 0.7230643, "num_input_tokens_seen": 151021105, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.82421875, "step": 7033, "time_per_iteration": 2.577401638031006 }, { "auxiliary_loss_clip": 0.0114219, "auxiliary_loss_mlp": 0.01286781, "balance_loss_clip": 1.02467775, "balance_loss_mlp": 1.04353392, "epoch": 0.4229069592664963, "flos": 23040322606080.0, "grad_norm": 1.6839990740396762, "language_loss": 0.89956367, "learning_rate": 2.480013924919481e-06, "loss": 0.9238534, "num_input_tokens_seen": 151040665, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80859375, "step": 7034, "time_per_iteration": 2.694282293319702 }, { "auxiliary_loss_clip": 0.01138757, "auxiliary_loss_mlp": 0.01042756, "balance_loss_clip": 1.02838516, "balance_loss_mlp": 1.04154539, "epoch": 0.42296708251916426, "flos": 26067448264320.0, "grad_norm": 1.6501527016830762, "language_loss": 0.76987296, "learning_rate": 2.479647137334733e-06, "loss": 0.79168808, "num_input_tokens_seen": 151061240, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.79296875, "step": 7035, "time_per_iteration": 4.0079264640808105 }, { "auxiliary_loss_clip": 0.01135481, "auxiliary_loss_mlp": 0.0104631, "balance_loss_clip": 1.03058672, "balance_loss_mlp": 1.04400218, "epoch": 0.4230272057718323, "flos": 19458376496640.0, "grad_norm": 1.7511578648535473, "language_loss": 0.82526588, "learning_rate": 2.479280332632613e-06, "loss": 0.84708381, "num_input_tokens_seen": 151076870, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.82421875, "step": 7036, "time_per_iteration": 2.5226006507873535 }, { "auxiliary_loss_clip": 0.01139913, "auxiliary_loss_mlp": 0.01039643, "balance_loss_clip": 1.02510512, "balance_loss_mlp": 1.04495716, "epoch": 0.42308732902450025, "flos": 22383300983040.0, "grad_norm": 1.6415097047767393, "language_loss": 0.70393384, "learning_rate": 2.4789135108262105e-06, "loss": 0.72572947, "num_input_tokens_seen": 151095110, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7734375, "step": 7037, "time_per_iteration": 2.6335434913635254 }, { "auxiliary_loss_clip": 0.0114863, "auxiliary_loss_mlp": 0.01284641, "balance_loss_clip": 1.02206683, "balance_loss_mlp": 1.04232061, "epoch": 0.4231474522771682, "flos": 20995425331200.0, "grad_norm": 1.5483140251737921, "language_loss": 0.77729714, "learning_rate": 2.478546671928617e-06, "loss": 0.80162978, "num_input_tokens_seen": 151114355, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.796875, "step": 7038, "time_per_iteration": 2.604675054550171 }, { "auxiliary_loss_clip": 0.01164905, "auxiliary_loss_mlp": 0.01050344, "balance_loss_clip": 1.03363085, "balance_loss_mlp": 1.04386687, "epoch": 0.4232075755298362, "flos": 14975863574400.0, "grad_norm": 2.301064246072844, "language_loss": 0.6690731, "learning_rate": 2.4781798159529235e-06, "loss": 0.69122553, "num_input_tokens_seen": 151131505, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8515625, "step": 7039, "time_per_iteration": 2.600414276123047 }, { "auxiliary_loss_clip": 0.01135276, "auxiliary_loss_mlp": 0.01035406, "balance_loss_clip": 1.01989675, "balance_loss_mlp": 1.04312885, "epoch": 0.42326769878250414, "flos": 24532661986560.0, "grad_norm": 1.570306279716251, "language_loss": 0.75830674, "learning_rate": 2.477812942912223e-06, "loss": 0.7800135, "num_input_tokens_seen": 151151555, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.83203125, "step": 7040, "time_per_iteration": 3.9476263523101807 }, { "auxiliary_loss_clip": 0.01131797, "auxiliary_loss_mlp": 0.01032562, "balance_loss_clip": 1.01766062, "balance_loss_mlp": 1.04260492, "epoch": 0.4233278220351721, "flos": 26870303105280.0, "grad_norm": 1.4550447835695182, "language_loss": 0.64736784, "learning_rate": 2.4774460528196073e-06, "loss": 0.66901147, "num_input_tokens_seen": 151172385, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 7041, "time_per_iteration": 2.6896839141845703 }, { "auxiliary_loss_clip": 0.01145107, "auxiliary_loss_mlp": 0.01031247, "balance_loss_clip": 1.01595211, "balance_loss_mlp": 1.04495478, "epoch": 0.4233879452878401, "flos": 42814927463040.0, "grad_norm": 1.8474228159464272, "language_loss": 0.73926437, "learning_rate": 2.47707914568817e-06, "loss": 0.76102787, "num_input_tokens_seen": 151194930, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8203125, "step": 7042, "time_per_iteration": 4.303195476531982 }, { "auxiliary_loss_clip": 0.01130594, "auxiliary_loss_mlp": 0.0103259, "balance_loss_clip": 1.01897669, "balance_loss_mlp": 1.04399824, "epoch": 0.42344806854050804, "flos": 25229006023680.0, "grad_norm": 1.3552701691104587, "language_loss": 0.81929862, "learning_rate": 2.476712221531005e-06, "loss": 0.84093046, "num_input_tokens_seen": 151217905, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.77734375, "step": 7043, "time_per_iteration": 2.560244083404541 }, { "auxiliary_loss_clip": 0.01142406, "auxiliary_loss_mlp": 0.01041575, "balance_loss_clip": 1.02563691, "balance_loss_mlp": 1.04557788, "epoch": 0.423508191793176, "flos": 22778820616320.0, "grad_norm": 3.2752426000889305, "language_loss": 0.64766079, "learning_rate": 2.4763452803612077e-06, "loss": 0.66950059, "num_input_tokens_seen": 151234580, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.79296875, "step": 7044, "time_per_iteration": 2.594900369644165 }, { "auxiliary_loss_clip": 0.01139031, "auxiliary_loss_mlp": 0.01052461, "balance_loss_clip": 1.03605807, "balance_loss_mlp": 1.04751086, "epoch": 0.42356831504584397, "flos": 34637493179520.0, "grad_norm": 1.766234155222534, "language_loss": 0.75061888, "learning_rate": 2.4759783221918716e-06, "loss": 0.77253377, "num_input_tokens_seen": 151254765, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.82421875, "step": 7045, "time_per_iteration": 4.123289346694946 }, { "auxiliary_loss_clip": 0.01150455, "auxiliary_loss_mlp": 0.01038593, "balance_loss_clip": 1.02395391, "balance_loss_mlp": 1.0452559, "epoch": 0.42362843829851193, "flos": 17596767346560.0, "grad_norm": 2.0864727072829172, "language_loss": 0.80309582, "learning_rate": 2.4756113470360944e-06, "loss": 0.82498622, "num_input_tokens_seen": 151269045, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78515625, "step": 7046, "time_per_iteration": 2.513826608657837 }, { "auxiliary_loss_clip": 0.0118869, "auxiliary_loss_mlp": 0.01036715, "balance_loss_clip": 1.02148032, "balance_loss_mlp": 1.04323661, "epoch": 0.4236885615511799, "flos": 22565691267840.0, "grad_norm": 1.8774248934059738, "language_loss": 0.76657128, "learning_rate": 2.4752443549069713e-06, "loss": 0.78882533, "num_input_tokens_seen": 151287530, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.82421875, "step": 7047, "time_per_iteration": 2.654660701751709 }, { "auxiliary_loss_clip": 0.01122838, "auxiliary_loss_mlp": 0.01036591, "balance_loss_clip": 1.02214265, "balance_loss_mlp": 1.04632008, "epoch": 0.42374868480384786, "flos": 26469216864000.0, "grad_norm": 1.676985573939718, "language_loss": 0.67852324, "learning_rate": 2.4748773458176e-06, "loss": 0.70011747, "num_input_tokens_seen": 151308905, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 7048, "time_per_iteration": 2.591778039932251 }, { "auxiliary_loss_clip": 0.01136036, "auxiliary_loss_mlp": 0.01040855, "balance_loss_clip": 1.02495277, "balance_loss_mlp": 1.04595423, "epoch": 0.4238088080565159, "flos": 20370220179840.0, "grad_norm": 1.5965492297561363, "language_loss": 0.77903819, "learning_rate": 2.4745103197810775e-06, "loss": 0.80080712, "num_input_tokens_seen": 151326525, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8125, "step": 7049, "time_per_iteration": 2.5777227878570557 }, { "auxiliary_loss_clip": 0.01134817, "auxiliary_loss_mlp": 0.01044863, "balance_loss_clip": 1.030159, "balance_loss_mlp": 1.04498458, "epoch": 0.42386893130918385, "flos": 21172105353600.0, "grad_norm": 1.774059168206711, "language_loss": 0.82704431, "learning_rate": 2.474143276810502e-06, "loss": 0.84884113, "num_input_tokens_seen": 151344675, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80859375, "step": 7050, "time_per_iteration": 2.5429539680480957 }, { "auxiliary_loss_clip": 0.01122246, "auxiliary_loss_mlp": 0.01038777, "balance_loss_clip": 1.02404261, "balance_loss_mlp": 1.04089308, "epoch": 0.4239290545618518, "flos": 17675627656320.0, "grad_norm": 2.671853835190734, "language_loss": 0.73358047, "learning_rate": 2.4737762169189728e-06, "loss": 0.75519061, "num_input_tokens_seen": 151360730, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 7051, "time_per_iteration": 2.5375075340270996 }, { "auxiliary_loss_clip": 0.01150661, "auxiliary_loss_mlp": 0.01038706, "balance_loss_clip": 1.02419817, "balance_loss_mlp": 1.04372668, "epoch": 0.4239891778145198, "flos": 24314504734080.0, "grad_norm": 2.1438256607183517, "language_loss": 0.86092842, "learning_rate": 2.473409140119589e-06, "loss": 0.8828221, "num_input_tokens_seen": 151380445, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 7052, "time_per_iteration": 2.5646557807922363 }, { "auxiliary_loss_clip": 0.01140084, "auxiliary_loss_mlp": 0.01052006, "balance_loss_clip": 1.03700912, "balance_loss_mlp": 1.04180622, "epoch": 0.42404930106718774, "flos": 20558428467840.0, "grad_norm": 1.5359216076511901, "language_loss": 0.72332239, "learning_rate": 2.4730420464254512e-06, "loss": 0.74524331, "num_input_tokens_seen": 151399325, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 7053, "time_per_iteration": 2.5912551879882812 }, { "auxiliary_loss_clip": 0.01150427, "auxiliary_loss_mlp": 0.01282811, "balance_loss_clip": 1.02075624, "balance_loss_mlp": 1.04426289, "epoch": 0.4241094243198557, "flos": 22308067946880.0, "grad_norm": 1.7257769957972802, "language_loss": 0.822586, "learning_rate": 2.472674935849659e-06, "loss": 0.8469184, "num_input_tokens_seen": 151417240, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.79296875, "step": 7054, "time_per_iteration": 2.545353651046753 }, { "auxiliary_loss_clip": 0.01053075, "auxiliary_loss_mlp": 0.01004062, "balance_loss_clip": 1.00207138, "balance_loss_mlp": 1.01965284, "epoch": 0.4241695475725237, "flos": 70612445272320.0, "grad_norm": 0.7827820048938696, "language_loss": 0.60454488, "learning_rate": 2.4723078084053154e-06, "loss": 0.62511623, "num_input_tokens_seen": 151476015, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.24902344, "step": 7055, "time_per_iteration": 3.1255037784576416 }, { "auxiliary_loss_clip": 0.01123133, "auxiliary_loss_mlp": 0.01042706, "balance_loss_clip": 1.02661276, "balance_loss_mlp": 1.0424099, "epoch": 0.42422967082519164, "flos": 14027462824320.0, "grad_norm": 4.892723310930146, "language_loss": 0.77054393, "learning_rate": 2.4719406641055197e-06, "loss": 0.79220235, "num_input_tokens_seen": 151492035, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.80859375, "step": 7056, "time_per_iteration": 2.4873931407928467 }, { "auxiliary_loss_clip": 0.01134428, "auxiliary_loss_mlp": 0.0104026, "balance_loss_clip": 1.02330875, "balance_loss_mlp": 1.04207587, "epoch": 0.4242897940778596, "flos": 22345522853760.0, "grad_norm": 1.8121167794024873, "language_loss": 0.84076488, "learning_rate": 2.471573502963376e-06, "loss": 0.86251175, "num_input_tokens_seen": 151508970, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8359375, "step": 7057, "time_per_iteration": 2.6209990978240967 }, { "auxiliary_loss_clip": 0.01151049, "auxiliary_loss_mlp": 0.01041478, "balance_loss_clip": 1.02592134, "balance_loss_mlp": 1.04212165, "epoch": 0.42434991733052757, "flos": 22595855713920.0, "grad_norm": 2.14681499638707, "language_loss": 0.832582, "learning_rate": 2.4712063249919876e-06, "loss": 0.85450721, "num_input_tokens_seen": 151525295, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.82421875, "step": 7058, "time_per_iteration": 2.5083487033843994 }, { "auxiliary_loss_clip": 0.01139831, "auxiliary_loss_mlp": 0.01278613, "balance_loss_clip": 1.01789117, "balance_loss_mlp": 1.04510808, "epoch": 0.42441004058319554, "flos": 20011437181440.0, "grad_norm": 1.9164558975637849, "language_loss": 0.8018949, "learning_rate": 2.470839130204457e-06, "loss": 0.82607931, "num_input_tokens_seen": 151544435, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7734375, "step": 7059, "time_per_iteration": 2.623898983001709 }, { "auxiliary_loss_clip": 0.01123008, "auxiliary_loss_mlp": 0.01038185, "balance_loss_clip": 1.02205002, "balance_loss_mlp": 1.04157555, "epoch": 0.4244701638358635, "flos": 11144985235200.0, "grad_norm": 1.9889512836618863, "language_loss": 0.70240712, "learning_rate": 2.4704719186138887e-06, "loss": 0.72401899, "num_input_tokens_seen": 151559520, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.81640625, "step": 7060, "time_per_iteration": 2.448685884475708 }, { "auxiliary_loss_clip": 0.01133892, "auxiliary_loss_mlp": 0.01035483, "balance_loss_clip": 1.02039123, "balance_loss_mlp": 1.04429305, "epoch": 0.42453028708853147, "flos": 23987753688960.0, "grad_norm": 1.5373247498538494, "language_loss": 0.75870335, "learning_rate": 2.4701046902333886e-06, "loss": 0.78039718, "num_input_tokens_seen": 151579790, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8046875, "step": 7061, "time_per_iteration": 2.575612783432007 }, { "auxiliary_loss_clip": 0.01135596, "auxiliary_loss_mlp": 0.01038572, "balance_loss_clip": 1.02217472, "balance_loss_mlp": 1.04488611, "epoch": 0.42459041034119943, "flos": 18406338030720.0, "grad_norm": 2.381307697701625, "language_loss": 0.72574121, "learning_rate": 2.4697374450760606e-06, "loss": 0.74748284, "num_input_tokens_seen": 151598285, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8203125, "step": 7062, "time_per_iteration": 2.4887866973876953 }, { "auxiliary_loss_clip": 0.01139954, "auxiliary_loss_mlp": 0.01040969, "balance_loss_clip": 1.0261631, "balance_loss_mlp": 1.04152429, "epoch": 0.42465053359386745, "flos": 20958006337920.0, "grad_norm": 1.7738843871733487, "language_loss": 0.66121894, "learning_rate": 2.469370183155012e-06, "loss": 0.68302816, "num_input_tokens_seen": 151615430, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8046875, "step": 7063, "time_per_iteration": 2.608344078063965 }, { "auxiliary_loss_clip": 0.01135846, "auxiliary_loss_mlp": 0.01280113, "balance_loss_clip": 1.01899266, "balance_loss_mlp": 1.04102588, "epoch": 0.4247106568465354, "flos": 33106190520960.0, "grad_norm": 1.5664757461140162, "language_loss": 0.78333944, "learning_rate": 2.4690029044833483e-06, "loss": 0.80749905, "num_input_tokens_seen": 151637030, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 7064, "time_per_iteration": 2.631580114364624 }, { "auxiliary_loss_clip": 0.01122867, "auxiliary_loss_mlp": 0.01042117, "balance_loss_clip": 1.02694213, "balance_loss_mlp": 1.04176354, "epoch": 0.4247707800992034, "flos": 20046916840320.0, "grad_norm": 1.9620106211106232, "language_loss": 0.74864626, "learning_rate": 2.468635609074178e-06, "loss": 0.7702961, "num_input_tokens_seen": 151655745, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 7065, "time_per_iteration": 2.6010231971740723 }, { "auxiliary_loss_clip": 0.0113342, "auxiliary_loss_mlp": 0.01037251, "balance_loss_clip": 1.02112174, "balance_loss_mlp": 1.04459739, "epoch": 0.42483090335187135, "flos": 22385132576640.0, "grad_norm": 1.4302651088112657, "language_loss": 0.72417235, "learning_rate": 2.468268296940608e-06, "loss": 0.74587905, "num_input_tokens_seen": 151678040, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.80078125, "step": 7066, "time_per_iteration": 2.5810275077819824 }, { "auxiliary_loss_clip": 0.01151425, "auxiliary_loss_mlp": 0.01040879, "balance_loss_clip": 1.02372503, "balance_loss_mlp": 1.04214311, "epoch": 0.4248910266045393, "flos": 21356830022400.0, "grad_norm": 1.8731463603358185, "language_loss": 0.79534292, "learning_rate": 2.467900968095747e-06, "loss": 0.81726599, "num_input_tokens_seen": 151696410, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.828125, "step": 7067, "time_per_iteration": 2.5899789333343506 }, { "auxiliary_loss_clip": 0.0104265, "auxiliary_loss_mlp": 0.01001364, "balance_loss_clip": 0.9994089, "balance_loss_mlp": 1.01827002, "epoch": 0.4249511498572073, "flos": 64008114099840.0, "grad_norm": 0.9061907277082838, "language_loss": 0.63403249, "learning_rate": 2.4675336225527045e-06, "loss": 0.65447259, "num_input_tokens_seen": 151756365, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.24414062, "step": 7068, "time_per_iteration": 3.104283332824707 }, { "auxiliary_loss_clip": 0.01123124, "auxiliary_loss_mlp": 0.01039316, "balance_loss_clip": 1.02247143, "balance_loss_mlp": 1.04178393, "epoch": 0.42501127310987524, "flos": 19607046888960.0, "grad_norm": 1.7365093804184024, "language_loss": 0.72138309, "learning_rate": 2.4671662603245892e-06, "loss": 0.74300742, "num_input_tokens_seen": 151775165, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8125, "step": 7069, "time_per_iteration": 2.514072895050049 }, { "auxiliary_loss_clip": 0.01126136, "auxiliary_loss_mlp": 0.01036558, "balance_loss_clip": 1.01984453, "balance_loss_mlp": 1.04446197, "epoch": 0.4250713963625432, "flos": 19462326992640.0, "grad_norm": 2.292645294671188, "language_loss": 0.79155445, "learning_rate": 2.4667988814245116e-06, "loss": 0.81318134, "num_input_tokens_seen": 151792620, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.81640625, "step": 7070, "time_per_iteration": 2.5815579891204834 }, { "auxiliary_loss_clip": 0.01132705, "auxiliary_loss_mlp": 0.01033602, "balance_loss_clip": 1.01793802, "balance_loss_mlp": 1.04241848, "epoch": 0.4251315196152112, "flos": 25337707557120.0, "grad_norm": 1.6429730745259539, "language_loss": 0.70071578, "learning_rate": 2.466431485865584e-06, "loss": 0.72237885, "num_input_tokens_seen": 151812850, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 7071, "time_per_iteration": 2.531036853790283 }, { "auxiliary_loss_clip": 0.01122573, "auxiliary_loss_mlp": 0.01034876, "balance_loss_clip": 1.0188309, "balance_loss_mlp": 1.04221058, "epoch": 0.42519164286787914, "flos": 26359186527360.0, "grad_norm": 2.3852622085106465, "language_loss": 0.71389616, "learning_rate": 2.466064073660915e-06, "loss": 0.73547065, "num_input_tokens_seen": 151831785, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8046875, "step": 7072, "time_per_iteration": 2.579379081726074 }, { "auxiliary_loss_clip": 0.01153181, "auxiliary_loss_mlp": 0.01039963, "balance_loss_clip": 1.0252583, "balance_loss_mlp": 1.04298878, "epoch": 0.4252517661205471, "flos": 26031070765440.0, "grad_norm": 1.5962096609446403, "language_loss": 0.81582606, "learning_rate": 2.465696644823619e-06, "loss": 0.83775747, "num_input_tokens_seen": 151853885, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.828125, "step": 7073, "time_per_iteration": 2.5661885738372803 }, { "auxiliary_loss_clip": 0.01132363, "auxiliary_loss_mlp": 0.01033233, "balance_loss_clip": 1.01890361, "balance_loss_mlp": 1.04412234, "epoch": 0.42531188937321507, "flos": 12713635059840.0, "grad_norm": 2.412051561255245, "language_loss": 0.91021222, "learning_rate": 2.465329199366806e-06, "loss": 0.9318682, "num_input_tokens_seen": 151871780, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.79296875, "step": 7074, "time_per_iteration": 2.576934576034546 }, { "auxiliary_loss_clip": 0.01134053, "auxiliary_loss_mlp": 0.012873, "balance_loss_clip": 1.02266538, "balance_loss_mlp": 1.04220593, "epoch": 0.42537201262588303, "flos": 22091670460800.0, "grad_norm": 1.8371140007305968, "language_loss": 0.63689858, "learning_rate": 2.4649617373035924e-06, "loss": 0.66111207, "num_input_tokens_seen": 151891600, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.83203125, "step": 7075, "time_per_iteration": 2.502518653869629 }, { "auxiliary_loss_clip": 0.01143341, "auxiliary_loss_mlp": 0.01030691, "balance_loss_clip": 1.0162127, "balance_loss_mlp": 1.04058254, "epoch": 0.42543213587855105, "flos": 23003119094400.0, "grad_norm": 3.1216821262220464, "language_loss": 0.7317977, "learning_rate": 2.4645942586470898e-06, "loss": 0.75353801, "num_input_tokens_seen": 151911330, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76171875, "step": 7076, "time_per_iteration": 3.9388973712921143 }, { "auxiliary_loss_clip": 0.01144846, "auxiliary_loss_mlp": 0.01040716, "balance_loss_clip": 1.02542686, "balance_loss_mlp": 1.04427099, "epoch": 0.425492259131219, "flos": 25082454533760.0, "grad_norm": 2.5052156687622, "language_loss": 0.78893411, "learning_rate": 2.4642267634104136e-06, "loss": 0.8107897, "num_input_tokens_seen": 151930355, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.82421875, "step": 7077, "time_per_iteration": 2.581716775894165 }, { "auxiliary_loss_clip": 0.01137868, "auxiliary_loss_mlp": 0.01035034, "balance_loss_clip": 1.02134824, "balance_loss_mlp": 1.04332197, "epoch": 0.425552382383887, "flos": 22816850140800.0, "grad_norm": 1.9290210830435892, "language_loss": 0.72971594, "learning_rate": 2.4638592516066784e-06, "loss": 0.75144494, "num_input_tokens_seen": 151949695, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 7078, "time_per_iteration": 2.559647560119629 }, { "auxiliary_loss_clip": 0.01139456, "auxiliary_loss_mlp": 0.01038632, "balance_loss_clip": 1.02433276, "balance_loss_mlp": 1.04439878, "epoch": 0.42561250563655495, "flos": 13553585671680.0, "grad_norm": 2.071487064277498, "language_loss": 0.79865694, "learning_rate": 2.4634917232489993e-06, "loss": 0.82043785, "num_input_tokens_seen": 151967640, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 7079, "time_per_iteration": 2.601358413696289 }, { "auxiliary_loss_clip": 0.01133712, "auxiliary_loss_mlp": 0.01033499, "balance_loss_clip": 1.0190742, "balance_loss_mlp": 1.04707265, "epoch": 0.4256726288892229, "flos": 46978303023360.0, "grad_norm": 1.3701373550537608, "language_loss": 0.71477282, "learning_rate": 2.463124178350493e-06, "loss": 0.73644495, "num_input_tokens_seen": 151994020, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 7080, "time_per_iteration": 2.8125219345092773 }, { "auxiliary_loss_clip": 0.01122916, "auxiliary_loss_mlp": 0.01034726, "balance_loss_clip": 1.02008104, "balance_loss_mlp": 1.04324245, "epoch": 0.4257327521418909, "flos": 23586451966080.0, "grad_norm": 2.038333479566523, "language_loss": 0.8097403, "learning_rate": 2.4627566169242757e-06, "loss": 0.83131671, "num_input_tokens_seen": 152013415, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 7081, "time_per_iteration": 2.593580722808838 }, { "auxiliary_loss_clip": 0.01138386, "auxiliary_loss_mlp": 0.01030253, "balance_loss_clip": 1.015674, "balance_loss_mlp": 1.04471624, "epoch": 0.42579287539455885, "flos": 18989994124800.0, "grad_norm": 1.6176920397038852, "language_loss": 0.81314665, "learning_rate": 2.4623890389834656e-06, "loss": 0.83483303, "num_input_tokens_seen": 152030860, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7578125, "step": 7082, "time_per_iteration": 3.992770195007324 }, { "auxiliary_loss_clip": 0.01131926, "auxiliary_loss_mlp": 0.01035164, "balance_loss_clip": 1.02022731, "balance_loss_mlp": 1.04387856, "epoch": 0.4258529986472268, "flos": 25191910252800.0, "grad_norm": 2.146193157740881, "language_loss": 0.694637, "learning_rate": 2.46202144454118e-06, "loss": 0.71630788, "num_input_tokens_seen": 152050395, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7890625, "step": 7083, "time_per_iteration": 2.695307970046997 }, { "auxiliary_loss_clip": 0.01049484, "auxiliary_loss_mlp": 0.01002331, "balance_loss_clip": 1.0007571, "balance_loss_mlp": 1.01593184, "epoch": 0.4259131218998948, "flos": 69968280718080.0, "grad_norm": 0.8690245096294483, "language_loss": 0.67095673, "learning_rate": 2.4616538336105373e-06, "loss": 0.69147485, "num_input_tokens_seen": 152113555, "router_z_loss_clip": 0.01574707, "router_z_loss_mlp": 0.24609375, "step": 7084, "time_per_iteration": 4.783864974975586 }, { "auxiliary_loss_clip": 0.01130674, "auxiliary_loss_mlp": 0.01041157, "balance_loss_clip": 1.02567136, "balance_loss_mlp": 1.0436517, "epoch": 0.42597324515256274, "flos": 18004964480640.0, "grad_norm": 1.8810450606936928, "language_loss": 0.78567588, "learning_rate": 2.461286206204657e-06, "loss": 0.80739415, "num_input_tokens_seen": 152131575, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.78125, "step": 7085, "time_per_iteration": 2.4808056354522705 }, { "auxiliary_loss_clip": 0.01138439, "auxiliary_loss_mlp": 0.01044285, "balance_loss_clip": 1.02779889, "balance_loss_mlp": 1.04615796, "epoch": 0.4260333684052307, "flos": 15158792563200.0, "grad_norm": 2.309783197506223, "language_loss": 0.7609818, "learning_rate": 2.460918562336659e-06, "loss": 0.78280902, "num_input_tokens_seen": 152149435, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.828125, "step": 7086, "time_per_iteration": 4.0500123500823975 }, { "auxiliary_loss_clip": 0.01139572, "auxiliary_loss_mlp": 0.01035896, "balance_loss_clip": 1.02050591, "balance_loss_mlp": 1.04266357, "epoch": 0.42609349165789867, "flos": 14939342421120.0, "grad_norm": 5.09298774111552, "language_loss": 0.80201453, "learning_rate": 2.460550902019663e-06, "loss": 0.82376921, "num_input_tokens_seen": 152166860, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.79296875, "step": 7087, "time_per_iteration": 2.5830118656158447 }, { "auxiliary_loss_clip": 0.01141215, "auxiliary_loss_mlp": 0.01035495, "balance_loss_clip": 1.01942539, "balance_loss_mlp": 1.04423547, "epoch": 0.42615361491056664, "flos": 23731961961600.0, "grad_norm": 1.7482473495547393, "language_loss": 0.66072166, "learning_rate": 2.4601832252667893e-06, "loss": 0.6824888, "num_input_tokens_seen": 152187475, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.7890625, "step": 7088, "time_per_iteration": 2.5735087394714355 }, { "auxiliary_loss_clip": 0.01161371, "auxiliary_loss_mlp": 0.0103869, "balance_loss_clip": 1.02300167, "balance_loss_mlp": 1.04448032, "epoch": 0.42621373816323466, "flos": 24936441747840.0, "grad_norm": 2.1251520658992473, "language_loss": 0.68467122, "learning_rate": 2.4598155320911604e-06, "loss": 0.70667183, "num_input_tokens_seen": 152207235, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8125, "step": 7089, "time_per_iteration": 2.6858317852020264 }, { "auxiliary_loss_clip": 0.01159424, "auxiliary_loss_mlp": 0.0103544, "balance_loss_clip": 1.02021718, "balance_loss_mlp": 1.04236221, "epoch": 0.4262738614159026, "flos": 13552975140480.0, "grad_norm": 2.9049605616423646, "language_loss": 0.73136336, "learning_rate": 2.459447822505898e-06, "loss": 0.75331199, "num_input_tokens_seen": 152224240, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 7090, "time_per_iteration": 2.5046043395996094 }, { "auxiliary_loss_clip": 0.01142953, "auxiliary_loss_mlp": 0.01037546, "balance_loss_clip": 1.02277541, "balance_loss_mlp": 1.04174042, "epoch": 0.4263339846685706, "flos": 29748794284800.0, "grad_norm": 1.786652580174363, "language_loss": 0.74722373, "learning_rate": 2.459080096524124e-06, "loss": 0.76902878, "num_input_tokens_seen": 152242595, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8359375, "step": 7091, "time_per_iteration": 2.657482624053955 }, { "auxiliary_loss_clip": 0.01153528, "auxiliary_loss_mlp": 0.01030754, "balance_loss_clip": 1.0179925, "balance_loss_mlp": 1.04269505, "epoch": 0.42639410792123855, "flos": 16834204586880.0, "grad_norm": 1.6839092574704613, "language_loss": 0.82601762, "learning_rate": 2.458712354158963e-06, "loss": 0.84786046, "num_input_tokens_seen": 152260840, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.75390625, "step": 7092, "time_per_iteration": 2.5268375873565674 }, { "auxiliary_loss_clip": 0.01146301, "auxiliary_loss_mlp": 0.01041435, "balance_loss_clip": 1.02679658, "balance_loss_mlp": 1.04394674, "epoch": 0.4264542311739065, "flos": 28763118195840.0, "grad_norm": 2.138920603478557, "language_loss": 0.73711538, "learning_rate": 2.4583445954235384e-06, "loss": 0.75899273, "num_input_tokens_seen": 152280580, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.84375, "step": 7093, "time_per_iteration": 2.680464506149292 }, { "auxiliary_loss_clip": 0.01131449, "auxiliary_loss_mlp": 0.0103728, "balance_loss_clip": 1.02194989, "balance_loss_mlp": 1.042642, "epoch": 0.4265143544265745, "flos": 24713615727360.0, "grad_norm": 3.858142399927461, "language_loss": 0.70215011, "learning_rate": 2.4579768203309733e-06, "loss": 0.72383738, "num_input_tokens_seen": 152298455, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.80078125, "step": 7094, "time_per_iteration": 2.5719077587127686 }, { "auxiliary_loss_clip": 0.01149746, "auxiliary_loss_mlp": 0.01035923, "balance_loss_clip": 1.02046192, "balance_loss_mlp": 1.04301834, "epoch": 0.42657447767924245, "flos": 21865971352320.0, "grad_norm": 1.5474857645545053, "language_loss": 0.81550187, "learning_rate": 2.457609028894394e-06, "loss": 0.83735859, "num_input_tokens_seen": 152316995, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8046875, "step": 7095, "time_per_iteration": 2.6458065509796143 }, { "auxiliary_loss_clip": 0.01123855, "auxiliary_loss_mlp": 0.01045106, "balance_loss_clip": 1.03009772, "balance_loss_mlp": 1.04382527, "epoch": 0.4266346009319104, "flos": 21470236237440.0, "grad_norm": 1.74600793283922, "language_loss": 0.80504811, "learning_rate": 2.457241221126925e-06, "loss": 0.82673764, "num_input_tokens_seen": 152334800, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 7096, "time_per_iteration": 2.513324022293091 }, { "auxiliary_loss_clip": 0.01136088, "auxiliary_loss_mlp": 0.01036237, "balance_loss_clip": 1.02164543, "balance_loss_mlp": 1.04358697, "epoch": 0.4266947241845784, "flos": 25519379569920.0, "grad_norm": 4.65761241065357, "language_loss": 0.65949178, "learning_rate": 2.4568733970416936e-06, "loss": 0.68121499, "num_input_tokens_seen": 152355175, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8359375, "step": 7097, "time_per_iteration": 2.619424343109131 }, { "auxiliary_loss_clip": 0.01138488, "auxiliary_loss_mlp": 0.01033361, "balance_loss_clip": 1.01896596, "balance_loss_mlp": 1.0424515, "epoch": 0.42675484743724634, "flos": 26541217676160.0, "grad_norm": 1.7184950298026902, "language_loss": 0.74247879, "learning_rate": 2.4565055566518252e-06, "loss": 0.76419723, "num_input_tokens_seen": 152377245, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.77734375, "step": 7098, "time_per_iteration": 2.7356925010681152 }, { "auxiliary_loss_clip": 0.01129471, "auxiliary_loss_mlp": 0.01030818, "balance_loss_clip": 1.01675773, "balance_loss_mlp": 1.04267013, "epoch": 0.4268149706899143, "flos": 23112718467840.0, "grad_norm": 1.8504688652038614, "language_loss": 0.75123155, "learning_rate": 2.4561376999704488e-06, "loss": 0.77283442, "num_input_tokens_seen": 152396985, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 7099, "time_per_iteration": 2.572758436203003 }, { "auxiliary_loss_clip": 0.01143464, "auxiliary_loss_mlp": 0.01041289, "balance_loss_clip": 1.02604783, "balance_loss_mlp": 1.04435527, "epoch": 0.4268750939425823, "flos": 22706532495360.0, "grad_norm": 2.7119387393172927, "language_loss": 0.82670951, "learning_rate": 2.4557698270106906e-06, "loss": 0.84855705, "num_input_tokens_seen": 152415590, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 7100, "time_per_iteration": 2.636570453643799 }, { "auxiliary_loss_clip": 0.01141459, "auxiliary_loss_mlp": 0.01035058, "balance_loss_clip": 1.02077127, "balance_loss_mlp": 1.04284549, "epoch": 0.42693521719525024, "flos": 25374875155200.0, "grad_norm": 1.3029717732241286, "language_loss": 0.82288641, "learning_rate": 2.45540193778568e-06, "loss": 0.84465158, "num_input_tokens_seen": 152436735, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.80859375, "step": 7101, "time_per_iteration": 2.6273152828216553 }, { "auxiliary_loss_clip": 0.01126608, "auxiliary_loss_mlp": 0.01034786, "balance_loss_clip": 1.0198195, "balance_loss_mlp": 1.04422331, "epoch": 0.42699534044791826, "flos": 18150689957760.0, "grad_norm": 1.7913098453519518, "language_loss": 0.72586155, "learning_rate": 2.4550340323085453e-06, "loss": 0.7474755, "num_input_tokens_seen": 152455685, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.82421875, "step": 7102, "time_per_iteration": 2.535461902618408 }, { "auxiliary_loss_clip": 0.01128789, "auxiliary_loss_mlp": 0.01034082, "balance_loss_clip": 1.02035546, "balance_loss_mlp": 1.04292667, "epoch": 0.4270554637005862, "flos": 13698413308800.0, "grad_norm": 1.7979077812854412, "language_loss": 0.8286128, "learning_rate": 2.4546661105924166e-06, "loss": 0.85024154, "num_input_tokens_seen": 152473500, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76953125, "step": 7103, "time_per_iteration": 2.5337648391723633 }, { "auxiliary_loss_clip": 0.01143421, "auxiliary_loss_mlp": 0.01035341, "balance_loss_clip": 1.02004075, "balance_loss_mlp": 1.04310524, "epoch": 0.4271155869532542, "flos": 17493596507520.0, "grad_norm": 2.46784160204682, "language_loss": 0.73818064, "learning_rate": 2.454298172650424e-06, "loss": 0.75996822, "num_input_tokens_seen": 152491320, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8203125, "step": 7104, "time_per_iteration": 2.520979404449463 }, { "auxiliary_loss_clip": 0.01130473, "auxiliary_loss_mlp": 0.01035369, "balance_loss_clip": 1.02139163, "balance_loss_mlp": 1.04252005, "epoch": 0.42717571020592215, "flos": 32452293381120.0, "grad_norm": 1.9122002074721036, "language_loss": 0.75069648, "learning_rate": 2.4539302184956986e-06, "loss": 0.7723549, "num_input_tokens_seen": 152511970, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7890625, "step": 7105, "time_per_iteration": 2.5845530033111572 }, { "auxiliary_loss_clip": 0.01126692, "auxiliary_loss_mlp": 0.01037672, "balance_loss_clip": 1.02349794, "balance_loss_mlp": 1.04537892, "epoch": 0.4272358334585901, "flos": 16253062444800.0, "grad_norm": 5.518260203281138, "language_loss": 0.76363575, "learning_rate": 2.45356224814137e-06, "loss": 0.78527945, "num_input_tokens_seen": 152530515, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.8125, "step": 7106, "time_per_iteration": 2.5578794479370117 }, { "auxiliary_loss_clip": 0.01138216, "auxiliary_loss_mlp": 0.01038378, "balance_loss_clip": 1.0243355, "balance_loss_mlp": 1.04148412, "epoch": 0.4272959567112581, "flos": 24200092938240.0, "grad_norm": 2.4514876742768053, "language_loss": 0.79977369, "learning_rate": 2.453194261600573e-06, "loss": 0.82153964, "num_input_tokens_seen": 152549295, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 7107, "time_per_iteration": 2.5562071800231934 }, { "auxiliary_loss_clip": 0.01156964, "auxiliary_loss_mlp": 0.01037772, "balance_loss_clip": 1.02368736, "balance_loss_mlp": 1.04385042, "epoch": 0.42735607996392605, "flos": 27963495578880.0, "grad_norm": 1.4113262780902354, "language_loss": 0.68007076, "learning_rate": 2.4528262588864376e-06, "loss": 0.70201814, "num_input_tokens_seen": 152570725, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 7108, "time_per_iteration": 2.6462881565093994 }, { "auxiliary_loss_clip": 0.01140151, "auxiliary_loss_mlp": 0.01039561, "balance_loss_clip": 1.02585185, "balance_loss_mlp": 1.04313886, "epoch": 0.427416203216594, "flos": 20295597674880.0, "grad_norm": 1.7307367670696812, "language_loss": 0.71490467, "learning_rate": 2.452458240012098e-06, "loss": 0.73670173, "num_input_tokens_seen": 152588950, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7890625, "step": 7109, "time_per_iteration": 2.539764165878296 }, { "auxiliary_loss_clip": 0.01131601, "auxiliary_loss_mlp": 0.01037431, "balance_loss_clip": 1.02161241, "balance_loss_mlp": 1.04271638, "epoch": 0.427476326469262, "flos": 26943955943040.0, "grad_norm": 1.9998580426349042, "language_loss": 0.647277, "learning_rate": 2.4520902049906883e-06, "loss": 0.66896731, "num_input_tokens_seen": 152608965, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.796875, "step": 7110, "time_per_iteration": 2.6598923206329346 }, { "auxiliary_loss_clip": 0.01123533, "auxiliary_loss_mlp": 0.01284008, "balance_loss_clip": 1.02156854, "balance_loss_mlp": 1.04393625, "epoch": 0.42753644972192995, "flos": 25702847262720.0, "grad_norm": 1.6831692215762715, "language_loss": 0.76679325, "learning_rate": 2.4517221538353413e-06, "loss": 0.79086876, "num_input_tokens_seen": 152630220, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.796875, "step": 7111, "time_per_iteration": 2.557133913040161 }, { "auxiliary_loss_clip": 0.01151722, "auxiliary_loss_mlp": 0.0103464, "balance_loss_clip": 1.02034056, "balance_loss_mlp": 1.04442489, "epoch": 0.4275965729745979, "flos": 18767419499520.0, "grad_norm": 2.3249833993020728, "language_loss": 0.73361254, "learning_rate": 2.4513540865591934e-06, "loss": 0.75547612, "num_input_tokens_seen": 152648835, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 7112, "time_per_iteration": 2.638183355331421 }, { "auxiliary_loss_clip": 0.01139042, "auxiliary_loss_mlp": 0.01038581, "balance_loss_clip": 1.02460921, "balance_loss_mlp": 1.04376721, "epoch": 0.4276566962272659, "flos": 23764424878080.0, "grad_norm": 2.723518280186085, "language_loss": 0.71434712, "learning_rate": 2.450986003175378e-06, "loss": 0.73612332, "num_input_tokens_seen": 152668375, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7734375, "step": 7113, "time_per_iteration": 2.5882506370544434 }, { "auxiliary_loss_clip": 0.01134752, "auxiliary_loss_mlp": 0.01041392, "balance_loss_clip": 1.02701557, "balance_loss_mlp": 1.0453217, "epoch": 0.42771681947993384, "flos": 22492505306880.0, "grad_norm": 1.6925267219589364, "language_loss": 0.61661768, "learning_rate": 2.4506179036970333e-06, "loss": 0.6383791, "num_input_tokens_seen": 152689725, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.80859375, "step": 7114, "time_per_iteration": 2.644278049468994 }, { "auxiliary_loss_clip": 0.01138731, "auxiliary_loss_mlp": 0.01041782, "balance_loss_clip": 1.02556968, "balance_loss_mlp": 1.04522467, "epoch": 0.42777694273260186, "flos": 25044712318080.0, "grad_norm": 1.6812748669660282, "language_loss": 0.64574099, "learning_rate": 2.4502497881372943e-06, "loss": 0.66754615, "num_input_tokens_seen": 152709375, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84375, "step": 7115, "time_per_iteration": 2.553830623626709 }, { "auxiliary_loss_clip": 0.01140203, "auxiliary_loss_mlp": 0.01037874, "balance_loss_clip": 1.0242362, "balance_loss_mlp": 1.04343271, "epoch": 0.4278370659852698, "flos": 18661519226880.0, "grad_norm": 1.6467666357335502, "language_loss": 0.74049795, "learning_rate": 2.449881656509299e-06, "loss": 0.76227868, "num_input_tokens_seen": 152727510, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7890625, "step": 7116, "time_per_iteration": 2.5564091205596924 }, { "auxiliary_loss_clip": 0.01136781, "auxiliary_loss_mlp": 0.01043903, "balance_loss_clip": 1.03005147, "balance_loss_mlp": 1.04208434, "epoch": 0.4278971892379378, "flos": 27308269635840.0, "grad_norm": 1.876974556635926, "language_loss": 0.69642407, "learning_rate": 2.4495135088261844e-06, "loss": 0.71823096, "num_input_tokens_seen": 152746670, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 7117, "time_per_iteration": 2.5483195781707764 }, { "auxiliary_loss_clip": 0.01142896, "auxiliary_loss_mlp": 0.01042689, "balance_loss_clip": 1.0287472, "balance_loss_mlp": 1.04626942, "epoch": 0.42795731249060576, "flos": 12888698970240.0, "grad_norm": 2.3525297894276784, "language_loss": 0.6977548, "learning_rate": 2.4491453451010883e-06, "loss": 0.71961063, "num_input_tokens_seen": 152760545, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78515625, "step": 7118, "time_per_iteration": 3.9123260974884033 }, { "auxiliary_loss_clip": 0.0115022, "auxiliary_loss_mlp": 0.01046233, "balance_loss_clip": 1.03150451, "balance_loss_mlp": 1.04601467, "epoch": 0.4280174357432737, "flos": 33401448316800.0, "grad_norm": 1.963366308546283, "language_loss": 0.74501783, "learning_rate": 2.4487771653471508e-06, "loss": 0.76698232, "num_input_tokens_seen": 152780970, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7734375, "step": 7119, "time_per_iteration": 2.720398187637329 }, { "auxiliary_loss_clip": 0.0116571, "auxiliary_loss_mlp": 0.0103466, "balance_loss_clip": 1.02042055, "balance_loss_mlp": 1.04288435, "epoch": 0.4280775589959417, "flos": 18259104182400.0, "grad_norm": 2.320794626496001, "language_loss": 0.74684119, "learning_rate": 2.4484089695775104e-06, "loss": 0.7688449, "num_input_tokens_seen": 152798475, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 7120, "time_per_iteration": 2.5606892108917236 }, { "auxiliary_loss_clip": 0.01140634, "auxiliary_loss_mlp": 0.01287224, "balance_loss_clip": 1.02634799, "balance_loss_mlp": 1.0454874, "epoch": 0.42813768224860965, "flos": 21471277731840.0, "grad_norm": 1.518299709490905, "language_loss": 0.77187359, "learning_rate": 2.4480407578053073e-06, "loss": 0.79615223, "num_input_tokens_seen": 152817555, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.77734375, "step": 7121, "time_per_iteration": 2.6029093265533447 }, { "auxiliary_loss_clip": 0.0113876, "auxiliary_loss_mlp": 0.01039684, "balance_loss_clip": 1.0263449, "balance_loss_mlp": 1.04391181, "epoch": 0.4281978055012776, "flos": 15669262696320.0, "grad_norm": 2.6759665500894863, "language_loss": 0.8540659, "learning_rate": 2.4476725300436823e-06, "loss": 0.87585032, "num_input_tokens_seen": 152836295, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.76953125, "step": 7122, "time_per_iteration": 2.545891761779785 }, { "auxiliary_loss_clip": 0.01155796, "auxiliary_loss_mlp": 0.01035734, "balance_loss_clip": 1.02222133, "balance_loss_mlp": 1.04413474, "epoch": 0.4282579287539456, "flos": 17712005155200.0, "grad_norm": 1.8409928063559835, "language_loss": 0.8122189, "learning_rate": 2.4473042863057763e-06, "loss": 0.83413416, "num_input_tokens_seen": 152854950, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76171875, "step": 7123, "time_per_iteration": 4.039996385574341 }, { "auxiliary_loss_clip": 0.01148929, "auxiliary_loss_mlp": 0.01033777, "balance_loss_clip": 1.01939988, "balance_loss_mlp": 1.04571009, "epoch": 0.42831805200661355, "flos": 19281157770240.0, "grad_norm": 2.2198369490059084, "language_loss": 0.80536127, "learning_rate": 2.4469360266047305e-06, "loss": 0.82718837, "num_input_tokens_seen": 152873995, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.765625, "step": 7124, "time_per_iteration": 2.5680832862854004 }, { "auxiliary_loss_clip": 0.01130526, "auxiliary_loss_mlp": 0.01040991, "balance_loss_clip": 1.02635241, "balance_loss_mlp": 1.04365504, "epoch": 0.4283781752592815, "flos": 19792633484160.0, "grad_norm": 1.7493958597958037, "language_loss": 0.80933058, "learning_rate": 2.4465677509536876e-06, "loss": 0.83104575, "num_input_tokens_seen": 152892925, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.77734375, "step": 7125, "time_per_iteration": 2.5544772148132324 }, { "auxiliary_loss_clip": 0.01120683, "auxiliary_loss_mlp": 0.01280658, "balance_loss_clip": 1.0206207, "balance_loss_mlp": 1.04492378, "epoch": 0.4284382985119495, "flos": 16508064072960.0, "grad_norm": 1.7494077078895436, "language_loss": 0.74706703, "learning_rate": 2.446199459365791e-06, "loss": 0.77108043, "num_input_tokens_seen": 152910935, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7578125, "step": 7126, "time_per_iteration": 4.057261943817139 }, { "auxiliary_loss_clip": 0.01132156, "auxiliary_loss_mlp": 0.01033958, "balance_loss_clip": 1.01937294, "balance_loss_mlp": 1.04502606, "epoch": 0.42849842176461744, "flos": 23330767979520.0, "grad_norm": 1.6331600086227867, "language_loss": 0.80937672, "learning_rate": 2.445831151854183e-06, "loss": 0.83103788, "num_input_tokens_seen": 152931030, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78125, "step": 7127, "time_per_iteration": 2.523200035095215 }, { "auxiliary_loss_clip": 0.01157254, "auxiliary_loss_mlp": 0.01037995, "balance_loss_clip": 1.02340949, "balance_loss_mlp": 1.04450321, "epoch": 0.4285585450172854, "flos": 17274433674240.0, "grad_norm": 1.8818996027642072, "language_loss": 0.76470989, "learning_rate": 2.445462828432008e-06, "loss": 0.78666234, "num_input_tokens_seen": 152948085, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.77734375, "step": 7128, "time_per_iteration": 4.024348974227905 }, { "auxiliary_loss_clip": 0.01159818, "auxiliary_loss_mlp": 0.01035636, "balance_loss_clip": 1.0211637, "balance_loss_mlp": 1.04419422, "epoch": 0.42861866826995343, "flos": 24279599692800.0, "grad_norm": 2.043697367967716, "language_loss": 0.74198055, "learning_rate": 2.4450944891124105e-06, "loss": 0.76393509, "num_input_tokens_seen": 152966265, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80078125, "step": 7129, "time_per_iteration": 2.6580374240875244 }, { "auxiliary_loss_clip": 0.01132826, "auxiliary_loss_mlp": 0.01278723, "balance_loss_clip": 1.01759374, "balance_loss_mlp": 1.04655814, "epoch": 0.4286787915226214, "flos": 24353108876160.0, "grad_norm": 1.9084698497771984, "language_loss": 0.77446932, "learning_rate": 2.4447261339085355e-06, "loss": 0.79858482, "num_input_tokens_seen": 152986775, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.77734375, "step": 7130, "time_per_iteration": 2.570561647415161 }, { "auxiliary_loss_clip": 0.01144471, "auxiliary_loss_mlp": 0.01035598, "balance_loss_clip": 1.0206964, "balance_loss_mlp": 1.04700494, "epoch": 0.42873891477528936, "flos": 15449992122240.0, "grad_norm": 2.128517109155373, "language_loss": 0.73349094, "learning_rate": 2.4443577628335297e-06, "loss": 0.75529158, "num_input_tokens_seen": 153003595, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 7131, "time_per_iteration": 2.572432279586792 }, { "auxiliary_loss_clip": 0.01145175, "auxiliary_loss_mlp": 0.01033737, "balance_loss_clip": 1.01922941, "balance_loss_mlp": 1.04692209, "epoch": 0.4287990380279573, "flos": 17639573379840.0, "grad_norm": 1.9410984509136624, "language_loss": 0.77060944, "learning_rate": 2.4439893759005374e-06, "loss": 0.79239857, "num_input_tokens_seen": 153021960, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 7132, "time_per_iteration": 2.5892677307128906 }, { "auxiliary_loss_clip": 0.01151512, "auxiliary_loss_mlp": 0.01286256, "balance_loss_clip": 1.02430701, "balance_loss_mlp": 1.04616976, "epoch": 0.4288591612806253, "flos": 27162328677120.0, "grad_norm": 1.7675447915441578, "language_loss": 0.7838434, "learning_rate": 2.4436209731227066e-06, "loss": 0.8082211, "num_input_tokens_seen": 153042110, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7890625, "step": 7133, "time_per_iteration": 2.6613800525665283 }, { "auxiliary_loss_clip": 0.01143358, "auxiliary_loss_mlp": 0.01037472, "balance_loss_clip": 1.02317905, "balance_loss_mlp": 1.04544127, "epoch": 0.42891928453329325, "flos": 17163182275200.0, "grad_norm": 7.75531133726199, "language_loss": 0.75014865, "learning_rate": 2.4432525545131842e-06, "loss": 0.77195692, "num_input_tokens_seen": 153058925, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 7134, "time_per_iteration": 2.611398220062256 }, { "auxiliary_loss_clip": 0.01128137, "auxiliary_loss_mlp": 0.0103012, "balance_loss_clip": 1.01723945, "balance_loss_mlp": 1.04280043, "epoch": 0.4289794077859612, "flos": 18187031543040.0, "grad_norm": 2.0674011656591524, "language_loss": 0.84010303, "learning_rate": 2.4428841200851183e-06, "loss": 0.86168557, "num_input_tokens_seen": 153078070, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.76171875, "step": 7135, "time_per_iteration": 2.6134514808654785 }, { "auxiliary_loss_clip": 0.01163244, "auxiliary_loss_mlp": 0.01033453, "balance_loss_clip": 1.01923728, "balance_loss_mlp": 1.0423224, "epoch": 0.4290395310386292, "flos": 28256885867520.0, "grad_norm": 2.2240605933377773, "language_loss": 0.74682766, "learning_rate": 2.4425156698516576e-06, "loss": 0.76879466, "num_input_tokens_seen": 153096680, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7578125, "step": 7136, "time_per_iteration": 2.6387174129486084 }, { "auxiliary_loss_clip": 0.011311, "auxiliary_loss_mlp": 0.01034863, "balance_loss_clip": 1.02087379, "balance_loss_mlp": 1.042503, "epoch": 0.42909965429129715, "flos": 16216074414720.0, "grad_norm": 2.5567254526226026, "language_loss": 0.7930423, "learning_rate": 2.4421472038259513e-06, "loss": 0.81470191, "num_input_tokens_seen": 153113305, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 7137, "time_per_iteration": 2.6113574504852295 }, { "auxiliary_loss_clip": 0.01125699, "auxiliary_loss_mlp": 0.01030296, "balance_loss_clip": 1.01596665, "balance_loss_mlp": 1.04628277, "epoch": 0.4291597775439651, "flos": 23112862122240.0, "grad_norm": 2.5384903706941406, "language_loss": 0.75849795, "learning_rate": 2.441778722021148e-06, "loss": 0.78005785, "num_input_tokens_seen": 153132735, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.79296875, "step": 7138, "time_per_iteration": 2.517071008682251 }, { "auxiliary_loss_clip": 0.01062962, "auxiliary_loss_mlp": 0.01003694, "balance_loss_clip": 1.0019412, "balance_loss_mlp": 1.02103353, "epoch": 0.4292199007966331, "flos": 67546212681600.0, "grad_norm": 0.778568239070956, "language_loss": 0.5620532, "learning_rate": 2.441410224450399e-06, "loss": 0.58271974, "num_input_tokens_seen": 153187925, "router_z_loss_clip": 0.01757812, "router_z_loss_mlp": 0.2421875, "step": 7139, "time_per_iteration": 3.0764873027801514 }, { "auxiliary_loss_clip": 0.01118469, "auxiliary_loss_mlp": 0.01286323, "balance_loss_clip": 1.0269177, "balance_loss_mlp": 1.04407692, "epoch": 0.42928002404930105, "flos": 22999850956800.0, "grad_norm": 1.692497178749079, "language_loss": 0.80715096, "learning_rate": 2.441041711126854e-06, "loss": 0.83119893, "num_input_tokens_seen": 153206990, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7421875, "step": 7140, "time_per_iteration": 2.623133420944214 }, { "auxiliary_loss_clip": 0.01128667, "auxiliary_loss_mlp": 0.01030661, "balance_loss_clip": 1.0162425, "balance_loss_mlp": 1.04231524, "epoch": 0.429340147301969, "flos": 11544922241280.0, "grad_norm": 1.6662243705830164, "language_loss": 0.82135397, "learning_rate": 2.4406731820636652e-06, "loss": 0.84294724, "num_input_tokens_seen": 153222345, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 7141, "time_per_iteration": 2.596771240234375 }, { "auxiliary_loss_clip": 0.01135022, "auxiliary_loss_mlp": 0.01040481, "balance_loss_clip": 1.02559197, "balance_loss_mlp": 1.04550886, "epoch": 0.42940027055463703, "flos": 25264988472960.0, "grad_norm": 1.784201365997882, "language_loss": 0.86587006, "learning_rate": 2.4403046372739833e-06, "loss": 0.8876251, "num_input_tokens_seen": 153240570, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 7142, "time_per_iteration": 2.6226675510406494 }, { "auxiliary_loss_clip": 0.01138779, "auxiliary_loss_mlp": 0.01033861, "balance_loss_clip": 1.02009821, "balance_loss_mlp": 1.0465281, "epoch": 0.429460393807305, "flos": 23805004268160.0, "grad_norm": 2.018707898340316, "language_loss": 0.78214741, "learning_rate": 2.4399360767709627e-06, "loss": 0.80387378, "num_input_tokens_seen": 153259575, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 7143, "time_per_iteration": 2.6579837799072266 }, { "auxiliary_loss_clip": 0.01158869, "auxiliary_loss_mlp": 0.01038491, "balance_loss_clip": 1.02487779, "balance_loss_mlp": 1.04666841, "epoch": 0.42952051705997296, "flos": 13918294414080.0, "grad_norm": 1.8212750456515097, "language_loss": 0.76726764, "learning_rate": 2.4395675005677545e-06, "loss": 0.78924119, "num_input_tokens_seen": 153276650, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.765625, "step": 7144, "time_per_iteration": 2.6255369186401367 }, { "auxiliary_loss_clip": 0.01147489, "auxiliary_loss_mlp": 0.01031889, "balance_loss_clip": 1.01762593, "balance_loss_mlp": 1.04329515, "epoch": 0.4295806403126409, "flos": 26760380509440.0, "grad_norm": 2.5114354914821404, "language_loss": 0.73442638, "learning_rate": 2.439198908677513e-06, "loss": 0.75622022, "num_input_tokens_seen": 153298025, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 7145, "time_per_iteration": 2.616386651992798 }, { "auxiliary_loss_clip": 0.01131939, "auxiliary_loss_mlp": 0.01035268, "balance_loss_clip": 1.02074826, "balance_loss_mlp": 1.04386449, "epoch": 0.4296407635653089, "flos": 20952619297920.0, "grad_norm": 9.491615464652059, "language_loss": 0.79252762, "learning_rate": 2.4388303011133927e-06, "loss": 0.81419963, "num_input_tokens_seen": 153315775, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.79296875, "step": 7146, "time_per_iteration": 2.572441339492798 }, { "auxiliary_loss_clip": 0.01127882, "auxiliary_loss_mlp": 0.01032798, "balance_loss_clip": 1.01970851, "balance_loss_mlp": 1.04303491, "epoch": 0.42970088681797686, "flos": 15852335339520.0, "grad_norm": 4.029774111637552, "language_loss": 0.83260202, "learning_rate": 2.438461677888547e-06, "loss": 0.85420883, "num_input_tokens_seen": 153332765, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7578125, "step": 7147, "time_per_iteration": 2.5392284393310547 }, { "auxiliary_loss_clip": 0.01133818, "auxiliary_loss_mlp": 0.0103512, "balance_loss_clip": 1.01955152, "balance_loss_mlp": 1.04607773, "epoch": 0.4297610100706448, "flos": 22382618624640.0, "grad_norm": 1.8759932297842354, "language_loss": 0.87210906, "learning_rate": 2.4380930390161324e-06, "loss": 0.89379841, "num_input_tokens_seen": 153350760, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.78515625, "step": 7148, "time_per_iteration": 2.5718560218811035 }, { "auxiliary_loss_clip": 0.01141341, "auxiliary_loss_mlp": 0.01037101, "balance_loss_clip": 1.0231657, "balance_loss_mlp": 1.04536831, "epoch": 0.4298211333233128, "flos": 27925681536000.0, "grad_norm": 1.6123173582457162, "language_loss": 0.78011537, "learning_rate": 2.437724384509304e-06, "loss": 0.80189979, "num_input_tokens_seen": 153370765, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.77734375, "step": 7149, "time_per_iteration": 2.6136467456817627 }, { "auxiliary_loss_clip": 0.01145135, "auxiliary_loss_mlp": 0.01036795, "balance_loss_clip": 1.02312183, "balance_loss_mlp": 1.04326379, "epoch": 0.42988125657598075, "flos": 24425612478720.0, "grad_norm": 2.0163421282012903, "language_loss": 0.79466438, "learning_rate": 2.4373557143812184e-06, "loss": 0.81648374, "num_input_tokens_seen": 153390725, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 7150, "time_per_iteration": 2.681816577911377 }, { "auxiliary_loss_clip": 0.01135183, "auxiliary_loss_mlp": 0.0104434, "balance_loss_clip": 1.0284729, "balance_loss_mlp": 1.04586613, "epoch": 0.4299413798286487, "flos": 15850180523520.0, "grad_norm": 1.604511274532561, "language_loss": 0.74734384, "learning_rate": 2.4369870286450318e-06, "loss": 0.76913905, "num_input_tokens_seen": 153408010, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.80859375, "step": 7151, "time_per_iteration": 2.530902624130249 }, { "auxiliary_loss_clip": 0.01151713, "auxiliary_loss_mlp": 0.01033781, "balance_loss_clip": 1.01918411, "balance_loss_mlp": 1.04374599, "epoch": 0.4300015030813167, "flos": 22309504490880.0, "grad_norm": 2.5565019143264767, "language_loss": 0.70346951, "learning_rate": 2.436618327313902e-06, "loss": 0.72532445, "num_input_tokens_seen": 153426865, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8046875, "step": 7152, "time_per_iteration": 2.6643295288085938 }, { "auxiliary_loss_clip": 0.01136757, "auxiliary_loss_mlp": 0.01035764, "balance_loss_clip": 1.02177489, "balance_loss_mlp": 1.0407908, "epoch": 0.43006162633398465, "flos": 34897666366080.0, "grad_norm": 1.5681411434928814, "language_loss": 0.70733368, "learning_rate": 2.4362496104009886e-06, "loss": 0.72905892, "num_input_tokens_seen": 153449410, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.77734375, "step": 7153, "time_per_iteration": 2.7150652408599854 }, { "auxiliary_loss_clip": 0.01130728, "auxiliary_loss_mlp": 0.01031451, "balance_loss_clip": 1.01712251, "balance_loss_mlp": 1.0422523, "epoch": 0.4301217495866526, "flos": 15961575576960.0, "grad_norm": 1.9692305875084501, "language_loss": 0.77879727, "learning_rate": 2.4358808779194477e-06, "loss": 0.80041909, "num_input_tokens_seen": 153467910, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.79296875, "step": 7154, "time_per_iteration": 2.603099822998047 }, { "auxiliary_loss_clip": 0.01136734, "auxiliary_loss_mlp": 0.01030757, "balance_loss_clip": 1.01760876, "balance_loss_mlp": 1.04296064, "epoch": 0.43018187283932063, "flos": 18770364414720.0, "grad_norm": 1.6342420386029823, "language_loss": 0.7888552, "learning_rate": 2.43551212988244e-06, "loss": 0.81053019, "num_input_tokens_seen": 153487100, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7578125, "step": 7155, "time_per_iteration": 2.6431033611297607 }, { "auxiliary_loss_clip": 0.01136663, "auxiliary_loss_mlp": 0.01031084, "balance_loss_clip": 1.01750565, "balance_loss_mlp": 1.04316664, "epoch": 0.4302419960919886, "flos": 20151703791360.0, "grad_norm": 1.79510870028465, "language_loss": 0.89009392, "learning_rate": 2.435143366303124e-06, "loss": 0.91177142, "num_input_tokens_seen": 153505565, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75390625, "step": 7156, "time_per_iteration": 2.642101764678955 }, { "auxiliary_loss_clip": 0.01121372, "auxiliary_loss_mlp": 0.01033047, "balance_loss_clip": 1.01882529, "balance_loss_mlp": 1.04240251, "epoch": 0.43030211934465656, "flos": 26432731624320.0, "grad_norm": 1.8397702175070396, "language_loss": 0.83117807, "learning_rate": 2.4347745871946607e-06, "loss": 0.85272229, "num_input_tokens_seen": 153526130, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 7157, "time_per_iteration": 2.5873799324035645 }, { "auxiliary_loss_clip": 0.01125058, "auxiliary_loss_mlp": 0.01033857, "balance_loss_clip": 1.01992095, "balance_loss_mlp": 1.04482675, "epoch": 0.43036224259732453, "flos": 24389234979840.0, "grad_norm": 1.809753387944813, "language_loss": 0.7202425, "learning_rate": 2.4344057925702113e-06, "loss": 0.74183166, "num_input_tokens_seen": 153546370, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.80078125, "step": 7158, "time_per_iteration": 2.664903163909912 }, { "auxiliary_loss_clip": 0.01129026, "auxiliary_loss_mlp": 0.01032928, "balance_loss_clip": 1.01935601, "balance_loss_mlp": 1.04256511, "epoch": 0.4304223658499925, "flos": 17201714590080.0, "grad_norm": 1.9404984761068969, "language_loss": 0.82698393, "learning_rate": 2.4340369824429364e-06, "loss": 0.84860349, "num_input_tokens_seen": 153562800, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7734375, "step": 7159, "time_per_iteration": 2.5251550674438477 }, { "auxiliary_loss_clip": 0.01138122, "auxiliary_loss_mlp": 0.0103258, "balance_loss_clip": 1.01906192, "balance_loss_mlp": 1.04252458, "epoch": 0.43048248910266046, "flos": 14903000835840.0, "grad_norm": 2.037356376543217, "language_loss": 0.82670546, "learning_rate": 2.433668156825998e-06, "loss": 0.84841239, "num_input_tokens_seen": 153578395, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.77734375, "step": 7160, "time_per_iteration": 3.9039394855499268 }, { "auxiliary_loss_clip": 0.0113341, "auxiliary_loss_mlp": 0.01039896, "balance_loss_clip": 1.0235281, "balance_loss_mlp": 1.04353094, "epoch": 0.4305426123553284, "flos": 21579835610880.0, "grad_norm": 1.8843085710617742, "language_loss": 0.76685965, "learning_rate": 2.4332993157325588e-06, "loss": 0.7885927, "num_input_tokens_seen": 153596880, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.80859375, "step": 7161, "time_per_iteration": 2.567352294921875 }, { "auxiliary_loss_clip": 0.01130025, "auxiliary_loss_mlp": 0.01039142, "balance_loss_clip": 1.02596951, "balance_loss_mlp": 1.04348481, "epoch": 0.4306027356079964, "flos": 22601278667520.0, "grad_norm": 2.0062217685686674, "language_loss": 0.72752589, "learning_rate": 2.4329304591757815e-06, "loss": 0.74921757, "num_input_tokens_seen": 153616570, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7734375, "step": 7162, "time_per_iteration": 2.5474860668182373 }, { "auxiliary_loss_clip": 0.01067704, "auxiliary_loss_mlp": 0.01260212, "balance_loss_clip": 1.01242542, "balance_loss_mlp": 1.01708364, "epoch": 0.43066285886066435, "flos": 70672091806080.0, "grad_norm": 0.8111411820526931, "language_loss": 0.58986163, "learning_rate": 2.4325615871688297e-06, "loss": 0.61314082, "num_input_tokens_seen": 153671450, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.24023438, "step": 7163, "time_per_iteration": 3.110891580581665 }, { "auxiliary_loss_clip": 0.01135095, "auxiliary_loss_mlp": 0.01031352, "balance_loss_clip": 1.01850152, "balance_loss_mlp": 1.04062986, "epoch": 0.4307229821133323, "flos": 26720591218560.0, "grad_norm": 1.7141539922089435, "language_loss": 0.79101562, "learning_rate": 2.4321926997248676e-06, "loss": 0.81268013, "num_input_tokens_seen": 153691405, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.765625, "step": 7164, "time_per_iteration": 4.07231330871582 }, { "auxiliary_loss_clip": 0.01121445, "auxiliary_loss_mlp": 0.01039715, "balance_loss_clip": 1.02577925, "balance_loss_mlp": 1.04062676, "epoch": 0.4307831053660003, "flos": 26177119464960.0, "grad_norm": 1.7439366430560466, "language_loss": 0.67694604, "learning_rate": 2.4318237968570594e-06, "loss": 0.69855762, "num_input_tokens_seen": 153711555, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.80859375, "step": 7165, "time_per_iteration": 2.6579573154449463 }, { "auxiliary_loss_clip": 0.01138958, "auxiliary_loss_mlp": 0.0129496, "balance_loss_clip": 1.03455138, "balance_loss_mlp": 1.04342616, "epoch": 0.43084322861866825, "flos": 18910343715840.0, "grad_norm": 3.261861313233961, "language_loss": 0.74883199, "learning_rate": 2.4314548785785713e-06, "loss": 0.77317119, "num_input_tokens_seen": 153730095, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7734375, "step": 7166, "time_per_iteration": 2.6132044792175293 }, { "auxiliary_loss_clip": 0.01128532, "auxiliary_loss_mlp": 0.0103754, "balance_loss_clip": 1.02457047, "balance_loss_mlp": 1.04380751, "epoch": 0.4309033518713362, "flos": 26432911192320.0, "grad_norm": 1.753508621690722, "language_loss": 0.71700174, "learning_rate": 2.4310859449025675e-06, "loss": 0.73866248, "num_input_tokens_seen": 153749320, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7578125, "step": 7167, "time_per_iteration": 4.072921514511108 }, { "auxiliary_loss_clip": 0.01133827, "auxiliary_loss_mlp": 0.01282411, "balance_loss_clip": 1.02268505, "balance_loss_mlp": 1.04225636, "epoch": 0.43096347512400424, "flos": 21213295274880.0, "grad_norm": 2.9406509601451902, "language_loss": 0.78186023, "learning_rate": 2.430716995842216e-06, "loss": 0.80602264, "num_input_tokens_seen": 153767825, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.73828125, "step": 7168, "time_per_iteration": 2.5572593212127686 }, { "auxiliary_loss_clip": 0.01136898, "auxiliary_loss_mlp": 0.01041361, "balance_loss_clip": 1.02750301, "balance_loss_mlp": 1.04191923, "epoch": 0.4310235983766722, "flos": 16540131939840.0, "grad_norm": 1.947594524594806, "language_loss": 0.82318664, "learning_rate": 2.4303480314106823e-06, "loss": 0.84496915, "num_input_tokens_seen": 153785350, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.77734375, "step": 7169, "time_per_iteration": 2.5438547134399414 }, { "auxiliary_loss_clip": 0.01059302, "auxiliary_loss_mlp": 0.010001, "balance_loss_clip": 0.99847853, "balance_loss_mlp": 1.01743281, "epoch": 0.43108372162934017, "flos": 64775704763520.0, "grad_norm": 0.6776057346422851, "language_loss": 0.6073907, "learning_rate": 2.429979051621135e-06, "loss": 0.6279847, "num_input_tokens_seen": 153856400, "router_z_loss_clip": 0.01623535, "router_z_loss_mlp": 0.2421875, "step": 7170, "time_per_iteration": 4.783738613128662 }, { "auxiliary_loss_clip": 0.01119285, "auxiliary_loss_mlp": 0.01032451, "balance_loss_clip": 1.01971376, "balance_loss_mlp": 1.04212379, "epoch": 0.43114384488200813, "flos": 13444094039040.0, "grad_norm": 1.7695555792864135, "language_loss": 0.75801986, "learning_rate": 2.429610056486741e-06, "loss": 0.7795372, "num_input_tokens_seen": 153875230, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7734375, "step": 7171, "time_per_iteration": 2.5363264083862305 }, { "auxiliary_loss_clip": 0.01127591, "auxiliary_loss_mlp": 0.01035965, "balance_loss_clip": 1.02334082, "balance_loss_mlp": 1.04276931, "epoch": 0.4312039681346761, "flos": 26286682924800.0, "grad_norm": 1.6127431539917207, "language_loss": 0.77418184, "learning_rate": 2.4292410460206693e-06, "loss": 0.79581743, "num_input_tokens_seen": 153894740, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7578125, "step": 7172, "time_per_iteration": 2.6040337085723877 }, { "auxiliary_loss_clip": 0.01139545, "auxiliary_loss_mlp": 0.01039839, "balance_loss_clip": 1.02479458, "balance_loss_mlp": 1.04268098, "epoch": 0.43126409138734406, "flos": 20376684627840.0, "grad_norm": 2.9853236097410774, "language_loss": 0.76532412, "learning_rate": 2.4288720202360887e-06, "loss": 0.78711796, "num_input_tokens_seen": 153913230, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 7173, "time_per_iteration": 2.5957560539245605 }, { "auxiliary_loss_clip": 0.01117776, "auxiliary_loss_mlp": 0.01279665, "balance_loss_clip": 1.01923692, "balance_loss_mlp": 1.04190874, "epoch": 0.431324214640012, "flos": 22123091882880.0, "grad_norm": 1.8559302733624063, "language_loss": 0.7694273, "learning_rate": 2.4285029791461687e-06, "loss": 0.79340172, "num_input_tokens_seen": 153933250, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 7174, "time_per_iteration": 2.594176769256592 }, { "auxiliary_loss_clip": 0.01123363, "auxiliary_loss_mlp": 0.01032293, "balance_loss_clip": 1.01743984, "balance_loss_mlp": 1.04375613, "epoch": 0.43138433789268, "flos": 15231008856960.0, "grad_norm": 1.4773294217898623, "language_loss": 0.82493055, "learning_rate": 2.42813392276408e-06, "loss": 0.84648716, "num_input_tokens_seen": 153951325, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 7175, "time_per_iteration": 2.535102128982544 }, { "auxiliary_loss_clip": 0.01151529, "auxiliary_loss_mlp": 0.01038104, "balance_loss_clip": 1.02377534, "balance_loss_mlp": 1.04300475, "epoch": 0.43144446114534796, "flos": 18150294908160.0, "grad_norm": 2.640989815010168, "language_loss": 0.74412513, "learning_rate": 2.4277648511029936e-06, "loss": 0.76602149, "num_input_tokens_seen": 153966975, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8125, "step": 7176, "time_per_iteration": 2.655805826187134 }, { "auxiliary_loss_clip": 0.01129975, "auxiliary_loss_mlp": 0.01030893, "balance_loss_clip": 1.01738071, "balance_loss_mlp": 1.04142392, "epoch": 0.4315045843980159, "flos": 22929861306240.0, "grad_norm": 1.796786204343994, "language_loss": 0.73650169, "learning_rate": 2.4273957641760784e-06, "loss": 0.7581104, "num_input_tokens_seen": 153986695, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.79296875, "step": 7177, "time_per_iteration": 2.5753612518310547 }, { "auxiliary_loss_clip": 0.01145246, "auxiliary_loss_mlp": 0.01043123, "balance_loss_clip": 1.02718461, "balance_loss_mlp": 1.041996, "epoch": 0.4315647076506839, "flos": 22126862810880.0, "grad_norm": 3.340538513029757, "language_loss": 0.81235188, "learning_rate": 2.4270266619965087e-06, "loss": 0.83423555, "num_input_tokens_seen": 154004710, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.85546875, "step": 7178, "time_per_iteration": 2.7507524490356445 }, { "auxiliary_loss_clip": 0.01133791, "auxiliary_loss_mlp": 0.01030257, "balance_loss_clip": 1.01719773, "balance_loss_mlp": 1.04238045, "epoch": 0.43162483090335185, "flos": 26871129118080.0, "grad_norm": 1.5449197035833886, "language_loss": 0.8397944, "learning_rate": 2.4266575445774555e-06, "loss": 0.86143488, "num_input_tokens_seen": 154024320, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 7179, "time_per_iteration": 2.7061495780944824 }, { "auxiliary_loss_clip": 0.011379, "auxiliary_loss_mlp": 0.01037089, "balance_loss_clip": 1.02334452, "balance_loss_mlp": 1.04272103, "epoch": 0.4316849541560198, "flos": 13913122855680.0, "grad_norm": 1.7878187541459611, "language_loss": 0.75557125, "learning_rate": 2.4262884119320924e-06, "loss": 0.7773211, "num_input_tokens_seen": 154041755, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7734375, "step": 7180, "time_per_iteration": 2.6238625049591064 }, { "auxiliary_loss_clip": 0.01139465, "auxiliary_loss_mlp": 0.01285975, "balance_loss_clip": 1.02420461, "balance_loss_mlp": 1.04095125, "epoch": 0.4317450774086878, "flos": 16435165420800.0, "grad_norm": 2.4620589567967186, "language_loss": 0.81723011, "learning_rate": 2.4259192640735923e-06, "loss": 0.84148455, "num_input_tokens_seen": 154056775, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 7181, "time_per_iteration": 2.5224761962890625 }, { "auxiliary_loss_clip": 0.01142288, "auxiliary_loss_mlp": 0.01035497, "balance_loss_clip": 1.02013052, "balance_loss_mlp": 1.04319572, "epoch": 0.4318052006613558, "flos": 20554980762240.0, "grad_norm": 2.403343428900697, "language_loss": 0.88924474, "learning_rate": 2.4255501010151287e-06, "loss": 0.91102254, "num_input_tokens_seen": 154075015, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.80859375, "step": 7182, "time_per_iteration": 2.5842623710632324 }, { "auxiliary_loss_clip": 0.01129332, "auxiliary_loss_mlp": 0.01283064, "balance_loss_clip": 1.02310193, "balance_loss_mlp": 1.04449725, "epoch": 0.43186532391402377, "flos": 22820046451200.0, "grad_norm": 1.6804231385745763, "language_loss": 0.75710416, "learning_rate": 2.4251809227698777e-06, "loss": 0.78122807, "num_input_tokens_seen": 154095170, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7578125, "step": 7183, "time_per_iteration": 2.5652732849121094 }, { "auxiliary_loss_clip": 0.01133308, "auxiliary_loss_mlp": 0.01031279, "balance_loss_clip": 1.01751089, "balance_loss_mlp": 1.04479432, "epoch": 0.43192544716669173, "flos": 25556583081600.0, "grad_norm": 1.4883836630616076, "language_loss": 0.77369475, "learning_rate": 2.4248117293510123e-06, "loss": 0.79534066, "num_input_tokens_seen": 154116895, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.796875, "step": 7184, "time_per_iteration": 2.6977994441986084 }, { "auxiliary_loss_clip": 0.01135193, "auxiliary_loss_mlp": 0.01036994, "balance_loss_clip": 1.02397084, "balance_loss_mlp": 1.04380941, "epoch": 0.4319855704193597, "flos": 30954674701440.0, "grad_norm": 1.5965634060950813, "language_loss": 0.73337388, "learning_rate": 2.42444252077171e-06, "loss": 0.75509572, "num_input_tokens_seen": 154138395, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 7185, "time_per_iteration": 2.7364892959594727 }, { "auxiliary_loss_clip": 0.01142602, "auxiliary_loss_mlp": 0.0103657, "balance_loss_clip": 1.02269399, "balance_loss_mlp": 1.04354358, "epoch": 0.43204569367202766, "flos": 24238732993920.0, "grad_norm": 2.6367242789995404, "language_loss": 0.75630903, "learning_rate": 2.4240732970451445e-06, "loss": 0.77810073, "num_input_tokens_seen": 154156775, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8125, "step": 7186, "time_per_iteration": 2.6182785034179688 }, { "auxiliary_loss_clip": 0.01149048, "auxiliary_loss_mlp": 0.01038331, "balance_loss_clip": 1.02471101, "balance_loss_mlp": 1.04371011, "epoch": 0.43210581692469563, "flos": 18406948561920.0, "grad_norm": 2.4987574097798677, "language_loss": 0.76485181, "learning_rate": 2.423704058184495e-06, "loss": 0.78672558, "num_input_tokens_seen": 154177500, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7890625, "step": 7187, "time_per_iteration": 2.630154609680176 }, { "auxiliary_loss_clip": 0.0107407, "auxiliary_loss_mlp": 0.0100526, "balance_loss_clip": 1.00349605, "balance_loss_mlp": 1.02235603, "epoch": 0.4321659401773636, "flos": 49832378910720.0, "grad_norm": 0.8563848960634466, "language_loss": 0.68142843, "learning_rate": 2.4233348042029374e-06, "loss": 0.70222175, "num_input_tokens_seen": 154237110, "router_z_loss_clip": 0.0177002, "router_z_loss_mlp": 0.24609375, "step": 7188, "time_per_iteration": 3.3297066688537598 }, { "auxiliary_loss_clip": 0.01138651, "auxiliary_loss_mlp": 0.01036786, "balance_loss_clip": 1.02316594, "balance_loss_mlp": 1.04551935, "epoch": 0.43222606343003156, "flos": 17128564542720.0, "grad_norm": 1.8849611130372275, "language_loss": 0.77857667, "learning_rate": 2.4229655351136493e-06, "loss": 0.80033106, "num_input_tokens_seen": 154253910, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75390625, "step": 7189, "time_per_iteration": 2.63954496383667 }, { "auxiliary_loss_clip": 0.01162733, "auxiliary_loss_mlp": 0.01048284, "balance_loss_clip": 1.03315628, "balance_loss_mlp": 1.04289353, "epoch": 0.4322861866826995, "flos": 22749949059840.0, "grad_norm": 1.8096910794231378, "language_loss": 0.71675384, "learning_rate": 2.4225962509298097e-06, "loss": 0.738864, "num_input_tokens_seen": 154274770, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8359375, "step": 7190, "time_per_iteration": 2.627847671508789 }, { "auxiliary_loss_clip": 0.01139309, "auxiliary_loss_mlp": 0.01038237, "balance_loss_clip": 1.02477276, "balance_loss_mlp": 1.04360008, "epoch": 0.4323463099353675, "flos": 27891925729920.0, "grad_norm": 4.1594813399034525, "language_loss": 0.79570425, "learning_rate": 2.422226951664597e-06, "loss": 0.81747973, "num_input_tokens_seen": 154295035, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78125, "step": 7191, "time_per_iteration": 2.6279468536376953 }, { "auxiliary_loss_clip": 0.01156971, "auxiliary_loss_mlp": 0.01037569, "balance_loss_clip": 1.0240624, "balance_loss_mlp": 1.04470336, "epoch": 0.43240643318803546, "flos": 21614740652160.0, "grad_norm": 1.547178205511153, "language_loss": 0.74980319, "learning_rate": 2.42185763733119e-06, "loss": 0.77174854, "num_input_tokens_seen": 154314905, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7734375, "step": 7192, "time_per_iteration": 2.597031831741333 }, { "auxiliary_loss_clip": 0.01159981, "auxiliary_loss_mlp": 0.0104289, "balance_loss_clip": 1.0283463, "balance_loss_mlp": 1.04594123, "epoch": 0.4324665564407034, "flos": 17558378686080.0, "grad_norm": 1.868825205793016, "language_loss": 0.78732151, "learning_rate": 2.4214883079427693e-06, "loss": 0.80935025, "num_input_tokens_seen": 154331740, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 7193, "time_per_iteration": 2.59656023979187 }, { "auxiliary_loss_clip": 0.01141645, "auxiliary_loss_mlp": 0.01043734, "balance_loss_clip": 1.02914262, "balance_loss_mlp": 1.044191, "epoch": 0.4325266796933714, "flos": 18402423448320.0, "grad_norm": 1.8972010102474226, "language_loss": 0.75689882, "learning_rate": 2.421118963512515e-06, "loss": 0.77875262, "num_input_tokens_seen": 154348740, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 7194, "time_per_iteration": 2.5154662132263184 }, { "auxiliary_loss_clip": 0.0112496, "auxiliary_loss_mlp": 0.01038508, "balance_loss_clip": 1.02429807, "balance_loss_mlp": 1.04521441, "epoch": 0.4325868029460394, "flos": 22564793427840.0, "grad_norm": 2.2996373897257727, "language_loss": 0.59853292, "learning_rate": 2.4207496040536086e-06, "loss": 0.62016755, "num_input_tokens_seen": 154368835, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 7195, "time_per_iteration": 2.554227352142334 }, { "auxiliary_loss_clip": 0.01144414, "auxiliary_loss_mlp": 0.01039542, "balance_loss_clip": 1.02430105, "balance_loss_mlp": 1.04479873, "epoch": 0.43264692619870737, "flos": 14605516396800.0, "grad_norm": 3.135637840016155, "language_loss": 0.65465295, "learning_rate": 2.4203802295792303e-06, "loss": 0.67649251, "num_input_tokens_seen": 154384620, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 7196, "time_per_iteration": 2.490623950958252 }, { "auxiliary_loss_clip": 0.01128328, "auxiliary_loss_mlp": 0.01041371, "balance_loss_clip": 1.02629685, "balance_loss_mlp": 1.04585028, "epoch": 0.43270704945137534, "flos": 21501657659520.0, "grad_norm": 2.2509944319619115, "language_loss": 0.72030318, "learning_rate": 2.4200108401025635e-06, "loss": 0.74200022, "num_input_tokens_seen": 154402865, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.82421875, "step": 7197, "time_per_iteration": 2.5540144443511963 }, { "auxiliary_loss_clip": 0.01131278, "auxiliary_loss_mlp": 0.01040187, "balance_loss_clip": 1.02643681, "balance_loss_mlp": 1.04620838, "epoch": 0.4327671727040433, "flos": 25155891889920.0, "grad_norm": 1.539239889900785, "language_loss": 0.7288025, "learning_rate": 2.41964143563679e-06, "loss": 0.75051713, "num_input_tokens_seen": 154423625, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.76171875, "step": 7198, "time_per_iteration": 2.547261953353882 }, { "auxiliary_loss_clip": 0.01149204, "auxiliary_loss_mlp": 0.01031137, "balance_loss_clip": 1.01711226, "balance_loss_mlp": 1.04438531, "epoch": 0.43282729595671127, "flos": 25447163276160.0, "grad_norm": 1.3967253766652197, "language_loss": 0.80890095, "learning_rate": 2.419272016195093e-06, "loss": 0.83070433, "num_input_tokens_seen": 154444775, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 7199, "time_per_iteration": 2.6747477054595947 }, { "auxiliary_loss_clip": 0.0114314, "auxiliary_loss_mlp": 0.01034151, "balance_loss_clip": 1.02027535, "balance_loss_mlp": 1.04652834, "epoch": 0.43288741920937923, "flos": 24126116878080.0, "grad_norm": 2.211840994751944, "language_loss": 0.68465149, "learning_rate": 2.4189025817906567e-06, "loss": 0.70642442, "num_input_tokens_seen": 154460815, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7890625, "step": 7200, "time_per_iteration": 2.5235860347747803 }, { "auxiliary_loss_clip": 0.01142545, "auxiliary_loss_mlp": 0.01285704, "balance_loss_clip": 1.02395463, "balance_loss_mlp": 1.04484415, "epoch": 0.4329475424620472, "flos": 19204955066880.0, "grad_norm": 1.9015817214497728, "language_loss": 0.87054801, "learning_rate": 2.4185331324366642e-06, "loss": 0.89483047, "num_input_tokens_seen": 154479145, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80078125, "step": 7201, "time_per_iteration": 3.9429283142089844 }, { "auxiliary_loss_clip": 0.01175769, "auxiliary_loss_mlp": 0.01038291, "balance_loss_clip": 1.02181673, "balance_loss_mlp": 1.04792917, "epoch": 0.43300766571471516, "flos": 22638374438400.0, "grad_norm": 1.8787842018235228, "language_loss": 0.63782305, "learning_rate": 2.418163668146301e-06, "loss": 0.65996361, "num_input_tokens_seen": 154498905, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8359375, "step": 7202, "time_per_iteration": 2.6128857135772705 }, { "auxiliary_loss_clip": 0.01158151, "auxiliary_loss_mlp": 0.01027007, "balance_loss_clip": 1.01392937, "balance_loss_mlp": 1.04409695, "epoch": 0.4330677889673831, "flos": 22121080721280.0, "grad_norm": 1.8023388379033842, "language_loss": 0.81933272, "learning_rate": 2.4177941889327523e-06, "loss": 0.84118432, "num_input_tokens_seen": 154517270, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.78125, "step": 7203, "time_per_iteration": 2.6249892711639404 }, { "auxiliary_loss_clip": 0.01154595, "auxiliary_loss_mlp": 0.01044522, "balance_loss_clip": 1.03033614, "balance_loss_mlp": 1.04487681, "epoch": 0.4331279122200511, "flos": 23221527742080.0, "grad_norm": 2.0871565763158317, "language_loss": 0.81220758, "learning_rate": 2.4174246948092035e-06, "loss": 0.83419871, "num_input_tokens_seen": 154535945, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.828125, "step": 7204, "time_per_iteration": 2.597949504852295 }, { "auxiliary_loss_clip": 0.01138462, "auxiliary_loss_mlp": 0.01038004, "balance_loss_clip": 1.02405107, "balance_loss_mlp": 1.04225659, "epoch": 0.43318803547271906, "flos": 18259750627200.0, "grad_norm": 1.9320277007206808, "language_loss": 0.73625088, "learning_rate": 2.4170551857888414e-06, "loss": 0.75801557, "num_input_tokens_seen": 154554935, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78515625, "step": 7205, "time_per_iteration": 2.5537447929382324 }, { "auxiliary_loss_clip": 0.01122762, "auxiliary_loss_mlp": 0.01284225, "balance_loss_clip": 1.0239675, "balance_loss_mlp": 1.04698014, "epoch": 0.433248158725387, "flos": 27418407713280.0, "grad_norm": 1.759033201446134, "language_loss": 0.74961209, "learning_rate": 2.4166856618848526e-06, "loss": 0.77368194, "num_input_tokens_seen": 154576065, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 7206, "time_per_iteration": 4.093183755874634 }, { "auxiliary_loss_clip": 0.0115875, "auxiliary_loss_mlp": 0.01037386, "balance_loss_clip": 1.02274728, "balance_loss_mlp": 1.04364347, "epoch": 0.433308281978055, "flos": 23218008209280.0, "grad_norm": 2.2747939156477397, "language_loss": 0.78390062, "learning_rate": 2.416316123110424e-06, "loss": 0.80586195, "num_input_tokens_seen": 154595110, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 7207, "time_per_iteration": 2.664330005645752 }, { "auxiliary_loss_clip": 0.01153072, "auxiliary_loss_mlp": 0.01035551, "balance_loss_clip": 1.02025044, "balance_loss_mlp": 1.04533863, "epoch": 0.433368405230723, "flos": 15852407166720.0, "grad_norm": 3.576464998080424, "language_loss": 0.806445, "learning_rate": 2.415946569478744e-06, "loss": 0.82833123, "num_input_tokens_seen": 154612255, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.80859375, "step": 7208, "time_per_iteration": 2.5575270652770996 }, { "auxiliary_loss_clip": 0.01143756, "auxiliary_loss_mlp": 0.01035988, "balance_loss_clip": 1.02238595, "balance_loss_mlp": 1.04685676, "epoch": 0.433428528483391, "flos": 19026084314880.0, "grad_norm": 2.107129809720651, "language_loss": 0.69804382, "learning_rate": 2.415577001003001e-06, "loss": 0.71984124, "num_input_tokens_seen": 154630440, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7890625, "step": 7209, "time_per_iteration": 4.16021203994751 }, { "auxiliary_loss_clip": 0.01129815, "auxiliary_loss_mlp": 0.01035263, "balance_loss_clip": 1.02065945, "balance_loss_mlp": 1.04336286, "epoch": 0.43348865173605894, "flos": 24718248581760.0, "grad_norm": 1.652584128248888, "language_loss": 0.81248611, "learning_rate": 2.4152074176963838e-06, "loss": 0.8341369, "num_input_tokens_seen": 154652515, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.77734375, "step": 7210, "time_per_iteration": 2.585369110107422 }, { "auxiliary_loss_clip": 0.01139065, "auxiliary_loss_mlp": 0.01036501, "balance_loss_clip": 1.0232749, "balance_loss_mlp": 1.04352045, "epoch": 0.4335487749887269, "flos": 22090664880000.0, "grad_norm": 1.8972238093144147, "language_loss": 0.81792897, "learning_rate": 2.4148378195720816e-06, "loss": 0.83968472, "num_input_tokens_seen": 154670965, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.78125, "step": 7211, "time_per_iteration": 4.120189666748047 }, { "auxiliary_loss_clip": 0.01129466, "auxiliary_loss_mlp": 0.01035784, "balance_loss_clip": 1.02300501, "balance_loss_mlp": 1.04171324, "epoch": 0.43360889824139487, "flos": 22382941847040.0, "grad_norm": 2.744576439400468, "language_loss": 0.74505222, "learning_rate": 2.4144682066432847e-06, "loss": 0.76670474, "num_input_tokens_seen": 154689980, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7890625, "step": 7212, "time_per_iteration": 2.6100499629974365 }, { "auxiliary_loss_clip": 0.01142212, "auxiliary_loss_mlp": 0.01030887, "balance_loss_clip": 1.01768494, "balance_loss_mlp": 1.04506612, "epoch": 0.43366902149406283, "flos": 17528286067200.0, "grad_norm": 1.7819085766682894, "language_loss": 0.76514769, "learning_rate": 2.4140985789231838e-06, "loss": 0.78687871, "num_input_tokens_seen": 154706570, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.79296875, "step": 7213, "time_per_iteration": 2.5276076793670654 }, { "auxiliary_loss_clip": 0.01131627, "auxiliary_loss_mlp": 0.01032888, "balance_loss_clip": 1.01933956, "balance_loss_mlp": 1.04380739, "epoch": 0.4337291447467308, "flos": 19022672522880.0, "grad_norm": 1.5436593723266012, "language_loss": 0.64873111, "learning_rate": 2.4137289364249698e-06, "loss": 0.67037618, "num_input_tokens_seen": 154725210, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7890625, "step": 7214, "time_per_iteration": 2.5815048217773438 }, { "auxiliary_loss_clip": 0.01135808, "auxiliary_loss_mlp": 0.01036431, "balance_loss_clip": 1.02348447, "balance_loss_mlp": 1.04301119, "epoch": 0.43378926799939876, "flos": 27234042180480.0, "grad_norm": 1.588579582419138, "language_loss": 0.71312588, "learning_rate": 2.4133592791618348e-06, "loss": 0.73484826, "num_input_tokens_seen": 154745945, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75, "step": 7215, "time_per_iteration": 2.6054303646087646 }, { "auxiliary_loss_clip": 0.01138772, "auxiliary_loss_mlp": 0.01035302, "balance_loss_clip": 1.02201653, "balance_loss_mlp": 1.04438603, "epoch": 0.43384939125206673, "flos": 15961108700160.0, "grad_norm": 1.8515839920725277, "language_loss": 0.74808824, "learning_rate": 2.4129896071469697e-06, "loss": 0.76982892, "num_input_tokens_seen": 154763580, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.765625, "step": 7216, "time_per_iteration": 2.6328234672546387 }, { "auxiliary_loss_clip": 0.01147984, "auxiliary_loss_mlp": 0.01040376, "balance_loss_clip": 1.02415729, "balance_loss_mlp": 1.04601908, "epoch": 0.4339095145047347, "flos": 21793216354560.0, "grad_norm": 2.6797529004453913, "language_loss": 0.75690722, "learning_rate": 2.412619920393568e-06, "loss": 0.77879083, "num_input_tokens_seen": 154776825, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.83984375, "step": 7217, "time_per_iteration": 2.545609951019287 }, { "auxiliary_loss_clip": 0.01149336, "auxiliary_loss_mlp": 0.01035318, "balance_loss_clip": 1.02110875, "balance_loss_mlp": 1.04388857, "epoch": 0.43396963775740266, "flos": 14209853109120.0, "grad_norm": 1.900036914129583, "language_loss": 0.73691082, "learning_rate": 2.4122502189148225e-06, "loss": 0.75875735, "num_input_tokens_seen": 154794025, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 7218, "time_per_iteration": 2.673799753189087 }, { "auxiliary_loss_clip": 0.01142955, "auxiliary_loss_mlp": 0.01032253, "balance_loss_clip": 1.01789474, "balance_loss_mlp": 1.04472864, "epoch": 0.4340297610100706, "flos": 19719052473600.0, "grad_norm": 1.7425995480672625, "language_loss": 0.68706691, "learning_rate": 2.4118805027239277e-06, "loss": 0.70881903, "num_input_tokens_seen": 154813105, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.796875, "step": 7219, "time_per_iteration": 2.5355045795440674 }, { "auxiliary_loss_clip": 0.01141849, "auxiliary_loss_mlp": 0.01034366, "balance_loss_clip": 1.02071011, "balance_loss_mlp": 1.04563951, "epoch": 0.4340898842627386, "flos": 18953508885120.0, "grad_norm": 1.9519402751604427, "language_loss": 0.77317226, "learning_rate": 2.411510771834077e-06, "loss": 0.79493439, "num_input_tokens_seen": 154833525, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 7220, "time_per_iteration": 2.707818031311035 }, { "auxiliary_loss_clip": 0.01166509, "auxiliary_loss_mlp": 0.01034144, "balance_loss_clip": 1.02020264, "balance_loss_mlp": 1.04336119, "epoch": 0.4341500075154066, "flos": 22018304931840.0, "grad_norm": 6.805267242988512, "language_loss": 0.69092131, "learning_rate": 2.411141026258466e-06, "loss": 0.71292782, "num_input_tokens_seen": 154853090, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 7221, "time_per_iteration": 2.673790693283081 }, { "auxiliary_loss_clip": 0.0113414, "auxiliary_loss_mlp": 0.01035685, "balance_loss_clip": 1.02214253, "balance_loss_mlp": 1.04462695, "epoch": 0.4342101307680746, "flos": 23582465556480.0, "grad_norm": 1.7169620383550852, "language_loss": 0.65251315, "learning_rate": 2.4107712660102885e-06, "loss": 0.67421138, "num_input_tokens_seen": 154872055, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.8046875, "step": 7222, "time_per_iteration": 2.6249916553497314 }, { "auxiliary_loss_clip": 0.01146905, "auxiliary_loss_mlp": 0.01033276, "balance_loss_clip": 1.01806486, "balance_loss_mlp": 1.04195023, "epoch": 0.43427025402074254, "flos": 17967976450560.0, "grad_norm": 2.128800847004995, "language_loss": 0.72863448, "learning_rate": 2.410401491102741e-06, "loss": 0.75043625, "num_input_tokens_seen": 154886645, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78515625, "step": 7223, "time_per_iteration": 2.549334764480591 }, { "auxiliary_loss_clip": 0.01125635, "auxiliary_loss_mlp": 0.01032392, "balance_loss_clip": 1.01776528, "balance_loss_mlp": 1.04356241, "epoch": 0.4343303772734105, "flos": 26286395616000.0, "grad_norm": 2.102130353547188, "language_loss": 0.94072264, "learning_rate": 2.4100317015490204e-06, "loss": 0.96230292, "num_input_tokens_seen": 154906775, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8203125, "step": 7224, "time_per_iteration": 2.615765333175659 }, { "auxiliary_loss_clip": 0.01147866, "auxiliary_loss_mlp": 0.01035728, "balance_loss_clip": 1.02196562, "balance_loss_mlp": 1.04439509, "epoch": 0.43439050052607847, "flos": 26833961520000.0, "grad_norm": 1.448778109843365, "language_loss": 0.61095047, "learning_rate": 2.4096618973623227e-06, "loss": 0.63278639, "num_input_tokens_seen": 154926990, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.765625, "step": 7225, "time_per_iteration": 2.5992140769958496 }, { "auxiliary_loss_clip": 0.0114273, "auxiliary_loss_mlp": 0.01285438, "balance_loss_clip": 1.02403998, "balance_loss_mlp": 1.04550529, "epoch": 0.43445062377874644, "flos": 21397660807680.0, "grad_norm": 1.8053061298204927, "language_loss": 0.77501106, "learning_rate": 2.4092920785558465e-06, "loss": 0.79929268, "num_input_tokens_seen": 154946210, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 7226, "time_per_iteration": 2.57965350151062 }, { "auxiliary_loss_clip": 0.01131026, "auxiliary_loss_mlp": 0.01032585, "balance_loss_clip": 1.01864386, "balance_loss_mlp": 1.04523051, "epoch": 0.4345107470314144, "flos": 19901945548800.0, "grad_norm": 1.5394493396315931, "language_loss": 0.84920502, "learning_rate": 2.408922245142788e-06, "loss": 0.87084115, "num_input_tokens_seen": 154964995, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.76953125, "step": 7227, "time_per_iteration": 2.6219911575317383 }, { "auxiliary_loss_clip": 0.01131793, "auxiliary_loss_mlp": 0.01037606, "balance_loss_clip": 1.02331257, "balance_loss_mlp": 1.04486275, "epoch": 0.43457087028408237, "flos": 26432623883520.0, "grad_norm": 1.8612733109353612, "language_loss": 0.76552868, "learning_rate": 2.408552397136347e-06, "loss": 0.78722262, "num_input_tokens_seen": 154984775, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 7228, "time_per_iteration": 2.6410973072052 }, { "auxiliary_loss_clip": 0.01157575, "auxiliary_loss_mlp": 0.01038803, "balance_loss_clip": 1.02385402, "balance_loss_mlp": 1.0440892, "epoch": 0.43463099353675033, "flos": 31868816855040.0, "grad_norm": 1.504102478458522, "language_loss": 0.80293036, "learning_rate": 2.408182534549722e-06, "loss": 0.82489413, "num_input_tokens_seen": 155008125, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.78125, "step": 7229, "time_per_iteration": 2.7383108139038086 }, { "auxiliary_loss_clip": 0.01133439, "auxiliary_loss_mlp": 0.01042437, "balance_loss_clip": 1.02800047, "balance_loss_mlp": 1.04400826, "epoch": 0.4346911167894183, "flos": 24571266128640.0, "grad_norm": 1.9443137931684817, "language_loss": 0.82465041, "learning_rate": 2.4078126573961117e-06, "loss": 0.8464092, "num_input_tokens_seen": 155027885, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 7230, "time_per_iteration": 2.5915913581848145 }, { "auxiliary_loss_clip": 0.01137256, "auxiliary_loss_mlp": 0.01042981, "balance_loss_clip": 1.02815151, "balance_loss_mlp": 1.04614651, "epoch": 0.43475124004208626, "flos": 17090678672640.0, "grad_norm": 2.562151098460921, "language_loss": 0.77329451, "learning_rate": 2.407442765688717e-06, "loss": 0.79509687, "num_input_tokens_seen": 155043375, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 7231, "time_per_iteration": 2.624892473220825 }, { "auxiliary_loss_clip": 0.0115061, "auxiliary_loss_mlp": 0.01032626, "balance_loss_clip": 1.01975727, "balance_loss_mlp": 1.04604292, "epoch": 0.4348113632947542, "flos": 26104615862400.0, "grad_norm": 1.5132575619063104, "language_loss": 0.68535101, "learning_rate": 2.407072859440738e-06, "loss": 0.70718336, "num_input_tokens_seen": 155062930, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7734375, "step": 7232, "time_per_iteration": 2.6052637100219727 }, { "auxiliary_loss_clip": 0.01152109, "auxiliary_loss_mlp": 0.01035832, "balance_loss_clip": 1.02069855, "balance_loss_mlp": 1.04354274, "epoch": 0.4348714865474222, "flos": 34200496316160.0, "grad_norm": 4.863734655867178, "language_loss": 0.72122502, "learning_rate": 2.4067029386653758e-06, "loss": 0.74310446, "num_input_tokens_seen": 155084980, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8125, "step": 7233, "time_per_iteration": 2.7985024452209473 }, { "auxiliary_loss_clip": 0.01170313, "auxiliary_loss_mlp": 0.01041157, "balance_loss_clip": 1.02546883, "balance_loss_mlp": 1.04444671, "epoch": 0.43493160980009016, "flos": 31537468869120.0, "grad_norm": 1.5633689246622764, "language_loss": 0.74291295, "learning_rate": 2.4063330033758316e-06, "loss": 0.76502764, "num_input_tokens_seen": 155107260, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 7234, "time_per_iteration": 2.680040121078491 }, { "auxiliary_loss_clip": 0.0114759, "auxiliary_loss_mlp": 0.0104484, "balance_loss_clip": 1.02935433, "balance_loss_mlp": 1.04474735, "epoch": 0.4349917330527582, "flos": 24061334699520.0, "grad_norm": 2.2341308939512716, "language_loss": 0.58739567, "learning_rate": 2.4059630535853074e-06, "loss": 0.60931998, "num_input_tokens_seen": 155126720, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84765625, "step": 7235, "time_per_iteration": 2.723356246948242 }, { "auxiliary_loss_clip": 0.01159998, "auxiliary_loss_mlp": 0.010365, "balance_loss_clip": 1.0221709, "balance_loss_mlp": 1.04296875, "epoch": 0.43505185630542614, "flos": 30519329863680.0, "grad_norm": 1.9187161251165963, "language_loss": 0.77549964, "learning_rate": 2.4055930893070076e-06, "loss": 0.79746461, "num_input_tokens_seen": 155148640, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8046875, "step": 7236, "time_per_iteration": 2.6676552295684814 }, { "auxiliary_loss_clip": 0.01119329, "auxiliary_loss_mlp": 0.01038116, "balance_loss_clip": 1.02335763, "balance_loss_mlp": 1.04247665, "epoch": 0.4351119795580941, "flos": 15735158196480.0, "grad_norm": 1.6160163881965315, "language_loss": 0.81517267, "learning_rate": 2.405223110554133e-06, "loss": 0.83674717, "num_input_tokens_seen": 155165870, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.76953125, "step": 7237, "time_per_iteration": 2.6008286476135254 }, { "auxiliary_loss_clip": 0.01131721, "auxiliary_loss_mlp": 0.0103662, "balance_loss_clip": 1.02262497, "balance_loss_mlp": 1.04409814, "epoch": 0.4351721028107621, "flos": 18731760272640.0, "grad_norm": 1.7095696900672008, "language_loss": 0.63237441, "learning_rate": 2.4048531173398873e-06, "loss": 0.6540578, "num_input_tokens_seen": 155185315, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7890625, "step": 7238, "time_per_iteration": 2.5600335597991943 }, { "auxiliary_loss_clip": 0.01146166, "auxiliary_loss_mlp": 0.01287971, "balance_loss_clip": 1.0269196, "balance_loss_mlp": 1.04235458, "epoch": 0.43523222606343004, "flos": 25226887121280.0, "grad_norm": 1.7987759809190087, "language_loss": 0.85776508, "learning_rate": 2.4044831096774756e-06, "loss": 0.88210642, "num_input_tokens_seen": 155205790, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7734375, "step": 7239, "time_per_iteration": 2.674375534057617 }, { "auxiliary_loss_clip": 0.01140297, "auxiliary_loss_mlp": 0.01032783, "balance_loss_clip": 1.01826882, "balance_loss_mlp": 1.04323804, "epoch": 0.435292349316098, "flos": 22709190101760.0, "grad_norm": 1.6000498482874979, "language_loss": 0.72404319, "learning_rate": 2.4041130875801025e-06, "loss": 0.74577403, "num_input_tokens_seen": 155226475, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 7240, "time_per_iteration": 2.651607036590576 }, { "auxiliary_loss_clip": 0.01153191, "auxiliary_loss_mlp": 0.01032819, "balance_loss_clip": 1.0182035, "balance_loss_mlp": 1.04538298, "epoch": 0.43535247256876597, "flos": 25775889569280.0, "grad_norm": 7.989868474199656, "language_loss": 0.81746566, "learning_rate": 2.4037430510609728e-06, "loss": 0.83932579, "num_input_tokens_seen": 155247110, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 7241, "time_per_iteration": 2.671247720718384 }, { "auxiliary_loss_clip": 0.01156645, "auxiliary_loss_mlp": 0.01288348, "balance_loss_clip": 1.02530777, "balance_loss_mlp": 1.0448935, "epoch": 0.43541259582143393, "flos": 17528142412800.0, "grad_norm": 2.3911868301114887, "language_loss": 0.79095155, "learning_rate": 2.4033730001332917e-06, "loss": 0.81540149, "num_input_tokens_seen": 155261335, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 7242, "time_per_iteration": 3.9694812297821045 }, { "auxiliary_loss_clip": 0.01142504, "auxiliary_loss_mlp": 0.01032019, "balance_loss_clip": 1.01684368, "balance_loss_mlp": 1.04227507, "epoch": 0.4354727190741019, "flos": 15195205975680.0, "grad_norm": 1.8153026217602164, "language_loss": 0.68060225, "learning_rate": 2.4030029348102657e-06, "loss": 0.70234752, "num_input_tokens_seen": 155278510, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.82421875, "step": 7243, "time_per_iteration": 2.562150239944458 }, { "auxiliary_loss_clip": 0.01157707, "auxiliary_loss_mlp": 0.0103473, "balance_loss_clip": 1.02121806, "balance_loss_mlp": 1.04465389, "epoch": 0.43553284232676986, "flos": 16649264436480.0, "grad_norm": 1.6419568345071471, "language_loss": 0.81038117, "learning_rate": 2.4026328551051023e-06, "loss": 0.83230555, "num_input_tokens_seen": 155296450, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.765625, "step": 7244, "time_per_iteration": 2.7426834106445312 }, { "auxiliary_loss_clip": 0.01169653, "auxiliary_loss_mlp": 0.01030353, "balance_loss_clip": 1.01663184, "balance_loss_mlp": 1.04498792, "epoch": 0.43559296557943783, "flos": 23400865370880.0, "grad_norm": 1.7994874900386288, "language_loss": 0.734577, "learning_rate": 2.4022627610310075e-06, "loss": 0.75657707, "num_input_tokens_seen": 155316080, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.796875, "step": 7245, "time_per_iteration": 2.5724220275878906 }, { "auxiliary_loss_clip": 0.0115886, "auxiliary_loss_mlp": 0.01039874, "balance_loss_clip": 1.02630854, "balance_loss_mlp": 1.04351616, "epoch": 0.4356530888321058, "flos": 22419067950720.0, "grad_norm": 1.5955533796686985, "language_loss": 0.76647723, "learning_rate": 2.4018926526011895e-06, "loss": 0.78846461, "num_input_tokens_seen": 155336765, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.79296875, "step": 7246, "time_per_iteration": 2.6226389408111572 }, { "auxiliary_loss_clip": 0.01134072, "auxiliary_loss_mlp": 0.01042551, "balance_loss_clip": 1.02787614, "balance_loss_mlp": 1.04432154, "epoch": 0.43571321208477376, "flos": 21616141282560.0, "grad_norm": 2.0711470014623985, "language_loss": 0.85411549, "learning_rate": 2.4015225298288566e-06, "loss": 0.87588173, "num_input_tokens_seen": 155356440, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80859375, "step": 7247, "time_per_iteration": 3.981818675994873 }, { "auxiliary_loss_clip": 0.01124566, "auxiliary_loss_mlp": 0.01038232, "balance_loss_clip": 1.02417731, "balance_loss_mlp": 1.04476726, "epoch": 0.4357733353374418, "flos": 23987358639360.0, "grad_norm": 1.6217836989626184, "language_loss": 0.72447062, "learning_rate": 2.4011523927272177e-06, "loss": 0.74609864, "num_input_tokens_seen": 155377070, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 7248, "time_per_iteration": 2.6241776943206787 }, { "auxiliary_loss_clip": 0.01132522, "auxiliary_loss_mlp": 0.0103467, "balance_loss_clip": 1.02063274, "balance_loss_mlp": 1.04448438, "epoch": 0.43583345859010975, "flos": 25264737077760.0, "grad_norm": 1.6408738971165124, "language_loss": 0.87322199, "learning_rate": 2.4007822413094815e-06, "loss": 0.89489388, "num_input_tokens_seen": 155398415, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 7249, "time_per_iteration": 2.660248279571533 }, { "auxiliary_loss_clip": 0.01143555, "auxiliary_loss_mlp": 0.01045363, "balance_loss_clip": 1.03148675, "balance_loss_mlp": 1.04403114, "epoch": 0.4358935818427777, "flos": 23696302734720.0, "grad_norm": 1.9022212007368609, "language_loss": 0.81521314, "learning_rate": 2.400412075588858e-06, "loss": 0.83710229, "num_input_tokens_seen": 155415625, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.81640625, "step": 7250, "time_per_iteration": 4.16265082359314 }, { "auxiliary_loss_clip": 0.01136766, "auxiliary_loss_mlp": 0.01037911, "balance_loss_clip": 1.02340364, "balance_loss_mlp": 1.04775071, "epoch": 0.4359537050954457, "flos": 29532827761920.0, "grad_norm": 1.7125939474266147, "language_loss": 0.84400773, "learning_rate": 2.400041895578558e-06, "loss": 0.86575449, "num_input_tokens_seen": 155435505, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80078125, "step": 7251, "time_per_iteration": 2.698794364929199 }, { "auxiliary_loss_clip": 0.01129167, "auxiliary_loss_mlp": 0.01040754, "balance_loss_clip": 1.02601969, "balance_loss_mlp": 1.04276133, "epoch": 0.43601382834811364, "flos": 22711273090560.0, "grad_norm": 1.4929690263045228, "language_loss": 0.68979388, "learning_rate": 2.3996717012917912e-06, "loss": 0.71149302, "num_input_tokens_seen": 155455425, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7734375, "step": 7252, "time_per_iteration": 2.659529685974121 }, { "auxiliary_loss_clip": 0.01138729, "auxiliary_loss_mlp": 0.0103484, "balance_loss_clip": 1.02044523, "balance_loss_mlp": 1.04384756, "epoch": 0.4360739516007816, "flos": 19098731571840.0, "grad_norm": 1.5828715584690436, "language_loss": 0.83705795, "learning_rate": 2.3993014927417704e-06, "loss": 0.85879368, "num_input_tokens_seen": 155474250, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.76953125, "step": 7253, "time_per_iteration": 4.103534460067749 }, { "auxiliary_loss_clip": 0.0112927, "auxiliary_loss_mlp": 0.01036738, "balance_loss_clip": 1.02236748, "balance_loss_mlp": 1.0430485, "epoch": 0.43613407485344957, "flos": 23404420817280.0, "grad_norm": 1.5104776533755024, "language_loss": 0.70164859, "learning_rate": 2.3989312699417057e-06, "loss": 0.72330868, "num_input_tokens_seen": 155494685, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.77734375, "step": 7254, "time_per_iteration": 2.6336770057678223 }, { "auxiliary_loss_clip": 0.0114117, "auxiliary_loss_mlp": 0.01037281, "balance_loss_clip": 1.02349496, "balance_loss_mlp": 1.04571176, "epoch": 0.43619419810611754, "flos": 22637799820800.0, "grad_norm": 1.880741715169981, "language_loss": 0.81245327, "learning_rate": 2.39856103290481e-06, "loss": 0.83423775, "num_input_tokens_seen": 155513040, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.77734375, "step": 7255, "time_per_iteration": 2.6133363246917725 }, { "auxiliary_loss_clip": 0.01122823, "auxiliary_loss_mlp": 0.01035876, "balance_loss_clip": 1.02228606, "balance_loss_mlp": 1.04350531, "epoch": 0.4362543213587855, "flos": 20047958334720.0, "grad_norm": 1.7787062140995318, "language_loss": 0.77508897, "learning_rate": 2.398190781644296e-06, "loss": 0.79667592, "num_input_tokens_seen": 155530100, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.79296875, "step": 7256, "time_per_iteration": 2.549206256866455 }, { "auxiliary_loss_clip": 0.01121878, "auxiliary_loss_mlp": 0.01036328, "balance_loss_clip": 1.0222733, "balance_loss_mlp": 1.04450822, "epoch": 0.43631444461145347, "flos": 21361319222400.0, "grad_norm": 1.6852271629135869, "language_loss": 0.76203138, "learning_rate": 2.397820516173378e-06, "loss": 0.78361344, "num_input_tokens_seen": 155549375, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 7257, "time_per_iteration": 2.6405577659606934 }, { "auxiliary_loss_clip": 0.01130359, "auxiliary_loss_mlp": 0.01039047, "balance_loss_clip": 1.024158, "balance_loss_mlp": 1.04198074, "epoch": 0.43637456786412143, "flos": 22418529246720.0, "grad_norm": 1.7862865141281072, "language_loss": 0.73007143, "learning_rate": 2.3974502365052685e-06, "loss": 0.75176549, "num_input_tokens_seen": 155569395, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.79296875, "step": 7258, "time_per_iteration": 2.6084299087524414 }, { "auxiliary_loss_clip": 0.01140603, "auxiliary_loss_mlp": 0.01035961, "balance_loss_clip": 1.02190018, "balance_loss_mlp": 1.04321837, "epoch": 0.4364346911167894, "flos": 28548839612160.0, "grad_norm": 1.7100018658923892, "language_loss": 0.76605296, "learning_rate": 2.3970799426531833e-06, "loss": 0.78781861, "num_input_tokens_seen": 155589090, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8046875, "step": 7259, "time_per_iteration": 2.6728713512420654 }, { "auxiliary_loss_clip": 0.0106157, "auxiliary_loss_mlp": 0.01003899, "balance_loss_clip": 1.00209868, "balance_loss_mlp": 1.01936698, "epoch": 0.43649481436945736, "flos": 62659345380480.0, "grad_norm": 0.7594711744617959, "language_loss": 0.56929451, "learning_rate": 2.396709634630335e-06, "loss": 0.58994919, "num_input_tokens_seen": 155648660, "router_z_loss_clip": 0.01794434, "router_z_loss_mlp": 0.2421875, "step": 7260, "time_per_iteration": 3.1391615867614746 }, { "auxiliary_loss_clip": 0.01141536, "auxiliary_loss_mlp": 0.01046676, "balance_loss_clip": 1.03143501, "balance_loss_mlp": 1.0428288, "epoch": 0.4365549376221254, "flos": 30592120775040.0, "grad_norm": 2.516359566286029, "language_loss": 0.70736289, "learning_rate": 2.3963393124499415e-06, "loss": 0.72924501, "num_input_tokens_seen": 155669945, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80859375, "step": 7261, "time_per_iteration": 2.6901724338531494 }, { "auxiliary_loss_clip": 0.01157065, "auxiliary_loss_mlp": 0.01042324, "balance_loss_clip": 1.02759552, "balance_loss_mlp": 1.04240489, "epoch": 0.43661506087479335, "flos": 17165875795200.0, "grad_norm": 1.8773288309973952, "language_loss": 0.69281113, "learning_rate": 2.395968976125217e-06, "loss": 0.71480501, "num_input_tokens_seen": 155688555, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.79296875, "step": 7262, "time_per_iteration": 2.581031560897827 }, { "auxiliary_loss_clip": 0.01160562, "auxiliary_loss_mlp": 0.01031815, "balance_loss_clip": 1.01910686, "balance_loss_mlp": 1.04214239, "epoch": 0.4366751841274613, "flos": 22047499710720.0, "grad_norm": 1.716523775001452, "language_loss": 0.79775512, "learning_rate": 2.3955986256693783e-06, "loss": 0.8196789, "num_input_tokens_seen": 155705370, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7421875, "step": 7263, "time_per_iteration": 2.68082857131958 }, { "auxiliary_loss_clip": 0.01149664, "auxiliary_loss_mlp": 0.01043029, "balance_loss_clip": 1.02846205, "balance_loss_mlp": 1.04424071, "epoch": 0.4367353073801293, "flos": 15997306631040.0, "grad_norm": 1.7413954056106031, "language_loss": 0.75227147, "learning_rate": 2.3952282610956426e-06, "loss": 0.77419841, "num_input_tokens_seen": 155721890, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 7264, "time_per_iteration": 2.6181821823120117 }, { "auxiliary_loss_clip": 0.01154646, "auxiliary_loss_mlp": 0.01034875, "balance_loss_clip": 1.02011681, "balance_loss_mlp": 1.04137087, "epoch": 0.43679543063279724, "flos": 38217535868160.0, "grad_norm": 2.0743314375699375, "language_loss": 0.61563486, "learning_rate": 2.3948578824172264e-06, "loss": 0.63753009, "num_input_tokens_seen": 155743970, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7734375, "step": 7265, "time_per_iteration": 2.794130802154541 }, { "auxiliary_loss_clip": 0.01135323, "auxiliary_loss_mlp": 0.01030632, "balance_loss_clip": 1.01703572, "balance_loss_mlp": 1.04093504, "epoch": 0.4368555538854652, "flos": 15193230727680.0, "grad_norm": 2.1275635121726637, "language_loss": 0.72479415, "learning_rate": 2.394487489647349e-06, "loss": 0.74645364, "num_input_tokens_seen": 155761830, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.765625, "step": 7266, "time_per_iteration": 2.534963607788086 }, { "auxiliary_loss_clip": 0.01140326, "auxiliary_loss_mlp": 0.01036812, "balance_loss_clip": 1.02217293, "balance_loss_mlp": 1.04326844, "epoch": 0.4369156771381332, "flos": 23069086421760.0, "grad_norm": 2.334793873412973, "language_loss": 0.82301772, "learning_rate": 2.3941170827992264e-06, "loss": 0.84478909, "num_input_tokens_seen": 155779610, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 7267, "time_per_iteration": 2.6310782432556152 }, { "auxiliary_loss_clip": 0.01118609, "auxiliary_loss_mlp": 0.01030443, "balance_loss_clip": 1.01704395, "balance_loss_mlp": 1.0416801, "epoch": 0.43697580039080114, "flos": 23441085624960.0, "grad_norm": 1.6975750864771117, "language_loss": 0.74248171, "learning_rate": 2.39374666188608e-06, "loss": 0.76397216, "num_input_tokens_seen": 155798765, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.76953125, "step": 7268, "time_per_iteration": 2.4876625537872314 }, { "auxiliary_loss_clip": 0.01126129, "auxiliary_loss_mlp": 0.0103839, "balance_loss_clip": 1.0232271, "balance_loss_mlp": 1.04352534, "epoch": 0.4370359236434691, "flos": 18514680428160.0, "grad_norm": 2.088665776536991, "language_loss": 0.79604518, "learning_rate": 2.3933762269211273e-06, "loss": 0.81769037, "num_input_tokens_seen": 155817750, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.82421875, "step": 7269, "time_per_iteration": 2.482553243637085 }, { "auxiliary_loss_clip": 0.01139106, "auxiliary_loss_mlp": 0.0103561, "balance_loss_clip": 1.02194905, "balance_loss_mlp": 1.04397297, "epoch": 0.43709604689613707, "flos": 23222497409280.0, "grad_norm": 1.7076197590176614, "language_loss": 0.74799466, "learning_rate": 2.3930057779175894e-06, "loss": 0.76974177, "num_input_tokens_seen": 155836490, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7734375, "step": 7270, "time_per_iteration": 2.550898790359497 }, { "auxiliary_loss_clip": 0.01165923, "auxiliary_loss_mlp": 0.01032376, "balance_loss_clip": 1.01844084, "balance_loss_mlp": 1.0432086, "epoch": 0.43715617014880503, "flos": 23803711378560.0, "grad_norm": 1.7243579124881325, "language_loss": 0.79425663, "learning_rate": 2.3926353148886864e-06, "loss": 0.8162396, "num_input_tokens_seen": 155856225, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 7271, "time_per_iteration": 2.606570243835449 }, { "auxiliary_loss_clip": 0.01052859, "auxiliary_loss_mlp": 0.01000716, "balance_loss_clip": 0.99884415, "balance_loss_mlp": 1.01946092, "epoch": 0.437216293401473, "flos": 61941204766080.0, "grad_norm": 0.6993753513409942, "language_loss": 0.54882801, "learning_rate": 2.3922648378476388e-06, "loss": 0.56936371, "num_input_tokens_seen": 155916770, "router_z_loss_clip": 0.01867676, "router_z_loss_mlp": 0.24707031, "step": 7272, "time_per_iteration": 3.2026453018188477 }, { "auxiliary_loss_clip": 0.01123331, "auxiliary_loss_mlp": 0.01043011, "balance_loss_clip": 1.02829516, "balance_loss_mlp": 1.04274154, "epoch": 0.43727641665414096, "flos": 21982250655360.0, "grad_norm": 1.8600409060218925, "language_loss": 0.70083773, "learning_rate": 2.391894346807668e-06, "loss": 0.72250116, "num_input_tokens_seen": 155936490, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8046875, "step": 7273, "time_per_iteration": 2.554417848587036 }, { "auxiliary_loss_clip": 0.01140358, "auxiliary_loss_mlp": 0.010376, "balance_loss_clip": 1.02327728, "balance_loss_mlp": 1.04164004, "epoch": 0.437336539906809, "flos": 39530860842240.0, "grad_norm": 3.98405142681104, "language_loss": 0.75500178, "learning_rate": 2.391523841781996e-06, "loss": 0.77678144, "num_input_tokens_seen": 155957595, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.80859375, "step": 7274, "time_per_iteration": 2.800269603729248 }, { "auxiliary_loss_clip": 0.01126746, "auxiliary_loss_mlp": 0.01032531, "balance_loss_clip": 1.02022231, "balance_loss_mlp": 1.03975654, "epoch": 0.43739666315947695, "flos": 17457147181440.0, "grad_norm": 1.7308403855858472, "language_loss": 0.80359364, "learning_rate": 2.3911533227838455e-06, "loss": 0.82518637, "num_input_tokens_seen": 155975710, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.77734375, "step": 7275, "time_per_iteration": 2.5354037284851074 }, { "auxiliary_loss_clip": 0.01141716, "auxiliary_loss_mlp": 0.01036095, "balance_loss_clip": 1.02268386, "balance_loss_mlp": 1.04509974, "epoch": 0.4374567864121449, "flos": 16358747235840.0, "grad_norm": 2.0098217756074255, "language_loss": 0.80522573, "learning_rate": 2.390782789826439e-06, "loss": 0.82700384, "num_input_tokens_seen": 155993090, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7890625, "step": 7276, "time_per_iteration": 2.6236484050750732 }, { "auxiliary_loss_clip": 0.01146157, "auxiliary_loss_mlp": 0.01034264, "balance_loss_clip": 1.02008986, "balance_loss_mlp": 1.04421604, "epoch": 0.4375169096648129, "flos": 30587523834240.0, "grad_norm": 1.8054818775435, "language_loss": 0.73109627, "learning_rate": 2.3904122429229997e-06, "loss": 0.75290048, "num_input_tokens_seen": 156013685, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.8359375, "step": 7277, "time_per_iteration": 2.6304335594177246 }, { "auxiliary_loss_clip": 0.0113917, "auxiliary_loss_mlp": 0.01284925, "balance_loss_clip": 1.02335787, "balance_loss_mlp": 1.04321432, "epoch": 0.43757703291748085, "flos": 30555599621760.0, "grad_norm": 1.967240605868889, "language_loss": 0.72941077, "learning_rate": 2.390041682086752e-06, "loss": 0.75365168, "num_input_tokens_seen": 156034300, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78515625, "step": 7278, "time_per_iteration": 2.737119674682617 }, { "auxiliary_loss_clip": 0.01146791, "auxiliary_loss_mlp": 0.01039145, "balance_loss_clip": 1.02593029, "balance_loss_mlp": 1.04290986, "epoch": 0.4376371561701488, "flos": 21397373498880.0, "grad_norm": 1.5492652848610649, "language_loss": 0.66241789, "learning_rate": 2.3896711073309193e-06, "loss": 0.6842773, "num_input_tokens_seen": 156053805, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.76953125, "step": 7279, "time_per_iteration": 2.5535225868225098 }, { "auxiliary_loss_clip": 0.01127431, "auxiliary_loss_mlp": 0.0103859, "balance_loss_clip": 1.02432656, "balance_loss_mlp": 1.04471421, "epoch": 0.4376972794228168, "flos": 23404384903680.0, "grad_norm": 1.848441040901178, "language_loss": 0.81442398, "learning_rate": 2.389300518668728e-06, "loss": 0.83608425, "num_input_tokens_seen": 156073295, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.828125, "step": 7280, "time_per_iteration": 2.610663652420044 }, { "auxiliary_loss_clip": 0.01135673, "auxiliary_loss_mlp": 0.0103288, "balance_loss_clip": 1.01969504, "balance_loss_mlp": 1.04146945, "epoch": 0.43775740267548474, "flos": 22892945103360.0, "grad_norm": 1.4572212521101604, "language_loss": 0.772241, "learning_rate": 2.3889299161134027e-06, "loss": 0.79392654, "num_input_tokens_seen": 156094540, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.765625, "step": 7281, "time_per_iteration": 2.591938018798828 }, { "auxiliary_loss_clip": 0.01137994, "auxiliary_loss_mlp": 0.01040602, "balance_loss_clip": 1.02721524, "balance_loss_mlp": 1.04205203, "epoch": 0.4378175259281527, "flos": 23294390480640.0, "grad_norm": 1.9930042143003692, "language_loss": 0.75656581, "learning_rate": 2.3885592996781686e-06, "loss": 0.77835178, "num_input_tokens_seen": 156114070, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.78125, "step": 7282, "time_per_iteration": 2.6375417709350586 }, { "auxiliary_loss_clip": 0.01148154, "auxiliary_loss_mlp": 0.01037742, "balance_loss_clip": 1.02381277, "balance_loss_mlp": 1.04262972, "epoch": 0.43787764918082067, "flos": 23876897339520.0, "grad_norm": 2.1217446002118403, "language_loss": 0.84598398, "learning_rate": 2.388188669376253e-06, "loss": 0.86784291, "num_input_tokens_seen": 156132130, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7890625, "step": 7283, "time_per_iteration": 2.551450490951538 }, { "auxiliary_loss_clip": 0.01143178, "auxiliary_loss_mlp": 0.01034577, "balance_loss_clip": 1.0200814, "balance_loss_mlp": 1.04260564, "epoch": 0.43793777243348864, "flos": 23988148738560.0, "grad_norm": 1.9819556514182692, "language_loss": 0.80342311, "learning_rate": 2.3878180252208815e-06, "loss": 0.82520062, "num_input_tokens_seen": 156150820, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.82421875, "step": 7284, "time_per_iteration": 3.9473612308502197 }, { "auxiliary_loss_clip": 0.01128948, "auxiliary_loss_mlp": 0.01039613, "balance_loss_clip": 1.02680993, "balance_loss_mlp": 1.04204249, "epoch": 0.4379978956861566, "flos": 18624064320000.0, "grad_norm": 2.9194936292697853, "language_loss": 0.80263978, "learning_rate": 2.3874473672252834e-06, "loss": 0.82432538, "num_input_tokens_seen": 156170125, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.78125, "step": 7285, "time_per_iteration": 2.607231855392456 }, { "auxiliary_loss_clip": 0.01125599, "auxiliary_loss_mlp": 0.01034437, "balance_loss_clip": 1.02119303, "balance_loss_mlp": 1.03983629, "epoch": 0.43805801893882457, "flos": 21981388728960.0, "grad_norm": 1.9400355108130933, "language_loss": 0.748806, "learning_rate": 2.387076695402685e-06, "loss": 0.77040637, "num_input_tokens_seen": 156187320, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.76953125, "step": 7286, "time_per_iteration": 2.5579962730407715 }, { "auxiliary_loss_clip": 0.01148212, "auxiliary_loss_mlp": 0.01030703, "balance_loss_clip": 1.01687503, "balance_loss_mlp": 1.04125309, "epoch": 0.43811814219149253, "flos": 26393337383040.0, "grad_norm": 1.4439659469040218, "language_loss": 0.73701102, "learning_rate": 2.386706009766314e-06, "loss": 0.75880015, "num_input_tokens_seen": 156207455, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8046875, "step": 7287, "time_per_iteration": 2.7133336067199707 }, { "auxiliary_loss_clip": 0.01140602, "auxiliary_loss_mlp": 0.0104335, "balance_loss_clip": 1.02887845, "balance_loss_mlp": 1.04294705, "epoch": 0.43817826544416055, "flos": 17493309198720.0, "grad_norm": 4.436856672268195, "language_loss": 0.82317924, "learning_rate": 2.3863353103294017e-06, "loss": 0.84501874, "num_input_tokens_seen": 156226560, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80078125, "step": 7288, "time_per_iteration": 2.55088210105896 }, { "auxiliary_loss_clip": 0.01149425, "auxiliary_loss_mlp": 0.01035317, "balance_loss_clip": 1.02127421, "balance_loss_mlp": 1.03963447, "epoch": 0.4382383886968285, "flos": 21581020759680.0, "grad_norm": 1.3312880468985584, "language_loss": 0.84074885, "learning_rate": 2.385964597105175e-06, "loss": 0.86259633, "num_input_tokens_seen": 156246740, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.828125, "step": 7289, "time_per_iteration": 4.064618110656738 }, { "auxiliary_loss_clip": 0.01143653, "auxiliary_loss_mlp": 0.01036864, "balance_loss_clip": 1.02332795, "balance_loss_mlp": 1.04207432, "epoch": 0.4382985119494965, "flos": 27923742201600.0, "grad_norm": 1.6070432911838897, "language_loss": 0.78138429, "learning_rate": 2.3855938701068647e-06, "loss": 0.8031894, "num_input_tokens_seen": 156266440, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.8359375, "step": 7290, "time_per_iteration": 2.6222918033599854 }, { "auxiliary_loss_clip": 0.01121459, "auxiliary_loss_mlp": 0.0104279, "balance_loss_clip": 1.02906919, "balance_loss_mlp": 1.04253125, "epoch": 0.43835863520216445, "flos": 24936836797440.0, "grad_norm": 1.9919185570195355, "language_loss": 0.77477103, "learning_rate": 2.385223129347701e-06, "loss": 0.79641354, "num_input_tokens_seen": 156286900, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7890625, "step": 7291, "time_per_iteration": 2.5591962337493896 }, { "auxiliary_loss_clip": 0.01149977, "auxiliary_loss_mlp": 0.01033712, "balance_loss_clip": 1.01962173, "balance_loss_mlp": 1.04387987, "epoch": 0.4384187584548324, "flos": 33510293504640.0, "grad_norm": 1.9097641168384385, "language_loss": 0.64768326, "learning_rate": 2.3848523748409153e-06, "loss": 0.66952014, "num_input_tokens_seen": 156307690, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 7292, "time_per_iteration": 4.243801832199097 }, { "auxiliary_loss_clip": 0.01130563, "auxiliary_loss_mlp": 0.0103002, "balance_loss_clip": 1.01675177, "balance_loss_mlp": 1.04058218, "epoch": 0.4384788817075004, "flos": 23951052967680.0, "grad_norm": 1.5561416425802086, "language_loss": 0.73929346, "learning_rate": 2.3844816065997385e-06, "loss": 0.76089931, "num_input_tokens_seen": 156326620, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.8125, "step": 7293, "time_per_iteration": 2.555899143218994 }, { "auxiliary_loss_clip": 0.01150148, "auxiliary_loss_mlp": 0.0104163, "balance_loss_clip": 1.02682424, "balance_loss_mlp": 1.04393888, "epoch": 0.43853900496016834, "flos": 19098516090240.0, "grad_norm": 1.9646554970054473, "language_loss": 0.78353184, "learning_rate": 2.3841108246374012e-06, "loss": 0.8054496, "num_input_tokens_seen": 156345495, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.79296875, "step": 7294, "time_per_iteration": 2.5702803134918213 }, { "auxiliary_loss_clip": 0.01150686, "auxiliary_loss_mlp": 0.01039758, "balance_loss_clip": 1.0262692, "balance_loss_mlp": 1.04412687, "epoch": 0.4385991282128363, "flos": 13225362168960.0, "grad_norm": 1.8290746856338698, "language_loss": 0.72872257, "learning_rate": 2.3837400289671376e-06, "loss": 0.75062704, "num_input_tokens_seen": 156363155, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.796875, "step": 7295, "time_per_iteration": 4.167423248291016 }, { "auxiliary_loss_clip": 0.01152357, "auxiliary_loss_mlp": 0.01044274, "balance_loss_clip": 1.02963543, "balance_loss_mlp": 1.04406881, "epoch": 0.4386592514655043, "flos": 14319883445760.0, "grad_norm": 1.8524647969634995, "language_loss": 0.74876267, "learning_rate": 2.3833692196021788e-06, "loss": 0.77072895, "num_input_tokens_seen": 156380940, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.81640625, "step": 7296, "time_per_iteration": 2.505124568939209 }, { "auxiliary_loss_clip": 0.01130684, "auxiliary_loss_mlp": 0.01033652, "balance_loss_clip": 1.01959729, "balance_loss_mlp": 1.04245734, "epoch": 0.43871937471817224, "flos": 22784423137920.0, "grad_norm": 1.5636584838560714, "language_loss": 0.69168282, "learning_rate": 2.382998396555759e-06, "loss": 0.71332622, "num_input_tokens_seen": 156400415, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.79296875, "step": 7297, "time_per_iteration": 2.646965742111206 }, { "auxiliary_loss_clip": 0.01129396, "auxiliary_loss_mlp": 0.01033008, "balance_loss_clip": 1.02000248, "balance_loss_mlp": 1.04181373, "epoch": 0.4387794979708402, "flos": 28072304853120.0, "grad_norm": 1.7921185153185848, "language_loss": 0.70170665, "learning_rate": 2.3826275598411113e-06, "loss": 0.72333068, "num_input_tokens_seen": 156421120, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.78515625, "step": 7298, "time_per_iteration": 2.576669931411743 }, { "auxiliary_loss_clip": 0.01132648, "auxiliary_loss_mlp": 0.01031215, "balance_loss_clip": 1.01729715, "balance_loss_mlp": 1.04612732, "epoch": 0.43883962122350817, "flos": 26249551240320.0, "grad_norm": 1.5870558542833941, "language_loss": 0.72687167, "learning_rate": 2.3822567094714704e-06, "loss": 0.7485103, "num_input_tokens_seen": 156441535, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 7299, "time_per_iteration": 2.608218193054199 }, { "auxiliary_loss_clip": 0.01144602, "auxiliary_loss_mlp": 0.01291591, "balance_loss_clip": 1.0285995, "balance_loss_mlp": 1.04447663, "epoch": 0.43889974447617613, "flos": 25883765089920.0, "grad_norm": 1.8785960131556263, "language_loss": 0.76877475, "learning_rate": 2.3818858454600713e-06, "loss": 0.79313672, "num_input_tokens_seen": 156462015, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8203125, "step": 7300, "time_per_iteration": 2.5924336910247803 }, { "auxiliary_loss_clip": 0.01141084, "auxiliary_loss_mlp": 0.01037543, "balance_loss_clip": 1.02263558, "balance_loss_mlp": 1.04308295, "epoch": 0.43895986772884416, "flos": 25046615738880.0, "grad_norm": 1.7228026639202851, "language_loss": 0.70447135, "learning_rate": 2.3815149678201474e-06, "loss": 0.72625756, "num_input_tokens_seen": 156482165, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 7301, "time_per_iteration": 2.6486642360687256 }, { "auxiliary_loss_clip": 0.01163745, "auxiliary_loss_mlp": 0.01040344, "balance_loss_clip": 1.02519238, "balance_loss_mlp": 1.04650068, "epoch": 0.4390199909815121, "flos": 25994585525760.0, "grad_norm": 1.9328754576296183, "language_loss": 0.70489895, "learning_rate": 2.381144076564937e-06, "loss": 0.7269398, "num_input_tokens_seen": 156503170, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.81640625, "step": 7302, "time_per_iteration": 2.6569573879241943 }, { "auxiliary_loss_clip": 0.01141276, "auxiliary_loss_mlp": 0.01043692, "balance_loss_clip": 1.03014398, "balance_loss_mlp": 1.04573393, "epoch": 0.4390801142341801, "flos": 29022249888000.0, "grad_norm": 2.0729749816679313, "language_loss": 0.82363963, "learning_rate": 2.3807731717076748e-06, "loss": 0.84548932, "num_input_tokens_seen": 156523005, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.77734375, "step": 7303, "time_per_iteration": 2.749847888946533 }, { "auxiliary_loss_clip": 0.01139556, "auxiliary_loss_mlp": 0.01043403, "balance_loss_clip": 1.02757215, "balance_loss_mlp": 1.04628396, "epoch": 0.43914023748684805, "flos": 33438544087680.0, "grad_norm": 1.9192934693614678, "language_loss": 0.6793083, "learning_rate": 2.3804022532615965e-06, "loss": 0.7011379, "num_input_tokens_seen": 156544440, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83984375, "step": 7304, "time_per_iteration": 2.7443206310272217 }, { "auxiliary_loss_clip": 0.01140591, "auxiliary_loss_mlp": 0.01286754, "balance_loss_clip": 1.02641761, "balance_loss_mlp": 1.04571283, "epoch": 0.439200360739516, "flos": 34531844302080.0, "grad_norm": 1.486655962264599, "language_loss": 0.7752651, "learning_rate": 2.3800313212399412e-06, "loss": 0.79953855, "num_input_tokens_seen": 156565410, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76953125, "step": 7305, "time_per_iteration": 2.7281112670898438 }, { "auxiliary_loss_clip": 0.01141594, "auxiliary_loss_mlp": 0.0104093, "balance_loss_clip": 1.02642202, "balance_loss_mlp": 1.04501426, "epoch": 0.439260483992184, "flos": 21907843632000.0, "grad_norm": 1.686817767367278, "language_loss": 0.69007832, "learning_rate": 2.379660375655945e-06, "loss": 0.71190357, "num_input_tokens_seen": 156584210, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 7306, "time_per_iteration": 2.7295544147491455 }, { "auxiliary_loss_clip": 0.01152013, "auxiliary_loss_mlp": 0.01035099, "balance_loss_clip": 1.0205431, "balance_loss_mlp": 1.04464555, "epoch": 0.43932060724485195, "flos": 20996430912000.0, "grad_norm": 1.4648754680013665, "language_loss": 0.6221506, "learning_rate": 2.379289416522847e-06, "loss": 0.64402175, "num_input_tokens_seen": 156602730, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8046875, "step": 7307, "time_per_iteration": 2.6547739505767822 }, { "auxiliary_loss_clip": 0.01127352, "auxiliary_loss_mlp": 0.01037178, "balance_loss_clip": 1.02331388, "balance_loss_mlp": 1.04579628, "epoch": 0.4393807304975199, "flos": 17747053850880.0, "grad_norm": 2.012104688963716, "language_loss": 0.71978176, "learning_rate": 2.378918443853886e-06, "loss": 0.741427, "num_input_tokens_seen": 156619405, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.81640625, "step": 7308, "time_per_iteration": 2.653179168701172 }, { "auxiliary_loss_clip": 0.01145751, "auxiliary_loss_mlp": 0.01043819, "balance_loss_clip": 1.02884007, "balance_loss_mlp": 1.04378057, "epoch": 0.4394408537501879, "flos": 22528523669760.0, "grad_norm": 1.988805968486148, "language_loss": 0.76747221, "learning_rate": 2.378547457662299e-06, "loss": 0.78936785, "num_input_tokens_seen": 156638165, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.84375, "step": 7309, "time_per_iteration": 2.5822865962982178 }, { "auxiliary_loss_clip": 0.01131418, "auxiliary_loss_mlp": 0.01031391, "balance_loss_clip": 1.01820016, "balance_loss_mlp": 1.04651296, "epoch": 0.43950097700285584, "flos": 23440654661760.0, "grad_norm": 1.5763698875108654, "language_loss": 0.70704496, "learning_rate": 2.378176457961328e-06, "loss": 0.72867304, "num_input_tokens_seen": 156658845, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.76171875, "step": 7310, "time_per_iteration": 2.697383165359497 }, { "auxiliary_loss_clip": 0.01145236, "auxiliary_loss_mlp": 0.01281779, "balance_loss_clip": 1.01993775, "balance_loss_mlp": 1.04616308, "epoch": 0.4395611002555238, "flos": 23180696956800.0, "grad_norm": 2.6368316031624293, "language_loss": 0.75728518, "learning_rate": 2.3778054447642124e-06, "loss": 0.78155529, "num_input_tokens_seen": 156677275, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 7311, "time_per_iteration": 2.6079819202423096 }, { "auxiliary_loss_clip": 0.01136522, "auxiliary_loss_mlp": 0.01040949, "balance_loss_clip": 1.02684069, "balance_loss_mlp": 1.04670274, "epoch": 0.43962122350819177, "flos": 22127365601280.0, "grad_norm": 2.0343872823981597, "language_loss": 0.820059, "learning_rate": 2.3774344180841917e-06, "loss": 0.84183371, "num_input_tokens_seen": 156695815, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.80859375, "step": 7312, "time_per_iteration": 2.673715591430664 }, { "auxiliary_loss_clip": 0.01149705, "auxiliary_loss_mlp": 0.01035932, "balance_loss_clip": 1.0217576, "balance_loss_mlp": 1.0440948, "epoch": 0.43968134676085974, "flos": 17420554200960.0, "grad_norm": 3.7213647467628843, "language_loss": 0.85140502, "learning_rate": 2.3770633779345074e-06, "loss": 0.87326139, "num_input_tokens_seen": 156714385, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7890625, "step": 7313, "time_per_iteration": 2.5459344387054443 }, { "auxiliary_loss_clip": 0.01142908, "auxiliary_loss_mlp": 0.01035334, "balance_loss_clip": 1.0214107, "balance_loss_mlp": 1.04348087, "epoch": 0.43974147001352776, "flos": 18952646958720.0, "grad_norm": 1.7758464334967656, "language_loss": 0.67742813, "learning_rate": 2.376692324328401e-06, "loss": 0.69921052, "num_input_tokens_seen": 156732615, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8203125, "step": 7314, "time_per_iteration": 2.604379415512085 }, { "auxiliary_loss_clip": 0.01148887, "auxiliary_loss_mlp": 0.01032776, "balance_loss_clip": 1.01818466, "balance_loss_mlp": 1.04899716, "epoch": 0.4398015932661957, "flos": 18953508885120.0, "grad_norm": 1.8027722963577415, "language_loss": 0.7681191, "learning_rate": 2.376321257279115e-06, "loss": 0.78993571, "num_input_tokens_seen": 156750920, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8203125, "step": 7315, "time_per_iteration": 2.6281726360321045 }, { "auxiliary_loss_clip": 0.01143343, "auxiliary_loss_mlp": 0.01033981, "balance_loss_clip": 1.01941323, "balance_loss_mlp": 1.04596829, "epoch": 0.4398617165188637, "flos": 24199913370240.0, "grad_norm": 2.3205923284108656, "language_loss": 0.74558103, "learning_rate": 2.375950176799891e-06, "loss": 0.76735425, "num_input_tokens_seen": 156768520, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.79296875, "step": 7316, "time_per_iteration": 2.6626944541931152 }, { "auxiliary_loss_clip": 0.0115616, "auxiliary_loss_mlp": 0.01037113, "balance_loss_clip": 1.02335036, "balance_loss_mlp": 1.04974961, "epoch": 0.43992183977153165, "flos": 22236677665920.0, "grad_norm": 2.72722685740308, "language_loss": 0.64900118, "learning_rate": 2.375579082903972e-06, "loss": 0.67093396, "num_input_tokens_seen": 156788700, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.796875, "step": 7317, "time_per_iteration": 2.689096450805664 }, { "auxiliary_loss_clip": 0.01149443, "auxiliary_loss_mlp": 0.01039289, "balance_loss_clip": 1.02490687, "balance_loss_mlp": 1.04997444, "epoch": 0.4399819630241996, "flos": 18697465762560.0, "grad_norm": 1.821064476943984, "language_loss": 0.79926205, "learning_rate": 2.3752079756046015e-06, "loss": 0.82114941, "num_input_tokens_seen": 156806470, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8125, "step": 7318, "time_per_iteration": 2.60854172706604 }, { "auxiliary_loss_clip": 0.01158107, "auxiliary_loss_mlp": 0.0104056, "balance_loss_clip": 1.02579033, "balance_loss_mlp": 1.04900849, "epoch": 0.4400420862768676, "flos": 23879375377920.0, "grad_norm": 1.7160081172256774, "language_loss": 0.79629594, "learning_rate": 2.374836854915024e-06, "loss": 0.8182826, "num_input_tokens_seen": 156825895, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 7319, "time_per_iteration": 2.6164743900299072 }, { "auxiliary_loss_clip": 0.01135524, "auxiliary_loss_mlp": 0.01041723, "balance_loss_clip": 1.02652991, "balance_loss_mlp": 1.0465095, "epoch": 0.44010220952953555, "flos": 28037615293440.0, "grad_norm": 1.680338302674889, "language_loss": 0.79459465, "learning_rate": 2.3744657208484835e-06, "loss": 0.81636709, "num_input_tokens_seen": 156845990, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.796875, "step": 7320, "time_per_iteration": 2.6217947006225586 }, { "auxiliary_loss_clip": 0.01151651, "auxiliary_loss_mlp": 0.01040286, "balance_loss_clip": 1.02646935, "balance_loss_mlp": 1.04597545, "epoch": 0.4401623327822035, "flos": 23768985905280.0, "grad_norm": 1.662963704538389, "language_loss": 0.69542885, "learning_rate": 2.374094573418224e-06, "loss": 0.71734822, "num_input_tokens_seen": 156866685, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.796875, "step": 7321, "time_per_iteration": 2.659998655319214 }, { "auxiliary_loss_clip": 0.01086571, "auxiliary_loss_mlp": 0.01004614, "balance_loss_clip": 1.00294518, "balance_loss_mlp": 1.02578402, "epoch": 0.4402224560348715, "flos": 70774583264640.0, "grad_norm": 0.8745456549669867, "language_loss": 0.5684824, "learning_rate": 2.3737234126374923e-06, "loss": 0.58939421, "num_input_tokens_seen": 156923450, "router_z_loss_clip": 0.01672363, "router_z_loss_mlp": 0.25195312, "step": 7322, "time_per_iteration": 3.326012372970581 }, { "auxiliary_loss_clip": 0.01161391, "auxiliary_loss_mlp": 0.01031497, "balance_loss_clip": 1.01870036, "balance_loss_mlp": 1.04676485, "epoch": 0.44028257928753944, "flos": 22891795868160.0, "grad_norm": 1.4739735446663405, "language_loss": 0.76222312, "learning_rate": 2.3733522385195325e-06, "loss": 0.78415203, "num_input_tokens_seen": 156944795, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7890625, "step": 7323, "time_per_iteration": 2.6864943504333496 }, { "auxiliary_loss_clip": 0.01137529, "auxiliary_loss_mlp": 0.01035235, "balance_loss_clip": 1.02044153, "balance_loss_mlp": 1.04703021, "epoch": 0.4403427025402074, "flos": 17895760156800.0, "grad_norm": 1.6397164630797683, "language_loss": 0.80764019, "learning_rate": 2.372981051077592e-06, "loss": 0.82936788, "num_input_tokens_seen": 156962755, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.81640625, "step": 7324, "time_per_iteration": 2.522942304611206 }, { "auxiliary_loss_clip": 0.01077342, "auxiliary_loss_mlp": 0.01002061, "balance_loss_clip": 1.00030828, "balance_loss_mlp": 1.02488661, "epoch": 0.4404028257928754, "flos": 69562525708800.0, "grad_norm": 0.6642182248163268, "language_loss": 0.54555357, "learning_rate": 2.3726098503249175e-06, "loss": 0.5663476, "num_input_tokens_seen": 157028095, "router_z_loss_clip": 0.01757812, "router_z_loss_mlp": 0.25, "step": 7325, "time_per_iteration": 4.5765297412872314 }, { "auxiliary_loss_clip": 0.01125973, "auxiliary_loss_mlp": 0.01026994, "balance_loss_clip": 1.01391101, "balance_loss_mlp": 1.0485394, "epoch": 0.44046294904554334, "flos": 20923675914240.0, "grad_norm": 1.417643115115226, "language_loss": 0.69724464, "learning_rate": 2.3722386362747558e-06, "loss": 0.71877432, "num_input_tokens_seen": 157048365, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7734375, "step": 7326, "time_per_iteration": 2.616088390350342 }, { "auxiliary_loss_clip": 0.01150262, "auxiliary_loss_mlp": 0.01030057, "balance_loss_clip": 1.01602578, "balance_loss_mlp": 1.04587984, "epoch": 0.44052307229821136, "flos": 23623475909760.0, "grad_norm": 1.7949458533091698, "language_loss": 0.76606762, "learning_rate": 2.371867408940355e-06, "loss": 0.78787082, "num_input_tokens_seen": 157069130, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.77734375, "step": 7327, "time_per_iteration": 2.6167383193969727 }, { "auxiliary_loss_clip": 0.01141866, "auxiliary_loss_mlp": 0.01032867, "balance_loss_clip": 1.01838887, "balance_loss_mlp": 1.04559875, "epoch": 0.4405831955508793, "flos": 17597665186560.0, "grad_norm": 1.8283433178597768, "language_loss": 0.84426153, "learning_rate": 2.371496168334962e-06, "loss": 0.86600888, "num_input_tokens_seen": 157084940, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 7328, "time_per_iteration": 2.7256908416748047 }, { "auxiliary_loss_clip": 0.01142767, "auxiliary_loss_mlp": 0.0103375, "balance_loss_clip": 1.01911747, "balance_loss_mlp": 1.04659796, "epoch": 0.4406433188035473, "flos": 21463376739840.0, "grad_norm": 1.9476627978857175, "language_loss": 0.77654684, "learning_rate": 2.371124914471827e-06, "loss": 0.79831201, "num_input_tokens_seen": 157102770, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78515625, "step": 7329, "time_per_iteration": 2.6008458137512207 }, { "auxiliary_loss_clip": 0.01135396, "auxiliary_loss_mlp": 0.01036273, "balance_loss_clip": 1.0216459, "balance_loss_mlp": 1.04528809, "epoch": 0.44070344205621526, "flos": 22673566788480.0, "grad_norm": 1.5626275970792152, "language_loss": 0.73395383, "learning_rate": 2.3707536473641987e-06, "loss": 0.75567055, "num_input_tokens_seen": 157122035, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80859375, "step": 7330, "time_per_iteration": 2.701709032058716 }, { "auxiliary_loss_clip": 0.01144009, "auxiliary_loss_mlp": 0.01032228, "balance_loss_clip": 1.0185194, "balance_loss_mlp": 1.04740107, "epoch": 0.4407635653088832, "flos": 23441193365760.0, "grad_norm": 1.8126215064874487, "language_loss": 0.74628288, "learning_rate": 2.3703823670253257e-06, "loss": 0.76804531, "num_input_tokens_seen": 157142800, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7890625, "step": 7331, "time_per_iteration": 4.056272268295288 }, { "auxiliary_loss_clip": 0.01158225, "auxiliary_loss_mlp": 0.01030871, "balance_loss_clip": 1.01717401, "balance_loss_mlp": 1.04528153, "epoch": 0.4408236885615512, "flos": 24021294013440.0, "grad_norm": 1.8030518985777226, "language_loss": 0.76299804, "learning_rate": 2.370011073468459e-06, "loss": 0.78488898, "num_input_tokens_seen": 157163295, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7734375, "step": 7332, "time_per_iteration": 2.6408333778381348 }, { "auxiliary_loss_clip": 0.01138525, "auxiliary_loss_mlp": 0.01039169, "balance_loss_clip": 1.02633595, "balance_loss_mlp": 1.0428412, "epoch": 0.44088381181421915, "flos": 12676826597760.0, "grad_norm": 1.8784239814138963, "language_loss": 0.73212552, "learning_rate": 2.3696397667068488e-06, "loss": 0.75390244, "num_input_tokens_seen": 157180890, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.78125, "step": 7333, "time_per_iteration": 2.615907907485962 }, { "auxiliary_loss_clip": 0.01148887, "auxiliary_loss_mlp": 0.01035235, "balance_loss_clip": 1.02114987, "balance_loss_mlp": 1.04601312, "epoch": 0.4409439350668871, "flos": 24569865498240.0, "grad_norm": 1.6055759106831338, "language_loss": 0.7965464, "learning_rate": 2.3692684467537467e-06, "loss": 0.81838763, "num_input_tokens_seen": 157200580, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 7334, "time_per_iteration": 4.135733366012573 }, { "auxiliary_loss_clip": 0.01159495, "auxiliary_loss_mlp": 0.01043485, "balance_loss_clip": 1.0259732, "balance_loss_mlp": 1.04603207, "epoch": 0.4410040583195551, "flos": 22668574798080.0, "grad_norm": 2.8214445407833253, "language_loss": 0.75171232, "learning_rate": 2.3688971136224027e-06, "loss": 0.77374208, "num_input_tokens_seen": 157218345, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.87109375, "step": 7335, "time_per_iteration": 2.578378915786743 }, { "auxiliary_loss_clip": 0.01146041, "auxiliary_loss_mlp": 0.01032417, "balance_loss_clip": 1.01839256, "balance_loss_mlp": 1.0477035, "epoch": 0.44106418157222305, "flos": 10852528700160.0, "grad_norm": 2.1695613516502914, "language_loss": 0.7231825, "learning_rate": 2.3685257673260702e-06, "loss": 0.74496704, "num_input_tokens_seen": 157234395, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.80078125, "step": 7336, "time_per_iteration": 4.535191297531128 }, { "auxiliary_loss_clip": 0.01136441, "auxiliary_loss_mlp": 0.01046417, "balance_loss_clip": 1.0313375, "balance_loss_mlp": 1.04484653, "epoch": 0.441124304824891, "flos": 21726710323200.0, "grad_norm": 2.3078971054803223, "language_loss": 0.62302297, "learning_rate": 2.3681544078780013e-06, "loss": 0.64485157, "num_input_tokens_seen": 157254805, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.82421875, "step": 7337, "time_per_iteration": 2.624049186706543 }, { "auxiliary_loss_clip": 0.01064695, "auxiliary_loss_mlp": 0.01248423, "balance_loss_clip": 1.00077653, "balance_loss_mlp": 1.02159572, "epoch": 0.441184428077559, "flos": 63220486625280.0, "grad_norm": 0.7414052504417857, "language_loss": 0.52705085, "learning_rate": 2.367783035291448e-06, "loss": 0.55018204, "num_input_tokens_seen": 157317870, "router_z_loss_clip": 0.01647949, "router_z_loss_mlp": 0.25, "step": 7338, "time_per_iteration": 3.242335796356201 }, { "auxiliary_loss_clip": 0.01147588, "auxiliary_loss_mlp": 0.01047278, "balance_loss_clip": 1.03151834, "balance_loss_mlp": 1.04496181, "epoch": 0.44124455133022694, "flos": 21177959270400.0, "grad_norm": 2.001905042750252, "language_loss": 0.70622665, "learning_rate": 2.3674116495796642e-06, "loss": 0.72817528, "num_input_tokens_seen": 157336505, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84765625, "step": 7339, "time_per_iteration": 2.5729196071624756 }, { "auxiliary_loss_clip": 0.0116632, "auxiliary_loss_mlp": 0.01039978, "balance_loss_clip": 1.02661514, "balance_loss_mlp": 1.04587746, "epoch": 0.4413046745828949, "flos": 17457865453440.0, "grad_norm": 1.4477715182025308, "language_loss": 0.69450116, "learning_rate": 2.367040250755904e-06, "loss": 0.71656418, "num_input_tokens_seen": 157354995, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7578125, "step": 7340, "time_per_iteration": 2.625983715057373 }, { "auxiliary_loss_clip": 0.01064289, "auxiliary_loss_mlp": 0.01001999, "balance_loss_clip": 1.00043762, "balance_loss_mlp": 1.02181554, "epoch": 0.4413647978355629, "flos": 61586153804160.0, "grad_norm": 0.9072502698797027, "language_loss": 0.63981122, "learning_rate": 2.3666688388334215e-06, "loss": 0.66047412, "num_input_tokens_seen": 157404260, "router_z_loss_clip": 0.015625, "router_z_loss_mlp": 0.25, "step": 7341, "time_per_iteration": 3.0348291397094727 }, { "auxiliary_loss_clip": 0.0114924, "auxiliary_loss_mlp": 0.0103525, "balance_loss_clip": 1.02052784, "balance_loss_mlp": 1.0443424, "epoch": 0.4414249210882309, "flos": 27527001505920.0, "grad_norm": 2.0911255353328184, "language_loss": 0.73801154, "learning_rate": 2.366297413825472e-06, "loss": 0.75985646, "num_input_tokens_seen": 157423045, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78125, "step": 7342, "time_per_iteration": 2.6578729152679443 }, { "auxiliary_loss_clip": 0.01123828, "auxiliary_loss_mlp": 0.01034439, "balance_loss_clip": 1.01939464, "balance_loss_mlp": 1.04383314, "epoch": 0.44148504434089886, "flos": 23513984277120.0, "grad_norm": 1.837034056245149, "language_loss": 0.79908943, "learning_rate": 2.365925975745309e-06, "loss": 0.82067215, "num_input_tokens_seen": 157441815, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 7343, "time_per_iteration": 2.637763261795044 }, { "auxiliary_loss_clip": 0.01146645, "auxiliary_loss_mlp": 0.01036623, "balance_loss_clip": 1.02212691, "balance_loss_mlp": 1.04162621, "epoch": 0.4415451675935668, "flos": 21580589796480.0, "grad_norm": 1.6607405466788607, "language_loss": 0.76469731, "learning_rate": 2.3655545246061893e-06, "loss": 0.78653002, "num_input_tokens_seen": 157460470, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 7344, "time_per_iteration": 2.5840070247650146 }, { "auxiliary_loss_clip": 0.01063658, "auxiliary_loss_mlp": 0.0100202, "balance_loss_clip": 1.00047052, "balance_loss_mlp": 1.02021945, "epoch": 0.4416052908462348, "flos": 59006368126080.0, "grad_norm": 0.7948232741865474, "language_loss": 0.63842279, "learning_rate": 2.365183060421369e-06, "loss": 0.65907961, "num_input_tokens_seen": 157512655, "router_z_loss_clip": 0.01550293, "router_z_loss_mlp": 0.25, "step": 7345, "time_per_iteration": 2.899020195007324 }, { "auxiliary_loss_clip": 0.01127075, "auxiliary_loss_mlp": 0.01037687, "balance_loss_clip": 1.02317894, "balance_loss_mlp": 1.04590464, "epoch": 0.44166541409890275, "flos": 26357642242560.0, "grad_norm": 1.7195018298073046, "language_loss": 0.85897756, "learning_rate": 2.364811583204105e-06, "loss": 0.88062507, "num_input_tokens_seen": 157533700, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8125, "step": 7346, "time_per_iteration": 2.5730550289154053 }, { "auxiliary_loss_clip": 0.01134218, "auxiliary_loss_mlp": 0.01039838, "balance_loss_clip": 1.02460289, "balance_loss_mlp": 1.04549861, "epoch": 0.4417255373515707, "flos": 20192678231040.0, "grad_norm": 2.0580627373408804, "language_loss": 0.8023454, "learning_rate": 2.364440092967654e-06, "loss": 0.82408595, "num_input_tokens_seen": 157551105, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.796875, "step": 7347, "time_per_iteration": 2.54702091217041 }, { "auxiliary_loss_clip": 0.01142478, "auxiliary_loss_mlp": 0.01033795, "balance_loss_clip": 1.01884604, "balance_loss_mlp": 1.04534996, "epoch": 0.4417856606042387, "flos": 17887895078400.0, "grad_norm": 2.008318660511493, "language_loss": 0.82669777, "learning_rate": 2.3640685897252726e-06, "loss": 0.8484605, "num_input_tokens_seen": 157568285, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.79296875, "step": 7348, "time_per_iteration": 2.5379512310028076 }, { "auxiliary_loss_clip": 0.01135898, "auxiliary_loss_mlp": 0.0103307, "balance_loss_clip": 1.0186336, "balance_loss_mlp": 1.04548013, "epoch": 0.44184578385690665, "flos": 27964034282880.0, "grad_norm": 1.667738841515663, "language_loss": 0.700019, "learning_rate": 2.3636970734902205e-06, "loss": 0.72170866, "num_input_tokens_seen": 157590405, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.81640625, "step": 7349, "time_per_iteration": 2.643261432647705 }, { "auxiliary_loss_clip": 0.0114291, "auxiliary_loss_mlp": 0.0103663, "balance_loss_clip": 1.021312, "balance_loss_mlp": 1.04327762, "epoch": 0.4419059071095746, "flos": 23367899664000.0, "grad_norm": 1.6442582982704559, "language_loss": 0.74524188, "learning_rate": 2.363325544275755e-06, "loss": 0.76703727, "num_input_tokens_seen": 157607420, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8125, "step": 7350, "time_per_iteration": 2.5754830837249756 }, { "auxiliary_loss_clip": 0.01123589, "auxiliary_loss_mlp": 0.01035521, "balance_loss_clip": 1.02124023, "balance_loss_mlp": 1.04326785, "epoch": 0.4419660303622426, "flos": 15012169246080.0, "grad_norm": 2.04473289412924, "language_loss": 0.80697954, "learning_rate": 2.362954002095136e-06, "loss": 0.82857066, "num_input_tokens_seen": 157624990, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 7351, "time_per_iteration": 2.5280447006225586 }, { "auxiliary_loss_clip": 0.01146746, "auxiliary_loss_mlp": 0.01283491, "balance_loss_clip": 1.02398908, "balance_loss_mlp": 1.04339385, "epoch": 0.44202615361491054, "flos": 25371750672000.0, "grad_norm": 1.525138968239365, "language_loss": 0.73214483, "learning_rate": 2.3625824469616222e-06, "loss": 0.75644714, "num_input_tokens_seen": 157645300, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.76171875, "step": 7352, "time_per_iteration": 2.6960413455963135 }, { "auxiliary_loss_clip": 0.01141546, "auxiliary_loss_mlp": 0.01030942, "balance_loss_clip": 1.01630318, "balance_loss_mlp": 1.0446713, "epoch": 0.4420862768675785, "flos": 24681116897280.0, "grad_norm": 1.7648730499428589, "language_loss": 0.87067825, "learning_rate": 2.362210878888473e-06, "loss": 0.89240313, "num_input_tokens_seen": 157664060, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 7353, "time_per_iteration": 2.624699115753174 }, { "auxiliary_loss_clip": 0.0112516, "auxiliary_loss_mlp": 0.01036027, "balance_loss_clip": 1.02303874, "balance_loss_mlp": 1.04472935, "epoch": 0.44214640012024653, "flos": 19528437974400.0, "grad_norm": 2.1218231987556133, "language_loss": 0.75597358, "learning_rate": 2.3618392978889498e-06, "loss": 0.77758545, "num_input_tokens_seen": 157680905, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.8046875, "step": 7354, "time_per_iteration": 2.6210594177246094 }, { "auxiliary_loss_clip": 0.01147322, "auxiliary_loss_mlp": 0.01032455, "balance_loss_clip": 1.01921129, "balance_loss_mlp": 1.04389238, "epoch": 0.4422065233729145, "flos": 47557434003840.0, "grad_norm": 1.7948003619719526, "language_loss": 0.64624429, "learning_rate": 2.3614677039763122e-06, "loss": 0.66804206, "num_input_tokens_seen": 157701980, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.765625, "step": 7355, "time_per_iteration": 2.856750249862671 }, { "auxiliary_loss_clip": 0.01127937, "auxiliary_loss_mlp": 0.0103841, "balance_loss_clip": 1.02256155, "balance_loss_mlp": 1.04469895, "epoch": 0.44226664662558246, "flos": 19281050029440.0, "grad_norm": 2.1328822934676723, "language_loss": 0.78200108, "learning_rate": 2.3610960971638224e-06, "loss": 0.80366457, "num_input_tokens_seen": 157720555, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83203125, "step": 7356, "time_per_iteration": 2.664315938949585 }, { "auxiliary_loss_clip": 0.01144484, "auxiliary_loss_mlp": 0.01289322, "balance_loss_clip": 1.02754664, "balance_loss_mlp": 1.04504955, "epoch": 0.4423267698782504, "flos": 17821820010240.0, "grad_norm": 1.7289107463556082, "language_loss": 0.76868486, "learning_rate": 2.3607244774647423e-06, "loss": 0.79302299, "num_input_tokens_seen": 157739160, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 7357, "time_per_iteration": 2.5796563625335693 }, { "auxiliary_loss_clip": 0.01141134, "auxiliary_loss_mlp": 0.01042626, "balance_loss_clip": 1.02758813, "balance_loss_mlp": 1.04468715, "epoch": 0.4423868931309184, "flos": 29204424691200.0, "grad_norm": 1.4781041604204141, "language_loss": 0.73306167, "learning_rate": 2.360352844892333e-06, "loss": 0.75489926, "num_input_tokens_seen": 157760020, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 7358, "time_per_iteration": 2.7531418800354004 }, { "auxiliary_loss_clip": 0.01142969, "auxiliary_loss_mlp": 0.01034336, "balance_loss_clip": 1.02025175, "balance_loss_mlp": 1.04508531, "epoch": 0.44244701638358636, "flos": 29713135057920.0, "grad_norm": 1.6901997754666773, "language_loss": 0.75806248, "learning_rate": 2.359981199459858e-06, "loss": 0.77983552, "num_input_tokens_seen": 157780435, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80078125, "step": 7359, "time_per_iteration": 2.664217710494995 }, { "auxiliary_loss_clip": 0.01143051, "auxiliary_loss_mlp": 0.01039146, "balance_loss_clip": 1.02367306, "balance_loss_mlp": 1.04510665, "epoch": 0.4425071396362543, "flos": 22930040874240.0, "grad_norm": 1.8073856885283326, "language_loss": 0.70038849, "learning_rate": 2.3596095411805794e-06, "loss": 0.72221047, "num_input_tokens_seen": 157799420, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.80078125, "step": 7360, "time_per_iteration": 2.691789150238037 }, { "auxiliary_loss_clip": 0.01140096, "auxiliary_loss_mlp": 0.01033803, "balance_loss_clip": 1.01948535, "balance_loss_mlp": 1.0422442, "epoch": 0.4425672628889223, "flos": 19792346175360.0, "grad_norm": 12.142175221771366, "language_loss": 0.69520378, "learning_rate": 2.359237870067761e-06, "loss": 0.71694279, "num_input_tokens_seen": 157817025, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.796875, "step": 7361, "time_per_iteration": 2.5652711391448975 }, { "auxiliary_loss_clip": 0.01142333, "auxiliary_loss_mlp": 0.0103806, "balance_loss_clip": 1.02314067, "balance_loss_mlp": 1.04460478, "epoch": 0.44262738614159025, "flos": 13662215377920.0, "grad_norm": 2.01707110112174, "language_loss": 0.82457644, "learning_rate": 2.3588661861346676e-06, "loss": 0.84638047, "num_input_tokens_seen": 157834345, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.796875, "step": 7362, "time_per_iteration": 2.596144437789917 }, { "auxiliary_loss_clip": 0.01156042, "auxiliary_loss_mlp": 0.01040367, "balance_loss_clip": 1.02508473, "balance_loss_mlp": 1.0459609, "epoch": 0.4426875093942582, "flos": 14210212245120.0, "grad_norm": 1.6961998210624074, "language_loss": 0.74249744, "learning_rate": 2.3584944893945634e-06, "loss": 0.76446152, "num_input_tokens_seen": 157852290, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83984375, "step": 7363, "time_per_iteration": 2.5996596813201904 }, { "auxiliary_loss_clip": 0.01077023, "auxiliary_loss_mlp": 0.01009449, "balance_loss_clip": 1.00768495, "balance_loss_mlp": 1.01732755, "epoch": 0.4427476326469262, "flos": 70117525728000.0, "grad_norm": 0.6792651666365099, "language_loss": 0.55681211, "learning_rate": 2.3581227798607126e-06, "loss": 0.57767677, "num_input_tokens_seen": 157923060, "router_z_loss_clip": 0.0177002, "router_z_loss_mlp": 0.25, "step": 7364, "time_per_iteration": 3.3485450744628906 }, { "auxiliary_loss_clip": 0.01136297, "auxiliary_loss_mlp": 0.01037737, "balance_loss_clip": 1.02360427, "balance_loss_mlp": 1.04069829, "epoch": 0.44280775589959415, "flos": 25445080287360.0, "grad_norm": 1.647307734937027, "language_loss": 0.744403, "learning_rate": 2.3577510575463806e-06, "loss": 0.76614338, "num_input_tokens_seen": 157944110, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.77734375, "step": 7365, "time_per_iteration": 2.6307055950164795 }, { "auxiliary_loss_clip": 0.01139273, "auxiliary_loss_mlp": 0.01047646, "balance_loss_clip": 1.0329417, "balance_loss_mlp": 1.04241776, "epoch": 0.4428678791522621, "flos": 22857214049280.0, "grad_norm": 1.8822938141854855, "language_loss": 0.74144989, "learning_rate": 2.357379322464834e-06, "loss": 0.76331902, "num_input_tokens_seen": 157964295, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 7366, "time_per_iteration": 2.678093910217285 }, { "auxiliary_loss_clip": 0.01139731, "auxiliary_loss_mlp": 0.01034224, "balance_loss_clip": 1.01951361, "balance_loss_mlp": 1.04325438, "epoch": 0.44292800240493013, "flos": 25812446636160.0, "grad_norm": 1.8697859267775998, "language_loss": 0.73508269, "learning_rate": 2.357007574629339e-06, "loss": 0.75682223, "num_input_tokens_seen": 157983970, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 7367, "time_per_iteration": 4.077218055725098 }, { "auxiliary_loss_clip": 0.01122843, "auxiliary_loss_mlp": 0.01041015, "balance_loss_clip": 1.02718067, "balance_loss_mlp": 1.04397881, "epoch": 0.4429881256575981, "flos": 32416885549440.0, "grad_norm": 1.4766024848199613, "language_loss": 0.73820013, "learning_rate": 2.356635814053162e-06, "loss": 0.7598387, "num_input_tokens_seen": 158006515, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7890625, "step": 7368, "time_per_iteration": 2.7378485202789307 }, { "auxiliary_loss_clip": 0.0113839, "auxiliary_loss_mlp": 0.01034175, "balance_loss_clip": 1.02022147, "balance_loss_mlp": 1.04333043, "epoch": 0.44304824891026606, "flos": 22163707186560.0, "grad_norm": 1.8210868738244632, "language_loss": 0.798379, "learning_rate": 2.3562640407495697e-06, "loss": 0.82010472, "num_input_tokens_seen": 158025565, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7734375, "step": 7369, "time_per_iteration": 2.621162176132202 }, { "auxiliary_loss_clip": 0.01146131, "auxiliary_loss_mlp": 0.0103699, "balance_loss_clip": 1.02365661, "balance_loss_mlp": 1.04310632, "epoch": 0.443108372162934, "flos": 25338569483520.0, "grad_norm": 1.8869010330194307, "language_loss": 0.71806967, "learning_rate": 2.3558922547318304e-06, "loss": 0.73990089, "num_input_tokens_seen": 158045620, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.765625, "step": 7370, "time_per_iteration": 2.745124101638794 }, { "auxiliary_loss_clip": 0.01121186, "auxiliary_loss_mlp": 0.01037931, "balance_loss_clip": 1.02342296, "balance_loss_mlp": 1.04289818, "epoch": 0.443168495415602, "flos": 23330947547520.0, "grad_norm": 1.7390700917509816, "language_loss": 0.70136958, "learning_rate": 2.3555204560132123e-06, "loss": 0.72296077, "num_input_tokens_seen": 158063505, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78125, "step": 7371, "time_per_iteration": 2.5557820796966553 }, { "auxiliary_loss_clip": 0.01129158, "auxiliary_loss_mlp": 0.01033684, "balance_loss_clip": 1.02071416, "balance_loss_mlp": 1.04352105, "epoch": 0.44322861866826996, "flos": 21871502046720.0, "grad_norm": 2.143452374444691, "language_loss": 0.67907858, "learning_rate": 2.3551486446069834e-06, "loss": 0.70070702, "num_input_tokens_seen": 158080335, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.765625, "step": 7372, "time_per_iteration": 4.137408494949341 }, { "auxiliary_loss_clip": 0.01150051, "auxiliary_loss_mlp": 0.01036869, "balance_loss_clip": 1.02202177, "balance_loss_mlp": 1.04312682, "epoch": 0.4432887419209379, "flos": 20084407660800.0, "grad_norm": 2.048704960189632, "language_loss": 0.83271933, "learning_rate": 2.3547768205264133e-06, "loss": 0.85458857, "num_input_tokens_seen": 158098955, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 7373, "time_per_iteration": 2.559645175933838 }, { "auxiliary_loss_clip": 0.01051791, "auxiliary_loss_mlp": 0.01002626, "balance_loss_clip": 1.0007664, "balance_loss_mlp": 1.01736784, "epoch": 0.4433488651736059, "flos": 70035540935040.0, "grad_norm": 0.760459979388655, "language_loss": 0.55213773, "learning_rate": 2.3544049837847708e-06, "loss": 0.5726819, "num_input_tokens_seen": 158164110, "router_z_loss_clip": 0.01855469, "router_z_loss_mlp": 0.25390625, "step": 7374, "time_per_iteration": 3.243891716003418 }, { "auxiliary_loss_clip": 0.01157261, "auxiliary_loss_mlp": 0.01044784, "balance_loss_clip": 1.02959633, "balance_loss_mlp": 1.04447627, "epoch": 0.44340898842627385, "flos": 16282472705280.0, "grad_norm": 2.213364313449715, "language_loss": 0.82610488, "learning_rate": 2.354033134395325e-06, "loss": 0.8481254, "num_input_tokens_seen": 158179850, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7734375, "step": 7375, "time_per_iteration": 4.172879695892334 }, { "auxiliary_loss_clip": 0.01123002, "auxiliary_loss_mlp": 0.0103355, "balance_loss_clip": 1.01962078, "balance_loss_mlp": 1.04315114, "epoch": 0.4434691116789418, "flos": 16611989097600.0, "grad_norm": 1.973039063782198, "language_loss": 0.84126502, "learning_rate": 2.3536612723713487e-06, "loss": 0.86283052, "num_input_tokens_seen": 158196590, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.80078125, "step": 7376, "time_per_iteration": 2.5443499088287354 }, { "auxiliary_loss_clip": 0.01139478, "auxiliary_loss_mlp": 0.01036262, "balance_loss_clip": 1.02211213, "balance_loss_mlp": 1.04309237, "epoch": 0.4435292349316098, "flos": 19063251912960.0, "grad_norm": 1.6809511986478078, "language_loss": 0.76953256, "learning_rate": 2.353289397726111e-06, "loss": 0.79129004, "num_input_tokens_seen": 158216355, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78515625, "step": 7377, "time_per_iteration": 2.5899858474731445 }, { "auxiliary_loss_clip": 0.01127662, "auxiliary_loss_mlp": 0.0103477, "balance_loss_clip": 1.02079916, "balance_loss_mlp": 1.043347, "epoch": 0.44358935818427775, "flos": 21251324799360.0, "grad_norm": 1.7447456137840074, "language_loss": 0.75599539, "learning_rate": 2.352917510472883e-06, "loss": 0.77761972, "num_input_tokens_seen": 158235825, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75390625, "step": 7378, "time_per_iteration": 4.133167743682861 }, { "auxiliary_loss_clip": 0.01139856, "auxiliary_loss_mlp": 0.01042912, "balance_loss_clip": 1.02908373, "balance_loss_mlp": 1.0431447, "epoch": 0.4436494814369457, "flos": 12495298239360.0, "grad_norm": 1.9111071686463323, "language_loss": 0.68491578, "learning_rate": 2.3525456106249367e-06, "loss": 0.70674342, "num_input_tokens_seen": 158254230, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7890625, "step": 7379, "time_per_iteration": 2.587125301361084 }, { "auxiliary_loss_clip": 0.01168248, "auxiliary_loss_mlp": 0.01039482, "balance_loss_clip": 1.0254271, "balance_loss_mlp": 1.04436803, "epoch": 0.44370960468961373, "flos": 23659853408640.0, "grad_norm": 1.6701879504681971, "language_loss": 0.72976273, "learning_rate": 2.3521736981955454e-06, "loss": 0.75184005, "num_input_tokens_seen": 158273400, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 7380, "time_per_iteration": 2.703756809234619 }, { "auxiliary_loss_clip": 0.01140795, "auxiliary_loss_mlp": 0.01036534, "balance_loss_clip": 1.02233624, "balance_loss_mlp": 1.04471493, "epoch": 0.4437697279422817, "flos": 32416849635840.0, "grad_norm": 1.497098286495441, "language_loss": 0.64592278, "learning_rate": 2.35180177319798e-06, "loss": 0.66769612, "num_input_tokens_seen": 158296840, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 7381, "time_per_iteration": 2.6348938941955566 }, { "auxiliary_loss_clip": 0.0113266, "auxiliary_loss_mlp": 0.01034637, "balance_loss_clip": 1.02024829, "balance_loss_mlp": 1.0450573, "epoch": 0.44382985119494966, "flos": 18112875914880.0, "grad_norm": 4.68857055932939, "language_loss": 0.80360264, "learning_rate": 2.3514298356455145e-06, "loss": 0.8252756, "num_input_tokens_seen": 158314935, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7890625, "step": 7382, "time_per_iteration": 2.5908074378967285 }, { "auxiliary_loss_clip": 0.01134897, "auxiliary_loss_mlp": 0.01040242, "balance_loss_clip": 1.02586532, "balance_loss_mlp": 1.04708695, "epoch": 0.44388997444761763, "flos": 30774151923840.0, "grad_norm": 1.755775972124963, "language_loss": 0.65092683, "learning_rate": 2.351057885551422e-06, "loss": 0.67267823, "num_input_tokens_seen": 158334620, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7890625, "step": 7383, "time_per_iteration": 2.6440629959106445 }, { "auxiliary_loss_clip": 0.01144261, "auxiliary_loss_mlp": 0.01039916, "balance_loss_clip": 1.02540851, "balance_loss_mlp": 1.04534912, "epoch": 0.4439500977002856, "flos": 20339157893760.0, "grad_norm": 2.6877306394503817, "language_loss": 0.75673366, "learning_rate": 2.3506859229289768e-06, "loss": 0.77857542, "num_input_tokens_seen": 158350550, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8125, "step": 7384, "time_per_iteration": 2.621351718902588 }, { "auxiliary_loss_clip": 0.01125549, "auxiliary_loss_mlp": 0.01039082, "balance_loss_clip": 1.02424669, "balance_loss_mlp": 1.04483485, "epoch": 0.44401022095295356, "flos": 20371225760640.0, "grad_norm": 1.6054074662394142, "language_loss": 0.81025445, "learning_rate": 2.3503139477914532e-06, "loss": 0.83190072, "num_input_tokens_seen": 158369555, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80859375, "step": 7385, "time_per_iteration": 2.567521572113037 }, { "auxiliary_loss_clip": 0.011589, "auxiliary_loss_mlp": 0.01034942, "balance_loss_clip": 1.01980257, "balance_loss_mlp": 1.04403865, "epoch": 0.4440703442056215, "flos": 20230635928320.0, "grad_norm": 1.8518124772192963, "language_loss": 0.81767428, "learning_rate": 2.349941960152126e-06, "loss": 0.83961272, "num_input_tokens_seen": 158388045, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.79296875, "step": 7386, "time_per_iteration": 2.675835609436035 }, { "auxiliary_loss_clip": 0.0115434, "auxiliary_loss_mlp": 0.0104331, "balance_loss_clip": 1.02769363, "balance_loss_mlp": 1.04477012, "epoch": 0.4441304674582895, "flos": 39494698824960.0, "grad_norm": 3.428823709550672, "language_loss": 0.69813776, "learning_rate": 2.34956996002427e-06, "loss": 0.72011423, "num_input_tokens_seen": 158410115, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.82421875, "step": 7387, "time_per_iteration": 2.8098666667938232 }, { "auxiliary_loss_clip": 0.01167198, "auxiliary_loss_mlp": 0.0104181, "balance_loss_clip": 1.02655137, "balance_loss_mlp": 1.04330957, "epoch": 0.44419059071095746, "flos": 14829671220480.0, "grad_norm": 1.9726369109452002, "language_loss": 0.72090024, "learning_rate": 2.3491979474211615e-06, "loss": 0.74299031, "num_input_tokens_seen": 158427765, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.796875, "step": 7388, "time_per_iteration": 2.6169540882110596 }, { "auxiliary_loss_clip": 0.01134713, "auxiliary_loss_mlp": 0.01034951, "balance_loss_clip": 1.02010977, "balance_loss_mlp": 1.04525805, "epoch": 0.4442507139636254, "flos": 22637835734400.0, "grad_norm": 1.6017197139795132, "language_loss": 0.69113582, "learning_rate": 2.3488259223560766e-06, "loss": 0.71283245, "num_input_tokens_seen": 158446375, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 7389, "time_per_iteration": 2.5995020866394043 }, { "auxiliary_loss_clip": 0.0113223, "auxiliary_loss_mlp": 0.01031021, "balance_loss_clip": 1.01655555, "balance_loss_mlp": 1.044631, "epoch": 0.4443108372162934, "flos": 38290721829120.0, "grad_norm": 1.882640446848139, "language_loss": 0.75429398, "learning_rate": 2.3484538848422913e-06, "loss": 0.77592647, "num_input_tokens_seen": 158467260, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 7390, "time_per_iteration": 2.689181327819824 }, { "auxiliary_loss_clip": 0.01132124, "auxiliary_loss_mlp": 0.01029341, "balance_loss_clip": 1.01571524, "balance_loss_mlp": 1.04573226, "epoch": 0.44437096046896135, "flos": 17748993185280.0, "grad_norm": 1.951616477145467, "language_loss": 0.81346154, "learning_rate": 2.348081834893084e-06, "loss": 0.83507615, "num_input_tokens_seen": 158486720, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7734375, "step": 7391, "time_per_iteration": 2.5402779579162598 }, { "auxiliary_loss_clip": 0.01142877, "auxiliary_loss_mlp": 0.01037458, "balance_loss_clip": 1.02274156, "balance_loss_mlp": 1.04703665, "epoch": 0.4444310837216293, "flos": 13732348682880.0, "grad_norm": 1.7843525632000854, "language_loss": 0.73355877, "learning_rate": 2.3477097725217306e-06, "loss": 0.75536215, "num_input_tokens_seen": 158502530, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.77734375, "step": 7392, "time_per_iteration": 2.5066494941711426 }, { "auxiliary_loss_clip": 0.01130134, "auxiliary_loss_mlp": 0.01032666, "balance_loss_clip": 1.0192312, "balance_loss_mlp": 1.04312909, "epoch": 0.44449120697429734, "flos": 25010238240000.0, "grad_norm": 1.5746649222344653, "language_loss": 0.79666615, "learning_rate": 2.3473376977415102e-06, "loss": 0.81829423, "num_input_tokens_seen": 158522715, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78125, "step": 7393, "time_per_iteration": 2.570946455001831 }, { "auxiliary_loss_clip": 0.01135456, "auxiliary_loss_mlp": 0.01036905, "balance_loss_clip": 1.0224812, "balance_loss_mlp": 1.04497731, "epoch": 0.4445513302269653, "flos": 32671707609600.0, "grad_norm": 3.061239378535347, "language_loss": 0.81065834, "learning_rate": 2.3469656105657004e-06, "loss": 0.8323819, "num_input_tokens_seen": 158543615, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.81640625, "step": 7394, "time_per_iteration": 2.606637954711914 }, { "auxiliary_loss_clip": 0.01138851, "auxiliary_loss_mlp": 0.0103903, "balance_loss_clip": 1.0256424, "balance_loss_mlp": 1.04441309, "epoch": 0.44461145347963327, "flos": 11655814504320.0, "grad_norm": 2.1915369600316983, "language_loss": 0.79840684, "learning_rate": 2.346593511007581e-06, "loss": 0.8201856, "num_input_tokens_seen": 158560330, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 7395, "time_per_iteration": 2.5420103073120117 }, { "auxiliary_loss_clip": 0.0114041, "auxiliary_loss_mlp": 0.0103755, "balance_loss_clip": 1.02398384, "balance_loss_mlp": 1.04451597, "epoch": 0.44467157673230123, "flos": 20886759711360.0, "grad_norm": 1.7256935955861896, "language_loss": 0.68551469, "learning_rate": 2.3462213990804307e-06, "loss": 0.70729434, "num_input_tokens_seen": 158579735, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.78125, "step": 7396, "time_per_iteration": 2.5461738109588623 }, { "auxiliary_loss_clip": 0.01133612, "auxiliary_loss_mlp": 0.01278737, "balance_loss_clip": 1.01799846, "balance_loss_mlp": 1.04462194, "epoch": 0.4447316999849692, "flos": 18546137763840.0, "grad_norm": 1.6744435898967136, "language_loss": 0.80919111, "learning_rate": 2.345849274797529e-06, "loss": 0.8333146, "num_input_tokens_seen": 158597075, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.80078125, "step": 7397, "time_per_iteration": 2.5879099369049072 }, { "auxiliary_loss_clip": 0.01130547, "auxiliary_loss_mlp": 0.01034901, "balance_loss_clip": 1.02130568, "balance_loss_mlp": 1.04494441, "epoch": 0.44479182323763716, "flos": 23769057732480.0, "grad_norm": 1.692147527448851, "language_loss": 0.67695332, "learning_rate": 2.3454771381721566e-06, "loss": 0.6986078, "num_input_tokens_seen": 158616650, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76953125, "step": 7398, "time_per_iteration": 2.5618317127227783 }, { "auxiliary_loss_clip": 0.01147713, "auxiliary_loss_mlp": 0.01038229, "balance_loss_clip": 1.02473474, "balance_loss_mlp": 1.04444265, "epoch": 0.44485194649030513, "flos": 16543831040640.0, "grad_norm": 1.824985920422134, "language_loss": 0.69647861, "learning_rate": 2.3451049892175934e-06, "loss": 0.71833801, "num_input_tokens_seen": 158634515, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.765625, "step": 7399, "time_per_iteration": 2.679570198059082 }, { "auxiliary_loss_clip": 0.01145522, "auxiliary_loss_mlp": 0.01038528, "balance_loss_clip": 1.02516508, "balance_loss_mlp": 1.04285502, "epoch": 0.4449120697429731, "flos": 22600955445120.0, "grad_norm": 1.7974359643943911, "language_loss": 0.72263598, "learning_rate": 2.3447328279471213e-06, "loss": 0.74447644, "num_input_tokens_seen": 158653760, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.76171875, "step": 7400, "time_per_iteration": 2.5457921028137207 }, { "auxiliary_loss_clip": 0.01155148, "auxiliary_loss_mlp": 0.01279708, "balance_loss_clip": 1.0196352, "balance_loss_mlp": 1.04280889, "epoch": 0.44497219299564106, "flos": 20004864992640.0, "grad_norm": 1.8820362365223813, "language_loss": 0.84962106, "learning_rate": 2.3443606543740207e-06, "loss": 0.87396961, "num_input_tokens_seen": 158672190, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76953125, "step": 7401, "time_per_iteration": 2.6172871589660645 }, { "auxiliary_loss_clip": 0.01134204, "auxiliary_loss_mlp": 0.0103536, "balance_loss_clip": 1.0228672, "balance_loss_mlp": 1.04194319, "epoch": 0.445032316248309, "flos": 25594253470080.0, "grad_norm": 1.668232370763574, "language_loss": 0.82987332, "learning_rate": 2.3439884685115753e-06, "loss": 0.85156894, "num_input_tokens_seen": 158694115, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.74609375, "step": 7402, "time_per_iteration": 2.607428789138794 }, { "auxiliary_loss_clip": 0.011412, "auxiliary_loss_mlp": 0.01032785, "balance_loss_clip": 1.01880145, "balance_loss_mlp": 1.04559839, "epoch": 0.445092439500977, "flos": 21250426959360.0, "grad_norm": 1.8984907186254976, "language_loss": 0.76734138, "learning_rate": 2.343616270373066e-06, "loss": 0.78908122, "num_input_tokens_seen": 158711000, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 7403, "time_per_iteration": 2.5867977142333984 }, { "auxiliary_loss_clip": 0.01061027, "auxiliary_loss_mlp": 0.01004005, "balance_loss_clip": 1.00225282, "balance_loss_mlp": 1.01826251, "epoch": 0.44515256275364495, "flos": 57764900309760.0, "grad_norm": 0.7475050453795865, "language_loss": 0.60002089, "learning_rate": 2.3432440599717748e-06, "loss": 0.62067115, "num_input_tokens_seen": 158769675, "router_z_loss_clip": 0.01757812, "router_z_loss_mlp": 0.25390625, "step": 7404, "time_per_iteration": 3.114630937576294 }, { "auxiliary_loss_clip": 0.01143311, "auxiliary_loss_mlp": 0.01037451, "balance_loss_clip": 1.02358747, "balance_loss_mlp": 1.04567885, "epoch": 0.4452126860063129, "flos": 15596004908160.0, "grad_norm": 1.9804402482710843, "language_loss": 0.82197165, "learning_rate": 2.3428718373209872e-06, "loss": 0.84377933, "num_input_tokens_seen": 158788215, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.796875, "step": 7405, "time_per_iteration": 2.5580122470855713 }, { "auxiliary_loss_clip": 0.01134317, "auxiliary_loss_mlp": 0.01026963, "balance_loss_clip": 1.01390958, "balance_loss_mlp": 1.03992748, "epoch": 0.4452728092589809, "flos": 21617398258560.0, "grad_norm": 1.7660023071024509, "language_loss": 0.7491129, "learning_rate": 2.342499602433985e-06, "loss": 0.77072573, "num_input_tokens_seen": 158809090, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.765625, "step": 7406, "time_per_iteration": 2.5928759574890137 }, { "auxiliary_loss_clip": 0.01141669, "auxiliary_loss_mlp": 0.01031295, "balance_loss_clip": 1.01842642, "balance_loss_mlp": 1.04071152, "epoch": 0.4453329325116489, "flos": 29497491757440.0, "grad_norm": 2.511558596105103, "language_loss": 0.6539253, "learning_rate": 2.3421273553240534e-06, "loss": 0.67565489, "num_input_tokens_seen": 158828320, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 7407, "time_per_iteration": 2.631416082382202 }, { "auxiliary_loss_clip": 0.0114369, "auxiliary_loss_mlp": 0.0103387, "balance_loss_clip": 1.02040505, "balance_loss_mlp": 1.04631567, "epoch": 0.44539305576431687, "flos": 21361139654400.0, "grad_norm": 1.5646528520411587, "language_loss": 0.68065834, "learning_rate": 2.3417550960044765e-06, "loss": 0.70243394, "num_input_tokens_seen": 158847040, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.79296875, "step": 7408, "time_per_iteration": 2.584040641784668 }, { "auxiliary_loss_clip": 0.01120031, "auxiliary_loss_mlp": 0.01037489, "balance_loss_clip": 1.022802, "balance_loss_mlp": 1.04150903, "epoch": 0.44545317901698483, "flos": 41427626428800.0, "grad_norm": 1.6129010757994144, "language_loss": 0.71872574, "learning_rate": 2.3413828244885386e-06, "loss": 0.74030095, "num_input_tokens_seen": 158870490, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78515625, "step": 7409, "time_per_iteration": 4.039022207260132 }, { "auxiliary_loss_clip": 0.01139054, "auxiliary_loss_mlp": 0.01035531, "balance_loss_clip": 1.02095211, "balance_loss_mlp": 1.04248869, "epoch": 0.4455133022696528, "flos": 22055005653120.0, "grad_norm": 1.725867512716566, "language_loss": 0.65264094, "learning_rate": 2.341010540789527e-06, "loss": 0.67438674, "num_input_tokens_seen": 158889920, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 7410, "time_per_iteration": 2.656151056289673 }, { "auxiliary_loss_clip": 0.01143509, "auxiliary_loss_mlp": 0.01035502, "balance_loss_clip": 1.02134597, "balance_loss_mlp": 1.04319692, "epoch": 0.44557342552232077, "flos": 23476960333440.0, "grad_norm": 2.1181704081799975, "language_loss": 0.74153543, "learning_rate": 2.340638244920725e-06, "loss": 0.76332557, "num_input_tokens_seen": 158909580, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.8203125, "step": 7411, "time_per_iteration": 2.526582956314087 }, { "auxiliary_loss_clip": 0.01162799, "auxiliary_loss_mlp": 0.01032525, "balance_loss_clip": 1.01936448, "balance_loss_mlp": 1.04261374, "epoch": 0.44563354877498873, "flos": 19134678107520.0, "grad_norm": 1.9182543200904305, "language_loss": 0.79038769, "learning_rate": 2.3402659368954214e-06, "loss": 0.81234097, "num_input_tokens_seen": 158924600, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75390625, "step": 7412, "time_per_iteration": 2.673701047897339 }, { "auxiliary_loss_clip": 0.01140125, "auxiliary_loss_mlp": 0.0103692, "balance_loss_clip": 1.02329433, "balance_loss_mlp": 1.04239655, "epoch": 0.4456936720276567, "flos": 13621420506240.0, "grad_norm": 1.8360185736248351, "language_loss": 0.79523033, "learning_rate": 2.3398936167269016e-06, "loss": 0.81700081, "num_input_tokens_seen": 158939345, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.796875, "step": 7413, "time_per_iteration": 2.498082399368286 }, { "auxiliary_loss_clip": 0.01126771, "auxiliary_loss_mlp": 0.01033718, "balance_loss_clip": 1.02083135, "balance_loss_mlp": 1.04233754, "epoch": 0.44575379528032466, "flos": 14713715139840.0, "grad_norm": 1.916257218150209, "language_loss": 0.75912523, "learning_rate": 2.3395212844284525e-06, "loss": 0.78073013, "num_input_tokens_seen": 158955855, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75390625, "step": 7414, "time_per_iteration": 3.9192590713500977 }, { "auxiliary_loss_clip": 0.01136932, "auxiliary_loss_mlp": 0.01035847, "balance_loss_clip": 1.02225101, "balance_loss_mlp": 1.04289114, "epoch": 0.4458139185329926, "flos": 24170682677760.0, "grad_norm": 1.479761654370696, "language_loss": 0.83393252, "learning_rate": 2.339148940013362e-06, "loss": 0.85566026, "num_input_tokens_seen": 158976315, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76171875, "step": 7415, "time_per_iteration": 2.650128126144409 }, { "auxiliary_loss_clip": 0.01117018, "auxiliary_loss_mlp": 0.01038347, "balance_loss_clip": 1.02522206, "balance_loss_mlp": 1.04043114, "epoch": 0.4458740417856606, "flos": 21762225895680.0, "grad_norm": 1.4995482977898662, "language_loss": 0.83774889, "learning_rate": 2.338776583494919e-06, "loss": 0.85930252, "num_input_tokens_seen": 158996725, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.765625, "step": 7416, "time_per_iteration": 2.565877914428711 }, { "auxiliary_loss_clip": 0.0114654, "auxiliary_loss_mlp": 0.01036645, "balance_loss_clip": 1.02274513, "balance_loss_mlp": 1.04080629, "epoch": 0.44593416503832856, "flos": 21068790860160.0, "grad_norm": 1.5254546551953836, "language_loss": 0.81012398, "learning_rate": 2.3384042148864113e-06, "loss": 0.83195585, "num_input_tokens_seen": 159017255, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7890625, "step": 7417, "time_per_iteration": 4.067098379135132 }, { "auxiliary_loss_clip": 0.01140211, "auxiliary_loss_mlp": 0.01042716, "balance_loss_clip": 1.0283637, "balance_loss_mlp": 1.04262972, "epoch": 0.4459942882909965, "flos": 22600488568320.0, "grad_norm": 1.9167788057068087, "language_loss": 0.8098793, "learning_rate": 2.338031834201127e-06, "loss": 0.83170861, "num_input_tokens_seen": 159035010, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.796875, "step": 7418, "time_per_iteration": 2.577928304672241 }, { "auxiliary_loss_clip": 0.01130428, "auxiliary_loss_mlp": 0.01277828, "balance_loss_clip": 1.01727378, "balance_loss_mlp": 1.04254496, "epoch": 0.4460544115436645, "flos": 26505486622080.0, "grad_norm": 1.7986362284497848, "language_loss": 0.77280259, "learning_rate": 2.3376594414523565e-06, "loss": 0.79688507, "num_input_tokens_seen": 159055345, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7890625, "step": 7419, "time_per_iteration": 2.543470621109009 }, { "auxiliary_loss_clip": 0.01144544, "auxiliary_loss_mlp": 0.0103994, "balance_loss_clip": 1.0270834, "balance_loss_mlp": 1.04105258, "epoch": 0.4461145347963325, "flos": 17604021893760.0, "grad_norm": 2.533120700373172, "language_loss": 0.72215456, "learning_rate": 2.3372870366533885e-06, "loss": 0.74399942, "num_input_tokens_seen": 159074225, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.765625, "step": 7420, "time_per_iteration": 4.014182090759277 }, { "auxiliary_loss_clip": 0.01138346, "auxiliary_loss_mlp": 0.01032832, "balance_loss_clip": 1.01815748, "balance_loss_mlp": 1.04457045, "epoch": 0.44617465804900047, "flos": 27268193036160.0, "grad_norm": 1.5624554640982602, "language_loss": 0.74877274, "learning_rate": 2.3369146198175136e-06, "loss": 0.77048457, "num_input_tokens_seen": 159095415, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7578125, "step": 7421, "time_per_iteration": 2.5529837608337402 }, { "auxiliary_loss_clip": 0.01162744, "auxiliary_loss_mlp": 0.01038662, "balance_loss_clip": 1.02527452, "balance_loss_mlp": 1.04393888, "epoch": 0.44623478130166844, "flos": 17786412178560.0, "grad_norm": 1.7996298091050413, "language_loss": 0.75598824, "learning_rate": 2.3365421909580234e-06, "loss": 0.77800226, "num_input_tokens_seen": 159114615, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73828125, "step": 7422, "time_per_iteration": 2.6259326934814453 }, { "auxiliary_loss_clip": 0.01119732, "auxiliary_loss_mlp": 0.01035783, "balance_loss_clip": 1.02249146, "balance_loss_mlp": 1.04264879, "epoch": 0.4462949045543364, "flos": 23003011353600.0, "grad_norm": 1.4407956790197833, "language_loss": 0.64906251, "learning_rate": 2.3361697500882074e-06, "loss": 0.67061764, "num_input_tokens_seen": 159134370, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.76953125, "step": 7423, "time_per_iteration": 2.5756101608276367 }, { "auxiliary_loss_clip": 0.01134325, "auxiliary_loss_mlp": 0.01033483, "balance_loss_clip": 1.02103806, "balance_loss_mlp": 1.04218197, "epoch": 0.44635502780700437, "flos": 17820096157440.0, "grad_norm": 1.4853418271339534, "language_loss": 0.78860748, "learning_rate": 2.3357972972213585e-06, "loss": 0.81028551, "num_input_tokens_seen": 159152540, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.74609375, "step": 7424, "time_per_iteration": 2.587707042694092 }, { "auxiliary_loss_clip": 0.01126514, "auxiliary_loss_mlp": 0.0103508, "balance_loss_clip": 1.02244985, "balance_loss_mlp": 1.04341125, "epoch": 0.44641515105967233, "flos": 26688020561280.0, "grad_norm": 1.4404802676224444, "language_loss": 0.8015486, "learning_rate": 2.3354248323707675e-06, "loss": 0.82316452, "num_input_tokens_seen": 159173425, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7421875, "step": 7425, "time_per_iteration": 2.5940425395965576 }, { "auxiliary_loss_clip": 0.01127417, "auxiliary_loss_mlp": 0.01032245, "balance_loss_clip": 1.01923323, "balance_loss_mlp": 1.04299057, "epoch": 0.4464752743123403, "flos": 18913324544640.0, "grad_norm": 1.5668757457358742, "language_loss": 0.7746188, "learning_rate": 2.3350523555497265e-06, "loss": 0.79621542, "num_input_tokens_seen": 159191210, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7578125, "step": 7426, "time_per_iteration": 2.5598340034484863 }, { "auxiliary_loss_clip": 0.01137643, "auxiliary_loss_mlp": 0.0127627, "balance_loss_clip": 1.01644075, "balance_loss_mlp": 1.04159451, "epoch": 0.44653539756500826, "flos": 29570318582400.0, "grad_norm": 1.6755476359247912, "language_loss": 0.64326572, "learning_rate": 2.3346798667715296e-06, "loss": 0.66740483, "num_input_tokens_seen": 159211755, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.78125, "step": 7427, "time_per_iteration": 2.6256022453308105 }, { "auxiliary_loss_clip": 0.01133134, "auxiliary_loss_mlp": 0.0103032, "balance_loss_clip": 1.01742125, "balance_loss_mlp": 1.04826617, "epoch": 0.44659552081767623, "flos": 21468979261440.0, "grad_norm": 1.7160306900209628, "language_loss": 0.75161064, "learning_rate": 2.3343073660494685e-06, "loss": 0.77324522, "num_input_tokens_seen": 159230315, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7578125, "step": 7428, "time_per_iteration": 2.536612033843994 }, { "auxiliary_loss_clip": 0.01136952, "auxiliary_loss_mlp": 0.01035551, "balance_loss_clip": 1.02212834, "balance_loss_mlp": 1.04241478, "epoch": 0.4466556440703442, "flos": 17931886260480.0, "grad_norm": 1.5831887899557686, "language_loss": 0.77622879, "learning_rate": 2.333934853396838e-06, "loss": 0.79795378, "num_input_tokens_seen": 159249810, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76171875, "step": 7429, "time_per_iteration": 2.5444767475128174 }, { "auxiliary_loss_clip": 0.01130753, "auxiliary_loss_mlp": 0.01033104, "balance_loss_clip": 1.01944244, "balance_loss_mlp": 1.04364872, "epoch": 0.44671576732301216, "flos": 21107430915840.0, "grad_norm": 1.559041272864665, "language_loss": 0.90729213, "learning_rate": 2.3335623288269313e-06, "loss": 0.92893076, "num_input_tokens_seen": 159271715, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 7430, "time_per_iteration": 2.591414451599121 }, { "auxiliary_loss_clip": 0.01142955, "auxiliary_loss_mlp": 0.01284374, "balance_loss_clip": 1.02221918, "balance_loss_mlp": 1.04417026, "epoch": 0.4467758905756801, "flos": 23508920459520.0, "grad_norm": 2.6549208887624647, "language_loss": 0.79920757, "learning_rate": 2.333189792353043e-06, "loss": 0.82348084, "num_input_tokens_seen": 159290690, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8125, "step": 7431, "time_per_iteration": 2.595726251602173 }, { "auxiliary_loss_clip": 0.01142044, "auxiliary_loss_mlp": 0.01037631, "balance_loss_clip": 1.02383852, "balance_loss_mlp": 1.0452261, "epoch": 0.4468360138283481, "flos": 18734022829440.0, "grad_norm": 2.2505676733654782, "language_loss": 0.80043823, "learning_rate": 2.3328172439884687e-06, "loss": 0.82223499, "num_input_tokens_seen": 159309400, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.79296875, "step": 7432, "time_per_iteration": 2.553452730178833 }, { "auxiliary_loss_clip": 0.01149569, "auxiliary_loss_mlp": 0.01033795, "balance_loss_clip": 1.02062809, "balance_loss_mlp": 1.04460835, "epoch": 0.4468961370810161, "flos": 23477139901440.0, "grad_norm": 1.9566644585719328, "language_loss": 0.76514947, "learning_rate": 2.3324446837465023e-06, "loss": 0.78698313, "num_input_tokens_seen": 159327425, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.78515625, "step": 7433, "time_per_iteration": 2.6965625286102295 }, { "auxiliary_loss_clip": 0.01124875, "auxiliary_loss_mlp": 0.01033803, "balance_loss_clip": 1.02212095, "balance_loss_mlp": 1.04209232, "epoch": 0.4469562603336841, "flos": 30075042539520.0, "grad_norm": 1.78016570996845, "language_loss": 0.77268171, "learning_rate": 2.332072111640441e-06, "loss": 0.79426849, "num_input_tokens_seen": 159345805, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.73828125, "step": 7434, "time_per_iteration": 2.559323787689209 }, { "auxiliary_loss_clip": 0.01135244, "auxiliary_loss_mlp": 0.01031735, "balance_loss_clip": 1.01834154, "balance_loss_mlp": 1.04704714, "epoch": 0.44701638358635204, "flos": 22456415116800.0, "grad_norm": 1.6007612815142684, "language_loss": 0.64277279, "learning_rate": 2.33169952768358e-06, "loss": 0.66444266, "num_input_tokens_seen": 159364595, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.79296875, "step": 7435, "time_per_iteration": 2.6083192825317383 }, { "auxiliary_loss_clip": 0.01141066, "auxiliary_loss_mlp": 0.01029556, "balance_loss_clip": 1.01617503, "balance_loss_mlp": 1.04517996, "epoch": 0.44707650683902, "flos": 24057851080320.0, "grad_norm": 3.0170342452985306, "language_loss": 0.83889806, "learning_rate": 2.331326931889215e-06, "loss": 0.86060429, "num_input_tokens_seen": 159385265, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.78125, "step": 7436, "time_per_iteration": 2.5604946613311768 }, { "auxiliary_loss_clip": 0.01159679, "auxiliary_loss_mlp": 0.01034996, "balance_loss_clip": 1.02024937, "balance_loss_mlp": 1.04434466, "epoch": 0.44713663009168797, "flos": 23766938830080.0, "grad_norm": 1.9551654285960571, "language_loss": 0.79403633, "learning_rate": 2.3309543242706454e-06, "loss": 0.81598306, "num_input_tokens_seen": 159405080, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.79296875, "step": 7437, "time_per_iteration": 2.6204640865325928 }, { "auxiliary_loss_clip": 0.01159537, "auxiliary_loss_mlp": 0.01034698, "balance_loss_clip": 1.02096462, "balance_loss_mlp": 1.04466045, "epoch": 0.44719675334435594, "flos": 24499265316480.0, "grad_norm": 2.755680277767802, "language_loss": 0.71807766, "learning_rate": 2.3305817048411667e-06, "loss": 0.74001998, "num_input_tokens_seen": 159424595, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.796875, "step": 7438, "time_per_iteration": 2.5858428478240967 }, { "auxiliary_loss_clip": 0.01158662, "auxiliary_loss_mlp": 0.01040432, "balance_loss_clip": 1.02507782, "balance_loss_mlp": 1.04363012, "epoch": 0.4472568765970239, "flos": 29781759991680.0, "grad_norm": 1.9951332455818334, "language_loss": 0.67411935, "learning_rate": 2.3302090736140772e-06, "loss": 0.69611037, "num_input_tokens_seen": 159443865, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.79296875, "step": 7439, "time_per_iteration": 2.668304443359375 }, { "auxiliary_loss_clip": 0.01144981, "auxiliary_loss_mlp": 0.01039412, "balance_loss_clip": 1.02439201, "balance_loss_mlp": 1.04664361, "epoch": 0.44731699984969187, "flos": 24643123286400.0, "grad_norm": 1.8353087977352467, "language_loss": 0.73723739, "learning_rate": 2.3298364306026757e-06, "loss": 0.75908136, "num_input_tokens_seen": 159464525, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 7440, "time_per_iteration": 2.555208444595337 }, { "auxiliary_loss_clip": 0.01156241, "auxiliary_loss_mlp": 0.01035059, "balance_loss_clip": 1.02162385, "balance_loss_mlp": 1.04331255, "epoch": 0.44737712310235983, "flos": 29455691304960.0, "grad_norm": 1.5934256697001552, "language_loss": 0.73988497, "learning_rate": 2.32946377582026e-06, "loss": 0.76179796, "num_input_tokens_seen": 159486385, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.77734375, "step": 7441, "time_per_iteration": 2.6921801567077637 }, { "auxiliary_loss_clip": 0.01150753, "auxiliary_loss_mlp": 0.01037068, "balance_loss_clip": 1.02259612, "balance_loss_mlp": 1.04436791, "epoch": 0.4474372463550278, "flos": 24896832024960.0, "grad_norm": 1.8511661661147527, "language_loss": 0.74899817, "learning_rate": 2.32909110928013e-06, "loss": 0.77087635, "num_input_tokens_seen": 159503880, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 7442, "time_per_iteration": 2.5541486740112305 }, { "auxiliary_loss_clip": 0.01127101, "auxiliary_loss_mlp": 0.01036244, "balance_loss_clip": 1.02139688, "balance_loss_mlp": 1.04458094, "epoch": 0.44749736960769576, "flos": 33181603125120.0, "grad_norm": 1.8352444734597946, "language_loss": 0.74149811, "learning_rate": 2.3287184309955847e-06, "loss": 0.76313156, "num_input_tokens_seen": 159522980, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.82421875, "step": 7443, "time_per_iteration": 2.6654763221740723 }, { "auxiliary_loss_clip": 0.01138914, "auxiliary_loss_mlp": 0.01031507, "balance_loss_clip": 1.01636767, "balance_loss_mlp": 1.0423578, "epoch": 0.4475574928603637, "flos": 21071807602560.0, "grad_norm": 1.9947184086332232, "language_loss": 0.78002703, "learning_rate": 2.328345740979924e-06, "loss": 0.80173123, "num_input_tokens_seen": 159543340, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7890625, "step": 7444, "time_per_iteration": 2.594310998916626 }, { "auxiliary_loss_clip": 0.01129384, "auxiliary_loss_mlp": 0.01030604, "balance_loss_clip": 1.01642418, "balance_loss_mlp": 1.04318237, "epoch": 0.4476176161130317, "flos": 21862523646720.0, "grad_norm": 2.187227003918339, "language_loss": 0.84521127, "learning_rate": 2.3279730392464486e-06, "loss": 0.86681116, "num_input_tokens_seen": 159558210, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76953125, "step": 7445, "time_per_iteration": 2.5748469829559326 }, { "auxiliary_loss_clip": 0.01151953, "auxiliary_loss_mlp": 0.01031302, "balance_loss_clip": 1.01610339, "balance_loss_mlp": 1.04639697, "epoch": 0.4476777393656997, "flos": 22528667324160.0, "grad_norm": 2.045859098708062, "language_loss": 0.63947278, "learning_rate": 2.3276003258084593e-06, "loss": 0.66130531, "num_input_tokens_seen": 159577920, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7890625, "step": 7446, "time_per_iteration": 2.585329294204712 }, { "auxiliary_loss_clip": 0.01052917, "auxiliary_loss_mlp": 0.01008834, "balance_loss_clip": 1.00704598, "balance_loss_mlp": 1.01970541, "epoch": 0.4477378626183677, "flos": 49017133877760.0, "grad_norm": 0.7438979625378003, "language_loss": 0.5038259, "learning_rate": 2.327227600679257e-06, "loss": 0.52444339, "num_input_tokens_seen": 159632295, "router_z_loss_clip": 0.01782227, "router_z_loss_mlp": 0.24414062, "step": 7447, "time_per_iteration": 3.0427567958831787 }, { "auxiliary_loss_clip": 0.01045022, "auxiliary_loss_mlp": 0.01005148, "balance_loss_clip": 1.00333583, "balance_loss_mlp": 1.02041447, "epoch": 0.44779798587103564, "flos": 56542179392640.0, "grad_norm": 0.7738037928093924, "language_loss": 0.59290439, "learning_rate": 2.326854863872143e-06, "loss": 0.61340612, "num_input_tokens_seen": 159698435, "router_z_loss_clip": 0.01806641, "router_z_loss_mlp": 0.24609375, "step": 7448, "time_per_iteration": 3.1809945106506348 }, { "auxiliary_loss_clip": 0.01148415, "auxiliary_loss_mlp": 0.01034096, "balance_loss_clip": 1.02098346, "balance_loss_mlp": 1.04383945, "epoch": 0.4478581091237036, "flos": 46498536040320.0, "grad_norm": 1.6405621013883167, "language_loss": 0.58627486, "learning_rate": 2.32648211540042e-06, "loss": 0.6081, "num_input_tokens_seen": 159722150, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.78125, "step": 7449, "time_per_iteration": 2.76273250579834 }, { "auxiliary_loss_clip": 0.01140439, "auxiliary_loss_mlp": 0.01031171, "balance_loss_clip": 1.01815343, "balance_loss_mlp": 1.04419804, "epoch": 0.4479182323763716, "flos": 20814363849600.0, "grad_norm": 1.7453055270041011, "language_loss": 0.80118257, "learning_rate": 2.32610935527739e-06, "loss": 0.82289875, "num_input_tokens_seen": 159740550, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.78515625, "step": 7450, "time_per_iteration": 3.967738628387451 }, { "auxiliary_loss_clip": 0.01123315, "auxiliary_loss_mlp": 0.01043477, "balance_loss_clip": 1.02967894, "balance_loss_mlp": 1.04599643, "epoch": 0.44797835562903954, "flos": 14245979212800.0, "grad_norm": 2.2399844963419553, "language_loss": 0.78864884, "learning_rate": 2.3257365835163562e-06, "loss": 0.81031674, "num_input_tokens_seen": 159758245, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 7451, "time_per_iteration": 2.4842946529388428 }, { "auxiliary_loss_clip": 0.01078076, "auxiliary_loss_mlp": 0.010067, "balance_loss_clip": 1.00491238, "balance_loss_mlp": 1.01869202, "epoch": 0.4480384788817075, "flos": 63534560169600.0, "grad_norm": 0.8360630428325453, "language_loss": 0.62805486, "learning_rate": 2.325363800130621e-06, "loss": 0.64890254, "num_input_tokens_seen": 159826790, "router_z_loss_clip": 0.01782227, "router_z_loss_mlp": 0.24414062, "step": 7452, "time_per_iteration": 3.2640130519866943 }, { "auxiliary_loss_clip": 0.0112524, "auxiliary_loss_mlp": 0.01034817, "balance_loss_clip": 1.02013087, "balance_loss_mlp": 1.04669809, "epoch": 0.44809860213437547, "flos": 21652626522240.0, "grad_norm": 1.9067780001701344, "language_loss": 0.62619174, "learning_rate": 2.324991005133489e-06, "loss": 0.64779234, "num_input_tokens_seen": 159845805, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78515625, "step": 7453, "time_per_iteration": 2.5933265686035156 }, { "auxiliary_loss_clip": 0.0105211, "auxiliary_loss_mlp": 0.01007124, "balance_loss_clip": 1.00516915, "balance_loss_mlp": 1.01846981, "epoch": 0.44815872538704343, "flos": 69190634246400.0, "grad_norm": 0.7571947663757237, "language_loss": 0.57031906, "learning_rate": 2.324618198538264e-06, "loss": 0.59091139, "num_input_tokens_seen": 159898860, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.24414062, "step": 7454, "time_per_iteration": 3.0563783645629883 }, { "auxiliary_loss_clip": 0.01140113, "auxiliary_loss_mlp": 0.01032887, "balance_loss_clip": 1.01932096, "balance_loss_mlp": 1.04340959, "epoch": 0.4482188486397114, "flos": 12598289510400.0, "grad_norm": 1.9485057156046333, "language_loss": 0.74749911, "learning_rate": 2.3242453803582505e-06, "loss": 0.76922905, "num_input_tokens_seen": 159911555, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7890625, "step": 7455, "time_per_iteration": 3.9595789909362793 }, { "auxiliary_loss_clip": 0.01140316, "auxiliary_loss_mlp": 0.01033072, "balance_loss_clip": 1.01950634, "balance_loss_mlp": 1.04464078, "epoch": 0.44827897189237936, "flos": 34058182631040.0, "grad_norm": 2.468569336252077, "language_loss": 0.76013291, "learning_rate": 2.3238725506067535e-06, "loss": 0.78186679, "num_input_tokens_seen": 159931470, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.78125, "step": 7456, "time_per_iteration": 2.6971828937530518 }, { "auxiliary_loss_clip": 0.01124987, "auxiliary_loss_mlp": 0.01039889, "balance_loss_clip": 1.02582824, "balance_loss_mlp": 1.04831719, "epoch": 0.44833909514504733, "flos": 25147416280320.0, "grad_norm": 1.9842559674399103, "language_loss": 0.76398587, "learning_rate": 2.3234997092970786e-06, "loss": 0.78563464, "num_input_tokens_seen": 159946115, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 7457, "time_per_iteration": 2.5320467948913574 }, { "auxiliary_loss_clip": 0.01132128, "auxiliary_loss_mlp": 0.01034448, "balance_loss_clip": 1.01938653, "balance_loss_mlp": 1.04345298, "epoch": 0.4483992183977153, "flos": 16179984224640.0, "grad_norm": 2.823942025862574, "language_loss": 0.68259257, "learning_rate": 2.3231268564425305e-06, "loss": 0.70425832, "num_input_tokens_seen": 159963915, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 7458, "time_per_iteration": 4.1238625049591064 }, { "auxiliary_loss_clip": 0.01151428, "auxiliary_loss_mlp": 0.01037372, "balance_loss_clip": 1.02225673, "balance_loss_mlp": 1.04417634, "epoch": 0.44845934165038326, "flos": 17746048270080.0, "grad_norm": 2.6460271650594103, "language_loss": 0.71668506, "learning_rate": 2.322753992056417e-06, "loss": 0.73857307, "num_input_tokens_seen": 159982140, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8046875, "step": 7459, "time_per_iteration": 2.5290064811706543 }, { "auxiliary_loss_clip": 0.01129991, "auxiliary_loss_mlp": 0.01033769, "balance_loss_clip": 1.019678, "balance_loss_mlp": 1.04275858, "epoch": 0.4485194649030513, "flos": 21835914647040.0, "grad_norm": 1.6639328432267637, "language_loss": 0.69494176, "learning_rate": 2.3223811161520425e-06, "loss": 0.71657938, "num_input_tokens_seen": 160002280, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 7460, "time_per_iteration": 2.577505588531494 }, { "auxiliary_loss_clip": 0.01122017, "auxiliary_loss_mlp": 0.01037486, "balance_loss_clip": 1.02342558, "balance_loss_mlp": 1.04340112, "epoch": 0.44857958815571924, "flos": 20084515401600.0, "grad_norm": 2.5318045797129907, "language_loss": 0.76762915, "learning_rate": 2.3220082287427163e-06, "loss": 0.78922415, "num_input_tokens_seen": 160020260, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78515625, "step": 7461, "time_per_iteration": 4.121611595153809 }, { "auxiliary_loss_clip": 0.01121876, "auxiliary_loss_mlp": 0.01035505, "balance_loss_clip": 1.02160478, "balance_loss_mlp": 1.04180253, "epoch": 0.4486397114083872, "flos": 27053519402880.0, "grad_norm": 1.5802928006119215, "language_loss": 0.67732936, "learning_rate": 2.321635329841745e-06, "loss": 0.69890314, "num_input_tokens_seen": 160040240, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.80078125, "step": 7462, "time_per_iteration": 2.5575432777404785 }, { "auxiliary_loss_clip": 0.01039961, "auxiliary_loss_mlp": 0.01000204, "balance_loss_clip": 0.99832076, "balance_loss_mlp": 1.01539314, "epoch": 0.4486998346610552, "flos": 67321195931520.0, "grad_norm": 0.7471397939334979, "language_loss": 0.54474241, "learning_rate": 2.3212624194624354e-06, "loss": 0.56514406, "num_input_tokens_seen": 160093865, "router_z_loss_clip": 0.01879883, "router_z_loss_mlp": 0.24609375, "step": 7463, "time_per_iteration": 3.175269842147827 }, { "auxiliary_loss_clip": 0.01130244, "auxiliary_loss_mlp": 0.01032353, "balance_loss_clip": 1.01833391, "balance_loss_mlp": 1.04370427, "epoch": 0.44875995791372314, "flos": 27636816360960.0, "grad_norm": 1.912517896205239, "language_loss": 0.75496817, "learning_rate": 2.3208894976180965e-06, "loss": 0.77659416, "num_input_tokens_seen": 160113590, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.77734375, "step": 7464, "time_per_iteration": 2.6331048011779785 }, { "auxiliary_loss_clip": 0.0113105, "auxiliary_loss_mlp": 0.01038134, "balance_loss_clip": 1.02509856, "balance_loss_mlp": 1.04527867, "epoch": 0.4488200811663911, "flos": 13005947940480.0, "grad_norm": 1.9396352207746745, "language_loss": 0.73766041, "learning_rate": 2.3205165643220364e-06, "loss": 0.75935221, "num_input_tokens_seen": 160131795, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.76953125, "step": 7465, "time_per_iteration": 2.5660691261291504 }, { "auxiliary_loss_clip": 0.01147127, "auxiliary_loss_mlp": 0.01043554, "balance_loss_clip": 1.02758002, "balance_loss_mlp": 1.04536247, "epoch": 0.44888020441905907, "flos": 27489977562240.0, "grad_norm": 2.193908998508522, "language_loss": 0.80166459, "learning_rate": 2.3201436195875655e-06, "loss": 0.82357138, "num_input_tokens_seen": 160150635, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8359375, "step": 7466, "time_per_iteration": 2.6402790546417236 }, { "auxiliary_loss_clip": 0.0114258, "auxiliary_loss_mlp": 0.01036108, "balance_loss_clip": 1.0217433, "balance_loss_mlp": 1.04535747, "epoch": 0.44894032767172704, "flos": 18259678800000.0, "grad_norm": 3.083075655358689, "language_loss": 0.80316907, "learning_rate": 2.3197706634279916e-06, "loss": 0.82495594, "num_input_tokens_seen": 160168615, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.796875, "step": 7467, "time_per_iteration": 2.547501564025879 }, { "auxiliary_loss_clip": 0.01156685, "auxiliary_loss_mlp": 0.01036895, "balance_loss_clip": 1.02360308, "balance_loss_mlp": 1.04559445, "epoch": 0.449000450924395, "flos": 21579835610880.0, "grad_norm": 1.8611676486712323, "language_loss": 0.7492733, "learning_rate": 2.3193976958566256e-06, "loss": 0.77120912, "num_input_tokens_seen": 160187295, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 7468, "time_per_iteration": 2.576136350631714 }, { "auxiliary_loss_clip": 0.01139989, "auxiliary_loss_mlp": 0.01030212, "balance_loss_clip": 1.01737356, "balance_loss_mlp": 1.04492712, "epoch": 0.44906057417706297, "flos": 17967904623360.0, "grad_norm": 1.7367423325379066, "language_loss": 0.70054793, "learning_rate": 2.3190247168867775e-06, "loss": 0.72224998, "num_input_tokens_seen": 160205115, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7734375, "step": 7469, "time_per_iteration": 2.5444295406341553 }, { "auxiliary_loss_clip": 0.01133356, "auxiliary_loss_mlp": 0.01041929, "balance_loss_clip": 1.02739739, "balance_loss_mlp": 1.0447886, "epoch": 0.44912069742973093, "flos": 20047347803520.0, "grad_norm": 1.906065537759765, "language_loss": 0.7149148, "learning_rate": 2.3186517265317575e-06, "loss": 0.73666763, "num_input_tokens_seen": 160222580, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.79296875, "step": 7470, "time_per_iteration": 2.5218329429626465 }, { "auxiliary_loss_clip": 0.0114895, "auxiliary_loss_mlp": 0.01036352, "balance_loss_clip": 1.02156997, "balance_loss_mlp": 1.04202688, "epoch": 0.4491808206823989, "flos": 21033526682880.0, "grad_norm": 2.069283598900922, "language_loss": 0.77084005, "learning_rate": 2.3182787248048776e-06, "loss": 0.79269314, "num_input_tokens_seen": 160241520, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 7471, "time_per_iteration": 2.5485615730285645 }, { "auxiliary_loss_clip": 0.01122648, "auxiliary_loss_mlp": 0.01037562, "balance_loss_clip": 1.02374589, "balance_loss_mlp": 1.04441237, "epoch": 0.44924094393506686, "flos": 22967136645120.0, "grad_norm": 1.7791441064422582, "language_loss": 0.70159364, "learning_rate": 2.317905711719448e-06, "loss": 0.72319579, "num_input_tokens_seen": 160261815, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 7472, "time_per_iteration": 2.5667052268981934 }, { "auxiliary_loss_clip": 0.01047188, "auxiliary_loss_mlp": 0.01003762, "balance_loss_clip": 1.00203347, "balance_loss_mlp": 1.01364589, "epoch": 0.4493010671877349, "flos": 59233467864960.0, "grad_norm": 0.7382045435319519, "language_loss": 0.61692345, "learning_rate": 2.3175326872887823e-06, "loss": 0.63743293, "num_input_tokens_seen": 160317070, "router_z_loss_clip": 0.01733398, "router_z_loss_mlp": 0.24414062, "step": 7473, "time_per_iteration": 3.0782158374786377 }, { "auxiliary_loss_clip": 0.01122541, "auxiliary_loss_mlp": 0.01288465, "balance_loss_clip": 1.02682757, "balance_loss_mlp": 1.04354537, "epoch": 0.44936119044040285, "flos": 18004892653440.0, "grad_norm": 2.105905937177988, "language_loss": 0.77418464, "learning_rate": 2.3171596515261907e-06, "loss": 0.79829466, "num_input_tokens_seen": 160334980, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 7474, "time_per_iteration": 2.5226833820343018 }, { "auxiliary_loss_clip": 0.01157441, "auxiliary_loss_mlp": 0.01038284, "balance_loss_clip": 1.02394319, "balance_loss_mlp": 1.0443368, "epoch": 0.4494213136930708, "flos": 21251827589760.0, "grad_norm": 2.6917401073290352, "language_loss": 0.71997529, "learning_rate": 2.3167866044449876e-06, "loss": 0.74193251, "num_input_tokens_seen": 160354500, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.76953125, "step": 7475, "time_per_iteration": 2.56807541847229 }, { "auxiliary_loss_clip": 0.0115619, "auxiliary_loss_mlp": 0.01034718, "balance_loss_clip": 1.02102697, "balance_loss_mlp": 1.04159415, "epoch": 0.4494814369457388, "flos": 27418695022080.0, "grad_norm": 2.1810868640068914, "language_loss": 0.76809752, "learning_rate": 2.3164135460584853e-06, "loss": 0.79000658, "num_input_tokens_seen": 160373650, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.79296875, "step": 7476, "time_per_iteration": 2.696176528930664 }, { "auxiliary_loss_clip": 0.0113317, "auxiliary_loss_mlp": 0.01287567, "balance_loss_clip": 1.02535009, "balance_loss_mlp": 1.04291785, "epoch": 0.44954156019840674, "flos": 22854053652480.0, "grad_norm": 2.2520494491121608, "language_loss": 0.720505, "learning_rate": 2.316040476379998e-06, "loss": 0.74471241, "num_input_tokens_seen": 160393430, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8125, "step": 7477, "time_per_iteration": 2.530399799346924 }, { "auxiliary_loss_clip": 0.01146275, "auxiliary_loss_mlp": 0.01037694, "balance_loss_clip": 1.02288842, "balance_loss_mlp": 1.0461272, "epoch": 0.4496016834510747, "flos": 17201570935680.0, "grad_norm": 2.555970281109264, "language_loss": 0.67562628, "learning_rate": 2.3156673954228385e-06, "loss": 0.69746602, "num_input_tokens_seen": 160410545, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 7478, "time_per_iteration": 2.601849317550659 }, { "auxiliary_loss_clip": 0.01139772, "auxiliary_loss_mlp": 0.01040079, "balance_loss_clip": 1.02642393, "balance_loss_mlp": 1.04510856, "epoch": 0.4496618067037427, "flos": 18916628595840.0, "grad_norm": 2.1529748356443843, "language_loss": 0.89139819, "learning_rate": 2.315294303200322e-06, "loss": 0.91319668, "num_input_tokens_seen": 160428105, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76953125, "step": 7479, "time_per_iteration": 2.551356077194214 }, { "auxiliary_loss_clip": 0.01170086, "auxiliary_loss_mlp": 0.01041304, "balance_loss_clip": 1.02639079, "balance_loss_mlp": 1.04621625, "epoch": 0.44972192995641064, "flos": 21031659175680.0, "grad_norm": 1.9977521453744804, "language_loss": 0.7562356, "learning_rate": 2.314921199725762e-06, "loss": 0.77834952, "num_input_tokens_seen": 160448815, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.79296875, "step": 7480, "time_per_iteration": 2.664945602416992 }, { "auxiliary_loss_clip": 0.01152124, "auxiliary_loss_mlp": 0.01037598, "balance_loss_clip": 1.02313805, "balance_loss_mlp": 1.04455948, "epoch": 0.4497820532090786, "flos": 20777088510720.0, "grad_norm": 2.5458778791475196, "language_loss": 0.7905907, "learning_rate": 2.3145480850124754e-06, "loss": 0.81248796, "num_input_tokens_seen": 160465940, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8125, "step": 7481, "time_per_iteration": 2.5323078632354736 }, { "auxiliary_loss_clip": 0.01138194, "auxiliary_loss_mlp": 0.0103508, "balance_loss_clip": 1.02073956, "balance_loss_mlp": 1.04462028, "epoch": 0.44984217646174657, "flos": 33802606385280.0, "grad_norm": 1.765720914170535, "language_loss": 0.68423295, "learning_rate": 2.3141749590737763e-06, "loss": 0.70596564, "num_input_tokens_seen": 160486710, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7578125, "step": 7482, "time_per_iteration": 2.696702241897583 }, { "auxiliary_loss_clip": 0.01133418, "auxiliary_loss_mlp": 0.01043936, "balance_loss_clip": 1.0291543, "balance_loss_mlp": 1.04403055, "epoch": 0.44990229971441453, "flos": 15518365660800.0, "grad_norm": 1.9840963181549063, "language_loss": 0.84829319, "learning_rate": 2.313801821922981e-06, "loss": 0.87006676, "num_input_tokens_seen": 160503405, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8046875, "step": 7483, "time_per_iteration": 2.4932656288146973 }, { "auxiliary_loss_clip": 0.01150252, "auxiliary_loss_mlp": 0.01041853, "balance_loss_clip": 1.0271548, "balance_loss_mlp": 1.04863834, "epoch": 0.4499624229670825, "flos": 29861913191040.0, "grad_norm": 1.679941824112158, "language_loss": 0.8053236, "learning_rate": 2.3134286735734065e-06, "loss": 0.82724464, "num_input_tokens_seen": 160525080, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8359375, "step": 7484, "time_per_iteration": 2.661550283432007 }, { "auxiliary_loss_clip": 0.01161734, "auxiliary_loss_mlp": 0.01030828, "balance_loss_clip": 1.01521122, "balance_loss_mlp": 1.04229367, "epoch": 0.45002254621975046, "flos": 18513674847360.0, "grad_norm": 3.9603670183530633, "language_loss": 0.74811721, "learning_rate": 2.3130555140383678e-06, "loss": 0.77004284, "num_input_tokens_seen": 160540895, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.83984375, "step": 7485, "time_per_iteration": 2.5356671810150146 }, { "auxiliary_loss_clip": 0.01056365, "auxiliary_loss_mlp": 0.0100166, "balance_loss_clip": 1.00000322, "balance_loss_mlp": 1.01386952, "epoch": 0.4500826694724185, "flos": 70420394229120.0, "grad_norm": 0.7899870615126843, "language_loss": 0.586842, "learning_rate": 2.312682343331184e-06, "loss": 0.60742229, "num_input_tokens_seen": 160598270, "router_z_loss_clip": 0.01660156, "router_z_loss_mlp": 0.24609375, "step": 7486, "time_per_iteration": 3.20863676071167 }, { "auxiliary_loss_clip": 0.01131828, "auxiliary_loss_mlp": 0.01033533, "balance_loss_clip": 1.01908457, "balance_loss_mlp": 1.0441134, "epoch": 0.45014279272508645, "flos": 15778897983360.0, "grad_norm": 2.5814032433842846, "language_loss": 0.83395576, "learning_rate": 2.312309161465171e-06, "loss": 0.85560942, "num_input_tokens_seen": 160614720, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 7487, "time_per_iteration": 2.592419385910034 }, { "auxiliary_loss_clip": 0.01122214, "auxiliary_loss_mlp": 0.01037705, "balance_loss_clip": 1.0235548, "balance_loss_mlp": 1.04418731, "epoch": 0.4502029159777544, "flos": 21799573061760.0, "grad_norm": 1.5743845461737378, "language_loss": 0.77580994, "learning_rate": 2.311935968453648e-06, "loss": 0.79740912, "num_input_tokens_seen": 160635170, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 7488, "time_per_iteration": 2.6212210655212402 }, { "auxiliary_loss_clip": 0.01138893, "auxiliary_loss_mlp": 0.01042073, "balance_loss_clip": 1.02624202, "balance_loss_mlp": 1.04591346, "epoch": 0.4502630392304224, "flos": 28767966531840.0, "grad_norm": 2.350085269858076, "language_loss": 0.71352369, "learning_rate": 2.311562764309932e-06, "loss": 0.73533332, "num_input_tokens_seen": 160654490, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83984375, "step": 7489, "time_per_iteration": 2.678239345550537 }, { "auxiliary_loss_clip": 0.01147151, "auxiliary_loss_mlp": 0.0103731, "balance_loss_clip": 1.02120435, "balance_loss_mlp": 1.04735541, "epoch": 0.45032316248309034, "flos": 15844182952320.0, "grad_norm": 2.239041931859957, "language_loss": 0.69280368, "learning_rate": 2.311189549047343e-06, "loss": 0.71464825, "num_input_tokens_seen": 160669400, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8203125, "step": 7490, "time_per_iteration": 2.5407145023345947 }, { "auxiliary_loss_clip": 0.01056479, "auxiliary_loss_mlp": 0.01004729, "balance_loss_clip": 1.00303602, "balance_loss_mlp": 1.013587, "epoch": 0.4503832857357583, "flos": 57853600945920.0, "grad_norm": 0.7821398112546528, "language_loss": 0.56695873, "learning_rate": 2.3108163226791994e-06, "loss": 0.58757085, "num_input_tokens_seen": 160733820, "router_z_loss_clip": 0.01696777, "router_z_loss_mlp": 0.24609375, "step": 7491, "time_per_iteration": 4.520175218582153 }, { "auxiliary_loss_clip": 0.01129349, "auxiliary_loss_mlp": 0.0103323, "balance_loss_clip": 1.01944375, "balance_loss_mlp": 1.04362762, "epoch": 0.4504434089884263, "flos": 23878082488320.0, "grad_norm": 4.450734658082633, "language_loss": 0.79481947, "learning_rate": 2.3104430852188206e-06, "loss": 0.81644523, "num_input_tokens_seen": 160753175, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.76953125, "step": 7492, "time_per_iteration": 2.5846915245056152 }, { "auxiliary_loss_clip": 0.0115566, "auxiliary_loss_mlp": 0.01284842, "balance_loss_clip": 1.02336085, "balance_loss_mlp": 1.04562545, "epoch": 0.45050353224109424, "flos": 17785083375360.0, "grad_norm": 2.1507016077954724, "language_loss": 0.92521727, "learning_rate": 2.3100698366795266e-06, "loss": 0.94962233, "num_input_tokens_seen": 160768310, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.828125, "step": 7493, "time_per_iteration": 2.603685140609741 }, { "auxiliary_loss_clip": 0.01046181, "auxiliary_loss_mlp": 0.01005426, "balance_loss_clip": 1.00356662, "balance_loss_mlp": 1.01272821, "epoch": 0.4505636554937622, "flos": 65063420703360.0, "grad_norm": 0.7990192512993333, "language_loss": 0.62915248, "learning_rate": 2.309696577074638e-06, "loss": 0.64966857, "num_input_tokens_seen": 160827370, "router_z_loss_clip": 0.01855469, "router_z_loss_mlp": 0.24609375, "step": 7494, "time_per_iteration": 3.217163324356079 }, { "auxiliary_loss_clip": 0.01121434, "auxiliary_loss_mlp": 0.01034823, "balance_loss_clip": 1.02106047, "balance_loss_mlp": 1.04356968, "epoch": 0.45062377874643017, "flos": 22200084685440.0, "grad_norm": 1.47277058181065, "language_loss": 0.81827247, "learning_rate": 2.3093233064174747e-06, "loss": 0.83983505, "num_input_tokens_seen": 160849140, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.78125, "step": 7495, "time_per_iteration": 2.587250232696533 }, { "auxiliary_loss_clip": 0.01133371, "auxiliary_loss_mlp": 0.01029729, "balance_loss_clip": 1.01497126, "balance_loss_mlp": 1.04507673, "epoch": 0.45068390199909814, "flos": 37670293186560.0, "grad_norm": 1.7867358921586323, "language_loss": 0.85525113, "learning_rate": 2.308950024721359e-06, "loss": 0.87688208, "num_input_tokens_seen": 160871280, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.79296875, "step": 7496, "time_per_iteration": 2.7629034519195557 }, { "auxiliary_loss_clip": 0.01123841, "auxiliary_loss_mlp": 0.01032612, "balance_loss_clip": 1.01831329, "balance_loss_mlp": 1.04495239, "epoch": 0.4507440252517661, "flos": 22302501338880.0, "grad_norm": 1.965305887310569, "language_loss": 0.76349652, "learning_rate": 2.308576731999611e-06, "loss": 0.78506106, "num_input_tokens_seen": 160888625, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 7497, "time_per_iteration": 3.958953619003296 }, { "auxiliary_loss_clip": 0.0114865, "auxiliary_loss_mlp": 0.01034046, "balance_loss_clip": 1.02019405, "balance_loss_mlp": 1.04191661, "epoch": 0.45080414850443407, "flos": 13188374138880.0, "grad_norm": 2.169062059864065, "language_loss": 0.74886537, "learning_rate": 2.3082034282655532e-06, "loss": 0.77069235, "num_input_tokens_seen": 160907040, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.796875, "step": 7498, "time_per_iteration": 2.519648551940918 }, { "auxiliary_loss_clip": 0.01122993, "auxiliary_loss_mlp": 0.01038236, "balance_loss_clip": 1.02290535, "balance_loss_mlp": 1.04339612, "epoch": 0.4508642717571021, "flos": 21944939402880.0, "grad_norm": 2.065113053028113, "language_loss": 0.70337045, "learning_rate": 2.3078301135325076e-06, "loss": 0.72498268, "num_input_tokens_seen": 160927115, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.796875, "step": 7499, "time_per_iteration": 2.508349895477295 }, { "auxiliary_loss_clip": 0.01160687, "auxiliary_loss_mlp": 0.01036376, "balance_loss_clip": 1.02171946, "balance_loss_mlp": 1.04394746, "epoch": 0.45092439500977005, "flos": 23367468700800.0, "grad_norm": 2.1795115070258184, "language_loss": 0.77239877, "learning_rate": 2.307456787813798e-06, "loss": 0.79436946, "num_input_tokens_seen": 160944405, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 7500, "time_per_iteration": 4.154577016830444 }, { "auxiliary_loss_clip": 0.01158807, "auxiliary_loss_mlp": 0.01035648, "balance_loss_clip": 1.02131355, "balance_loss_mlp": 1.044572, "epoch": 0.450984518262438, "flos": 20772958446720.0, "grad_norm": 1.8793296331286118, "language_loss": 0.62852788, "learning_rate": 2.307083451122746e-06, "loss": 0.6504724, "num_input_tokens_seen": 160961345, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7890625, "step": 7501, "time_per_iteration": 2.721541404724121 }, { "auxiliary_loss_clip": 0.01125129, "auxiliary_loss_mlp": 0.01035864, "balance_loss_clip": 1.02056932, "balance_loss_mlp": 1.04411614, "epoch": 0.451044641515106, "flos": 17707372300800.0, "grad_norm": 2.01005537435468, "language_loss": 0.84813261, "learning_rate": 2.3067101034726755e-06, "loss": 0.86974251, "num_input_tokens_seen": 160977330, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.80859375, "step": 7502, "time_per_iteration": 2.533303737640381 }, { "auxiliary_loss_clip": 0.01138804, "auxiliary_loss_mlp": 0.01031621, "balance_loss_clip": 1.01807868, "balance_loss_mlp": 1.04518604, "epoch": 0.45110476476777395, "flos": 20594698225920.0, "grad_norm": 1.3351843646988082, "language_loss": 0.79334891, "learning_rate": 2.30633674487691e-06, "loss": 0.81505311, "num_input_tokens_seen": 160997280, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76171875, "step": 7503, "time_per_iteration": 3.994403600692749 }, { "auxiliary_loss_clip": 0.01140948, "auxiliary_loss_mlp": 0.01033957, "balance_loss_clip": 1.01997399, "balance_loss_mlp": 1.0473659, "epoch": 0.4511648880204419, "flos": 16034043265920.0, "grad_norm": 1.926026018435805, "language_loss": 0.8104043, "learning_rate": 2.3059633753487745e-06, "loss": 0.83215332, "num_input_tokens_seen": 161014235, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 7504, "time_per_iteration": 2.6170618534088135 }, { "auxiliary_loss_clip": 0.01147562, "auxiliary_loss_mlp": 0.01033135, "balance_loss_clip": 1.01977777, "balance_loss_mlp": 1.04364789, "epoch": 0.4512250112731099, "flos": 23978811202560.0, "grad_norm": 1.7365805465728734, "language_loss": 0.63281906, "learning_rate": 2.3055899949015932e-06, "loss": 0.65462601, "num_input_tokens_seen": 161032360, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 7505, "time_per_iteration": 2.562854290008545 }, { "auxiliary_loss_clip": 0.01061529, "auxiliary_loss_mlp": 0.0100142, "balance_loss_clip": 0.99969143, "balance_loss_mlp": 1.01067293, "epoch": 0.45128513452577784, "flos": 71462308037760.0, "grad_norm": 0.8312655681614499, "language_loss": 0.5884462, "learning_rate": 2.3052166035486916e-06, "loss": 0.60907567, "num_input_tokens_seen": 161091360, "router_z_loss_clip": 0.01733398, "router_z_loss_mlp": 0.24414062, "step": 7506, "time_per_iteration": 3.210737466812134 }, { "auxiliary_loss_clip": 0.01156965, "auxiliary_loss_mlp": 0.01033951, "balance_loss_clip": 1.01966405, "balance_loss_mlp": 1.04573596, "epoch": 0.4513452577784458, "flos": 22090844448000.0, "grad_norm": 1.6548772181635742, "language_loss": 0.78801477, "learning_rate": 2.304843201303394e-06, "loss": 0.80992395, "num_input_tokens_seen": 161110825, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7578125, "step": 7507, "time_per_iteration": 2.652557373046875 }, { "auxiliary_loss_clip": 0.01150355, "auxiliary_loss_mlp": 0.01036002, "balance_loss_clip": 1.02094626, "balance_loss_mlp": 1.04384136, "epoch": 0.4514053810311138, "flos": 24276403382400.0, "grad_norm": 1.5098142686436626, "language_loss": 0.73743522, "learning_rate": 2.3044697881790266e-06, "loss": 0.7592988, "num_input_tokens_seen": 161130685, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 7508, "time_per_iteration": 2.620166063308716 }, { "auxiliary_loss_clip": 0.01128021, "auxiliary_loss_mlp": 0.01035774, "balance_loss_clip": 1.02289891, "balance_loss_mlp": 1.0447042, "epoch": 0.45146550428378174, "flos": 17886781756800.0, "grad_norm": 1.8632581709953084, "language_loss": 0.79804182, "learning_rate": 2.3040963641889155e-06, "loss": 0.81967974, "num_input_tokens_seen": 161147555, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.74609375, "step": 7509, "time_per_iteration": 2.605541706085205 }, { "auxiliary_loss_clip": 0.0112, "auxiliary_loss_mlp": 0.01035132, "balance_loss_clip": 1.02147651, "balance_loss_mlp": 1.04344046, "epoch": 0.4515256275364497, "flos": 24243437675520.0, "grad_norm": 1.7201805811952982, "language_loss": 0.72872376, "learning_rate": 2.303722929346388e-06, "loss": 0.75027514, "num_input_tokens_seen": 161166255, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 7510, "time_per_iteration": 2.5821969509124756 }, { "auxiliary_loss_clip": 0.01121672, "auxiliary_loss_mlp": 0.01034316, "balance_loss_clip": 1.0197134, "balance_loss_mlp": 1.04242849, "epoch": 0.45158575078911767, "flos": 20631039811200.0, "grad_norm": 1.9290556576935298, "language_loss": 0.77179193, "learning_rate": 2.3033494836647693e-06, "loss": 0.79335183, "num_input_tokens_seen": 161184720, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.79296875, "step": 7511, "time_per_iteration": 2.5707952976226807 }, { "auxiliary_loss_clip": 0.01129727, "auxiliary_loss_mlp": 0.01036269, "balance_loss_clip": 1.02166581, "balance_loss_mlp": 1.04204917, "epoch": 0.45164587404178563, "flos": 23327751237120.0, "grad_norm": 1.8110053794585832, "language_loss": 0.7905848, "learning_rate": 2.3029760271573887e-06, "loss": 0.81224477, "num_input_tokens_seen": 161204360, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 7512, "time_per_iteration": 2.5592896938323975 }, { "auxiliary_loss_clip": 0.01151918, "auxiliary_loss_mlp": 0.01036179, "balance_loss_clip": 1.02078938, "balance_loss_mlp": 1.04271638, "epoch": 0.45170599729445365, "flos": 23805973935360.0, "grad_norm": 5.247410317509609, "language_loss": 0.86886042, "learning_rate": 2.3026025598375727e-06, "loss": 0.89074135, "num_input_tokens_seen": 161223575, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.82421875, "step": 7513, "time_per_iteration": 2.619598150253296 }, { "auxiliary_loss_clip": 0.0112414, "auxiliary_loss_mlp": 0.0103214, "balance_loss_clip": 1.01906288, "balance_loss_mlp": 1.04313827, "epoch": 0.4517661205471216, "flos": 23512942782720.0, "grad_norm": 1.7049983942906752, "language_loss": 0.67109764, "learning_rate": 2.30222908171865e-06, "loss": 0.69266045, "num_input_tokens_seen": 161243805, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 7514, "time_per_iteration": 2.5929512977600098 }, { "auxiliary_loss_clip": 0.01133215, "auxiliary_loss_mlp": 0.01033862, "balance_loss_clip": 1.01810825, "balance_loss_mlp": 1.04566479, "epoch": 0.4518262437997896, "flos": 23513948363520.0, "grad_norm": 2.178997756997427, "language_loss": 0.69470346, "learning_rate": 2.301855592813949e-06, "loss": 0.71637428, "num_input_tokens_seen": 161261450, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.78515625, "step": 7515, "time_per_iteration": 2.7168996334075928 }, { "auxiliary_loss_clip": 0.01134018, "auxiliary_loss_mlp": 0.01040103, "balance_loss_clip": 1.0244391, "balance_loss_mlp": 1.04299116, "epoch": 0.45188636705245755, "flos": 14568061489920.0, "grad_norm": 3.23865453717752, "language_loss": 0.81285805, "learning_rate": 2.3014820931367976e-06, "loss": 0.83459926, "num_input_tokens_seen": 161276965, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 7516, "time_per_iteration": 2.5804381370544434 }, { "auxiliary_loss_clip": 0.01119229, "auxiliary_loss_mlp": 0.01039071, "balance_loss_clip": 1.02468204, "balance_loss_mlp": 1.04409909, "epoch": 0.4519464903051255, "flos": 19901550499200.0, "grad_norm": 1.577186800763357, "language_loss": 0.65732062, "learning_rate": 2.301108582700526e-06, "loss": 0.67890358, "num_input_tokens_seen": 161295375, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75, "step": 7517, "time_per_iteration": 2.5906028747558594 }, { "auxiliary_loss_clip": 0.01144643, "auxiliary_loss_mlp": 0.01029387, "balance_loss_clip": 1.01668513, "balance_loss_mlp": 1.04359651, "epoch": 0.4520066135577935, "flos": 18844376388480.0, "grad_norm": 1.8432716873445003, "language_loss": 0.6285615, "learning_rate": 2.3007350615184645e-06, "loss": 0.65030181, "num_input_tokens_seen": 161313010, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.74609375, "step": 7518, "time_per_iteration": 2.5760271549224854 }, { "auxiliary_loss_clip": 0.01129432, "auxiliary_loss_mlp": 0.01037037, "balance_loss_clip": 1.02300048, "balance_loss_mlp": 1.04332662, "epoch": 0.45206673681046144, "flos": 48214419713280.0, "grad_norm": 1.3857674267924118, "language_loss": 0.59273553, "learning_rate": 2.300361529603941e-06, "loss": 0.61440021, "num_input_tokens_seen": 161336690, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 7519, "time_per_iteration": 2.7827651500701904 }, { "auxiliary_loss_clip": 0.01132394, "auxiliary_loss_mlp": 0.01042192, "balance_loss_clip": 1.02833998, "balance_loss_mlp": 1.04535985, "epoch": 0.4521268600631294, "flos": 23842171866240.0, "grad_norm": 1.5376606742099426, "language_loss": 0.7211253, "learning_rate": 2.2999879869702884e-06, "loss": 0.74287117, "num_input_tokens_seen": 161357845, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 7520, "time_per_iteration": 2.622218370437622 }, { "auxiliary_loss_clip": 0.01141306, "auxiliary_loss_mlp": 0.01041513, "balance_loss_clip": 1.02837586, "balance_loss_mlp": 1.04530668, "epoch": 0.4521869833157974, "flos": 18843622202880.0, "grad_norm": 1.9230201337356745, "language_loss": 0.75434804, "learning_rate": 2.299614433630835e-06, "loss": 0.77617621, "num_input_tokens_seen": 161375160, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.78125, "step": 7521, "time_per_iteration": 2.831281900405884 }, { "auxiliary_loss_clip": 0.01143861, "auxiliary_loss_mlp": 0.0104338, "balance_loss_clip": 1.02878845, "balance_loss_mlp": 1.04546523, "epoch": 0.45224710656846534, "flos": 19788072456960.0, "grad_norm": 2.0293164917810396, "language_loss": 0.67894924, "learning_rate": 2.2992408695989144e-06, "loss": 0.70082164, "num_input_tokens_seen": 161393690, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.80859375, "step": 7522, "time_per_iteration": 2.6166772842407227 }, { "auxiliary_loss_clip": 0.01127526, "auxiliary_loss_mlp": 0.01036824, "balance_loss_clip": 1.02332902, "balance_loss_mlp": 1.04249609, "epoch": 0.4523072298211333, "flos": 28256131681920.0, "grad_norm": 2.189618142930794, "language_loss": 0.60572612, "learning_rate": 2.2988672948878564e-06, "loss": 0.62736964, "num_input_tokens_seen": 161415015, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76171875, "step": 7523, "time_per_iteration": 2.716217517852783 }, { "auxiliary_loss_clip": 0.01134135, "auxiliary_loss_mlp": 0.01041996, "balance_loss_clip": 1.02700531, "balance_loss_mlp": 1.04377675, "epoch": 0.45236735307380127, "flos": 11181039511680.0, "grad_norm": 2.0512983722621563, "language_loss": 0.78360599, "learning_rate": 2.2984937095109926e-06, "loss": 0.80536729, "num_input_tokens_seen": 161432940, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.81640625, "step": 7524, "time_per_iteration": 2.533188819885254 }, { "auxiliary_loss_clip": 0.01155859, "auxiliary_loss_mlp": 0.01041443, "balance_loss_clip": 1.02797771, "balance_loss_mlp": 1.04351866, "epoch": 0.45242747632646924, "flos": 22601386408320.0, "grad_norm": 2.0305740029623855, "language_loss": 0.63138664, "learning_rate": 2.2981201134816573e-06, "loss": 0.65335959, "num_input_tokens_seen": 161452215, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76171875, "step": 7525, "time_per_iteration": 2.657160997390747 }, { "auxiliary_loss_clip": 0.01127164, "auxiliary_loss_mlp": 0.01039734, "balance_loss_clip": 1.02617383, "balance_loss_mlp": 1.04296613, "epoch": 0.45248759957913726, "flos": 18256267008000.0, "grad_norm": 1.7037425131474067, "language_loss": 0.79163361, "learning_rate": 2.2977465068131812e-06, "loss": 0.81330264, "num_input_tokens_seen": 161469520, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75390625, "step": 7526, "time_per_iteration": 2.5608232021331787 }, { "auxiliary_loss_clip": 0.0113689, "auxiliary_loss_mlp": 0.01032799, "balance_loss_clip": 1.01934028, "balance_loss_mlp": 1.04385698, "epoch": 0.4525477228318052, "flos": 22450094323200.0, "grad_norm": 1.7152089939776796, "language_loss": 0.80784488, "learning_rate": 2.2973728895188983e-06, "loss": 0.8295418, "num_input_tokens_seen": 161487335, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 7527, "time_per_iteration": 2.591094493865967 }, { "auxiliary_loss_clip": 0.01147343, "auxiliary_loss_mlp": 0.01029933, "balance_loss_clip": 1.01641476, "balance_loss_mlp": 1.0440619, "epoch": 0.4526078460844732, "flos": 29644869260160.0, "grad_norm": 1.4566062663002082, "language_loss": 0.65186048, "learning_rate": 2.296999261612142e-06, "loss": 0.67363322, "num_input_tokens_seen": 161510095, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.765625, "step": 7528, "time_per_iteration": 2.700601816177368 }, { "auxiliary_loss_clip": 0.01135091, "auxiliary_loss_mlp": 0.0103646, "balance_loss_clip": 1.02326322, "balance_loss_mlp": 1.04267216, "epoch": 0.45266796933714115, "flos": 23039747988480.0, "grad_norm": 1.6148751820139031, "language_loss": 0.75660115, "learning_rate": 2.2966256231062464e-06, "loss": 0.77831674, "num_input_tokens_seen": 161528725, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 7529, "time_per_iteration": 2.697877883911133 }, { "auxiliary_loss_clip": 0.01119413, "auxiliary_loss_mlp": 0.01038395, "balance_loss_clip": 1.02532983, "balance_loss_mlp": 1.04300642, "epoch": 0.4527280925898091, "flos": 14428405411200.0, "grad_norm": 1.606660525300535, "language_loss": 0.72251713, "learning_rate": 2.296251974014545e-06, "loss": 0.74409521, "num_input_tokens_seen": 161547195, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.765625, "step": 7530, "time_per_iteration": 2.516131639480591 }, { "auxiliary_loss_clip": 0.01146555, "auxiliary_loss_mlp": 0.01034404, "balance_loss_clip": 1.02172017, "balance_loss_mlp": 1.04297709, "epoch": 0.4527882158424771, "flos": 22925515760640.0, "grad_norm": 1.6362338032500374, "language_loss": 0.76109338, "learning_rate": 2.2958783143503724e-06, "loss": 0.78290296, "num_input_tokens_seen": 161565565, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.765625, "step": 7531, "time_per_iteration": 2.5717215538024902 }, { "auxiliary_loss_clip": 0.01116399, "auxiliary_loss_mlp": 0.01040475, "balance_loss_clip": 1.02661061, "balance_loss_mlp": 1.04243517, "epoch": 0.45284833909514505, "flos": 25555326105600.0, "grad_norm": 1.5941558661748463, "language_loss": 0.66628736, "learning_rate": 2.295504644127064e-06, "loss": 0.68785608, "num_input_tokens_seen": 161586630, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 7532, "time_per_iteration": 2.7830052375793457 }, { "auxiliary_loss_clip": 0.01129031, "auxiliary_loss_mlp": 0.01041411, "balance_loss_clip": 1.02805376, "balance_loss_mlp": 1.0445956, "epoch": 0.452908462347813, "flos": 18150007599360.0, "grad_norm": 2.2224313896682037, "language_loss": 0.78550982, "learning_rate": 2.295130963357955e-06, "loss": 0.8072142, "num_input_tokens_seen": 161603815, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7578125, "step": 7533, "time_per_iteration": 3.9495415687561035 }, { "auxiliary_loss_clip": 0.01149062, "auxiliary_loss_mlp": 0.01033363, "balance_loss_clip": 1.0189625, "balance_loss_mlp": 1.04011297, "epoch": 0.452968585600481, "flos": 19062749122560.0, "grad_norm": 1.8660340476797177, "language_loss": 0.83231544, "learning_rate": 2.2947572720563815e-06, "loss": 0.85413969, "num_input_tokens_seen": 161622900, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.82421875, "step": 7534, "time_per_iteration": 2.610217332839966 }, { "auxiliary_loss_clip": 0.01139563, "auxiliary_loss_mlp": 0.01035768, "balance_loss_clip": 1.02098632, "balance_loss_mlp": 1.04270756, "epoch": 0.45302870885314894, "flos": 22051737515520.0, "grad_norm": 1.4855061035425046, "language_loss": 0.76408935, "learning_rate": 2.2943835702356788e-06, "loss": 0.7858426, "num_input_tokens_seen": 161641700, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78515625, "step": 7535, "time_per_iteration": 2.549025774002075 }, { "auxiliary_loss_clip": 0.01144066, "auxiliary_loss_mlp": 0.0103719, "balance_loss_clip": 1.02427936, "balance_loss_mlp": 1.04261112, "epoch": 0.4530888321058169, "flos": 20376217751040.0, "grad_norm": 1.5687121772179982, "language_loss": 0.80349952, "learning_rate": 2.2940098579091836e-06, "loss": 0.82531214, "num_input_tokens_seen": 161661955, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75, "step": 7536, "time_per_iteration": 2.5601258277893066 }, { "auxiliary_loss_clip": 0.0113647, "auxiliary_loss_mlp": 0.01035204, "balance_loss_clip": 1.02160811, "balance_loss_mlp": 1.03991449, "epoch": 0.4531489553584849, "flos": 14830425406080.0, "grad_norm": 1.5209275466610896, "language_loss": 0.75719613, "learning_rate": 2.2936361350902334e-06, "loss": 0.7789129, "num_input_tokens_seen": 161679245, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.78125, "step": 7537, "time_per_iteration": 2.5182220935821533 }, { "auxiliary_loss_clip": 0.01130302, "auxiliary_loss_mlp": 0.01032524, "balance_loss_clip": 1.01927352, "balance_loss_mlp": 1.0440712, "epoch": 0.45320907861115284, "flos": 21944975316480.0, "grad_norm": 1.85671800957356, "language_loss": 0.75692552, "learning_rate": 2.2932624017921643e-06, "loss": 0.77855378, "num_input_tokens_seen": 161698795, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7734375, "step": 7538, "time_per_iteration": 3.954343795776367 }, { "auxiliary_loss_clip": 0.01046426, "auxiliary_loss_mlp": 0.0100503, "balance_loss_clip": 1.0032773, "balance_loss_mlp": 1.0129683, "epoch": 0.45326920186382086, "flos": 66251455038720.0, "grad_norm": 0.7844123136278155, "language_loss": 0.62374306, "learning_rate": 2.292888658028315e-06, "loss": 0.64425755, "num_input_tokens_seen": 161761980, "router_z_loss_clip": 0.01757812, "router_z_loss_mlp": 0.24414062, "step": 7539, "time_per_iteration": 3.2169814109802246 }, { "auxiliary_loss_clip": 0.01115976, "auxiliary_loss_mlp": 0.01037813, "balance_loss_clip": 1.02452135, "balance_loss_mlp": 1.04112482, "epoch": 0.4533293251164888, "flos": 14684233052160.0, "grad_norm": 1.7048748154555118, "language_loss": 0.65567416, "learning_rate": 2.2925149038120226e-06, "loss": 0.67721206, "num_input_tokens_seen": 161779455, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 7540, "time_per_iteration": 2.526031017303467 }, { "auxiliary_loss_clip": 0.01141632, "auxiliary_loss_mlp": 0.01041384, "balance_loss_clip": 1.02704287, "balance_loss_mlp": 1.04322958, "epoch": 0.4533894483691568, "flos": 22601206840320.0, "grad_norm": 2.150290648517308, "language_loss": 0.84977829, "learning_rate": 2.292141139156625e-06, "loss": 0.8716085, "num_input_tokens_seen": 161798980, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8046875, "step": 7541, "time_per_iteration": 2.578320026397705 }, { "auxiliary_loss_clip": 0.01135695, "auxiliary_loss_mlp": 0.01031791, "balance_loss_clip": 1.01878536, "balance_loss_mlp": 1.04145885, "epoch": 0.45344957162182475, "flos": 34751617666560.0, "grad_norm": 1.684259208242757, "language_loss": 0.76220924, "learning_rate": 2.2917673640754626e-06, "loss": 0.78388417, "num_input_tokens_seen": 161819745, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.765625, "step": 7542, "time_per_iteration": 4.230187177658081 }, { "auxiliary_loss_clip": 0.01135509, "auxiliary_loss_mlp": 0.01029201, "balance_loss_clip": 1.01522958, "balance_loss_mlp": 1.04069948, "epoch": 0.4535096948744927, "flos": 23550218121600.0, "grad_norm": 1.54701583784337, "language_loss": 0.80414045, "learning_rate": 2.291393578581873e-06, "loss": 0.82578754, "num_input_tokens_seen": 161838575, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 7543, "time_per_iteration": 2.5958340167999268 }, { "auxiliary_loss_clip": 0.01130399, "auxiliary_loss_mlp": 0.01282925, "balance_loss_clip": 1.02259386, "balance_loss_mlp": 1.04306507, "epoch": 0.4535698181271607, "flos": 25557552748800.0, "grad_norm": 2.0334643877365166, "language_loss": 0.76206958, "learning_rate": 2.2910197826891966e-06, "loss": 0.78620285, "num_input_tokens_seen": 161858590, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.78125, "step": 7544, "time_per_iteration": 4.053558111190796 }, { "auxiliary_loss_clip": 0.01148098, "auxiliary_loss_mlp": 0.01037594, "balance_loss_clip": 1.02465355, "balance_loss_mlp": 1.04408205, "epoch": 0.45362994137982865, "flos": 24864117713280.0, "grad_norm": 2.3568655051667085, "language_loss": 0.75175571, "learning_rate": 2.2906459764107725e-06, "loss": 0.77361256, "num_input_tokens_seen": 161878390, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7734375, "step": 7545, "time_per_iteration": 2.679241180419922 }, { "auxiliary_loss_clip": 0.01136112, "auxiliary_loss_mlp": 0.01030841, "balance_loss_clip": 1.01703107, "balance_loss_mlp": 1.04216373, "epoch": 0.4536900646324966, "flos": 30806794408320.0, "grad_norm": 1.689462616911335, "language_loss": 0.72098613, "learning_rate": 2.290272159759941e-06, "loss": 0.74265563, "num_input_tokens_seen": 161898610, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.76171875, "step": 7546, "time_per_iteration": 2.5988192558288574 }, { "auxiliary_loss_clip": 0.01136603, "auxiliary_loss_mlp": 0.01032033, "balance_loss_clip": 1.01779938, "balance_loss_mlp": 1.04104114, "epoch": 0.4537501878851646, "flos": 23404313076480.0, "grad_norm": 1.4490901957245972, "language_loss": 0.75682688, "learning_rate": 2.2898983327500428e-06, "loss": 0.77851319, "num_input_tokens_seen": 161918210, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 7547, "time_per_iteration": 2.602088212966919 }, { "auxiliary_loss_clip": 0.01129136, "auxiliary_loss_mlp": 0.01030036, "balance_loss_clip": 1.01596904, "balance_loss_mlp": 1.04214573, "epoch": 0.45381031113783254, "flos": 18149289327360.0, "grad_norm": 2.0620474258405563, "language_loss": 0.69920087, "learning_rate": 2.2895244953944186e-06, "loss": 0.72079265, "num_input_tokens_seen": 161936950, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 7548, "time_per_iteration": 2.5109596252441406 }, { "auxiliary_loss_clip": 0.01115549, "auxiliary_loss_mlp": 0.01031693, "balance_loss_clip": 1.01834154, "balance_loss_mlp": 1.03886604, "epoch": 0.4538704343905005, "flos": 25336666062720.0, "grad_norm": 2.0114212034121732, "language_loss": 0.72051048, "learning_rate": 2.2891506477064105e-06, "loss": 0.74198288, "num_input_tokens_seen": 161955550, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 7549, "time_per_iteration": 2.575685739517212 }, { "auxiliary_loss_clip": 0.01140714, "auxiliary_loss_mlp": 0.0102728, "balance_loss_clip": 1.01475656, "balance_loss_mlp": 1.03958583, "epoch": 0.4539305576431685, "flos": 28731445378560.0, "grad_norm": 1.8458936570052975, "language_loss": 0.65018415, "learning_rate": 2.2887767896993595e-06, "loss": 0.67186403, "num_input_tokens_seen": 161976760, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.74609375, "step": 7550, "time_per_iteration": 2.6259024143218994 }, { "auxiliary_loss_clip": 0.01125359, "auxiliary_loss_mlp": 0.01029512, "balance_loss_clip": 1.01706707, "balance_loss_mlp": 1.04201114, "epoch": 0.45399068089583644, "flos": 22492397566080.0, "grad_norm": 1.6252245267574847, "language_loss": 0.68689835, "learning_rate": 2.2884029213866073e-06, "loss": 0.7084471, "num_input_tokens_seen": 161996120, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7421875, "step": 7551, "time_per_iteration": 2.634634494781494 }, { "auxiliary_loss_clip": 0.01125185, "auxiliary_loss_mlp": 0.01032034, "balance_loss_clip": 1.01851547, "balance_loss_mlp": 1.03920054, "epoch": 0.45405080414850446, "flos": 12893403651840.0, "grad_norm": 1.6536201408930578, "language_loss": 0.78842127, "learning_rate": 2.2880290427814972e-06, "loss": 0.80999351, "num_input_tokens_seen": 162011125, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76953125, "step": 7552, "time_per_iteration": 2.59659481048584 }, { "auxiliary_loss_clip": 0.01049316, "auxiliary_loss_mlp": 0.01001035, "balance_loss_clip": 0.99938959, "balance_loss_mlp": 1.01606226, "epoch": 0.4541109274011724, "flos": 59766919724160.0, "grad_norm": 0.8289894825649136, "language_loss": 0.57817483, "learning_rate": 2.2876551538973712e-06, "loss": 0.59867835, "num_input_tokens_seen": 162068705, "router_z_loss_clip": 0.01647949, "router_z_loss_mlp": 0.24609375, "step": 7553, "time_per_iteration": 3.207811117172241 }, { "auxiliary_loss_clip": 0.01158522, "auxiliary_loss_mlp": 0.01034658, "balance_loss_clip": 1.02183723, "balance_loss_mlp": 1.03996277, "epoch": 0.4541710506538404, "flos": 28511743841280.0, "grad_norm": 1.3374101788338868, "language_loss": 0.76444411, "learning_rate": 2.2872812547475723e-06, "loss": 0.78637588, "num_input_tokens_seen": 162089655, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7421875, "step": 7554, "time_per_iteration": 2.757995843887329 }, { "auxiliary_loss_clip": 0.01138089, "auxiliary_loss_mlp": 0.01027878, "balance_loss_clip": 1.01356125, "balance_loss_mlp": 1.04165423, "epoch": 0.45423117390650836, "flos": 17675591742720.0, "grad_norm": 2.3002329848690164, "language_loss": 0.76691753, "learning_rate": 2.286907345345445e-06, "loss": 0.7885772, "num_input_tokens_seen": 162108465, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7890625, "step": 7555, "time_per_iteration": 2.641726016998291 }, { "auxiliary_loss_clip": 0.01168606, "auxiliary_loss_mlp": 0.01032335, "balance_loss_clip": 1.01906729, "balance_loss_mlp": 1.04006195, "epoch": 0.4542912971591763, "flos": 28072556248320.0, "grad_norm": 1.3490157687212274, "language_loss": 0.72630441, "learning_rate": 2.286533425704332e-06, "loss": 0.74831378, "num_input_tokens_seen": 162129910, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 7556, "time_per_iteration": 2.728332996368408 }, { "auxiliary_loss_clip": 0.01121374, "auxiliary_loss_mlp": 0.0103062, "balance_loss_clip": 1.01718497, "balance_loss_mlp": 1.04292679, "epoch": 0.4543514204118443, "flos": 22671771108480.0, "grad_norm": 2.208170178882537, "language_loss": 0.62993348, "learning_rate": 2.2861594958375783e-06, "loss": 0.65145338, "num_input_tokens_seen": 162148840, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78515625, "step": 7557, "time_per_iteration": 2.5316200256347656 }, { "auxiliary_loss_clip": 0.01058217, "auxiliary_loss_mlp": 0.01003842, "balance_loss_clip": 1.0021733, "balance_loss_mlp": 1.01597667, "epoch": 0.45441154366451225, "flos": 58216549921920.0, "grad_norm": 0.6818557786391295, "language_loss": 0.57688862, "learning_rate": 2.285785555758528e-06, "loss": 0.59750921, "num_input_tokens_seen": 162208500, "router_z_loss_clip": 0.01672363, "router_z_loss_mlp": 0.24414062, "step": 7558, "time_per_iteration": 3.100454807281494 }, { "auxiliary_loss_clip": 0.01127654, "auxiliary_loss_mlp": 0.01034816, "balance_loss_clip": 1.02069604, "balance_loss_mlp": 1.04177999, "epoch": 0.4544716669171802, "flos": 16764286763520.0, "grad_norm": 2.4726885509900867, "language_loss": 0.5655002, "learning_rate": 2.285411605480527e-06, "loss": 0.58712488, "num_input_tokens_seen": 162224650, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 7559, "time_per_iteration": 2.557244062423706 }, { "auxiliary_loss_clip": 0.01128732, "auxiliary_loss_mlp": 0.0128557, "balance_loss_clip": 1.02493572, "balance_loss_mlp": 1.04312682, "epoch": 0.4545317901698482, "flos": 15925233991680.0, "grad_norm": 1.819291308552412, "language_loss": 0.7156831, "learning_rate": 2.2850376450169197e-06, "loss": 0.73982608, "num_input_tokens_seen": 162242930, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 7560, "time_per_iteration": 2.563361406326294 }, { "auxiliary_loss_clip": 0.01177464, "auxiliary_loss_mlp": 0.010354, "balance_loss_clip": 1.02155983, "balance_loss_mlp": 1.04361701, "epoch": 0.45459191342251615, "flos": 17639752947840.0, "grad_norm": 2.174437892719949, "language_loss": 0.69617468, "learning_rate": 2.284663674381052e-06, "loss": 0.71830326, "num_input_tokens_seen": 162261455, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.796875, "step": 7561, "time_per_iteration": 2.581857204437256 }, { "auxiliary_loss_clip": 0.01131126, "auxiliary_loss_mlp": 0.01032415, "balance_loss_clip": 1.02053571, "balance_loss_mlp": 1.04041314, "epoch": 0.4546520366751841, "flos": 16176608346240.0, "grad_norm": 1.6186248091737885, "language_loss": 0.85111821, "learning_rate": 2.28428969358627e-06, "loss": 0.87275362, "num_input_tokens_seen": 162279725, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.73046875, "step": 7562, "time_per_iteration": 2.643512487411499 }, { "auxiliary_loss_clip": 0.01123013, "auxiliary_loss_mlp": 0.01035374, "balance_loss_clip": 1.02245808, "balance_loss_mlp": 1.04116917, "epoch": 0.4547121599278521, "flos": 19751443562880.0, "grad_norm": 2.3585257352651516, "language_loss": 0.89409041, "learning_rate": 2.28391570264592e-06, "loss": 0.91567421, "num_input_tokens_seen": 162297865, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 7563, "time_per_iteration": 2.4983439445495605 }, { "auxiliary_loss_clip": 0.01117623, "auxiliary_loss_mlp": 0.01287449, "balance_loss_clip": 1.02643406, "balance_loss_mlp": 1.0407176, "epoch": 0.45477228318052004, "flos": 19937461121280.0, "grad_norm": 2.0831771154873095, "language_loss": 0.71065724, "learning_rate": 2.283541701573349e-06, "loss": 0.73470795, "num_input_tokens_seen": 162316010, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76953125, "step": 7564, "time_per_iteration": 2.5805275440216064 }, { "auxiliary_loss_clip": 0.01127092, "auxiliary_loss_mlp": 0.01036471, "balance_loss_clip": 1.02310729, "balance_loss_mlp": 1.04076147, "epoch": 0.454832406433188, "flos": 21288312829440.0, "grad_norm": 3.612650344567142, "language_loss": 0.68047941, "learning_rate": 2.283167690381904e-06, "loss": 0.70211506, "num_input_tokens_seen": 162336115, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7734375, "step": 7565, "time_per_iteration": 2.5357956886291504 }, { "auxiliary_loss_clip": 0.01149834, "auxiliary_loss_mlp": 0.01033445, "balance_loss_clip": 1.02017689, "balance_loss_mlp": 1.03958631, "epoch": 0.45489252968585603, "flos": 24498726612480.0, "grad_norm": 1.8147166820918323, "language_loss": 0.799631, "learning_rate": 2.2827936690849326e-06, "loss": 0.82146376, "num_input_tokens_seen": 162355705, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.74609375, "step": 7566, "time_per_iteration": 2.6914072036743164 }, { "auxiliary_loss_clip": 0.01145488, "auxiliary_loss_mlp": 0.01031354, "balance_loss_clip": 1.01662016, "balance_loss_mlp": 1.04139209, "epoch": 0.454952652938524, "flos": 17092474352640.0, "grad_norm": 1.7015105656407608, "language_loss": 0.73497701, "learning_rate": 2.2824196376957833e-06, "loss": 0.75674546, "num_input_tokens_seen": 162374055, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7734375, "step": 7567, "time_per_iteration": 2.6080455780029297 }, { "auxiliary_loss_clip": 0.01147675, "auxiliary_loss_mlp": 0.01036973, "balance_loss_clip": 1.02363896, "balance_loss_mlp": 1.04340398, "epoch": 0.45501277619119196, "flos": 27630387826560.0, "grad_norm": 1.9473129888346, "language_loss": 0.80873096, "learning_rate": 2.282045596227803e-06, "loss": 0.83057743, "num_input_tokens_seen": 162393560, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7734375, "step": 7568, "time_per_iteration": 2.6839263439178467 }, { "auxiliary_loss_clip": 0.01130847, "auxiliary_loss_mlp": 0.01043448, "balance_loss_clip": 1.02820063, "balance_loss_mlp": 1.04090476, "epoch": 0.4550728994438599, "flos": 19974664632960.0, "grad_norm": 2.2613815091735696, "language_loss": 0.79394549, "learning_rate": 2.2816715446943402e-06, "loss": 0.81568843, "num_input_tokens_seen": 162413170, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 7569, "time_per_iteration": 2.514258861541748 }, { "auxiliary_loss_clip": 0.01131626, "auxiliary_loss_mlp": 0.01033561, "balance_loss_clip": 1.01880312, "balance_loss_mlp": 1.04344201, "epoch": 0.4551330226965279, "flos": 26066873646720.0, "grad_norm": 1.8769594908374112, "language_loss": 0.74964213, "learning_rate": 2.281297483108745e-06, "loss": 0.771294, "num_input_tokens_seen": 162434080, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 7570, "time_per_iteration": 2.6347169876098633 }, { "auxiliary_loss_clip": 0.01136134, "auxiliary_loss_mlp": 0.01041269, "balance_loss_clip": 1.02720785, "balance_loss_mlp": 1.04301214, "epoch": 0.45519314594919585, "flos": 32781091501440.0, "grad_norm": 1.704711246647473, "language_loss": 0.74211115, "learning_rate": 2.2809234114843664e-06, "loss": 0.76388514, "num_input_tokens_seen": 162455445, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 7571, "time_per_iteration": 2.7098639011383057 }, { "auxiliary_loss_clip": 0.01124233, "auxiliary_loss_mlp": 0.01033128, "balance_loss_clip": 1.01973474, "balance_loss_mlp": 1.0426662, "epoch": 0.4552532692018638, "flos": 19172671718400.0, "grad_norm": 1.5314534080846076, "language_loss": 0.81157136, "learning_rate": 2.2805493298345537e-06, "loss": 0.83314502, "num_input_tokens_seen": 162474940, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 7572, "time_per_iteration": 2.5464882850646973 }, { "auxiliary_loss_clip": 0.01140652, "auxiliary_loss_mlp": 0.01043848, "balance_loss_clip": 1.0287559, "balance_loss_mlp": 1.04353321, "epoch": 0.4553133924545318, "flos": 26027156183040.0, "grad_norm": 1.7475649266534952, "language_loss": 0.7287935, "learning_rate": 2.280175238172657e-06, "loss": 0.75063848, "num_input_tokens_seen": 162493340, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.79296875, "step": 7573, "time_per_iteration": 2.6556999683380127 }, { "auxiliary_loss_clip": 0.01132845, "auxiliary_loss_mlp": 0.01037666, "balance_loss_clip": 1.02222848, "balance_loss_mlp": 1.04442596, "epoch": 0.45537351570719975, "flos": 30661535808000.0, "grad_norm": 2.2583659603411967, "language_loss": 0.74458408, "learning_rate": 2.279801136512027e-06, "loss": 0.76628923, "num_input_tokens_seen": 162514360, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.796875, "step": 7574, "time_per_iteration": 2.676281213760376 }, { "auxiliary_loss_clip": 0.01131382, "auxiliary_loss_mlp": 0.0103371, "balance_loss_clip": 1.01946497, "balance_loss_mlp": 1.04400671, "epoch": 0.4554336389598677, "flos": 24353396184960.0, "grad_norm": 1.4963304846342358, "language_loss": 0.71617955, "learning_rate": 2.2794270248660136e-06, "loss": 0.73783046, "num_input_tokens_seen": 162535240, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 7575, "time_per_iteration": 4.015006065368652 }, { "auxiliary_loss_clip": 0.01137287, "auxiliary_loss_mlp": 0.01034166, "balance_loss_clip": 1.02015281, "balance_loss_mlp": 1.04313791, "epoch": 0.4554937622125357, "flos": 20557925677440.0, "grad_norm": 5.868872577956255, "language_loss": 0.72890466, "learning_rate": 2.279052903247969e-06, "loss": 0.75061917, "num_input_tokens_seen": 162553880, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 7576, "time_per_iteration": 2.5263476371765137 }, { "auxiliary_loss_clip": 0.01116549, "auxiliary_loss_mlp": 0.01039016, "balance_loss_clip": 1.02515745, "balance_loss_mlp": 1.04194736, "epoch": 0.45555388546520365, "flos": 22820764723200.0, "grad_norm": 1.8503652728668734, "language_loss": 0.66520566, "learning_rate": 2.278678771671244e-06, "loss": 0.68676126, "num_input_tokens_seen": 162574485, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.74609375, "step": 7577, "time_per_iteration": 2.6392478942871094 }, { "auxiliary_loss_clip": 0.01139801, "auxiliary_loss_mlp": 0.01042637, "balance_loss_clip": 1.02701426, "balance_loss_mlp": 1.04474413, "epoch": 0.4556140087178716, "flos": 21725992051200.0, "grad_norm": 1.8021669355638443, "language_loss": 0.73831463, "learning_rate": 2.2783046301491904e-06, "loss": 0.76013899, "num_input_tokens_seen": 162595130, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.76953125, "step": 7578, "time_per_iteration": 2.543607711791992 }, { "auxiliary_loss_clip": 0.01154533, "auxiliary_loss_mlp": 0.01278882, "balance_loss_clip": 1.01816607, "balance_loss_mlp": 1.04345667, "epoch": 0.45567413197053963, "flos": 25994513698560.0, "grad_norm": 2.07896710506754, "language_loss": 0.70354486, "learning_rate": 2.277930478695161e-06, "loss": 0.72787899, "num_input_tokens_seen": 162615720, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 7579, "time_per_iteration": 2.674190044403076 }, { "auxiliary_loss_clip": 0.01127448, "auxiliary_loss_mlp": 0.01034342, "balance_loss_clip": 1.02096081, "balance_loss_mlp": 1.04365015, "epoch": 0.4557342552232076, "flos": 21537604195200.0, "grad_norm": 1.7518639869651798, "language_loss": 0.78583944, "learning_rate": 2.2775563173225064e-06, "loss": 0.80745739, "num_input_tokens_seen": 162635825, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 7580, "time_per_iteration": 4.080231189727783 }, { "auxiliary_loss_clip": 0.01121136, "auxiliary_loss_mlp": 0.01034858, "balance_loss_clip": 1.02051735, "balance_loss_mlp": 1.04552937, "epoch": 0.45579437847587556, "flos": 40001972647680.0, "grad_norm": 1.5894092638502555, "language_loss": 0.68704593, "learning_rate": 2.277182146044582e-06, "loss": 0.70860589, "num_input_tokens_seen": 162659130, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7578125, "step": 7581, "time_per_iteration": 2.681947946548462 }, { "auxiliary_loss_clip": 0.01125343, "auxiliary_loss_mlp": 0.01031679, "balance_loss_clip": 1.01850629, "balance_loss_mlp": 1.0409714, "epoch": 0.4558545017285435, "flos": 31138501530240.0, "grad_norm": 2.6222952922440457, "language_loss": 0.73571134, "learning_rate": 2.2768079648747394e-06, "loss": 0.75728154, "num_input_tokens_seen": 162681665, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75, "step": 7582, "time_per_iteration": 2.6864020824432373 }, { "auxiliary_loss_clip": 0.0112593, "auxiliary_loss_mlp": 0.01045453, "balance_loss_clip": 1.0301342, "balance_loss_mlp": 1.04679215, "epoch": 0.4559146249812115, "flos": 21725776569600.0, "grad_norm": 3.449248091181767, "language_loss": 0.72347426, "learning_rate": 2.276433773826333e-06, "loss": 0.74518806, "num_input_tokens_seen": 162702040, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.79296875, "step": 7583, "time_per_iteration": 4.112609624862671 }, { "auxiliary_loss_clip": 0.01133851, "auxiliary_loss_mlp": 0.01038507, "balance_loss_clip": 1.02391565, "balance_loss_mlp": 1.04374361, "epoch": 0.45597474823387946, "flos": 23805973935360.0, "grad_norm": 1.5926156997294003, "language_loss": 0.72074378, "learning_rate": 2.2760595729127157e-06, "loss": 0.7424674, "num_input_tokens_seen": 162722375, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.80859375, "step": 7584, "time_per_iteration": 2.567866802215576 }, { "auxiliary_loss_clip": 0.01147588, "auxiliary_loss_mlp": 0.010363, "balance_loss_clip": 1.02251339, "balance_loss_mlp": 1.04499567, "epoch": 0.4560348714865474, "flos": 31905661230720.0, "grad_norm": 1.476660587653411, "language_loss": 0.67505389, "learning_rate": 2.2756853621472424e-06, "loss": 0.6968928, "num_input_tokens_seen": 162746095, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7578125, "step": 7585, "time_per_iteration": 2.731618642807007 }, { "auxiliary_loss_clip": 0.01118812, "auxiliary_loss_mlp": 0.01032184, "balance_loss_clip": 1.01920235, "balance_loss_mlp": 1.04307115, "epoch": 0.4560949947392154, "flos": 22048828513920.0, "grad_norm": 1.4764795249672993, "language_loss": 0.7626316, "learning_rate": 2.275311141543268e-06, "loss": 0.7841416, "num_input_tokens_seen": 162766330, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7578125, "step": 7586, "time_per_iteration": 4.021315097808838 }, { "auxiliary_loss_clip": 0.01139284, "auxiliary_loss_mlp": 0.01030513, "balance_loss_clip": 1.01761448, "balance_loss_mlp": 1.04591846, "epoch": 0.45615511799188335, "flos": 24571804832640.0, "grad_norm": 1.7230827875534815, "language_loss": 0.77828944, "learning_rate": 2.2749369111141464e-06, "loss": 0.79998738, "num_input_tokens_seen": 162784755, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7578125, "step": 7587, "time_per_iteration": 2.7195277214050293 }, { "auxiliary_loss_clip": 0.01141893, "auxiliary_loss_mlp": 0.01045054, "balance_loss_clip": 1.03070724, "balance_loss_mlp": 1.04243112, "epoch": 0.4562152412445513, "flos": 18330709944960.0, "grad_norm": 1.7094820164981046, "language_loss": 0.6920495, "learning_rate": 2.2745626708732348e-06, "loss": 0.71391892, "num_input_tokens_seen": 162803850, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.81640625, "step": 7588, "time_per_iteration": 2.5650737285614014 }, { "auxiliary_loss_clip": 0.01126954, "auxiliary_loss_mlp": 0.01030405, "balance_loss_clip": 1.01756036, "balance_loss_mlp": 1.04325306, "epoch": 0.4562753644972193, "flos": 13516525814400.0, "grad_norm": 1.752948437401508, "language_loss": 0.78924632, "learning_rate": 2.274188420833887e-06, "loss": 0.81081986, "num_input_tokens_seen": 162820775, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.74609375, "step": 7589, "time_per_iteration": 2.550389289855957 }, { "auxiliary_loss_clip": 0.01128475, "auxiliary_loss_mlp": 0.01035675, "balance_loss_clip": 1.02153683, "balance_loss_mlp": 1.0428791, "epoch": 0.45633548774988725, "flos": 29639697701760.0, "grad_norm": 2.1627452805616465, "language_loss": 0.62260652, "learning_rate": 2.27381416100946e-06, "loss": 0.64424801, "num_input_tokens_seen": 162839695, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 7590, "time_per_iteration": 2.632387638092041 }, { "auxiliary_loss_clip": 0.01136642, "auxiliary_loss_mlp": 0.01042058, "balance_loss_clip": 1.02892733, "balance_loss_mlp": 1.04241467, "epoch": 0.4563956110025552, "flos": 22233409528320.0, "grad_norm": 1.823759866447434, "language_loss": 0.72457147, "learning_rate": 2.27343989141331e-06, "loss": 0.74635839, "num_input_tokens_seen": 162856095, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.76171875, "step": 7591, "time_per_iteration": 2.6012375354766846 }, { "auxiliary_loss_clip": 0.01139859, "auxiliary_loss_mlp": 0.01044381, "balance_loss_clip": 1.02978361, "balance_loss_mlp": 1.0447278, "epoch": 0.45645573425522323, "flos": 17092043389440.0, "grad_norm": 2.0937052445677944, "language_loss": 0.7701931, "learning_rate": 2.2730656120587926e-06, "loss": 0.79203546, "num_input_tokens_seen": 162874070, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.77734375, "step": 7592, "time_per_iteration": 2.5810494422912598 }, { "auxiliary_loss_clip": 0.01136497, "auxiliary_loss_mlp": 0.01030953, "balance_loss_clip": 1.01760173, "balance_loss_mlp": 1.04208541, "epoch": 0.4565158575078912, "flos": 20332334309760.0, "grad_norm": 2.363270820020335, "language_loss": 0.69147676, "learning_rate": 2.2726913229592673e-06, "loss": 0.71315128, "num_input_tokens_seen": 162891000, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 7593, "time_per_iteration": 2.582852363586426 }, { "auxiliary_loss_clip": 0.0112288, "auxiliary_loss_mlp": 0.0103078, "balance_loss_clip": 1.01857316, "balance_loss_mlp": 1.04149735, "epoch": 0.45657598076055916, "flos": 23983013093760.0, "grad_norm": 1.8156383346678755, "language_loss": 0.84113598, "learning_rate": 2.2723170241280898e-06, "loss": 0.86267263, "num_input_tokens_seen": 162910120, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.72265625, "step": 7594, "time_per_iteration": 2.5941052436828613 }, { "auxiliary_loss_clip": 0.01049204, "auxiliary_loss_mlp": 0.01250846, "balance_loss_clip": 1.00298202, "balance_loss_mlp": 1.01547968, "epoch": 0.45663610401322713, "flos": 69364297526400.0, "grad_norm": 0.7867456180649328, "language_loss": 0.52757108, "learning_rate": 2.271942715578618e-06, "loss": 0.55057156, "num_input_tokens_seen": 162963720, "router_z_loss_clip": 0.01855469, "router_z_loss_mlp": 0.24609375, "step": 7595, "time_per_iteration": 3.305140733718872 }, { "auxiliary_loss_clip": 0.01148885, "auxiliary_loss_mlp": 0.01030799, "balance_loss_clip": 1.01657724, "balance_loss_mlp": 1.04296982, "epoch": 0.4566962272658951, "flos": 15149095891200.0, "grad_norm": 2.273908773098904, "language_loss": 0.87585062, "learning_rate": 2.27156839732421e-06, "loss": 0.8976475, "num_input_tokens_seen": 162975760, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 7596, "time_per_iteration": 2.6557981967926025 }, { "auxiliary_loss_clip": 0.01129043, "auxiliary_loss_mlp": 0.01043071, "balance_loss_clip": 1.02995229, "balance_loss_mlp": 1.04238713, "epoch": 0.45675635051856306, "flos": 18697465762560.0, "grad_norm": 1.6542756186130945, "language_loss": 0.77322841, "learning_rate": 2.2711940693782247e-06, "loss": 0.79494953, "num_input_tokens_seen": 162994865, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.77734375, "step": 7597, "time_per_iteration": 2.5292186737060547 }, { "auxiliary_loss_clip": 0.0112793, "auxiliary_loss_mlp": 0.01036927, "balance_loss_clip": 1.02226985, "balance_loss_mlp": 1.04160213, "epoch": 0.456816473771231, "flos": 19098300608640.0, "grad_norm": 1.7113704964212264, "language_loss": 0.78523982, "learning_rate": 2.270819731754021e-06, "loss": 0.8068884, "num_input_tokens_seen": 163014730, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 7598, "time_per_iteration": 2.6429567337036133 }, { "auxiliary_loss_clip": 0.01118106, "auxiliary_loss_mlp": 0.01031687, "balance_loss_clip": 1.018592, "balance_loss_mlp": 1.04182124, "epoch": 0.456876597023899, "flos": 28950069507840.0, "grad_norm": 2.1731400705398727, "language_loss": 0.7163918, "learning_rate": 2.2704453844649573e-06, "loss": 0.73788965, "num_input_tokens_seen": 163033405, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.76171875, "step": 7599, "time_per_iteration": 2.594074010848999 }, { "auxiliary_loss_clip": 0.01141743, "auxiliary_loss_mlp": 0.01037651, "balance_loss_clip": 1.02367342, "balance_loss_mlp": 1.04003382, "epoch": 0.45693672027656695, "flos": 23289470317440.0, "grad_norm": 2.0279710896161762, "language_loss": 0.70093763, "learning_rate": 2.2700710275243936e-06, "loss": 0.72273147, "num_input_tokens_seen": 163051400, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75390625, "step": 7600, "time_per_iteration": 2.598966360092163 }, { "auxiliary_loss_clip": 0.01162513, "auxiliary_loss_mlp": 0.01036387, "balance_loss_clip": 1.02331018, "balance_loss_mlp": 1.04248857, "epoch": 0.4569968435292349, "flos": 20558212986240.0, "grad_norm": 1.725325222064149, "language_loss": 0.78778732, "learning_rate": 2.2696966609456896e-06, "loss": 0.80977631, "num_input_tokens_seen": 163069250, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7578125, "step": 7601, "time_per_iteration": 2.60015869140625 }, { "auxiliary_loss_clip": 0.01137213, "auxiliary_loss_mlp": 0.01035607, "balance_loss_clip": 1.02285206, "balance_loss_mlp": 1.04168773, "epoch": 0.4570569667819029, "flos": 41282619223680.0, "grad_norm": 1.8814459147496299, "language_loss": 0.72033322, "learning_rate": 2.269322284742205e-06, "loss": 0.74206144, "num_input_tokens_seen": 163091755, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7734375, "step": 7602, "time_per_iteration": 2.73238468170166 }, { "auxiliary_loss_clip": 0.01125119, "auxiliary_loss_mlp": 0.01034973, "balance_loss_clip": 1.01972628, "balance_loss_mlp": 1.04002643, "epoch": 0.45711709003457085, "flos": 26031573555840.0, "grad_norm": 2.6336037856142775, "language_loss": 0.73010361, "learning_rate": 2.2689478989273015e-06, "loss": 0.75170457, "num_input_tokens_seen": 163111600, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.76171875, "step": 7603, "time_per_iteration": 2.633080244064331 }, { "auxiliary_loss_clip": 0.01126518, "auxiliary_loss_mlp": 0.01038114, "balance_loss_clip": 1.02277744, "balance_loss_mlp": 1.04191613, "epoch": 0.4571772132872388, "flos": 22158068751360.0, "grad_norm": 1.9609593964913263, "language_loss": 0.82734406, "learning_rate": 2.268573503514339e-06, "loss": 0.84899044, "num_input_tokens_seen": 163127350, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.76171875, "step": 7604, "time_per_iteration": 2.5765438079833984 }, { "auxiliary_loss_clip": 0.01143822, "auxiliary_loss_mlp": 0.01040277, "balance_loss_clip": 1.02549469, "balance_loss_mlp": 1.04568934, "epoch": 0.45723733653990684, "flos": 23878872587520.0, "grad_norm": 1.8898513091394356, "language_loss": 0.86260772, "learning_rate": 2.268199098516679e-06, "loss": 0.88444865, "num_input_tokens_seen": 163145855, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80078125, "step": 7605, "time_per_iteration": 2.660414934158325 }, { "auxiliary_loss_clip": 0.01154777, "auxiliary_loss_mlp": 0.01034848, "balance_loss_clip": 1.02085316, "balance_loss_mlp": 1.04106915, "epoch": 0.4572974597925748, "flos": 16871803148160.0, "grad_norm": 1.8672138432211924, "language_loss": 0.73917651, "learning_rate": 2.2678246839476837e-06, "loss": 0.7610727, "num_input_tokens_seen": 163163830, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 7606, "time_per_iteration": 2.5473225116729736 }, { "auxiliary_loss_clip": 0.01144057, "auxiliary_loss_mlp": 0.01038251, "balance_loss_clip": 1.02420187, "balance_loss_mlp": 1.04003298, "epoch": 0.45735758304524277, "flos": 13771491528960.0, "grad_norm": 1.9605477007786811, "language_loss": 0.80158305, "learning_rate": 2.2674502598207135e-06, "loss": 0.8234061, "num_input_tokens_seen": 163180700, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 7607, "time_per_iteration": 2.6452202796936035 }, { "auxiliary_loss_clip": 0.01147312, "auxiliary_loss_mlp": 0.01040114, "balance_loss_clip": 1.02609456, "balance_loss_mlp": 1.04420245, "epoch": 0.45741770629791073, "flos": 21100750986240.0, "grad_norm": 1.7849314467511466, "language_loss": 0.80922532, "learning_rate": 2.2670758261491316e-06, "loss": 0.83109957, "num_input_tokens_seen": 163199450, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 7608, "time_per_iteration": 2.5636954307556152 }, { "auxiliary_loss_clip": 0.01134887, "auxiliary_loss_mlp": 0.01038114, "balance_loss_clip": 1.02377319, "balance_loss_mlp": 1.0404737, "epoch": 0.4574778295505787, "flos": 23112898035840.0, "grad_norm": 1.6751320766302384, "language_loss": 0.68383813, "learning_rate": 2.2667013829463005e-06, "loss": 0.70556808, "num_input_tokens_seen": 163217875, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.765625, "step": 7609, "time_per_iteration": 2.628859758377075 }, { "auxiliary_loss_clip": 0.01134492, "auxiliary_loss_mlp": 0.01041859, "balance_loss_clip": 1.02794099, "balance_loss_mlp": 1.04108357, "epoch": 0.45753795280324666, "flos": 24352929308160.0, "grad_norm": 1.7171786608860617, "language_loss": 0.80728018, "learning_rate": 2.266326930225584e-06, "loss": 0.82904369, "num_input_tokens_seen": 163237430, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75390625, "step": 7610, "time_per_iteration": 2.6715524196624756 }, { "auxiliary_loss_clip": 0.01127361, "auxiliary_loss_mlp": 0.0103869, "balance_loss_clip": 1.02445078, "balance_loss_mlp": 1.04099715, "epoch": 0.4575980760559146, "flos": 16653789550080.0, "grad_norm": 4.202701897745877, "language_loss": 0.82532406, "learning_rate": 2.265952468000344e-06, "loss": 0.84698457, "num_input_tokens_seen": 163253905, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 7611, "time_per_iteration": 2.651992082595825 }, { "auxiliary_loss_clip": 0.01129381, "auxiliary_loss_mlp": 0.01031478, "balance_loss_clip": 1.01742899, "balance_loss_mlp": 1.04329467, "epoch": 0.4576581993085826, "flos": 35911423912320.0, "grad_norm": 2.04185989319684, "language_loss": 0.73738134, "learning_rate": 2.2655779962839443e-06, "loss": 0.75898993, "num_input_tokens_seen": 163274285, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 7612, "time_per_iteration": 2.7769033908843994 }, { "auxiliary_loss_clip": 0.01135011, "auxiliary_loss_mlp": 0.01034869, "balance_loss_clip": 1.02059925, "balance_loss_mlp": 1.04031467, "epoch": 0.45771832256125056, "flos": 20080421251200.0, "grad_norm": 1.862210282364134, "language_loss": 0.84908146, "learning_rate": 2.265203515089749e-06, "loss": 0.87078023, "num_input_tokens_seen": 163293150, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76953125, "step": 7613, "time_per_iteration": 2.665846586227417 }, { "auxiliary_loss_clip": 0.01145604, "auxiliary_loss_mlp": 0.01030786, "balance_loss_clip": 1.01740468, "balance_loss_mlp": 1.04200363, "epoch": 0.4577784458139185, "flos": 10744329957120.0, "grad_norm": 2.571561067914342, "language_loss": 0.75744236, "learning_rate": 2.264829024431122e-06, "loss": 0.77920616, "num_input_tokens_seen": 163310065, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 7614, "time_per_iteration": 2.6632683277130127 }, { "auxiliary_loss_clip": 0.01115159, "auxiliary_loss_mlp": 0.01034847, "balance_loss_clip": 1.02134657, "balance_loss_mlp": 1.04073834, "epoch": 0.4578385690665865, "flos": 21907269014400.0, "grad_norm": 1.689492030894405, "language_loss": 0.74750978, "learning_rate": 2.264454524321429e-06, "loss": 0.76900983, "num_input_tokens_seen": 163329415, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 7615, "time_per_iteration": 2.678157091140747 }, { "auxiliary_loss_clip": 0.01056993, "auxiliary_loss_mlp": 0.01005226, "balance_loss_clip": 1.003438, "balance_loss_mlp": 1.01523972, "epoch": 0.45789869231925445, "flos": 64758286667520.0, "grad_norm": 0.7789846338418515, "language_loss": 0.57716274, "learning_rate": 2.264080014774034e-06, "loss": 0.59778488, "num_input_tokens_seen": 163385875, "router_z_loss_clip": 0.01782227, "router_z_loss_mlp": 0.24609375, "step": 7616, "time_per_iteration": 4.505634307861328 }, { "auxiliary_loss_clip": 0.01134605, "auxiliary_loss_mlp": 0.01280707, "balance_loss_clip": 1.02000976, "balance_loss_mlp": 1.04170918, "epoch": 0.4579588155719224, "flos": 16144001775360.0, "grad_norm": 1.8330404915146754, "language_loss": 0.70954931, "learning_rate": 2.2637054958023026e-06, "loss": 0.73370242, "num_input_tokens_seen": 163405170, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 7617, "time_per_iteration": 2.554798126220703 }, { "auxiliary_loss_clip": 0.01118573, "auxiliary_loss_mlp": 0.01033223, "balance_loss_clip": 1.01932311, "balance_loss_mlp": 1.04261112, "epoch": 0.45801893882459044, "flos": 21395541905280.0, "grad_norm": 1.8979483393076597, "language_loss": 0.764714, "learning_rate": 2.2633309674196004e-06, "loss": 0.78623199, "num_input_tokens_seen": 163423155, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 7618, "time_per_iteration": 2.611504554748535 }, { "auxiliary_loss_clip": 0.01147823, "auxiliary_loss_mlp": 0.01041032, "balance_loss_clip": 1.02803779, "balance_loss_mlp": 1.04330468, "epoch": 0.4580790620772584, "flos": 19536554448000.0, "grad_norm": 1.7750087784891828, "language_loss": 0.76906341, "learning_rate": 2.2629564296392935e-06, "loss": 0.79095197, "num_input_tokens_seen": 163442450, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.77734375, "step": 7619, "time_per_iteration": 2.557082414627075 }, { "auxiliary_loss_clip": 0.01115195, "auxiliary_loss_mlp": 0.01035409, "balance_loss_clip": 1.02302933, "balance_loss_mlp": 1.04185379, "epoch": 0.45813918532992637, "flos": 16581070465920.0, "grad_norm": 1.90315320717999, "language_loss": 0.71961635, "learning_rate": 2.262581882474748e-06, "loss": 0.74112236, "num_input_tokens_seen": 163459810, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.734375, "step": 7620, "time_per_iteration": 2.5822081565856934 }, { "auxiliary_loss_clip": 0.01130882, "auxiliary_loss_mlp": 0.01033187, "balance_loss_clip": 1.0201757, "balance_loss_mlp": 1.03993726, "epoch": 0.45819930858259433, "flos": 42230301701760.0, "grad_norm": 1.822172424028049, "language_loss": 0.78080791, "learning_rate": 2.2622073259393302e-06, "loss": 0.80244863, "num_input_tokens_seen": 163482970, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 7621, "time_per_iteration": 2.791541576385498 }, { "auxiliary_loss_clip": 0.01048586, "auxiliary_loss_mlp": 0.01000229, "balance_loss_clip": 0.99823803, "balance_loss_mlp": 1.01489162, "epoch": 0.4582594318352623, "flos": 63714795638400.0, "grad_norm": 0.8051939266789045, "language_loss": 0.65025407, "learning_rate": 2.2618327600464076e-06, "loss": 0.67074227, "num_input_tokens_seen": 163545330, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.24609375, "step": 7622, "time_per_iteration": 4.483561992645264 }, { "auxiliary_loss_clip": 0.01126724, "auxiliary_loss_mlp": 0.01032446, "balance_loss_clip": 1.01804531, "balance_loss_mlp": 1.04118609, "epoch": 0.45831955508793026, "flos": 26869979882880.0, "grad_norm": 1.6672810116986043, "language_loss": 0.79510194, "learning_rate": 2.2614581848093474e-06, "loss": 0.81669366, "num_input_tokens_seen": 163564620, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.765625, "step": 7623, "time_per_iteration": 2.659167528152466 }, { "auxiliary_loss_clip": 0.01153275, "auxiliary_loss_mlp": 0.0103631, "balance_loss_clip": 1.02261281, "balance_loss_mlp": 1.03995299, "epoch": 0.45837967834059823, "flos": 18733951002240.0, "grad_norm": 1.9124759937042035, "language_loss": 0.70522606, "learning_rate": 2.261083600241517e-06, "loss": 0.72712183, "num_input_tokens_seen": 163581010, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.77734375, "step": 7624, "time_per_iteration": 2.5384421348571777 }, { "auxiliary_loss_clip": 0.01132379, "auxiliary_loss_mlp": 0.01034728, "balance_loss_clip": 1.02107263, "balance_loss_mlp": 1.03881407, "epoch": 0.4584398015932662, "flos": 21178102924800.0, "grad_norm": 1.775760846981182, "language_loss": 0.72705865, "learning_rate": 2.2607090063562846e-06, "loss": 0.74872971, "num_input_tokens_seen": 163599955, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 7625, "time_per_iteration": 4.143722295761108 }, { "auxiliary_loss_clip": 0.01125051, "auxiliary_loss_mlp": 0.01032594, "balance_loss_clip": 1.0199461, "balance_loss_mlp": 1.04077017, "epoch": 0.45849992484593416, "flos": 19790047704960.0, "grad_norm": 1.9736665425622615, "language_loss": 0.78046823, "learning_rate": 2.260334403167018e-06, "loss": 0.80204475, "num_input_tokens_seen": 163618545, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.75, "step": 7626, "time_per_iteration": 2.5273964405059814 }, { "auxiliary_loss_clip": 0.01120144, "auxiliary_loss_mlp": 0.01041051, "balance_loss_clip": 1.02795577, "balance_loss_mlp": 1.04231572, "epoch": 0.4585600480986021, "flos": 18223265387520.0, "grad_norm": 1.5786960483875998, "language_loss": 0.85143143, "learning_rate": 2.2599597906870873e-06, "loss": 0.87304336, "num_input_tokens_seen": 163636055, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.78125, "step": 7627, "time_per_iteration": 4.559036731719971 }, { "auxiliary_loss_clip": 0.01121372, "auxiliary_loss_mlp": 0.01037464, "balance_loss_clip": 1.02313542, "balance_loss_mlp": 1.04178071, "epoch": 0.4586201713512701, "flos": 29022213974400.0, "grad_norm": 1.7141632721787052, "language_loss": 0.69382048, "learning_rate": 2.25958516892986e-06, "loss": 0.7154088, "num_input_tokens_seen": 163657485, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 7628, "time_per_iteration": 2.6628472805023193 }, { "auxiliary_loss_clip": 0.01124027, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.02038229, "balance_loss_mlp": 1.04034424, "epoch": 0.45868029460393805, "flos": 23404600385280.0, "grad_norm": 1.5486864107142333, "language_loss": 0.78379059, "learning_rate": 2.2592105379087053e-06, "loss": 0.80537236, "num_input_tokens_seen": 163676030, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 7629, "time_per_iteration": 2.6204917430877686 }, { "auxiliary_loss_clip": 0.01139198, "auxiliary_loss_mlp": 0.01042275, "balance_loss_clip": 1.02760029, "balance_loss_mlp": 1.04098439, "epoch": 0.458740417856606, "flos": 18221972497920.0, "grad_norm": 2.017602513609007, "language_loss": 0.79465306, "learning_rate": 2.2588358976369933e-06, "loss": 0.81646788, "num_input_tokens_seen": 163694490, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80859375, "step": 7630, "time_per_iteration": 2.549342632293701 }, { "auxiliary_loss_clip": 0.01138993, "auxiliary_loss_mlp": 0.01034247, "balance_loss_clip": 1.0199182, "balance_loss_mlp": 1.04103851, "epoch": 0.458800541109274, "flos": 34568760504960.0, "grad_norm": 1.6655658993144205, "language_loss": 0.71994978, "learning_rate": 2.258461248128094e-06, "loss": 0.74168217, "num_input_tokens_seen": 163717035, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.80078125, "step": 7631, "time_per_iteration": 2.7498855590820312 }, { "auxiliary_loss_clip": 0.01145915, "auxiliary_loss_mlp": 0.01036306, "balance_loss_clip": 1.02140474, "balance_loss_mlp": 1.04234028, "epoch": 0.458860664361942, "flos": 17712112896000.0, "grad_norm": 2.1538570278886495, "language_loss": 0.71150452, "learning_rate": 2.2580865893953776e-06, "loss": 0.73332667, "num_input_tokens_seen": 163734525, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7734375, "step": 7632, "time_per_iteration": 2.561662197113037 }, { "auxiliary_loss_clip": 0.01156102, "auxiliary_loss_mlp": 0.01278097, "balance_loss_clip": 1.01635635, "balance_loss_mlp": 1.0399766, "epoch": 0.45892078761460997, "flos": 18441889516800.0, "grad_norm": 2.901179260374408, "language_loss": 0.69258523, "learning_rate": 2.2577119214522147e-06, "loss": 0.71692723, "num_input_tokens_seen": 163752860, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 7633, "time_per_iteration": 2.638277053833008 }, { "auxiliary_loss_clip": 0.01124777, "auxiliary_loss_mlp": 0.0103742, "balance_loss_clip": 1.02361584, "balance_loss_mlp": 1.03916132, "epoch": 0.45898091086727794, "flos": 22672956257280.0, "grad_norm": 1.8148395847248247, "language_loss": 0.80642408, "learning_rate": 2.257337244311976e-06, "loss": 0.82804608, "num_input_tokens_seen": 163772495, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 7634, "time_per_iteration": 2.599958896636963 }, { "auxiliary_loss_clip": 0.01131954, "auxiliary_loss_mlp": 0.01283945, "balance_loss_clip": 1.02267098, "balance_loss_mlp": 1.04255092, "epoch": 0.4590410341199459, "flos": 21652949744640.0, "grad_norm": 2.986647938817092, "language_loss": 0.81708258, "learning_rate": 2.2569625579880336e-06, "loss": 0.8412416, "num_input_tokens_seen": 163791475, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80078125, "step": 7635, "time_per_iteration": 2.6480133533477783 }, { "auxiliary_loss_clip": 0.01135099, "auxiliary_loss_mlp": 0.01042879, "balance_loss_clip": 1.02882409, "balance_loss_mlp": 1.04171693, "epoch": 0.45910115737261387, "flos": 36535372087680.0, "grad_norm": 1.4997359259778333, "language_loss": 0.64865196, "learning_rate": 2.256587862493758e-06, "loss": 0.67043173, "num_input_tokens_seen": 163812995, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 7636, "time_per_iteration": 2.696945905685425 }, { "auxiliary_loss_clip": 0.01126812, "auxiliary_loss_mlp": 0.01030851, "balance_loss_clip": 1.01701045, "balance_loss_mlp": 1.04314661, "epoch": 0.45916128062528183, "flos": 24419866302720.0, "grad_norm": 1.3872073028282075, "language_loss": 0.80335277, "learning_rate": 2.256213157842522e-06, "loss": 0.82492942, "num_input_tokens_seen": 163833945, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.74609375, "step": 7637, "time_per_iteration": 2.6605634689331055 }, { "auxiliary_loss_clip": 0.01141538, "auxiliary_loss_mlp": 0.01035725, "balance_loss_clip": 1.02139044, "balance_loss_mlp": 1.04396415, "epoch": 0.4592214038779498, "flos": 23221958705280.0, "grad_norm": 1.978496621594851, "language_loss": 0.75046504, "learning_rate": 2.255838444047697e-06, "loss": 0.77223766, "num_input_tokens_seen": 163853885, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.79296875, "step": 7638, "time_per_iteration": 2.7009024620056152 }, { "auxiliary_loss_clip": 0.0112399, "auxiliary_loss_mlp": 0.01033492, "balance_loss_clip": 1.01981854, "balance_loss_mlp": 1.04033422, "epoch": 0.45928152713061776, "flos": 19172133014400.0, "grad_norm": 1.7126602873958319, "language_loss": 0.74198008, "learning_rate": 2.2554637211226557e-06, "loss": 0.76355493, "num_input_tokens_seen": 163871855, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 7639, "time_per_iteration": 2.581897258758545 }, { "auxiliary_loss_clip": 0.01116882, "auxiliary_loss_mlp": 0.0103627, "balance_loss_clip": 1.02262044, "balance_loss_mlp": 1.0398035, "epoch": 0.4593416503832857, "flos": 22414686491520.0, "grad_norm": 1.672293685928696, "language_loss": 0.68068469, "learning_rate": 2.2550889890807726e-06, "loss": 0.70221615, "num_input_tokens_seen": 163891450, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76953125, "step": 7640, "time_per_iteration": 2.582598924636841 }, { "auxiliary_loss_clip": 0.0113995, "auxiliary_loss_mlp": 0.01033977, "balance_loss_clip": 1.01993442, "balance_loss_mlp": 1.0414865, "epoch": 0.4594017736359537, "flos": 18880215183360.0, "grad_norm": 1.6768648171767047, "language_loss": 0.75725961, "learning_rate": 2.2547142479354186e-06, "loss": 0.77899885, "num_input_tokens_seen": 163909345, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80078125, "step": 7641, "time_per_iteration": 2.600674629211426 }, { "auxiliary_loss_clip": 0.01139122, "auxiliary_loss_mlp": 0.01032129, "balance_loss_clip": 1.01753139, "balance_loss_mlp": 1.04216361, "epoch": 0.45946189688862166, "flos": 20518567349760.0, "grad_norm": 1.647996465243602, "language_loss": 0.74668038, "learning_rate": 2.2543394976999687e-06, "loss": 0.7683928, "num_input_tokens_seen": 163926940, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.79296875, "step": 7642, "time_per_iteration": 2.6045384407043457 }, { "auxiliary_loss_clip": 0.01049351, "auxiliary_loss_mlp": 0.01006507, "balance_loss_clip": 1.00462389, "balance_loss_mlp": 1.01558018, "epoch": 0.4595220201412896, "flos": 61405990162560.0, "grad_norm": 0.8377724531153324, "language_loss": 0.58228332, "learning_rate": 2.2539647383877964e-06, "loss": 0.60284191, "num_input_tokens_seen": 163977785, "router_z_loss_clip": 0.01879883, "router_z_loss_mlp": 0.24609375, "step": 7643, "time_per_iteration": 2.95588755607605 }, { "auxiliary_loss_clip": 0.01131546, "auxiliary_loss_mlp": 0.01033051, "balance_loss_clip": 1.01893711, "balance_loss_mlp": 1.03937554, "epoch": 0.4595821433939576, "flos": 23330947547520.0, "grad_norm": 1.4895073001438035, "language_loss": 0.92802107, "learning_rate": 2.2535899700122758e-06, "loss": 0.94966698, "num_input_tokens_seen": 163996630, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8359375, "step": 7644, "time_per_iteration": 2.630046844482422 }, { "auxiliary_loss_clip": 0.01128517, "auxiliary_loss_mlp": 0.01032937, "balance_loss_clip": 1.01885271, "balance_loss_mlp": 1.04051495, "epoch": 0.4596422666466256, "flos": 14282356711680.0, "grad_norm": 5.9040385901805585, "language_loss": 0.82551867, "learning_rate": 2.2532151925867816e-06, "loss": 0.84713322, "num_input_tokens_seen": 164013190, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 7645, "time_per_iteration": 2.6174004077911377 }, { "auxiliary_loss_clip": 0.01065916, "auxiliary_loss_mlp": 0.01003779, "balance_loss_clip": 1.00195491, "balance_loss_mlp": 1.01430392, "epoch": 0.4597023898992936, "flos": 65727337737600.0, "grad_norm": 0.7630462014700499, "language_loss": 0.5989846, "learning_rate": 2.252840406124688e-06, "loss": 0.61968148, "num_input_tokens_seen": 164074030, "router_z_loss_clip": 0.01818848, "router_z_loss_mlp": 0.24414062, "step": 7646, "time_per_iteration": 3.1385109424591064 }, { "auxiliary_loss_clip": 0.01128325, "auxiliary_loss_mlp": 0.01032928, "balance_loss_clip": 1.01862288, "balance_loss_mlp": 1.04127145, "epoch": 0.45976251315196154, "flos": 26907075653760.0, "grad_norm": 1.6309990633294262, "language_loss": 0.72726488, "learning_rate": 2.2524656106393714e-06, "loss": 0.74887747, "num_input_tokens_seen": 164095515, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78125, "step": 7647, "time_per_iteration": 2.6422693729400635 }, { "auxiliary_loss_clip": 0.01146234, "auxiliary_loss_mlp": 0.01035014, "balance_loss_clip": 1.0212636, "balance_loss_mlp": 1.04066396, "epoch": 0.4598226364046295, "flos": 26618066824320.0, "grad_norm": 1.7214016785107722, "language_loss": 0.66575474, "learning_rate": 2.252090806144206e-06, "loss": 0.68756723, "num_input_tokens_seen": 164117270, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7890625, "step": 7648, "time_per_iteration": 2.690898895263672 }, { "auxiliary_loss_clip": 0.01127768, "auxiliary_loss_mlp": 0.01036543, "balance_loss_clip": 1.02290511, "balance_loss_mlp": 1.04032028, "epoch": 0.45988275965729747, "flos": 24387762522240.0, "grad_norm": 1.438696177787374, "language_loss": 0.78332675, "learning_rate": 2.2517159926525685e-06, "loss": 0.80496991, "num_input_tokens_seen": 164137850, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78515625, "step": 7649, "time_per_iteration": 2.5684006214141846 }, { "auxiliary_loss_clip": 0.01162153, "auxiliary_loss_mlp": 0.01040286, "balance_loss_clip": 1.02570081, "balance_loss_mlp": 1.04081368, "epoch": 0.45994288290996543, "flos": 24535822383360.0, "grad_norm": 1.5736845737263319, "language_loss": 0.68961418, "learning_rate": 2.2513411701778346e-06, "loss": 0.71163857, "num_input_tokens_seen": 164157960, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.76953125, "step": 7650, "time_per_iteration": 2.705029249191284 }, { "auxiliary_loss_clip": 0.01132323, "auxiliary_loss_mlp": 0.01035968, "balance_loss_clip": 1.0197556, "balance_loss_mlp": 1.04140925, "epoch": 0.4600030061626334, "flos": 14830245838080.0, "grad_norm": 2.9859032940183616, "language_loss": 0.84535635, "learning_rate": 2.2509663387333804e-06, "loss": 0.86703932, "num_input_tokens_seen": 164174590, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.82421875, "step": 7651, "time_per_iteration": 2.532695770263672 }, { "auxiliary_loss_clip": 0.01137291, "auxiliary_loss_mlp": 0.01282074, "balance_loss_clip": 1.02134395, "balance_loss_mlp": 1.04311717, "epoch": 0.46006312941530136, "flos": 18113845582080.0, "grad_norm": 1.7010227380115377, "language_loss": 0.75140464, "learning_rate": 2.250591498332584e-06, "loss": 0.77559829, "num_input_tokens_seen": 164192935, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 7652, "time_per_iteration": 2.655351400375366 }, { "auxiliary_loss_clip": 0.0115355, "auxiliary_loss_mlp": 0.01031447, "balance_loss_clip": 1.01682019, "balance_loss_mlp": 1.0391494, "epoch": 0.46012325266796933, "flos": 21976468565760.0, "grad_norm": 2.543002560045121, "language_loss": 0.76270521, "learning_rate": 2.2502166489888207e-06, "loss": 0.7845552, "num_input_tokens_seen": 164213160, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 7653, "time_per_iteration": 2.629607677459717 }, { "auxiliary_loss_clip": 0.01162593, "auxiliary_loss_mlp": 0.01038707, "balance_loss_clip": 1.02250648, "balance_loss_mlp": 1.04340148, "epoch": 0.4601833759206373, "flos": 15268068714240.0, "grad_norm": 2.1604488097825936, "language_loss": 0.65613496, "learning_rate": 2.2498417907154695e-06, "loss": 0.67814803, "num_input_tokens_seen": 164229330, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8359375, "step": 7654, "time_per_iteration": 2.6344778537750244 }, { "auxiliary_loss_clip": 0.01135073, "auxiliary_loss_mlp": 0.01038258, "balance_loss_clip": 1.02406025, "balance_loss_mlp": 1.03886259, "epoch": 0.46024349917330526, "flos": 27088999061760.0, "grad_norm": 1.7361986522892052, "language_loss": 0.7901088, "learning_rate": 2.2494669235259077e-06, "loss": 0.81184208, "num_input_tokens_seen": 164248240, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 7655, "time_per_iteration": 2.6247401237487793 }, { "auxiliary_loss_clip": 0.01142433, "auxiliary_loss_mlp": 0.01035451, "balance_loss_clip": 1.02162313, "balance_loss_mlp": 1.03978431, "epoch": 0.4603036224259732, "flos": 24462923731200.0, "grad_norm": 1.572084399582863, "language_loss": 0.67621583, "learning_rate": 2.249092047433512e-06, "loss": 0.69799471, "num_input_tokens_seen": 164268020, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 7656, "time_per_iteration": 2.6574909687042236 }, { "auxiliary_loss_clip": 0.01136018, "auxiliary_loss_mlp": 0.01032389, "balance_loss_clip": 1.01858485, "balance_loss_mlp": 1.03977895, "epoch": 0.4603637456786412, "flos": 28109292883200.0, "grad_norm": 1.8401605417510583, "language_loss": 0.80786705, "learning_rate": 2.248717162451663e-06, "loss": 0.8295511, "num_input_tokens_seen": 164287305, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 7657, "time_per_iteration": 4.176188945770264 }, { "auxiliary_loss_clip": 0.01119809, "auxiliary_loss_mlp": 0.01279778, "balance_loss_clip": 1.01775146, "balance_loss_mlp": 1.04179335, "epoch": 0.4604238689313092, "flos": 24348942898560.0, "grad_norm": 5.976968424842992, "language_loss": 0.70408165, "learning_rate": 2.248342268593738e-06, "loss": 0.72807753, "num_input_tokens_seen": 164306835, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.78125, "step": 7658, "time_per_iteration": 2.562344789505005 }, { "auxiliary_loss_clip": 0.0104879, "auxiliary_loss_mlp": 0.01007236, "balance_loss_clip": 1.00547147, "balance_loss_mlp": 1.01497984, "epoch": 0.4604839921839772, "flos": 53606229431040.0, "grad_norm": 0.9455853759646959, "language_loss": 0.62110567, "learning_rate": 2.247967365873116e-06, "loss": 0.64166594, "num_input_tokens_seen": 164367095, "router_z_loss_clip": 0.0177002, "router_z_loss_mlp": 0.24804688, "step": 7659, "time_per_iteration": 3.2985341548919678 }, { "auxiliary_loss_clip": 0.01154626, "auxiliary_loss_mlp": 0.01035495, "balance_loss_clip": 1.0210762, "balance_loss_mlp": 1.04273772, "epoch": 0.46054411543664514, "flos": 31248424126080.0, "grad_norm": 1.5839237208728274, "language_loss": 0.6862855, "learning_rate": 2.2475924543031766e-06, "loss": 0.70818663, "num_input_tokens_seen": 164388895, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 7660, "time_per_iteration": 2.77435040473938 }, { "auxiliary_loss_clip": 0.01126303, "auxiliary_loss_mlp": 0.01039953, "balance_loss_clip": 1.02497387, "balance_loss_mlp": 1.03879189, "epoch": 0.4606042386893131, "flos": 24092863862400.0, "grad_norm": 1.9003510751433168, "language_loss": 0.77196664, "learning_rate": 2.2472175338972995e-06, "loss": 0.79362923, "num_input_tokens_seen": 164409080, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.78515625, "step": 7661, "time_per_iteration": 2.6342175006866455 }, { "auxiliary_loss_clip": 0.01127037, "auxiliary_loss_mlp": 0.01038245, "balance_loss_clip": 1.02426744, "balance_loss_mlp": 1.04301441, "epoch": 0.46066436194198107, "flos": 26578457101440.0, "grad_norm": 1.8622383643672946, "language_loss": 0.74783123, "learning_rate": 2.246842604668865e-06, "loss": 0.76948404, "num_input_tokens_seen": 164427585, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 7662, "time_per_iteration": 2.6516835689544678 }, { "auxiliary_loss_clip": 0.01144794, "auxiliary_loss_mlp": 0.01038625, "balance_loss_clip": 1.02243602, "balance_loss_mlp": 1.04222703, "epoch": 0.46072448519464904, "flos": 17775602184960.0, "grad_norm": 4.250014748383536, "language_loss": 0.79282773, "learning_rate": 2.246467666631252e-06, "loss": 0.81466186, "num_input_tokens_seen": 164438455, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84375, "step": 7663, "time_per_iteration": 3.9989538192749023 }, { "auxiliary_loss_clip": 0.01126556, "auxiliary_loss_mlp": 0.01038141, "balance_loss_clip": 1.02393746, "balance_loss_mlp": 1.04142714, "epoch": 0.460784608447317, "flos": 15086109392640.0, "grad_norm": 1.7337126734797133, "language_loss": 0.73118269, "learning_rate": 2.2460927197978423e-06, "loss": 0.75282973, "num_input_tokens_seen": 164456830, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 7664, "time_per_iteration": 2.5873570442199707 }, { "auxiliary_loss_clip": 0.01146258, "auxiliary_loss_mlp": 0.01035205, "balance_loss_clip": 1.02109063, "balance_loss_mlp": 1.0417943, "epoch": 0.46084473169998497, "flos": 18588261438720.0, "grad_norm": 1.5730500274106498, "language_loss": 0.72369373, "learning_rate": 2.2457177641820164e-06, "loss": 0.74550831, "num_input_tokens_seen": 164475375, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78125, "step": 7665, "time_per_iteration": 2.6497673988342285 }, { "auxiliary_loss_clip": 0.0114492, "auxiliary_loss_mlp": 0.01033313, "balance_loss_clip": 1.01930594, "balance_loss_mlp": 1.03987122, "epoch": 0.46090485495265293, "flos": 19494789909120.0, "grad_norm": 1.8912722687818249, "language_loss": 0.78127944, "learning_rate": 2.2453427997971553e-06, "loss": 0.80306178, "num_input_tokens_seen": 164492040, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 7666, "time_per_iteration": 4.092683553695679 }, { "auxiliary_loss_clip": 0.01144062, "auxiliary_loss_mlp": 0.01034418, "balance_loss_clip": 1.01858139, "balance_loss_mlp": 1.04081035, "epoch": 0.4609649782053209, "flos": 33364927163520.0, "grad_norm": 1.4164601432827482, "language_loss": 0.73735166, "learning_rate": 2.2449678266566416e-06, "loss": 0.75913644, "num_input_tokens_seen": 164513665, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.765625, "step": 7667, "time_per_iteration": 2.70119047164917 }, { "auxiliary_loss_clip": 0.01152872, "auxiliary_loss_mlp": 0.01035513, "balance_loss_clip": 1.02153599, "balance_loss_mlp": 1.03999805, "epoch": 0.46102510145798886, "flos": 23769165473280.0, "grad_norm": 1.542274067614383, "language_loss": 0.76285672, "learning_rate": 2.2445928447738556e-06, "loss": 0.78474057, "num_input_tokens_seen": 164533890, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 7668, "time_per_iteration": 2.561105728149414 }, { "auxiliary_loss_clip": 0.01151202, "auxiliary_loss_mlp": 0.01033929, "balance_loss_clip": 1.02055359, "balance_loss_mlp": 1.04163837, "epoch": 0.4610852247106568, "flos": 23294821443840.0, "grad_norm": 1.579802984525296, "language_loss": 0.78102273, "learning_rate": 2.2442178541621804e-06, "loss": 0.80287409, "num_input_tokens_seen": 164553815, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 7669, "time_per_iteration": 4.1266889572143555 }, { "auxiliary_loss_clip": 0.0112786, "auxiliary_loss_mlp": 0.01033006, "balance_loss_clip": 1.0187012, "balance_loss_mlp": 1.04027033, "epoch": 0.4611453479633248, "flos": 25447450584960.0, "grad_norm": 1.709033262972218, "language_loss": 0.82249486, "learning_rate": 2.2438428548349977e-06, "loss": 0.84410357, "num_input_tokens_seen": 164573125, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 7670, "time_per_iteration": 2.700531244277954 }, { "auxiliary_loss_clip": 0.01124897, "auxiliary_loss_mlp": 0.01036988, "balance_loss_clip": 1.0221287, "balance_loss_mlp": 1.04016268, "epoch": 0.4612054712159928, "flos": 21139606523520.0, "grad_norm": 5.357307631846866, "language_loss": 0.63354385, "learning_rate": 2.2434678468056916e-06, "loss": 0.65516269, "num_input_tokens_seen": 164592575, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7578125, "step": 7671, "time_per_iteration": 2.5381836891174316 }, { "auxiliary_loss_clip": 0.01058608, "auxiliary_loss_mlp": 0.01004063, "balance_loss_clip": 1.00226271, "balance_loss_mlp": 1.01540947, "epoch": 0.4612655944686608, "flos": 69959266404480.0, "grad_norm": 0.688991050828448, "language_loss": 0.55884886, "learning_rate": 2.2430928300876436e-06, "loss": 0.57947558, "num_input_tokens_seen": 164659795, "router_z_loss_clip": 0.01794434, "router_z_loss_mlp": 0.25, "step": 7672, "time_per_iteration": 3.283938407897949 }, { "auxiliary_loss_clip": 0.01129928, "auxiliary_loss_mlp": 0.01037309, "balance_loss_clip": 1.02320087, "balance_loss_mlp": 1.04222608, "epoch": 0.46132571772132874, "flos": 16837149502080.0, "grad_norm": 7.251085018949723, "language_loss": 0.70883775, "learning_rate": 2.2427178046942387e-06, "loss": 0.73051012, "num_input_tokens_seen": 164678735, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 7673, "time_per_iteration": 2.5865933895111084 }, { "auxiliary_loss_clip": 0.01135306, "auxiliary_loss_mlp": 0.01032186, "balance_loss_clip": 1.01823223, "balance_loss_mlp": 1.04220867, "epoch": 0.4613858409739967, "flos": 35808935431680.0, "grad_norm": 1.6947116852401278, "language_loss": 0.7045325, "learning_rate": 2.242342770638859e-06, "loss": 0.72620738, "num_input_tokens_seen": 164700885, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 7674, "time_per_iteration": 2.718822479248047 }, { "auxiliary_loss_clip": 0.01132968, "auxiliary_loss_mlp": 0.01037684, "balance_loss_clip": 1.02339113, "balance_loss_mlp": 1.03754282, "epoch": 0.4614459642266647, "flos": 35266756567680.0, "grad_norm": 1.6414857294413772, "language_loss": 0.6582191, "learning_rate": 2.241967727934889e-06, "loss": 0.67992568, "num_input_tokens_seen": 164726960, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 7675, "time_per_iteration": 2.722844362258911 }, { "auxiliary_loss_clip": 0.01040741, "auxiliary_loss_mlp": 0.01003427, "balance_loss_clip": 1.00163925, "balance_loss_mlp": 1.0158298, "epoch": 0.46150608747933264, "flos": 66704610044160.0, "grad_norm": 0.8142181096340396, "language_loss": 0.58616984, "learning_rate": 2.241592676595714e-06, "loss": 0.60661155, "num_input_tokens_seen": 164788525, "router_z_loss_clip": 0.01782227, "router_z_loss_mlp": 0.24902344, "step": 7676, "time_per_iteration": 3.1547067165374756 }, { "auxiliary_loss_clip": 0.01067601, "auxiliary_loss_mlp": 0.01005163, "balance_loss_clip": 1.00344646, "balance_loss_mlp": 1.01608777, "epoch": 0.4615662107320006, "flos": 55830177025920.0, "grad_norm": 0.7890551219929557, "language_loss": 0.62692219, "learning_rate": 2.241217616634717e-06, "loss": 0.64764982, "num_input_tokens_seen": 164843525, "router_z_loss_clip": 0.01721191, "router_z_loss_mlp": 0.24804688, "step": 7677, "time_per_iteration": 3.068347930908203 }, { "auxiliary_loss_clip": 0.01134631, "auxiliary_loss_mlp": 0.0103412, "balance_loss_clip": 1.02031612, "balance_loss_mlp": 1.04019403, "epoch": 0.46162633398466857, "flos": 15483245137920.0, "grad_norm": 3.5104702670666352, "language_loss": 0.76159573, "learning_rate": 2.2408425480652838e-06, "loss": 0.78328317, "num_input_tokens_seen": 164859895, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.765625, "step": 7678, "time_per_iteration": 2.4954514503479004 }, { "auxiliary_loss_clip": 0.01120221, "auxiliary_loss_mlp": 0.01037998, "balance_loss_clip": 1.02354407, "balance_loss_mlp": 1.04199934, "epoch": 0.46168645723733653, "flos": 20011437181440.0, "grad_norm": 3.568733852283137, "language_loss": 0.66750121, "learning_rate": 2.2404674709008004e-06, "loss": 0.6890834, "num_input_tokens_seen": 164878030, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 7679, "time_per_iteration": 2.6123368740081787 }, { "auxiliary_loss_clip": 0.01142151, "auxiliary_loss_mlp": 0.01036729, "balance_loss_clip": 1.02324045, "balance_loss_mlp": 1.03924656, "epoch": 0.4617465804900045, "flos": 20298542590080.0, "grad_norm": 5.931838998596966, "language_loss": 0.69745624, "learning_rate": 2.2400923851546506e-06, "loss": 0.71924502, "num_input_tokens_seen": 164895710, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.765625, "step": 7680, "time_per_iteration": 2.6195642948150635 }, { "auxiliary_loss_clip": 0.01125729, "auxiliary_loss_mlp": 0.01047666, "balance_loss_clip": 1.0333786, "balance_loss_mlp": 1.04530048, "epoch": 0.46180670374267246, "flos": 22346312952960.0, "grad_norm": 1.6673087705402871, "language_loss": 0.63563681, "learning_rate": 2.2397172908402217e-06, "loss": 0.65737081, "num_input_tokens_seen": 164913365, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 7681, "time_per_iteration": 2.631847858428955 }, { "auxiliary_loss_clip": 0.01126052, "auxiliary_loss_mlp": 0.01031864, "balance_loss_clip": 1.01878667, "balance_loss_mlp": 1.04164827, "epoch": 0.46186682699534043, "flos": 19895696582400.0, "grad_norm": 1.5949320801990372, "language_loss": 0.66927302, "learning_rate": 2.2393421879708994e-06, "loss": 0.69085211, "num_input_tokens_seen": 164931620, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7578125, "step": 7682, "time_per_iteration": 2.4987354278564453 }, { "auxiliary_loss_clip": 0.01146252, "auxiliary_loss_mlp": 0.0103781, "balance_loss_clip": 1.02356493, "balance_loss_mlp": 1.04102731, "epoch": 0.4619269502480084, "flos": 31503569408640.0, "grad_norm": 2.1786469679030507, "language_loss": 0.73866284, "learning_rate": 2.2389670765600693e-06, "loss": 0.76050347, "num_input_tokens_seen": 164950905, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 7683, "time_per_iteration": 2.6777536869049072 }, { "auxiliary_loss_clip": 0.01117641, "auxiliary_loss_mlp": 0.01036811, "balance_loss_clip": 1.0230186, "balance_loss_mlp": 1.04177952, "epoch": 0.46198707350067636, "flos": 25009484054400.0, "grad_norm": 1.9424616046245147, "language_loss": 0.76321912, "learning_rate": 2.2385919566211196e-06, "loss": 0.78476363, "num_input_tokens_seen": 164970950, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7578125, "step": 7684, "time_per_iteration": 2.5072569847106934 }, { "auxiliary_loss_clip": 0.01125638, "auxiliary_loss_mlp": 0.01042268, "balance_loss_clip": 1.0274142, "balance_loss_mlp": 1.04311454, "epoch": 0.4620471967533444, "flos": 18292357198080.0, "grad_norm": 1.9507305398801864, "language_loss": 0.79504555, "learning_rate": 2.2382168281674365e-06, "loss": 0.81672466, "num_input_tokens_seen": 164989855, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.82421875, "step": 7685, "time_per_iteration": 2.552396297454834 }, { "auxiliary_loss_clip": 0.01145497, "auxiliary_loss_mlp": 0.01044178, "balance_loss_clip": 1.0288713, "balance_loss_mlp": 1.04393184, "epoch": 0.46210732000601235, "flos": 33985104410880.0, "grad_norm": 1.660145985972917, "language_loss": 0.66775751, "learning_rate": 2.2378416912124076e-06, "loss": 0.68965429, "num_input_tokens_seen": 165012290, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.74609375, "step": 7686, "time_per_iteration": 2.6611063480377197 }, { "auxiliary_loss_clip": 0.01139236, "auxiliary_loss_mlp": 0.01282454, "balance_loss_clip": 1.02081144, "balance_loss_mlp": 1.04149199, "epoch": 0.4621674432586803, "flos": 25009412227200.0, "grad_norm": 1.9840872553925792, "language_loss": 0.73759627, "learning_rate": 2.23746654576942e-06, "loss": 0.76181316, "num_input_tokens_seen": 165030810, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80078125, "step": 7687, "time_per_iteration": 2.613774061203003 }, { "auxiliary_loss_clip": 0.01164386, "auxiliary_loss_mlp": 0.01032384, "balance_loss_clip": 1.01840687, "balance_loss_mlp": 1.04310083, "epoch": 0.4622275665113483, "flos": 22014031213440.0, "grad_norm": 1.8515004642182968, "language_loss": 0.74349779, "learning_rate": 2.2370913918518635e-06, "loss": 0.7654655, "num_input_tokens_seen": 165050205, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.76171875, "step": 7688, "time_per_iteration": 2.6054699420928955 }, { "auxiliary_loss_clip": 0.01153076, "auxiliary_loss_mlp": 0.01282469, "balance_loss_clip": 1.02108455, "balance_loss_mlp": 1.04074144, "epoch": 0.46228768976401624, "flos": 24058820747520.0, "grad_norm": 2.144313042019991, "language_loss": 0.78244245, "learning_rate": 2.2367162294731247e-06, "loss": 0.80679786, "num_input_tokens_seen": 165069370, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.76953125, "step": 7689, "time_per_iteration": 2.6486284732818604 }, { "auxiliary_loss_clip": 0.01131673, "auxiliary_loss_mlp": 0.01036484, "balance_loss_clip": 1.02161872, "balance_loss_mlp": 1.04248643, "epoch": 0.4623478130166842, "flos": 26651391667200.0, "grad_norm": 3.9326373970957733, "language_loss": 0.57104993, "learning_rate": 2.236341058646592e-06, "loss": 0.59273148, "num_input_tokens_seen": 165089610, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80078125, "step": 7690, "time_per_iteration": 2.5917391777038574 }, { "auxiliary_loss_clip": 0.01128355, "auxiliary_loss_mlp": 0.01034529, "balance_loss_clip": 1.01973534, "balance_loss_mlp": 1.04118061, "epoch": 0.46240793626935217, "flos": 20558428467840.0, "grad_norm": 2.5155357304432813, "language_loss": 0.83052111, "learning_rate": 2.2359658793856556e-06, "loss": 0.85214996, "num_input_tokens_seen": 165109050, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78125, "step": 7691, "time_per_iteration": 2.570652961730957 }, { "auxiliary_loss_clip": 0.01136865, "auxiliary_loss_mlp": 0.01028669, "balance_loss_clip": 1.01531172, "balance_loss_mlp": 1.04269624, "epoch": 0.46246805952202014, "flos": 22456055980800.0, "grad_norm": 1.42598988961146, "language_loss": 0.75081336, "learning_rate": 2.2355906917037027e-06, "loss": 0.77246869, "num_input_tokens_seen": 165130130, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 7692, "time_per_iteration": 2.587540626525879 }, { "auxiliary_loss_clip": 0.01147763, "auxiliary_loss_mlp": 0.01037546, "balance_loss_clip": 1.02234674, "balance_loss_mlp": 1.04232478, "epoch": 0.4625281827746881, "flos": 35041308854400.0, "grad_norm": 1.5800802394769158, "language_loss": 0.7405107, "learning_rate": 2.2352154956141253e-06, "loss": 0.76236379, "num_input_tokens_seen": 165152685, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78515625, "step": 7693, "time_per_iteration": 2.7000768184661865 }, { "auxiliary_loss_clip": 0.01146479, "auxiliary_loss_mlp": 0.01043124, "balance_loss_clip": 1.02841985, "balance_loss_mlp": 1.04317486, "epoch": 0.46258830602735607, "flos": 21068647205760.0, "grad_norm": 1.6848287474311727, "language_loss": 0.85532701, "learning_rate": 2.2348402911303113e-06, "loss": 0.87722301, "num_input_tokens_seen": 165173315, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.76171875, "step": 7694, "time_per_iteration": 2.61010479927063 }, { "auxiliary_loss_clip": 0.0113858, "auxiliary_loss_mlp": 0.01041018, "balance_loss_clip": 1.02740467, "balance_loss_mlp": 1.0431602, "epoch": 0.46264842928002403, "flos": 26177227205760.0, "grad_norm": 2.1428529085514523, "language_loss": 0.78858536, "learning_rate": 2.2344650782656512e-06, "loss": 0.81038129, "num_input_tokens_seen": 165192395, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.78125, "step": 7695, "time_per_iteration": 2.6050667762756348 }, { "auxiliary_loss_clip": 0.01125151, "auxiliary_loss_mlp": 0.01037125, "balance_loss_clip": 1.02354705, "balance_loss_mlp": 1.04163265, "epoch": 0.462708552532692, "flos": 16764214936320.0, "grad_norm": 1.8554348088851453, "language_loss": 0.72251928, "learning_rate": 2.234089857033536e-06, "loss": 0.74414206, "num_input_tokens_seen": 165211355, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75, "step": 7696, "time_per_iteration": 2.5417275428771973 }, { "auxiliary_loss_clip": 0.01137055, "auxiliary_loss_mlp": 0.01039608, "balance_loss_clip": 1.02504027, "balance_loss_mlp": 1.0428375, "epoch": 0.46276867578535996, "flos": 15560453422080.0, "grad_norm": 1.5102486448763464, "language_loss": 0.69167006, "learning_rate": 2.233714627447356e-06, "loss": 0.71343672, "num_input_tokens_seen": 165229380, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.765625, "step": 7697, "time_per_iteration": 2.539578914642334 }, { "auxiliary_loss_clip": 0.01119483, "auxiliary_loss_mlp": 0.01030767, "balance_loss_clip": 1.01739132, "balance_loss_mlp": 1.04387581, "epoch": 0.462828799038028, "flos": 22415404763520.0, "grad_norm": 2.144462428623169, "language_loss": 0.84808707, "learning_rate": 2.233339389520502e-06, "loss": 0.86958957, "num_input_tokens_seen": 165247200, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7578125, "step": 7698, "time_per_iteration": 2.609926462173462 }, { "auxiliary_loss_clip": 0.01116748, "auxiliary_loss_mlp": 0.01033563, "balance_loss_clip": 1.01994967, "balance_loss_mlp": 1.04090154, "epoch": 0.46288892229069595, "flos": 21069580959360.0, "grad_norm": 1.704409959211736, "language_loss": 0.71042943, "learning_rate": 2.2329641432663653e-06, "loss": 0.73193252, "num_input_tokens_seen": 165265825, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7578125, "step": 7699, "time_per_iteration": 3.926398754119873 }, { "auxiliary_loss_clip": 0.01165229, "auxiliary_loss_mlp": 0.01034935, "balance_loss_clip": 1.02022457, "balance_loss_mlp": 1.04070008, "epoch": 0.4629490455433639, "flos": 23185688947200.0, "grad_norm": 2.1821662373554576, "language_loss": 0.71947312, "learning_rate": 2.232588888698337e-06, "loss": 0.74147475, "num_input_tokens_seen": 165284380, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.796875, "step": 7700, "time_per_iteration": 2.6388697624206543 }, { "auxiliary_loss_clip": 0.0111714, "auxiliary_loss_mlp": 0.01281427, "balance_loss_clip": 1.0211432, "balance_loss_mlp": 1.04143357, "epoch": 0.4630091687960319, "flos": 18835541642880.0, "grad_norm": 2.3326231610244195, "language_loss": 0.72745359, "learning_rate": 2.232213625829811e-06, "loss": 0.75143933, "num_input_tokens_seen": 165300320, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 7701, "time_per_iteration": 2.5418548583984375 }, { "auxiliary_loss_clip": 0.01151601, "auxiliary_loss_mlp": 0.0103412, "balance_loss_clip": 1.01992178, "balance_loss_mlp": 1.04456329, "epoch": 0.46306929204869984, "flos": 38907020407680.0, "grad_norm": 2.980701034314773, "language_loss": 0.64995301, "learning_rate": 2.2318383546741768e-06, "loss": 0.67181021, "num_input_tokens_seen": 165318130, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.796875, "step": 7702, "time_per_iteration": 2.7146637439727783 }, { "auxiliary_loss_clip": 0.01133729, "auxiliary_loss_mlp": 0.01032066, "balance_loss_clip": 1.01854205, "balance_loss_mlp": 1.04129219, "epoch": 0.4631294153013678, "flos": 19644178573440.0, "grad_norm": 2.0294908538822938, "language_loss": 0.72977763, "learning_rate": 2.231463075244829e-06, "loss": 0.75143564, "num_input_tokens_seen": 165336225, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 7703, "time_per_iteration": 2.637349843978882 }, { "auxiliary_loss_clip": 0.01141198, "auxiliary_loss_mlp": 0.01037045, "balance_loss_clip": 1.02234042, "balance_loss_mlp": 1.04321766, "epoch": 0.4631895385540358, "flos": 24608254158720.0, "grad_norm": 1.6967234459681262, "language_loss": 0.68436551, "learning_rate": 2.231087787555159e-06, "loss": 0.70614797, "num_input_tokens_seen": 165355005, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80078125, "step": 7704, "time_per_iteration": 2.6101980209350586 }, { "auxiliary_loss_clip": 0.01130989, "auxiliary_loss_mlp": 0.01031414, "balance_loss_clip": 1.01683521, "balance_loss_mlp": 1.04282141, "epoch": 0.46324966180670374, "flos": 26320115508480.0, "grad_norm": 2.1021616721827128, "language_loss": 0.80905277, "learning_rate": 2.23071249161856e-06, "loss": 0.83067679, "num_input_tokens_seen": 165374910, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.79296875, "step": 7705, "time_per_iteration": 4.044812440872192 }, { "auxiliary_loss_clip": 0.01129785, "auxiliary_loss_mlp": 0.01037285, "balance_loss_clip": 1.02341461, "balance_loss_mlp": 1.04177523, "epoch": 0.4633097850593717, "flos": 19240506552960.0, "grad_norm": 2.1196243072506085, "language_loss": 0.77160174, "learning_rate": 2.230337187448426e-06, "loss": 0.7932725, "num_input_tokens_seen": 165392590, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7890625, "step": 7706, "time_per_iteration": 2.547593832015991 }, { "auxiliary_loss_clip": 0.01131477, "auxiliary_loss_mlp": 0.01033744, "balance_loss_clip": 1.02033329, "balance_loss_mlp": 1.04057074, "epoch": 0.46336990831203967, "flos": 22783166161920.0, "grad_norm": 1.7283322798776708, "language_loss": 0.69620264, "learning_rate": 2.2299618750581498e-06, "loss": 0.71785486, "num_input_tokens_seen": 165411195, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 7707, "time_per_iteration": 2.621720552444458 }, { "auxiliary_loss_clip": 0.01138852, "auxiliary_loss_mlp": 0.01036721, "balance_loss_clip": 1.02125323, "balance_loss_mlp": 1.04080987, "epoch": 0.46343003156470763, "flos": 38210604543360.0, "grad_norm": 2.0921375460312306, "language_loss": 0.61255497, "learning_rate": 2.2295865544611264e-06, "loss": 0.63431072, "num_input_tokens_seen": 165430150, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8046875, "step": 7708, "time_per_iteration": 4.198235750198364 }, { "auxiliary_loss_clip": 0.01146864, "auxiliary_loss_mlp": 0.01035876, "balance_loss_clip": 1.02115345, "balance_loss_mlp": 1.0424459, "epoch": 0.4634901548173756, "flos": 31938555110400.0, "grad_norm": 1.9942839924946492, "language_loss": 0.76835597, "learning_rate": 2.229211225670749e-06, "loss": 0.79018331, "num_input_tokens_seen": 165450595, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.77734375, "step": 7709, "time_per_iteration": 2.663133144378662 }, { "auxiliary_loss_clip": 0.01124046, "auxiliary_loss_mlp": 0.01039093, "balance_loss_clip": 1.02434039, "balance_loss_mlp": 1.04455996, "epoch": 0.46355027807004356, "flos": 20082540153600.0, "grad_norm": 1.6578495460002312, "language_loss": 0.77344096, "learning_rate": 2.2288358887004127e-06, "loss": 0.79507232, "num_input_tokens_seen": 165469515, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.796875, "step": 7710, "time_per_iteration": 2.612027168273926 }, { "auxiliary_loss_clip": 0.01132913, "auxiliary_loss_mlp": 0.01035532, "balance_loss_clip": 1.02078533, "balance_loss_mlp": 1.04340518, "epoch": 0.4636104013227116, "flos": 24061370613120.0, "grad_norm": 1.8971464537884453, "language_loss": 0.73580402, "learning_rate": 2.2284605435635124e-06, "loss": 0.75748849, "num_input_tokens_seen": 165488125, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8046875, "step": 7711, "time_per_iteration": 4.060222625732422 }, { "auxiliary_loss_clip": 0.01138294, "auxiliary_loss_mlp": 0.01042566, "balance_loss_clip": 1.02764678, "balance_loss_mlp": 1.04359007, "epoch": 0.46367052457537955, "flos": 23914639555200.0, "grad_norm": 1.314687922644381, "language_loss": 0.71527743, "learning_rate": 2.2280851902734427e-06, "loss": 0.73708606, "num_input_tokens_seen": 165509225, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.76953125, "step": 7712, "time_per_iteration": 2.5663888454437256 }, { "auxiliary_loss_clip": 0.01143501, "auxiliary_loss_mlp": 0.01039717, "balance_loss_clip": 1.02374279, "balance_loss_mlp": 1.0432266, "epoch": 0.4637306478280475, "flos": 26396533693440.0, "grad_norm": 3.4999692595979086, "language_loss": 0.72991145, "learning_rate": 2.2277098288435994e-06, "loss": 0.75174367, "num_input_tokens_seen": 165529945, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8203125, "step": 7713, "time_per_iteration": 2.642362356185913 }, { "auxiliary_loss_clip": 0.01151219, "auxiliary_loss_mlp": 0.01035349, "balance_loss_clip": 1.0202446, "balance_loss_mlp": 1.04352653, "epoch": 0.4637907710807155, "flos": 21980706370560.0, "grad_norm": 1.7391193896964618, "language_loss": 0.58362484, "learning_rate": 2.2273344592873775e-06, "loss": 0.60549045, "num_input_tokens_seen": 165550690, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.80859375, "step": 7714, "time_per_iteration": 2.5619988441467285 }, { "auxiliary_loss_clip": 0.01127373, "auxiliary_loss_mlp": 0.01033436, "balance_loss_clip": 1.01963162, "balance_loss_mlp": 1.04229581, "epoch": 0.46385089433338345, "flos": 12422291846400.0, "grad_norm": 2.0373683188472183, "language_loss": 0.70064521, "learning_rate": 2.226959081618174e-06, "loss": 0.72225326, "num_input_tokens_seen": 165567775, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 7715, "time_per_iteration": 2.663273334503174 }, { "auxiliary_loss_clip": 0.01127117, "auxiliary_loss_mlp": 0.01037188, "balance_loss_clip": 1.02293038, "balance_loss_mlp": 1.04449451, "epoch": 0.4639110175860514, "flos": 23915752876800.0, "grad_norm": 2.0766083360814225, "language_loss": 0.68826699, "learning_rate": 2.2265836958493854e-06, "loss": 0.70991004, "num_input_tokens_seen": 165587010, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.828125, "step": 7716, "time_per_iteration": 2.5828700065612793 }, { "auxiliary_loss_clip": 0.01123283, "auxiliary_loss_mlp": 0.01035057, "balance_loss_clip": 1.01982164, "balance_loss_mlp": 1.04284644, "epoch": 0.4639711408387194, "flos": 25300396304640.0, "grad_norm": 1.6138749485444697, "language_loss": 0.8087728, "learning_rate": 2.2262083019944064e-06, "loss": 0.83035624, "num_input_tokens_seen": 165607850, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8046875, "step": 7717, "time_per_iteration": 2.6597747802734375 }, { "auxiliary_loss_clip": 0.01121095, "auxiliary_loss_mlp": 0.01033784, "balance_loss_clip": 1.01981878, "balance_loss_mlp": 1.04219508, "epoch": 0.46403126409138734, "flos": 21211822817280.0, "grad_norm": 1.8537406145564543, "language_loss": 0.72923934, "learning_rate": 2.225832900066636e-06, "loss": 0.75078809, "num_input_tokens_seen": 165627175, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7890625, "step": 7718, "time_per_iteration": 2.5492103099823 }, { "auxiliary_loss_clip": 0.01117959, "auxiliary_loss_mlp": 0.01283981, "balance_loss_clip": 1.02210999, "balance_loss_mlp": 1.04076552, "epoch": 0.4640913873440553, "flos": 35845564325760.0, "grad_norm": 2.6815091198189163, "language_loss": 0.70374501, "learning_rate": 2.2254574900794693e-06, "loss": 0.72776437, "num_input_tokens_seen": 165648340, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 7719, "time_per_iteration": 2.7326607704162598 }, { "auxiliary_loss_clip": 0.01152907, "auxiliary_loss_mlp": 0.01037374, "balance_loss_clip": 1.02228248, "balance_loss_mlp": 1.04492855, "epoch": 0.46415151059672327, "flos": 19166207270400.0, "grad_norm": 3.358612368227379, "language_loss": 0.86559153, "learning_rate": 2.2250820720463055e-06, "loss": 0.88749444, "num_input_tokens_seen": 165667195, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 7720, "time_per_iteration": 2.636878728866577 }, { "auxiliary_loss_clip": 0.01062601, "auxiliary_loss_mlp": 0.01002236, "balance_loss_clip": 1.00014961, "balance_loss_mlp": 1.01928067, "epoch": 0.46421163384939124, "flos": 58912750304640.0, "grad_norm": 0.7140313502286619, "language_loss": 0.55061048, "learning_rate": 2.2247066459805414e-06, "loss": 0.57125884, "num_input_tokens_seen": 165726760, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.25585938, "step": 7721, "time_per_iteration": 3.235987901687622 }, { "auxiliary_loss_clip": 0.01138817, "auxiliary_loss_mlp": 0.01036464, "balance_loss_clip": 1.02162266, "balance_loss_mlp": 1.04371727, "epoch": 0.4642717571020592, "flos": 20157342226560.0, "grad_norm": 2.089338548838468, "language_loss": 0.7975601, "learning_rate": 2.2243312118955746e-06, "loss": 0.81931287, "num_input_tokens_seen": 165745005, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 7722, "time_per_iteration": 2.6106860637664795 }, { "auxiliary_loss_clip": 0.01123952, "auxiliary_loss_mlp": 0.01037634, "balance_loss_clip": 1.02293539, "balance_loss_mlp": 1.04264569, "epoch": 0.46433188035472717, "flos": 25046184775680.0, "grad_norm": 1.5704036800833658, "language_loss": 0.77565444, "learning_rate": 2.2239557698048043e-06, "loss": 0.79727036, "num_input_tokens_seen": 165765750, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 7723, "time_per_iteration": 2.5998640060424805 }, { "auxiliary_loss_clip": 0.01189139, "auxiliary_loss_mlp": 0.01034333, "balance_loss_clip": 1.01995063, "balance_loss_mlp": 1.04202628, "epoch": 0.4643920036073952, "flos": 28075644817920.0, "grad_norm": 1.5222039257724473, "language_loss": 0.6808697, "learning_rate": 2.2235803197216285e-06, "loss": 0.7031045, "num_input_tokens_seen": 165787515, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.765625, "step": 7724, "time_per_iteration": 2.7892940044403076 }, { "auxiliary_loss_clip": 0.01128199, "auxiliary_loss_mlp": 0.0103878, "balance_loss_clip": 1.02391505, "balance_loss_mlp": 1.04064834, "epoch": 0.46445212686006315, "flos": 18369350000640.0, "grad_norm": 2.059222996594892, "language_loss": 0.67048919, "learning_rate": 2.2232048616594464e-06, "loss": 0.69215894, "num_input_tokens_seen": 165806675, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78125, "step": 7725, "time_per_iteration": 2.5640134811401367 }, { "auxiliary_loss_clip": 0.01125522, "auxiliary_loss_mlp": 0.0103599, "balance_loss_clip": 1.02288902, "balance_loss_mlp": 1.04228973, "epoch": 0.4645122501127311, "flos": 31721618920320.0, "grad_norm": 2.316537059422622, "language_loss": 0.64791524, "learning_rate": 2.2228293956316563e-06, "loss": 0.66953033, "num_input_tokens_seen": 165829835, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.74609375, "step": 7726, "time_per_iteration": 2.6990177631378174 }, { "auxiliary_loss_clip": 0.01141316, "auxiliary_loss_mlp": 0.01281571, "balance_loss_clip": 1.02034318, "balance_loss_mlp": 1.04548526, "epoch": 0.4645723733653991, "flos": 23768806337280.0, "grad_norm": 1.9315478916352247, "language_loss": 0.74823451, "learning_rate": 2.2224539216516592e-06, "loss": 0.77246332, "num_input_tokens_seen": 165849380, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 7727, "time_per_iteration": 2.576625108718872 }, { "auxiliary_loss_clip": 0.01138704, "auxiliary_loss_mlp": 0.01039722, "balance_loss_clip": 1.02474952, "balance_loss_mlp": 1.04283202, "epoch": 0.46463249661806705, "flos": 33145512935040.0, "grad_norm": 2.0071941453728304, "language_loss": 0.78355122, "learning_rate": 2.2220784397328534e-06, "loss": 0.80533552, "num_input_tokens_seen": 165868620, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.78125, "step": 7728, "time_per_iteration": 2.706233024597168 }, { "auxiliary_loss_clip": 0.01150306, "auxiliary_loss_mlp": 0.01041371, "balance_loss_clip": 1.02658308, "balance_loss_mlp": 1.04431796, "epoch": 0.464692619870735, "flos": 18296020385280.0, "grad_norm": 1.961908744648739, "language_loss": 0.76070201, "learning_rate": 2.2217029498886386e-06, "loss": 0.78261876, "num_input_tokens_seen": 165885915, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.79296875, "step": 7729, "time_per_iteration": 2.5786564350128174 }, { "auxiliary_loss_clip": 0.01138642, "auxiliary_loss_mlp": 0.01037577, "balance_loss_clip": 1.02373123, "balance_loss_mlp": 1.04363322, "epoch": 0.464752743123403, "flos": 22638051216000.0, "grad_norm": 1.6830303231078352, "language_loss": 0.80494153, "learning_rate": 2.2213274521324174e-06, "loss": 0.82670373, "num_input_tokens_seen": 165905465, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76953125, "step": 7730, "time_per_iteration": 2.609333038330078 }, { "auxiliary_loss_clip": 0.01119206, "auxiliary_loss_mlp": 0.01039396, "balance_loss_clip": 1.02544224, "balance_loss_mlp": 1.04153371, "epoch": 0.46481286637607094, "flos": 20412128373120.0, "grad_norm": 1.5423655299083872, "language_loss": 0.76691121, "learning_rate": 2.220951946477587e-06, "loss": 0.78849721, "num_input_tokens_seen": 165924640, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.77734375, "step": 7731, "time_per_iteration": 2.5679073333740234 }, { "auxiliary_loss_clip": 0.01145817, "auxiliary_loss_mlp": 0.01031431, "balance_loss_clip": 1.01766872, "balance_loss_mlp": 1.0419265, "epoch": 0.4648729896287389, "flos": 34275406129920.0, "grad_norm": 1.851269302181572, "language_loss": 0.6576463, "learning_rate": 2.2205764329375516e-06, "loss": 0.6794188, "num_input_tokens_seen": 165945765, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.76953125, "step": 7732, "time_per_iteration": 2.7412142753601074 }, { "auxiliary_loss_clip": 0.01142695, "auxiliary_loss_mlp": 0.0103977, "balance_loss_clip": 1.0237124, "balance_loss_mlp": 1.0422101, "epoch": 0.4649331128814069, "flos": 21321781326720.0, "grad_norm": 3.223386860909147, "language_loss": 0.72560287, "learning_rate": 2.2202009115257105e-06, "loss": 0.74742746, "num_input_tokens_seen": 165964025, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.828125, "step": 7733, "time_per_iteration": 2.564030647277832 }, { "auxiliary_loss_clip": 0.01045027, "auxiliary_loss_mlp": 0.01006835, "balance_loss_clip": 1.00495195, "balance_loss_mlp": 1.01998377, "epoch": 0.46499323613407484, "flos": 58308515717760.0, "grad_norm": 1.2372287108436628, "language_loss": 0.5191704, "learning_rate": 2.219825382255464e-06, "loss": 0.53968894, "num_input_tokens_seen": 166021950, "router_z_loss_clip": 0.01879883, "router_z_loss_mlp": 0.25, "step": 7734, "time_per_iteration": 3.1359686851501465 }, { "auxiliary_loss_clip": 0.01138595, "auxiliary_loss_mlp": 0.01033399, "balance_loss_clip": 1.01902211, "balance_loss_mlp": 1.04356515, "epoch": 0.4650533593867428, "flos": 10889660384640.0, "grad_norm": 2.1446540279986595, "language_loss": 0.75773549, "learning_rate": 2.2194498451402163e-06, "loss": 0.77945542, "num_input_tokens_seen": 166039675, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 7735, "time_per_iteration": 2.5797832012176514 }, { "auxiliary_loss_clip": 0.01146116, "auxiliary_loss_mlp": 0.01041138, "balance_loss_clip": 1.02604616, "balance_loss_mlp": 1.04321313, "epoch": 0.46511348263941077, "flos": 19974592805760.0, "grad_norm": 1.9871297995113884, "language_loss": 0.69321775, "learning_rate": 2.2190743001933675e-06, "loss": 0.71509027, "num_input_tokens_seen": 166057745, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.76171875, "step": 7736, "time_per_iteration": 2.6878645420074463 }, { "auxiliary_loss_clip": 0.01119133, "auxiliary_loss_mlp": 0.01033092, "balance_loss_clip": 1.01974046, "balance_loss_mlp": 1.04383898, "epoch": 0.46517360589207873, "flos": 19678401256320.0, "grad_norm": 1.6845760931115323, "language_loss": 0.72126383, "learning_rate": 2.2186987474283207e-06, "loss": 0.74278605, "num_input_tokens_seen": 166076440, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 7737, "time_per_iteration": 2.6275315284729004 }, { "auxiliary_loss_clip": 0.01144017, "auxiliary_loss_mlp": 0.01043854, "balance_loss_clip": 1.02804685, "balance_loss_mlp": 1.04579544, "epoch": 0.46523372914474675, "flos": 16872665074560.0, "grad_norm": 1.826532868511691, "language_loss": 0.84078985, "learning_rate": 2.218323186858478e-06, "loss": 0.86266851, "num_input_tokens_seen": 166092520, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8046875, "step": 7738, "time_per_iteration": 2.6024365425109863 }, { "auxiliary_loss_clip": 0.01052646, "auxiliary_loss_mlp": 0.01003918, "balance_loss_clip": 1.00181985, "balance_loss_mlp": 1.01875758, "epoch": 0.4652938523974147, "flos": 53439138339840.0, "grad_norm": 0.7705455976064329, "language_loss": 0.57767177, "learning_rate": 2.2179476184972428e-06, "loss": 0.5982374, "num_input_tokens_seen": 166156285, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.25, "step": 7739, "time_per_iteration": 3.1926183700561523 }, { "auxiliary_loss_clip": 0.01140833, "auxiliary_loss_mlp": 0.01040406, "balance_loss_clip": 1.0264349, "balance_loss_mlp": 1.0452745, "epoch": 0.4653539756500827, "flos": 15231296165760.0, "grad_norm": 1.7110994261620411, "language_loss": 0.85102183, "learning_rate": 2.2175720423580173e-06, "loss": 0.87283421, "num_input_tokens_seen": 166173455, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 7740, "time_per_iteration": 3.96447491645813 }, { "auxiliary_loss_clip": 0.01140206, "auxiliary_loss_mlp": 0.01040647, "balance_loss_clip": 1.02545941, "balance_loss_mlp": 1.04563856, "epoch": 0.46541409890275065, "flos": 23732249270400.0, "grad_norm": 1.5015946319100062, "language_loss": 0.75861287, "learning_rate": 2.217196458454205e-06, "loss": 0.78042138, "num_input_tokens_seen": 166194370, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7734375, "step": 7741, "time_per_iteration": 2.600940227508545 }, { "auxiliary_loss_clip": 0.01153237, "auxiliary_loss_mlp": 0.01039796, "balance_loss_clip": 1.02521718, "balance_loss_mlp": 1.04469633, "epoch": 0.4654742221554186, "flos": 20847329556480.0, "grad_norm": 1.8015581826269915, "language_loss": 0.80774677, "learning_rate": 2.2168208667992105e-06, "loss": 0.8296771, "num_input_tokens_seen": 166213195, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8203125, "step": 7742, "time_per_iteration": 2.6728382110595703 }, { "auxiliary_loss_clip": 0.0114004, "auxiliary_loss_mlp": 0.01042182, "balance_loss_clip": 1.02662563, "balance_loss_mlp": 1.04190695, "epoch": 0.4655343454080866, "flos": 20704836303360.0, "grad_norm": 1.7151903993444293, "language_loss": 0.72784877, "learning_rate": 2.2164452674064365e-06, "loss": 0.74967098, "num_input_tokens_seen": 166231350, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8125, "step": 7743, "time_per_iteration": 2.5903801918029785 }, { "auxiliary_loss_clip": 0.01142917, "auxiliary_loss_mlp": 0.01031667, "balance_loss_clip": 1.01706362, "balance_loss_mlp": 1.04573894, "epoch": 0.46559446866075455, "flos": 18989850470400.0, "grad_norm": 1.9698940276213874, "language_loss": 0.71252108, "learning_rate": 2.2160696602892875e-06, "loss": 0.73426688, "num_input_tokens_seen": 166250530, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 7744, "time_per_iteration": 2.5879154205322266 }, { "auxiliary_loss_clip": 0.01157301, "auxiliary_loss_mlp": 0.01030655, "balance_loss_clip": 1.01702964, "balance_loss_mlp": 1.04424262, "epoch": 0.4656545919134225, "flos": 34496364643200.0, "grad_norm": 1.5230428819433273, "language_loss": 0.85260546, "learning_rate": 2.2156940454611685e-06, "loss": 0.87448502, "num_input_tokens_seen": 166272545, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76953125, "step": 7745, "time_per_iteration": 2.6994364261627197 }, { "auxiliary_loss_clip": 0.0112866, "auxiliary_loss_mlp": 0.01040356, "balance_loss_clip": 1.02565742, "balance_loss_mlp": 1.04426217, "epoch": 0.4657147151660905, "flos": 24310554238080.0, "grad_norm": 4.279396403687147, "language_loss": 0.73244148, "learning_rate": 2.215318422935484e-06, "loss": 0.75413167, "num_input_tokens_seen": 166292135, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.75390625, "step": 7746, "time_per_iteration": 3.9610486030578613 }, { "auxiliary_loss_clip": 0.01129852, "auxiliary_loss_mlp": 0.01038457, "balance_loss_clip": 1.02372253, "balance_loss_mlp": 1.04442835, "epoch": 0.46577483841875844, "flos": 58795139220480.0, "grad_norm": 1.5649742676503091, "language_loss": 0.69784188, "learning_rate": 2.2149427927256387e-06, "loss": 0.71952498, "num_input_tokens_seen": 166316710, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.765625, "step": 7747, "time_per_iteration": 2.943800687789917 }, { "auxiliary_loss_clip": 0.01146143, "auxiliary_loss_mlp": 0.01034417, "balance_loss_clip": 1.01970661, "balance_loss_mlp": 1.04196382, "epoch": 0.4658349616714264, "flos": 31321969223040.0, "grad_norm": 1.750153579642472, "language_loss": 0.6765089, "learning_rate": 2.2145671548450378e-06, "loss": 0.69831449, "num_input_tokens_seen": 166338535, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.77734375, "step": 7748, "time_per_iteration": 2.70900559425354 }, { "auxiliary_loss_clip": 0.01153113, "auxiliary_loss_mlp": 0.01039634, "balance_loss_clip": 1.02251554, "balance_loss_mlp": 1.04421949, "epoch": 0.46589508492409437, "flos": 14860338456960.0, "grad_norm": 1.9670329474017343, "language_loss": 0.63512039, "learning_rate": 2.2141915093070875e-06, "loss": 0.65704787, "num_input_tokens_seen": 166355540, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.82421875, "step": 7749, "time_per_iteration": 2.616478204727173 }, { "auxiliary_loss_clip": 0.01134334, "auxiliary_loss_mlp": 0.01037361, "balance_loss_clip": 1.02169132, "balance_loss_mlp": 1.04566693, "epoch": 0.46595520817676234, "flos": 12895989431040.0, "grad_norm": 2.08890064413573, "language_loss": 0.73925948, "learning_rate": 2.213815856125193e-06, "loss": 0.76097643, "num_input_tokens_seen": 166372635, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.80078125, "step": 7750, "time_per_iteration": 4.163922071456909 }, { "auxiliary_loss_clip": 0.01142474, "auxiliary_loss_mlp": 0.0102957, "balance_loss_clip": 1.01446605, "balance_loss_mlp": 1.04382789, "epoch": 0.46601533142943036, "flos": 32854169721600.0, "grad_norm": 1.8347758619654184, "language_loss": 0.74182594, "learning_rate": 2.21344019531276e-06, "loss": 0.76354635, "num_input_tokens_seen": 166393175, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8046875, "step": 7751, "time_per_iteration": 2.7425858974456787 }, { "auxiliary_loss_clip": 0.01154801, "auxiliary_loss_mlp": 0.01036216, "balance_loss_clip": 1.02045608, "balance_loss_mlp": 1.04626167, "epoch": 0.4660754546820983, "flos": 19967517826560.0, "grad_norm": 1.9293170762988856, "language_loss": 0.73622435, "learning_rate": 2.2130645268831965e-06, "loss": 0.75813448, "num_input_tokens_seen": 166408630, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8203125, "step": 7752, "time_per_iteration": 4.089848518371582 }, { "auxiliary_loss_clip": 0.01146046, "auxiliary_loss_mlp": 0.01039733, "balance_loss_clip": 1.02480781, "balance_loss_mlp": 1.04551828, "epoch": 0.4661355779347663, "flos": 26688164215680.0, "grad_norm": 2.183330813607159, "language_loss": 0.69118834, "learning_rate": 2.2126888508499074e-06, "loss": 0.71304607, "num_input_tokens_seen": 166428170, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8203125, "step": 7753, "time_per_iteration": 2.5713300704956055 }, { "auxiliary_loss_clip": 0.0113759, "auxiliary_loss_mlp": 0.01036448, "balance_loss_clip": 1.02160656, "balance_loss_mlp": 1.04446018, "epoch": 0.46619570118743425, "flos": 20959442881920.0, "grad_norm": 1.708409317869739, "language_loss": 0.73216069, "learning_rate": 2.2123131672263005e-06, "loss": 0.75390112, "num_input_tokens_seen": 166446705, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7578125, "step": 7754, "time_per_iteration": 2.6159281730651855 }, { "auxiliary_loss_clip": 0.01142128, "auxiliary_loss_mlp": 0.0103377, "balance_loss_clip": 1.01858234, "balance_loss_mlp": 1.04387975, "epoch": 0.4662558244401022, "flos": 24426079355520.0, "grad_norm": 1.5451093171571038, "language_loss": 0.79275435, "learning_rate": 2.2119374760257828e-06, "loss": 0.81451333, "num_input_tokens_seen": 166466750, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8046875, "step": 7755, "time_per_iteration": 2.5736446380615234 }, { "auxiliary_loss_clip": 0.01129154, "auxiliary_loss_mlp": 0.01033349, "balance_loss_clip": 1.018507, "balance_loss_mlp": 1.04443002, "epoch": 0.4663159476927702, "flos": 20595452411520.0, "grad_norm": 1.5759316664875074, "language_loss": 0.71861124, "learning_rate": 2.2115617772617614e-06, "loss": 0.74023622, "num_input_tokens_seen": 166485400, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7578125, "step": 7756, "time_per_iteration": 2.637408494949341 }, { "auxiliary_loss_clip": 0.01131605, "auxiliary_loss_mlp": 0.01034308, "balance_loss_clip": 1.01977038, "balance_loss_mlp": 1.04471731, "epoch": 0.46637607094543815, "flos": 25661872823040.0, "grad_norm": 1.975711786627542, "language_loss": 0.77801931, "learning_rate": 2.211186070947645e-06, "loss": 0.79967844, "num_input_tokens_seen": 166505730, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78125, "step": 7757, "time_per_iteration": 2.60446834564209 }, { "auxiliary_loss_clip": 0.01150304, "auxiliary_loss_mlp": 0.01031321, "balance_loss_clip": 1.01649141, "balance_loss_mlp": 1.04549217, "epoch": 0.4664361941981061, "flos": 24273853516800.0, "grad_norm": 1.8248838253425157, "language_loss": 0.6620087, "learning_rate": 2.2108103570968403e-06, "loss": 0.68382502, "num_input_tokens_seen": 166523770, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78125, "step": 7758, "time_per_iteration": 2.782504081726074 }, { "auxiliary_loss_clip": 0.01141829, "auxiliary_loss_mlp": 0.01037981, "balance_loss_clip": 1.02333641, "balance_loss_mlp": 1.04589868, "epoch": 0.4664963174507741, "flos": 18405871153920.0, "grad_norm": 1.7236065666904266, "language_loss": 0.74818385, "learning_rate": 2.210434635722757e-06, "loss": 0.76998192, "num_input_tokens_seen": 166542935, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 7759, "time_per_iteration": 2.6671109199523926 }, { "auxiliary_loss_clip": 0.01140323, "auxiliary_loss_mlp": 0.01039128, "balance_loss_clip": 1.02432215, "balance_loss_mlp": 1.04553413, "epoch": 0.46655644070344204, "flos": 22455122227200.0, "grad_norm": 1.508084524204602, "language_loss": 0.77410209, "learning_rate": 2.2100589068388028e-06, "loss": 0.79589659, "num_input_tokens_seen": 166563935, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 7760, "time_per_iteration": 2.7215023040771484 }, { "auxiliary_loss_clip": 0.01136749, "auxiliary_loss_mlp": 0.01033937, "balance_loss_clip": 1.01891649, "balance_loss_mlp": 1.0439918, "epoch": 0.46661656395611, "flos": 13808407731840.0, "grad_norm": 1.6393743142980175, "language_loss": 0.73459482, "learning_rate": 2.2096831704583858e-06, "loss": 0.75630164, "num_input_tokens_seen": 166582175, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.75, "step": 7761, "time_per_iteration": 2.7058815956115723 }, { "auxiliary_loss_clip": 0.01135111, "auxiliary_loss_mlp": 0.01038178, "balance_loss_clip": 1.02113152, "balance_loss_mlp": 1.04450107, "epoch": 0.466676687208778, "flos": 21652159645440.0, "grad_norm": 1.679610571387549, "language_loss": 0.79108083, "learning_rate": 2.2093074265949164e-06, "loss": 0.81281364, "num_input_tokens_seen": 166601870, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.81640625, "step": 7762, "time_per_iteration": 2.562544822692871 }, { "auxiliary_loss_clip": 0.01056138, "auxiliary_loss_mlp": 0.00999682, "balance_loss_clip": 0.99760753, "balance_loss_mlp": 1.02198434, "epoch": 0.46673681046144594, "flos": 68534259068160.0, "grad_norm": 0.8265383425974437, "language_loss": 0.59743875, "learning_rate": 2.2089316752618034e-06, "loss": 0.61799699, "num_input_tokens_seen": 166668960, "router_z_loss_clip": 0.02075195, "router_z_loss_mlp": 0.25, "step": 7763, "time_per_iteration": 3.373180866241455 }, { "auxiliary_loss_clip": 0.01153052, "auxiliary_loss_mlp": 0.01038218, "balance_loss_clip": 1.02206564, "balance_loss_mlp": 1.04467559, "epoch": 0.46679693371411396, "flos": 15814449469440.0, "grad_norm": 2.2295784018338547, "language_loss": 0.78718108, "learning_rate": 2.208555916472456e-06, "loss": 0.80909377, "num_input_tokens_seen": 166686110, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.81640625, "step": 7764, "time_per_iteration": 2.639575481414795 }, { "auxiliary_loss_clip": 0.01148185, "auxiliary_loss_mlp": 0.01041318, "balance_loss_clip": 1.02719188, "balance_loss_mlp": 1.04502606, "epoch": 0.4668570569667819, "flos": 18514572687360.0, "grad_norm": 1.7550776704681936, "language_loss": 0.71631753, "learning_rate": 2.208180150240285e-06, "loss": 0.73821259, "num_input_tokens_seen": 166703930, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 7765, "time_per_iteration": 2.619494915008545 }, { "auxiliary_loss_clip": 0.01151608, "auxiliary_loss_mlp": 0.01044165, "balance_loss_clip": 1.02852499, "balance_loss_mlp": 1.04412699, "epoch": 0.4669171802194499, "flos": 19206643006080.0, "grad_norm": 1.9552126221797022, "language_loss": 0.77600092, "learning_rate": 2.2078043765786993e-06, "loss": 0.79795873, "num_input_tokens_seen": 166719940, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8046875, "step": 7766, "time_per_iteration": 2.5752410888671875 }, { "auxiliary_loss_clip": 0.01128617, "auxiliary_loss_mlp": 0.01036325, "balance_loss_clip": 1.02156734, "balance_loss_mlp": 1.04145181, "epoch": 0.46697730347211786, "flos": 12276135406080.0, "grad_norm": 2.756766782525659, "language_loss": 0.6492331, "learning_rate": 2.2074285955011097e-06, "loss": 0.67088258, "num_input_tokens_seen": 166738285, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78125, "step": 7767, "time_per_iteration": 2.5992062091827393 }, { "auxiliary_loss_clip": 0.01131712, "auxiliary_loss_mlp": 0.0103734, "balance_loss_clip": 1.02262366, "balance_loss_mlp": 1.04467535, "epoch": 0.4670374267247858, "flos": 23586739274880.0, "grad_norm": 1.9208766580667875, "language_loss": 0.74488568, "learning_rate": 2.2070528070209272e-06, "loss": 0.76657629, "num_input_tokens_seen": 166758170, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 7768, "time_per_iteration": 2.5319607257843018 }, { "auxiliary_loss_clip": 0.01142848, "auxiliary_loss_mlp": 0.01037871, "balance_loss_clip": 1.02369726, "balance_loss_mlp": 1.0455277, "epoch": 0.4670975499774538, "flos": 15991093578240.0, "grad_norm": 1.6646059107088498, "language_loss": 0.70853519, "learning_rate": 2.2066770111515635e-06, "loss": 0.73034239, "num_input_tokens_seen": 166775750, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.79296875, "step": 7769, "time_per_iteration": 2.56463360786438 }, { "auxiliary_loss_clip": 0.01121504, "auxiliary_loss_mlp": 0.01038867, "balance_loss_clip": 1.02450216, "balance_loss_mlp": 1.04328012, "epoch": 0.46715767323012175, "flos": 15377596260480.0, "grad_norm": 2.404223553046452, "language_loss": 0.81348503, "learning_rate": 2.206301207906428e-06, "loss": 0.83508873, "num_input_tokens_seen": 166791720, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78125, "step": 7770, "time_per_iteration": 2.451537847518921 }, { "auxiliary_loss_clip": 0.01042873, "auxiliary_loss_mlp": 0.01003461, "balance_loss_clip": 1.00152993, "balance_loss_mlp": 1.01818657, "epoch": 0.4672177964827897, "flos": 60252217401600.0, "grad_norm": 0.7914921086731614, "language_loss": 0.5566138, "learning_rate": 2.2059253972989332e-06, "loss": 0.57707715, "num_input_tokens_seen": 166856360, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.24707031, "step": 7771, "time_per_iteration": 3.1307051181793213 }, { "auxiliary_loss_clip": 0.0112904, "auxiliary_loss_mlp": 0.01288601, "balance_loss_clip": 1.02703547, "balance_loss_mlp": 1.04307818, "epoch": 0.4672779197354577, "flos": 27636134002560.0, "grad_norm": 2.041833779717557, "language_loss": 0.65918934, "learning_rate": 2.2055495793424913e-06, "loss": 0.6833657, "num_input_tokens_seen": 166875925, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.76953125, "step": 7772, "time_per_iteration": 2.577298402786255 }, { "auxiliary_loss_clip": 0.01130098, "auxiliary_loss_mlp": 0.0103462, "balance_loss_clip": 1.02042198, "balance_loss_mlp": 1.0442493, "epoch": 0.46733804298812565, "flos": 31394257344000.0, "grad_norm": 1.7877457842840494, "language_loss": 0.63889802, "learning_rate": 2.2051737540505128e-06, "loss": 0.66054517, "num_input_tokens_seen": 166896520, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 7773, "time_per_iteration": 2.5994153022766113 }, { "auxiliary_loss_clip": 0.01139073, "auxiliary_loss_mlp": 0.01037141, "balance_loss_clip": 1.02239466, "balance_loss_mlp": 1.04258859, "epoch": 0.4673981662407936, "flos": 19500607912320.0, "grad_norm": 1.870193445164706, "language_loss": 0.79792893, "learning_rate": 2.2047979214364117e-06, "loss": 0.81969106, "num_input_tokens_seen": 166915370, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7890625, "step": 7774, "time_per_iteration": 2.615755081176758 }, { "auxiliary_loss_clip": 0.01151518, "auxiliary_loss_mlp": 0.01034543, "balance_loss_clip": 1.0199275, "balance_loss_mlp": 1.04439425, "epoch": 0.4674582894934616, "flos": 20521835487360.0, "grad_norm": 1.620783422677841, "language_loss": 0.77351665, "learning_rate": 2.2044220815135984e-06, "loss": 0.79537725, "num_input_tokens_seen": 166934875, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8046875, "step": 7775, "time_per_iteration": 2.6731693744659424 }, { "auxiliary_loss_clip": 0.01137951, "auxiliary_loss_mlp": 0.0103529, "balance_loss_clip": 1.02096689, "balance_loss_mlp": 1.04341435, "epoch": 0.46751841274612954, "flos": 22090952188800.0, "grad_norm": 1.664122793108066, "language_loss": 0.6947639, "learning_rate": 2.2040462342954876e-06, "loss": 0.71649629, "num_input_tokens_seen": 166954285, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 7776, "time_per_iteration": 2.6233737468719482 }, { "auxiliary_loss_clip": 0.01132618, "auxiliary_loss_mlp": 0.01035035, "balance_loss_clip": 1.02186835, "balance_loss_mlp": 1.04535735, "epoch": 0.46757853599879756, "flos": 26980082046720.0, "grad_norm": 2.1984975052094833, "language_loss": 0.73743343, "learning_rate": 2.2036703797954922e-06, "loss": 0.75911003, "num_input_tokens_seen": 166975975, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.78515625, "step": 7777, "time_per_iteration": 2.5748956203460693 }, { "auxiliary_loss_clip": 0.01129613, "auxiliary_loss_mlp": 0.01037985, "balance_loss_clip": 1.02360284, "balance_loss_mlp": 1.04267025, "epoch": 0.4676386592514655, "flos": 24134053783680.0, "grad_norm": 2.239345205790717, "language_loss": 0.69624674, "learning_rate": 2.203294518027024e-06, "loss": 0.71792269, "num_input_tokens_seen": 166996140, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78125, "step": 7778, "time_per_iteration": 2.7104756832122803 }, { "auxiliary_loss_clip": 0.0112953, "auxiliary_loss_mlp": 0.01043637, "balance_loss_clip": 1.02830708, "balance_loss_mlp": 1.04246294, "epoch": 0.4676987825041335, "flos": 25483720343040.0, "grad_norm": 1.6645530451751336, "language_loss": 0.73558801, "learning_rate": 2.2029186490034977e-06, "loss": 0.75731963, "num_input_tokens_seen": 167016105, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.78125, "step": 7779, "time_per_iteration": 2.595686197280884 }, { "auxiliary_loss_clip": 0.01156721, "auxiliary_loss_mlp": 0.01041672, "balance_loss_clip": 1.02766478, "balance_loss_mlp": 1.04494143, "epoch": 0.46775890575680146, "flos": 21945298538880.0, "grad_norm": 1.6213336315971791, "language_loss": 0.72663307, "learning_rate": 2.2025427727383262e-06, "loss": 0.74861693, "num_input_tokens_seen": 167036185, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 7780, "time_per_iteration": 2.6969974040985107 }, { "auxiliary_loss_clip": 0.01141868, "auxiliary_loss_mlp": 0.0103766, "balance_loss_clip": 1.02308047, "balance_loss_mlp": 1.04242146, "epoch": 0.4678190290094694, "flos": 25228395492480.0, "grad_norm": 1.7882986835846333, "language_loss": 0.742935, "learning_rate": 2.2021668892449246e-06, "loss": 0.76473022, "num_input_tokens_seen": 167054515, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8125, "step": 7781, "time_per_iteration": 2.607856512069702 }, { "auxiliary_loss_clip": 0.01131174, "auxiliary_loss_mlp": 0.01038569, "balance_loss_clip": 1.02329803, "balance_loss_mlp": 1.04184055, "epoch": 0.4678791522621374, "flos": 32268358811520.0, "grad_norm": 1.7605490839490414, "language_loss": 0.62719399, "learning_rate": 2.201790998536707e-06, "loss": 0.64889145, "num_input_tokens_seen": 167077245, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8046875, "step": 7782, "time_per_iteration": 4.097013235092163 }, { "auxiliary_loss_clip": 0.01142142, "auxiliary_loss_mlp": 0.01042059, "balance_loss_clip": 1.02753913, "balance_loss_mlp": 1.04355168, "epoch": 0.46793927551480535, "flos": 27046480337280.0, "grad_norm": 1.841216425857437, "language_loss": 0.63002199, "learning_rate": 2.2014151006270872e-06, "loss": 0.65186405, "num_input_tokens_seen": 167097235, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8046875, "step": 7783, "time_per_iteration": 2.7028825283050537 }, { "auxiliary_loss_clip": 0.01134363, "auxiliary_loss_mlp": 0.01036945, "balance_loss_clip": 1.02195454, "balance_loss_mlp": 1.04402733, "epoch": 0.4679993987674733, "flos": 17457398576640.0, "grad_norm": 2.8052336779513407, "language_loss": 0.6736961, "learning_rate": 2.2010391955294813e-06, "loss": 0.69540918, "num_input_tokens_seen": 167113155, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 7784, "time_per_iteration": 2.5344409942626953 }, { "auxiliary_loss_clip": 0.01134892, "auxiliary_loss_mlp": 0.01034814, "balance_loss_clip": 1.02160549, "balance_loss_mlp": 1.04041982, "epoch": 0.4680595220201413, "flos": 17165121609600.0, "grad_norm": 3.657359276245492, "language_loss": 0.85084659, "learning_rate": 2.200663283257303e-06, "loss": 0.87254369, "num_input_tokens_seen": 167131765, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.765625, "step": 7785, "time_per_iteration": 2.561951160430908 }, { "auxiliary_loss_clip": 0.01147407, "auxiliary_loss_mlp": 0.01039019, "balance_loss_clip": 1.02377248, "balance_loss_mlp": 1.04143643, "epoch": 0.46811964527280925, "flos": 11327591001600.0, "grad_norm": 1.8299029070196657, "language_loss": 0.7725122, "learning_rate": 2.2002873638239686e-06, "loss": 0.79437649, "num_input_tokens_seen": 167149030, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7890625, "step": 7786, "time_per_iteration": 2.591411828994751 }, { "auxiliary_loss_clip": 0.01124883, "auxiliary_loss_mlp": 0.01031773, "balance_loss_clip": 1.01832008, "balance_loss_mlp": 1.04109812, "epoch": 0.4681797685254772, "flos": 24278809593600.0, "grad_norm": 1.794648118546853, "language_loss": 0.74071312, "learning_rate": 2.1999114372428932e-06, "loss": 0.76227969, "num_input_tokens_seen": 167167375, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 7787, "time_per_iteration": 2.603983163833618 }, { "auxiliary_loss_clip": 0.01136706, "auxiliary_loss_mlp": 0.01040831, "balance_loss_clip": 1.0262574, "balance_loss_mlp": 1.04183662, "epoch": 0.4682398917781452, "flos": 31650372293760.0, "grad_norm": 1.8069981953531324, "language_loss": 0.65482986, "learning_rate": 2.1995355035274923e-06, "loss": 0.67660522, "num_input_tokens_seen": 167188065, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.76953125, "step": 7788, "time_per_iteration": 4.130407333374023 }, { "auxiliary_loss_clip": 0.011254, "auxiliary_loss_mlp": 0.01032009, "balance_loss_clip": 1.01923001, "balance_loss_mlp": 1.04097128, "epoch": 0.46830001503081314, "flos": 28110765340800.0, "grad_norm": 1.5337796487950808, "language_loss": 0.63773209, "learning_rate": 2.1991595626911837e-06, "loss": 0.65930617, "num_input_tokens_seen": 167209675, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.75390625, "step": 7789, "time_per_iteration": 2.6567370891571045 }, { "auxiliary_loss_clip": 0.01050424, "auxiliary_loss_mlp": 0.01000495, "balance_loss_clip": 0.99879056, "balance_loss_mlp": 1.01688433, "epoch": 0.4683601382834811, "flos": 57881718316800.0, "grad_norm": 0.6940458518818289, "language_loss": 0.61926585, "learning_rate": 2.1987836147473813e-06, "loss": 0.63977504, "num_input_tokens_seen": 167273940, "router_z_loss_clip": 0.01708984, "router_z_loss_mlp": 0.24609375, "step": 7790, "time_per_iteration": 3.2168097496032715 }, { "auxiliary_loss_clip": 0.01145176, "auxiliary_loss_mlp": 0.01032642, "balance_loss_clip": 1.01922488, "balance_loss_mlp": 1.04338598, "epoch": 0.46842026153614913, "flos": 21871933009920.0, "grad_norm": 1.7184020388231405, "language_loss": 0.79135323, "learning_rate": 2.1984076597095044e-06, "loss": 0.81313145, "num_input_tokens_seen": 167292730, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 7791, "time_per_iteration": 4.064350843429565 }, { "auxiliary_loss_clip": 0.01129207, "auxiliary_loss_mlp": 0.01037401, "balance_loss_clip": 1.02313161, "balance_loss_mlp": 1.04318762, "epoch": 0.4684803847888171, "flos": 24900818434560.0, "grad_norm": 1.9627046298140174, "language_loss": 0.75240445, "learning_rate": 2.1980316975909673e-06, "loss": 0.7740705, "num_input_tokens_seen": 167313460, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 7792, "time_per_iteration": 2.5873496532440186 }, { "auxiliary_loss_clip": 0.01133781, "auxiliary_loss_mlp": 0.01032141, "balance_loss_clip": 1.01903403, "balance_loss_mlp": 1.04060411, "epoch": 0.46854050804148506, "flos": 26251670142720.0, "grad_norm": 1.5267823971236478, "language_loss": 0.68242532, "learning_rate": 2.1976557284051897e-06, "loss": 0.70408458, "num_input_tokens_seen": 167335385, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 7793, "time_per_iteration": 2.6055850982666016 }, { "auxiliary_loss_clip": 0.01126731, "auxiliary_loss_mlp": 0.01279232, "balance_loss_clip": 1.01873517, "balance_loss_mlp": 1.04227209, "epoch": 0.468600631294153, "flos": 21579799697280.0, "grad_norm": 1.608200616544745, "language_loss": 0.73614073, "learning_rate": 2.1972797521655864e-06, "loss": 0.76020032, "num_input_tokens_seen": 167353625, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 7794, "time_per_iteration": 4.035047292709351 }, { "auxiliary_loss_clip": 0.0114482, "auxiliary_loss_mlp": 0.01036773, "balance_loss_clip": 1.02213979, "balance_loss_mlp": 1.04305387, "epoch": 0.468660754546821, "flos": 25885632597120.0, "grad_norm": 1.6430766935699406, "language_loss": 0.63239849, "learning_rate": 2.1969037688855765e-06, "loss": 0.65421444, "num_input_tokens_seen": 167374565, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7578125, "step": 7795, "time_per_iteration": 2.644404172897339 }, { "auxiliary_loss_clip": 0.01142051, "auxiliary_loss_mlp": 0.01028753, "balance_loss_clip": 1.01519322, "balance_loss_mlp": 1.04126549, "epoch": 0.46872087779948896, "flos": 35475001666560.0, "grad_norm": 1.9191499944028332, "language_loss": 0.6809032, "learning_rate": 2.196527778578578e-06, "loss": 0.70261127, "num_input_tokens_seen": 167395010, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 7796, "time_per_iteration": 2.679224729537964 }, { "auxiliary_loss_clip": 0.01116446, "auxiliary_loss_mlp": 0.01278505, "balance_loss_clip": 1.01822054, "balance_loss_mlp": 1.04050088, "epoch": 0.4687810010521569, "flos": 26396425952640.0, "grad_norm": 1.7233512647130496, "language_loss": 0.69659793, "learning_rate": 2.196151781258008e-06, "loss": 0.7205475, "num_input_tokens_seen": 167415285, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 7797, "time_per_iteration": 2.569814920425415 }, { "auxiliary_loss_clip": 0.01137884, "auxiliary_loss_mlp": 0.01037448, "balance_loss_clip": 1.02222455, "balance_loss_mlp": 1.04228139, "epoch": 0.4688411243048249, "flos": 19972761212160.0, "grad_norm": 2.2907903580189064, "language_loss": 0.66834259, "learning_rate": 2.1957757769372856e-06, "loss": 0.6900959, "num_input_tokens_seen": 167432405, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.77734375, "step": 7798, "time_per_iteration": 2.598794460296631 }, { "auxiliary_loss_clip": 0.01049583, "auxiliary_loss_mlp": 0.01002814, "balance_loss_clip": 1.001109, "balance_loss_mlp": 1.01611757, "epoch": 0.46890124755749285, "flos": 63977015900160.0, "grad_norm": 0.9393647935008892, "language_loss": 0.64517295, "learning_rate": 2.1953997656298296e-06, "loss": 0.66569698, "num_input_tokens_seen": 167499365, "router_z_loss_clip": 0.01708984, "router_z_loss_mlp": 0.24414062, "step": 7799, "time_per_iteration": 3.3471689224243164 }, { "auxiliary_loss_clip": 0.01142136, "auxiliary_loss_mlp": 0.01027762, "balance_loss_clip": 1.0139159, "balance_loss_mlp": 1.04031396, "epoch": 0.4689613708101608, "flos": 23768985905280.0, "grad_norm": 1.2903556885234122, "language_loss": 0.71848208, "learning_rate": 2.1950237473490585e-06, "loss": 0.74018109, "num_input_tokens_seen": 167520390, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 7800, "time_per_iteration": 2.6844093799591064 }, { "auxiliary_loss_clip": 0.01113579, "auxiliary_loss_mlp": 0.01034101, "balance_loss_clip": 1.02205539, "balance_loss_mlp": 1.04142809, "epoch": 0.4690214940628288, "flos": 24788705109120.0, "grad_norm": 20.768468302525168, "language_loss": 0.72353983, "learning_rate": 2.1946477221083917e-06, "loss": 0.74501669, "num_input_tokens_seen": 167539865, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.71875, "step": 7801, "time_per_iteration": 2.5787854194641113 }, { "auxiliary_loss_clip": 0.01148841, "auxiliary_loss_mlp": 0.01043008, "balance_loss_clip": 1.02749848, "balance_loss_mlp": 1.04176295, "epoch": 0.46908161731549675, "flos": 18077324428800.0, "grad_norm": 3.7579373774207228, "language_loss": 0.62273932, "learning_rate": 2.194271689921248e-06, "loss": 0.64465779, "num_input_tokens_seen": 167558190, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.80078125, "step": 7802, "time_per_iteration": 2.7133920192718506 }, { "auxiliary_loss_clip": 0.01116738, "auxiliary_loss_mlp": 0.01040492, "balance_loss_clip": 1.02618146, "balance_loss_mlp": 1.03925133, "epoch": 0.4691417405681647, "flos": 25703350053120.0, "grad_norm": 1.832039377489105, "language_loss": 0.73482937, "learning_rate": 2.1938956508010475e-06, "loss": 0.75640172, "num_input_tokens_seen": 167577685, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7734375, "step": 7803, "time_per_iteration": 2.568986177444458 }, { "auxiliary_loss_clip": 0.0113223, "auxiliary_loss_mlp": 0.01286754, "balance_loss_clip": 1.02638543, "balance_loss_mlp": 1.04006577, "epoch": 0.46920186382083273, "flos": 17457039440640.0, "grad_norm": 1.639995301985477, "language_loss": 0.77268064, "learning_rate": 2.19351960476121e-06, "loss": 0.79687047, "num_input_tokens_seen": 167596390, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 7804, "time_per_iteration": 2.5563082695007324 }, { "auxiliary_loss_clip": 0.01132377, "auxiliary_loss_mlp": 0.01030203, "balance_loss_clip": 1.01676774, "balance_loss_mlp": 1.03908873, "epoch": 0.4692619870735007, "flos": 20339445202560.0, "grad_norm": 2.580725560002305, "language_loss": 0.76837683, "learning_rate": 2.193143551815155e-06, "loss": 0.79000264, "num_input_tokens_seen": 167614980, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 7805, "time_per_iteration": 2.5044074058532715 }, { "auxiliary_loss_clip": 0.0112407, "auxiliary_loss_mlp": 0.01044608, "balance_loss_clip": 1.02965927, "balance_loss_mlp": 1.04282951, "epoch": 0.46932211032616866, "flos": 29496558003840.0, "grad_norm": 1.720447213546457, "language_loss": 0.82774401, "learning_rate": 2.192767491976305e-06, "loss": 0.84943074, "num_input_tokens_seen": 167635895, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8125, "step": 7806, "time_per_iteration": 2.641903877258301 }, { "auxiliary_loss_clip": 0.01136195, "auxiliary_loss_mlp": 0.01037155, "balance_loss_clip": 1.02278996, "balance_loss_mlp": 1.04072571, "epoch": 0.4693822335788366, "flos": 36211242735360.0, "grad_norm": 1.7787487190492006, "language_loss": 0.7720663, "learning_rate": 2.192391425258078e-06, "loss": 0.79379976, "num_input_tokens_seen": 167657440, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.77734375, "step": 7807, "time_per_iteration": 2.656412363052368 }, { "auxiliary_loss_clip": 0.01135654, "auxiliary_loss_mlp": 0.01036628, "balance_loss_clip": 1.02220345, "balance_loss_mlp": 1.04092383, "epoch": 0.4694423568315046, "flos": 20338978325760.0, "grad_norm": 2.205460864399937, "language_loss": 0.51268208, "learning_rate": 2.1920153516738967e-06, "loss": 0.53440487, "num_input_tokens_seen": 167675025, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 7808, "time_per_iteration": 2.6382455825805664 }, { "auxiliary_loss_clip": 0.01075389, "auxiliary_loss_mlp": 0.01003041, "balance_loss_clip": 1.00144374, "balance_loss_mlp": 1.01635432, "epoch": 0.46950248008417256, "flos": 64326353621760.0, "grad_norm": 0.7877065608702021, "language_loss": 0.57778847, "learning_rate": 2.1916392712371804e-06, "loss": 0.59857273, "num_input_tokens_seen": 167729635, "router_z_loss_clip": 0.01599121, "router_z_loss_mlp": 0.24316406, "step": 7809, "time_per_iteration": 3.072875499725342 }, { "auxiliary_loss_clip": 0.01157965, "auxiliary_loss_mlp": 0.01039234, "balance_loss_clip": 1.02503037, "balance_loss_mlp": 1.04362464, "epoch": 0.4695626033368405, "flos": 19200106730880.0, "grad_norm": 1.9349209145282837, "language_loss": 0.71786183, "learning_rate": 2.191263183961352e-06, "loss": 0.73983383, "num_input_tokens_seen": 167745135, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 7810, "time_per_iteration": 2.673129081726074 }, { "auxiliary_loss_clip": 0.01126986, "auxiliary_loss_mlp": 0.01039463, "balance_loss_clip": 1.02551532, "balance_loss_mlp": 1.04065943, "epoch": 0.4696227265895085, "flos": 23002436736000.0, "grad_norm": 1.8392521290208983, "language_loss": 0.80799782, "learning_rate": 2.1908870898598326e-06, "loss": 0.82966226, "num_input_tokens_seen": 167763875, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7734375, "step": 7811, "time_per_iteration": 2.594696283340454 }, { "auxiliary_loss_clip": 0.01127053, "auxiliary_loss_mlp": 0.01037413, "balance_loss_clip": 1.02317297, "balance_loss_mlp": 1.04042482, "epoch": 0.46968284984217645, "flos": 21870855601920.0, "grad_norm": 2.0038636852774205, "language_loss": 0.8080439, "learning_rate": 2.1905109889460436e-06, "loss": 0.82968855, "num_input_tokens_seen": 167784895, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 7812, "time_per_iteration": 2.6013689041137695 }, { "auxiliary_loss_clip": 0.01150654, "auxiliary_loss_mlp": 0.01031191, "balance_loss_clip": 1.01790512, "balance_loss_mlp": 1.04065681, "epoch": 0.4697429730948444, "flos": 19974987855360.0, "grad_norm": 2.133985227362376, "language_loss": 0.74329281, "learning_rate": 2.1901348812334073e-06, "loss": 0.76511121, "num_input_tokens_seen": 167803185, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 7813, "time_per_iteration": 2.6759705543518066 }, { "auxiliary_loss_clip": 0.01133354, "auxiliary_loss_mlp": 0.01031122, "balance_loss_clip": 1.01683497, "balance_loss_mlp": 1.03906846, "epoch": 0.4698030963475124, "flos": 15156206784000.0, "grad_norm": 2.190328186342614, "language_loss": 0.84605122, "learning_rate": 2.1897587667353465e-06, "loss": 0.86769593, "num_input_tokens_seen": 167816550, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76171875, "step": 7814, "time_per_iteration": 2.5336556434631348 }, { "auxiliary_loss_clip": 0.01130437, "auxiliary_loss_mlp": 0.01038644, "balance_loss_clip": 1.02528119, "balance_loss_mlp": 1.03922844, "epoch": 0.46986321960018035, "flos": 15151178880000.0, "grad_norm": 2.0459919853112605, "language_loss": 0.817783, "learning_rate": 2.189382645465284e-06, "loss": 0.83947384, "num_input_tokens_seen": 167831845, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 7815, "time_per_iteration": 2.5939488410949707 }, { "auxiliary_loss_clip": 0.01152961, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.01896691, "balance_loss_mlp": 1.04099727, "epoch": 0.4699233428528483, "flos": 23108911626240.0, "grad_norm": 2.109173843655676, "language_loss": 0.77447641, "learning_rate": 2.1890065174366416e-06, "loss": 0.79635066, "num_input_tokens_seen": 167850360, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.76953125, "step": 7816, "time_per_iteration": 2.5821592807769775 }, { "auxiliary_loss_clip": 0.01039904, "auxiliary_loss_mlp": 0.01012603, "balance_loss_clip": 1.0107317, "balance_loss_mlp": 1.01551235, "epoch": 0.46998346610551633, "flos": 68105558246400.0, "grad_norm": 0.8524451101807662, "language_loss": 0.59075069, "learning_rate": 2.1886303826628422e-06, "loss": 0.61127579, "num_input_tokens_seen": 167908660, "router_z_loss_clip": 0.01867676, "router_z_loss_mlp": 0.24414062, "step": 7817, "time_per_iteration": 3.15816068649292 }, { "auxiliary_loss_clip": 0.01130724, "auxiliary_loss_mlp": 0.01032177, "balance_loss_clip": 1.01848006, "balance_loss_mlp": 1.04051828, "epoch": 0.4700435893581843, "flos": 24129456842880.0, "grad_norm": 2.149441228628545, "language_loss": 0.79300785, "learning_rate": 2.1882542411573103e-06, "loss": 0.81463683, "num_input_tokens_seen": 167927905, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 7818, "time_per_iteration": 2.550135850906372 }, { "auxiliary_loss_clip": 0.01115053, "auxiliary_loss_mlp": 0.01033479, "balance_loss_clip": 1.02025294, "balance_loss_mlp": 1.0378201, "epoch": 0.47010371261085226, "flos": 20150518642560.0, "grad_norm": 1.7559387280012102, "language_loss": 0.83648849, "learning_rate": 2.1878780929334684e-06, "loss": 0.85797381, "num_input_tokens_seen": 167945995, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7734375, "step": 7819, "time_per_iteration": 2.56467604637146 }, { "auxiliary_loss_clip": 0.01146852, "auxiliary_loss_mlp": 0.01036969, "balance_loss_clip": 1.022223, "balance_loss_mlp": 1.04074168, "epoch": 0.47016383586352023, "flos": 15122199582720.0, "grad_norm": 2.3569837732626837, "language_loss": 0.75797558, "learning_rate": 2.187501938004741e-06, "loss": 0.77981377, "num_input_tokens_seen": 167963380, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7890625, "step": 7820, "time_per_iteration": 2.516559362411499 }, { "auxiliary_loss_clip": 0.01122911, "auxiliary_loss_mlp": 0.01037767, "balance_loss_clip": 1.02354574, "balance_loss_mlp": 1.0408951, "epoch": 0.4702239591161882, "flos": 13552975140480.0, "grad_norm": 2.312476088996437, "language_loss": 0.7405808, "learning_rate": 2.187125776384552e-06, "loss": 0.76218754, "num_input_tokens_seen": 167981740, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.73046875, "step": 7821, "time_per_iteration": 2.6753928661346436 }, { "auxiliary_loss_clip": 0.01114855, "auxiliary_loss_mlp": 0.01040152, "balance_loss_clip": 1.02704477, "balance_loss_mlp": 1.04038167, "epoch": 0.47028408236885616, "flos": 24276511123200.0, "grad_norm": 2.0259886620186736, "language_loss": 0.88843548, "learning_rate": 2.1867496080863246e-06, "loss": 0.90998554, "num_input_tokens_seen": 167999380, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 7822, "time_per_iteration": 2.5282504558563232 }, { "auxiliary_loss_clip": 0.01123048, "auxiliary_loss_mlp": 0.01031085, "balance_loss_clip": 1.01807928, "balance_loss_mlp": 1.04010499, "epoch": 0.4703442056215241, "flos": 22856926740480.0, "grad_norm": 1.584005695315098, "language_loss": 0.79633582, "learning_rate": 2.186373433123485e-06, "loss": 0.81787711, "num_input_tokens_seen": 168018395, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7421875, "step": 7823, "time_per_iteration": 3.9573428630828857 }, { "auxiliary_loss_clip": 0.010556, "auxiliary_loss_mlp": 0.01002218, "balance_loss_clip": 1.00033426, "balance_loss_mlp": 1.0134027, "epoch": 0.4704043288741921, "flos": 69240227950080.0, "grad_norm": 0.6991542020997407, "language_loss": 0.56660527, "learning_rate": 2.1859972515094562e-06, "loss": 0.58718342, "num_input_tokens_seen": 168084080, "router_z_loss_clip": 0.01879883, "router_z_loss_mlp": 0.24316406, "step": 7824, "time_per_iteration": 3.2564539909362793 }, { "auxiliary_loss_clip": 0.01134449, "auxiliary_loss_mlp": 0.01039034, "balance_loss_clip": 1.02412677, "balance_loss_mlp": 1.03943145, "epoch": 0.47046445212686006, "flos": 18041090584320.0, "grad_norm": 1.7824200629412976, "language_loss": 0.81029832, "learning_rate": 2.185621063257664e-06, "loss": 0.83203316, "num_input_tokens_seen": 168101555, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7734375, "step": 7825, "time_per_iteration": 2.5747766494750977 }, { "auxiliary_loss_clip": 0.01135836, "auxiliary_loss_mlp": 0.01034076, "balance_loss_clip": 1.02008653, "balance_loss_mlp": 1.04142737, "epoch": 0.470524575379528, "flos": 23951448017280.0, "grad_norm": 2.9124843640763043, "language_loss": 0.66340548, "learning_rate": 2.185244868381534e-06, "loss": 0.68510461, "num_input_tokens_seen": 168121530, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 7826, "time_per_iteration": 2.6827640533447266 }, { "auxiliary_loss_clip": 0.01134759, "auxiliary_loss_mlp": 0.01037131, "balance_loss_clip": 1.02228916, "balance_loss_mlp": 1.03941298, "epoch": 0.470584698632196, "flos": 18113558273280.0, "grad_norm": 1.7576548952469702, "language_loss": 0.84327269, "learning_rate": 2.184868666894491e-06, "loss": 0.86499166, "num_input_tokens_seen": 168140335, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 7827, "time_per_iteration": 2.549384355545044 }, { "auxiliary_loss_clip": 0.01055757, "auxiliary_loss_mlp": 0.01001539, "balance_loss_clip": 0.9996559, "balance_loss_mlp": 1.01367927, "epoch": 0.47064482188486395, "flos": 57251916224640.0, "grad_norm": 0.8145361407327973, "language_loss": 0.55681729, "learning_rate": 2.184492458809961e-06, "loss": 0.57739031, "num_input_tokens_seen": 168200535, "router_z_loss_clip": 0.01879883, "router_z_loss_mlp": 0.24121094, "step": 7828, "time_per_iteration": 3.1490328311920166 }, { "auxiliary_loss_clip": 0.01114711, "auxiliary_loss_mlp": 0.01031935, "balance_loss_clip": 1.01772523, "balance_loss_mlp": 1.03942478, "epoch": 0.4707049451375319, "flos": 17895077798400.0, "grad_norm": 1.9465148795759772, "language_loss": 0.80457288, "learning_rate": 2.1841162441413686e-06, "loss": 0.82603937, "num_input_tokens_seen": 168219610, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75390625, "step": 7829, "time_per_iteration": 3.96360445022583 }, { "auxiliary_loss_clip": 0.01116592, "auxiliary_loss_mlp": 0.01035153, "balance_loss_clip": 1.02232051, "balance_loss_mlp": 1.04222083, "epoch": 0.47076506839019994, "flos": 25232669210880.0, "grad_norm": 1.2877163656736614, "language_loss": 0.75860912, "learning_rate": 2.1837400229021423e-06, "loss": 0.78012657, "num_input_tokens_seen": 168242505, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 7830, "time_per_iteration": 2.6024796962738037 }, { "auxiliary_loss_clip": 0.0114291, "auxiliary_loss_mlp": 0.01030675, "balance_loss_clip": 1.01643586, "balance_loss_mlp": 1.0420773, "epoch": 0.4708251916428679, "flos": 13479681438720.0, "grad_norm": 2.023291912062185, "language_loss": 0.79016393, "learning_rate": 2.183363795105707e-06, "loss": 0.81189978, "num_input_tokens_seen": 168260220, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7421875, "step": 7831, "time_per_iteration": 2.7133398056030273 }, { "auxiliary_loss_clip": 0.01150344, "auxiliary_loss_mlp": 0.01036325, "balance_loss_clip": 1.02193689, "balance_loss_mlp": 1.04227626, "epoch": 0.47088531489553587, "flos": 30147833450880.0, "grad_norm": 1.7203563453203399, "language_loss": 0.75262678, "learning_rate": 2.182987560765489e-06, "loss": 0.77449346, "num_input_tokens_seen": 168277360, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.80859375, "step": 7832, "time_per_iteration": 2.608851909637451 }, { "auxiliary_loss_clip": 0.01133775, "auxiliary_loss_mlp": 0.01028361, "balance_loss_clip": 1.01559925, "balance_loss_mlp": 1.04197383, "epoch": 0.47094543814820383, "flos": 21798280172160.0, "grad_norm": 1.3115747082349534, "language_loss": 0.74394608, "learning_rate": 2.182611319894916e-06, "loss": 0.76556736, "num_input_tokens_seen": 168296605, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73828125, "step": 7833, "time_per_iteration": 4.2276225090026855 }, { "auxiliary_loss_clip": 0.01125382, "auxiliary_loss_mlp": 0.01033963, "balance_loss_clip": 1.01999712, "balance_loss_mlp": 1.04210865, "epoch": 0.4710055614008718, "flos": 23003011353600.0, "grad_norm": 1.8443678015267746, "language_loss": 0.75622392, "learning_rate": 2.1822350725074145e-06, "loss": 0.77781737, "num_input_tokens_seen": 168316205, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7421875, "step": 7834, "time_per_iteration": 2.6430413722991943 }, { "auxiliary_loss_clip": 0.01150435, "auxiliary_loss_mlp": 0.01038338, "balance_loss_clip": 1.02509415, "balance_loss_mlp": 1.04152393, "epoch": 0.47106568465353976, "flos": 42741346452480.0, "grad_norm": 1.4145130935440275, "language_loss": 0.66523242, "learning_rate": 2.181858818616412e-06, "loss": 0.68712008, "num_input_tokens_seen": 168338935, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 7835, "time_per_iteration": 4.210626602172852 }, { "auxiliary_loss_clip": 0.0104916, "auxiliary_loss_mlp": 0.0124786, "balance_loss_clip": 1.00000346, "balance_loss_mlp": 1.01589251, "epoch": 0.4711258079062077, "flos": 68554008570240.0, "grad_norm": 0.9023624884783389, "language_loss": 0.62091208, "learning_rate": 2.181482558235336e-06, "loss": 0.64388227, "num_input_tokens_seen": 168392800, "router_z_loss_clip": 0.01794434, "router_z_loss_mlp": 0.24121094, "step": 7836, "time_per_iteration": 3.1329097747802734 }, { "auxiliary_loss_clip": 0.01148533, "auxiliary_loss_mlp": 0.01281499, "balance_loss_clip": 1.02094603, "balance_loss_mlp": 1.04142809, "epoch": 0.4711859311588757, "flos": 25446588658560.0, "grad_norm": 1.6360788462778328, "language_loss": 0.69870782, "learning_rate": 2.181106291377615e-06, "loss": 0.72300816, "num_input_tokens_seen": 168412940, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.796875, "step": 7837, "time_per_iteration": 2.6882312297821045 }, { "auxiliary_loss_clip": 0.01129338, "auxiliary_loss_mlp": 0.01039381, "balance_loss_clip": 1.02440834, "balance_loss_mlp": 1.04337192, "epoch": 0.47124605441154366, "flos": 21981891519360.0, "grad_norm": 2.547285123849865, "language_loss": 0.66015315, "learning_rate": 2.1807300180566766e-06, "loss": 0.6818403, "num_input_tokens_seen": 168431995, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.765625, "step": 7838, "time_per_iteration": 2.5369389057159424 }, { "auxiliary_loss_clip": 0.01138924, "auxiliary_loss_mlp": 0.01030719, "balance_loss_clip": 1.01656854, "balance_loss_mlp": 1.04216242, "epoch": 0.4713061776642116, "flos": 25412689198080.0, "grad_norm": 1.795696842189753, "language_loss": 0.77291232, "learning_rate": 2.1803537382859478e-06, "loss": 0.79460871, "num_input_tokens_seen": 168454585, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7890625, "step": 7839, "time_per_iteration": 2.6476290225982666 }, { "auxiliary_loss_clip": 0.01140889, "auxiliary_loss_mlp": 0.01035806, "balance_loss_clip": 1.02274621, "balance_loss_mlp": 1.04164886, "epoch": 0.4713663009168796, "flos": 26542259170560.0, "grad_norm": 2.322417821850964, "language_loss": 0.72752714, "learning_rate": 2.179977452078858e-06, "loss": 0.7492941, "num_input_tokens_seen": 168471265, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 7840, "time_per_iteration": 2.628990411758423 }, { "auxiliary_loss_clip": 0.01133683, "auxiliary_loss_mlp": 0.01030091, "balance_loss_clip": 1.0159173, "balance_loss_mlp": 1.04043496, "epoch": 0.47142642416954755, "flos": 23623583650560.0, "grad_norm": 2.7244498628651495, "language_loss": 0.75235164, "learning_rate": 2.1796011594488363e-06, "loss": 0.77398932, "num_input_tokens_seen": 168491360, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75, "step": 7841, "time_per_iteration": 2.6633105278015137 }, { "auxiliary_loss_clip": 0.01136452, "auxiliary_loss_mlp": 0.01032368, "balance_loss_clip": 1.01891565, "balance_loss_mlp": 1.0426898, "epoch": 0.4714865474222155, "flos": 22310150935680.0, "grad_norm": 1.6215196812466395, "language_loss": 0.70132357, "learning_rate": 2.1792248604093107e-06, "loss": 0.72301173, "num_input_tokens_seen": 168511335, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 7842, "time_per_iteration": 2.543851613998413 }, { "auxiliary_loss_clip": 0.01128239, "auxiliary_loss_mlp": 0.01033394, "balance_loss_clip": 1.01953042, "balance_loss_mlp": 1.04343784, "epoch": 0.4715466706748835, "flos": 17822430541440.0, "grad_norm": 2.543991096000191, "language_loss": 0.78907925, "learning_rate": 2.1788485549737118e-06, "loss": 0.81069553, "num_input_tokens_seen": 168529920, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 7843, "time_per_iteration": 2.5836048126220703 }, { "auxiliary_loss_clip": 0.01126741, "auxiliary_loss_mlp": 0.01033352, "balance_loss_clip": 1.01983964, "balance_loss_mlp": 1.04202104, "epoch": 0.4716067939275515, "flos": 23659530186240.0, "grad_norm": 1.5883559259995799, "language_loss": 0.74171531, "learning_rate": 2.178472243155467e-06, "loss": 0.76331627, "num_input_tokens_seen": 168550595, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 7844, "time_per_iteration": 2.6279447078704834 }, { "auxiliary_loss_clip": 0.01134186, "auxiliary_loss_mlp": 0.01043455, "balance_loss_clip": 1.02860761, "balance_loss_mlp": 1.04133248, "epoch": 0.47166691718021947, "flos": 17930162407680.0, "grad_norm": 1.8868064223807124, "language_loss": 0.78161675, "learning_rate": 2.178095924968008e-06, "loss": 0.80339313, "num_input_tokens_seen": 168569765, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.75, "step": 7845, "time_per_iteration": 2.6547110080718994 }, { "auxiliary_loss_clip": 0.01122519, "auxiliary_loss_mlp": 0.01033737, "balance_loss_clip": 1.0209043, "balance_loss_mlp": 1.04036176, "epoch": 0.47172704043288743, "flos": 26614583205120.0, "grad_norm": 1.2916201274110488, "language_loss": 0.73126209, "learning_rate": 2.1777196004247623e-06, "loss": 0.7528246, "num_input_tokens_seen": 168591525, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 7846, "time_per_iteration": 2.580707311630249 }, { "auxiliary_loss_clip": 0.01121659, "auxiliary_loss_mlp": 0.0103355, "balance_loss_clip": 1.02016878, "balance_loss_mlp": 1.04030371, "epoch": 0.4717871636855554, "flos": 27922700707200.0, "grad_norm": 1.3380649174122656, "language_loss": 0.73700052, "learning_rate": 2.177343269539162e-06, "loss": 0.75855267, "num_input_tokens_seen": 168611235, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 7847, "time_per_iteration": 2.6491105556488037 }, { "auxiliary_loss_clip": 0.01153998, "auxiliary_loss_mlp": 0.01035353, "balance_loss_clip": 1.0221386, "balance_loss_mlp": 1.04349709, "epoch": 0.47184728693822336, "flos": 14502237816960.0, "grad_norm": 1.7069860595286013, "language_loss": 0.80821103, "learning_rate": 2.176966932324637e-06, "loss": 0.83010453, "num_input_tokens_seen": 168628710, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75390625, "step": 7848, "time_per_iteration": 2.5235118865966797 }, { "auxiliary_loss_clip": 0.01131126, "auxiliary_loss_mlp": 0.01034117, "balance_loss_clip": 1.02038455, "balance_loss_mlp": 1.04621589, "epoch": 0.47190741019089133, "flos": 17856545483520.0, "grad_norm": 1.9342278360161707, "language_loss": 0.70377612, "learning_rate": 2.1765905887946162e-06, "loss": 0.72542858, "num_input_tokens_seen": 168645645, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7578125, "step": 7849, "time_per_iteration": 2.579671859741211 }, { "auxiliary_loss_clip": 0.0114897, "auxiliary_loss_mlp": 0.01040683, "balance_loss_clip": 1.02519202, "balance_loss_mlp": 1.04347503, "epoch": 0.4719675334435593, "flos": 17895472848000.0, "grad_norm": 1.999110072433156, "language_loss": 0.69250798, "learning_rate": 2.176214238962532e-06, "loss": 0.71440452, "num_input_tokens_seen": 168664165, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.7890625, "step": 7850, "time_per_iteration": 2.5434203147888184 }, { "auxiliary_loss_clip": 0.01165607, "auxiliary_loss_mlp": 0.01029586, "balance_loss_clip": 1.01560283, "balance_loss_mlp": 1.04362977, "epoch": 0.47202765669622726, "flos": 20704369426560.0, "grad_norm": 2.0245996258846852, "language_loss": 0.74791539, "learning_rate": 2.175837882841815e-06, "loss": 0.76986736, "num_input_tokens_seen": 168681940, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7734375, "step": 7851, "time_per_iteration": 2.640123128890991 }, { "auxiliary_loss_clip": 0.01165981, "auxiliary_loss_mlp": 0.01035958, "balance_loss_clip": 1.02155185, "balance_loss_mlp": 1.04286957, "epoch": 0.4720877799488952, "flos": 16360255607040.0, "grad_norm": 1.897219059270289, "language_loss": 0.76065761, "learning_rate": 2.1754615204458963e-06, "loss": 0.78267705, "num_input_tokens_seen": 168698830, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 7852, "time_per_iteration": 2.7633657455444336 }, { "auxiliary_loss_clip": 0.01145308, "auxiliary_loss_mlp": 0.01032557, "balance_loss_clip": 1.01819253, "balance_loss_mlp": 1.04306269, "epoch": 0.4721479032015632, "flos": 20668171495680.0, "grad_norm": 3.2560924168373355, "language_loss": 0.6935423, "learning_rate": 2.175085151788208e-06, "loss": 0.71532094, "num_input_tokens_seen": 168718305, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75390625, "step": 7853, "time_per_iteration": 2.639871835708618 }, { "auxiliary_loss_clip": 0.01074697, "auxiliary_loss_mlp": 0.01005793, "balance_loss_clip": 1.00390947, "balance_loss_mlp": 1.01490808, "epoch": 0.47220802645423116, "flos": 67750438435200.0, "grad_norm": 0.7011879216677969, "language_loss": 0.50146049, "learning_rate": 2.17470877688218e-06, "loss": 0.52226543, "num_input_tokens_seen": 168782365, "router_z_loss_clip": 0.01879883, "router_z_loss_mlp": 0.24316406, "step": 7854, "time_per_iteration": 3.3438880443573 }, { "auxiliary_loss_clip": 0.0112821, "auxiliary_loss_mlp": 0.01032697, "balance_loss_clip": 1.01792705, "balance_loss_mlp": 1.04243028, "epoch": 0.4722681497068991, "flos": 20921449271040.0, "grad_norm": 1.9518437304715366, "language_loss": 0.63976854, "learning_rate": 2.1743323957412457e-06, "loss": 0.66137761, "num_input_tokens_seen": 168800485, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.76953125, "step": 7855, "time_per_iteration": 2.582170248031616 }, { "auxiliary_loss_clip": 0.01127738, "auxiliary_loss_mlp": 0.01036256, "balance_loss_clip": 1.02129519, "balance_loss_mlp": 1.04163408, "epoch": 0.4723282729595671, "flos": 28293083798400.0, "grad_norm": 1.8379618222531884, "language_loss": 0.75613475, "learning_rate": 2.1739560083788363e-06, "loss": 0.77777469, "num_input_tokens_seen": 168818965, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7734375, "step": 7856, "time_per_iteration": 2.6776812076568604 }, { "auxiliary_loss_clip": 0.01132504, "auxiliary_loss_mlp": 0.0103268, "balance_loss_clip": 1.01954889, "balance_loss_mlp": 1.04014611, "epoch": 0.4723883962122351, "flos": 27125053338240.0, "grad_norm": 2.0228105213885503, "language_loss": 0.74735141, "learning_rate": 2.1735796148083843e-06, "loss": 0.76900327, "num_input_tokens_seen": 168840355, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.74609375, "step": 7857, "time_per_iteration": 2.638516426086426 }, { "auxiliary_loss_clip": 0.01132599, "auxiliary_loss_mlp": 0.01035864, "balance_loss_clip": 1.0230186, "balance_loss_mlp": 1.04112899, "epoch": 0.47244851946490307, "flos": 31537253387520.0, "grad_norm": 1.5092350845737712, "language_loss": 0.64377731, "learning_rate": 2.1732032150433225e-06, "loss": 0.6654619, "num_input_tokens_seen": 168861765, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 7858, "time_per_iteration": 2.6980717182159424 }, { "auxiliary_loss_clip": 0.01123727, "auxiliary_loss_mlp": 0.01285765, "balance_loss_clip": 1.02462935, "balance_loss_mlp": 1.04006875, "epoch": 0.47250864271757104, "flos": 20886544229760.0, "grad_norm": 1.4574738886639316, "language_loss": 0.70143414, "learning_rate": 2.1728268090970834e-06, "loss": 0.72552907, "num_input_tokens_seen": 168881310, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.74609375, "step": 7859, "time_per_iteration": 2.572300434112549 }, { "auxiliary_loss_clip": 0.01123636, "auxiliary_loss_mlp": 0.01040273, "balance_loss_clip": 1.02594376, "balance_loss_mlp": 1.04397452, "epoch": 0.472568765970239, "flos": 20522086882560.0, "grad_norm": 1.8400502782048347, "language_loss": 0.61908507, "learning_rate": 2.1724503969831003e-06, "loss": 0.64072418, "num_input_tokens_seen": 168899470, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 7860, "time_per_iteration": 2.5727169513702393 }, { "auxiliary_loss_clip": 0.0113954, "auxiliary_loss_mlp": 0.01042267, "balance_loss_clip": 1.02789092, "balance_loss_mlp": 1.0441823, "epoch": 0.47262888922290697, "flos": 35805200417280.0, "grad_norm": 5.328472749531081, "language_loss": 0.71899092, "learning_rate": 2.172073978714806e-06, "loss": 0.74080902, "num_input_tokens_seen": 168921495, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78125, "step": 7861, "time_per_iteration": 2.654752731323242 }, { "auxiliary_loss_clip": 0.01135523, "auxiliary_loss_mlp": 0.01037047, "balance_loss_clip": 1.02315879, "balance_loss_mlp": 1.04084051, "epoch": 0.47268901247557493, "flos": 20667740532480.0, "grad_norm": 2.3059712653768543, "language_loss": 0.8447305, "learning_rate": 2.171697554305634e-06, "loss": 0.86645615, "num_input_tokens_seen": 168940515, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76953125, "step": 7862, "time_per_iteration": 2.6161255836486816 }, { "auxiliary_loss_clip": 0.01117737, "auxiliary_loss_mlp": 0.01031959, "balance_loss_clip": 1.01857185, "balance_loss_mlp": 1.0402689, "epoch": 0.4727491357282429, "flos": 19573291082880.0, "grad_norm": 1.7203135047452274, "language_loss": 0.84278923, "learning_rate": 2.1713211237690178e-06, "loss": 0.86428618, "num_input_tokens_seen": 168958340, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.77734375, "step": 7863, "time_per_iteration": 2.6532952785491943 }, { "auxiliary_loss_clip": 0.01135116, "auxiliary_loss_mlp": 0.01039277, "balance_loss_clip": 1.02581191, "balance_loss_mlp": 1.04232717, "epoch": 0.47280925898091086, "flos": 18217231902720.0, "grad_norm": 1.7679987922893854, "language_loss": 0.65968442, "learning_rate": 2.1709446871183917e-06, "loss": 0.68142831, "num_input_tokens_seen": 168974850, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 7864, "time_per_iteration": 2.619969129562378 }, { "auxiliary_loss_clip": 0.01116906, "auxiliary_loss_mlp": 0.01032076, "balance_loss_clip": 1.01784801, "balance_loss_mlp": 1.04148889, "epoch": 0.4728693822335788, "flos": 17821820010240.0, "grad_norm": 1.9049063838703768, "language_loss": 0.65408242, "learning_rate": 2.1705682443671897e-06, "loss": 0.67557228, "num_input_tokens_seen": 168992860, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75390625, "step": 7865, "time_per_iteration": 3.8467555046081543 }, { "auxiliary_loss_clip": 0.01130807, "auxiliary_loss_mlp": 0.01282694, "balance_loss_clip": 1.02253628, "balance_loss_mlp": 1.03970432, "epoch": 0.4729295054862468, "flos": 20595057361920.0, "grad_norm": 1.6441141855538142, "language_loss": 0.74417752, "learning_rate": 2.1701917955288454e-06, "loss": 0.76831251, "num_input_tokens_seen": 169010325, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 7866, "time_per_iteration": 2.5189669132232666 }, { "auxiliary_loss_clip": 0.01118165, "auxiliary_loss_mlp": 0.01032007, "balance_loss_clip": 1.01822615, "balance_loss_mlp": 1.04062974, "epoch": 0.47298962873891476, "flos": 23368079232000.0, "grad_norm": 1.8091174255659233, "language_loss": 0.83115065, "learning_rate": 2.1698153406167934e-06, "loss": 0.85265243, "num_input_tokens_seen": 169029840, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7734375, "step": 7867, "time_per_iteration": 2.549675703048706 }, { "auxiliary_loss_clip": 0.01122271, "auxiliary_loss_mlp": 0.01031165, "balance_loss_clip": 1.01768196, "balance_loss_mlp": 1.03961825, "epoch": 0.4730497519915827, "flos": 22052240305920.0, "grad_norm": 1.587391981317273, "language_loss": 0.79589981, "learning_rate": 2.1694388796444697e-06, "loss": 0.81743413, "num_input_tokens_seen": 169049975, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 7868, "time_per_iteration": 2.54970645904541 }, { "auxiliary_loss_clip": 0.01174232, "auxiliary_loss_mlp": 0.01034749, "balance_loss_clip": 1.02013433, "balance_loss_mlp": 1.04208064, "epoch": 0.4731098752442507, "flos": 21069724613760.0, "grad_norm": 2.2677455508153774, "language_loss": 0.7465421, "learning_rate": 2.1690624126253074e-06, "loss": 0.76863188, "num_input_tokens_seen": 169069540, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 7869, "time_per_iteration": 2.6683695316314697 }, { "auxiliary_loss_clip": 0.01136414, "auxiliary_loss_mlp": 0.01044021, "balance_loss_clip": 1.02906609, "balance_loss_mlp": 1.04240227, "epoch": 0.4731699984969187, "flos": 22528775064960.0, "grad_norm": 1.5636039989408614, "language_loss": 0.73695111, "learning_rate": 2.168685939572743e-06, "loss": 0.75875545, "num_input_tokens_seen": 169089940, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.765625, "step": 7870, "time_per_iteration": 4.013899803161621 }, { "auxiliary_loss_clip": 0.01121341, "auxiliary_loss_mlp": 0.01034405, "balance_loss_clip": 1.01968312, "balance_loss_mlp": 1.04126883, "epoch": 0.4732301217495867, "flos": 24898124914560.0, "grad_norm": 1.9202918038714998, "language_loss": 0.81193852, "learning_rate": 2.1683094605002107e-06, "loss": 0.83349597, "num_input_tokens_seen": 169109650, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80078125, "step": 7871, "time_per_iteration": 2.53395676612854 }, { "auxiliary_loss_clip": 0.0112589, "auxiliary_loss_mlp": 0.01035697, "balance_loss_clip": 1.02157032, "balance_loss_mlp": 1.04106319, "epoch": 0.47329024500225464, "flos": 22784423137920.0, "grad_norm": 1.5222373058572214, "language_loss": 0.75631523, "learning_rate": 2.1679329754211472e-06, "loss": 0.77793109, "num_input_tokens_seen": 169128990, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 7872, "time_per_iteration": 2.5772805213928223 }, { "auxiliary_loss_clip": 0.01140983, "auxiliary_loss_mlp": 0.01034165, "balance_loss_clip": 1.01978219, "balance_loss_mlp": 1.03885031, "epoch": 0.4733503682549226, "flos": 38695902220800.0, "grad_norm": 1.9232612214562725, "language_loss": 0.67625248, "learning_rate": 2.1675564843489872e-06, "loss": 0.69800401, "num_input_tokens_seen": 169154645, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7578125, "step": 7873, "time_per_iteration": 2.730616331100464 }, { "auxiliary_loss_clip": 0.0112399, "auxiliary_loss_mlp": 0.0103358, "balance_loss_clip": 1.02006757, "balance_loss_mlp": 1.04043472, "epoch": 0.47341049150759057, "flos": 22966849336320.0, "grad_norm": 1.5128967938130935, "language_loss": 0.72352195, "learning_rate": 2.167179987297168e-06, "loss": 0.7450977, "num_input_tokens_seen": 169174995, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 7874, "time_per_iteration": 2.673771381378174 }, { "auxiliary_loss_clip": 0.01137072, "auxiliary_loss_mlp": 0.01034258, "balance_loss_clip": 1.02093077, "balance_loss_mlp": 1.04220212, "epoch": 0.47347061476025853, "flos": 14538471661440.0, "grad_norm": 2.4697046786217216, "language_loss": 0.65286577, "learning_rate": 2.1668034842791246e-06, "loss": 0.67457902, "num_input_tokens_seen": 169191815, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.76953125, "step": 7875, "time_per_iteration": 4.054929733276367 }, { "auxiliary_loss_clip": 0.01154697, "auxiliary_loss_mlp": 0.0103681, "balance_loss_clip": 1.02226651, "balance_loss_mlp": 1.0422914, "epoch": 0.4735307380129265, "flos": 30263250827520.0, "grad_norm": 2.086902069822823, "language_loss": 0.80692106, "learning_rate": 2.166426975308294e-06, "loss": 0.8288362, "num_input_tokens_seen": 169210430, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.765625, "step": 7876, "time_per_iteration": 2.7114756107330322 }, { "auxiliary_loss_clip": 0.01133488, "auxiliary_loss_mlp": 0.01035639, "balance_loss_clip": 1.0222218, "balance_loss_mlp": 1.03920197, "epoch": 0.47359086126559446, "flos": 19391044452480.0, "grad_norm": 2.1849269223769703, "language_loss": 0.79133672, "learning_rate": 2.166050460398113e-06, "loss": 0.81302798, "num_input_tokens_seen": 169229295, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76171875, "step": 7877, "time_per_iteration": 4.065566062927246 }, { "auxiliary_loss_clip": 0.01124194, "auxiliary_loss_mlp": 0.0128134, "balance_loss_clip": 1.02077734, "balance_loss_mlp": 1.04064941, "epoch": 0.47365098451826243, "flos": 21939408708480.0, "grad_norm": 1.7991740866104504, "language_loss": 0.70579433, "learning_rate": 2.1656739395620173e-06, "loss": 0.7298497, "num_input_tokens_seen": 169247855, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 7878, "time_per_iteration": 2.538315773010254 }, { "auxiliary_loss_clip": 0.01140236, "auxiliary_loss_mlp": 0.01036339, "balance_loss_clip": 1.02305269, "balance_loss_mlp": 1.04074526, "epoch": 0.4737111077709304, "flos": 25845053207040.0, "grad_norm": 1.7072654867749173, "language_loss": 0.74924672, "learning_rate": 2.1652974128134457e-06, "loss": 0.77101243, "num_input_tokens_seen": 169268860, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 7879, "time_per_iteration": 2.68996524810791 }, { "auxiliary_loss_clip": 0.01055768, "auxiliary_loss_mlp": 0.00999175, "balance_loss_clip": 0.9974581, "balance_loss_mlp": 1.01421332, "epoch": 0.47377123102359836, "flos": 67760886314880.0, "grad_norm": 0.7281652788742731, "language_loss": 0.61297417, "learning_rate": 2.1649208801658344e-06, "loss": 0.63352358, "num_input_tokens_seen": 169331855, "router_z_loss_clip": 0.01721191, "router_z_loss_mlp": 0.24121094, "step": 7880, "time_per_iteration": 3.2546818256378174 }, { "auxiliary_loss_clip": 0.01131729, "auxiliary_loss_mlp": 0.01039326, "balance_loss_clip": 1.02471066, "balance_loss_mlp": 1.03995883, "epoch": 0.4738313542762663, "flos": 24315977191680.0, "grad_norm": 1.4968031329729246, "language_loss": 0.6800859, "learning_rate": 2.1645443416326214e-06, "loss": 0.70179641, "num_input_tokens_seen": 169352175, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7421875, "step": 7881, "time_per_iteration": 2.6372008323669434 }, { "auxiliary_loss_clip": 0.01168622, "auxiliary_loss_mlp": 0.01029462, "balance_loss_clip": 1.01689184, "balance_loss_mlp": 1.04168415, "epoch": 0.4738914775289343, "flos": 20705339093760.0, "grad_norm": 1.7450681742923912, "language_loss": 0.77236784, "learning_rate": 2.164167797227244e-06, "loss": 0.7943486, "num_input_tokens_seen": 169371215, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7421875, "step": 7882, "time_per_iteration": 2.669553279876709 }, { "auxiliary_loss_clip": 0.01125433, "auxiliary_loss_mlp": 0.01033639, "balance_loss_clip": 1.02005458, "balance_loss_mlp": 1.04127812, "epoch": 0.4739516007816023, "flos": 25446337263360.0, "grad_norm": 1.5571453065890517, "language_loss": 0.7588402, "learning_rate": 2.16379124696314e-06, "loss": 0.78043091, "num_input_tokens_seen": 169391745, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75390625, "step": 7883, "time_per_iteration": 2.560443162918091 }, { "auxiliary_loss_clip": 0.01132801, "auxiliary_loss_mlp": 0.01034779, "balance_loss_clip": 1.02209496, "balance_loss_mlp": 1.04080606, "epoch": 0.4740117240342703, "flos": 19974341410560.0, "grad_norm": 2.206161034456382, "language_loss": 0.71738577, "learning_rate": 2.1634146908537483e-06, "loss": 0.73906159, "num_input_tokens_seen": 169409845, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7421875, "step": 7884, "time_per_iteration": 2.606112480163574 }, { "auxiliary_loss_clip": 0.011311, "auxiliary_loss_mlp": 0.01035042, "balance_loss_clip": 1.02060556, "balance_loss_mlp": 1.04170799, "epoch": 0.47407184728693824, "flos": 15661146222720.0, "grad_norm": 2.046061335148567, "language_loss": 0.82181937, "learning_rate": 2.163038128912506e-06, "loss": 0.84348083, "num_input_tokens_seen": 169426085, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 7885, "time_per_iteration": 2.5607519149780273 }, { "auxiliary_loss_clip": 0.01136253, "auxiliary_loss_mlp": 0.01037231, "balance_loss_clip": 1.02296174, "balance_loss_mlp": 1.04344964, "epoch": 0.4741319705396062, "flos": 18588800142720.0, "grad_norm": 1.6866388666757983, "language_loss": 0.73575449, "learning_rate": 2.1626615611528525e-06, "loss": 0.75748932, "num_input_tokens_seen": 169444705, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.74609375, "step": 7886, "time_per_iteration": 2.634949207305908 }, { "auxiliary_loss_clip": 0.0113906, "auxiliary_loss_mlp": 0.01037339, "balance_loss_clip": 1.02190125, "balance_loss_mlp": 1.04123843, "epoch": 0.47419209379227417, "flos": 13261093223040.0, "grad_norm": 2.02288597609642, "language_loss": 0.74105036, "learning_rate": 2.1622849875882266e-06, "loss": 0.7628144, "num_input_tokens_seen": 169460850, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.80078125, "step": 7887, "time_per_iteration": 2.532456874847412 }, { "auxiliary_loss_clip": 0.01122022, "auxiliary_loss_mlp": 0.01027699, "balance_loss_clip": 1.01526535, "balance_loss_mlp": 1.04061413, "epoch": 0.47425221704494214, "flos": 20044043752320.0, "grad_norm": 1.9313790665141382, "language_loss": 0.768489, "learning_rate": 2.1619084082320663e-06, "loss": 0.78998625, "num_input_tokens_seen": 169478890, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7265625, "step": 7888, "time_per_iteration": 2.5362601280212402 }, { "auxiliary_loss_clip": 0.01127327, "auxiliary_loss_mlp": 0.01037413, "balance_loss_clip": 1.02390611, "balance_loss_mlp": 1.04299819, "epoch": 0.4743123402976101, "flos": 27271892136960.0, "grad_norm": 1.6654616584218482, "language_loss": 0.72444022, "learning_rate": 2.161531823097812e-06, "loss": 0.74608755, "num_input_tokens_seen": 169499690, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 7889, "time_per_iteration": 2.5911152362823486 }, { "auxiliary_loss_clip": 0.01040831, "auxiliary_loss_mlp": 0.01001492, "balance_loss_clip": 0.99962074, "balance_loss_mlp": 1.01695406, "epoch": 0.47437246355027807, "flos": 55393970261760.0, "grad_norm": 0.7172227922194375, "language_loss": 0.56088364, "learning_rate": 2.1611552321989015e-06, "loss": 0.58130687, "num_input_tokens_seen": 169560475, "router_z_loss_clip": 0.01867676, "router_z_loss_mlp": 0.23828125, "step": 7890, "time_per_iteration": 3.2120885848999023 }, { "auxiliary_loss_clip": 0.01128747, "auxiliary_loss_mlp": 0.01037593, "balance_loss_clip": 1.02225101, "balance_loss_mlp": 1.04292846, "epoch": 0.47443258680294603, "flos": 23878477537920.0, "grad_norm": 1.8654737206934866, "language_loss": 0.65568793, "learning_rate": 2.1607786355487764e-06, "loss": 0.6773513, "num_input_tokens_seen": 169580110, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.7734375, "step": 7891, "time_per_iteration": 2.5504987239837646 }, { "auxiliary_loss_clip": 0.01124257, "auxiliary_loss_mlp": 0.01033296, "balance_loss_clip": 1.01822793, "balance_loss_mlp": 1.04451632, "epoch": 0.474492710055614, "flos": 21977761455360.0, "grad_norm": 2.5221642516186917, "language_loss": 0.70302296, "learning_rate": 2.1604020331608746e-06, "loss": 0.72459853, "num_input_tokens_seen": 169597510, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 7892, "time_per_iteration": 2.581531047821045 }, { "auxiliary_loss_clip": 0.01057707, "auxiliary_loss_mlp": 0.01005973, "balance_loss_clip": 1.0041846, "balance_loss_mlp": 1.01593184, "epoch": 0.47455283330828196, "flos": 62557180122240.0, "grad_norm": 0.8112300588789917, "language_loss": 0.58589351, "learning_rate": 2.1600254250486373e-06, "loss": 0.60653031, "num_input_tokens_seen": 169660010, "router_z_loss_clip": 0.01782227, "router_z_loss_mlp": 0.23925781, "step": 7893, "time_per_iteration": 3.2353620529174805 }, { "auxiliary_loss_clip": 0.01128371, "auxiliary_loss_mlp": 0.01033661, "balance_loss_clip": 1.02025616, "balance_loss_mlp": 1.04479492, "epoch": 0.47461295656094993, "flos": 12093637380480.0, "grad_norm": 1.818551371632497, "language_loss": 0.77069062, "learning_rate": 2.1596488112255036e-06, "loss": 0.79231095, "num_input_tokens_seen": 169678485, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.74609375, "step": 7894, "time_per_iteration": 2.542036294937134 }, { "auxiliary_loss_clip": 0.01128745, "auxiliary_loss_mlp": 0.0103503, "balance_loss_clip": 1.02176785, "balance_loss_mlp": 1.04405737, "epoch": 0.4746730798136179, "flos": 20884568981760.0, "grad_norm": 1.7606290121401134, "language_loss": 0.74629903, "learning_rate": 2.159272191704915e-06, "loss": 0.76793671, "num_input_tokens_seen": 169697335, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 7895, "time_per_iteration": 2.64509654045105 }, { "auxiliary_loss_clip": 0.01124999, "auxiliary_loss_mlp": 0.01031251, "balance_loss_clip": 1.01714277, "balance_loss_mlp": 1.04228354, "epoch": 0.4747332030662859, "flos": 19974808287360.0, "grad_norm": 2.0080282319706626, "language_loss": 0.82566553, "learning_rate": 2.158895566500312e-06, "loss": 0.84722805, "num_input_tokens_seen": 169715395, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73828125, "step": 7896, "time_per_iteration": 2.5941734313964844 }, { "auxiliary_loss_clip": 0.01125744, "auxiliary_loss_mlp": 0.01032369, "balance_loss_clip": 1.01730084, "balance_loss_mlp": 1.04241252, "epoch": 0.4747933263189539, "flos": 16034186920320.0, "grad_norm": 3.0244058232070725, "language_loss": 0.75199836, "learning_rate": 2.158518935625134e-06, "loss": 0.77357948, "num_input_tokens_seen": 169733755, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7421875, "step": 7897, "time_per_iteration": 2.5856380462646484 }, { "auxiliary_loss_clip": 0.01139712, "auxiliary_loss_mlp": 0.01037159, "balance_loss_clip": 1.02275229, "balance_loss_mlp": 1.04186177, "epoch": 0.47485344957162184, "flos": 13955102876160.0, "grad_norm": 2.2279212658868106, "language_loss": 0.63392818, "learning_rate": 2.1581422990928233e-06, "loss": 0.65569687, "num_input_tokens_seen": 169751390, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80078125, "step": 7898, "time_per_iteration": 2.5596182346343994 }, { "auxiliary_loss_clip": 0.01145361, "auxiliary_loss_mlp": 0.01041801, "balance_loss_clip": 1.0270493, "balance_loss_mlp": 1.04188097, "epoch": 0.4749135728242898, "flos": 20449080489600.0, "grad_norm": 2.4175403727547278, "language_loss": 0.69453073, "learning_rate": 2.1577656569168215e-06, "loss": 0.71640235, "num_input_tokens_seen": 169769500, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.765625, "step": 7899, "time_per_iteration": 2.6226069927215576 }, { "auxiliary_loss_clip": 0.01130692, "auxiliary_loss_mlp": 0.0103822, "balance_loss_clip": 1.02396894, "balance_loss_mlp": 1.04427457, "epoch": 0.4749736960769578, "flos": 28949961767040.0, "grad_norm": 1.8990849903363052, "language_loss": 0.6789999, "learning_rate": 2.1573890091105684e-06, "loss": 0.70068896, "num_input_tokens_seen": 169789215, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 7900, "time_per_iteration": 2.5914835929870605 }, { "auxiliary_loss_clip": 0.01144345, "auxiliary_loss_mlp": 0.01037418, "balance_loss_clip": 1.02361917, "balance_loss_mlp": 1.04202819, "epoch": 0.47503381932962574, "flos": 31938770592000.0, "grad_norm": 1.9881716294484737, "language_loss": 0.70533842, "learning_rate": 2.157012355687507e-06, "loss": 0.72715604, "num_input_tokens_seen": 169808825, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75390625, "step": 7901, "time_per_iteration": 2.7009294033050537 }, { "auxiliary_loss_clip": 0.01136982, "auxiliary_loss_mlp": 0.0104364, "balance_loss_clip": 1.02935243, "balance_loss_mlp": 1.04361892, "epoch": 0.4750939425822937, "flos": 22127257860480.0, "grad_norm": 1.6326001448487433, "language_loss": 0.73709029, "learning_rate": 2.1566356966610776e-06, "loss": 0.75889647, "num_input_tokens_seen": 169827590, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7578125, "step": 7902, "time_per_iteration": 2.629324436187744 }, { "auxiliary_loss_clip": 0.01140945, "auxiliary_loss_mlp": 0.01032903, "balance_loss_clip": 1.02002263, "balance_loss_mlp": 1.04214931, "epoch": 0.47515406583496167, "flos": 20850094903680.0, "grad_norm": 2.248974929873935, "language_loss": 0.68735445, "learning_rate": 2.1562590320447234e-06, "loss": 0.70909297, "num_input_tokens_seen": 169844925, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 7903, "time_per_iteration": 2.6563730239868164 }, { "auxiliary_loss_clip": 0.01132396, "auxiliary_loss_mlp": 0.01031607, "balance_loss_clip": 1.0191679, "balance_loss_mlp": 1.04157877, "epoch": 0.47521418908762963, "flos": 17524802448000.0, "grad_norm": 1.4440190543112728, "language_loss": 0.71928537, "learning_rate": 2.155882361851887e-06, "loss": 0.74092543, "num_input_tokens_seen": 169862705, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7265625, "step": 7904, "time_per_iteration": 2.6181023120880127 }, { "auxiliary_loss_clip": 0.01159135, "auxiliary_loss_mlp": 0.01034185, "balance_loss_clip": 1.0215131, "balance_loss_mlp": 1.04189396, "epoch": 0.4752743123402976, "flos": 20559434048640.0, "grad_norm": 1.6852735777265195, "language_loss": 0.859474, "learning_rate": 2.1555056860960095e-06, "loss": 0.8814072, "num_input_tokens_seen": 169880155, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 7905, "time_per_iteration": 2.7555253505706787 }, { "auxiliary_loss_clip": 0.01122069, "auxiliary_loss_mlp": 0.01036287, "balance_loss_clip": 1.02347255, "balance_loss_mlp": 1.04093385, "epoch": 0.47533443559296557, "flos": 26360623071360.0, "grad_norm": 1.5581146207641277, "language_loss": 0.8196032, "learning_rate": 2.1551290047905343e-06, "loss": 0.84118682, "num_input_tokens_seen": 169901525, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 7906, "time_per_iteration": 2.5898277759552 }, { "auxiliary_loss_clip": 0.01064197, "auxiliary_loss_mlp": 0.01008053, "balance_loss_clip": 1.00613391, "balance_loss_mlp": 1.01398659, "epoch": 0.47539455884563353, "flos": 65949660967680.0, "grad_norm": 0.6651188631918096, "language_loss": 0.58941168, "learning_rate": 2.1547523179489033e-06, "loss": 0.61013418, "num_input_tokens_seen": 169970345, "router_z_loss_clip": 0.01916504, "router_z_loss_mlp": 0.23925781, "step": 7907, "time_per_iteration": 4.70278525352478 }, { "auxiliary_loss_clip": 0.01125795, "auxiliary_loss_mlp": 0.01035039, "balance_loss_clip": 1.02168167, "balance_loss_mlp": 1.04325485, "epoch": 0.4754546820983015, "flos": 17238128002560.0, "grad_norm": 1.7850166650447883, "language_loss": 0.80816436, "learning_rate": 2.154375625584561e-06, "loss": 0.82977277, "num_input_tokens_seen": 169986440, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 7908, "time_per_iteration": 2.6013877391815186 }, { "auxiliary_loss_clip": 0.01127912, "auxiliary_loss_mlp": 0.01035604, "balance_loss_clip": 1.02121556, "balance_loss_mlp": 1.04385996, "epoch": 0.47551480535096946, "flos": 19825886499840.0, "grad_norm": 2.5059478155345065, "language_loss": 0.7422576, "learning_rate": 2.1539989277109496e-06, "loss": 0.76389277, "num_input_tokens_seen": 170005705, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75, "step": 7909, "time_per_iteration": 2.539050817489624 }, { "auxiliary_loss_clip": 0.01133408, "auxiliary_loss_mlp": 0.01034859, "balance_loss_clip": 1.02139461, "balance_loss_mlp": 1.04205322, "epoch": 0.4755749286036375, "flos": 22163958581760.0, "grad_norm": 1.5682796107077654, "language_loss": 0.75125593, "learning_rate": 2.153622224341512e-06, "loss": 0.77293861, "num_input_tokens_seen": 170023415, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 7910, "time_per_iteration": 2.6136813163757324 }, { "auxiliary_loss_clip": 0.0112466, "auxiliary_loss_mlp": 0.01026922, "balance_loss_clip": 1.01465535, "balance_loss_mlp": 1.04157877, "epoch": 0.47563505185630545, "flos": 21648280976640.0, "grad_norm": 1.6056709457818732, "language_loss": 0.78683078, "learning_rate": 2.1532455154896926e-06, "loss": 0.80834651, "num_input_tokens_seen": 170042395, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.7421875, "step": 7911, "time_per_iteration": 2.536452054977417 }, { "auxiliary_loss_clip": 0.01136281, "auxiliary_loss_mlp": 0.01277933, "balance_loss_clip": 1.01701403, "balance_loss_mlp": 1.04064417, "epoch": 0.4756951751089734, "flos": 20628777254400.0, "grad_norm": 1.614192408595683, "language_loss": 0.75867724, "learning_rate": 2.1528688011689348e-06, "loss": 0.78281939, "num_input_tokens_seen": 170061610, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 7912, "time_per_iteration": 4.025468587875366 }, { "auxiliary_loss_clip": 0.0111355, "auxiliary_loss_mlp": 0.01037468, "balance_loss_clip": 1.02443838, "balance_loss_mlp": 1.03899455, "epoch": 0.4757552983616414, "flos": 25848788221440.0, "grad_norm": 2.2267010536812397, "language_loss": 0.74260902, "learning_rate": 2.1524920813926833e-06, "loss": 0.76411927, "num_input_tokens_seen": 170083505, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.74609375, "step": 7913, "time_per_iteration": 2.584075689315796 }, { "auxiliary_loss_clip": 0.01136374, "auxiliary_loss_mlp": 0.01029586, "balance_loss_clip": 1.01548922, "balance_loss_mlp": 1.04314852, "epoch": 0.47581542161430934, "flos": 18223013992320.0, "grad_norm": 1.903610939056047, "language_loss": 0.72074807, "learning_rate": 2.152115356174382e-06, "loss": 0.74240768, "num_input_tokens_seen": 170100690, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 7914, "time_per_iteration": 2.5231053829193115 }, { "auxiliary_loss_clip": 0.01136471, "auxiliary_loss_mlp": 0.01037144, "balance_loss_clip": 1.02391148, "balance_loss_mlp": 1.04256487, "epoch": 0.4758755448669773, "flos": 21579763783680.0, "grad_norm": 3.857506554270348, "language_loss": 0.64705217, "learning_rate": 2.151738625527474e-06, "loss": 0.66878831, "num_input_tokens_seen": 170119240, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7578125, "step": 7915, "time_per_iteration": 2.5946319103240967 }, { "auxiliary_loss_clip": 0.01113238, "auxiliary_loss_mlp": 0.01034818, "balance_loss_clip": 1.02101374, "balance_loss_mlp": 1.04057682, "epoch": 0.47593566811964527, "flos": 15231152511360.0, "grad_norm": 1.7635037222926588, "language_loss": 0.76881939, "learning_rate": 2.151361889465405e-06, "loss": 0.79029995, "num_input_tokens_seen": 170136450, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 7916, "time_per_iteration": 4.108005523681641 }, { "auxiliary_loss_clip": 0.01116626, "auxiliary_loss_mlp": 0.01030608, "balance_loss_clip": 1.01703584, "balance_loss_mlp": 1.04209781, "epoch": 0.47599579137231324, "flos": 21543242630400.0, "grad_norm": 2.128098123523798, "language_loss": 0.64061415, "learning_rate": 2.1509851480016197e-06, "loss": 0.66208649, "num_input_tokens_seen": 170155295, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.74609375, "step": 7917, "time_per_iteration": 2.5642025470733643 }, { "auxiliary_loss_clip": 0.01065505, "auxiliary_loss_mlp": 0.01005829, "balance_loss_clip": 1.00400531, "balance_loss_mlp": 1.01577854, "epoch": 0.4760559146249812, "flos": 64554602595840.0, "grad_norm": 0.837287761105677, "language_loss": 0.65651441, "learning_rate": 2.150608401149563e-06, "loss": 0.67722774, "num_input_tokens_seen": 170222325, "router_z_loss_clip": 0.01818848, "router_z_loss_mlp": 0.23828125, "step": 7918, "time_per_iteration": 3.2395637035369873 }, { "auxiliary_loss_clip": 0.01125776, "auxiliary_loss_mlp": 0.01280585, "balance_loss_clip": 1.02022409, "balance_loss_mlp": 1.04100919, "epoch": 0.47611603787764917, "flos": 22233876405120.0, "grad_norm": 1.8171821178185144, "language_loss": 0.69234169, "learning_rate": 2.1502316489226796e-06, "loss": 0.71640527, "num_input_tokens_seen": 170241625, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7578125, "step": 7919, "time_per_iteration": 4.0831170082092285 }, { "auxiliary_loss_clip": 0.01134497, "auxiliary_loss_mlp": 0.01032517, "balance_loss_clip": 1.01942158, "balance_loss_mlp": 1.04099274, "epoch": 0.47617616113031713, "flos": 22780005765120.0, "grad_norm": 2.2730021114632137, "language_loss": 0.74635929, "learning_rate": 2.149854891334415e-06, "loss": 0.76802945, "num_input_tokens_seen": 170262470, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7578125, "step": 7920, "time_per_iteration": 2.598811626434326 }, { "auxiliary_loss_clip": 0.01136534, "auxiliary_loss_mlp": 0.01035198, "balance_loss_clip": 1.02080345, "balance_loss_mlp": 1.04062736, "epoch": 0.4762362843829851, "flos": 24133802388480.0, "grad_norm": 1.5815493193464112, "language_loss": 0.7741909, "learning_rate": 2.149478128398215e-06, "loss": 0.79590821, "num_input_tokens_seen": 170283460, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 7921, "time_per_iteration": 2.5826022624969482 }, { "auxiliary_loss_clip": 0.01136237, "auxiliary_loss_mlp": 0.01041356, "balance_loss_clip": 1.02622259, "balance_loss_mlp": 1.04151988, "epoch": 0.47629640763565306, "flos": 22452069571200.0, "grad_norm": 1.6375163813441875, "language_loss": 0.78194606, "learning_rate": 2.1491013601275244e-06, "loss": 0.80372202, "num_input_tokens_seen": 170304225, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7734375, "step": 7922, "time_per_iteration": 2.6045942306518555 }, { "auxiliary_loss_clip": 0.01146936, "auxiliary_loss_mlp": 0.01041938, "balance_loss_clip": 1.02679849, "balance_loss_mlp": 1.04214919, "epoch": 0.4763565308883211, "flos": 11181398647680.0, "grad_norm": 1.866868413588463, "language_loss": 0.72650415, "learning_rate": 2.148724586535791e-06, "loss": 0.74839288, "num_input_tokens_seen": 170322110, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.78515625, "step": 7923, "time_per_iteration": 2.5264973640441895 }, { "auxiliary_loss_clip": 0.01143351, "auxiliary_loss_mlp": 0.01039393, "balance_loss_clip": 1.02579153, "balance_loss_mlp": 1.04076123, "epoch": 0.47641665414098905, "flos": 22382151747840.0, "grad_norm": 1.912120492463577, "language_loss": 0.81867039, "learning_rate": 2.1483478076364586e-06, "loss": 0.84049785, "num_input_tokens_seen": 170340700, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7578125, "step": 7924, "time_per_iteration": 2.6046314239501953 }, { "auxiliary_loss_clip": 0.01135711, "auxiliary_loss_mlp": 0.01035648, "balance_loss_clip": 1.02077627, "balance_loss_mlp": 1.04587233, "epoch": 0.476476777393657, "flos": 25046148862080.0, "grad_norm": 1.8907746933002425, "language_loss": 0.80442488, "learning_rate": 2.147971023442975e-06, "loss": 0.8261385, "num_input_tokens_seen": 170359780, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80859375, "step": 7925, "time_per_iteration": 2.5966107845306396 }, { "auxiliary_loss_clip": 0.01134186, "auxiliary_loss_mlp": 0.0103954, "balance_loss_clip": 1.02615261, "balance_loss_mlp": 1.04039788, "epoch": 0.476536900646325, "flos": 27269916888960.0, "grad_norm": 1.547821461736264, "language_loss": 0.72334278, "learning_rate": 2.147594233968787e-06, "loss": 0.74507999, "num_input_tokens_seen": 170381260, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 7926, "time_per_iteration": 2.6249570846557617 }, { "auxiliary_loss_clip": 0.01128973, "auxiliary_loss_mlp": 0.01033569, "balance_loss_clip": 1.01918077, "balance_loss_mlp": 1.04159033, "epoch": 0.47659702389899294, "flos": 25301401885440.0, "grad_norm": 2.2400621938393175, "language_loss": 0.67876828, "learning_rate": 2.147217439227339e-06, "loss": 0.70039368, "num_input_tokens_seen": 170400595, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 7927, "time_per_iteration": 2.6369051933288574 }, { "auxiliary_loss_clip": 0.01143466, "auxiliary_loss_mlp": 0.01030111, "balance_loss_clip": 1.01742768, "balance_loss_mlp": 1.04216206, "epoch": 0.4766571471516609, "flos": 25992861672960.0, "grad_norm": 1.4918301335017372, "language_loss": 0.67773163, "learning_rate": 2.1468406392320803e-06, "loss": 0.69946742, "num_input_tokens_seen": 170421110, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.74609375, "step": 7928, "time_per_iteration": 2.6185882091522217 }, { "auxiliary_loss_clip": 0.01116539, "auxiliary_loss_mlp": 0.01031664, "balance_loss_clip": 1.01776421, "balance_loss_mlp": 1.04057384, "epoch": 0.4767172704043289, "flos": 16032211672320.0, "grad_norm": 2.163694990360318, "language_loss": 0.78592908, "learning_rate": 2.1464638339964564e-06, "loss": 0.80741107, "num_input_tokens_seen": 170436700, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 7929, "time_per_iteration": 2.6369211673736572 }, { "auxiliary_loss_clip": 0.01134321, "auxiliary_loss_mlp": 0.01037702, "balance_loss_clip": 1.0247978, "balance_loss_mlp": 1.04203057, "epoch": 0.47677739365699684, "flos": 39235351651200.0, "grad_norm": 2.237425861092167, "language_loss": 0.66461098, "learning_rate": 2.1460870235339155e-06, "loss": 0.68633127, "num_input_tokens_seen": 170459555, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 7930, "time_per_iteration": 2.748769998550415 }, { "auxiliary_loss_clip": 0.01124853, "auxiliary_loss_mlp": 0.01031687, "balance_loss_clip": 1.01907444, "balance_loss_mlp": 1.04036951, "epoch": 0.4768375169096648, "flos": 24717781704960.0, "grad_norm": 1.789419679476017, "language_loss": 0.79536605, "learning_rate": 2.1457102078579045e-06, "loss": 0.81693143, "num_input_tokens_seen": 170479175, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.75390625, "step": 7931, "time_per_iteration": 2.6424152851104736 }, { "auxiliary_loss_clip": 0.01136749, "auxiliary_loss_mlp": 0.0103257, "balance_loss_clip": 1.01828885, "balance_loss_mlp": 1.04171991, "epoch": 0.47689764016233277, "flos": 22528667324160.0, "grad_norm": 1.796268364294113, "language_loss": 0.75838315, "learning_rate": 2.1453333869818702e-06, "loss": 0.78007632, "num_input_tokens_seen": 170498450, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76953125, "step": 7932, "time_per_iteration": 2.5364956855773926 }, { "auxiliary_loss_clip": 0.01132908, "auxiliary_loss_mlp": 0.012798, "balance_loss_clip": 1.01914823, "balance_loss_mlp": 1.04169559, "epoch": 0.47695776341500074, "flos": 15120619384320.0, "grad_norm": 1.7215875035286017, "language_loss": 0.79078305, "learning_rate": 2.1449565609192617e-06, "loss": 0.81491005, "num_input_tokens_seen": 170516255, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 7933, "time_per_iteration": 2.551246166229248 }, { "auxiliary_loss_clip": 0.01131726, "auxiliary_loss_mlp": 0.01040924, "balance_loss_clip": 1.02512908, "balance_loss_mlp": 1.0426271, "epoch": 0.4770178866676687, "flos": 14678917839360.0, "grad_norm": 1.8735581886593093, "language_loss": 0.73937523, "learning_rate": 2.144579729683526e-06, "loss": 0.76110172, "num_input_tokens_seen": 170532705, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8046875, "step": 7934, "time_per_iteration": 2.4979310035705566 }, { "auxiliary_loss_clip": 0.0112718, "auxiliary_loss_mlp": 0.01030836, "balance_loss_clip": 1.0173887, "balance_loss_mlp": 1.04014325, "epoch": 0.47707800992033667, "flos": 22565583527040.0, "grad_norm": 1.911137258401512, "language_loss": 0.79728997, "learning_rate": 2.1442028932881123e-06, "loss": 0.81887007, "num_input_tokens_seen": 170551925, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78125, "step": 7935, "time_per_iteration": 2.584167003631592 }, { "auxiliary_loss_clip": 0.01152078, "auxiliary_loss_mlp": 0.01042419, "balance_loss_clip": 1.027601, "balance_loss_mlp": 1.04306841, "epoch": 0.4771381331730047, "flos": 30918225375360.0, "grad_norm": 1.6445416411699068, "language_loss": 0.70926625, "learning_rate": 2.143826051746468e-06, "loss": 0.73121119, "num_input_tokens_seen": 170572320, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 7936, "time_per_iteration": 2.6735432147979736 }, { "auxiliary_loss_clip": 0.0111938, "auxiliary_loss_mlp": 0.01034063, "balance_loss_clip": 1.02035379, "balance_loss_mlp": 1.04068208, "epoch": 0.47719825642567265, "flos": 25738901539200.0, "grad_norm": 2.477203155048194, "language_loss": 0.68040812, "learning_rate": 2.143449205072042e-06, "loss": 0.7019425, "num_input_tokens_seen": 170589470, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7890625, "step": 7937, "time_per_iteration": 2.653877019882202 }, { "auxiliary_loss_clip": 0.01044777, "auxiliary_loss_mlp": 0.00999731, "balance_loss_clip": 0.99779969, "balance_loss_mlp": 1.01233768, "epoch": 0.4772583796783406, "flos": 66355128668160.0, "grad_norm": 0.7044464288829817, "language_loss": 0.56377906, "learning_rate": 2.1430723532782828e-06, "loss": 0.5842241, "num_input_tokens_seen": 170662265, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.23828125, "step": 7938, "time_per_iteration": 3.3355491161346436 }, { "auxiliary_loss_clip": 0.01135487, "auxiliary_loss_mlp": 0.01033057, "balance_loss_clip": 1.01885915, "balance_loss_mlp": 1.04162717, "epoch": 0.4773185029310086, "flos": 22051091070720.0, "grad_norm": 1.7570727915473052, "language_loss": 0.8910253, "learning_rate": 2.142695496378639e-06, "loss": 0.91271073, "num_input_tokens_seen": 170679680, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 7939, "time_per_iteration": 2.5588972568511963 }, { "auxiliary_loss_clip": 0.01126059, "auxiliary_loss_mlp": 0.01034694, "balance_loss_clip": 1.0217303, "balance_loss_mlp": 1.04040885, "epoch": 0.47737862618367655, "flos": 16727801523840.0, "grad_norm": 1.8056255312477814, "language_loss": 0.76636004, "learning_rate": 2.14231863438656e-06, "loss": 0.78796762, "num_input_tokens_seen": 170697340, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.765625, "step": 7940, "time_per_iteration": 2.580251693725586 }, { "auxiliary_loss_clip": 0.01132392, "auxiliary_loss_mlp": 0.01035914, "balance_loss_clip": 1.0229615, "balance_loss_mlp": 1.04142189, "epoch": 0.4774387494363445, "flos": 19609453100160.0, "grad_norm": 1.6090764131745303, "language_loss": 0.84769022, "learning_rate": 2.1419417673154954e-06, "loss": 0.86937332, "num_input_tokens_seen": 170714905, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 7941, "time_per_iteration": 2.5820248126983643 }, { "auxiliary_loss_clip": 0.011376, "auxiliary_loss_mlp": 0.01037284, "balance_loss_clip": 1.02303815, "balance_loss_mlp": 1.04222107, "epoch": 0.4774988726890125, "flos": 16653969118080.0, "grad_norm": 2.0411452973274304, "language_loss": 0.75719655, "learning_rate": 2.1415648951788944e-06, "loss": 0.77894533, "num_input_tokens_seen": 170731810, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 7942, "time_per_iteration": 2.5954158306121826 }, { "auxiliary_loss_clip": 0.01128858, "auxiliary_loss_mlp": 0.01036541, "balance_loss_clip": 1.02314234, "balance_loss_mlp": 1.04228449, "epoch": 0.47755899594168044, "flos": 20485565729280.0, "grad_norm": 1.6922740986984814, "language_loss": 0.64305562, "learning_rate": 2.1411880179902056e-06, "loss": 0.66470963, "num_input_tokens_seen": 170750270, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7734375, "step": 7943, "time_per_iteration": 2.607635736465454 }, { "auxiliary_loss_clip": 0.01147689, "auxiliary_loss_mlp": 0.01039324, "balance_loss_clip": 1.02424359, "balance_loss_mlp": 1.04112065, "epoch": 0.4776191191943484, "flos": 21652806090240.0, "grad_norm": 1.7687880777577238, "language_loss": 0.73654842, "learning_rate": 2.140811135762881e-06, "loss": 0.75841856, "num_input_tokens_seen": 170769015, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.79296875, "step": 7944, "time_per_iteration": 2.5993189811706543 }, { "auxiliary_loss_clip": 0.01128387, "auxiliary_loss_mlp": 0.01040447, "balance_loss_clip": 1.02536654, "balance_loss_mlp": 1.04233587, "epoch": 0.4776792424470164, "flos": 18770220760320.0, "grad_norm": 2.1995809940300792, "language_loss": 0.68128735, "learning_rate": 2.1404342485103683e-06, "loss": 0.70297569, "num_input_tokens_seen": 170785725, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.76953125, "step": 7945, "time_per_iteration": 2.5728015899658203 }, { "auxiliary_loss_clip": 0.01142291, "auxiliary_loss_mlp": 0.01281417, "balance_loss_clip": 1.0220232, "balance_loss_mlp": 1.03894091, "epoch": 0.47773936569968434, "flos": 29715828577920.0, "grad_norm": 1.9311987033838474, "language_loss": 0.75053245, "learning_rate": 2.1400573562461185e-06, "loss": 0.77476954, "num_input_tokens_seen": 170804600, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.765625, "step": 7946, "time_per_iteration": 2.7092535495758057 }, { "auxiliary_loss_clip": 0.01141021, "auxiliary_loss_mlp": 0.01038124, "balance_loss_clip": 1.02353334, "balance_loss_mlp": 1.04484153, "epoch": 0.4777994889523523, "flos": 24791542283520.0, "grad_norm": 1.6760013263821953, "language_loss": 0.79119706, "learning_rate": 2.139680458983582e-06, "loss": 0.81298852, "num_input_tokens_seen": 170824230, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78125, "step": 7947, "time_per_iteration": 2.631490707397461 }, { "auxiliary_loss_clip": 0.01132094, "auxiliary_loss_mlp": 0.01035208, "balance_loss_clip": 1.02175558, "balance_loss_mlp": 1.040084, "epoch": 0.47785961220502027, "flos": 17858161595520.0, "grad_norm": 7.840310360126525, "language_loss": 0.73638833, "learning_rate": 2.139303556736209e-06, "loss": 0.75806135, "num_input_tokens_seen": 170843365, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73828125, "step": 7948, "time_per_iteration": 4.023226976394653 }, { "auxiliary_loss_clip": 0.01136398, "auxiliary_loss_mlp": 0.01034202, "balance_loss_clip": 1.01982546, "balance_loss_mlp": 1.04160929, "epoch": 0.4779197354576883, "flos": 20266546550400.0, "grad_norm": 1.6303988626319241, "language_loss": 0.77864456, "learning_rate": 2.1389266495174507e-06, "loss": 0.80035061, "num_input_tokens_seen": 170863515, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.76953125, "step": 7949, "time_per_iteration": 2.5897579193115234 }, { "auxiliary_loss_clip": 0.01132591, "auxiliary_loss_mlp": 0.01030984, "balance_loss_clip": 1.01782906, "balance_loss_mlp": 1.04047823, "epoch": 0.47797985871035625, "flos": 17056599644160.0, "grad_norm": 2.0792843109845776, "language_loss": 0.73863924, "learning_rate": 2.1385497373407574e-06, "loss": 0.76027489, "num_input_tokens_seen": 170881245, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 7950, "time_per_iteration": 2.5381062030792236 }, { "auxiliary_loss_clip": 0.01133035, "auxiliary_loss_mlp": 0.01042385, "balance_loss_clip": 1.02760339, "balance_loss_mlp": 1.04055417, "epoch": 0.4780399819630242, "flos": 31358418549120.0, "grad_norm": 1.9318934282489582, "language_loss": 0.74481869, "learning_rate": 2.13817282021958e-06, "loss": 0.76657295, "num_input_tokens_seen": 170901285, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.74609375, "step": 7951, "time_per_iteration": 2.7359559535980225 }, { "auxiliary_loss_clip": 0.01143771, "auxiliary_loss_mlp": 0.01031193, "balance_loss_clip": 1.01713777, "balance_loss_mlp": 1.03934908, "epoch": 0.4781001052156922, "flos": 24899597372160.0, "grad_norm": 2.9548638891590144, "language_loss": 0.80235255, "learning_rate": 2.137795898167371e-06, "loss": 0.82410216, "num_input_tokens_seen": 170919740, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 7952, "time_per_iteration": 2.6996254920959473 }, { "auxiliary_loss_clip": 0.01136848, "auxiliary_loss_mlp": 0.01038798, "balance_loss_clip": 1.024791, "balance_loss_mlp": 1.04143751, "epoch": 0.47816022846836015, "flos": 18697717157760.0, "grad_norm": 1.694040843536238, "language_loss": 0.78222996, "learning_rate": 2.1374189711975806e-06, "loss": 0.80398643, "num_input_tokens_seen": 170938510, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 7953, "time_per_iteration": 3.9635770320892334 }, { "auxiliary_loss_clip": 0.01128789, "auxiliary_loss_mlp": 0.01034983, "balance_loss_clip": 1.0209167, "balance_loss_mlp": 1.04136705, "epoch": 0.4782203517210281, "flos": 11977573559040.0, "grad_norm": 2.258668465702571, "language_loss": 0.83953685, "learning_rate": 2.1370420393236604e-06, "loss": 0.86117458, "num_input_tokens_seen": 170951170, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78515625, "step": 7954, "time_per_iteration": 2.5443694591522217 }, { "auxiliary_loss_clip": 0.01134438, "auxiliary_loss_mlp": 0.01031903, "balance_loss_clip": 1.01837301, "balance_loss_mlp": 1.04067731, "epoch": 0.4782804749736961, "flos": 20813501923200.0, "grad_norm": 1.4403565378016876, "language_loss": 0.70654309, "learning_rate": 2.136665102559062e-06, "loss": 0.72820652, "num_input_tokens_seen": 170970990, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 7955, "time_per_iteration": 2.608445405960083 }, { "auxiliary_loss_clip": 0.01136932, "auxiliary_loss_mlp": 0.01037257, "balance_loss_clip": 1.02411389, "balance_loss_mlp": 1.04233444, "epoch": 0.47834059822636404, "flos": 23840304359040.0, "grad_norm": 1.6102798245810452, "language_loss": 0.82062101, "learning_rate": 2.136288160917238e-06, "loss": 0.84236294, "num_input_tokens_seen": 170991215, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.765625, "step": 7956, "time_per_iteration": 2.62115740776062 }, { "auxiliary_loss_clip": 0.01134713, "auxiliary_loss_mlp": 0.01034752, "balance_loss_clip": 1.01998806, "balance_loss_mlp": 1.04143143, "epoch": 0.478400721479032, "flos": 22633777497600.0, "grad_norm": 20.596023804877174, "language_loss": 0.84734678, "learning_rate": 2.13591121441164e-06, "loss": 0.86904144, "num_input_tokens_seen": 171007325, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.75390625, "step": 7957, "time_per_iteration": 2.7133240699768066 }, { "auxiliary_loss_clip": 0.01127254, "auxiliary_loss_mlp": 0.01038849, "balance_loss_clip": 1.02581382, "balance_loss_mlp": 1.0422945, "epoch": 0.4784608447317, "flos": 19354954262400.0, "grad_norm": 1.758286011474187, "language_loss": 0.79522943, "learning_rate": 2.135534263055721e-06, "loss": 0.81689048, "num_input_tokens_seen": 171025650, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.76171875, "step": 7958, "time_per_iteration": 4.140897512435913 }, { "auxiliary_loss_clip": 0.0114203, "auxiliary_loss_mlp": 0.0128079, "balance_loss_clip": 1.01975131, "balance_loss_mlp": 1.03978002, "epoch": 0.47852096798436794, "flos": 24021114445440.0, "grad_norm": 1.9797200386566776, "language_loss": 0.82721555, "learning_rate": 2.1351573068629324e-06, "loss": 0.85144377, "num_input_tokens_seen": 171045045, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75390625, "step": 7959, "time_per_iteration": 2.6781880855560303 }, { "auxiliary_loss_clip": 0.01141212, "auxiliary_loss_mlp": 0.01035874, "balance_loss_clip": 1.02386403, "balance_loss_mlp": 1.04211521, "epoch": 0.4785810912370359, "flos": 25666433850240.0, "grad_norm": 2.027342675587023, "language_loss": 0.72759759, "learning_rate": 2.1347803458467268e-06, "loss": 0.74936843, "num_input_tokens_seen": 171062910, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7265625, "step": 7960, "time_per_iteration": 4.070032596588135 }, { "auxiliary_loss_clip": 0.01141864, "auxiliary_loss_mlp": 0.01035842, "balance_loss_clip": 1.02277076, "balance_loss_mlp": 1.04131758, "epoch": 0.47864121448970387, "flos": 21432134885760.0, "grad_norm": 1.835477074309287, "language_loss": 0.76474154, "learning_rate": 2.1344033800205573e-06, "loss": 0.78651857, "num_input_tokens_seen": 171080875, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 7961, "time_per_iteration": 2.6578595638275146 }, { "auxiliary_loss_clip": 0.01141239, "auxiliary_loss_mlp": 0.0103079, "balance_loss_clip": 1.01714075, "balance_loss_mlp": 1.04019737, "epoch": 0.47870133774237184, "flos": 16143894034560.0, "grad_norm": 1.7707236982824468, "language_loss": 0.7796967, "learning_rate": 2.134026409397878e-06, "loss": 0.80141705, "num_input_tokens_seen": 171099190, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 7962, "time_per_iteration": 2.533857583999634 }, { "auxiliary_loss_clip": 0.01137643, "auxiliary_loss_mlp": 0.01034991, "balance_loss_clip": 1.02157426, "balance_loss_mlp": 1.04365766, "epoch": 0.47876146099503986, "flos": 26906788344960.0, "grad_norm": 1.6255098683934723, "language_loss": 0.64624441, "learning_rate": 2.13364943399214e-06, "loss": 0.66797078, "num_input_tokens_seen": 171119060, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76171875, "step": 7963, "time_per_iteration": 2.577407121658325 }, { "auxiliary_loss_clip": 0.01126704, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.01902747, "balance_loss_mlp": 1.04232597, "epoch": 0.4788215842477078, "flos": 45332085778560.0, "grad_norm": 2.3456829523012495, "language_loss": 0.77772301, "learning_rate": 2.133272453816797e-06, "loss": 0.7993167, "num_input_tokens_seen": 171141900, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75390625, "step": 7964, "time_per_iteration": 2.683924436569214 }, { "auxiliary_loss_clip": 0.01128627, "auxiliary_loss_mlp": 0.01034285, "balance_loss_clip": 1.01891875, "balance_loss_mlp": 1.04110157, "epoch": 0.4788817075003758, "flos": 22237180456320.0, "grad_norm": 1.6668462522382004, "language_loss": 0.76352942, "learning_rate": 2.1328954688853036e-06, "loss": 0.78515863, "num_input_tokens_seen": 171161045, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.78515625, "step": 7965, "time_per_iteration": 2.6165175437927246 }, { "auxiliary_loss_clip": 0.01153469, "auxiliary_loss_mlp": 0.01036142, "balance_loss_clip": 1.02323151, "balance_loss_mlp": 1.04162502, "epoch": 0.47894183075304375, "flos": 16471183783680.0, "grad_norm": 1.6158917749108783, "language_loss": 0.74211073, "learning_rate": 2.1325184792111125e-06, "loss": 0.76400685, "num_input_tokens_seen": 171179675, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7578125, "step": 7966, "time_per_iteration": 2.5022788047790527 }, { "auxiliary_loss_clip": 0.01135773, "auxiliary_loss_mlp": 0.01033909, "balance_loss_clip": 1.02020025, "balance_loss_mlp": 1.04218972, "epoch": 0.4790019540057117, "flos": 24282688262400.0, "grad_norm": 1.53663658779881, "language_loss": 0.72758776, "learning_rate": 2.132141484807678e-06, "loss": 0.74928457, "num_input_tokens_seen": 171201175, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75390625, "step": 7967, "time_per_iteration": 2.5871856212615967 }, { "auxiliary_loss_clip": 0.01129319, "auxiliary_loss_mlp": 0.01030907, "balance_loss_clip": 1.01809752, "balance_loss_mlp": 1.03982687, "epoch": 0.4790620772583797, "flos": 25666469763840.0, "grad_norm": 1.9763677791531349, "language_loss": 0.79116833, "learning_rate": 2.131764485688454e-06, "loss": 0.81277055, "num_input_tokens_seen": 171221750, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 7968, "time_per_iteration": 2.5361430644989014 }, { "auxiliary_loss_clip": 0.0106715, "auxiliary_loss_mlp": 0.0101266, "balance_loss_clip": 1.01072931, "balance_loss_mlp": 1.01651418, "epoch": 0.47912220051104765, "flos": 69428077102080.0, "grad_norm": 0.7803960035579646, "language_loss": 0.62290156, "learning_rate": 2.131387481866894e-06, "loss": 0.64369965, "num_input_tokens_seen": 171292235, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.23828125, "step": 7969, "time_per_iteration": 3.3089301586151123 }, { "auxiliary_loss_clip": 0.0114186, "auxiliary_loss_mlp": 0.01032495, "balance_loss_clip": 1.01987088, "balance_loss_mlp": 1.04306543, "epoch": 0.4791823237637156, "flos": 24168922911360.0, "grad_norm": 1.4565114613227064, "language_loss": 0.77107406, "learning_rate": 2.131010473356453e-06, "loss": 0.79281759, "num_input_tokens_seen": 171312215, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73046875, "step": 7970, "time_per_iteration": 2.5554075241088867 }, { "auxiliary_loss_clip": 0.01125103, "auxiliary_loss_mlp": 0.0103442, "balance_loss_clip": 1.02064574, "balance_loss_mlp": 1.04015231, "epoch": 0.4792424470163836, "flos": 24751465683840.0, "grad_norm": 1.7236252581302935, "language_loss": 0.70180941, "learning_rate": 2.130633460170585e-06, "loss": 0.72340465, "num_input_tokens_seen": 171332975, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7578125, "step": 7971, "time_per_iteration": 2.596383571624756 }, { "auxiliary_loss_clip": 0.01152861, "auxiliary_loss_mlp": 0.01032008, "balance_loss_clip": 1.0177207, "balance_loss_mlp": 1.04107523, "epoch": 0.47930257026905154, "flos": 23257905240960.0, "grad_norm": 1.3217871917548678, "language_loss": 0.79651183, "learning_rate": 2.1302564423227453e-06, "loss": 0.81836057, "num_input_tokens_seen": 171353880, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75390625, "step": 7972, "time_per_iteration": 2.6066505908966064 }, { "auxiliary_loss_clip": 0.01127072, "auxiliary_loss_mlp": 0.01029742, "balance_loss_clip": 1.01566935, "balance_loss_mlp": 1.04127097, "epoch": 0.4793626935217195, "flos": 14064091718400.0, "grad_norm": 2.1447866262413102, "language_loss": 0.70084321, "learning_rate": 2.129879419826387e-06, "loss": 0.72241127, "num_input_tokens_seen": 171370930, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76953125, "step": 7973, "time_per_iteration": 2.5079667568206787 }, { "auxiliary_loss_clip": 0.01125951, "auxiliary_loss_mlp": 0.01035038, "balance_loss_clip": 1.02218735, "balance_loss_mlp": 1.04427445, "epoch": 0.4794228167743875, "flos": 21798854789760.0, "grad_norm": 1.8163933642390835, "language_loss": 0.78718901, "learning_rate": 2.129502392694968e-06, "loss": 0.80879891, "num_input_tokens_seen": 171387575, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 7974, "time_per_iteration": 2.5860328674316406 }, { "auxiliary_loss_clip": 0.0105744, "auxiliary_loss_mlp": 0.01006026, "balance_loss_clip": 1.00405931, "balance_loss_mlp": 1.01580071, "epoch": 0.47948294002705544, "flos": 66968805553920.0, "grad_norm": 0.758291099457459, "language_loss": 0.54066849, "learning_rate": 2.1291253609419415e-06, "loss": 0.56130314, "num_input_tokens_seen": 171449980, "router_z_loss_clip": 0.01965332, "router_z_loss_mlp": 0.23828125, "step": 7975, "time_per_iteration": 3.1694719791412354 }, { "auxiliary_loss_clip": 0.01139821, "auxiliary_loss_mlp": 0.01036826, "balance_loss_clip": 1.02209783, "balance_loss_mlp": 1.04213715, "epoch": 0.47954306327972346, "flos": 12422471414400.0, "grad_norm": 2.8401720952034246, "language_loss": 0.89688152, "learning_rate": 2.1287483245807622e-06, "loss": 0.918648, "num_input_tokens_seen": 171465290, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 7976, "time_per_iteration": 2.5670461654663086 }, { "auxiliary_loss_clip": 0.01123136, "auxiliary_loss_mlp": 0.01043309, "balance_loss_clip": 1.02769232, "balance_loss_mlp": 1.0441432, "epoch": 0.4796031865323914, "flos": 18361951799040.0, "grad_norm": 2.351722707316145, "language_loss": 0.73582542, "learning_rate": 2.1283712836248866e-06, "loss": 0.75748986, "num_input_tokens_seen": 171481130, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.7890625, "step": 7977, "time_per_iteration": 2.6178486347198486 }, { "auxiliary_loss_clip": 0.01123585, "auxiliary_loss_mlp": 0.01282555, "balance_loss_clip": 1.02240062, "balance_loss_mlp": 1.04089117, "epoch": 0.4796633097850594, "flos": 21835088634240.0, "grad_norm": 2.2181826648279053, "language_loss": 0.78493237, "learning_rate": 2.1279942380877694e-06, "loss": 0.80899382, "num_input_tokens_seen": 171501140, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 7978, "time_per_iteration": 2.60477352142334 }, { "auxiliary_loss_clip": 0.01134551, "auxiliary_loss_mlp": 0.0103383, "balance_loss_clip": 1.01954842, "balance_loss_mlp": 1.04029536, "epoch": 0.47972343303772735, "flos": 23437350610560.0, "grad_norm": 1.7134087590677198, "language_loss": 0.89306897, "learning_rate": 2.127617187982868e-06, "loss": 0.91475278, "num_input_tokens_seen": 171519835, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 7979, "time_per_iteration": 2.5689005851745605 }, { "auxiliary_loss_clip": 0.01129394, "auxiliary_loss_mlp": 0.01037899, "balance_loss_clip": 1.02266455, "balance_loss_mlp": 1.0431031, "epoch": 0.4797835562903953, "flos": 24899776940160.0, "grad_norm": 1.8608615977545047, "language_loss": 0.76701725, "learning_rate": 2.1272401333236377e-06, "loss": 0.78869021, "num_input_tokens_seen": 171540980, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7734375, "step": 7980, "time_per_iteration": 2.604463577270508 }, { "auxiliary_loss_clip": 0.01148944, "auxiliary_loss_mlp": 0.01036322, "balance_loss_clip": 1.02039528, "balance_loss_mlp": 1.04256785, "epoch": 0.4798436795430633, "flos": 35042996793600.0, "grad_norm": 1.966617530909058, "language_loss": 0.71077579, "learning_rate": 2.1268630741235334e-06, "loss": 0.73262846, "num_input_tokens_seen": 171563600, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.79296875, "step": 7981, "time_per_iteration": 2.7215254306793213 }, { "auxiliary_loss_clip": 0.01117008, "auxiliary_loss_mlp": 0.01029711, "balance_loss_clip": 1.01634228, "balance_loss_mlp": 1.04327273, "epoch": 0.47990380279573125, "flos": 20590209025920.0, "grad_norm": 1.990076341903512, "language_loss": 0.70184392, "learning_rate": 2.126486010396013e-06, "loss": 0.72331107, "num_input_tokens_seen": 171580700, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73828125, "step": 7982, "time_per_iteration": 2.583897352218628 }, { "auxiliary_loss_clip": 0.01129469, "auxiliary_loss_mlp": 0.01029762, "balance_loss_clip": 1.01742339, "balance_loss_mlp": 1.04138982, "epoch": 0.4799639260483992, "flos": 26359402008960.0, "grad_norm": 1.8032250689338232, "language_loss": 0.70876771, "learning_rate": 2.126108942154532e-06, "loss": 0.73035997, "num_input_tokens_seen": 171602035, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 7983, "time_per_iteration": 2.6211369037628174 }, { "auxiliary_loss_clip": 0.01048666, "auxiliary_loss_mlp": 0.01248639, "balance_loss_clip": 1.00046813, "balance_loss_mlp": 1.01583385, "epoch": 0.4800240493010672, "flos": 70979021521920.0, "grad_norm": 0.8466816095700482, "language_loss": 0.59420037, "learning_rate": 2.125731869412547e-06, "loss": 0.61717343, "num_input_tokens_seen": 171659215, "router_z_loss_clip": 0.02062988, "router_z_loss_mlp": 0.23925781, "step": 7984, "time_per_iteration": 3.171348810195923 }, { "auxiliary_loss_clip": 0.01152542, "auxiliary_loss_mlp": 0.01035979, "balance_loss_clip": 1.02252626, "balance_loss_mlp": 1.04122734, "epoch": 0.48008417255373514, "flos": 17086656349440.0, "grad_norm": 1.5981019493812543, "language_loss": 0.66718048, "learning_rate": 2.125354792183516e-06, "loss": 0.68906569, "num_input_tokens_seen": 171675710, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 7985, "time_per_iteration": 2.654759645462036 }, { "auxiliary_loss_clip": 0.0113602, "auxiliary_loss_mlp": 0.01040302, "balance_loss_clip": 1.02592504, "balance_loss_mlp": 1.0400362, "epoch": 0.4801442958064031, "flos": 15413435055360.0, "grad_norm": 1.9401216289312895, "language_loss": 0.70305425, "learning_rate": 2.124977710480894e-06, "loss": 0.72481751, "num_input_tokens_seen": 171692510, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78125, "step": 7986, "time_per_iteration": 2.520073652267456 }, { "auxiliary_loss_clip": 0.01150515, "auxiliary_loss_mlp": 0.01041447, "balance_loss_clip": 1.02681398, "balance_loss_mlp": 1.04405594, "epoch": 0.4802044190590711, "flos": 11473747441920.0, "grad_norm": 1.935644263990819, "language_loss": 0.78916115, "learning_rate": 2.124600624318139e-06, "loss": 0.81108081, "num_input_tokens_seen": 171710235, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 7987, "time_per_iteration": 2.529616117477417 }, { "auxiliary_loss_clip": 0.01152191, "auxiliary_loss_mlp": 0.01038023, "balance_loss_clip": 1.02446318, "balance_loss_mlp": 1.04132855, "epoch": 0.48026454231173904, "flos": 20951003185920.0, "grad_norm": 2.401228861408657, "language_loss": 0.75224912, "learning_rate": 2.124223533708708e-06, "loss": 0.77415121, "num_input_tokens_seen": 171726715, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75390625, "step": 7988, "time_per_iteration": 2.580359935760498 }, { "auxiliary_loss_clip": 0.01148403, "auxiliary_loss_mlp": 0.0103335, "balance_loss_clip": 1.01924157, "balance_loss_mlp": 1.0460732, "epoch": 0.48032466556440706, "flos": 20448110822400.0, "grad_norm": 2.167475695298326, "language_loss": 0.79386544, "learning_rate": 2.1238464386660597e-06, "loss": 0.81568301, "num_input_tokens_seen": 171743605, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 7989, "time_per_iteration": 2.5735175609588623 }, { "auxiliary_loss_clip": 0.01137255, "auxiliary_loss_mlp": 0.01038469, "balance_loss_clip": 1.02399111, "balance_loss_mlp": 1.04091978, "epoch": 0.480384788817075, "flos": 37120823861760.0, "grad_norm": 1.713999485084622, "language_loss": 0.73292577, "learning_rate": 2.12346933920365e-06, "loss": 0.75468302, "num_input_tokens_seen": 171765445, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 7990, "time_per_iteration": 4.107341289520264 }, { "auxiliary_loss_clip": 0.01047641, "auxiliary_loss_mlp": 0.01249354, "balance_loss_clip": 1.00123441, "balance_loss_mlp": 1.01512241, "epoch": 0.480444912069743, "flos": 69552577641600.0, "grad_norm": 0.7655430970616597, "language_loss": 0.5904963, "learning_rate": 2.123092235334937e-06, "loss": 0.61346626, "num_input_tokens_seen": 171830115, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.23632812, "step": 7991, "time_per_iteration": 3.2467238903045654 }, { "auxiliary_loss_clip": 0.01124217, "auxiliary_loss_mlp": 0.01278956, "balance_loss_clip": 1.01856542, "balance_loss_mlp": 1.04050708, "epoch": 0.48050503532241096, "flos": 29822231640960.0, "grad_norm": 1.8938217138697957, "language_loss": 0.67014194, "learning_rate": 2.1227151270733793e-06, "loss": 0.69417363, "num_input_tokens_seen": 171849135, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 7992, "time_per_iteration": 2.612028121948242 }, { "auxiliary_loss_clip": 0.01134157, "auxiliary_loss_mlp": 0.01038533, "balance_loss_clip": 1.02463293, "balance_loss_mlp": 1.0393641, "epoch": 0.4805651585750789, "flos": 23948539015680.0, "grad_norm": 1.5806806657063375, "language_loss": 0.76139617, "learning_rate": 2.1223380144324332e-06, "loss": 0.78312302, "num_input_tokens_seen": 171868880, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76953125, "step": 7993, "time_per_iteration": 2.540782928466797 }, { "auxiliary_loss_clip": 0.01150921, "auxiliary_loss_mlp": 0.01037595, "balance_loss_clip": 1.02441061, "balance_loss_mlp": 1.04236925, "epoch": 0.4806252818277469, "flos": 25665428269440.0, "grad_norm": 2.702516576976614, "language_loss": 0.7829206, "learning_rate": 2.121960897425559e-06, "loss": 0.80480576, "num_input_tokens_seen": 171889455, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73046875, "step": 7994, "time_per_iteration": 2.6533660888671875 }, { "auxiliary_loss_clip": 0.01135397, "auxiliary_loss_mlp": 0.01030769, "balance_loss_clip": 1.01756644, "balance_loss_mlp": 1.04186058, "epoch": 0.48068540508041485, "flos": 13151996640000.0, "grad_norm": 1.836236683317544, "language_loss": 0.79917014, "learning_rate": 2.1215837760662136e-06, "loss": 0.82083178, "num_input_tokens_seen": 171906070, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7578125, "step": 7995, "time_per_iteration": 3.9117588996887207 }, { "auxiliary_loss_clip": 0.01150113, "auxiliary_loss_mlp": 0.01035209, "balance_loss_clip": 1.0216434, "balance_loss_mlp": 1.03963709, "epoch": 0.4807455283330828, "flos": 21176738208000.0, "grad_norm": 1.5811012070046713, "language_loss": 0.82977843, "learning_rate": 2.1212066503678566e-06, "loss": 0.85163164, "num_input_tokens_seen": 171926515, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 7996, "time_per_iteration": 2.65203857421875 }, { "auxiliary_loss_clip": 0.01159722, "auxiliary_loss_mlp": 0.0103138, "balance_loss_clip": 1.01826727, "balance_loss_mlp": 1.04169035, "epoch": 0.4808056515857508, "flos": 12275991751680.0, "grad_norm": 1.6815065549851937, "language_loss": 0.80898869, "learning_rate": 2.1208295203439462e-06, "loss": 0.83089972, "num_input_tokens_seen": 171943845, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 7997, "time_per_iteration": 2.5547878742218018 }, { "auxiliary_loss_clip": 0.01149953, "auxiliary_loss_mlp": 0.01037593, "balance_loss_clip": 1.02399755, "balance_loss_mlp": 1.03905666, "epoch": 0.48086577483841875, "flos": 24826052275200.0, "grad_norm": 2.7135008254411184, "language_loss": 0.72560018, "learning_rate": 2.12045238600794e-06, "loss": 0.74747562, "num_input_tokens_seen": 171964970, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75390625, "step": 7998, "time_per_iteration": 2.672001600265503 }, { "auxiliary_loss_clip": 0.01117291, "auxiliary_loss_mlp": 0.01035337, "balance_loss_clip": 1.02175307, "balance_loss_mlp": 1.04183209, "epoch": 0.4809258980910867, "flos": 24465365856000.0, "grad_norm": 1.790856681601899, "language_loss": 0.7059027, "learning_rate": 2.1200752473732984e-06, "loss": 0.72742897, "num_input_tokens_seen": 171986340, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75390625, "step": 7999, "time_per_iteration": 4.228322267532349 }, { "auxiliary_loss_clip": 0.01128454, "auxiliary_loss_mlp": 0.01040652, "balance_loss_clip": 1.02719331, "balance_loss_mlp": 1.04165471, "epoch": 0.4809860213437547, "flos": 21215952881280.0, "grad_norm": 1.5530959191124711, "language_loss": 0.71207541, "learning_rate": 2.11969810445348e-06, "loss": 0.73376644, "num_input_tokens_seen": 172007300, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.77734375, "step": 8000, "time_per_iteration": 2.572450876235962 }, { "auxiliary_loss_clip": 0.01128366, "auxiliary_loss_mlp": 0.01039736, "balance_loss_clip": 1.02567506, "balance_loss_mlp": 1.04384184, "epoch": 0.48104614459642264, "flos": 37632084094080.0, "grad_norm": 1.2896194245395385, "language_loss": 0.74610913, "learning_rate": 2.119320957261945e-06, "loss": 0.76779008, "num_input_tokens_seen": 172029585, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 8001, "time_per_iteration": 2.726900815963745 }, { "auxiliary_loss_clip": 0.01127246, "auxiliary_loss_mlp": 0.01038839, "balance_loss_clip": 1.02494526, "balance_loss_mlp": 1.04162014, "epoch": 0.48110626784909066, "flos": 18406122549120.0, "grad_norm": 1.7087129003646642, "language_loss": 0.81400746, "learning_rate": 2.118943805812151e-06, "loss": 0.83566833, "num_input_tokens_seen": 172047495, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 8002, "time_per_iteration": 4.774209022521973 }, { "auxiliary_loss_clip": 0.0113002, "auxiliary_loss_mlp": 0.01038352, "balance_loss_clip": 1.02345681, "balance_loss_mlp": 1.04374444, "epoch": 0.48116639110175863, "flos": 28439814856320.0, "grad_norm": 1.8292856626937748, "language_loss": 0.71486604, "learning_rate": 2.1185666501175587e-06, "loss": 0.73654974, "num_input_tokens_seen": 172067625, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7734375, "step": 8003, "time_per_iteration": 2.6554605960845947 }, { "auxiliary_loss_clip": 0.01125397, "auxiliary_loss_mlp": 0.01035498, "balance_loss_clip": 1.02311206, "balance_loss_mlp": 1.04342914, "epoch": 0.4812265143544266, "flos": 21725237865600.0, "grad_norm": 1.585993388755276, "language_loss": 0.82024288, "learning_rate": 2.1181894901916286e-06, "loss": 0.84185183, "num_input_tokens_seen": 172087885, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7265625, "step": 8004, "time_per_iteration": 2.641169786453247 }, { "auxiliary_loss_clip": 0.01144077, "auxiliary_loss_mlp": 0.01041842, "balance_loss_clip": 1.02646351, "balance_loss_mlp": 1.04304409, "epoch": 0.48128663760709456, "flos": 13224679810560.0, "grad_norm": 1.9195151421697767, "language_loss": 0.77707982, "learning_rate": 2.1178123260478183e-06, "loss": 0.79893899, "num_input_tokens_seen": 172105815, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83203125, "step": 8005, "time_per_iteration": 2.5771677494049072 }, { "auxiliary_loss_clip": 0.01126898, "auxiliary_loss_mlp": 0.01035986, "balance_loss_clip": 1.02283168, "balance_loss_mlp": 1.04043734, "epoch": 0.4813467608597625, "flos": 24243437675520.0, "grad_norm": 1.7327600943219696, "language_loss": 0.71011859, "learning_rate": 2.1174351576995897e-06, "loss": 0.73174739, "num_input_tokens_seen": 172126125, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7734375, "step": 8006, "time_per_iteration": 2.651871919631958 }, { "auxiliary_loss_clip": 0.01136398, "auxiliary_loss_mlp": 0.01040395, "balance_loss_clip": 1.02660251, "balance_loss_mlp": 1.04220867, "epoch": 0.4814068841124305, "flos": 27480424544640.0, "grad_norm": 1.8212309360451138, "language_loss": 0.70744658, "learning_rate": 2.117057985160403e-06, "loss": 0.72921443, "num_input_tokens_seen": 172141945, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.76171875, "step": 8007, "time_per_iteration": 2.6720774173736572 }, { "auxiliary_loss_clip": 0.01117586, "auxiliary_loss_mlp": 0.01034418, "balance_loss_clip": 1.02091169, "balance_loss_mlp": 1.04041004, "epoch": 0.48146700736509845, "flos": 19572896033280.0, "grad_norm": 5.2545696324292654, "language_loss": 0.71387553, "learning_rate": 2.1166808084437168e-06, "loss": 0.73539555, "num_input_tokens_seen": 172161095, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7734375, "step": 8008, "time_per_iteration": 2.5274264812469482 }, { "auxiliary_loss_clip": 0.01148158, "auxiliary_loss_mlp": 0.01047364, "balance_loss_clip": 1.03209329, "balance_loss_mlp": 1.04279494, "epoch": 0.4815271306177664, "flos": 20627771673600.0, "grad_norm": 1.7961887548633555, "language_loss": 0.60534447, "learning_rate": 2.1163036275629933e-06, "loss": 0.62729967, "num_input_tokens_seen": 172178750, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78515625, "step": 8009, "time_per_iteration": 2.5892372131347656 }, { "auxiliary_loss_clip": 0.01055158, "auxiliary_loss_mlp": 0.01009479, "balance_loss_clip": 1.00746429, "balance_loss_mlp": 1.01358807, "epoch": 0.4815872538704344, "flos": 67691076232320.0, "grad_norm": 0.8557594800544199, "language_loss": 0.6131801, "learning_rate": 2.1159264425316922e-06, "loss": 0.63382643, "num_input_tokens_seen": 172240235, "router_z_loss_clip": 0.0201416, "router_z_loss_mlp": 0.23632812, "step": 8010, "time_per_iteration": 3.1659252643585205 }, { "auxiliary_loss_clip": 0.0112848, "auxiliary_loss_mlp": 0.010459, "balance_loss_clip": 1.03121352, "balance_loss_mlp": 1.04300141, "epoch": 0.48164737712310235, "flos": 22820764723200.0, "grad_norm": 2.9375746180233975, "language_loss": 0.73514938, "learning_rate": 2.115549253363275e-06, "loss": 0.7568931, "num_input_tokens_seen": 172259875, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 8011, "time_per_iteration": 2.590236186981201 }, { "auxiliary_loss_clip": 0.01129003, "auxiliary_loss_mlp": 0.01034993, "balance_loss_clip": 1.02260733, "balance_loss_mlp": 1.03896725, "epoch": 0.4817075003757703, "flos": 23733865382400.0, "grad_norm": 1.7415128392479748, "language_loss": 0.79442775, "learning_rate": 2.115172060071201e-06, "loss": 0.81606776, "num_input_tokens_seen": 172280150, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.72265625, "step": 8012, "time_per_iteration": 2.5532000064849854 }, { "auxiliary_loss_clip": 0.01144594, "auxiliary_loss_mlp": 0.01047558, "balance_loss_clip": 1.03293085, "balance_loss_mlp": 1.04166412, "epoch": 0.4817676236284383, "flos": 28182909807360.0, "grad_norm": 2.745759740058338, "language_loss": 0.73382139, "learning_rate": 2.114794862668934e-06, "loss": 0.75574291, "num_input_tokens_seen": 172300810, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.76171875, "step": 8013, "time_per_iteration": 2.6819372177124023 }, { "auxiliary_loss_clip": 0.01132505, "auxiliary_loss_mlp": 0.01032666, "balance_loss_clip": 1.01969016, "balance_loss_mlp": 1.04046059, "epoch": 0.48182774688110624, "flos": 17091756080640.0, "grad_norm": 2.423954879951841, "language_loss": 0.90771604, "learning_rate": 2.114417661169933e-06, "loss": 0.92936778, "num_input_tokens_seen": 172317930, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 8014, "time_per_iteration": 2.4812216758728027 }, { "auxiliary_loss_clip": 0.01121515, "auxiliary_loss_mlp": 0.01039344, "balance_loss_clip": 1.02453256, "balance_loss_mlp": 1.04165447, "epoch": 0.4818878701337742, "flos": 12567873669120.0, "grad_norm": 2.3923394877407214, "language_loss": 0.74664998, "learning_rate": 2.1140404555876595e-06, "loss": 0.76825857, "num_input_tokens_seen": 172336340, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 8015, "time_per_iteration": 2.5450315475463867 }, { "auxiliary_loss_clip": 0.01137042, "auxiliary_loss_mlp": 0.01038291, "balance_loss_clip": 1.02376568, "balance_loss_mlp": 1.03892946, "epoch": 0.48194799338644223, "flos": 24608505553920.0, "grad_norm": 2.014942975340795, "language_loss": 0.80240196, "learning_rate": 2.113663245935576e-06, "loss": 0.82415533, "num_input_tokens_seen": 172354315, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8046875, "step": 8016, "time_per_iteration": 2.5694985389709473 }, { "auxiliary_loss_clip": 0.01122676, "auxiliary_loss_mlp": 0.01030344, "balance_loss_clip": 1.01724315, "balance_loss_mlp": 1.04235387, "epoch": 0.4820081166391102, "flos": 21105204272640.0, "grad_norm": 1.8760473785300003, "language_loss": 0.77415359, "learning_rate": 2.1132860322271436e-06, "loss": 0.79568374, "num_input_tokens_seen": 172372695, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 8017, "time_per_iteration": 2.561387300491333 }, { "auxiliary_loss_clip": 0.01122603, "auxiliary_loss_mlp": 0.01032434, "balance_loss_clip": 1.01781249, "balance_loss_mlp": 1.04147613, "epoch": 0.48206823989177816, "flos": 25264593423360.0, "grad_norm": 3.7603073436812653, "language_loss": 0.79742008, "learning_rate": 2.112908814475824e-06, "loss": 0.8189705, "num_input_tokens_seen": 172390905, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.72265625, "step": 8018, "time_per_iteration": 2.5331084728240967 }, { "auxiliary_loss_clip": 0.01144201, "auxiliary_loss_mlp": 0.01032141, "balance_loss_clip": 1.01896238, "balance_loss_mlp": 1.04063606, "epoch": 0.4821283631444461, "flos": 24645062620800.0, "grad_norm": 1.9538511258619715, "language_loss": 0.7593509, "learning_rate": 2.1125315926950802e-06, "loss": 0.7811144, "num_input_tokens_seen": 172412295, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.765625, "step": 8019, "time_per_iteration": 2.597522020339966 }, { "auxiliary_loss_clip": 0.01143419, "auxiliary_loss_mlp": 0.01034633, "balance_loss_clip": 1.02054214, "balance_loss_mlp": 1.03938746, "epoch": 0.4821884863971141, "flos": 23952094462080.0, "grad_norm": 1.834650961477056, "language_loss": 0.79022759, "learning_rate": 2.1121543668983718e-06, "loss": 0.81200814, "num_input_tokens_seen": 172432625, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76953125, "step": 8020, "time_per_iteration": 2.55839467048645 }, { "auxiliary_loss_clip": 0.01116513, "auxiliary_loss_mlp": 0.01034212, "balance_loss_clip": 1.02003217, "balance_loss_mlp": 1.04186726, "epoch": 0.48224860964978206, "flos": 17160668323200.0, "grad_norm": 2.0714007957861353, "language_loss": 0.69659513, "learning_rate": 2.1117771370991636e-06, "loss": 0.7181024, "num_input_tokens_seen": 172450010, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.74609375, "step": 8021, "time_per_iteration": 2.5752921104431152 }, { "auxiliary_loss_clip": 0.01138814, "auxiliary_loss_mlp": 0.01032024, "balance_loss_clip": 1.01798725, "balance_loss_mlp": 1.0414356, "epoch": 0.48230873290245, "flos": 23075838178560.0, "grad_norm": 1.8497116674259713, "language_loss": 0.63291532, "learning_rate": 2.111399903310916e-06, "loss": 0.65462375, "num_input_tokens_seen": 172469080, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.79296875, "step": 8022, "time_per_iteration": 2.5279388427734375 }, { "auxiliary_loss_clip": 0.01131228, "auxiliary_loss_mlp": 0.01275333, "balance_loss_clip": 1.01530981, "balance_loss_mlp": 1.03989398, "epoch": 0.482368856155118, "flos": 19353517718400.0, "grad_norm": 3.365253074190029, "language_loss": 0.66625357, "learning_rate": 2.1110226655470932e-06, "loss": 0.69031918, "num_input_tokens_seen": 172484850, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 8023, "time_per_iteration": 2.576587200164795 }, { "auxiliary_loss_clip": 0.01167766, "auxiliary_loss_mlp": 0.01032341, "balance_loss_clip": 1.01890647, "balance_loss_mlp": 1.03917956, "epoch": 0.48242897940778595, "flos": 20078984707200.0, "grad_norm": 1.7814860990821633, "language_loss": 0.76416022, "learning_rate": 2.1106454238211572e-06, "loss": 0.7861613, "num_input_tokens_seen": 172503525, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.74609375, "step": 8024, "time_per_iteration": 2.7120113372802734 }, { "auxiliary_loss_clip": 0.01133856, "auxiliary_loss_mlp": 0.01036692, "balance_loss_clip": 1.02149844, "balance_loss_mlp": 1.03999877, "epoch": 0.4824891026604539, "flos": 23403989854080.0, "grad_norm": 1.43869647586648, "language_loss": 0.75399542, "learning_rate": 2.11026817814657e-06, "loss": 0.77570093, "num_input_tokens_seen": 172524360, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.76171875, "step": 8025, "time_per_iteration": 2.634669065475464 }, { "auxiliary_loss_clip": 0.01115802, "auxiliary_loss_mlp": 0.0103436, "balance_loss_clip": 1.02048457, "balance_loss_mlp": 1.04030395, "epoch": 0.4825492259131219, "flos": 20368675895040.0, "grad_norm": 1.833488103571367, "language_loss": 0.7110045, "learning_rate": 2.1098909285367953e-06, "loss": 0.73250616, "num_input_tokens_seen": 172541480, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75390625, "step": 8026, "time_per_iteration": 2.4969677925109863 }, { "auxiliary_loss_clip": 0.01129996, "auxiliary_loss_mlp": 0.01037679, "balance_loss_clip": 1.02276564, "balance_loss_mlp": 1.0420773, "epoch": 0.48260934916578985, "flos": 14319021519360.0, "grad_norm": 2.751332519665663, "language_loss": 0.74578208, "learning_rate": 2.1095136750052957e-06, "loss": 0.7674588, "num_input_tokens_seen": 172559005, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7890625, "step": 8027, "time_per_iteration": 2.533684492111206 }, { "auxiliary_loss_clip": 0.01138311, "auxiliary_loss_mlp": 0.01039195, "balance_loss_clip": 1.02571237, "balance_loss_mlp": 1.04226112, "epoch": 0.4826694724184578, "flos": 22121152548480.0, "grad_norm": 1.8136272321826434, "language_loss": 0.67058736, "learning_rate": 2.1091364175655352e-06, "loss": 0.69236237, "num_input_tokens_seen": 172578435, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78125, "step": 8028, "time_per_iteration": 2.6205086708068848 }, { "auxiliary_loss_clip": 0.01135076, "auxiliary_loss_mlp": 0.01036718, "balance_loss_clip": 1.02253819, "balance_loss_mlp": 1.04164267, "epoch": 0.48272959567112583, "flos": 16181169373440.0, "grad_norm": 1.6245943897793218, "language_loss": 0.73008215, "learning_rate": 2.108759156230977e-06, "loss": 0.75180012, "num_input_tokens_seen": 172596095, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7578125, "step": 8029, "time_per_iteration": 2.523921251296997 }, { "auxiliary_loss_clip": 0.01136758, "auxiliary_loss_mlp": 0.01031196, "balance_loss_clip": 1.01679599, "balance_loss_mlp": 1.04156041, "epoch": 0.4827897189237938, "flos": 23180445561600.0, "grad_norm": 2.051662966304921, "language_loss": 0.84698707, "learning_rate": 2.1083818910150836e-06, "loss": 0.86866665, "num_input_tokens_seen": 172615255, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.76953125, "step": 8030, "time_per_iteration": 2.5718538761138916 }, { "auxiliary_loss_clip": 0.01131354, "auxiliary_loss_mlp": 0.01031472, "balance_loss_clip": 1.01826346, "balance_loss_mlp": 1.03898239, "epoch": 0.48284984217646176, "flos": 21652626522240.0, "grad_norm": 1.9775798507617333, "language_loss": 0.73855567, "learning_rate": 2.10800462193132e-06, "loss": 0.76018387, "num_input_tokens_seen": 172633185, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.74609375, "step": 8031, "time_per_iteration": 2.5722079277038574 }, { "auxiliary_loss_clip": 0.01137537, "auxiliary_loss_mlp": 0.0104619, "balance_loss_clip": 1.03131318, "balance_loss_mlp": 1.04031384, "epoch": 0.48290996542912973, "flos": 31467443304960.0, "grad_norm": 1.6021540494057835, "language_loss": 0.71551538, "learning_rate": 2.1076273489931483e-06, "loss": 0.73735261, "num_input_tokens_seen": 172654280, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.79296875, "step": 8032, "time_per_iteration": 4.052723169326782 }, { "auxiliary_loss_clip": 0.01124455, "auxiliary_loss_mlp": 0.01034678, "balance_loss_clip": 1.02092707, "balance_loss_mlp": 1.03990805, "epoch": 0.4829700886817977, "flos": 24461954064000.0, "grad_norm": 1.5101224027648241, "language_loss": 0.73852575, "learning_rate": 2.107250072214034e-06, "loss": 0.76011711, "num_input_tokens_seen": 172675545, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75390625, "step": 8033, "time_per_iteration": 2.5324203968048096 }, { "auxiliary_loss_clip": 0.01165445, "auxiliary_loss_mlp": 0.01038351, "balance_loss_clip": 1.02416492, "balance_loss_mlp": 1.04216564, "epoch": 0.48303021193446566, "flos": 25702164904320.0, "grad_norm": 1.6152574068252599, "language_loss": 0.83203954, "learning_rate": 2.1068727916074406e-06, "loss": 0.85407752, "num_input_tokens_seen": 172696455, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78515625, "step": 8034, "time_per_iteration": 2.645538091659546 }, { "auxiliary_loss_clip": 0.01141152, "auxiliary_loss_mlp": 0.01033151, "balance_loss_clip": 1.01995468, "balance_loss_mlp": 1.04142368, "epoch": 0.4830903351871336, "flos": 20085233673600.0, "grad_norm": 1.6735732833560313, "language_loss": 0.79129684, "learning_rate": 2.106495507186832e-06, "loss": 0.81303978, "num_input_tokens_seen": 172716720, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 8035, "time_per_iteration": 2.5947773456573486 }, { "auxiliary_loss_clip": 0.01136375, "auxiliary_loss_mlp": 0.0128717, "balance_loss_clip": 1.0247004, "balance_loss_mlp": 1.04178488, "epoch": 0.4831504584398016, "flos": 39452216014080.0, "grad_norm": 2.1964823167069802, "language_loss": 0.69827127, "learning_rate": 2.106118218965673e-06, "loss": 0.72250676, "num_input_tokens_seen": 172737435, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.76953125, "step": 8036, "time_per_iteration": 2.737675905227661 }, { "auxiliary_loss_clip": 0.01123826, "auxiliary_loss_mlp": 0.01036923, "balance_loss_clip": 1.02307642, "balance_loss_mlp": 1.03985405, "epoch": 0.48321058169246955, "flos": 20006588845440.0, "grad_norm": 2.0117402553531814, "language_loss": 0.72849232, "learning_rate": 2.105740926957427e-06, "loss": 0.75009978, "num_input_tokens_seen": 172755700, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 8037, "time_per_iteration": 3.993837833404541 }, { "auxiliary_loss_clip": 0.01160636, "auxiliary_loss_mlp": 0.01035969, "balance_loss_clip": 1.02081752, "balance_loss_mlp": 1.04348516, "epoch": 0.4832707049451375, "flos": 20741465197440.0, "grad_norm": 2.218660008350297, "language_loss": 0.69044596, "learning_rate": 2.1053636311755604e-06, "loss": 0.712412, "num_input_tokens_seen": 172775185, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 8038, "time_per_iteration": 2.5374536514282227 }, { "auxiliary_loss_clip": 0.01144306, "auxiliary_loss_mlp": 0.01038841, "balance_loss_clip": 1.02440441, "balance_loss_mlp": 1.04287028, "epoch": 0.4833308281978055, "flos": 33145584762240.0, "grad_norm": 1.624268814642304, "language_loss": 0.78997993, "learning_rate": 2.1049863316335356e-06, "loss": 0.81181145, "num_input_tokens_seen": 172796990, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 8039, "time_per_iteration": 2.678577423095703 }, { "auxiliary_loss_clip": 0.01126297, "auxiliary_loss_mlp": 0.01030929, "balance_loss_clip": 1.01746452, "balance_loss_mlp": 1.0409559, "epoch": 0.48339095145047345, "flos": 19099234362240.0, "grad_norm": 1.4588850633877457, "language_loss": 0.77157342, "learning_rate": 2.1046090283448198e-06, "loss": 0.79314566, "num_input_tokens_seen": 172814915, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.765625, "step": 8040, "time_per_iteration": 2.5042459964752197 }, { "auxiliary_loss_clip": 0.0112869, "auxiliary_loss_mlp": 0.01041228, "balance_loss_clip": 1.02672052, "balance_loss_mlp": 1.04210877, "epoch": 0.4834510747031414, "flos": 34459448440320.0, "grad_norm": 2.1059975577632497, "language_loss": 0.7578482, "learning_rate": 2.104231721322876e-06, "loss": 0.77954739, "num_input_tokens_seen": 172837060, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 8041, "time_per_iteration": 4.141900062561035 }, { "auxiliary_loss_clip": 0.01144012, "auxiliary_loss_mlp": 0.01034481, "balance_loss_clip": 1.02070069, "balance_loss_mlp": 1.04312372, "epoch": 0.48351119795580944, "flos": 27380845065600.0, "grad_norm": 2.6017526574424505, "language_loss": 0.66914582, "learning_rate": 2.1038544105811704e-06, "loss": 0.69093084, "num_input_tokens_seen": 172856545, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 8042, "time_per_iteration": 2.601372241973877 }, { "auxiliary_loss_clip": 0.01058495, "auxiliary_loss_mlp": 0.01002508, "balance_loss_clip": 1.00055337, "balance_loss_mlp": 1.01674628, "epoch": 0.4835713212084774, "flos": 67143941291520.0, "grad_norm": 0.6911088299614292, "language_loss": 0.58623123, "learning_rate": 2.103477096133168e-06, "loss": 0.60684133, "num_input_tokens_seen": 172923055, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.23828125, "step": 8043, "time_per_iteration": 4.718103885650635 }, { "auxiliary_loss_clip": 0.01136624, "auxiliary_loss_mlp": 0.01033849, "balance_loss_clip": 1.01928782, "balance_loss_mlp": 1.04108059, "epoch": 0.48363144446114537, "flos": 17967473660160.0, "grad_norm": 4.2642169442691555, "language_loss": 0.72101986, "learning_rate": 2.1030997779923344e-06, "loss": 0.74272454, "num_input_tokens_seen": 172940700, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.77734375, "step": 8044, "time_per_iteration": 2.653460741043091 }, { "auxiliary_loss_clip": 0.01151832, "auxiliary_loss_mlp": 0.01035749, "balance_loss_clip": 1.02108014, "balance_loss_mlp": 1.04163027, "epoch": 0.48369156771381333, "flos": 20593513077120.0, "grad_norm": 1.9782564069326676, "language_loss": 0.76046002, "learning_rate": 2.1027224561721352e-06, "loss": 0.78233588, "num_input_tokens_seen": 172961125, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7421875, "step": 8045, "time_per_iteration": 2.7515361309051514 }, { "auxiliary_loss_clip": 0.01152159, "auxiliary_loss_mlp": 0.01039113, "balance_loss_clip": 1.02571344, "balance_loss_mlp": 1.04013276, "epoch": 0.4837516909664813, "flos": 22675075159680.0, "grad_norm": 1.9814598314433196, "language_loss": 0.69315827, "learning_rate": 2.1023451306860355e-06, "loss": 0.71507102, "num_input_tokens_seen": 172980405, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.76953125, "step": 8046, "time_per_iteration": 2.651074171066284 }, { "auxiliary_loss_clip": 0.01117729, "auxiliary_loss_mlp": 0.01038531, "balance_loss_clip": 1.02454233, "balance_loss_mlp": 1.04185236, "epoch": 0.48381181421914926, "flos": 25518625384320.0, "grad_norm": 1.7758650630566264, "language_loss": 0.82622766, "learning_rate": 2.101967801547501e-06, "loss": 0.84779024, "num_input_tokens_seen": 172999105, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 8047, "time_per_iteration": 2.6216788291931152 }, { "auxiliary_loss_clip": 0.01133672, "auxiliary_loss_mlp": 0.01032233, "balance_loss_clip": 1.01846457, "balance_loss_mlp": 1.04082882, "epoch": 0.4838719374718172, "flos": 24207491139840.0, "grad_norm": 2.1072793256890283, "language_loss": 0.80541873, "learning_rate": 2.1015904687699988e-06, "loss": 0.82707775, "num_input_tokens_seen": 173019935, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 8048, "time_per_iteration": 2.6012260913848877 }, { "auxiliary_loss_clip": 0.0112875, "auxiliary_loss_mlp": 0.01038552, "balance_loss_clip": 1.02446163, "balance_loss_mlp": 1.04126215, "epoch": 0.4839320607244852, "flos": 26724577628160.0, "grad_norm": 1.6032954943886604, "language_loss": 0.80847538, "learning_rate": 2.101213132366993e-06, "loss": 0.83014834, "num_input_tokens_seen": 173039700, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 8049, "time_per_iteration": 2.644108295440674 }, { "auxiliary_loss_clip": 0.01133829, "auxiliary_loss_mlp": 0.01030263, "balance_loss_clip": 1.01784718, "balance_loss_mlp": 1.04328883, "epoch": 0.48399218397715316, "flos": 20448900921600.0, "grad_norm": 1.6078516856208982, "language_loss": 0.72237748, "learning_rate": 2.100835792351952e-06, "loss": 0.74401844, "num_input_tokens_seen": 173059170, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.72265625, "step": 8050, "time_per_iteration": 2.558457136154175 }, { "auxiliary_loss_clip": 0.0104823, "auxiliary_loss_mlp": 0.01251149, "balance_loss_clip": 1.00303578, "balance_loss_mlp": 1.01522398, "epoch": 0.4840523072298211, "flos": 67180570185600.0, "grad_norm": 0.726263443222333, "language_loss": 0.56376195, "learning_rate": 2.1004584487383405e-06, "loss": 0.58675575, "num_input_tokens_seen": 173119000, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.23828125, "step": 8051, "time_per_iteration": 3.2225565910339355 }, { "auxiliary_loss_clip": 0.01147487, "auxiliary_loss_mlp": 0.0103392, "balance_loss_clip": 1.01993108, "balance_loss_mlp": 1.0440042, "epoch": 0.4841124304824891, "flos": 22411490181120.0, "grad_norm": 2.712700508382025, "language_loss": 0.75266403, "learning_rate": 2.1000811015396248e-06, "loss": 0.77447808, "num_input_tokens_seen": 173137570, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.76953125, "step": 8052, "time_per_iteration": 2.5266733169555664 }, { "auxiliary_loss_clip": 0.01126658, "auxiliary_loss_mlp": 0.01030189, "balance_loss_clip": 1.01670635, "balance_loss_mlp": 1.04160917, "epoch": 0.48417255373515705, "flos": 13843959217920.0, "grad_norm": 2.8604878929240134, "language_loss": 0.66175175, "learning_rate": 2.0997037507692726e-06, "loss": 0.68332022, "num_input_tokens_seen": 173154355, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76171875, "step": 8053, "time_per_iteration": 2.5350048542022705 }, { "auxiliary_loss_clip": 0.01121306, "auxiliary_loss_mlp": 0.01032967, "balance_loss_clip": 1.01999664, "balance_loss_mlp": 1.03891742, "epoch": 0.484232676987825, "flos": 31649689935360.0, "grad_norm": 1.8633663300052083, "language_loss": 0.68965328, "learning_rate": 2.0993263964407494e-06, "loss": 0.71119595, "num_input_tokens_seen": 173174845, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 8054, "time_per_iteration": 2.5989038944244385 }, { "auxiliary_loss_clip": 0.01168508, "auxiliary_loss_mlp": 0.01036811, "balance_loss_clip": 1.02283931, "balance_loss_mlp": 1.03970337, "epoch": 0.48429280024049304, "flos": 24095377814400.0, "grad_norm": 2.0279932157240617, "language_loss": 0.69551474, "learning_rate": 2.0989490385675237e-06, "loss": 0.71756792, "num_input_tokens_seen": 173195025, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 8055, "time_per_iteration": 2.6982650756835938 }, { "auxiliary_loss_clip": 0.01126873, "auxiliary_loss_mlp": 0.01032731, "balance_loss_clip": 1.01879549, "balance_loss_mlp": 1.0418179, "epoch": 0.484352923493161, "flos": 17530081747200.0, "grad_norm": 2.0189138906851465, "language_loss": 0.62868834, "learning_rate": 2.0985716771630604e-06, "loss": 0.65028441, "num_input_tokens_seen": 173213065, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 8056, "time_per_iteration": 2.570523738861084 }, { "auxiliary_loss_clip": 0.01124554, "auxiliary_loss_mlp": 0.01031883, "balance_loss_clip": 1.01774454, "balance_loss_mlp": 1.03970146, "epoch": 0.48441304674582897, "flos": 29166862043520.0, "grad_norm": 1.7762205414434158, "language_loss": 0.67602348, "learning_rate": 2.0981943122408278e-06, "loss": 0.69758785, "num_input_tokens_seen": 173234545, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7578125, "step": 8057, "time_per_iteration": 2.6952896118164062 }, { "auxiliary_loss_clip": 0.01140541, "auxiliary_loss_mlp": 0.01033086, "balance_loss_clip": 1.01967549, "balance_loss_mlp": 1.03873849, "epoch": 0.48447316999849693, "flos": 15886701676800.0, "grad_norm": 2.1077586500022574, "language_loss": 0.81704843, "learning_rate": 2.097816943814293e-06, "loss": 0.83878469, "num_input_tokens_seen": 173252175, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 8058, "time_per_iteration": 2.5725064277648926 }, { "auxiliary_loss_clip": 0.01128707, "auxiliary_loss_mlp": 0.01034335, "balance_loss_clip": 1.02011943, "balance_loss_mlp": 1.04109979, "epoch": 0.4845332932511649, "flos": 24381405815040.0, "grad_norm": 1.728796125065186, "language_loss": 0.79576194, "learning_rate": 2.097439571896923e-06, "loss": 0.81739235, "num_input_tokens_seen": 173268790, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 8059, "time_per_iteration": 2.5309512615203857 }, { "auxiliary_loss_clip": 0.01121472, "auxiliary_loss_mlp": 0.01038613, "balance_loss_clip": 1.02414083, "balance_loss_mlp": 1.04238892, "epoch": 0.48459341650383286, "flos": 37116478316160.0, "grad_norm": 1.9478106000537074, "language_loss": 0.66338128, "learning_rate": 2.097062196502185e-06, "loss": 0.68498206, "num_input_tokens_seen": 173288030, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 8060, "time_per_iteration": 2.7069478034973145 }, { "auxiliary_loss_clip": 0.01128677, "auxiliary_loss_mlp": 0.01039688, "balance_loss_clip": 1.02577686, "balance_loss_mlp": 1.04313707, "epoch": 0.48465353975650083, "flos": 22966777509120.0, "grad_norm": 1.8943838384419147, "language_loss": 0.67223883, "learning_rate": 2.096684817643547e-06, "loss": 0.69392252, "num_input_tokens_seen": 173305965, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 8061, "time_per_iteration": 2.5640852451324463 }, { "auxiliary_loss_clip": 0.01137596, "auxiliary_loss_mlp": 0.01044231, "balance_loss_clip": 1.02904427, "balance_loss_mlp": 1.04261386, "epoch": 0.4847136630091688, "flos": 17707695523200.0, "grad_norm": 2.1131517945330964, "language_loss": 0.8218869, "learning_rate": 2.0963074353344765e-06, "loss": 0.84370512, "num_input_tokens_seen": 173321985, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7734375, "step": 8062, "time_per_iteration": 2.59572696685791 }, { "auxiliary_loss_clip": 0.01161825, "auxiliary_loss_mlp": 0.01031455, "balance_loss_clip": 1.01824021, "balance_loss_mlp": 1.0419271, "epoch": 0.48477378626183676, "flos": 22018269018240.0, "grad_norm": 2.4464325227971626, "language_loss": 0.74467242, "learning_rate": 2.0959300495884416e-06, "loss": 0.7666052, "num_input_tokens_seen": 173341315, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 8063, "time_per_iteration": 2.5958986282348633 }, { "auxiliary_loss_clip": 0.01134412, "auxiliary_loss_mlp": 0.01033322, "balance_loss_clip": 1.0189693, "balance_loss_mlp": 1.04034996, "epoch": 0.4848339095145047, "flos": 27962956874880.0, "grad_norm": 1.9340946689585654, "language_loss": 0.78560454, "learning_rate": 2.0955526604189104e-06, "loss": 0.80728185, "num_input_tokens_seen": 173361055, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.76171875, "step": 8064, "time_per_iteration": 2.5954954624176025 }, { "auxiliary_loss_clip": 0.01115314, "auxiliary_loss_mlp": 0.01283144, "balance_loss_clip": 1.02381444, "balance_loss_mlp": 1.04328942, "epoch": 0.4848940327671727, "flos": 21688752625920.0, "grad_norm": 3.04595150699347, "language_loss": 0.78651428, "learning_rate": 2.09517526783935e-06, "loss": 0.81049883, "num_input_tokens_seen": 173379255, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 8065, "time_per_iteration": 2.5038697719573975 }, { "auxiliary_loss_clip": 0.01118932, "auxiliary_loss_mlp": 0.01277139, "balance_loss_clip": 1.01742983, "balance_loss_mlp": 1.03922272, "epoch": 0.48495415601984065, "flos": 20631578515200.0, "grad_norm": 1.5700843130459672, "language_loss": 0.76255757, "learning_rate": 2.094797871863229e-06, "loss": 0.78651834, "num_input_tokens_seen": 173398370, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 8066, "time_per_iteration": 2.6088268756866455 }, { "auxiliary_loss_clip": 0.01115387, "auxiliary_loss_mlp": 0.01030727, "balance_loss_clip": 1.01736367, "balance_loss_mlp": 1.0413518, "epoch": 0.4850142792725086, "flos": 25628152930560.0, "grad_norm": 1.326317756688877, "language_loss": 0.7179895, "learning_rate": 2.094420472504016e-06, "loss": 0.73945063, "num_input_tokens_seen": 173419595, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 8067, "time_per_iteration": 2.5535037517547607 }, { "auxiliary_loss_clip": 0.01160822, "auxiliary_loss_mlp": 0.01035522, "balance_loss_clip": 1.02150297, "balance_loss_mlp": 1.04048634, "epoch": 0.4850744025251766, "flos": 13771958405760.0, "grad_norm": 2.3286799165963914, "language_loss": 0.79308838, "learning_rate": 2.0940430697751796e-06, "loss": 0.81505185, "num_input_tokens_seen": 173435390, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 8068, "time_per_iteration": 2.608353614807129 }, { "auxiliary_loss_clip": 0.01130509, "auxiliary_loss_mlp": 0.01033317, "balance_loss_clip": 1.0206039, "balance_loss_mlp": 1.03925288, "epoch": 0.4851345257778446, "flos": 20261339078400.0, "grad_norm": 1.4886543338743847, "language_loss": 0.84271038, "learning_rate": 2.093665663690187e-06, "loss": 0.86434865, "num_input_tokens_seen": 173454095, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 8069, "time_per_iteration": 2.6042304039001465 }, { "auxiliary_loss_clip": 0.01142125, "auxiliary_loss_mlp": 0.01031935, "balance_loss_clip": 1.01885772, "balance_loss_mlp": 1.04162991, "epoch": 0.48519464903051257, "flos": 27089681420160.0, "grad_norm": 1.7687877103292664, "language_loss": 0.77864194, "learning_rate": 2.0932882542625085e-06, "loss": 0.80038261, "num_input_tokens_seen": 173475300, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 8070, "time_per_iteration": 2.715520143508911 }, { "auxiliary_loss_clip": 0.01161254, "auxiliary_loss_mlp": 0.01033241, "balance_loss_clip": 1.01882839, "balance_loss_mlp": 1.04172611, "epoch": 0.48525477228318054, "flos": 17127235739520.0, "grad_norm": 1.9904590008352023, "language_loss": 0.77538526, "learning_rate": 2.0929108415056115e-06, "loss": 0.79733014, "num_input_tokens_seen": 173492005, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75390625, "step": 8071, "time_per_iteration": 2.5077970027923584 }, { "auxiliary_loss_clip": 0.01159152, "auxiliary_loss_mlp": 0.01032315, "balance_loss_clip": 1.01792669, "balance_loss_mlp": 1.03878355, "epoch": 0.4853148955358485, "flos": 28180324028160.0, "grad_norm": 1.9467205014701294, "language_loss": 0.71896094, "learning_rate": 2.0925334254329667e-06, "loss": 0.7408756, "num_input_tokens_seen": 173511995, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 8072, "time_per_iteration": 2.6386942863464355 }, { "auxiliary_loss_clip": 0.01156404, "auxiliary_loss_mlp": 0.01040319, "balance_loss_clip": 1.02607942, "balance_loss_mlp": 1.04228771, "epoch": 0.48537501878851647, "flos": 17493309198720.0, "grad_norm": 1.9337859005221525, "language_loss": 0.87498587, "learning_rate": 2.092156006058041e-06, "loss": 0.89695311, "num_input_tokens_seen": 173530215, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 8073, "time_per_iteration": 3.9368369579315186 }, { "auxiliary_loss_clip": 0.01139573, "auxiliary_loss_mlp": 0.01029799, "balance_loss_clip": 1.01637042, "balance_loss_mlp": 1.04088759, "epoch": 0.48543514204118443, "flos": 28584857975040.0, "grad_norm": 1.8232420510914913, "language_loss": 0.60840166, "learning_rate": 2.0917785833943044e-06, "loss": 0.63009536, "num_input_tokens_seen": 173550920, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 8074, "time_per_iteration": 2.6693015098571777 }, { "auxiliary_loss_clip": 0.01143861, "auxiliary_loss_mlp": 0.01287608, "balance_loss_clip": 1.02610612, "balance_loss_mlp": 1.03980494, "epoch": 0.4854952652938524, "flos": 20959981585920.0, "grad_norm": 1.6259976691450917, "language_loss": 0.73286802, "learning_rate": 2.091401157455227e-06, "loss": 0.75718266, "num_input_tokens_seen": 173569065, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.76953125, "step": 8075, "time_per_iteration": 2.6627042293548584 }, { "auxiliary_loss_clip": 0.01111868, "auxiliary_loss_mlp": 0.01035192, "balance_loss_clip": 1.02199543, "balance_loss_mlp": 1.04014122, "epoch": 0.48555538854652036, "flos": 66529543155840.0, "grad_norm": 1.5555190729739674, "language_loss": 0.81579709, "learning_rate": 2.091023728254277e-06, "loss": 0.83726764, "num_input_tokens_seen": 173596085, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 8076, "time_per_iteration": 2.9045751094818115 }, { "auxiliary_loss_clip": 0.01134238, "auxiliary_loss_mlp": 0.01032627, "balance_loss_clip": 1.01744545, "balance_loss_mlp": 1.04155183, "epoch": 0.4856155117991883, "flos": 15924982596480.0, "grad_norm": 1.832609425507973, "language_loss": 0.86359602, "learning_rate": 2.0906462958049247e-06, "loss": 0.88526464, "num_input_tokens_seen": 173613900, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.75, "step": 8077, "time_per_iteration": 2.698129892349243 }, { "auxiliary_loss_clip": 0.0106248, "auxiliary_loss_mlp": 0.01000583, "balance_loss_clip": 0.99891418, "balance_loss_mlp": 1.01197529, "epoch": 0.4856756350518563, "flos": 71047395060480.0, "grad_norm": 0.9005033315900269, "language_loss": 0.5839383, "learning_rate": 2.090268860120638e-06, "loss": 0.60456902, "num_input_tokens_seen": 173671305, "router_z_loss_clip": 0.01672363, "router_z_loss_mlp": 0.23828125, "step": 8078, "time_per_iteration": 4.5148255825042725 }, { "auxiliary_loss_clip": 0.01120868, "auxiliary_loss_mlp": 0.01034429, "balance_loss_clip": 1.02074981, "balance_loss_mlp": 1.04253232, "epoch": 0.48573575830452426, "flos": 29825679346560.0, "grad_norm": 3.0605186323761924, "language_loss": 0.72141075, "learning_rate": 2.0898914212148895e-06, "loss": 0.74296367, "num_input_tokens_seen": 173692070, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78515625, "step": 8079, "time_per_iteration": 2.5702691078186035 }, { "auxiliary_loss_clip": 0.01146254, "auxiliary_loss_mlp": 0.01033588, "balance_loss_clip": 1.01979566, "balance_loss_mlp": 1.04308915, "epoch": 0.4857958815571922, "flos": 17639501552640.0, "grad_norm": 2.2493260898891476, "language_loss": 0.79290581, "learning_rate": 2.089513979101147e-06, "loss": 0.8147043, "num_input_tokens_seen": 173709785, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.765625, "step": 8080, "time_per_iteration": 2.586266040802002 }, { "auxiliary_loss_clip": 0.01132287, "auxiliary_loss_mlp": 0.01032176, "balance_loss_clip": 1.01770377, "balance_loss_mlp": 1.03880286, "epoch": 0.4858560048098602, "flos": 21105491581440.0, "grad_norm": 2.941044001501938, "language_loss": 0.84032249, "learning_rate": 2.0891365337928803e-06, "loss": 0.86196715, "num_input_tokens_seen": 173728770, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 8081, "time_per_iteration": 2.5517923831939697 }, { "auxiliary_loss_clip": 0.011254, "auxiliary_loss_mlp": 0.01040038, "balance_loss_clip": 1.02438009, "balance_loss_mlp": 1.04082739, "epoch": 0.4859161280625282, "flos": 22090844448000.0, "grad_norm": 1.5970883082701175, "language_loss": 0.8323859, "learning_rate": 2.0887590853035604e-06, "loss": 0.85404027, "num_input_tokens_seen": 173747355, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.75390625, "step": 8082, "time_per_iteration": 2.6480581760406494 }, { "auxiliary_loss_clip": 0.01132432, "auxiliary_loss_mlp": 0.01034806, "balance_loss_clip": 1.02197933, "balance_loss_mlp": 1.04233479, "epoch": 0.4859762513151962, "flos": 17493452853120.0, "grad_norm": 4.062522053268692, "language_loss": 0.86849093, "learning_rate": 2.0883816336466567e-06, "loss": 0.8901633, "num_input_tokens_seen": 173764825, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 8083, "time_per_iteration": 4.106275320053101 }, { "auxiliary_loss_clip": 0.01149019, "auxiliary_loss_mlp": 0.01040951, "balance_loss_clip": 1.02758181, "balance_loss_mlp": 1.04015636, "epoch": 0.48603637456786414, "flos": 18004246208640.0, "grad_norm": 1.6956728992320136, "language_loss": 0.80783927, "learning_rate": 2.0880041788356402e-06, "loss": 0.82973891, "num_input_tokens_seen": 173783215, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 8084, "time_per_iteration": 4.033481121063232 }, { "auxiliary_loss_clip": 0.01127004, "auxiliary_loss_mlp": 0.01037285, "balance_loss_clip": 1.02249682, "balance_loss_mlp": 1.04211473, "epoch": 0.4860964978205321, "flos": 22492038430080.0, "grad_norm": 3.1368384152573894, "language_loss": 0.68305218, "learning_rate": 2.087626720883981e-06, "loss": 0.70469511, "num_input_tokens_seen": 173801905, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76171875, "step": 8085, "time_per_iteration": 2.6009185314178467 }, { "auxiliary_loss_clip": 0.0112629, "auxiliary_loss_mlp": 0.01036365, "balance_loss_clip": 1.02231586, "balance_loss_mlp": 1.04121172, "epoch": 0.48615662107320007, "flos": 23372532518400.0, "grad_norm": 1.6044675607728591, "language_loss": 0.77478075, "learning_rate": 2.0872492598051486e-06, "loss": 0.79640728, "num_input_tokens_seen": 173824690, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 8086, "time_per_iteration": 2.6495184898376465 }, { "auxiliary_loss_clip": 0.01115368, "auxiliary_loss_mlp": 0.01028853, "balance_loss_clip": 1.01475632, "balance_loss_mlp": 1.040326, "epoch": 0.48621674432586803, "flos": 34418833136640.0, "grad_norm": 2.0250543713718194, "language_loss": 0.70027244, "learning_rate": 2.0868717956126155e-06, "loss": 0.72171462, "num_input_tokens_seen": 173844450, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 8087, "time_per_iteration": 2.6778950691223145 }, { "auxiliary_loss_clip": 0.01129433, "auxiliary_loss_mlp": 0.01039717, "balance_loss_clip": 1.02509022, "balance_loss_mlp": 1.03941512, "epoch": 0.486276867578536, "flos": 33107555237760.0, "grad_norm": 2.2523477305800235, "language_loss": 0.7228058, "learning_rate": 2.086494328319851e-06, "loss": 0.7444973, "num_input_tokens_seen": 173864975, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 8088, "time_per_iteration": 2.7112786769866943 }, { "auxiliary_loss_clip": 0.01134983, "auxiliary_loss_mlp": 0.01033894, "balance_loss_clip": 1.01979709, "balance_loss_mlp": 1.0419395, "epoch": 0.48633699083120396, "flos": 21470703114240.0, "grad_norm": 1.653168925739946, "language_loss": 0.75004929, "learning_rate": 2.086116857940327e-06, "loss": 0.77173805, "num_input_tokens_seen": 173883805, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 8089, "time_per_iteration": 2.5824811458587646 }, { "auxiliary_loss_clip": 0.01145535, "auxiliary_loss_mlp": 0.01030862, "balance_loss_clip": 1.0165267, "balance_loss_mlp": 1.04107368, "epoch": 0.48639711408387193, "flos": 20084335833600.0, "grad_norm": 1.6652799017155497, "language_loss": 0.83814543, "learning_rate": 2.0857393844875135e-06, "loss": 0.85990936, "num_input_tokens_seen": 173903520, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.77734375, "step": 8090, "time_per_iteration": 2.6093056201934814 }, { "auxiliary_loss_clip": 0.01114284, "auxiliary_loss_mlp": 0.0103457, "balance_loss_clip": 1.02063489, "balance_loss_mlp": 1.04142451, "epoch": 0.4864572373365399, "flos": 20778884190720.0, "grad_norm": 1.55023256545631, "language_loss": 0.75904012, "learning_rate": 2.0853619079748815e-06, "loss": 0.78052872, "num_input_tokens_seen": 173924255, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7265625, "step": 8091, "time_per_iteration": 2.5656116008758545 }, { "auxiliary_loss_clip": 0.01127529, "auxiliary_loss_mlp": 0.01031837, "balance_loss_clip": 1.01743698, "balance_loss_mlp": 1.04214382, "epoch": 0.48651736058920786, "flos": 26025360503040.0, "grad_norm": 1.5214798436171255, "language_loss": 0.80046487, "learning_rate": 2.0849844284159035e-06, "loss": 0.8220585, "num_input_tokens_seen": 173943285, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 8092, "time_per_iteration": 2.726966142654419 }, { "auxiliary_loss_clip": 0.01137831, "auxiliary_loss_mlp": 0.01275983, "balance_loss_clip": 1.01503968, "balance_loss_mlp": 1.04274917, "epoch": 0.4865774838418758, "flos": 20485601642880.0, "grad_norm": 1.4640747431875818, "language_loss": 0.71581036, "learning_rate": 2.084606945824049e-06, "loss": 0.73994851, "num_input_tokens_seen": 173962205, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76953125, "step": 8093, "time_per_iteration": 2.5420632362365723 }, { "auxiliary_loss_clip": 0.01116227, "auxiliary_loss_mlp": 0.01033243, "balance_loss_clip": 1.01976609, "balance_loss_mlp": 1.04154015, "epoch": 0.4866376070945438, "flos": 23547704169600.0, "grad_norm": 1.7976653698789051, "language_loss": 0.68343711, "learning_rate": 2.0842294602127916e-06, "loss": 0.70493174, "num_input_tokens_seen": 173980945, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 8094, "time_per_iteration": 2.574341297149658 }, { "auxiliary_loss_clip": 0.01140493, "auxiliary_loss_mlp": 0.01038959, "balance_loss_clip": 1.02368832, "balance_loss_mlp": 1.04363883, "epoch": 0.4866977303472118, "flos": 16690598012160.0, "grad_norm": 2.065617626546367, "language_loss": 0.66762537, "learning_rate": 2.0838519715956006e-06, "loss": 0.68941993, "num_input_tokens_seen": 173998860, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.79296875, "step": 8095, "time_per_iteration": 2.5099246501922607 }, { "auxiliary_loss_clip": 0.01128986, "auxiliary_loss_mlp": 0.01035134, "balance_loss_clip": 1.02020884, "balance_loss_mlp": 1.04218864, "epoch": 0.4867578535998798, "flos": 17896011552000.0, "grad_norm": 2.134119826107847, "language_loss": 0.7876581, "learning_rate": 2.0834744799859475e-06, "loss": 0.80929923, "num_input_tokens_seen": 174016665, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.77734375, "step": 8096, "time_per_iteration": 2.619476556777954 }, { "auxiliary_loss_clip": 0.01133059, "auxiliary_loss_mlp": 0.01030758, "balance_loss_clip": 1.01688218, "balance_loss_mlp": 1.04029894, "epoch": 0.48681797685254774, "flos": 22637799820800.0, "grad_norm": 1.7839305343369711, "language_loss": 0.67493743, "learning_rate": 2.0830969853973063e-06, "loss": 0.69657558, "num_input_tokens_seen": 174034800, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.74609375, "step": 8097, "time_per_iteration": 2.5697243213653564 }, { "auxiliary_loss_clip": 0.01134057, "auxiliary_loss_mlp": 0.01030477, "balance_loss_clip": 1.0163269, "balance_loss_mlp": 1.04119718, "epoch": 0.4868781001052157, "flos": 20886077352960.0, "grad_norm": 1.539960326291627, "language_loss": 0.71446019, "learning_rate": 2.0827194878431464e-06, "loss": 0.7361055, "num_input_tokens_seen": 174054445, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 8098, "time_per_iteration": 2.594045639038086 }, { "auxiliary_loss_clip": 0.01133209, "auxiliary_loss_mlp": 0.01033806, "balance_loss_clip": 1.01835048, "balance_loss_mlp": 1.04347944, "epoch": 0.48693822335788367, "flos": 41974940937600.0, "grad_norm": 1.8967061595069608, "language_loss": 0.66079211, "learning_rate": 2.082341987336941e-06, "loss": 0.68246222, "num_input_tokens_seen": 174077890, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8125, "step": 8099, "time_per_iteration": 2.7590293884277344 }, { "auxiliary_loss_clip": 0.01150152, "auxiliary_loss_mlp": 0.0103878, "balance_loss_clip": 1.02503502, "balance_loss_mlp": 1.03933895, "epoch": 0.48699834661055164, "flos": 24243294021120.0, "grad_norm": 1.9188387286450646, "language_loss": 0.66719258, "learning_rate": 2.0819644838921618e-06, "loss": 0.68908191, "num_input_tokens_seen": 174097460, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75390625, "step": 8100, "time_per_iteration": 2.7037880420684814 }, { "auxiliary_loss_clip": 0.0114008, "auxiliary_loss_mlp": 0.01030937, "balance_loss_clip": 1.01748466, "balance_loss_mlp": 1.04067886, "epoch": 0.4870584698632196, "flos": 25923877603200.0, "grad_norm": 1.4138852743970567, "language_loss": 0.76412964, "learning_rate": 2.0815869775222816e-06, "loss": 0.78583986, "num_input_tokens_seen": 174120775, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 8101, "time_per_iteration": 2.6161317825317383 }, { "auxiliary_loss_clip": 0.01037161, "auxiliary_loss_mlp": 0.01006196, "balance_loss_clip": 1.00481343, "balance_loss_mlp": 1.013273, "epoch": 0.48711859311588757, "flos": 70211933648640.0, "grad_norm": 0.684183992417975, "language_loss": 0.52273893, "learning_rate": 2.0812094682407718e-06, "loss": 0.54317248, "num_input_tokens_seen": 174189135, "router_z_loss_clip": 0.01385498, "router_z_loss_mlp": 0.23828125, "step": 8102, "time_per_iteration": 3.3217179775238037 }, { "auxiliary_loss_clip": 0.01137876, "auxiliary_loss_mlp": 0.01036505, "balance_loss_clip": 1.02307034, "balance_loss_mlp": 1.04274535, "epoch": 0.48717871636855553, "flos": 12342964659840.0, "grad_norm": 2.069995413423852, "language_loss": 0.73671937, "learning_rate": 2.080831956061105e-06, "loss": 0.75846314, "num_input_tokens_seen": 174203250, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.765625, "step": 8103, "time_per_iteration": 2.5723626613616943 }, { "auxiliary_loss_clip": 0.01129845, "auxiliary_loss_mlp": 0.01043325, "balance_loss_clip": 1.02832878, "balance_loss_mlp": 1.04369438, "epoch": 0.4872388396212235, "flos": 23477139901440.0, "grad_norm": 2.535059093868774, "language_loss": 0.63048118, "learning_rate": 2.0804544409967534e-06, "loss": 0.65221286, "num_input_tokens_seen": 174224145, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7734375, "step": 8104, "time_per_iteration": 2.649212598800659 }, { "auxiliary_loss_clip": 0.01118484, "auxiliary_loss_mlp": 0.0102795, "balance_loss_clip": 1.01388383, "balance_loss_mlp": 1.04337358, "epoch": 0.48729896287389146, "flos": 31427582186880.0, "grad_norm": 1.5960422488940416, "language_loss": 0.69097745, "learning_rate": 2.0800769230611897e-06, "loss": 0.7124418, "num_input_tokens_seen": 174244435, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 8105, "time_per_iteration": 2.675640344619751 }, { "auxiliary_loss_clip": 0.01140326, "auxiliary_loss_mlp": 0.01030139, "balance_loss_clip": 1.01704407, "balance_loss_mlp": 1.04017222, "epoch": 0.4873590861265594, "flos": 19057936700160.0, "grad_norm": 1.771913734986796, "language_loss": 0.73680568, "learning_rate": 2.079699402267887e-06, "loss": 0.75851029, "num_input_tokens_seen": 174262710, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 8106, "time_per_iteration": 2.5315682888031006 }, { "auxiliary_loss_clip": 0.01141818, "auxiliary_loss_mlp": 0.012808, "balance_loss_clip": 1.01945889, "balance_loss_mlp": 1.04547656, "epoch": 0.4874192093792274, "flos": 24348296453760.0, "grad_norm": 1.670979421177996, "language_loss": 0.76517236, "learning_rate": 2.0793218786303176e-06, "loss": 0.78939855, "num_input_tokens_seen": 174281545, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 8107, "time_per_iteration": 2.626833438873291 }, { "auxiliary_loss_clip": 0.01150598, "auxiliary_loss_mlp": 0.0103018, "balance_loss_clip": 1.01734078, "balance_loss_mlp": 1.0424366, "epoch": 0.4874793326318954, "flos": 23112610727040.0, "grad_norm": 1.8973769558778353, "language_loss": 0.75093973, "learning_rate": 2.0789443521619536e-06, "loss": 0.77274752, "num_input_tokens_seen": 174300290, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 8108, "time_per_iteration": 2.5888147354125977 }, { "auxiliary_loss_clip": 0.01123516, "auxiliary_loss_mlp": 0.01031768, "balance_loss_clip": 1.02000237, "balance_loss_mlp": 1.04211068, "epoch": 0.4875394558845634, "flos": 19026156142080.0, "grad_norm": 1.734980296999352, "language_loss": 0.7351917, "learning_rate": 2.07856682287627e-06, "loss": 0.7567445, "num_input_tokens_seen": 174318490, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.7265625, "step": 8109, "time_per_iteration": 2.552887439727783 }, { "auxiliary_loss_clip": 0.01124366, "auxiliary_loss_mlp": 0.01032197, "balance_loss_clip": 1.01885724, "balance_loss_mlp": 1.04192972, "epoch": 0.48759957913723134, "flos": 21433607343360.0, "grad_norm": 1.6805844063545563, "language_loss": 0.78512454, "learning_rate": 2.078189290786738e-06, "loss": 0.80669016, "num_input_tokens_seen": 174335505, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 8110, "time_per_iteration": 2.550103187561035 }, { "auxiliary_loss_clip": 0.0112869, "auxiliary_loss_mlp": 0.01042617, "balance_loss_clip": 1.02829957, "balance_loss_mlp": 1.04401374, "epoch": 0.4876597023898993, "flos": 17748669962880.0, "grad_norm": 1.9532166188173032, "language_loss": 0.71625519, "learning_rate": 2.0778117559068307e-06, "loss": 0.73796827, "num_input_tokens_seen": 174353990, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7578125, "step": 8111, "time_per_iteration": 2.5501317977905273 }, { "auxiliary_loss_clip": 0.01125624, "auxiliary_loss_mlp": 0.01036063, "balance_loss_clip": 1.02216923, "balance_loss_mlp": 1.04351997, "epoch": 0.4877198256425673, "flos": 17019647527680.0, "grad_norm": 1.5626041915016875, "language_loss": 0.75821477, "learning_rate": 2.077434218250023e-06, "loss": 0.77983159, "num_input_tokens_seen": 174373425, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73046875, "step": 8112, "time_per_iteration": 2.515501022338867 }, { "auxiliary_loss_clip": 0.01132475, "auxiliary_loss_mlp": 0.01036638, "balance_loss_clip": 1.02235723, "balance_loss_mlp": 1.04340899, "epoch": 0.48777994889523524, "flos": 22384091082240.0, "grad_norm": 1.692776249498954, "language_loss": 0.74916238, "learning_rate": 2.0770566778297868e-06, "loss": 0.77085352, "num_input_tokens_seen": 174393070, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 8113, "time_per_iteration": 2.6889681816101074 }, { "auxiliary_loss_clip": 0.01053959, "auxiliary_loss_mlp": 0.01255788, "balance_loss_clip": 1.00803649, "balance_loss_mlp": 1.01225519, "epoch": 0.4878400721479032, "flos": 61241772159360.0, "grad_norm": 0.7904200536192997, "language_loss": 0.48867935, "learning_rate": 2.076679134659596e-06, "loss": 0.51177686, "num_input_tokens_seen": 174446880, "router_z_loss_clip": 0.01550293, "router_z_loss_mlp": 0.23828125, "step": 8114, "time_per_iteration": 3.047187328338623 }, { "auxiliary_loss_clip": 0.01148902, "auxiliary_loss_mlp": 0.0104168, "balance_loss_clip": 1.02649236, "balance_loss_mlp": 1.04376233, "epoch": 0.48790019540057117, "flos": 24536612482560.0, "grad_norm": 1.4494270588220628, "language_loss": 0.7684707, "learning_rate": 2.0763015887529235e-06, "loss": 0.79037654, "num_input_tokens_seen": 174468485, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78125, "step": 8115, "time_per_iteration": 4.035052537918091 }, { "auxiliary_loss_clip": 0.01141056, "auxiliary_loss_mlp": 0.01040302, "balance_loss_clip": 1.02584219, "balance_loss_mlp": 1.0451622, "epoch": 0.48796031865323913, "flos": 21833939399040.0, "grad_norm": 2.4988355620820673, "language_loss": 0.71578157, "learning_rate": 2.0759240401232444e-06, "loss": 0.73759508, "num_input_tokens_seen": 174486360, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 8116, "time_per_iteration": 2.578242063522339 }, { "auxiliary_loss_clip": 0.01143027, "auxiliary_loss_mlp": 0.0103468, "balance_loss_clip": 1.02106059, "balance_loss_mlp": 1.0420754, "epoch": 0.4880204419059071, "flos": 18588907883520.0, "grad_norm": 2.0725810394257276, "language_loss": 0.62772524, "learning_rate": 2.0755464887840314e-06, "loss": 0.64950228, "num_input_tokens_seen": 174505075, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 8117, "time_per_iteration": 2.575971841812134 }, { "auxiliary_loss_clip": 0.01134223, "auxiliary_loss_mlp": 0.01035377, "balance_loss_clip": 1.02144706, "balance_loss_mlp": 1.04097545, "epoch": 0.48808056515857506, "flos": 19172168928000.0, "grad_norm": 1.8655815797182946, "language_loss": 0.79080582, "learning_rate": 2.0751689347487583e-06, "loss": 0.81250179, "num_input_tokens_seen": 174523385, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75390625, "step": 8118, "time_per_iteration": 2.5676236152648926 }, { "auxiliary_loss_clip": 0.01136418, "auxiliary_loss_mlp": 0.01032666, "balance_loss_clip": 1.01843238, "balance_loss_mlp": 1.04238784, "epoch": 0.48814068841124303, "flos": 20120497850880.0, "grad_norm": 1.8603113725631468, "language_loss": 0.63198364, "learning_rate": 2.0747913780308996e-06, "loss": 0.65367448, "num_input_tokens_seen": 174542200, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 8119, "time_per_iteration": 3.9516682624816895 }, { "auxiliary_loss_clip": 0.01164307, "auxiliary_loss_mlp": 0.0127994, "balance_loss_clip": 1.0194304, "balance_loss_mlp": 1.04283559, "epoch": 0.488200811663911, "flos": 22965592360320.0, "grad_norm": 1.7587873110516843, "language_loss": 0.72247452, "learning_rate": 2.0744138186439288e-06, "loss": 0.74691695, "num_input_tokens_seen": 174563620, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 8120, "time_per_iteration": 2.666546583175659 }, { "auxiliary_loss_clip": 0.01127439, "auxiliary_loss_mlp": 0.01031319, "balance_loss_clip": 1.01646519, "balance_loss_mlp": 1.0417819, "epoch": 0.48826093491657896, "flos": 33910697387520.0, "grad_norm": 2.3869826072777096, "language_loss": 0.63760841, "learning_rate": 2.0740362566013207e-06, "loss": 0.65919602, "num_input_tokens_seen": 174586465, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.765625, "step": 8121, "time_per_iteration": 2.7218868732452393 }, { "auxiliary_loss_clip": 0.0113186, "auxiliary_loss_mlp": 0.01036351, "balance_loss_clip": 1.02135408, "balance_loss_mlp": 1.04431939, "epoch": 0.488321058169247, "flos": 23070307484160.0, "grad_norm": 2.009916906342907, "language_loss": 0.82562256, "learning_rate": 2.073658691916548e-06, "loss": 0.8473047, "num_input_tokens_seen": 174604035, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.78515625, "step": 8122, "time_per_iteration": 2.5842747688293457 }, { "auxiliary_loss_clip": 0.01129751, "auxiliary_loss_mlp": 0.01034307, "balance_loss_clip": 1.01962018, "balance_loss_mlp": 1.04226804, "epoch": 0.48838118142191494, "flos": 19317714837120.0, "grad_norm": 5.353740663482726, "language_loss": 0.85399717, "learning_rate": 2.073281124603087e-06, "loss": 0.87563771, "num_input_tokens_seen": 174621715, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 8123, "time_per_iteration": 2.5951476097106934 }, { "auxiliary_loss_clip": 0.01147521, "auxiliary_loss_mlp": 0.01031841, "balance_loss_clip": 1.01786399, "balance_loss_mlp": 1.04278588, "epoch": 0.4884413046745829, "flos": 25410678036480.0, "grad_norm": 1.4881709823463334, "language_loss": 0.85505551, "learning_rate": 2.0729035546744115e-06, "loss": 0.87684911, "num_input_tokens_seen": 174643835, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.77734375, "step": 8124, "time_per_iteration": 4.189625024795532 }, { "auxiliary_loss_clip": 0.01138602, "auxiliary_loss_mlp": 0.01033038, "balance_loss_clip": 1.01869094, "balance_loss_mlp": 1.04290831, "epoch": 0.4885014279272509, "flos": 20991546662400.0, "grad_norm": 2.1380579794363372, "language_loss": 0.79444557, "learning_rate": 2.072525982143995e-06, "loss": 0.81616193, "num_input_tokens_seen": 174660955, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.77734375, "step": 8125, "time_per_iteration": 2.6899349689483643 }, { "auxiliary_loss_clip": 0.0115403, "auxiliary_loss_mlp": 0.01031738, "balance_loss_clip": 1.01830268, "balance_loss_mlp": 1.0424217, "epoch": 0.48856155117991884, "flos": 13771599269760.0, "grad_norm": 2.821404920934579, "language_loss": 0.72176474, "learning_rate": 2.0721484070253127e-06, "loss": 0.74362248, "num_input_tokens_seen": 174678270, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 8126, "time_per_iteration": 4.784988164901733 }, { "auxiliary_loss_clip": 0.0112858, "auxiliary_loss_mlp": 0.01032423, "balance_loss_clip": 1.018857, "balance_loss_mlp": 1.04164171, "epoch": 0.4886216744325868, "flos": 32087764206720.0, "grad_norm": 2.1255561369051583, "language_loss": 0.68490493, "learning_rate": 2.0717708293318393e-06, "loss": 0.70651501, "num_input_tokens_seen": 174698360, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.78125, "step": 8127, "time_per_iteration": 2.6940817832946777 }, { "auxiliary_loss_clip": 0.01119157, "auxiliary_loss_mlp": 0.0103588, "balance_loss_clip": 1.02161074, "balance_loss_mlp": 1.0405612, "epoch": 0.48868179768525477, "flos": 19610063631360.0, "grad_norm": 1.9234590031280845, "language_loss": 0.75741875, "learning_rate": 2.07139324907705e-06, "loss": 0.77896917, "num_input_tokens_seen": 174716755, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 8128, "time_per_iteration": 2.686951160430908 }, { "auxiliary_loss_clip": 0.01120861, "auxiliary_loss_mlp": 0.01032148, "balance_loss_clip": 1.019333, "balance_loss_mlp": 1.04397082, "epoch": 0.48874192093792274, "flos": 21286912199040.0, "grad_norm": 1.8947577860256184, "language_loss": 0.76161259, "learning_rate": 2.0710156662744192e-06, "loss": 0.78314269, "num_input_tokens_seen": 174735560, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.76953125, "step": 8129, "time_per_iteration": 2.55863618850708 }, { "auxiliary_loss_clip": 0.01127397, "auxiliary_loss_mlp": 0.01031577, "balance_loss_clip": 1.01705766, "balance_loss_mlp": 1.04173839, "epoch": 0.4888020441905907, "flos": 14173439696640.0, "grad_norm": 2.2525215467805184, "language_loss": 0.64909923, "learning_rate": 2.0706380809374213e-06, "loss": 0.67068899, "num_input_tokens_seen": 174752730, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.76953125, "step": 8130, "time_per_iteration": 2.5434911251068115 }, { "auxiliary_loss_clip": 0.0114154, "auxiliary_loss_mlp": 0.01030665, "balance_loss_clip": 1.01739097, "balance_loss_mlp": 1.0398016, "epoch": 0.48886216744325867, "flos": 24097891766400.0, "grad_norm": 2.3730469529258995, "language_loss": 0.71831197, "learning_rate": 2.070260493079533e-06, "loss": 0.7400341, "num_input_tokens_seen": 174772520, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 8131, "time_per_iteration": 2.6692733764648438 }, { "auxiliary_loss_clip": 0.0112906, "auxiliary_loss_mlp": 0.01040445, "balance_loss_clip": 1.02632463, "balance_loss_mlp": 1.04209089, "epoch": 0.48892229069592663, "flos": 38431419402240.0, "grad_norm": 1.4584019148525356, "language_loss": 0.69832629, "learning_rate": 2.0698829027142274e-06, "loss": 0.72002137, "num_input_tokens_seen": 174796540, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 8132, "time_per_iteration": 2.6832659244537354 }, { "auxiliary_loss_clip": 0.0112608, "auxiliary_loss_mlp": 0.01031906, "balance_loss_clip": 1.01784563, "balance_loss_mlp": 1.04233265, "epoch": 0.4889824139485946, "flos": 23843321101440.0, "grad_norm": 1.4533269335262857, "language_loss": 0.70094186, "learning_rate": 2.0695053098549814e-06, "loss": 0.72252166, "num_input_tokens_seen": 174817840, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.74609375, "step": 8133, "time_per_iteration": 2.588852882385254 }, { "auxiliary_loss_clip": 0.01153036, "auxiliary_loss_mlp": 0.01034872, "balance_loss_clip": 1.02128184, "balance_loss_mlp": 1.04184651, "epoch": 0.48904253720126256, "flos": 24425827960320.0, "grad_norm": 1.9376773006523358, "language_loss": 0.71074504, "learning_rate": 2.06912771451527e-06, "loss": 0.73262417, "num_input_tokens_seen": 174837885, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 8134, "time_per_iteration": 2.5842056274414062 }, { "auxiliary_loss_clip": 0.01159582, "auxiliary_loss_mlp": 0.0103826, "balance_loss_clip": 1.02378178, "balance_loss_mlp": 1.0441103, "epoch": 0.4891026604539306, "flos": 24170682677760.0, "grad_norm": 2.6430834482864145, "language_loss": 0.80500293, "learning_rate": 2.068750116708567e-06, "loss": 0.82698143, "num_input_tokens_seen": 174855240, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80078125, "step": 8135, "time_per_iteration": 2.5870211124420166 }, { "auxiliary_loss_clip": 0.01113935, "auxiliary_loss_mlp": 0.0103819, "balance_loss_clip": 1.0252316, "balance_loss_mlp": 1.04000199, "epoch": 0.48916278370659855, "flos": 21470954509440.0, "grad_norm": 1.6481887347175441, "language_loss": 0.74434012, "learning_rate": 2.0683725164483504e-06, "loss": 0.76586139, "num_input_tokens_seen": 174875145, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 8136, "time_per_iteration": 2.5034244060516357 }, { "auxiliary_loss_clip": 0.01134904, "auxiliary_loss_mlp": 0.01036649, "balance_loss_clip": 1.02369666, "balance_loss_mlp": 1.04263413, "epoch": 0.4892229069592665, "flos": 22309755886080.0, "grad_norm": 4.008682287945396, "language_loss": 0.72949326, "learning_rate": 2.067994913748094e-06, "loss": 0.75120878, "num_input_tokens_seen": 174894770, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.74609375, "step": 8137, "time_per_iteration": 2.5626227855682373 }, { "auxiliary_loss_clip": 0.01171563, "auxiliary_loss_mlp": 0.01034112, "balance_loss_clip": 1.01977062, "balance_loss_mlp": 1.04105103, "epoch": 0.4892830302119345, "flos": 12786856934400.0, "grad_norm": 2.2788506876130468, "language_loss": 0.74898762, "learning_rate": 2.0676173086212745e-06, "loss": 0.77104437, "num_input_tokens_seen": 174912780, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7734375, "step": 8138, "time_per_iteration": 2.566021203994751 }, { "auxiliary_loss_clip": 0.01144147, "auxiliary_loss_mlp": 0.01036267, "balance_loss_clip": 1.02345157, "balance_loss_mlp": 1.04297519, "epoch": 0.48934315346460244, "flos": 20813896972800.0, "grad_norm": 1.6499490179296559, "language_loss": 0.74659276, "learning_rate": 2.067239701081367e-06, "loss": 0.76839685, "num_input_tokens_seen": 174931250, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7421875, "step": 8139, "time_per_iteration": 2.6309685707092285 }, { "auxiliary_loss_clip": 0.01126638, "auxiliary_loss_mlp": 0.01036163, "balance_loss_clip": 1.02279365, "balance_loss_mlp": 1.04147661, "epoch": 0.4894032767172704, "flos": 19755537713280.0, "grad_norm": 1.8779137663309258, "language_loss": 0.62159735, "learning_rate": 2.066862091141848e-06, "loss": 0.64322531, "num_input_tokens_seen": 174951105, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.76171875, "step": 8140, "time_per_iteration": 2.5660195350646973 }, { "auxiliary_loss_clip": 0.01147348, "auxiliary_loss_mlp": 0.01043536, "balance_loss_clip": 1.02924252, "balance_loss_mlp": 1.04266548, "epoch": 0.4894633999699384, "flos": 17818982835840.0, "grad_norm": 3.0745452830599858, "language_loss": 0.82571143, "learning_rate": 2.0664844788161923e-06, "loss": 0.84762031, "num_input_tokens_seen": 174969120, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 8141, "time_per_iteration": 2.548247814178467 }, { "auxiliary_loss_clip": 0.0114694, "auxiliary_loss_mlp": 0.01037937, "balance_loss_clip": 1.02340555, "balance_loss_mlp": 1.04298139, "epoch": 0.48952352322260634, "flos": 25523222325120.0, "grad_norm": 1.5347113703767559, "language_loss": 0.72297662, "learning_rate": 2.0661068641178764e-06, "loss": 0.74482542, "num_input_tokens_seen": 174991295, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.76953125, "step": 8142, "time_per_iteration": 2.6146581172943115 }, { "auxiliary_loss_clip": 0.01115982, "auxiliary_loss_mlp": 0.01036277, "balance_loss_clip": 1.0228008, "balance_loss_mlp": 1.04047179, "epoch": 0.4895836464752743, "flos": 29055502903680.0, "grad_norm": 1.7230627118184447, "language_loss": 0.66634059, "learning_rate": 2.065729247060377e-06, "loss": 0.68786323, "num_input_tokens_seen": 175012830, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 8143, "time_per_iteration": 2.7206902503967285 }, { "auxiliary_loss_clip": 0.01121944, "auxiliary_loss_mlp": 0.01275228, "balance_loss_clip": 1.01554239, "balance_loss_mlp": 1.03983617, "epoch": 0.48964376972794227, "flos": 33546958312320.0, "grad_norm": 1.68860140062109, "language_loss": 0.74932122, "learning_rate": 2.0653516276571694e-06, "loss": 0.7732929, "num_input_tokens_seen": 175035695, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 8144, "time_per_iteration": 2.6670422554016113 }, { "auxiliary_loss_clip": 0.01135263, "auxiliary_loss_mlp": 0.01030281, "balance_loss_clip": 1.0168221, "balance_loss_mlp": 1.04214978, "epoch": 0.48970389298061023, "flos": 22054035985920.0, "grad_norm": 1.646371461103969, "language_loss": 0.75911868, "learning_rate": 2.0649740059217304e-06, "loss": 0.78077412, "num_input_tokens_seen": 175056425, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 8145, "time_per_iteration": 2.661189079284668 }, { "auxiliary_loss_clip": 0.01138288, "auxiliary_loss_mlp": 0.01284829, "balance_loss_clip": 1.02374947, "balance_loss_mlp": 1.04247451, "epoch": 0.4897640162332782, "flos": 20084299920000.0, "grad_norm": 1.8016112252802654, "language_loss": 0.80451, "learning_rate": 2.064596381867537e-06, "loss": 0.82874113, "num_input_tokens_seen": 175074800, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 8146, "time_per_iteration": 2.5352182388305664 }, { "auxiliary_loss_clip": 0.01120529, "auxiliary_loss_mlp": 0.01031755, "balance_loss_clip": 1.01834989, "balance_loss_mlp": 1.03885031, "epoch": 0.48982413948594616, "flos": 23806225330560.0, "grad_norm": 1.5335004567573827, "language_loss": 0.74194807, "learning_rate": 2.064218755508064e-06, "loss": 0.76347089, "num_input_tokens_seen": 175094500, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 8147, "time_per_iteration": 2.6300222873687744 }, { "auxiliary_loss_clip": 0.01136043, "auxiliary_loss_mlp": 0.01030181, "balance_loss_clip": 1.01696062, "balance_loss_mlp": 1.04274607, "epoch": 0.4898842627386142, "flos": 17639645207040.0, "grad_norm": 2.087826793644591, "language_loss": 0.82816315, "learning_rate": 2.0638411268567894e-06, "loss": 0.84982538, "num_input_tokens_seen": 175112920, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75, "step": 8148, "time_per_iteration": 2.6118834018707275 }, { "auxiliary_loss_clip": 0.01137993, "auxiliary_loss_mlp": 0.0103095, "balance_loss_clip": 1.01927996, "balance_loss_mlp": 1.04011083, "epoch": 0.48994438599128215, "flos": 16617914841600.0, "grad_norm": 1.890242926203484, "language_loss": 0.73964095, "learning_rate": 2.0634634959271886e-06, "loss": 0.76133037, "num_input_tokens_seen": 175129910, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.7109375, "step": 8149, "time_per_iteration": 2.629321813583374 }, { "auxiliary_loss_clip": 0.0112087, "auxiliary_loss_mlp": 0.01031931, "balance_loss_clip": 1.01753056, "balance_loss_mlp": 1.04268837, "epoch": 0.4900045092439501, "flos": 26614834600320.0, "grad_norm": 1.6779234124738662, "language_loss": 0.75375104, "learning_rate": 2.0630858627327394e-06, "loss": 0.77527905, "num_input_tokens_seen": 175148705, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 8150, "time_per_iteration": 2.694298028945923 }, { "auxiliary_loss_clip": 0.01137539, "auxiliary_loss_mlp": 0.01034433, "balance_loss_clip": 1.0207063, "balance_loss_mlp": 1.0428288, "epoch": 0.4900646324966181, "flos": 19902125116800.0, "grad_norm": 1.8193150725836729, "language_loss": 0.72270489, "learning_rate": 2.0627082272869176e-06, "loss": 0.74442458, "num_input_tokens_seen": 175167425, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.76953125, "step": 8151, "time_per_iteration": 2.585649013519287 }, { "auxiliary_loss_clip": 0.01066237, "auxiliary_loss_mlp": 0.0100186, "balance_loss_clip": 1.00033462, "balance_loss_mlp": 1.01547837, "epoch": 0.49012475574928605, "flos": 59189620337280.0, "grad_norm": 0.8188411662127631, "language_loss": 0.54321492, "learning_rate": 2.062330589603201e-06, "loss": 0.56389594, "num_input_tokens_seen": 175227985, "router_z_loss_clip": 0.01525879, "router_z_loss_mlp": 0.23730469, "step": 8152, "time_per_iteration": 3.1273584365844727 }, { "auxiliary_loss_clip": 0.01155415, "auxiliary_loss_mlp": 0.01033519, "balance_loss_clip": 1.02012563, "balance_loss_mlp": 1.04151094, "epoch": 0.490184879001954, "flos": 45259797657600.0, "grad_norm": 1.660564870119571, "language_loss": 0.61290479, "learning_rate": 2.0619529496950657e-06, "loss": 0.63479412, "num_input_tokens_seen": 175251895, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7734375, "step": 8153, "time_per_iteration": 2.7785964012145996 }, { "auxiliary_loss_clip": 0.01140303, "auxiliary_loss_mlp": 0.0103648, "balance_loss_clip": 1.02296162, "balance_loss_mlp": 1.03981566, "epoch": 0.490245002254622, "flos": 28002135634560.0, "grad_norm": 1.5199173223766556, "language_loss": 0.76888978, "learning_rate": 2.0615753075759894e-06, "loss": 0.79065758, "num_input_tokens_seen": 175272770, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 8154, "time_per_iteration": 2.651346445083618 }, { "auxiliary_loss_clip": 0.01056598, "auxiliary_loss_mlp": 0.0100129, "balance_loss_clip": 0.99959755, "balance_loss_mlp": 1.01499915, "epoch": 0.49030512550728994, "flos": 58951318533120.0, "grad_norm": 0.9633808392332351, "language_loss": 0.66983521, "learning_rate": 2.0611976632594487e-06, "loss": 0.69041419, "num_input_tokens_seen": 175336320, "router_z_loss_clip": 0.01696777, "router_z_loss_mlp": 0.23632812, "step": 8155, "time_per_iteration": 3.2556562423706055 }, { "auxiliary_loss_clip": 0.01118289, "auxiliary_loss_mlp": 0.01034714, "balance_loss_clip": 1.02193522, "balance_loss_mlp": 1.04315066, "epoch": 0.4903652487599579, "flos": 19791843384960.0, "grad_norm": 1.9396310788192208, "language_loss": 0.76646918, "learning_rate": 2.0608200167589204e-06, "loss": 0.78799915, "num_input_tokens_seen": 175353540, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.75, "step": 8156, "time_per_iteration": 2.5657546520233154 }, { "auxiliary_loss_clip": 0.01122938, "auxiliary_loss_mlp": 0.01037184, "balance_loss_clip": 1.02468479, "balance_loss_mlp": 1.04009736, "epoch": 0.49042537201262587, "flos": 21762082241280.0, "grad_norm": 3.872085846514452, "language_loss": 0.83212173, "learning_rate": 2.060442368087882e-06, "loss": 0.85372293, "num_input_tokens_seen": 175370445, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7421875, "step": 8157, "time_per_iteration": 3.9316513538360596 }, { "auxiliary_loss_clip": 0.01124962, "auxiliary_loss_mlp": 0.010368, "balance_loss_clip": 1.02346063, "balance_loss_mlp": 1.04126406, "epoch": 0.49048549526529384, "flos": 18953042008320.0, "grad_norm": 1.7214661658635635, "language_loss": 0.79866779, "learning_rate": 2.060064717259811e-06, "loss": 0.82028544, "num_input_tokens_seen": 175389020, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 8158, "time_per_iteration": 2.536553382873535 }, { "auxiliary_loss_clip": 0.01120091, "auxiliary_loss_mlp": 0.01033315, "balance_loss_clip": 1.01979685, "balance_loss_mlp": 1.04307568, "epoch": 0.4905456185179618, "flos": 26906393295360.0, "grad_norm": 1.4638436905421517, "language_loss": 0.69111657, "learning_rate": 2.059687064288185e-06, "loss": 0.7126506, "num_input_tokens_seen": 175409545, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76953125, "step": 8159, "time_per_iteration": 2.625164270401001 }, { "auxiliary_loss_clip": 0.01124643, "auxiliary_loss_mlp": 0.0103283, "balance_loss_clip": 1.01989019, "balance_loss_mlp": 1.04235125, "epoch": 0.49060574177062977, "flos": 20412343854720.0, "grad_norm": 2.4461942801544834, "language_loss": 0.73169822, "learning_rate": 2.059309409186481e-06, "loss": 0.75327295, "num_input_tokens_seen": 175429335, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 8160, "time_per_iteration": 2.6280500888824463 }, { "auxiliary_loss_clip": 0.01129814, "auxiliary_loss_mlp": 0.01042033, "balance_loss_clip": 1.02805018, "balance_loss_mlp": 1.04206717, "epoch": 0.4906658650232978, "flos": 17493704248320.0, "grad_norm": 3.2326385495407304, "language_loss": 0.71654177, "learning_rate": 2.0589317519681773e-06, "loss": 0.73826027, "num_input_tokens_seen": 175446955, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78515625, "step": 8161, "time_per_iteration": 3.9127564430236816 }, { "auxiliary_loss_clip": 0.01154097, "auxiliary_loss_mlp": 0.01039185, "balance_loss_clip": 1.02608383, "balance_loss_mlp": 1.04447424, "epoch": 0.49072598827596575, "flos": 26614439550720.0, "grad_norm": 1.7083551060134923, "language_loss": 0.68496644, "learning_rate": 2.0585540926467507e-06, "loss": 0.70689923, "num_input_tokens_seen": 175468195, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 8162, "time_per_iteration": 2.7118115425109863 }, { "auxiliary_loss_clip": 0.01130588, "auxiliary_loss_mlp": 0.01036431, "balance_loss_clip": 1.02186406, "balance_loss_mlp": 1.04250741, "epoch": 0.4907861115286337, "flos": 20412595249920.0, "grad_norm": 1.851801473599825, "language_loss": 0.6364705, "learning_rate": 2.058176431235679e-06, "loss": 0.65814066, "num_input_tokens_seen": 175487455, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 8163, "time_per_iteration": 2.5466699600219727 }, { "auxiliary_loss_clip": 0.01140632, "auxiliary_loss_mlp": 0.01031903, "balance_loss_clip": 1.01932049, "balance_loss_mlp": 1.04159212, "epoch": 0.4908462347813017, "flos": 14064271286400.0, "grad_norm": 2.486984289280964, "language_loss": 0.77288389, "learning_rate": 2.05779876774844e-06, "loss": 0.79460919, "num_input_tokens_seen": 175504450, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 8164, "time_per_iteration": 2.563800811767578 }, { "auxiliary_loss_clip": 0.01125401, "auxiliary_loss_mlp": 0.01028593, "balance_loss_clip": 1.01502705, "balance_loss_mlp": 1.04054344, "epoch": 0.49090635803396965, "flos": 18735100237440.0, "grad_norm": 2.0091614575245367, "language_loss": 0.76601219, "learning_rate": 2.057421102198512e-06, "loss": 0.78755218, "num_input_tokens_seen": 175523600, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 8165, "time_per_iteration": 2.5571742057800293 }, { "auxiliary_loss_clip": 0.01134044, "auxiliary_loss_mlp": 0.01029277, "balance_loss_clip": 1.0163846, "balance_loss_mlp": 1.04292536, "epoch": 0.4909664812866376, "flos": 20558500295040.0, "grad_norm": 1.8565891502616536, "language_loss": 0.77338541, "learning_rate": 2.0570434345993717e-06, "loss": 0.79501861, "num_input_tokens_seen": 175542720, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 8166, "time_per_iteration": 4.06053352355957 }, { "auxiliary_loss_clip": 0.0104792, "auxiliary_loss_mlp": 0.01002178, "balance_loss_clip": 1.00056839, "balance_loss_mlp": 1.01549017, "epoch": 0.4910266045393056, "flos": 54684017948160.0, "grad_norm": 0.8021588399691305, "language_loss": 0.54175985, "learning_rate": 2.056665764964499e-06, "loss": 0.56226087, "num_input_tokens_seen": 175598640, "router_z_loss_clip": 0.01611328, "router_z_loss_mlp": 0.23535156, "step": 8167, "time_per_iteration": 4.699709177017212 }, { "auxiliary_loss_clip": 0.01149588, "auxiliary_loss_mlp": 0.01033658, "balance_loss_clip": 1.02047896, "balance_loss_mlp": 1.04137897, "epoch": 0.49108672779197354, "flos": 16246454342400.0, "grad_norm": 2.216959505491387, "language_loss": 0.86178547, "learning_rate": 2.0562880933073705e-06, "loss": 0.88361788, "num_input_tokens_seen": 175615675, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73046875, "step": 8168, "time_per_iteration": 2.52458119392395 }, { "auxiliary_loss_clip": 0.01123161, "auxiliary_loss_mlp": 0.01032421, "balance_loss_clip": 1.01951098, "balance_loss_mlp": 1.04208899, "epoch": 0.4911468510446415, "flos": 19825419623040.0, "grad_norm": 1.7851881859514935, "language_loss": 0.73196274, "learning_rate": 2.055910419641465e-06, "loss": 0.75351858, "num_input_tokens_seen": 175632255, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 8169, "time_per_iteration": 2.555166244506836 }, { "auxiliary_loss_clip": 0.01123443, "auxiliary_loss_mlp": 0.01026215, "balance_loss_clip": 1.01435423, "balance_loss_mlp": 1.04119587, "epoch": 0.4912069742973095, "flos": 21212684743680.0, "grad_norm": 1.4881672570636812, "language_loss": 0.77858162, "learning_rate": 2.05553274398026e-06, "loss": 0.80007821, "num_input_tokens_seen": 175651625, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.734375, "step": 8170, "time_per_iteration": 2.5260913372039795 }, { "auxiliary_loss_clip": 0.0113458, "auxiliary_loss_mlp": 0.01033766, "balance_loss_clip": 1.02126074, "balance_loss_mlp": 1.0412662, "epoch": 0.49126709754997744, "flos": 19537129065600.0, "grad_norm": 2.5921918708194234, "language_loss": 0.75521481, "learning_rate": 2.055155066337235e-06, "loss": 0.77689826, "num_input_tokens_seen": 175669265, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.75, "step": 8171, "time_per_iteration": 2.585374355316162 }, { "auxiliary_loss_clip": 0.01136613, "auxiliary_loss_mlp": 0.01032359, "balance_loss_clip": 1.02000856, "balance_loss_mlp": 1.04196072, "epoch": 0.4913272208026454, "flos": 12239686080000.0, "grad_norm": 3.048627016859004, "language_loss": 0.81667793, "learning_rate": 2.0547773867258667e-06, "loss": 0.83836764, "num_input_tokens_seen": 175686065, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.76953125, "step": 8172, "time_per_iteration": 2.516085147857666 }, { "auxiliary_loss_clip": 0.01047617, "auxiliary_loss_mlp": 0.01001904, "balance_loss_clip": 1.00034237, "balance_loss_mlp": 1.01523018, "epoch": 0.49138734405531337, "flos": 65465871661440.0, "grad_norm": 0.6889368398685474, "language_loss": 0.53367257, "learning_rate": 2.054399705159635e-06, "loss": 0.55416775, "num_input_tokens_seen": 175748595, "router_z_loss_clip": 0.015625, "router_z_loss_mlp": 0.23535156, "step": 8173, "time_per_iteration": 3.1839475631713867 }, { "auxiliary_loss_clip": 0.01127673, "auxiliary_loss_mlp": 0.01284525, "balance_loss_clip": 1.02375436, "balance_loss_mlp": 1.04291463, "epoch": 0.4914474673079814, "flos": 18439052342400.0, "grad_norm": 1.9877040018859946, "language_loss": 0.62633365, "learning_rate": 2.054022021652017e-06, "loss": 0.65045559, "num_input_tokens_seen": 175766770, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 8174, "time_per_iteration": 2.606370449066162 }, { "auxiliary_loss_clip": 0.01144991, "auxiliary_loss_mlp": 0.01032552, "balance_loss_clip": 1.01942718, "balance_loss_mlp": 1.04347539, "epoch": 0.49150759056064935, "flos": 21685053525120.0, "grad_norm": 2.157959500353606, "language_loss": 0.68900847, "learning_rate": 2.0536443362164927e-06, "loss": 0.71078396, "num_input_tokens_seen": 175783605, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 8175, "time_per_iteration": 2.584824562072754 }, { "auxiliary_loss_clip": 0.01126212, "auxiliary_loss_mlp": 0.01028116, "balance_loss_clip": 1.01511633, "balance_loss_mlp": 1.0429244, "epoch": 0.4915677138133173, "flos": 22382439056640.0, "grad_norm": 1.6336932828039072, "language_loss": 0.74559271, "learning_rate": 2.0532666488665393e-06, "loss": 0.76713598, "num_input_tokens_seen": 175801390, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7421875, "step": 8176, "time_per_iteration": 2.5936849117279053 }, { "auxiliary_loss_clip": 0.01127635, "auxiliary_loss_mlp": 0.01040554, "balance_loss_clip": 1.02726817, "balance_loss_mlp": 1.04389679, "epoch": 0.4916278370659853, "flos": 18402890325120.0, "grad_norm": 1.6720303239310896, "language_loss": 0.69812989, "learning_rate": 2.052888959615637e-06, "loss": 0.7198118, "num_input_tokens_seen": 175819830, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.74609375, "step": 8177, "time_per_iteration": 2.540179967880249 }, { "auxiliary_loss_clip": 0.01056293, "auxiliary_loss_mlp": 0.01249884, "balance_loss_clip": 1.00235176, "balance_loss_mlp": 1.0153451, "epoch": 0.49168796031865325, "flos": 66609124715520.0, "grad_norm": 0.6851453853932311, "language_loss": 0.46174729, "learning_rate": 2.0525112684772633e-06, "loss": 0.4848091, "num_input_tokens_seen": 175881765, "router_z_loss_clip": 0.01501465, "router_z_loss_mlp": 0.234375, "step": 8178, "time_per_iteration": 3.1783254146575928 }, { "auxiliary_loss_clip": 0.0112339, "auxiliary_loss_mlp": 0.01035589, "balance_loss_clip": 1.02116513, "balance_loss_mlp": 1.04492974, "epoch": 0.4917480835713212, "flos": 20959335141120.0, "grad_norm": 2.1788964259748993, "language_loss": 0.655141, "learning_rate": 2.052133575464898e-06, "loss": 0.67673081, "num_input_tokens_seen": 175901795, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 8179, "time_per_iteration": 2.5337555408477783 }, { "auxiliary_loss_clip": 0.01183959, "auxiliary_loss_mlp": 0.01037203, "balance_loss_clip": 1.02299309, "balance_loss_mlp": 1.0426898, "epoch": 0.4918082068239892, "flos": 15772900412160.0, "grad_norm": 2.309634506663442, "language_loss": 0.702452, "learning_rate": 2.0517558805920193e-06, "loss": 0.72466362, "num_input_tokens_seen": 175917770, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7890625, "step": 8180, "time_per_iteration": 2.707491874694824 }, { "auxiliary_loss_clip": 0.01134137, "auxiliary_loss_mlp": 0.010344, "balance_loss_clip": 1.02066088, "balance_loss_mlp": 1.04257751, "epoch": 0.49186833007665715, "flos": 24604806453120.0, "grad_norm": 1.955186089688137, "language_loss": 0.84282094, "learning_rate": 2.0513781838721057e-06, "loss": 0.86450624, "num_input_tokens_seen": 175937000, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 8181, "time_per_iteration": 2.5705201625823975 }, { "auxiliary_loss_clip": 0.01115853, "auxiliary_loss_mlp": 0.01033349, "balance_loss_clip": 1.02139223, "balance_loss_mlp": 1.04388404, "epoch": 0.4919284533293251, "flos": 22090557139200.0, "grad_norm": 1.9858266391587922, "language_loss": 0.72863591, "learning_rate": 2.051000485318637e-06, "loss": 0.75012791, "num_input_tokens_seen": 175955170, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.71875, "step": 8182, "time_per_iteration": 2.6853132247924805 }, { "auxiliary_loss_clip": 0.01145453, "auxiliary_loss_mlp": 0.01036166, "balance_loss_clip": 1.02291, "balance_loss_mlp": 1.0433327, "epoch": 0.4919885765819931, "flos": 23368043318400.0, "grad_norm": 1.9256954590078124, "language_loss": 0.72650141, "learning_rate": 2.050622784945093e-06, "loss": 0.7483176, "num_input_tokens_seen": 175973725, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75390625, "step": 8183, "time_per_iteration": 2.6468021869659424 }, { "auxiliary_loss_clip": 0.01164792, "auxiliary_loss_mlp": 0.01029858, "balance_loss_clip": 1.0163095, "balance_loss_mlp": 1.04180384, "epoch": 0.49204869983466104, "flos": 21360493209600.0, "grad_norm": 2.013046516123848, "language_loss": 0.77639556, "learning_rate": 2.0502450827649514e-06, "loss": 0.79834211, "num_input_tokens_seen": 175993885, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78515625, "step": 8184, "time_per_iteration": 2.6713898181915283 }, { "auxiliary_loss_clip": 0.01136715, "auxiliary_loss_mlp": 0.01034336, "balance_loss_clip": 1.02190256, "balance_loss_mlp": 1.04417849, "epoch": 0.492108823087329, "flos": 21142695093120.0, "grad_norm": 4.903023626916628, "language_loss": 0.70476788, "learning_rate": 2.049867378791693e-06, "loss": 0.7264784, "num_input_tokens_seen": 176014210, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7421875, "step": 8185, "time_per_iteration": 2.541689395904541 }, { "auxiliary_loss_clip": 0.01138764, "auxiliary_loss_mlp": 0.01039851, "balance_loss_clip": 1.02523589, "balance_loss_mlp": 1.0439105, "epoch": 0.49216894633999697, "flos": 25116605389440.0, "grad_norm": 1.9511251891999613, "language_loss": 0.74752045, "learning_rate": 2.049489673038795e-06, "loss": 0.7693066, "num_input_tokens_seen": 176033890, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 8186, "time_per_iteration": 2.7180473804473877 }, { "auxiliary_loss_clip": 0.01129339, "auxiliary_loss_mlp": 0.01036507, "balance_loss_clip": 1.02270246, "balance_loss_mlp": 1.04379892, "epoch": 0.49222906959266494, "flos": 22637943475200.0, "grad_norm": 2.0051448015973303, "language_loss": 0.67607898, "learning_rate": 2.0491119655197382e-06, "loss": 0.69773746, "num_input_tokens_seen": 176052720, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 8187, "time_per_iteration": 2.6111035346984863 }, { "auxiliary_loss_clip": 0.01153376, "auxiliary_loss_mlp": 0.01039617, "balance_loss_clip": 1.02626014, "balance_loss_mlp": 1.04348385, "epoch": 0.49228919284533296, "flos": 20410548174720.0, "grad_norm": 2.226464291962309, "language_loss": 0.6678282, "learning_rate": 2.0487342562480016e-06, "loss": 0.68975812, "num_input_tokens_seen": 176072545, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 8188, "time_per_iteration": 2.632995128631592 }, { "auxiliary_loss_clip": 0.01129707, "auxiliary_loss_mlp": 0.01031905, "balance_loss_clip": 1.0199008, "balance_loss_mlp": 1.04535294, "epoch": 0.4923493160980009, "flos": 27122359818240.0, "grad_norm": 1.5723117162220013, "language_loss": 0.74916959, "learning_rate": 2.048356545237065e-06, "loss": 0.77078581, "num_input_tokens_seen": 176091490, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7578125, "step": 8189, "time_per_iteration": 2.6511731147766113 }, { "auxiliary_loss_clip": 0.0113571, "auxiliary_loss_mlp": 0.01030284, "balance_loss_clip": 1.01724255, "balance_loss_mlp": 1.04348838, "epoch": 0.4924094393506689, "flos": 35736683224320.0, "grad_norm": 1.6559897642245431, "language_loss": 0.64414221, "learning_rate": 2.0479788325004076e-06, "loss": 0.66580212, "num_input_tokens_seen": 176113200, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 8190, "time_per_iteration": 2.7355058193206787 }, { "auxiliary_loss_clip": 0.01117576, "auxiliary_loss_mlp": 0.01030601, "balance_loss_clip": 1.01752973, "balance_loss_mlp": 1.04377103, "epoch": 0.49246956260333685, "flos": 20412487509120.0, "grad_norm": 1.913758326438094, "language_loss": 0.71569604, "learning_rate": 2.0476011180515086e-06, "loss": 0.73717785, "num_input_tokens_seen": 176132485, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 8191, "time_per_iteration": 2.6237659454345703 }, { "auxiliary_loss_clip": 0.01116927, "auxiliary_loss_mlp": 0.01287661, "balance_loss_clip": 1.02739012, "balance_loss_mlp": 1.04232001, "epoch": 0.4925296858560048, "flos": 38976938231040.0, "grad_norm": 1.6472191185004323, "language_loss": 0.71601331, "learning_rate": 2.047223401903849e-06, "loss": 0.74005926, "num_input_tokens_seen": 176155755, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 8192, "time_per_iteration": 2.6972286701202393 }, { "auxiliary_loss_clip": 0.01054743, "auxiliary_loss_mlp": 0.01006164, "balance_loss_clip": 1.00482929, "balance_loss_mlp": 1.01355302, "epoch": 0.4925898091086728, "flos": 64278917712000.0, "grad_norm": 0.7190811188489341, "language_loss": 0.52156579, "learning_rate": 2.0468456840709066e-06, "loss": 0.54217482, "num_input_tokens_seen": 176216295, "router_z_loss_clip": 0.0133667, "router_z_loss_mlp": 0.234375, "step": 8193, "time_per_iteration": 3.2422966957092285 }, { "auxiliary_loss_clip": 0.01118059, "auxiliary_loss_mlp": 0.01038548, "balance_loss_clip": 1.02535737, "balance_loss_mlp": 1.0453186, "epoch": 0.49264993236134075, "flos": 23036372110080.0, "grad_norm": 1.6340300386773212, "language_loss": 0.76976585, "learning_rate": 2.0464679645661637e-06, "loss": 0.79133189, "num_input_tokens_seen": 176235925, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 8194, "time_per_iteration": 2.538782835006714 }, { "auxiliary_loss_clip": 0.01133779, "auxiliary_loss_mlp": 0.01031384, "balance_loss_clip": 1.01871777, "balance_loss_mlp": 1.04185271, "epoch": 0.4927100556140087, "flos": 24718212668160.0, "grad_norm": 2.0512015731036453, "language_loss": 0.70352638, "learning_rate": 2.0460902434030975e-06, "loss": 0.72517806, "num_input_tokens_seen": 176253865, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73828125, "step": 8195, "time_per_iteration": 2.67488169670105 }, { "auxiliary_loss_clip": 0.01152811, "auxiliary_loss_mlp": 0.01028173, "balance_loss_clip": 1.01575172, "balance_loss_mlp": 1.04179001, "epoch": 0.4927701788666767, "flos": 23505544581120.0, "grad_norm": 1.928329271892186, "language_loss": 0.80685878, "learning_rate": 2.045712520595189e-06, "loss": 0.82866859, "num_input_tokens_seen": 176271525, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7578125, "step": 8196, "time_per_iteration": 2.6791560649871826 }, { "auxiliary_loss_clip": 0.01146382, "auxiliary_loss_mlp": 0.01034563, "balance_loss_clip": 1.02124143, "balance_loss_mlp": 1.04275358, "epoch": 0.49283030211934464, "flos": 22928891639040.0, "grad_norm": 1.8985434561059134, "language_loss": 0.70051908, "learning_rate": 2.045334796155919e-06, "loss": 0.72232854, "num_input_tokens_seen": 176290810, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.76953125, "step": 8197, "time_per_iteration": 2.7228713035583496 }, { "auxiliary_loss_clip": 0.01125736, "auxiliary_loss_mlp": 0.01030775, "balance_loss_clip": 1.01850247, "balance_loss_mlp": 1.0433923, "epoch": 0.4928904253720126, "flos": 16873024210560.0, "grad_norm": 1.9191598228430857, "language_loss": 0.84418082, "learning_rate": 2.044957070098766e-06, "loss": 0.8657459, "num_input_tokens_seen": 176309165, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.734375, "step": 8198, "time_per_iteration": 3.92210054397583 }, { "auxiliary_loss_clip": 0.01130724, "auxiliary_loss_mlp": 0.01033804, "balance_loss_clip": 1.01976728, "balance_loss_mlp": 1.04450464, "epoch": 0.4929505486246806, "flos": 14866551509760.0, "grad_norm": 2.5337848120562714, "language_loss": 0.76413035, "learning_rate": 2.0445793424372114e-06, "loss": 0.78577566, "num_input_tokens_seen": 176324960, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76953125, "step": 8199, "time_per_iteration": 2.56162428855896 }, { "auxiliary_loss_clip": 0.01137522, "auxiliary_loss_mlp": 0.01035868, "balance_loss_clip": 1.02134228, "balance_loss_mlp": 1.0418824, "epoch": 0.49301067187734854, "flos": 23842351434240.0, "grad_norm": 1.4077095341730481, "language_loss": 0.60077584, "learning_rate": 2.044201613184735e-06, "loss": 0.62250972, "num_input_tokens_seen": 176346195, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7734375, "step": 8200, "time_per_iteration": 2.6441709995269775 }, { "auxiliary_loss_clip": 0.01121052, "auxiliary_loss_mlp": 0.01036482, "balance_loss_clip": 1.02379191, "balance_loss_mlp": 1.04106987, "epoch": 0.49307079513001656, "flos": 22491284244480.0, "grad_norm": 1.5111097320710567, "language_loss": 0.79015815, "learning_rate": 2.0438238823548164e-06, "loss": 0.81173348, "num_input_tokens_seen": 176366735, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 8201, "time_per_iteration": 2.529827356338501 }, { "auxiliary_loss_clip": 0.01131667, "auxiliary_loss_mlp": 0.01033969, "balance_loss_clip": 1.01935446, "balance_loss_mlp": 1.04256034, "epoch": 0.4931309183826845, "flos": 15924587546880.0, "grad_norm": 2.2029066908317727, "language_loss": 0.67012483, "learning_rate": 2.043446149960936e-06, "loss": 0.69178116, "num_input_tokens_seen": 176384475, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80078125, "step": 8202, "time_per_iteration": 3.978886127471924 }, { "auxiliary_loss_clip": 0.01133829, "auxiliary_loss_mlp": 0.01028877, "balance_loss_clip": 1.01588356, "balance_loss_mlp": 1.04154944, "epoch": 0.4931910416353525, "flos": 27309059735040.0, "grad_norm": 2.165531225944402, "language_loss": 0.75298101, "learning_rate": 2.043068416016574e-06, "loss": 0.77460808, "num_input_tokens_seen": 176402645, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.74609375, "step": 8203, "time_per_iteration": 2.6719956398010254 }, { "auxiliary_loss_clip": 0.0115197, "auxiliary_loss_mlp": 0.01036233, "balance_loss_clip": 1.0225594, "balance_loss_mlp": 1.04132462, "epoch": 0.49325116488802045, "flos": 20806139635200.0, "grad_norm": 3.4647417976850154, "language_loss": 0.80640936, "learning_rate": 2.0426906805352113e-06, "loss": 0.82829142, "num_input_tokens_seen": 176416715, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 8204, "time_per_iteration": 2.5683112144470215 }, { "auxiliary_loss_clip": 0.01124088, "auxiliary_loss_mlp": 0.0103442, "balance_loss_clip": 1.02189112, "balance_loss_mlp": 1.04207039, "epoch": 0.4933112881406884, "flos": 19865963099520.0, "grad_norm": 1.9276777817361492, "language_loss": 0.66017044, "learning_rate": 2.0423129435303277e-06, "loss": 0.68175542, "num_input_tokens_seen": 176435755, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73046875, "step": 8205, "time_per_iteration": 2.542149305343628 }, { "auxiliary_loss_clip": 0.0112306, "auxiliary_loss_mlp": 0.01032728, "balance_loss_clip": 1.01791024, "balance_loss_mlp": 1.04314005, "epoch": 0.4933714113933564, "flos": 21827977741440.0, "grad_norm": 1.9172440696697899, "language_loss": 0.66853338, "learning_rate": 2.0419352050154046e-06, "loss": 0.69009131, "num_input_tokens_seen": 176453915, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80078125, "step": 8206, "time_per_iteration": 2.523284673690796 }, { "auxiliary_loss_clip": 0.01146712, "auxiliary_loss_mlp": 0.0103197, "balance_loss_clip": 1.01957202, "balance_loss_mlp": 1.04343665, "epoch": 0.49343153464602435, "flos": 27890130049920.0, "grad_norm": 1.7056006387086218, "language_loss": 0.76573676, "learning_rate": 2.041557465003922e-06, "loss": 0.78752357, "num_input_tokens_seen": 176475175, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7578125, "step": 8207, "time_per_iteration": 2.686192274093628 }, { "auxiliary_loss_clip": 0.01138105, "auxiliary_loss_mlp": 0.01034554, "balance_loss_clip": 1.02042794, "balance_loss_mlp": 1.04321742, "epoch": 0.4934916578986923, "flos": 24681080983680.0, "grad_norm": 2.365716358274434, "language_loss": 0.59622192, "learning_rate": 2.0411797235093593e-06, "loss": 0.61794853, "num_input_tokens_seen": 176494250, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 8208, "time_per_iteration": 4.046005487442017 }, { "auxiliary_loss_clip": 0.01137319, "auxiliary_loss_mlp": 0.0103397, "balance_loss_clip": 1.01996279, "balance_loss_mlp": 1.04281247, "epoch": 0.4935517811513603, "flos": 23405139089280.0, "grad_norm": 1.867587121112665, "language_loss": 0.78030717, "learning_rate": 2.040801980545199e-06, "loss": 0.80202013, "num_input_tokens_seen": 176513325, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 8209, "time_per_iteration": 4.10125207901001 }, { "auxiliary_loss_clip": 0.01135415, "auxiliary_loss_mlp": 0.01278549, "balance_loss_clip": 1.01799083, "balance_loss_mlp": 1.04285383, "epoch": 0.49361190440402825, "flos": 21944508439680.0, "grad_norm": 4.010088824857067, "language_loss": 0.78646302, "learning_rate": 2.040424236124921e-06, "loss": 0.81060266, "num_input_tokens_seen": 176532915, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 8210, "time_per_iteration": 2.5970871448516846 }, { "auxiliary_loss_clip": 0.01133325, "auxiliary_loss_mlp": 0.010361, "balance_loss_clip": 1.02173519, "balance_loss_mlp": 1.04427862, "epoch": 0.4936720276566962, "flos": 25115671635840.0, "grad_norm": 1.8535796688496182, "language_loss": 0.8148936, "learning_rate": 2.0400464902620057e-06, "loss": 0.83658785, "num_input_tokens_seen": 176552775, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.80078125, "step": 8211, "time_per_iteration": 2.5097222328186035 }, { "auxiliary_loss_clip": 0.01147702, "auxiliary_loss_mlp": 0.01036044, "balance_loss_clip": 1.02232862, "balance_loss_mlp": 1.04436278, "epoch": 0.4937321509093642, "flos": 26358935132160.0, "grad_norm": 2.3389365282918515, "language_loss": 0.91295898, "learning_rate": 2.0396687429699345e-06, "loss": 0.93479645, "num_input_tokens_seen": 176572185, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 8212, "time_per_iteration": 2.629124641418457 }, { "auxiliary_loss_clip": 0.01146952, "auxiliary_loss_mlp": 0.01034878, "balance_loss_clip": 1.02176464, "balance_loss_mlp": 1.04339933, "epoch": 0.49379227416203214, "flos": 22961390469120.0, "grad_norm": 1.8072565540507604, "language_loss": 0.65077227, "learning_rate": 2.0392909942621875e-06, "loss": 0.67259061, "num_input_tokens_seen": 176591490, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.76171875, "step": 8213, "time_per_iteration": 2.522731065750122 }, { "auxiliary_loss_clip": 0.01152785, "auxiliary_loss_mlp": 0.01277178, "balance_loss_clip": 1.01631939, "balance_loss_mlp": 1.04230475, "epoch": 0.49385239741470016, "flos": 32489101843200.0, "grad_norm": 1.940704361302796, "language_loss": 0.75579989, "learning_rate": 2.0389132441522464e-06, "loss": 0.78009951, "num_input_tokens_seen": 176612715, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 8214, "time_per_iteration": 2.670172691345215 }, { "auxiliary_loss_clip": 0.01119078, "auxiliary_loss_mlp": 0.01036602, "balance_loss_clip": 1.02277946, "balance_loss_mlp": 1.0437597, "epoch": 0.4939125206673681, "flos": 22492864442880.0, "grad_norm": 1.7679955392524396, "language_loss": 0.84313107, "learning_rate": 2.0385354926535914e-06, "loss": 0.8646878, "num_input_tokens_seen": 176631950, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75390625, "step": 8215, "time_per_iteration": 2.4927914142608643 }, { "auxiliary_loss_clip": 0.01122474, "auxiliary_loss_mlp": 0.01030472, "balance_loss_clip": 1.01826489, "balance_loss_mlp": 1.04212999, "epoch": 0.4939726439200361, "flos": 31176351486720.0, "grad_norm": 1.5425299744507013, "language_loss": 0.83315098, "learning_rate": 2.0381577397797043e-06, "loss": 0.85468048, "num_input_tokens_seen": 176653060, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71484375, "step": 8216, "time_per_iteration": 2.6618199348449707 }, { "auxiliary_loss_clip": 0.01134883, "auxiliary_loss_mlp": 0.0103509, "balance_loss_clip": 1.02193499, "balance_loss_mlp": 1.04330468, "epoch": 0.49403276717270406, "flos": 22674213233280.0, "grad_norm": 1.4281446189127953, "language_loss": 0.74620247, "learning_rate": 2.0377799855440653e-06, "loss": 0.7679022, "num_input_tokens_seen": 176673895, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 8217, "time_per_iteration": 2.627469301223755 }, { "auxiliary_loss_clip": 0.01116884, "auxiliary_loss_mlp": 0.01283304, "balance_loss_clip": 1.02302194, "balance_loss_mlp": 1.04447842, "epoch": 0.494092890425372, "flos": 20741070147840.0, "grad_norm": 1.5793466730567862, "language_loss": 0.78222954, "learning_rate": 2.037402229960156e-06, "loss": 0.80623144, "num_input_tokens_seen": 176692550, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 8218, "time_per_iteration": 2.574404001235962 }, { "auxiliary_loss_clip": 0.01150956, "auxiliary_loss_mlp": 0.01283094, "balance_loss_clip": 1.02342856, "balance_loss_mlp": 1.04330254, "epoch": 0.49415301367804, "flos": 18369026778240.0, "grad_norm": 2.5580172100197713, "language_loss": 0.76229399, "learning_rate": 2.0370244730414566e-06, "loss": 0.78663445, "num_input_tokens_seen": 176709335, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 8219, "time_per_iteration": 2.5761728286743164 }, { "auxiliary_loss_clip": 0.01137324, "auxiliary_loss_mlp": 0.01033535, "balance_loss_clip": 1.01935506, "balance_loss_mlp": 1.04441226, "epoch": 0.49421313693070795, "flos": 17530620451200.0, "grad_norm": 1.5390371342613536, "language_loss": 0.62271351, "learning_rate": 2.03664671480145e-06, "loss": 0.64442211, "num_input_tokens_seen": 176727715, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75, "step": 8220, "time_per_iteration": 2.6194839477539062 }, { "auxiliary_loss_clip": 0.01118205, "auxiliary_loss_mlp": 0.01035712, "balance_loss_clip": 1.02202129, "balance_loss_mlp": 1.04302382, "epoch": 0.4942732601833759, "flos": 20812173120000.0, "grad_norm": 2.1338716634088675, "language_loss": 0.71905708, "learning_rate": 2.0362689552536152e-06, "loss": 0.74059629, "num_input_tokens_seen": 176747530, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 8221, "time_per_iteration": 2.5466339588165283 }, { "auxiliary_loss_clip": 0.01143013, "auxiliary_loss_mlp": 0.01035073, "balance_loss_clip": 1.022681, "balance_loss_mlp": 1.04499078, "epoch": 0.4943333834360439, "flos": 15048941794560.0, "grad_norm": 1.8068564683263397, "language_loss": 0.79289675, "learning_rate": 2.035891194411436e-06, "loss": 0.8146776, "num_input_tokens_seen": 176765260, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 8222, "time_per_iteration": 2.589446783065796 }, { "auxiliary_loss_clip": 0.01128436, "auxiliary_loss_mlp": 0.01034345, "balance_loss_clip": 1.02025473, "balance_loss_mlp": 1.04456377, "epoch": 0.49439350668871185, "flos": 16070420764800.0, "grad_norm": 2.499203521207752, "language_loss": 0.71491742, "learning_rate": 2.0355134322883913e-06, "loss": 0.73654526, "num_input_tokens_seen": 176781770, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 8223, "time_per_iteration": 2.580453634262085 }, { "auxiliary_loss_clip": 0.01130665, "auxiliary_loss_mlp": 0.01041019, "balance_loss_clip": 1.02685678, "balance_loss_mlp": 1.04485595, "epoch": 0.4944536299413798, "flos": 20880079781760.0, "grad_norm": 1.63948893333238, "language_loss": 0.75073504, "learning_rate": 2.035135668897964e-06, "loss": 0.77245188, "num_input_tokens_seen": 176800655, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76953125, "step": 8224, "time_per_iteration": 2.5806283950805664 }, { "auxiliary_loss_clip": 0.01141291, "auxiliary_loss_mlp": 0.01032843, "balance_loss_clip": 1.01928282, "balance_loss_mlp": 1.04657185, "epoch": 0.4945137531940478, "flos": 26608908856320.0, "grad_norm": 3.0553064542308013, "language_loss": 0.63577664, "learning_rate": 2.034757904253635e-06, "loss": 0.65751791, "num_input_tokens_seen": 176820610, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.765625, "step": 8225, "time_per_iteration": 2.6496167182922363 }, { "auxiliary_loss_clip": 0.01135956, "auxiliary_loss_mlp": 0.01037224, "balance_loss_clip": 1.02419996, "balance_loss_mlp": 1.04309845, "epoch": 0.49457387644671574, "flos": 23988148738560.0, "grad_norm": 1.9150167578933566, "language_loss": 0.76335067, "learning_rate": 2.034380138368886e-06, "loss": 0.78508246, "num_input_tokens_seen": 176840520, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.75390625, "step": 8226, "time_per_iteration": 2.6126177310943604 }, { "auxiliary_loss_clip": 0.01122601, "auxiliary_loss_mlp": 0.01039794, "balance_loss_clip": 1.02471352, "balance_loss_mlp": 1.04490733, "epoch": 0.49463399969938376, "flos": 26976598427520.0, "grad_norm": 1.603458373952275, "language_loss": 0.70891553, "learning_rate": 2.034002371257198e-06, "loss": 0.7305395, "num_input_tokens_seen": 176860265, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.77734375, "step": 8227, "time_per_iteration": 2.653688907623291 }, { "auxiliary_loss_clip": 0.01140971, "auxiliary_loss_mlp": 0.01036221, "balance_loss_clip": 1.02251768, "balance_loss_mlp": 1.04538703, "epoch": 0.49469412295205173, "flos": 29681534067840.0, "grad_norm": 1.665601123777504, "language_loss": 0.71267188, "learning_rate": 2.033624602932053e-06, "loss": 0.73444378, "num_input_tokens_seen": 176882910, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.77734375, "step": 8228, "time_per_iteration": 2.640320301055908 }, { "auxiliary_loss_clip": 0.01117538, "auxiliary_loss_mlp": 0.01029364, "balance_loss_clip": 1.01648962, "balance_loss_mlp": 1.04401541, "epoch": 0.4947542462047197, "flos": 24131791226880.0, "grad_norm": 1.4987144444475398, "language_loss": 0.84024334, "learning_rate": 2.0332468334069327e-06, "loss": 0.8617124, "num_input_tokens_seen": 176903030, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 8229, "time_per_iteration": 2.6759419441223145 }, { "auxiliary_loss_clip": 0.0112827, "auxiliary_loss_mlp": 0.01035788, "balance_loss_clip": 1.02111912, "balance_loss_mlp": 1.04264617, "epoch": 0.49481436945738766, "flos": 20045049333120.0, "grad_norm": 1.9765459306982698, "language_loss": 0.75080401, "learning_rate": 2.032869062695318e-06, "loss": 0.77244455, "num_input_tokens_seen": 176919025, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 8230, "time_per_iteration": 2.555629253387451 }, { "auxiliary_loss_clip": 0.01128639, "auxiliary_loss_mlp": 0.0102837, "balance_loss_clip": 1.01492333, "balance_loss_mlp": 1.04376936, "epoch": 0.4948744927100556, "flos": 15669550005120.0, "grad_norm": 2.548042570951935, "language_loss": 0.79797822, "learning_rate": 2.032491290810692e-06, "loss": 0.81954825, "num_input_tokens_seen": 176937945, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 8231, "time_per_iteration": 2.665560007095337 }, { "auxiliary_loss_clip": 0.01053764, "auxiliary_loss_mlp": 0.00999805, "balance_loss_clip": 0.99848223, "balance_loss_mlp": 1.02115345, "epoch": 0.4949346159627236, "flos": 68872071502080.0, "grad_norm": 0.7439638929968325, "language_loss": 0.60204333, "learning_rate": 2.0321135177665337e-06, "loss": 0.62257904, "num_input_tokens_seen": 177004575, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.2421875, "step": 8232, "time_per_iteration": 3.2959511280059814 }, { "auxiliary_loss_clip": 0.01143311, "auxiliary_loss_mlp": 0.01035477, "balance_loss_clip": 1.0208143, "balance_loss_mlp": 1.04535341, "epoch": 0.49499473921539155, "flos": 24790285307520.0, "grad_norm": 1.79210272912607, "language_loss": 0.69439578, "learning_rate": 2.0317357435763277e-06, "loss": 0.71618366, "num_input_tokens_seen": 177024155, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80078125, "step": 8233, "time_per_iteration": 2.6861820220947266 }, { "auxiliary_loss_clip": 0.01154964, "auxiliary_loss_mlp": 0.01034164, "balance_loss_clip": 1.01978755, "balance_loss_mlp": 1.04337168, "epoch": 0.4950548624680595, "flos": 32707905540480.0, "grad_norm": 2.159264271063642, "language_loss": 0.65964007, "learning_rate": 2.0313579682535544e-06, "loss": 0.68153131, "num_input_tokens_seen": 177046185, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 8234, "time_per_iteration": 2.6995646953582764 }, { "auxiliary_loss_clip": 0.01062373, "auxiliary_loss_mlp": 0.01000574, "balance_loss_clip": 0.99902457, "balance_loss_mlp": 1.02046323, "epoch": 0.4951149857207275, "flos": 50082173066880.0, "grad_norm": 0.7970709157153079, "language_loss": 0.5804708, "learning_rate": 2.030980191811696e-06, "loss": 0.60110033, "num_input_tokens_seen": 177099025, "router_z_loss_clip": 0.01550293, "router_z_loss_mlp": 0.24121094, "step": 8235, "time_per_iteration": 3.1146864891052246 }, { "auxiliary_loss_clip": 0.01148089, "auxiliary_loss_mlp": 0.0103629, "balance_loss_clip": 1.02183032, "balance_loss_mlp": 1.04397571, "epoch": 0.49517510897339545, "flos": 22236785406720.0, "grad_norm": 5.049774643594219, "language_loss": 0.77192974, "learning_rate": 2.0306024142642338e-06, "loss": 0.79377353, "num_input_tokens_seen": 177118365, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 8236, "time_per_iteration": 2.6645095348358154 }, { "auxiliary_loss_clip": 0.01146193, "auxiliary_loss_mlp": 0.01035848, "balance_loss_clip": 1.0224247, "balance_loss_mlp": 1.04518151, "epoch": 0.4952352322260634, "flos": 25374120969600.0, "grad_norm": 1.6001556663259808, "language_loss": 0.72624648, "learning_rate": 2.03022463562465e-06, "loss": 0.74806684, "num_input_tokens_seen": 177136415, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 8237, "time_per_iteration": 2.6127195358276367 }, { "auxiliary_loss_clip": 0.01116678, "auxiliary_loss_mlp": 0.01033073, "balance_loss_clip": 1.01993608, "balance_loss_mlp": 1.04346418, "epoch": 0.4952953554787314, "flos": 24608721035520.0, "grad_norm": 1.7079816680537967, "language_loss": 0.76157951, "learning_rate": 2.0298468559064276e-06, "loss": 0.783077, "num_input_tokens_seen": 177155690, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 8238, "time_per_iteration": 2.6177446842193604 }, { "auxiliary_loss_clip": 0.01149897, "auxiliary_loss_mlp": 0.01036501, "balance_loss_clip": 1.02230358, "balance_loss_mlp": 1.04485071, "epoch": 0.49535547873139935, "flos": 17311278049920.0, "grad_norm": 2.2648553911636613, "language_loss": 0.73787493, "learning_rate": 2.0294690751230476e-06, "loss": 0.75973892, "num_input_tokens_seen": 177173350, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.77734375, "step": 8239, "time_per_iteration": 2.5410075187683105 }, { "auxiliary_loss_clip": 0.01184618, "auxiliary_loss_mlp": 0.01037481, "balance_loss_clip": 1.02165616, "balance_loss_mlp": 1.04385734, "epoch": 0.4954156019840673, "flos": 20740315962240.0, "grad_norm": 2.5051764017528324, "language_loss": 0.79064327, "learning_rate": 2.0290912932879915e-06, "loss": 0.81286418, "num_input_tokens_seen": 177191115, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.78515625, "step": 8240, "time_per_iteration": 4.013585805892944 }, { "auxiliary_loss_clip": 0.01125196, "auxiliary_loss_mlp": 0.01041208, "balance_loss_clip": 1.02801776, "balance_loss_mlp": 1.04388309, "epoch": 0.49547572523673533, "flos": 12820684567680.0, "grad_norm": 2.046055693976273, "language_loss": 0.85504049, "learning_rate": 2.0287135104147423e-06, "loss": 0.87670451, "num_input_tokens_seen": 177206155, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 8241, "time_per_iteration": 2.515981435775757 }, { "auxiliary_loss_clip": 0.01153263, "auxiliary_loss_mlp": 0.01034957, "balance_loss_clip": 1.02224898, "balance_loss_mlp": 1.04485297, "epoch": 0.4955358484894033, "flos": 15597046402560.0, "grad_norm": 1.7001372991023747, "language_loss": 0.76942647, "learning_rate": 2.028335726516781e-06, "loss": 0.79130864, "num_input_tokens_seen": 177224815, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 8242, "time_per_iteration": 2.5861964225769043 }, { "auxiliary_loss_clip": 0.01135351, "auxiliary_loss_mlp": 0.01277895, "balance_loss_clip": 1.01743877, "balance_loss_mlp": 1.04318333, "epoch": 0.49559597174207126, "flos": 26464368528000.0, "grad_norm": 1.8722773784490918, "language_loss": 0.67026341, "learning_rate": 2.0279579416075917e-06, "loss": 0.6943959, "num_input_tokens_seen": 177244490, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.74609375, "step": 8243, "time_per_iteration": 2.6097872257232666 }, { "auxiliary_loss_clip": 0.01130173, "auxiliary_loss_mlp": 0.01031523, "balance_loss_clip": 1.0188272, "balance_loss_mlp": 1.04755735, "epoch": 0.4956560949947392, "flos": 23148234040320.0, "grad_norm": 1.8588207131156518, "language_loss": 0.68064249, "learning_rate": 2.027580155700655e-06, "loss": 0.70225948, "num_input_tokens_seen": 177264340, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 8244, "time_per_iteration": 3.9851186275482178 }, { "auxiliary_loss_clip": 0.01139161, "auxiliary_loss_mlp": 0.01036419, "balance_loss_clip": 1.02193475, "balance_loss_mlp": 1.04509652, "epoch": 0.4957162182474072, "flos": 20773461237120.0, "grad_norm": 1.909764373138655, "language_loss": 0.74610233, "learning_rate": 2.0272023688094534e-06, "loss": 0.76785815, "num_input_tokens_seen": 177283055, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 8245, "time_per_iteration": 2.6312193870544434 }, { "auxiliary_loss_clip": 0.01125235, "auxiliary_loss_mlp": 0.01031799, "balance_loss_clip": 1.01847112, "balance_loss_mlp": 1.04330432, "epoch": 0.49577634150007516, "flos": 18734202397440.0, "grad_norm": 1.8385125559124276, "language_loss": 0.81310201, "learning_rate": 2.026824580947469e-06, "loss": 0.83467233, "num_input_tokens_seen": 177301140, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 8246, "time_per_iteration": 2.5119831562042236 }, { "auxiliary_loss_clip": 0.01150155, "auxiliary_loss_mlp": 0.01042986, "balance_loss_clip": 1.02869833, "balance_loss_mlp": 1.04503703, "epoch": 0.4958364647527431, "flos": 25554176870400.0, "grad_norm": 1.6277970414152982, "language_loss": 0.83896911, "learning_rate": 2.0264467921281846e-06, "loss": 0.86090052, "num_input_tokens_seen": 177323095, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 8247, "time_per_iteration": 2.681814432144165 }, { "auxiliary_loss_clip": 0.01147167, "auxiliary_loss_mlp": 0.01032817, "balance_loss_clip": 1.01926851, "balance_loss_mlp": 1.04469919, "epoch": 0.4958965880054111, "flos": 24425325169920.0, "grad_norm": 2.209457416738383, "language_loss": 0.83531022, "learning_rate": 2.0260690023650818e-06, "loss": 0.85711002, "num_input_tokens_seen": 177339845, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7578125, "step": 8248, "time_per_iteration": 2.5985405445098877 }, { "auxiliary_loss_clip": 0.01130213, "auxiliary_loss_mlp": 0.01033162, "balance_loss_clip": 1.01922011, "balance_loss_mlp": 1.04530048, "epoch": 0.49595671125807905, "flos": 25083460114560.0, "grad_norm": 1.6913071871955425, "language_loss": 0.73403066, "learning_rate": 2.0256912116716437e-06, "loss": 0.75566435, "num_input_tokens_seen": 177359980, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 8249, "time_per_iteration": 4.12369179725647 }, { "auxiliary_loss_clip": 0.0114284, "auxiliary_loss_mlp": 0.01045946, "balance_loss_clip": 1.03191543, "balance_loss_mlp": 1.04795349, "epoch": 0.496016834510747, "flos": 16435883692800.0, "grad_norm": 1.6792613404212005, "language_loss": 0.76063079, "learning_rate": 2.0253134200613526e-06, "loss": 0.78251863, "num_input_tokens_seen": 177378580, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 8250, "time_per_iteration": 2.6279773712158203 }, { "auxiliary_loss_clip": 0.01120819, "auxiliary_loss_mlp": 0.01038492, "balance_loss_clip": 1.02479517, "balance_loss_mlp": 1.04482782, "epoch": 0.496076957763415, "flos": 23437925228160.0, "grad_norm": 2.022123621235962, "language_loss": 0.70377851, "learning_rate": 2.0249356275476903e-06, "loss": 0.7253716, "num_input_tokens_seen": 177398790, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 8251, "time_per_iteration": 4.208557605743408 }, { "auxiliary_loss_clip": 0.01155949, "auxiliary_loss_mlp": 0.01279534, "balance_loss_clip": 1.02055836, "balance_loss_mlp": 1.04634559, "epoch": 0.49613708101608295, "flos": 16909509450240.0, "grad_norm": 1.883666109384382, "language_loss": 0.79882407, "learning_rate": 2.02455783414414e-06, "loss": 0.82317889, "num_input_tokens_seen": 177416515, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7421875, "step": 8252, "time_per_iteration": 2.623300790786743 }, { "auxiliary_loss_clip": 0.01120933, "auxiliary_loss_mlp": 0.01033568, "balance_loss_clip": 1.01820791, "balance_loss_mlp": 1.04527044, "epoch": 0.4961972042687509, "flos": 16618094409600.0, "grad_norm": 1.6675953190784154, "language_loss": 0.81417298, "learning_rate": 2.0241800398641834e-06, "loss": 0.83571798, "num_input_tokens_seen": 177434425, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.7578125, "step": 8253, "time_per_iteration": 2.5549845695495605 }, { "auxiliary_loss_clip": 0.01144403, "auxiliary_loss_mlp": 0.01032823, "balance_loss_clip": 1.02062798, "balance_loss_mlp": 1.04375613, "epoch": 0.49625732752141893, "flos": 28956749437440.0, "grad_norm": 1.374893681309854, "language_loss": 0.66986698, "learning_rate": 2.023802244721303e-06, "loss": 0.6916393, "num_input_tokens_seen": 177459675, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7421875, "step": 8254, "time_per_iteration": 2.715514898300171 }, { "auxiliary_loss_clip": 0.01148674, "auxiliary_loss_mlp": 0.0104077, "balance_loss_clip": 1.02700758, "balance_loss_mlp": 1.04524899, "epoch": 0.4963174507740869, "flos": 23112359331840.0, "grad_norm": 1.7054680756309568, "language_loss": 0.73810792, "learning_rate": 2.023424448728982e-06, "loss": 0.76000232, "num_input_tokens_seen": 177478895, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.765625, "step": 8255, "time_per_iteration": 2.6543641090393066 }, { "auxiliary_loss_clip": 0.01137126, "auxiliary_loss_mlp": 0.01035274, "balance_loss_clip": 1.02173209, "balance_loss_mlp": 1.04424584, "epoch": 0.49637757402675486, "flos": 13917863450880.0, "grad_norm": 1.8274544678724713, "language_loss": 0.81630468, "learning_rate": 2.023046651900703e-06, "loss": 0.83802873, "num_input_tokens_seen": 177494920, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.74609375, "step": 8256, "time_per_iteration": 2.5992844104766846 }, { "auxiliary_loss_clip": 0.01133602, "auxiliary_loss_mlp": 0.01031857, "balance_loss_clip": 1.01925039, "balance_loss_mlp": 1.04297733, "epoch": 0.49643769727942283, "flos": 22309001700480.0, "grad_norm": 1.4558211793903817, "language_loss": 0.81046343, "learning_rate": 2.022668854249948e-06, "loss": 0.83211803, "num_input_tokens_seen": 177515455, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 8257, "time_per_iteration": 2.6282217502593994 }, { "auxiliary_loss_clip": 0.01124055, "auxiliary_loss_mlp": 0.01035967, "balance_loss_clip": 1.02132821, "balance_loss_mlp": 1.04459667, "epoch": 0.4964978205320908, "flos": 19500248776320.0, "grad_norm": 2.3652478893888786, "language_loss": 0.6592921, "learning_rate": 2.0222910557902e-06, "loss": 0.68089235, "num_input_tokens_seen": 177534040, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.79296875, "step": 8258, "time_per_iteration": 2.5463781356811523 }, { "auxiliary_loss_clip": 0.01125224, "auxiliary_loss_mlp": 0.01031112, "balance_loss_clip": 1.01907182, "balance_loss_mlp": 1.0436511, "epoch": 0.49655794378475876, "flos": 23436524597760.0, "grad_norm": 1.4515777146995914, "language_loss": 0.77459991, "learning_rate": 2.0219132565349414e-06, "loss": 0.7961632, "num_input_tokens_seen": 177554510, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.73046875, "step": 8259, "time_per_iteration": 2.598463535308838 }, { "auxiliary_loss_clip": 0.01136782, "auxiliary_loss_mlp": 0.01035749, "balance_loss_clip": 1.02202201, "balance_loss_mlp": 1.04468441, "epoch": 0.4966180670374267, "flos": 26831124345600.0, "grad_norm": 1.4419661149731837, "language_loss": 0.7846781, "learning_rate": 2.0215354564976555e-06, "loss": 0.8064034, "num_input_tokens_seen": 177575780, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 8260, "time_per_iteration": 2.623744010925293 }, { "auxiliary_loss_clip": 0.0113772, "auxiliary_loss_mlp": 0.01033002, "balance_loss_clip": 1.01939988, "balance_loss_mlp": 1.04352927, "epoch": 0.4966781902900947, "flos": 22009326531840.0, "grad_norm": 2.821980269287129, "language_loss": 0.75936574, "learning_rate": 2.0211576556918244e-06, "loss": 0.78107297, "num_input_tokens_seen": 177588965, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76171875, "step": 8261, "time_per_iteration": 2.6145496368408203 }, { "auxiliary_loss_clip": 0.01151374, "auxiliary_loss_mlp": 0.01032176, "balance_loss_clip": 1.01912856, "balance_loss_mlp": 1.04403889, "epoch": 0.49673831354276266, "flos": 26213353309440.0, "grad_norm": 1.8127514830594837, "language_loss": 0.89510214, "learning_rate": 2.0207798541309307e-06, "loss": 0.91693765, "num_input_tokens_seen": 177608425, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 8262, "time_per_iteration": 2.676180362701416 }, { "auxiliary_loss_clip": 0.01161372, "auxiliary_loss_mlp": 0.01031888, "balance_loss_clip": 1.01928711, "balance_loss_mlp": 1.04520774, "epoch": 0.4967984367954306, "flos": 23182277155200.0, "grad_norm": 1.693263907583648, "language_loss": 0.7378819, "learning_rate": 2.0204020518284576e-06, "loss": 0.7598145, "num_input_tokens_seen": 177628240, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 8263, "time_per_iteration": 2.7216641902923584 }, { "auxiliary_loss_clip": 0.01160812, "auxiliary_loss_mlp": 0.01034611, "balance_loss_clip": 1.02013934, "balance_loss_mlp": 1.04728723, "epoch": 0.4968585600480986, "flos": 19281445079040.0, "grad_norm": 2.13812858450092, "language_loss": 0.69046712, "learning_rate": 2.0200242487978877e-06, "loss": 0.71242142, "num_input_tokens_seen": 177645920, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 8264, "time_per_iteration": 2.5927789211273193 }, { "auxiliary_loss_clip": 0.01137096, "auxiliary_loss_mlp": 0.01028983, "balance_loss_clip": 1.01576233, "balance_loss_mlp": 1.0422821, "epoch": 0.49691868330076655, "flos": 22528703237760.0, "grad_norm": 1.5065020543525038, "language_loss": 0.64655328, "learning_rate": 2.019646445052704e-06, "loss": 0.66821408, "num_input_tokens_seen": 177667185, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7734375, "step": 8265, "time_per_iteration": 2.5851845741271973 }, { "auxiliary_loss_clip": 0.01060197, "auxiliary_loss_mlp": 0.01001898, "balance_loss_clip": 1.00028825, "balance_loss_mlp": 1.01827884, "epoch": 0.4969788065534345, "flos": 66577128675840.0, "grad_norm": 0.8740137511128038, "language_loss": 0.53506678, "learning_rate": 2.0192686406063897e-06, "loss": 0.55568773, "num_input_tokens_seen": 177733020, "router_z_loss_clip": 0.01611328, "router_z_loss_mlp": 0.24023438, "step": 8266, "time_per_iteration": 3.2247824668884277 }, { "auxiliary_loss_clip": 0.01125339, "auxiliary_loss_mlp": 0.01031061, "balance_loss_clip": 1.01804972, "balance_loss_mlp": 1.04428816, "epoch": 0.49703892980610254, "flos": 24059503105920.0, "grad_norm": 2.3459722300140466, "language_loss": 0.79527402, "learning_rate": 2.018890835472426e-06, "loss": 0.81683803, "num_input_tokens_seen": 177753370, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 8267, "time_per_iteration": 2.5902259349823 }, { "auxiliary_loss_clip": 0.01140584, "auxiliary_loss_mlp": 0.01036248, "balance_loss_clip": 1.02188349, "balance_loss_mlp": 1.0462122, "epoch": 0.4970990530587705, "flos": 29126174912640.0, "grad_norm": 2.6416507654984875, "language_loss": 0.74353576, "learning_rate": 2.0185130296642974e-06, "loss": 0.76530397, "num_input_tokens_seen": 177771530, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.765625, "step": 8268, "time_per_iteration": 2.6842055320739746 }, { "auxiliary_loss_clip": 0.01127217, "auxiliary_loss_mlp": 0.01281615, "balance_loss_clip": 1.02072132, "balance_loss_mlp": 1.04404974, "epoch": 0.49715917631143847, "flos": 46026167258880.0, "grad_norm": 1.5256863110378813, "language_loss": 0.67683434, "learning_rate": 2.0181352231954865e-06, "loss": 0.70092267, "num_input_tokens_seen": 177796355, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73828125, "step": 8269, "time_per_iteration": 2.751232862472534 }, { "auxiliary_loss_clip": 0.01134363, "auxiliary_loss_mlp": 0.01038931, "balance_loss_clip": 1.02613938, "balance_loss_mlp": 1.04446375, "epoch": 0.49721929956410643, "flos": 20191277600640.0, "grad_norm": 1.525734593940415, "language_loss": 0.85664093, "learning_rate": 2.0177574160794768e-06, "loss": 0.87837386, "num_input_tokens_seen": 177814300, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 8270, "time_per_iteration": 2.5969038009643555 }, { "auxiliary_loss_clip": 0.01155575, "auxiliary_loss_mlp": 0.01277607, "balance_loss_clip": 1.01938772, "balance_loss_mlp": 1.04577672, "epoch": 0.4972794228167744, "flos": 21653560275840.0, "grad_norm": 1.7420470079968962, "language_loss": 0.70972788, "learning_rate": 2.017379608329749e-06, "loss": 0.73405969, "num_input_tokens_seen": 177833615, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.7421875, "step": 8271, "time_per_iteration": 2.571808099746704 }, { "auxiliary_loss_clip": 0.01131339, "auxiliary_loss_mlp": 0.01032394, "balance_loss_clip": 1.01819038, "balance_loss_mlp": 1.04608595, "epoch": 0.49733954606944236, "flos": 24279743347200.0, "grad_norm": 1.4607436525850257, "language_loss": 0.78287941, "learning_rate": 2.017001799959789e-06, "loss": 0.80451679, "num_input_tokens_seen": 177855315, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76171875, "step": 8272, "time_per_iteration": 2.704658269882202 }, { "auxiliary_loss_clip": 0.01147539, "auxiliary_loss_mlp": 0.0103785, "balance_loss_clip": 1.02465391, "balance_loss_mlp": 1.04558814, "epoch": 0.4973996693221103, "flos": 37852575730560.0, "grad_norm": 2.2357552125923146, "language_loss": 0.66733396, "learning_rate": 2.0166239909830786e-06, "loss": 0.68918788, "num_input_tokens_seen": 177875590, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75, "step": 8273, "time_per_iteration": 2.800478935241699 }, { "auxiliary_loss_clip": 0.0114821, "auxiliary_loss_mlp": 0.01034173, "balance_loss_clip": 1.0209111, "balance_loss_mlp": 1.04636979, "epoch": 0.4974597925747783, "flos": 21361426963200.0, "grad_norm": 1.7277942187822342, "language_loss": 0.78293425, "learning_rate": 2.0162461814130996e-06, "loss": 0.80475807, "num_input_tokens_seen": 177894175, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.74609375, "step": 8274, "time_per_iteration": 2.6339075565338135 }, { "auxiliary_loss_clip": 0.01147076, "auxiliary_loss_mlp": 0.01032632, "balance_loss_clip": 1.01935184, "balance_loss_mlp": 1.04309595, "epoch": 0.49751991582744626, "flos": 30738133560960.0, "grad_norm": 1.7486432995256795, "language_loss": 0.75906956, "learning_rate": 2.015868371263338e-06, "loss": 0.78086662, "num_input_tokens_seen": 177913920, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7734375, "step": 8275, "time_per_iteration": 2.729764699935913 }, { "auxiliary_loss_clip": 0.01132727, "auxiliary_loss_mlp": 0.01037177, "balance_loss_clip": 1.02176356, "balance_loss_mlp": 1.04523921, "epoch": 0.4975800390801142, "flos": 14100541044480.0, "grad_norm": 2.3346436934421866, "language_loss": 0.83593214, "learning_rate": 2.0154905605472736e-06, "loss": 0.85763121, "num_input_tokens_seen": 177930425, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.78515625, "step": 8276, "time_per_iteration": 2.647925853729248 }, { "auxiliary_loss_clip": 0.01114563, "auxiliary_loss_mlp": 0.01275301, "balance_loss_clip": 1.01668131, "balance_loss_mlp": 1.04359496, "epoch": 0.4976401623327822, "flos": 24207275658240.0, "grad_norm": 1.6422968086888747, "language_loss": 0.6998688, "learning_rate": 2.0151127492783913e-06, "loss": 0.7237674, "num_input_tokens_seen": 177949885, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.7109375, "step": 8277, "time_per_iteration": 2.5691123008728027 }, { "auxiliary_loss_clip": 0.01125635, "auxiliary_loss_mlp": 0.01038861, "balance_loss_clip": 1.02544439, "balance_loss_mlp": 1.0437206, "epoch": 0.49770028558545015, "flos": 21136769349120.0, "grad_norm": 2.1647981380613017, "language_loss": 0.82064688, "learning_rate": 2.014734937470174e-06, "loss": 0.84229183, "num_input_tokens_seen": 177965720, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 8278, "time_per_iteration": 2.5278003215789795 }, { "auxiliary_loss_clip": 0.01129072, "auxiliary_loss_mlp": 0.01036241, "balance_loss_clip": 1.02306223, "balance_loss_mlp": 1.04426873, "epoch": 0.4977604088381181, "flos": 16763927627520.0, "grad_norm": 2.1311354992849463, "language_loss": 0.67422271, "learning_rate": 2.014357125136104e-06, "loss": 0.69587582, "num_input_tokens_seen": 177983190, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7578125, "step": 8279, "time_per_iteration": 2.5383341312408447 }, { "auxiliary_loss_clip": 0.01126645, "auxiliary_loss_mlp": 0.01037156, "balance_loss_clip": 1.02235079, "balance_loss_mlp": 1.04202747, "epoch": 0.49782053209078614, "flos": 15703521292800.0, "grad_norm": 2.2794872479058257, "language_loss": 0.70534635, "learning_rate": 2.013979312289666e-06, "loss": 0.72698438, "num_input_tokens_seen": 178000155, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.75390625, "step": 8280, "time_per_iteration": 2.6162962913513184 }, { "auxiliary_loss_clip": 0.01144305, "auxiliary_loss_mlp": 0.01036169, "balance_loss_clip": 1.02293038, "balance_loss_mlp": 1.04256725, "epoch": 0.4978806553434541, "flos": 24753692327040.0, "grad_norm": 2.3394179975056577, "language_loss": 0.64453077, "learning_rate": 2.0136014989443416e-06, "loss": 0.66633546, "num_input_tokens_seen": 178021060, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 8281, "time_per_iteration": 4.10659384727478 }, { "auxiliary_loss_clip": 0.01129126, "auxiliary_loss_mlp": 0.01036267, "balance_loss_clip": 1.02183104, "balance_loss_mlp": 1.04405546, "epoch": 0.49794077859612207, "flos": 13115726881920.0, "grad_norm": 2.173840238523294, "language_loss": 0.73230308, "learning_rate": 2.013223685113615e-06, "loss": 0.75395703, "num_input_tokens_seen": 178038180, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76171875, "step": 8282, "time_per_iteration": 2.580587148666382 }, { "auxiliary_loss_clip": 0.01131621, "auxiliary_loss_mlp": 0.01036173, "balance_loss_clip": 1.02463913, "balance_loss_mlp": 1.04207706, "epoch": 0.49800090184879003, "flos": 27525133998720.0, "grad_norm": 1.5661900263502933, "language_loss": 0.73661631, "learning_rate": 2.0128458708109694e-06, "loss": 0.75829422, "num_input_tokens_seen": 178057565, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.71484375, "step": 8283, "time_per_iteration": 2.7121407985687256 }, { "auxiliary_loss_clip": 0.01151756, "auxiliary_loss_mlp": 0.0103889, "balance_loss_clip": 1.02562237, "balance_loss_mlp": 1.04302561, "epoch": 0.498061025101458, "flos": 19792489829760.0, "grad_norm": 1.7246101447464253, "language_loss": 0.78753102, "learning_rate": 2.0124680560498877e-06, "loss": 0.80943739, "num_input_tokens_seen": 178076965, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 8284, "time_per_iteration": 2.6145904064178467 }, { "auxiliary_loss_clip": 0.01141366, "auxiliary_loss_mlp": 0.01038519, "balance_loss_clip": 1.02433348, "balance_loss_mlp": 1.04441571, "epoch": 0.49812114835412596, "flos": 29893909230720.0, "grad_norm": 1.7040451841505138, "language_loss": 0.73672581, "learning_rate": 2.0120902408438527e-06, "loss": 0.75852472, "num_input_tokens_seen": 178095105, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7890625, "step": 8285, "time_per_iteration": 4.073685646057129 }, { "auxiliary_loss_clip": 0.01131831, "auxiliary_loss_mlp": 0.01033712, "balance_loss_clip": 1.02018762, "balance_loss_mlp": 1.04792237, "epoch": 0.49818127160679393, "flos": 23147048891520.0, "grad_norm": 1.572949966089763, "language_loss": 0.74220657, "learning_rate": 2.011712425206348e-06, "loss": 0.76386201, "num_input_tokens_seen": 178114505, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 8286, "time_per_iteration": 2.7099719047546387 }, { "auxiliary_loss_clip": 0.01138528, "auxiliary_loss_mlp": 0.01044063, "balance_loss_clip": 1.03048539, "balance_loss_mlp": 1.04497862, "epoch": 0.4982413948594619, "flos": 21652806090240.0, "grad_norm": 1.7114605550485131, "language_loss": 0.85515702, "learning_rate": 2.011334609150857e-06, "loss": 0.87698293, "num_input_tokens_seen": 178131595, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7578125, "step": 8287, "time_per_iteration": 2.5265679359436035 }, { "auxiliary_loss_clip": 0.01155289, "auxiliary_loss_mlp": 0.01029804, "balance_loss_clip": 1.0163157, "balance_loss_mlp": 1.04417753, "epoch": 0.49830151811212986, "flos": 32486982940800.0, "grad_norm": 1.7262815563019018, "language_loss": 0.72433203, "learning_rate": 2.0109567926908636e-06, "loss": 0.74618292, "num_input_tokens_seen": 178152055, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 8288, "time_per_iteration": 2.7316367626190186 }, { "auxiliary_loss_clip": 0.01140988, "auxiliary_loss_mlp": 0.01033716, "balance_loss_clip": 1.01893449, "balance_loss_mlp": 1.04492044, "epoch": 0.4983616413647978, "flos": 18142358002560.0, "grad_norm": 3.2035003400837727, "language_loss": 0.80606318, "learning_rate": 2.01057897583985e-06, "loss": 0.82781017, "num_input_tokens_seen": 178168150, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78125, "step": 8289, "time_per_iteration": 2.5305142402648926 }, { "auxiliary_loss_clip": 0.01124557, "auxiliary_loss_mlp": 0.0103719, "balance_loss_clip": 1.02374911, "balance_loss_mlp": 1.04151332, "epoch": 0.4984217646174658, "flos": 19718836992000.0, "grad_norm": 2.067540601828452, "language_loss": 0.72915614, "learning_rate": 2.0102011586113003e-06, "loss": 0.75077367, "num_input_tokens_seen": 178186150, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73828125, "step": 8290, "time_per_iteration": 2.7214601039886475 }, { "auxiliary_loss_clip": 0.01127835, "auxiliary_loss_mlp": 0.01038099, "balance_loss_clip": 1.022614, "balance_loss_mlp": 1.04341435, "epoch": 0.49848188787013376, "flos": 24936549488640.0, "grad_norm": 1.5563608216966553, "language_loss": 0.84363317, "learning_rate": 2.009823341018697e-06, "loss": 0.86529249, "num_input_tokens_seen": 178207665, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.75390625, "step": 8291, "time_per_iteration": 4.220175743103027 }, { "auxiliary_loss_clip": 0.01143683, "auxiliary_loss_mlp": 0.01040076, "balance_loss_clip": 1.0251869, "balance_loss_mlp": 1.0459801, "epoch": 0.4985420111228017, "flos": 22382439056640.0, "grad_norm": 1.717609610064667, "language_loss": 0.6669575, "learning_rate": 2.0094455230755247e-06, "loss": 0.68879509, "num_input_tokens_seen": 178226325, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.796875, "step": 8292, "time_per_iteration": 3.991814374923706 }, { "auxiliary_loss_clip": 0.01118163, "auxiliary_loss_mlp": 0.010331, "balance_loss_clip": 1.02080977, "balance_loss_mlp": 1.04349113, "epoch": 0.4986021343754697, "flos": 16216469464320.0, "grad_norm": 1.7999427558156051, "language_loss": 0.67006284, "learning_rate": 2.009067704795265e-06, "loss": 0.69157541, "num_input_tokens_seen": 178244960, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.74609375, "step": 8293, "time_per_iteration": 2.579213857650757 }, { "auxiliary_loss_clip": 0.01127261, "auxiliary_loss_mlp": 0.01029197, "balance_loss_clip": 1.01669836, "balance_loss_mlp": 1.04294264, "epoch": 0.4986622576281377, "flos": 23403594804480.0, "grad_norm": 1.8121514208653682, "language_loss": 0.82264948, "learning_rate": 2.0086898861914026e-06, "loss": 0.84421402, "num_input_tokens_seen": 178265400, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.75390625, "step": 8294, "time_per_iteration": 2.6398868560791016 }, { "auxiliary_loss_clip": 0.01127022, "auxiliary_loss_mlp": 0.01031891, "balance_loss_clip": 1.01810443, "balance_loss_mlp": 1.04303193, "epoch": 0.49872238088080567, "flos": 19974556892160.0, "grad_norm": 1.6732123758554729, "language_loss": 0.72768271, "learning_rate": 2.008312067277421e-06, "loss": 0.74927175, "num_input_tokens_seen": 178284535, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 8295, "time_per_iteration": 2.5313761234283447 }, { "auxiliary_loss_clip": 0.01161504, "auxiliary_loss_mlp": 0.01036304, "balance_loss_clip": 1.02354896, "balance_loss_mlp": 1.04356134, "epoch": 0.49878250413347364, "flos": 22893016930560.0, "grad_norm": 1.9580019284053822, "language_loss": 0.67709523, "learning_rate": 2.007934248066802e-06, "loss": 0.69907331, "num_input_tokens_seen": 178302425, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73046875, "step": 8296, "time_per_iteration": 2.690781831741333 }, { "auxiliary_loss_clip": 0.01138401, "auxiliary_loss_mlp": 0.01034679, "balance_loss_clip": 1.02074337, "balance_loss_mlp": 1.04379809, "epoch": 0.4988426273861416, "flos": 32598449821440.0, "grad_norm": 2.085737138228305, "language_loss": 0.64523429, "learning_rate": 2.0075564285730313e-06, "loss": 0.66696507, "num_input_tokens_seen": 178323065, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 8297, "time_per_iteration": 2.654064893722534 }, { "auxiliary_loss_clip": 0.0115187, "auxiliary_loss_mlp": 0.01029785, "balance_loss_clip": 1.01780486, "balance_loss_mlp": 1.04357684, "epoch": 0.49890275063880957, "flos": 20923604087040.0, "grad_norm": 1.715875401487111, "language_loss": 0.69280326, "learning_rate": 2.00717860880959e-06, "loss": 0.71461976, "num_input_tokens_seen": 178343985, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.72265625, "step": 8298, "time_per_iteration": 2.6405603885650635 }, { "auxiliary_loss_clip": 0.01113852, "auxiliary_loss_mlp": 0.01037802, "balance_loss_clip": 1.02463543, "balance_loss_mlp": 1.04072523, "epoch": 0.49896287389147753, "flos": 18624459369600.0, "grad_norm": 1.878075964809939, "language_loss": 0.84404171, "learning_rate": 2.0068007887899636e-06, "loss": 0.86555827, "num_input_tokens_seen": 178362345, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73046875, "step": 8299, "time_per_iteration": 2.53059720993042 }, { "auxiliary_loss_clip": 0.0114709, "auxiliary_loss_mlp": 0.01038317, "balance_loss_clip": 1.02372611, "balance_loss_mlp": 1.04362643, "epoch": 0.4990229971441455, "flos": 24826555065600.0, "grad_norm": 2.1616434297820644, "language_loss": 0.69053936, "learning_rate": 2.0064229685276345e-06, "loss": 0.7123934, "num_input_tokens_seen": 178383190, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 8300, "time_per_iteration": 2.6549384593963623 }, { "auxiliary_loss_clip": 0.01059446, "auxiliary_loss_mlp": 0.01001614, "balance_loss_clip": 1.00001681, "balance_loss_mlp": 1.01655531, "epoch": 0.49908312039681346, "flos": 71384525136000.0, "grad_norm": 0.7526545180716006, "language_loss": 0.51141161, "learning_rate": 2.0060451480360855e-06, "loss": 0.53202218, "num_input_tokens_seen": 178444250, "router_z_loss_clip": 0.01599121, "router_z_loss_mlp": 0.24609375, "step": 8301, "time_per_iteration": 3.2715036869049072 }, { "auxiliary_loss_clip": 0.01132141, "auxiliary_loss_mlp": 0.01036729, "balance_loss_clip": 1.02376556, "balance_loss_mlp": 1.04250407, "epoch": 0.4991432436494814, "flos": 19828651847040.0, "grad_norm": 1.7891803284143388, "language_loss": 0.84029031, "learning_rate": 2.005667327328801e-06, "loss": 0.86197901, "num_input_tokens_seen": 178463250, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 8302, "time_per_iteration": 2.5706121921539307 }, { "auxiliary_loss_clip": 0.01123383, "auxiliary_loss_mlp": 0.01028392, "balance_loss_clip": 1.01641107, "balance_loss_mlp": 1.04279709, "epoch": 0.4992033669021494, "flos": 15121912273920.0, "grad_norm": 1.793084008941413, "language_loss": 0.6921972, "learning_rate": 2.005289506419264e-06, "loss": 0.7137149, "num_input_tokens_seen": 178481340, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.71875, "step": 8303, "time_per_iteration": 2.547691822052002 }, { "auxiliary_loss_clip": 0.01119431, "auxiliary_loss_mlp": 0.01032241, "balance_loss_clip": 1.01906824, "balance_loss_mlp": 1.04347014, "epoch": 0.49926349015481736, "flos": 31207952476800.0, "grad_norm": 1.6652358595762455, "language_loss": 0.72827947, "learning_rate": 2.0049116853209586e-06, "loss": 0.74979615, "num_input_tokens_seen": 178501545, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.76171875, "step": 8304, "time_per_iteration": 2.6301605701446533 }, { "auxiliary_loss_clip": 0.01126614, "auxiliary_loss_mlp": 0.01033423, "balance_loss_clip": 1.02129948, "balance_loss_mlp": 1.04434276, "epoch": 0.4993236134074853, "flos": 24900207903360.0, "grad_norm": 1.9753785781877256, "language_loss": 0.80032313, "learning_rate": 2.0045338640473683e-06, "loss": 0.82192349, "num_input_tokens_seen": 178519700, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.734375, "step": 8305, "time_per_iteration": 2.586256742477417 }, { "auxiliary_loss_clip": 0.01067248, "auxiliary_loss_mlp": 0.0100786, "balance_loss_clip": 1.00627446, "balance_loss_mlp": 1.01627254, "epoch": 0.4993837366601533, "flos": 70420573797120.0, "grad_norm": 0.7153014274709114, "language_loss": 0.56844532, "learning_rate": 2.0041560426119747e-06, "loss": 0.58919644, "num_input_tokens_seen": 178576740, "router_z_loss_clip": 0.01586914, "router_z_loss_mlp": 0.24414062, "step": 8306, "time_per_iteration": 3.3184714317321777 }, { "auxiliary_loss_clip": 0.01116881, "auxiliary_loss_mlp": 0.01035458, "balance_loss_clip": 1.02112937, "balance_loss_mlp": 1.04384613, "epoch": 0.4994438599128213, "flos": 15961216440960.0, "grad_norm": 1.7638442725905488, "language_loss": 0.82543445, "learning_rate": 2.0037782210282632e-06, "loss": 0.8469578, "num_input_tokens_seen": 178594745, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.73046875, "step": 8307, "time_per_iteration": 2.5256645679473877 }, { "auxiliary_loss_clip": 0.01113998, "auxiliary_loss_mlp": 0.01034471, "balance_loss_clip": 1.0207324, "balance_loss_mlp": 1.04153562, "epoch": 0.4995039831654893, "flos": 27928303228800.0, "grad_norm": 1.6373118014210057, "language_loss": 0.60918903, "learning_rate": 2.0034003993097168e-06, "loss": 0.63067371, "num_input_tokens_seen": 178614110, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.72265625, "step": 8308, "time_per_iteration": 2.5292935371398926 }, { "auxiliary_loss_clip": 0.01112409, "auxiliary_loss_mlp": 0.0103142, "balance_loss_clip": 1.01883185, "balance_loss_mlp": 1.04105234, "epoch": 0.49956410641815724, "flos": 24204797619840.0, "grad_norm": 1.5293753783585609, "language_loss": 0.74387026, "learning_rate": 2.0030225774698184e-06, "loss": 0.76530862, "num_input_tokens_seen": 178634170, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 8309, "time_per_iteration": 2.6822869777679443 }, { "auxiliary_loss_clip": 0.01133156, "auxiliary_loss_mlp": 0.01032083, "balance_loss_clip": 1.01979303, "balance_loss_mlp": 1.04214573, "epoch": 0.4996242296708252, "flos": 16180127879040.0, "grad_norm": 1.9637911172173037, "language_loss": 0.79478335, "learning_rate": 2.002644755522053e-06, "loss": 0.81643569, "num_input_tokens_seen": 178651775, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.73828125, "step": 8310, "time_per_iteration": 2.5426743030548096 }, { "auxiliary_loss_clip": 0.01126746, "auxiliary_loss_mlp": 0.01037735, "balance_loss_clip": 1.0250634, "balance_loss_mlp": 1.04218447, "epoch": 0.49968435292349317, "flos": 16873527000960.0, "grad_norm": 1.7337507671151746, "language_loss": 0.70848715, "learning_rate": 2.0022669334799023e-06, "loss": 0.73013192, "num_input_tokens_seen": 178669720, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.75390625, "step": 8311, "time_per_iteration": 2.5776453018188477 }, { "auxiliary_loss_clip": 0.01134406, "auxiliary_loss_mlp": 0.01031307, "balance_loss_clip": 1.0192492, "balance_loss_mlp": 1.0434649, "epoch": 0.49974447617616113, "flos": 14939521989120.0, "grad_norm": 1.7623520382032376, "language_loss": 0.77183366, "learning_rate": 2.0018891113568506e-06, "loss": 0.79349077, "num_input_tokens_seen": 178686765, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.73046875, "step": 8312, "time_per_iteration": 2.5901920795440674 }, { "auxiliary_loss_clip": 0.01143241, "auxiliary_loss_mlp": 0.01032478, "balance_loss_clip": 1.01907945, "balance_loss_mlp": 1.04310477, "epoch": 0.4998045994288291, "flos": 26651535321600.0, "grad_norm": 1.8079044515137594, "language_loss": 0.84414756, "learning_rate": 2.0015112891663814e-06, "loss": 0.86590481, "num_input_tokens_seen": 178705845, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 8313, "time_per_iteration": 2.6459202766418457 }, { "auxiliary_loss_clip": 0.01134387, "auxiliary_loss_mlp": 0.01031047, "balance_loss_clip": 1.01726043, "balance_loss_mlp": 1.04201484, "epoch": 0.49986472268149706, "flos": 20953768533120.0, "grad_norm": 1.7758278182592246, "language_loss": 0.80864906, "learning_rate": 2.0011334669219787e-06, "loss": 0.83030343, "num_input_tokens_seen": 178723410, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 8314, "time_per_iteration": 2.557337760925293 }, { "auxiliary_loss_clip": 0.01145272, "auxiliary_loss_mlp": 0.01285875, "balance_loss_clip": 1.02482605, "balance_loss_mlp": 1.04381049, "epoch": 0.49992484593416503, "flos": 22783884433920.0, "grad_norm": 1.9357328928569395, "language_loss": 0.79351175, "learning_rate": 2.000755644637124e-06, "loss": 0.81782317, "num_input_tokens_seen": 178743560, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75, "step": 8315, "time_per_iteration": 2.6208252906799316 }, { "auxiliary_loss_clip": 0.01116272, "auxiliary_loss_mlp": 0.01027459, "balance_loss_clip": 1.01527619, "balance_loss_mlp": 1.0443871, "epoch": 0.499984969186833, "flos": 46786970252160.0, "grad_norm": 1.920402577819852, "language_loss": 0.74779379, "learning_rate": 2.000377822325304e-06, "loss": 0.76923114, "num_input_tokens_seen": 178767225, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71875, "step": 8316, "time_per_iteration": 2.775170087814331 }, { "auxiliary_loss_clip": 0.01122253, "auxiliary_loss_mlp": 0.01029047, "balance_loss_clip": 1.01713192, "balance_loss_mlp": 1.04222012, "epoch": 0.500045092439501, "flos": 25556978131200.0, "grad_norm": 1.5471933233256676, "language_loss": 0.81179649, "learning_rate": 2e-06, "loss": 0.83330947, "num_input_tokens_seen": 178786810, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.70703125, "step": 8317, "time_per_iteration": 2.6959404945373535 }, { "auxiliary_loss_clip": 0.01133427, "auxiliary_loss_mlp": 0.01036728, "balance_loss_clip": 1.02240503, "balance_loss_mlp": 1.04176116, "epoch": 0.5001052156921689, "flos": 20704764476160.0, "grad_norm": 1.6110649205548724, "language_loss": 0.83248401, "learning_rate": 1.9996221776746954e-06, "loss": 0.85418552, "num_input_tokens_seen": 178805660, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.74609375, "step": 8318, "time_per_iteration": 2.5513792037963867 }, { "auxiliary_loss_clip": 0.01124437, "auxiliary_loss_mlp": 0.0102969, "balance_loss_clip": 1.01694059, "balance_loss_mlp": 1.04191232, "epoch": 0.500165338944837, "flos": 21251109317760.0, "grad_norm": 1.7837050085986084, "language_loss": 0.81877601, "learning_rate": 1.999244355362875e-06, "loss": 0.84031725, "num_input_tokens_seen": 178824780, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73828125, "step": 8319, "time_per_iteration": 2.5988309383392334 }, { "auxiliary_loss_clip": 0.01141052, "auxiliary_loss_mlp": 0.01027392, "balance_loss_clip": 1.01561451, "balance_loss_mlp": 1.04008377, "epoch": 0.5002254621975049, "flos": 27854398995840.0, "grad_norm": 1.766497835632857, "language_loss": 0.71606177, "learning_rate": 1.9988665330780216e-06, "loss": 0.73774624, "num_input_tokens_seen": 178845640, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.73828125, "step": 8320, "time_per_iteration": 2.6199533939361572 }, { "auxiliary_loss_clip": 0.01151782, "auxiliary_loss_mlp": 0.01045186, "balance_loss_clip": 1.030321, "balance_loss_mlp": 1.04331136, "epoch": 0.5002855854501729, "flos": 15551941898880.0, "grad_norm": 3.721069677358002, "language_loss": 0.76929152, "learning_rate": 1.998488710833619e-06, "loss": 0.7912612, "num_input_tokens_seen": 178862290, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 8321, "time_per_iteration": 2.5583698749542236 }, { "auxiliary_loss_clip": 0.01126274, "auxiliary_loss_mlp": 0.01283985, "balance_loss_clip": 1.02393198, "balance_loss_mlp": 1.0429287, "epoch": 0.5003457087028408, "flos": 16107408794880.0, "grad_norm": 1.6831495518071458, "language_loss": 0.8288694, "learning_rate": 1.9981108886431497e-06, "loss": 0.85297203, "num_input_tokens_seen": 178879805, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.74609375, "step": 8322, "time_per_iteration": 3.9190335273742676 }, { "auxiliary_loss_clip": 0.01129014, "auxiliary_loss_mlp": 0.01032488, "balance_loss_clip": 1.01930356, "balance_loss_mlp": 1.04404056, "epoch": 0.5004058319555088, "flos": 22710518904960.0, "grad_norm": 2.5407697617480984, "language_loss": 0.73468101, "learning_rate": 1.997733066520098e-06, "loss": 0.7562961, "num_input_tokens_seen": 178896985, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7578125, "step": 8323, "time_per_iteration": 2.5457301139831543 }, { "auxiliary_loss_clip": 0.01143983, "auxiliary_loss_mlp": 0.01036195, "balance_loss_clip": 1.02271271, "balance_loss_mlp": 1.0429455, "epoch": 0.5004659552081767, "flos": 30117956313600.0, "grad_norm": 1.5209876544239178, "language_loss": 0.69279641, "learning_rate": 1.9973552444779477e-06, "loss": 0.71459818, "num_input_tokens_seen": 178920605, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 8324, "time_per_iteration": 2.667468786239624 }, { "auxiliary_loss_clip": 0.01125297, "auxiliary_loss_mlp": 0.01281239, "balance_loss_clip": 1.02149379, "balance_loss_mlp": 1.04195845, "epoch": 0.5005260784608447, "flos": 18624710764800.0, "grad_norm": 1.9148818121381002, "language_loss": 0.72431874, "learning_rate": 1.9969774225301814e-06, "loss": 0.74838406, "num_input_tokens_seen": 178937760, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 8325, "time_per_iteration": 2.5847208499908447 }, { "auxiliary_loss_clip": 0.0114534, "auxiliary_loss_mlp": 0.01037556, "balance_loss_clip": 1.02287543, "balance_loss_mlp": 1.04351234, "epoch": 0.5005862017135126, "flos": 24859987649280.0, "grad_norm": 1.6068208959732317, "language_loss": 0.73427749, "learning_rate": 1.9965996006902835e-06, "loss": 0.75610644, "num_input_tokens_seen": 178957985, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75, "step": 8326, "time_per_iteration": 2.7249815464019775 }, { "auxiliary_loss_clip": 0.01132453, "auxiliary_loss_mlp": 0.01032804, "balance_loss_clip": 1.02016211, "balance_loss_mlp": 1.04083931, "epoch": 0.5006463249661807, "flos": 18734381965440.0, "grad_norm": 1.6903170831415508, "language_loss": 0.77937663, "learning_rate": 1.996221778971737e-06, "loss": 0.80102921, "num_input_tokens_seen": 178977070, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 8327, "time_per_iteration": 4.073974370956421 }, { "auxiliary_loss_clip": 0.0112085, "auxiliary_loss_mlp": 0.01036541, "balance_loss_clip": 1.02314138, "balance_loss_mlp": 1.04433727, "epoch": 0.5007064482188487, "flos": 13042145871360.0, "grad_norm": 2.606151760177748, "language_loss": 0.87698412, "learning_rate": 1.995843957388025e-06, "loss": 0.89855802, "num_input_tokens_seen": 178994175, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 8328, "time_per_iteration": 2.5286333560943604 }, { "auxiliary_loss_clip": 0.01146005, "auxiliary_loss_mlp": 0.01036255, "balance_loss_clip": 1.02276039, "balance_loss_mlp": 1.04189146, "epoch": 0.5007665714715166, "flos": 21288671965440.0, "grad_norm": 2.3919535402193, "language_loss": 0.74559307, "learning_rate": 1.9954661359526324e-06, "loss": 0.76741564, "num_input_tokens_seen": 179013710, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.77734375, "step": 8329, "time_per_iteration": 2.7237584590911865 }, { "auxiliary_loss_clip": 0.01114266, "auxiliary_loss_mlp": 0.01031355, "balance_loss_clip": 1.01921403, "balance_loss_mlp": 1.04207301, "epoch": 0.5008266947241846, "flos": 29754576374400.0, "grad_norm": 1.9152357558117838, "language_loss": 0.79451197, "learning_rate": 1.9950883146790413e-06, "loss": 0.81596816, "num_input_tokens_seen": 179035255, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.72265625, "step": 8330, "time_per_iteration": 2.677192449569702 }, { "auxiliary_loss_clip": 0.01057033, "auxiliary_loss_mlp": 0.01249249, "balance_loss_clip": 1.0015614, "balance_loss_mlp": 1.01505291, "epoch": 0.5008868179768525, "flos": 63557829204480.0, "grad_norm": 0.7239416930172823, "language_loss": 0.56060553, "learning_rate": 1.9947104935807355e-06, "loss": 0.58366829, "num_input_tokens_seen": 179090915, "router_z_loss_clip": 0.01647949, "router_z_loss_mlp": 0.2421875, "step": 8331, "time_per_iteration": 3.004032850265503 }, { "auxiliary_loss_clip": 0.01125433, "auxiliary_loss_mlp": 0.01032116, "balance_loss_clip": 1.0198791, "balance_loss_mlp": 1.04409838, "epoch": 0.5009469412295205, "flos": 27375637593600.0, "grad_norm": 2.1465195023203294, "language_loss": 0.65332812, "learning_rate": 1.9943326726711987e-06, "loss": 0.67490363, "num_input_tokens_seen": 179109160, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.7265625, "step": 8332, "time_per_iteration": 2.5520174503326416 }, { "auxiliary_loss_clip": 0.01113697, "auxiliary_loss_mlp": 0.01031652, "balance_loss_clip": 1.01934409, "balance_loss_mlp": 1.0408349, "epoch": 0.5010070644821885, "flos": 27378833904000.0, "grad_norm": 1.6447649527787465, "language_loss": 0.74857444, "learning_rate": 1.9939548519639143e-06, "loss": 0.77002788, "num_input_tokens_seen": 179130610, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7265625, "step": 8333, "time_per_iteration": 5.56099534034729 }, { "auxiliary_loss_clip": 0.01115561, "auxiliary_loss_mlp": 0.01031357, "balance_loss_clip": 1.01839864, "balance_loss_mlp": 1.04038274, "epoch": 0.5010671877348565, "flos": 20662748542080.0, "grad_norm": 1.8872681333475765, "language_loss": 0.80685949, "learning_rate": 1.9935770314723658e-06, "loss": 0.82832867, "num_input_tokens_seen": 179147860, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75, "step": 8334, "time_per_iteration": 2.5585732460021973 }, { "auxiliary_loss_clip": 0.01047339, "auxiliary_loss_mlp": 0.0100336, "balance_loss_clip": 1.0016911, "balance_loss_mlp": 1.01456475, "epoch": 0.5011273109875244, "flos": 59128645000320.0, "grad_norm": 0.890875995192704, "language_loss": 0.62619066, "learning_rate": 1.9931992112100362e-06, "loss": 0.64669764, "num_input_tokens_seen": 179210490, "router_z_loss_clip": 0.01672363, "router_z_loss_mlp": 0.23828125, "step": 8335, "time_per_iteration": 3.1368167400360107 }, { "auxiliary_loss_clip": 0.01121163, "auxiliary_loss_mlp": 0.01030418, "balance_loss_clip": 1.01824057, "balance_loss_mlp": 1.04019713, "epoch": 0.5011874342401924, "flos": 25336342840320.0, "grad_norm": 1.563030353592734, "language_loss": 0.79605329, "learning_rate": 1.9928213911904096e-06, "loss": 0.81756908, "num_input_tokens_seen": 179231360, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.72265625, "step": 8336, "time_per_iteration": 2.5874781608581543 }, { "auxiliary_loss_clip": 0.01142392, "auxiliary_loss_mlp": 0.0103225, "balance_loss_clip": 1.01926851, "balance_loss_mlp": 1.04139864, "epoch": 0.5012475574928603, "flos": 20229953569920.0, "grad_norm": 1.7423510173736239, "language_loss": 0.80366778, "learning_rate": 1.992443571426969e-06, "loss": 0.82541418, "num_input_tokens_seen": 179250625, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.74609375, "step": 8337, "time_per_iteration": 2.606091022491455 }, { "auxiliary_loss_clip": 0.01119618, "auxiliary_loss_mlp": 0.01035353, "balance_loss_clip": 1.02195442, "balance_loss_mlp": 1.04137301, "epoch": 0.5013076807455283, "flos": 22710123855360.0, "grad_norm": 1.8740834568804303, "language_loss": 0.7934289, "learning_rate": 1.9920657519331977e-06, "loss": 0.81497848, "num_input_tokens_seen": 179267360, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.78125, "step": 8338, "time_per_iteration": 2.572575807571411 }, { "auxiliary_loss_clip": 0.01133818, "auxiliary_loss_mlp": 0.01027545, "balance_loss_clip": 1.01412201, "balance_loss_mlp": 1.04073441, "epoch": 0.5013678039981962, "flos": 24245161528320.0, "grad_norm": 1.6165966115316661, "language_loss": 0.84988236, "learning_rate": 1.9916879327225794e-06, "loss": 0.87149602, "num_input_tokens_seen": 179289810, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75390625, "step": 8339, "time_per_iteration": 2.6879663467407227 }, { "auxiliary_loss_clip": 0.01123369, "auxiliary_loss_mlp": 0.01034659, "balance_loss_clip": 1.02194548, "balance_loss_mlp": 1.04092121, "epoch": 0.5014279272508643, "flos": 26176688501760.0, "grad_norm": 1.4567269463517898, "language_loss": 0.70615631, "learning_rate": 1.991310113808597e-06, "loss": 0.72773659, "num_input_tokens_seen": 179310620, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73828125, "step": 8340, "time_per_iteration": 2.630993366241455 }, { "auxiliary_loss_clip": 0.01145568, "auxiliary_loss_mlp": 0.01034866, "balance_loss_clip": 1.02180111, "balance_loss_mlp": 1.04223359, "epoch": 0.5014880505035323, "flos": 21430446946560.0, "grad_norm": 3.822845226963931, "language_loss": 0.78021824, "learning_rate": 1.9909322952047353e-06, "loss": 0.8020227, "num_input_tokens_seen": 179329005, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.765625, "step": 8341, "time_per_iteration": 2.6664235591888428 }, { "auxiliary_loss_clip": 0.01148622, "auxiliary_loss_mlp": 0.01035082, "balance_loss_clip": 1.02103877, "balance_loss_mlp": 1.04408693, "epoch": 0.5015481737562002, "flos": 15770745596160.0, "grad_norm": 2.3636218258032673, "language_loss": 0.89393425, "learning_rate": 1.9905544769244756e-06, "loss": 0.91577131, "num_input_tokens_seen": 179343785, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 8342, "time_per_iteration": 2.7037365436553955 }, { "auxiliary_loss_clip": 0.0113195, "auxiliary_loss_mlp": 0.010328, "balance_loss_clip": 1.01960897, "balance_loss_mlp": 1.04069626, "epoch": 0.5016082970088682, "flos": 26830801123200.0, "grad_norm": 2.0342666717034086, "language_loss": 0.76210952, "learning_rate": 1.9901766589813028e-06, "loss": 0.78375703, "num_input_tokens_seen": 179364070, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 8343, "time_per_iteration": 2.701748847961426 }, { "auxiliary_loss_clip": 0.01123934, "auxiliary_loss_mlp": 0.01026662, "balance_loss_clip": 1.01486611, "balance_loss_mlp": 1.0430069, "epoch": 0.5016684202615361, "flos": 21470595373440.0, "grad_norm": 1.7715515471986847, "language_loss": 0.67390609, "learning_rate": 1.9897988413887e-06, "loss": 0.69541204, "num_input_tokens_seen": 179384225, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.72265625, "step": 8344, "time_per_iteration": 2.5543463230133057 }, { "auxiliary_loss_clip": 0.01134659, "auxiliary_loss_mlp": 0.01033571, "balance_loss_clip": 1.02087522, "balance_loss_mlp": 1.04171145, "epoch": 0.5017285435142042, "flos": 26246821806720.0, "grad_norm": 2.0594237438248495, "language_loss": 0.75873339, "learning_rate": 1.9894210241601498e-06, "loss": 0.78041565, "num_input_tokens_seen": 179402595, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.75, "step": 8345, "time_per_iteration": 2.693535566329956 }, { "auxiliary_loss_clip": 0.01127893, "auxiliary_loss_mlp": 0.01029426, "balance_loss_clip": 1.015324, "balance_loss_mlp": 1.04454398, "epoch": 0.5017886667668721, "flos": 20777555387520.0, "grad_norm": 2.3467762026471646, "language_loss": 0.78445786, "learning_rate": 1.989043207309136e-06, "loss": 0.80603111, "num_input_tokens_seen": 179419635, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 8346, "time_per_iteration": 2.543686866760254 }, { "auxiliary_loss_clip": 0.01134404, "auxiliary_loss_mlp": 0.01279736, "balance_loss_clip": 1.01954091, "balance_loss_mlp": 1.04190087, "epoch": 0.5018487900195401, "flos": 20156408472960.0, "grad_norm": 1.5478977440708666, "language_loss": 0.69459438, "learning_rate": 1.988665390849142e-06, "loss": 0.71873581, "num_input_tokens_seen": 179438770, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 8347, "time_per_iteration": 2.6202502250671387 }, { "auxiliary_loss_clip": 0.01145275, "auxiliary_loss_mlp": 0.01034528, "balance_loss_clip": 1.02130723, "balance_loss_mlp": 1.04188442, "epoch": 0.501908913272208, "flos": 18150689957760.0, "grad_norm": 2.528551435724402, "language_loss": 0.71061254, "learning_rate": 1.9882875747936518e-06, "loss": 0.73241055, "num_input_tokens_seen": 179457475, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.765625, "step": 8348, "time_per_iteration": 2.55051851272583 }, { "auxiliary_loss_clip": 0.01124424, "auxiliary_loss_mlp": 0.01033603, "balance_loss_clip": 1.0216943, "balance_loss_mlp": 1.04359448, "epoch": 0.501969036524876, "flos": 23112287504640.0, "grad_norm": 1.4591551932200697, "language_loss": 0.74194497, "learning_rate": 1.9879097591561475e-06, "loss": 0.76352525, "num_input_tokens_seen": 179478140, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.71875, "step": 8349, "time_per_iteration": 2.5855367183685303 }, { "auxiliary_loss_clip": 0.0112629, "auxiliary_loss_mlp": 0.01031288, "balance_loss_clip": 1.01751971, "balance_loss_mlp": 1.0418011, "epoch": 0.5020291597775439, "flos": 11363214314880.0, "grad_norm": 2.1141444492976933, "language_loss": 0.64092231, "learning_rate": 1.987531943950113e-06, "loss": 0.66249812, "num_input_tokens_seen": 179494325, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7578125, "step": 8350, "time_per_iteration": 2.508821487426758 }, { "auxiliary_loss_clip": 0.01067668, "auxiliary_loss_mlp": 0.01016585, "balance_loss_clip": 1.01510072, "balance_loss_mlp": 1.01705456, "epoch": 0.5020892830302119, "flos": 64011094928640.0, "grad_norm": 0.776259120231752, "language_loss": 0.59853399, "learning_rate": 1.9871541291890312e-06, "loss": 0.61937654, "num_input_tokens_seen": 179553545, "router_z_loss_clip": 0.01483154, "router_z_loss_mlp": 0.24023438, "step": 8351, "time_per_iteration": 3.346903085708618 }, { "auxiliary_loss_clip": 0.01136241, "auxiliary_loss_mlp": 0.01032551, "balance_loss_clip": 1.01908648, "balance_loss_mlp": 1.04279768, "epoch": 0.5021494062828799, "flos": 23732859801600.0, "grad_norm": 1.5684487114855938, "language_loss": 0.74878514, "learning_rate": 1.986776314886385e-06, "loss": 0.77047306, "num_input_tokens_seen": 179573645, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 8352, "time_per_iteration": 2.6557273864746094 }, { "auxiliary_loss_clip": 0.01135239, "auxiliary_loss_mlp": 0.01033303, "balance_loss_clip": 1.02004075, "balance_loss_mlp": 1.04268384, "epoch": 0.5022095295355479, "flos": 21576747041280.0, "grad_norm": 1.7031532285741966, "language_loss": 0.72000712, "learning_rate": 1.9863985010556587e-06, "loss": 0.74169254, "num_input_tokens_seen": 179591435, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 8353, "time_per_iteration": 2.560405969619751 }, { "auxiliary_loss_clip": 0.0112974, "auxiliary_loss_mlp": 0.0103297, "balance_loss_clip": 1.01888525, "balance_loss_mlp": 1.04572225, "epoch": 0.5022696527882159, "flos": 21397229844480.0, "grad_norm": 1.6620051541188796, "language_loss": 0.73891115, "learning_rate": 1.9860206877103344e-06, "loss": 0.76053822, "num_input_tokens_seen": 179609955, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 8354, "time_per_iteration": 2.5574657917022705 }, { "auxiliary_loss_clip": 0.01133396, "auxiliary_loss_mlp": 0.01039483, "balance_loss_clip": 1.02566051, "balance_loss_mlp": 1.04313922, "epoch": 0.5023297760408838, "flos": 27160712565120.0, "grad_norm": 1.6371632377958085, "language_loss": 0.72471261, "learning_rate": 1.9856428748638957e-06, "loss": 0.74644142, "num_input_tokens_seen": 179630875, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 8355, "time_per_iteration": 2.6007182598114014 }, { "auxiliary_loss_clip": 0.01128807, "auxiliary_loss_mlp": 0.01035872, "balance_loss_clip": 1.02208495, "balance_loss_mlp": 1.04290867, "epoch": 0.5023898992935518, "flos": 26213820186240.0, "grad_norm": 6.042301725120283, "language_loss": 0.81206739, "learning_rate": 1.9852650625298267e-06, "loss": 0.83371413, "num_input_tokens_seen": 179649835, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7734375, "step": 8356, "time_per_iteration": 2.710211992263794 }, { "auxiliary_loss_clip": 0.01130498, "auxiliary_loss_mlp": 0.01037464, "balance_loss_clip": 1.02213395, "balance_loss_mlp": 1.04228175, "epoch": 0.5024500225462197, "flos": 13440323111040.0, "grad_norm": 2.148026699190228, "language_loss": 0.76496303, "learning_rate": 1.984887250721609e-06, "loss": 0.78664261, "num_input_tokens_seen": 179667605, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.79296875, "step": 8357, "time_per_iteration": 2.523742437362671 }, { "auxiliary_loss_clip": 0.01126843, "auxiliary_loss_mlp": 0.0103441, "balance_loss_clip": 1.01996779, "balance_loss_mlp": 1.04256618, "epoch": 0.5025101457988878, "flos": 21579584215680.0, "grad_norm": 1.8183524116273733, "language_loss": 0.75990021, "learning_rate": 1.9845094394527267e-06, "loss": 0.78151274, "num_input_tokens_seen": 179686910, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 8358, "time_per_iteration": 2.5830016136169434 }, { "auxiliary_loss_clip": 0.01134587, "auxiliary_loss_mlp": 0.01032249, "balance_loss_clip": 1.01828945, "balance_loss_mlp": 1.04145145, "epoch": 0.5025702690515557, "flos": 24645134448000.0, "grad_norm": 2.284165514750674, "language_loss": 0.72118843, "learning_rate": 1.984131628736662e-06, "loss": 0.74285686, "num_input_tokens_seen": 179706395, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75390625, "step": 8359, "time_per_iteration": 2.590456962585449 }, { "auxiliary_loss_clip": 0.01134716, "auxiliary_loss_mlp": 0.01037933, "balance_loss_clip": 1.02501059, "balance_loss_mlp": 1.04264307, "epoch": 0.5026303923042237, "flos": 22090162089600.0, "grad_norm": 1.738962084895231, "language_loss": 0.76729381, "learning_rate": 1.9837538185868998e-06, "loss": 0.7890203, "num_input_tokens_seen": 179725735, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 8360, "time_per_iteration": 2.614617347717285 }, { "auxiliary_loss_clip": 0.01143597, "auxiliary_loss_mlp": 0.01033149, "balance_loss_clip": 1.02061391, "balance_loss_mlp": 1.04296994, "epoch": 0.5026905155568916, "flos": 23697200574720.0, "grad_norm": 1.7457366136484724, "language_loss": 0.77089, "learning_rate": 1.9833760090169216e-06, "loss": 0.79265749, "num_input_tokens_seen": 179746150, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.734375, "step": 8361, "time_per_iteration": 2.5678656101226807 }, { "auxiliary_loss_clip": 0.01145648, "auxiliary_loss_mlp": 0.01031961, "balance_loss_clip": 1.01759577, "balance_loss_mlp": 1.04241693, "epoch": 0.5027506388095596, "flos": 25812410722560.0, "grad_norm": 1.9741613979495962, "language_loss": 0.84737122, "learning_rate": 1.9829982000402105e-06, "loss": 0.8691473, "num_input_tokens_seen": 179767550, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.765625, "step": 8362, "time_per_iteration": 2.709581136703491 }, { "auxiliary_loss_clip": 0.0113331, "auxiliary_loss_mlp": 0.01030627, "balance_loss_clip": 1.01732945, "balance_loss_mlp": 1.04020417, "epoch": 0.5028107620622275, "flos": 27526606456320.0, "grad_norm": 1.4754016331880346, "language_loss": 0.78317595, "learning_rate": 1.9826203916702502e-06, "loss": 0.80481535, "num_input_tokens_seen": 179790075, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 8363, "time_per_iteration": 2.615084648132324 }, { "auxiliary_loss_clip": 0.01122039, "auxiliary_loss_mlp": 0.01030954, "balance_loss_clip": 1.01791859, "balance_loss_mlp": 1.04209661, "epoch": 0.5028708853148955, "flos": 24534278098560.0, "grad_norm": 2.5278908689061304, "language_loss": 0.76113808, "learning_rate": 1.982242583920523e-06, "loss": 0.78266799, "num_input_tokens_seen": 179806515, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 8364, "time_per_iteration": 3.8988242149353027 }, { "auxiliary_loss_clip": 0.01115054, "auxiliary_loss_mlp": 0.01029272, "balance_loss_clip": 1.0163554, "balance_loss_mlp": 1.0425843, "epoch": 0.5029310085675635, "flos": 20813609664000.0, "grad_norm": 1.780960820628529, "language_loss": 0.6946106, "learning_rate": 1.9818647768045137e-06, "loss": 0.71605384, "num_input_tokens_seen": 179826450, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 8365, "time_per_iteration": 2.5594496726989746 }, { "auxiliary_loss_clip": 0.01136451, "auxiliary_loss_mlp": 0.01033307, "balance_loss_clip": 1.01902533, "balance_loss_mlp": 1.04160953, "epoch": 0.5029911318202315, "flos": 22342470197760.0, "grad_norm": 1.5108516857633638, "language_loss": 0.72904146, "learning_rate": 1.981486970335703e-06, "loss": 0.75073898, "num_input_tokens_seen": 179846770, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 8366, "time_per_iteration": 2.577108383178711 }, { "auxiliary_loss_clip": 0.0112571, "auxiliary_loss_mlp": 0.01032096, "balance_loss_clip": 1.01948929, "balance_loss_mlp": 1.04373431, "epoch": 0.5030512550728995, "flos": 24352713826560.0, "grad_norm": 1.7582774240122374, "language_loss": 0.78361362, "learning_rate": 1.9811091645275742e-06, "loss": 0.80519164, "num_input_tokens_seen": 179866585, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.73046875, "step": 8367, "time_per_iteration": 2.606233596801758 }, { "auxiliary_loss_clip": 0.01123879, "auxiliary_loss_mlp": 0.01026957, "balance_loss_clip": 1.01470244, "balance_loss_mlp": 1.04104745, "epoch": 0.5031113783255674, "flos": 18259930195200.0, "grad_norm": 1.787255365288227, "language_loss": 0.69737482, "learning_rate": 1.9807313593936114e-06, "loss": 0.71888316, "num_input_tokens_seen": 179885575, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.73828125, "step": 8368, "time_per_iteration": 3.9317686557769775 }, { "auxiliary_loss_clip": 0.01049545, "auxiliary_loss_mlp": 0.0100468, "balance_loss_clip": 1.00310636, "balance_loss_mlp": 1.01657891, "epoch": 0.5031715015782354, "flos": 57253173200640.0, "grad_norm": 0.9110969241561374, "language_loss": 0.6331079, "learning_rate": 1.9803535549472962e-06, "loss": 0.65365016, "num_input_tokens_seen": 179939650, "router_z_loss_clip": 0.01574707, "router_z_loss_mlp": 0.23828125, "step": 8369, "time_per_iteration": 3.1205661296844482 }, { "auxiliary_loss_clip": 0.01123675, "auxiliary_loss_mlp": 0.01031068, "balance_loss_clip": 1.01835406, "balance_loss_mlp": 1.04178691, "epoch": 0.5032316248309033, "flos": 27527360641920.0, "grad_norm": 1.9808988414432789, "language_loss": 0.60602075, "learning_rate": 1.9799757512021126e-06, "loss": 0.62756819, "num_input_tokens_seen": 179961765, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 8370, "time_per_iteration": 2.6868643760681152 }, { "auxiliary_loss_clip": 0.01154689, "auxiliary_loss_mlp": 0.0103595, "balance_loss_clip": 1.02169895, "balance_loss_mlp": 1.04289889, "epoch": 0.5032917480835714, "flos": 34495825939200.0, "grad_norm": 2.4210657703683323, "language_loss": 0.67950779, "learning_rate": 1.9795979481715426e-06, "loss": 0.70141417, "num_input_tokens_seen": 179983015, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76171875, "step": 8371, "time_per_iteration": 2.744060516357422 }, { "auxiliary_loss_clip": 0.01136386, "auxiliary_loss_mlp": 0.01029177, "balance_loss_clip": 1.01497328, "balance_loss_mlp": 1.04328895, "epoch": 0.5033518713362393, "flos": 33656773167360.0, "grad_norm": 2.3356283624314567, "language_loss": 0.67296875, "learning_rate": 1.9792201458690695e-06, "loss": 0.69462442, "num_input_tokens_seen": 180003210, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75, "step": 8372, "time_per_iteration": 2.7964000701904297 }, { "auxiliary_loss_clip": 0.01142406, "auxiliary_loss_mlp": 0.01033565, "balance_loss_clip": 1.01901579, "balance_loss_mlp": 1.04215789, "epoch": 0.5034119945889073, "flos": 28185495586560.0, "grad_norm": 2.031103836862598, "language_loss": 0.6652509, "learning_rate": 1.978842344308176e-06, "loss": 0.68701065, "num_input_tokens_seen": 180025530, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.73828125, "step": 8373, "time_per_iteration": 2.65716814994812 }, { "auxiliary_loss_clip": 0.01130787, "auxiliary_loss_mlp": 0.0102946, "balance_loss_clip": 1.01654387, "balance_loss_mlp": 1.04030144, "epoch": 0.5034721178415752, "flos": 21358697529600.0, "grad_norm": 2.3394095992417707, "language_loss": 0.74577272, "learning_rate": 1.9784645435023443e-06, "loss": 0.76737523, "num_input_tokens_seen": 180043180, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 8374, "time_per_iteration": 4.117786884307861 }, { "auxiliary_loss_clip": 0.01135366, "auxiliary_loss_mlp": 0.01033582, "balance_loss_clip": 1.01969361, "balance_loss_mlp": 1.04245758, "epoch": 0.5035322410942432, "flos": 22674823764480.0, "grad_norm": 1.6235029803658345, "language_loss": 0.68060517, "learning_rate": 1.9780867434650584e-06, "loss": 0.70229459, "num_input_tokens_seen": 180062905, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 8375, "time_per_iteration": 4.020368337631226 }, { "auxiliary_loss_clip": 0.01159032, "auxiliary_loss_mlp": 0.01031738, "balance_loss_clip": 1.01730824, "balance_loss_mlp": 1.04491639, "epoch": 0.5035923643469111, "flos": 19828723674240.0, "grad_norm": 1.7980738155738056, "language_loss": 0.78468162, "learning_rate": 1.9777089442098e-06, "loss": 0.80658931, "num_input_tokens_seen": 180082000, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 8376, "time_per_iteration": 2.5262157917022705 }, { "auxiliary_loss_clip": 0.01128598, "auxiliary_loss_mlp": 0.01282017, "balance_loss_clip": 1.02141714, "balance_loss_mlp": 1.04184055, "epoch": 0.5036524875995791, "flos": 30514625182080.0, "grad_norm": 2.263512111789978, "language_loss": 0.5993855, "learning_rate": 1.977331145750052e-06, "loss": 0.62349164, "num_input_tokens_seen": 180101340, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 8377, "time_per_iteration": 2.631782293319702 }, { "auxiliary_loss_clip": 0.01133214, "auxiliary_loss_mlp": 0.01029739, "balance_loss_clip": 1.01604199, "balance_loss_mlp": 1.04127598, "epoch": 0.5037126108522471, "flos": 14720574637440.0, "grad_norm": 1.870830444450135, "language_loss": 0.75982821, "learning_rate": 1.976953348099297e-06, "loss": 0.78145766, "num_input_tokens_seen": 180119160, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 8378, "time_per_iteration": 2.5438642501831055 }, { "auxiliary_loss_clip": 0.01139439, "auxiliary_loss_mlp": 0.01032677, "balance_loss_clip": 1.02009475, "balance_loss_mlp": 1.03981388, "epoch": 0.5037727341049151, "flos": 25297702784640.0, "grad_norm": 1.7514992353221974, "language_loss": 0.74582291, "learning_rate": 1.9765755512710173e-06, "loss": 0.76754403, "num_input_tokens_seen": 180138730, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.734375, "step": 8379, "time_per_iteration": 2.5841610431671143 }, { "auxiliary_loss_clip": 0.01124218, "auxiliary_loss_mlp": 0.01032396, "balance_loss_clip": 1.01932526, "balance_loss_mlp": 1.04360354, "epoch": 0.5038328573575831, "flos": 28541764632960.0, "grad_norm": 2.424703743996612, "language_loss": 0.66779774, "learning_rate": 1.9761977552786974e-06, "loss": 0.68936396, "num_input_tokens_seen": 180158810, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 8380, "time_per_iteration": 2.6019067764282227 }, { "auxiliary_loss_clip": 0.01149151, "auxiliary_loss_mlp": 0.01032371, "balance_loss_clip": 1.02025974, "balance_loss_mlp": 1.04155183, "epoch": 0.503892980610251, "flos": 31649869503360.0, "grad_norm": 1.924681536841426, "language_loss": 0.63286328, "learning_rate": 1.975819960135817e-06, "loss": 0.65467846, "num_input_tokens_seen": 180179700, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71875, "step": 8381, "time_per_iteration": 2.6475157737731934 }, { "auxiliary_loss_clip": 0.01129294, "auxiliary_loss_mlp": 0.01036423, "balance_loss_clip": 1.02375126, "balance_loss_mlp": 1.03913307, "epoch": 0.503953103862919, "flos": 27089358197760.0, "grad_norm": 1.6289789981689047, "language_loss": 0.67458874, "learning_rate": 1.9754421658558604e-06, "loss": 0.69624591, "num_input_tokens_seen": 180199890, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 8382, "time_per_iteration": 2.6503946781158447 }, { "auxiliary_loss_clip": 0.01141458, "auxiliary_loss_mlp": 0.01039517, "balance_loss_clip": 1.02547467, "balance_loss_mlp": 1.04393172, "epoch": 0.5040132271155869, "flos": 15632957024640.0, "grad_norm": 1.6904919965540894, "language_loss": 0.62319851, "learning_rate": 1.97506437245231e-06, "loss": 0.64500833, "num_input_tokens_seen": 180217840, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 8383, "time_per_iteration": 2.5959057807922363 }, { "auxiliary_loss_clip": 0.01139098, "auxiliary_loss_mlp": 0.01031326, "balance_loss_clip": 1.01891041, "balance_loss_mlp": 1.04117799, "epoch": 0.504073350368255, "flos": 13590106824960.0, "grad_norm": 2.302415450761371, "language_loss": 0.67372727, "learning_rate": 1.9746865799386476e-06, "loss": 0.69543159, "num_input_tokens_seen": 180236465, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 8384, "time_per_iteration": 2.675732135772705 }, { "auxiliary_loss_clip": 0.01123297, "auxiliary_loss_mlp": 0.01036007, "balance_loss_clip": 1.02385378, "balance_loss_mlp": 1.04051149, "epoch": 0.5041334736209229, "flos": 29058160510080.0, "grad_norm": 1.7768517591569386, "language_loss": 0.70914793, "learning_rate": 1.974308788328356e-06, "loss": 0.73074096, "num_input_tokens_seen": 180258025, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.734375, "step": 8385, "time_per_iteration": 2.6355955600738525 }, { "auxiliary_loss_clip": 0.01118368, "auxiliary_loss_mlp": 0.01028419, "balance_loss_clip": 1.01653981, "balance_loss_mlp": 1.03980553, "epoch": 0.5041935968735909, "flos": 24608361899520.0, "grad_norm": 1.767053826928155, "language_loss": 0.83110571, "learning_rate": 1.973930997634918e-06, "loss": 0.85257351, "num_input_tokens_seen": 180277825, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 8386, "time_per_iteration": 2.6124002933502197 }, { "auxiliary_loss_clip": 0.01123877, "auxiliary_loss_mlp": 0.01033475, "balance_loss_clip": 1.02018881, "balance_loss_mlp": 1.04084146, "epoch": 0.5042537201262588, "flos": 26286934320000.0, "grad_norm": 1.4248498185056644, "language_loss": 0.66411769, "learning_rate": 1.9735532078718157e-06, "loss": 0.68569124, "num_input_tokens_seen": 180300465, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 8387, "time_per_iteration": 2.582348108291626 }, { "auxiliary_loss_clip": 0.0115909, "auxiliary_loss_mlp": 0.0103095, "balance_loss_clip": 1.01868963, "balance_loss_mlp": 1.04271865, "epoch": 0.5043138433789268, "flos": 22017371178240.0, "grad_norm": 1.4107185334453605, "language_loss": 0.80223215, "learning_rate": 1.973175419052531e-06, "loss": 0.82413256, "num_input_tokens_seen": 180321050, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.72265625, "step": 8388, "time_per_iteration": 2.6300292015075684 }, { "auxiliary_loss_clip": 0.01122422, "auxiliary_loss_mlp": 0.01031722, "balance_loss_clip": 1.01927018, "balance_loss_mlp": 1.04045975, "epoch": 0.5043739666315947, "flos": 28767104605440.0, "grad_norm": 2.390321442814674, "language_loss": 0.69606072, "learning_rate": 1.972797631190547e-06, "loss": 0.71760219, "num_input_tokens_seen": 180338870, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.734375, "step": 8389, "time_per_iteration": 2.566222667694092 }, { "auxiliary_loss_clip": 0.01134006, "auxiliary_loss_mlp": 0.01037642, "balance_loss_clip": 1.02545273, "balance_loss_mlp": 1.04088461, "epoch": 0.5044340898842627, "flos": 27599253713280.0, "grad_norm": 1.903488610103705, "language_loss": 0.69897008, "learning_rate": 1.972419844299345e-06, "loss": 0.72068661, "num_input_tokens_seen": 180361285, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.75, "step": 8390, "time_per_iteration": 2.671956777572632 }, { "auxiliary_loss_clip": 0.01056598, "auxiliary_loss_mlp": 0.01002734, "balance_loss_clip": 1.0012145, "balance_loss_mlp": 1.01473141, "epoch": 0.5044942131369307, "flos": 67458050749440.0, "grad_norm": 0.8294886300033774, "language_loss": 0.5300383, "learning_rate": 1.972042058392408e-06, "loss": 0.5506317, "num_input_tokens_seen": 180415170, "router_z_loss_clip": 0.01519775, "router_z_loss_mlp": 0.23925781, "step": 8391, "time_per_iteration": 2.9859259128570557 }, { "auxiliary_loss_clip": 0.01065097, "auxiliary_loss_mlp": 0.01003572, "balance_loss_clip": 1.00202262, "balance_loss_mlp": 1.0141753, "epoch": 0.5045543363895987, "flos": 58630849390080.0, "grad_norm": 0.8663333184972195, "language_loss": 0.60736704, "learning_rate": 1.9716642734832183e-06, "loss": 0.62805378, "num_input_tokens_seen": 180468060, "router_z_loss_clip": 0.01550293, "router_z_loss_mlp": 0.23925781, "step": 8392, "time_per_iteration": 2.9876534938812256 }, { "auxiliary_loss_clip": 0.0113845, "auxiliary_loss_mlp": 0.01030565, "balance_loss_clip": 1.01863813, "balance_loss_mlp": 1.04115736, "epoch": 0.5046144596422667, "flos": 22526620248960.0, "grad_norm": 1.7477009816028306, "language_loss": 0.84441203, "learning_rate": 1.971286489585258e-06, "loss": 0.86610222, "num_input_tokens_seen": 180486610, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 8393, "time_per_iteration": 2.6107683181762695 }, { "auxiliary_loss_clip": 0.0114041, "auxiliary_loss_mlp": 0.01028495, "balance_loss_clip": 1.01659214, "balance_loss_mlp": 1.04128015, "epoch": 0.5046745828949346, "flos": 27454246508160.0, "grad_norm": 2.651945973719794, "language_loss": 0.50438476, "learning_rate": 1.9709087067120084e-06, "loss": 0.52607375, "num_input_tokens_seen": 180508135, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7265625, "step": 8394, "time_per_iteration": 2.631739616394043 }, { "auxiliary_loss_clip": 0.01112631, "auxiliary_loss_mlp": 0.01031309, "balance_loss_clip": 1.01771355, "balance_loss_mlp": 1.03957665, "epoch": 0.5047347061476026, "flos": 17274541415040.0, "grad_norm": 1.5036214541204511, "language_loss": 0.75045311, "learning_rate": 1.970530924876953e-06, "loss": 0.77189255, "num_input_tokens_seen": 180527000, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 8395, "time_per_iteration": 2.6619045734405518 }, { "auxiliary_loss_clip": 0.0112136, "auxiliary_loss_mlp": 0.01031614, "balance_loss_clip": 1.01940715, "balance_loss_mlp": 1.03991473, "epoch": 0.5047948294002705, "flos": 16649515831680.0, "grad_norm": 1.8207549162100682, "language_loss": 0.68003172, "learning_rate": 1.9701531440935726e-06, "loss": 0.70156145, "num_input_tokens_seen": 180544715, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.72265625, "step": 8396, "time_per_iteration": 2.5595507621765137 }, { "auxiliary_loss_clip": 0.01124117, "auxiliary_loss_mlp": 0.01029262, "balance_loss_clip": 1.01719236, "balance_loss_mlp": 1.04280424, "epoch": 0.5048549526529386, "flos": 26865706164480.0, "grad_norm": 1.5705589817973709, "language_loss": 0.78710634, "learning_rate": 1.9697753643753497e-06, "loss": 0.80864012, "num_input_tokens_seen": 180565365, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.7265625, "step": 8397, "time_per_iteration": 2.6985552310943604 }, { "auxiliary_loss_clip": 0.01148014, "auxiliary_loss_mlp": 0.01029974, "balance_loss_clip": 1.01795793, "balance_loss_mlp": 1.04062235, "epoch": 0.5049150759056065, "flos": 21833939399040.0, "grad_norm": 1.4265294109241453, "language_loss": 0.66218454, "learning_rate": 1.9693975857357665e-06, "loss": 0.68396437, "num_input_tokens_seen": 180586670, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.71875, "step": 8398, "time_per_iteration": 2.621772289276123 }, { "auxiliary_loss_clip": 0.01122018, "auxiliary_loss_mlp": 0.01275673, "balance_loss_clip": 1.0163722, "balance_loss_mlp": 1.04068232, "epoch": 0.5049751991582745, "flos": 21685807710720.0, "grad_norm": 1.7647044462908714, "language_loss": 0.71182179, "learning_rate": 1.9690198081883043e-06, "loss": 0.73579872, "num_input_tokens_seen": 180605085, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 8399, "time_per_iteration": 2.665038585662842 }, { "auxiliary_loss_clip": 0.01135587, "auxiliary_loss_mlp": 0.01038772, "balance_loss_clip": 1.02624273, "balance_loss_mlp": 1.0430541, "epoch": 0.5050353224109424, "flos": 21359379888000.0, "grad_norm": 1.697777595322651, "language_loss": 0.81387526, "learning_rate": 1.968642031746446e-06, "loss": 0.83561885, "num_input_tokens_seen": 180624370, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7421875, "step": 8400, "time_per_iteration": 2.5678062438964844 }, { "auxiliary_loss_clip": 0.01142304, "auxiliary_loss_mlp": 0.01029565, "balance_loss_clip": 1.01655304, "balance_loss_mlp": 1.04307377, "epoch": 0.5050954456636104, "flos": 22820082364800.0, "grad_norm": 1.9973645027822116, "language_loss": 0.78740108, "learning_rate": 1.9682642564236725e-06, "loss": 0.8091197, "num_input_tokens_seen": 180642450, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 8401, "time_per_iteration": 2.6402359008789062 }, { "auxiliary_loss_clip": 0.01115574, "auxiliary_loss_mlp": 0.01281183, "balance_loss_clip": 1.02226162, "balance_loss_mlp": 1.04059315, "epoch": 0.5051555689162783, "flos": 30448226891520.0, "grad_norm": 1.6665724276110345, "language_loss": 0.69912815, "learning_rate": 1.967886482233466e-06, "loss": 0.72309572, "num_input_tokens_seen": 180665250, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.75, "step": 8402, "time_per_iteration": 2.6358675956726074 }, { "auxiliary_loss_clip": 0.01121558, "auxiliary_loss_mlp": 0.01274299, "balance_loss_clip": 1.015131, "balance_loss_mlp": 1.03957307, "epoch": 0.5052156921689464, "flos": 21287953693440.0, "grad_norm": 1.6100154215791356, "language_loss": 0.69385552, "learning_rate": 1.9675087091893084e-06, "loss": 0.71781415, "num_input_tokens_seen": 180687425, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.734375, "step": 8403, "time_per_iteration": 2.605624198913574 }, { "auxiliary_loss_clip": 0.01119496, "auxiliary_loss_mlp": 0.0103865, "balance_loss_clip": 1.02671754, "balance_loss_mlp": 1.04114151, "epoch": 0.5052758154216143, "flos": 25081305298560.0, "grad_norm": 1.4585113255067, "language_loss": 0.85484111, "learning_rate": 1.9671309373046816e-06, "loss": 0.87642252, "num_input_tokens_seen": 180708725, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 8404, "time_per_iteration": 2.6101725101470947 }, { "auxiliary_loss_clip": 0.01140436, "auxiliary_loss_mlp": 0.01282626, "balance_loss_clip": 1.02359366, "balance_loss_mlp": 1.04181385, "epoch": 0.5053359386742823, "flos": 20885502735360.0, "grad_norm": 1.6251813229770196, "language_loss": 0.75456488, "learning_rate": 1.9667531665930676e-06, "loss": 0.77879548, "num_input_tokens_seen": 180727990, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.72265625, "step": 8405, "time_per_iteration": 3.991929054260254 }, { "auxiliary_loss_clip": 0.01115824, "auxiliary_loss_mlp": 0.01028451, "balance_loss_clip": 1.01512933, "balance_loss_mlp": 1.04107165, "epoch": 0.5053960619269503, "flos": 37743335493120.0, "grad_norm": 1.6296285096782899, "language_loss": 0.73308837, "learning_rate": 1.966375397067947e-06, "loss": 0.75453109, "num_input_tokens_seen": 180749765, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 8406, "time_per_iteration": 2.722977638244629 }, { "auxiliary_loss_clip": 0.01136107, "auxiliary_loss_mlp": 0.01031732, "balance_loss_clip": 1.01970363, "balance_loss_mlp": 1.03924489, "epoch": 0.5054561851796182, "flos": 23513840622720.0, "grad_norm": 1.7022456974222249, "language_loss": 0.76576626, "learning_rate": 1.965997628742802e-06, "loss": 0.78744465, "num_input_tokens_seen": 180769580, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69921875, "step": 8407, "time_per_iteration": 2.5590734481811523 }, { "auxiliary_loss_clip": 0.01142537, "auxiliary_loss_mlp": 0.01031553, "balance_loss_clip": 1.01957822, "balance_loss_mlp": 1.04027879, "epoch": 0.5055163084322862, "flos": 30410233280640.0, "grad_norm": 1.814391820414002, "language_loss": 0.63377607, "learning_rate": 1.965619861631114e-06, "loss": 0.65551698, "num_input_tokens_seen": 180790295, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.75, "step": 8408, "time_per_iteration": 2.6404550075531006 }, { "auxiliary_loss_clip": 0.01133766, "auxiliary_loss_mlp": 0.01031556, "balance_loss_clip": 1.01869345, "balance_loss_mlp": 1.04291272, "epoch": 0.5055764316849541, "flos": 20259651139200.0, "grad_norm": 1.603711994910738, "language_loss": 0.63444763, "learning_rate": 1.9652420957463645e-06, "loss": 0.65610087, "num_input_tokens_seen": 180807875, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 8409, "time_per_iteration": 2.5280115604400635 }, { "auxiliary_loss_clip": 0.01130262, "auxiliary_loss_mlp": 0.01026498, "balance_loss_clip": 1.01534033, "balance_loss_mlp": 1.04241848, "epoch": 0.5056365549376222, "flos": 26070895969920.0, "grad_norm": 1.3093530376616787, "language_loss": 0.70844162, "learning_rate": 1.9648643311020365e-06, "loss": 0.7300092, "num_input_tokens_seen": 180831300, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.703125, "step": 8410, "time_per_iteration": 3.9831533432006836 }, { "auxiliary_loss_clip": 0.01132949, "auxiliary_loss_mlp": 0.01038269, "balance_loss_clip": 1.02523971, "balance_loss_mlp": 1.04178023, "epoch": 0.5056966781902901, "flos": 19279074781440.0, "grad_norm": 1.5317587186871926, "language_loss": 0.79673493, "learning_rate": 1.9644865677116086e-06, "loss": 0.81844711, "num_input_tokens_seen": 180849055, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 8411, "time_per_iteration": 2.6095471382141113 }, { "auxiliary_loss_clip": 0.01115031, "auxiliary_loss_mlp": 0.01035923, "balance_loss_clip": 1.02188611, "balance_loss_mlp": 1.0407697, "epoch": 0.5057568014429581, "flos": 21323325611520.0, "grad_norm": 1.7649904325701975, "language_loss": 0.81889743, "learning_rate": 1.9641088055885647e-06, "loss": 0.84040695, "num_input_tokens_seen": 180867395, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 8412, "time_per_iteration": 2.569363832473755 }, { "auxiliary_loss_clip": 0.01133492, "auxiliary_loss_mlp": 0.01035316, "balance_loss_clip": 1.02268577, "balance_loss_mlp": 1.04318321, "epoch": 0.505816924695626, "flos": 17493596507520.0, "grad_norm": 2.7055606289853587, "language_loss": 0.80641961, "learning_rate": 1.9637310447463846e-06, "loss": 0.82810766, "num_input_tokens_seen": 180886670, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 8413, "time_per_iteration": 2.6210427284240723 }, { "auxiliary_loss_clip": 0.01144149, "auxiliary_loss_mlp": 0.01285279, "balance_loss_clip": 1.02524805, "balance_loss_mlp": 1.04297149, "epoch": 0.505877047948294, "flos": 21142084561920.0, "grad_norm": 1.8651519221749007, "language_loss": 0.80507588, "learning_rate": 1.9633532851985504e-06, "loss": 0.82937014, "num_input_tokens_seen": 180904645, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.74609375, "step": 8414, "time_per_iteration": 2.5781564712524414 }, { "auxiliary_loss_clip": 0.01121898, "auxiliary_loss_mlp": 0.01031736, "balance_loss_clip": 1.01884305, "balance_loss_mlp": 1.03857648, "epoch": 0.5059371712009619, "flos": 36350036887680.0, "grad_norm": 5.038293178328145, "language_loss": 0.62065619, "learning_rate": 1.9629755269585436e-06, "loss": 0.64219254, "num_input_tokens_seen": 180922340, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 8415, "time_per_iteration": 2.7285895347595215 }, { "auxiliary_loss_clip": 0.01139377, "auxiliary_loss_mlp": 0.01029503, "balance_loss_clip": 1.0173676, "balance_loss_mlp": 1.04056394, "epoch": 0.50599729445363, "flos": 22673387220480.0, "grad_norm": 1.668765420303594, "language_loss": 0.81596684, "learning_rate": 1.9625977700398442e-06, "loss": 0.83765566, "num_input_tokens_seen": 180941350, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7265625, "step": 8416, "time_per_iteration": 4.154823541641235 }, { "auxiliary_loss_clip": 0.01139421, "auxiliary_loss_mlp": 0.01032475, "balance_loss_clip": 1.02060747, "balance_loss_mlp": 1.04262137, "epoch": 0.5060574177062979, "flos": 22747866071040.0, "grad_norm": 1.6819767497982963, "language_loss": 0.79405475, "learning_rate": 1.962220014455935e-06, "loss": 0.81577373, "num_input_tokens_seen": 180960720, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.703125, "step": 8417, "time_per_iteration": 4.833225727081299 }, { "auxiliary_loss_clip": 0.01147194, "auxiliary_loss_mlp": 0.01031902, "balance_loss_clip": 1.019665, "balance_loss_mlp": 1.04182649, "epoch": 0.5061175409589659, "flos": 21653201139840.0, "grad_norm": 2.3066473948269084, "language_loss": 0.62827975, "learning_rate": 1.9618422602202955e-06, "loss": 0.65007073, "num_input_tokens_seen": 180979725, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69921875, "step": 8418, "time_per_iteration": 2.676266670227051 }, { "auxiliary_loss_clip": 0.01141934, "auxiliary_loss_mlp": 0.01031262, "balance_loss_clip": 1.01884019, "balance_loss_mlp": 1.04116225, "epoch": 0.5061776642116339, "flos": 21616249023360.0, "grad_norm": 1.74852211146092, "language_loss": 0.77410907, "learning_rate": 1.9614645073464084e-06, "loss": 0.79584098, "num_input_tokens_seen": 180998980, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7421875, "step": 8419, "time_per_iteration": 2.5931787490844727 }, { "auxiliary_loss_clip": 0.01159352, "auxiliary_loss_mlp": 0.01032176, "balance_loss_clip": 1.01914072, "balance_loss_mlp": 1.0402478, "epoch": 0.5062377874643018, "flos": 24426294837120.0, "grad_norm": 1.8835542668783125, "language_loss": 0.76841486, "learning_rate": 1.9610867558477534e-06, "loss": 0.79033011, "num_input_tokens_seen": 181019165, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 8420, "time_per_iteration": 2.7493484020233154 }, { "auxiliary_loss_clip": 0.01113926, "auxiliary_loss_mlp": 0.01036286, "balance_loss_clip": 1.02314937, "balance_loss_mlp": 1.04041266, "epoch": 0.5062979107169698, "flos": 22524429519360.0, "grad_norm": 1.6795331322650737, "language_loss": 0.77467209, "learning_rate": 1.960709005737812e-06, "loss": 0.79617423, "num_input_tokens_seen": 181037110, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 8421, "time_per_iteration": 2.5952000617980957 }, { "auxiliary_loss_clip": 0.01120975, "auxiliary_loss_mlp": 0.01028765, "balance_loss_clip": 1.01690936, "balance_loss_mlp": 1.0395844, "epoch": 0.5063580339696377, "flos": 24571984400640.0, "grad_norm": 1.4342198887761048, "language_loss": 0.6658709, "learning_rate": 1.9603312570300653e-06, "loss": 0.68736839, "num_input_tokens_seen": 181057775, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.72265625, "step": 8422, "time_per_iteration": 2.68121075630188 }, { "auxiliary_loss_clip": 0.01113378, "auxiliary_loss_mlp": 0.0103317, "balance_loss_clip": 1.02084994, "balance_loss_mlp": 1.04174876, "epoch": 0.5064181572223058, "flos": 22596143022720.0, "grad_norm": 1.5757305787206584, "language_loss": 0.81700599, "learning_rate": 1.9599535097379936e-06, "loss": 0.83847147, "num_input_tokens_seen": 181078260, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71875, "step": 8423, "time_per_iteration": 2.621816873550415 }, { "auxiliary_loss_clip": 0.0111549, "auxiliary_loss_mlp": 0.01283091, "balance_loss_clip": 1.02282858, "balance_loss_mlp": 1.04089773, "epoch": 0.5064782804749737, "flos": 25994944661760.0, "grad_norm": 2.324142710750866, "language_loss": 0.74675703, "learning_rate": 1.9595757638750787e-06, "loss": 0.77074277, "num_input_tokens_seen": 181098755, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.74609375, "step": 8424, "time_per_iteration": 2.6020352840423584 }, { "auxiliary_loss_clip": 0.01111617, "auxiliary_loss_mlp": 0.01035151, "balance_loss_clip": 1.02218699, "balance_loss_mlp": 1.04066277, "epoch": 0.5065384037276417, "flos": 28553041503360.0, "grad_norm": 1.3292994390348483, "language_loss": 0.71183342, "learning_rate": 1.9591980194548007e-06, "loss": 0.73330104, "num_input_tokens_seen": 181121570, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 8425, "time_per_iteration": 2.6420397758483887 }, { "auxiliary_loss_clip": 0.0114037, "auxiliary_loss_mlp": 0.01031399, "balance_loss_clip": 1.01921558, "balance_loss_mlp": 1.04048657, "epoch": 0.5065985269803096, "flos": 22966023323520.0, "grad_norm": 1.7717152441239374, "language_loss": 0.78705966, "learning_rate": 1.9588202764906405e-06, "loss": 0.80877733, "num_input_tokens_seen": 181140240, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.734375, "step": 8426, "time_per_iteration": 2.6152379512786865 }, { "auxiliary_loss_clip": 0.01126303, "auxiliary_loss_mlp": 0.01033812, "balance_loss_clip": 1.02102637, "balance_loss_mlp": 1.04309082, "epoch": 0.5066586502329776, "flos": 21608563512960.0, "grad_norm": 2.2317006602753904, "language_loss": 0.77787161, "learning_rate": 1.9584425349960787e-06, "loss": 0.79947275, "num_input_tokens_seen": 181158630, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7421875, "step": 8427, "time_per_iteration": 2.575409412384033 }, { "auxiliary_loss_clip": 0.01111205, "auxiliary_loss_mlp": 0.01026989, "balance_loss_clip": 1.01438284, "balance_loss_mlp": 1.04002213, "epoch": 0.5067187734856455, "flos": 20339912079360.0, "grad_norm": 1.8828951876246982, "language_loss": 0.71737039, "learning_rate": 1.9580647949845953e-06, "loss": 0.73875237, "num_input_tokens_seen": 181176405, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 8428, "time_per_iteration": 2.5556068420410156 }, { "auxiliary_loss_clip": 0.01121733, "auxiliary_loss_mlp": 0.01278868, "balance_loss_clip": 1.02021682, "balance_loss_mlp": 1.03985786, "epoch": 0.5067788967383136, "flos": 28841080665600.0, "grad_norm": 2.0161713513356463, "language_loss": 0.83169806, "learning_rate": 1.957687056469672e-06, "loss": 0.85570407, "num_input_tokens_seen": 181197595, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.7265625, "step": 8429, "time_per_iteration": 2.693744421005249 }, { "auxiliary_loss_clip": 0.01144627, "auxiliary_loss_mlp": 0.0103761, "balance_loss_clip": 1.02378154, "balance_loss_mlp": 1.04180241, "epoch": 0.5068390199909815, "flos": 32450174478720.0, "grad_norm": 1.9138048711395883, "language_loss": 0.73309839, "learning_rate": 1.957309319464789e-06, "loss": 0.75492072, "num_input_tokens_seen": 181218560, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 8430, "time_per_iteration": 2.6819262504577637 }, { "auxiliary_loss_clip": 0.01159785, "auxiliary_loss_mlp": 0.01034492, "balance_loss_clip": 1.02220774, "balance_loss_mlp": 1.04140997, "epoch": 0.5068991432436495, "flos": 23146582014720.0, "grad_norm": 1.6293680860422046, "language_loss": 0.76508355, "learning_rate": 1.956931583983426e-06, "loss": 0.78702629, "num_input_tokens_seen": 181237095, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.734375, "step": 8431, "time_per_iteration": 2.7300868034362793 }, { "auxiliary_loss_clip": 0.01122198, "auxiliary_loss_mlp": 0.01029041, "balance_loss_clip": 1.0165534, "balance_loss_mlp": 1.04124963, "epoch": 0.5069592664963174, "flos": 19936096404480.0, "grad_norm": 1.5174051527321357, "language_loss": 0.72101617, "learning_rate": 1.9565538500390644e-06, "loss": 0.74252856, "num_input_tokens_seen": 181255940, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.71875, "step": 8432, "time_per_iteration": 2.573490858078003 }, { "auxiliary_loss_clip": 0.01075004, "auxiliary_loss_mlp": 0.01007295, "balance_loss_clip": 1.00573337, "balance_loss_mlp": 1.01575029, "epoch": 0.5070193897489854, "flos": 65793771941760.0, "grad_norm": 0.7550625348734635, "language_loss": 0.63632911, "learning_rate": 1.956176117645184e-06, "loss": 0.65715206, "num_input_tokens_seen": 181316945, "router_z_loss_clip": 0.015625, "router_z_loss_mlp": 0.23828125, "step": 8433, "time_per_iteration": 3.2168471813201904 }, { "auxiliary_loss_clip": 0.01139353, "auxiliary_loss_mlp": 0.01038519, "balance_loss_clip": 1.02579308, "balance_loss_mlp": 1.04020357, "epoch": 0.5070795130016534, "flos": 17275331514240.0, "grad_norm": 1.8626385750446492, "language_loss": 0.77643275, "learning_rate": 1.9557983868152652e-06, "loss": 0.79821146, "num_input_tokens_seen": 181335555, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73046875, "step": 8434, "time_per_iteration": 2.6484265327453613 }, { "auxiliary_loss_clip": 0.01132392, "auxiliary_loss_mlp": 0.01031687, "balance_loss_clip": 1.01930082, "balance_loss_mlp": 1.03988743, "epoch": 0.5071396362543213, "flos": 21069940095360.0, "grad_norm": 1.6741929872370425, "language_loss": 0.70740992, "learning_rate": 1.955420657562788e-06, "loss": 0.72905064, "num_input_tokens_seen": 181354580, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.75, "step": 8435, "time_per_iteration": 2.6342968940734863 }, { "auxiliary_loss_clip": 0.01125176, "auxiliary_loss_mlp": 0.01030968, "balance_loss_clip": 1.01751506, "balance_loss_mlp": 1.0424037, "epoch": 0.5071997595069894, "flos": 23144822248320.0, "grad_norm": 2.6768518214341186, "language_loss": 0.72368455, "learning_rate": 1.9550429299012334e-06, "loss": 0.74524593, "num_input_tokens_seen": 181374320, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73828125, "step": 8436, "time_per_iteration": 2.605581283569336 }, { "auxiliary_loss_clip": 0.01114632, "auxiliary_loss_mlp": 0.01030847, "balance_loss_clip": 1.01670873, "balance_loss_mlp": 1.04100811, "epoch": 0.5072598827596573, "flos": 22747183712640.0, "grad_norm": 1.649826871260177, "language_loss": 0.83864093, "learning_rate": 1.954665203844081e-06, "loss": 0.86009574, "num_input_tokens_seen": 181392190, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.734375, "step": 8437, "time_per_iteration": 2.568901538848877 }, { "auxiliary_loss_clip": 0.01121296, "auxiliary_loss_mlp": 0.01029521, "balance_loss_clip": 1.01720643, "balance_loss_mlp": 1.0410341, "epoch": 0.5073200060123253, "flos": 22566301799040.0, "grad_norm": 1.3831767254195246, "language_loss": 0.80506331, "learning_rate": 1.9542874794048103e-06, "loss": 0.82657146, "num_input_tokens_seen": 181413890, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71484375, "step": 8438, "time_per_iteration": 2.645258665084839 }, { "auxiliary_loss_clip": 0.01143263, "auxiliary_loss_mlp": 0.01034325, "balance_loss_clip": 1.01970387, "balance_loss_mlp": 1.04129887, "epoch": 0.5073801292649932, "flos": 25806341324160.0, "grad_norm": 1.6051545555631301, "language_loss": 0.796049, "learning_rate": 1.9539097565969023e-06, "loss": 0.81782496, "num_input_tokens_seen": 181433240, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75, "step": 8439, "time_per_iteration": 2.6323678493499756 }, { "auxiliary_loss_clip": 0.01130853, "auxiliary_loss_mlp": 0.01278972, "balance_loss_clip": 1.02025568, "balance_loss_mlp": 1.04094887, "epoch": 0.5074402525176612, "flos": 25373941401600.0, "grad_norm": 1.614360589966622, "language_loss": 0.70747066, "learning_rate": 1.9535320354338366e-06, "loss": 0.73156887, "num_input_tokens_seen": 181453535, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.71875, "step": 8440, "time_per_iteration": 2.6565732955932617 }, { "auxiliary_loss_clip": 0.01135251, "auxiliary_loss_mlp": 0.01034051, "balance_loss_clip": 1.02137351, "balance_loss_mlp": 1.04203606, "epoch": 0.5075003757703291, "flos": 26064431521920.0, "grad_norm": 1.7303839894992572, "language_loss": 0.71167576, "learning_rate": 1.9531543159290933e-06, "loss": 0.73336875, "num_input_tokens_seen": 181474195, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.75, "step": 8441, "time_per_iteration": 2.6101629734039307 }, { "auxiliary_loss_clip": 0.01127296, "auxiliary_loss_mlp": 0.01286134, "balance_loss_clip": 1.02532828, "balance_loss_mlp": 1.04165077, "epoch": 0.5075604990229972, "flos": 21835447770240.0, "grad_norm": 1.6218019587614454, "language_loss": 0.63804156, "learning_rate": 1.9527765980961516e-06, "loss": 0.66217589, "num_input_tokens_seen": 181494000, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 8442, "time_per_iteration": 2.6198012828826904 }, { "auxiliary_loss_clip": 0.01147301, "auxiliary_loss_mlp": 0.01028954, "balance_loss_clip": 1.01708639, "balance_loss_mlp": 1.04048324, "epoch": 0.5076206222756651, "flos": 31978703537280.0, "grad_norm": 1.4406386456137221, "language_loss": 0.71197414, "learning_rate": 1.952398881948491e-06, "loss": 0.73373675, "num_input_tokens_seen": 181515955, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.7109375, "step": 8443, "time_per_iteration": 2.659928560256958 }, { "auxiliary_loss_clip": 0.01171101, "auxiliary_loss_mlp": 0.0103227, "balance_loss_clip": 1.01878119, "balance_loss_mlp": 1.04080248, "epoch": 0.5076807455283331, "flos": 36904031326080.0, "grad_norm": 1.3364729690189288, "language_loss": 0.62133205, "learning_rate": 1.9520211674995927e-06, "loss": 0.64336568, "num_input_tokens_seen": 181540225, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76171875, "step": 8444, "time_per_iteration": 2.796433925628662 }, { "auxiliary_loss_clip": 0.01138784, "auxiliary_loss_mlp": 0.01032313, "balance_loss_clip": 1.01974845, "balance_loss_mlp": 1.0395298, "epoch": 0.507740868781001, "flos": 29862415981440.0, "grad_norm": 1.6173438290235482, "language_loss": 0.63848472, "learning_rate": 1.951643454762935e-06, "loss": 0.66019571, "num_input_tokens_seen": 181560125, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73046875, "step": 8445, "time_per_iteration": 2.6496996879577637 }, { "auxiliary_loss_clip": 0.01066953, "auxiliary_loss_mlp": 0.01003691, "balance_loss_clip": 1.00215292, "balance_loss_mlp": 1.01603138, "epoch": 0.507800992033669, "flos": 61918974247680.0, "grad_norm": 0.8323605061022207, "language_loss": 0.61892653, "learning_rate": 1.9512657437519986e-06, "loss": 0.63963294, "num_input_tokens_seen": 181618830, "router_z_loss_clip": 0.01531982, "router_z_loss_mlp": 0.24023438, "step": 8446, "time_per_iteration": 3.2345681190490723 }, { "auxiliary_loss_clip": 0.01136176, "auxiliary_loss_mlp": 0.01026136, "balance_loss_clip": 1.01362455, "balance_loss_mlp": 1.03790879, "epoch": 0.507861115286337, "flos": 20700490757760.0, "grad_norm": 1.8932993330278387, "language_loss": 0.80345142, "learning_rate": 1.9508880344802616e-06, "loss": 0.82507443, "num_input_tokens_seen": 181637120, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 8447, "time_per_iteration": 4.057762145996094 }, { "auxiliary_loss_clip": 0.01122884, "auxiliary_loss_mlp": 0.01030246, "balance_loss_clip": 1.01779437, "balance_loss_mlp": 1.04270768, "epoch": 0.507921238539005, "flos": 30847050576000.0, "grad_norm": 1.403268318021119, "language_loss": 0.70278341, "learning_rate": 1.950510326961205e-06, "loss": 0.72431469, "num_input_tokens_seen": 181659965, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 8448, "time_per_iteration": 2.6727304458618164 }, { "auxiliary_loss_clip": 0.01112804, "auxiliary_loss_mlp": 0.0127796, "balance_loss_clip": 1.01938701, "balance_loss_mlp": 1.04150569, "epoch": 0.507981361791673, "flos": 35700197984640.0, "grad_norm": 1.3715520810589288, "language_loss": 0.71685541, "learning_rate": 1.9501326212083077e-06, "loss": 0.74076307, "num_input_tokens_seen": 181685290, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 8449, "time_per_iteration": 2.7118072509765625 }, { "auxiliary_loss_clip": 0.01127724, "auxiliary_loss_mlp": 0.01029127, "balance_loss_clip": 1.01699805, "balance_loss_mlp": 1.03827524, "epoch": 0.5080414850443409, "flos": 27161466750720.0, "grad_norm": 1.4991340066373122, "language_loss": 0.72797751, "learning_rate": 1.949754917235048e-06, "loss": 0.74954599, "num_input_tokens_seen": 181706080, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71875, "step": 8450, "time_per_iteration": 2.634568929672241 }, { "auxiliary_loss_clip": 0.01116926, "auxiliary_loss_mlp": 0.01031394, "balance_loss_clip": 1.01875198, "balance_loss_mlp": 1.04267383, "epoch": 0.5081016082970089, "flos": 27085192220160.0, "grad_norm": 1.6338299213164298, "language_loss": 0.77177578, "learning_rate": 1.9493772150549068e-06, "loss": 0.79325902, "num_input_tokens_seen": 181724805, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7421875, "step": 8451, "time_per_iteration": 4.012899875640869 }, { "auxiliary_loss_clip": 0.01114307, "auxiliary_loss_mlp": 0.01036755, "balance_loss_clip": 1.02427959, "balance_loss_mlp": 1.04048991, "epoch": 0.5081617315496768, "flos": 22856531690880.0, "grad_norm": 1.7360892205297707, "language_loss": 0.84590209, "learning_rate": 1.9489995146813622e-06, "loss": 0.86741269, "num_input_tokens_seen": 181743725, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73828125, "step": 8452, "time_per_iteration": 2.627671718597412 }, { "auxiliary_loss_clip": 0.0112229, "auxiliary_loss_mlp": 0.01033815, "balance_loss_clip": 1.01986778, "balance_loss_mlp": 1.04422665, "epoch": 0.5082218548023448, "flos": 16281898087680.0, "grad_norm": 2.714982158973496, "language_loss": 0.77564347, "learning_rate": 1.948621816127894e-06, "loss": 0.79720449, "num_input_tokens_seen": 181757720, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 8453, "time_per_iteration": 2.5406742095947266 }, { "auxiliary_loss_clip": 0.0113668, "auxiliary_loss_mlp": 0.01030091, "balance_loss_clip": 1.01783574, "balance_loss_mlp": 1.04032302, "epoch": 0.5082819780550127, "flos": 24460768915200.0, "grad_norm": 1.6317095363595109, "language_loss": 0.76388675, "learning_rate": 1.948244119407981e-06, "loss": 0.78555447, "num_input_tokens_seen": 181778545, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 8454, "time_per_iteration": 2.6583096981048584 }, { "auxiliary_loss_clip": 0.01133954, "auxiliary_loss_mlp": 0.01033667, "balance_loss_clip": 1.02005315, "balance_loss_mlp": 1.04168797, "epoch": 0.5083421013076808, "flos": 23403271582080.0, "grad_norm": 1.4744113879502063, "language_loss": 0.89145494, "learning_rate": 1.947866424535102e-06, "loss": 0.91313112, "num_input_tokens_seen": 181799495, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 8455, "time_per_iteration": 2.5854146480560303 }, { "auxiliary_loss_clip": 0.01139675, "auxiliary_loss_mlp": 0.01283563, "balance_loss_clip": 1.02167273, "balance_loss_mlp": 1.04324162, "epoch": 0.5084022245603487, "flos": 23872695448320.0, "grad_norm": 1.635308746879706, "language_loss": 0.62469792, "learning_rate": 1.947488731522737e-06, "loss": 0.64893031, "num_input_tokens_seen": 181818400, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78515625, "step": 8456, "time_per_iteration": 2.6231865882873535 }, { "auxiliary_loss_clip": 0.01049862, "auxiliary_loss_mlp": 0.01000043, "balance_loss_clip": 0.99841017, "balance_loss_mlp": 1.01746011, "epoch": 0.5084623478130167, "flos": 62873336655360.0, "grad_norm": 0.8099793117182548, "language_loss": 0.62425953, "learning_rate": 1.947111040384363e-06, "loss": 0.64475858, "num_input_tokens_seen": 181875975, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.23828125, "step": 8457, "time_per_iteration": 4.5729804039001465 }, { "auxiliary_loss_clip": 0.01124975, "auxiliary_loss_mlp": 0.01033824, "balance_loss_clip": 1.02012062, "balance_loss_mlp": 1.04131055, "epoch": 0.5085224710656846, "flos": 22346133384960.0, "grad_norm": 1.5541139116327967, "language_loss": 0.67137432, "learning_rate": 1.9467333511334605e-06, "loss": 0.69296229, "num_input_tokens_seen": 181896450, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 8458, "time_per_iteration": 4.018720626831055 }, { "auxiliary_loss_clip": 0.01124924, "auxiliary_loss_mlp": 0.01032391, "balance_loss_clip": 1.01863456, "balance_loss_mlp": 1.04051423, "epoch": 0.5085825943183526, "flos": 26066263115520.0, "grad_norm": 1.6595171280118988, "language_loss": 0.77953094, "learning_rate": 1.946355663783508e-06, "loss": 0.80110407, "num_input_tokens_seen": 181916770, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75390625, "step": 8459, "time_per_iteration": 2.599719762802124 }, { "auxiliary_loss_clip": 0.01122831, "auxiliary_loss_mlp": 0.01038185, "balance_loss_clip": 1.02435064, "balance_loss_mlp": 1.04545045, "epoch": 0.5086427175710206, "flos": 17420733768960.0, "grad_norm": 1.9611154964727822, "language_loss": 0.80712819, "learning_rate": 1.945977978347983e-06, "loss": 0.82873833, "num_input_tokens_seen": 181932710, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 8460, "time_per_iteration": 2.5658583641052246 }, { "auxiliary_loss_clip": 0.0113458, "auxiliary_loss_mlp": 0.01037976, "balance_loss_clip": 1.02496409, "balance_loss_mlp": 1.04138446, "epoch": 0.5087028408236886, "flos": 20631758083200.0, "grad_norm": 3.247485440675607, "language_loss": 0.68652099, "learning_rate": 1.9456002948403656e-06, "loss": 0.70824653, "num_input_tokens_seen": 181950665, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.75390625, "step": 8461, "time_per_iteration": 2.5650339126586914 }, { "auxiliary_loss_clip": 0.01112706, "auxiliary_loss_mlp": 0.01032233, "balance_loss_clip": 1.01903009, "balance_loss_mlp": 1.04036474, "epoch": 0.5087629640763566, "flos": 25593822506880.0, "grad_norm": 1.5699569614690163, "language_loss": 0.75852966, "learning_rate": 1.945222613274133e-06, "loss": 0.77997905, "num_input_tokens_seen": 181971270, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 8462, "time_per_iteration": 2.6083877086639404 }, { "auxiliary_loss_clip": 0.01143585, "auxiliary_loss_mlp": 0.01033405, "balance_loss_clip": 1.02053046, "balance_loss_mlp": 1.04348576, "epoch": 0.5088230873290245, "flos": 13261631927040.0, "grad_norm": 2.2421145830757068, "language_loss": 0.81306994, "learning_rate": 1.9448449336627654e-06, "loss": 0.83483982, "num_input_tokens_seen": 181988410, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 8463, "time_per_iteration": 2.531317710876465 }, { "auxiliary_loss_clip": 0.01125517, "auxiliary_loss_mlp": 0.01039766, "balance_loss_clip": 1.02698076, "balance_loss_mlp": 1.04335701, "epoch": 0.5088832105816925, "flos": 20043469134720.0, "grad_norm": 1.9604349874793243, "language_loss": 0.76170069, "learning_rate": 1.94446725601974e-06, "loss": 0.78335351, "num_input_tokens_seen": 182006530, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73046875, "step": 8464, "time_per_iteration": 2.576754331588745 }, { "auxiliary_loss_clip": 0.01048008, "auxiliary_loss_mlp": 0.0100733, "balance_loss_clip": 1.00595331, "balance_loss_mlp": 1.01604509, "epoch": 0.5089433338343604, "flos": 67422179018880.0, "grad_norm": 0.6853161680019016, "language_loss": 0.59403354, "learning_rate": 1.9440895803585347e-06, "loss": 0.61458695, "num_input_tokens_seen": 182074240, "router_z_loss_clip": 0.01379395, "router_z_loss_mlp": 0.234375, "step": 8465, "time_per_iteration": 3.2582573890686035 }, { "auxiliary_loss_clip": 0.01114553, "auxiliary_loss_mlp": 0.01032268, "balance_loss_clip": 1.0192678, "balance_loss_mlp": 1.04137039, "epoch": 0.5090034570870284, "flos": 22710339336960.0, "grad_norm": 1.9485261210777085, "language_loss": 0.79825521, "learning_rate": 1.9437119066926293e-06, "loss": 0.81972343, "num_input_tokens_seen": 182093360, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 8466, "time_per_iteration": 2.622089147567749 }, { "auxiliary_loss_clip": 0.01121834, "auxiliary_loss_mlp": 0.01028962, "balance_loss_clip": 1.01572394, "balance_loss_mlp": 1.04019499, "epoch": 0.5090635803396963, "flos": 20445812352000.0, "grad_norm": 1.8961774862461054, "language_loss": 0.78476119, "learning_rate": 1.9433342350355007e-06, "loss": 0.80626911, "num_input_tokens_seen": 182110170, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 8467, "time_per_iteration": 2.596635103225708 }, { "auxiliary_loss_clip": 0.01112819, "auxiliary_loss_mlp": 0.0102963, "balance_loss_clip": 1.01752424, "balance_loss_mlp": 1.04227853, "epoch": 0.5091237035923644, "flos": 23768878164480.0, "grad_norm": 1.7010377597396207, "language_loss": 0.74186528, "learning_rate": 1.9429565654006277e-06, "loss": 0.76328975, "num_input_tokens_seen": 182129570, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.70703125, "step": 8468, "time_per_iteration": 2.570209503173828 }, { "auxiliary_loss_clip": 0.01114283, "auxiliary_loss_mlp": 0.01026836, "balance_loss_clip": 1.01418757, "balance_loss_mlp": 1.04134607, "epoch": 0.5091838268450323, "flos": 18327908684160.0, "grad_norm": 1.7471826746038306, "language_loss": 0.77629054, "learning_rate": 1.942578897801488e-06, "loss": 0.79770178, "num_input_tokens_seen": 182147565, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 8469, "time_per_iteration": 2.5071444511413574 }, { "auxiliary_loss_clip": 0.01126447, "auxiliary_loss_mlp": 0.01035767, "balance_loss_clip": 1.02200389, "balance_loss_mlp": 1.04380846, "epoch": 0.5092439500977003, "flos": 29057621806080.0, "grad_norm": 1.7783177722864534, "language_loss": 0.6971674, "learning_rate": 1.94220123225156e-06, "loss": 0.71878958, "num_input_tokens_seen": 182169695, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 8470, "time_per_iteration": 2.6402220726013184 }, { "auxiliary_loss_clip": 0.01135998, "auxiliary_loss_mlp": 0.01033296, "balance_loss_clip": 1.01948595, "balance_loss_mlp": 1.04221821, "epoch": 0.5093040733503682, "flos": 13553908894080.0, "grad_norm": 2.1524946728317076, "language_loss": 0.73883557, "learning_rate": 1.9418235687643216e-06, "loss": 0.76052856, "num_input_tokens_seen": 182186385, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 8471, "time_per_iteration": 2.5031421184539795 }, { "auxiliary_loss_clip": 0.0103919, "auxiliary_loss_mlp": 0.01001413, "balance_loss_clip": 0.99992859, "balance_loss_mlp": 1.01573086, "epoch": 0.5093641966030362, "flos": 68906617407360.0, "grad_norm": 0.7554227013860855, "language_loss": 0.58128548, "learning_rate": 1.9414459073532495e-06, "loss": 0.60169148, "num_input_tokens_seen": 182247095, "router_z_loss_clip": 0.01483154, "router_z_loss_mlp": 0.234375, "step": 8472, "time_per_iteration": 3.20649790763855 }, { "auxiliary_loss_clip": 0.01150853, "auxiliary_loss_mlp": 0.01034667, "balance_loss_clip": 1.02250195, "balance_loss_mlp": 1.04170191, "epoch": 0.5094243198557042, "flos": 21580948932480.0, "grad_norm": 1.9193617924477286, "language_loss": 0.68867433, "learning_rate": 1.941068248031823e-06, "loss": 0.71052957, "num_input_tokens_seen": 182266380, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.734375, "step": 8473, "time_per_iteration": 2.6373400688171387 }, { "auxiliary_loss_clip": 0.01133638, "auxiliary_loss_mlp": 0.01031615, "balance_loss_clip": 1.01900291, "balance_loss_mlp": 1.04308546, "epoch": 0.5094844431083722, "flos": 28840721529600.0, "grad_norm": 1.8600808571490424, "language_loss": 0.85121942, "learning_rate": 1.9406905908135187e-06, "loss": 0.87287199, "num_input_tokens_seen": 182284685, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 8474, "time_per_iteration": 2.650411367416382 }, { "auxiliary_loss_clip": 0.01133987, "auxiliary_loss_mlp": 0.01035459, "balance_loss_clip": 1.02263224, "balance_loss_mlp": 1.04147315, "epoch": 0.5095445663610402, "flos": 14976114969600.0, "grad_norm": 1.9391251563302319, "language_loss": 0.64940798, "learning_rate": 1.940312935711815e-06, "loss": 0.67110246, "num_input_tokens_seen": 182301810, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 8475, "time_per_iteration": 2.6074607372283936 }, { "auxiliary_loss_clip": 0.01138267, "auxiliary_loss_mlp": 0.01038207, "balance_loss_clip": 1.02418804, "balance_loss_mlp": 1.04260981, "epoch": 0.5096046896137081, "flos": 20777088510720.0, "grad_norm": 2.492129859157277, "language_loss": 0.81752264, "learning_rate": 1.939935282740189e-06, "loss": 0.8392874, "num_input_tokens_seen": 182320285, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 8476, "time_per_iteration": 2.6292073726654053 }, { "auxiliary_loss_clip": 0.01155191, "auxiliary_loss_mlp": 0.01037836, "balance_loss_clip": 1.02162385, "balance_loss_mlp": 1.04235327, "epoch": 0.5096648128663761, "flos": 23185078416000.0, "grad_norm": 2.375217586236723, "language_loss": 0.80575061, "learning_rate": 1.939557631912118e-06, "loss": 0.82768095, "num_input_tokens_seen": 182339465, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.765625, "step": 8477, "time_per_iteration": 2.6746692657470703 }, { "auxiliary_loss_clip": 0.01135831, "auxiliary_loss_mlp": 0.01030802, "balance_loss_clip": 1.01592469, "balance_loss_mlp": 1.04174089, "epoch": 0.509724936119044, "flos": 22309432663680.0, "grad_norm": 2.1241199053221096, "language_loss": 0.6155107, "learning_rate": 1.9391799832410803e-06, "loss": 0.63717699, "num_input_tokens_seen": 182358375, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76171875, "step": 8478, "time_per_iteration": 2.66074538230896 }, { "auxiliary_loss_clip": 0.01123736, "auxiliary_loss_mlp": 0.01039277, "balance_loss_clip": 1.02647984, "balance_loss_mlp": 1.04195213, "epoch": 0.509785059371712, "flos": 26287077974400.0, "grad_norm": 1.981798936638523, "language_loss": 0.65454435, "learning_rate": 1.9388023367405516e-06, "loss": 0.67617452, "num_input_tokens_seen": 182377935, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73046875, "step": 8479, "time_per_iteration": 2.6366097927093506 }, { "auxiliary_loss_clip": 0.0112438, "auxiliary_loss_mlp": 0.01031258, "balance_loss_clip": 1.01819253, "balance_loss_mlp": 1.04268622, "epoch": 0.50984518262438, "flos": 22964586779520.0, "grad_norm": 1.6032680618431612, "language_loss": 0.69877678, "learning_rate": 1.938424692424011e-06, "loss": 0.7203331, "num_input_tokens_seen": 182396440, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 8480, "time_per_iteration": 2.557314157485962 }, { "auxiliary_loss_clip": 0.0111779, "auxiliary_loss_mlp": 0.01033004, "balance_loss_clip": 1.01977181, "balance_loss_mlp": 1.04097509, "epoch": 0.509905305877048, "flos": 26213389223040.0, "grad_norm": 1.7193876458412052, "language_loss": 0.79338372, "learning_rate": 1.938047050304934e-06, "loss": 0.8148917, "num_input_tokens_seen": 182415890, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.76953125, "step": 8481, "time_per_iteration": 2.678220748901367 }, { "auxiliary_loss_clip": 0.01119806, "auxiliary_loss_mlp": 0.01036131, "balance_loss_clip": 1.02277374, "balance_loss_mlp": 1.03946209, "epoch": 0.5099654291297159, "flos": 20340055733760.0, "grad_norm": 1.5074016074214058, "language_loss": 0.8333503, "learning_rate": 1.937669410396799e-06, "loss": 0.85490966, "num_input_tokens_seen": 182434235, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 8482, "time_per_iteration": 2.572765588760376 }, { "auxiliary_loss_clip": 0.01128517, "auxiliary_loss_mlp": 0.01280751, "balance_loss_clip": 1.0195967, "balance_loss_mlp": 1.04176986, "epoch": 0.5100255523823839, "flos": 29054820545280.0, "grad_norm": 1.6253327915418572, "language_loss": 0.85390562, "learning_rate": 1.937291772713082e-06, "loss": 0.87799835, "num_input_tokens_seen": 182454360, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.77734375, "step": 8483, "time_per_iteration": 2.8118138313293457 }, { "auxiliary_loss_clip": 0.0112317, "auxiliary_loss_mlp": 0.0103226, "balance_loss_clip": 1.01945102, "balance_loss_mlp": 1.03897023, "epoch": 0.5100856756350518, "flos": 22455912326400.0, "grad_norm": 1.8020317920656157, "language_loss": 0.8300637, "learning_rate": 1.93691413726726e-06, "loss": 0.85161799, "num_input_tokens_seen": 182471940, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.75390625, "step": 8484, "time_per_iteration": 2.554043769836426 }, { "auxiliary_loss_clip": 0.01134354, "auxiliary_loss_mlp": 0.01028691, "balance_loss_clip": 1.01489902, "balance_loss_mlp": 1.0417738, "epoch": 0.5101457988877198, "flos": 19171055606400.0, "grad_norm": 2.4047571556959695, "language_loss": 0.8162787, "learning_rate": 1.936536504072811e-06, "loss": 0.83790916, "num_input_tokens_seen": 182490685, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 8485, "time_per_iteration": 2.602032423019409 }, { "auxiliary_loss_clip": 0.01128209, "auxiliary_loss_mlp": 0.01030922, "balance_loss_clip": 1.0181073, "balance_loss_mlp": 1.03929567, "epoch": 0.5102059221403878, "flos": 14866371941760.0, "grad_norm": 1.8254037044115476, "language_loss": 0.74289083, "learning_rate": 1.9361588731432112e-06, "loss": 0.76448214, "num_input_tokens_seen": 182508325, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 8486, "time_per_iteration": 2.537604331970215 }, { "auxiliary_loss_clip": 0.01146888, "auxiliary_loss_mlp": 0.01031761, "balance_loss_clip": 1.01770651, "balance_loss_mlp": 1.04221177, "epoch": 0.5102660453930558, "flos": 22961103160320.0, "grad_norm": 1.6176622695573368, "language_loss": 0.70049423, "learning_rate": 1.9357812444919363e-06, "loss": 0.72228068, "num_input_tokens_seen": 182527020, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 8487, "time_per_iteration": 2.6499862670898438 }, { "auxiliary_loss_clip": 0.01131879, "auxiliary_loss_mlp": 0.01035594, "balance_loss_clip": 1.02127075, "balance_loss_mlp": 1.04134715, "epoch": 0.5103261686457238, "flos": 23149311448320.0, "grad_norm": 6.480740577060409, "language_loss": 0.72885799, "learning_rate": 1.9354036181324636e-06, "loss": 0.75053275, "num_input_tokens_seen": 182543505, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.73046875, "step": 8488, "time_per_iteration": 4.01157808303833 }, { "auxiliary_loss_clip": 0.01143579, "auxiliary_loss_mlp": 0.01279727, "balance_loss_clip": 1.01900244, "balance_loss_mlp": 1.04100931, "epoch": 0.5103862918983917, "flos": 14319237000960.0, "grad_norm": 2.3068628577490133, "language_loss": 0.69365811, "learning_rate": 1.9350259940782694e-06, "loss": 0.71789116, "num_input_tokens_seen": 182562250, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 8489, "time_per_iteration": 2.611008405685425 }, { "auxiliary_loss_clip": 0.01133021, "auxiliary_loss_mlp": 0.01034696, "balance_loss_clip": 1.02156544, "balance_loss_mlp": 1.03998482, "epoch": 0.5104464151510597, "flos": 25848536826240.0, "grad_norm": 1.6811769407865957, "language_loss": 0.72756696, "learning_rate": 1.934648372342831e-06, "loss": 0.74924409, "num_input_tokens_seen": 182581910, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.74609375, "step": 8490, "time_per_iteration": 2.6193175315856934 }, { "auxiliary_loss_clip": 0.01130654, "auxiliary_loss_mlp": 0.01035032, "balance_loss_clip": 1.02266431, "balance_loss_mlp": 1.0407486, "epoch": 0.5105065384037276, "flos": 21652913831040.0, "grad_norm": 3.42271665072438, "language_loss": 0.80515862, "learning_rate": 1.934270752939623e-06, "loss": 0.82681549, "num_input_tokens_seen": 182601350, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 8491, "time_per_iteration": 2.5453484058380127 }, { "auxiliary_loss_clip": 0.01119367, "auxiliary_loss_mlp": 0.01033114, "balance_loss_clip": 1.02019191, "balance_loss_mlp": 1.03920114, "epoch": 0.5105666616563956, "flos": 22491571553280.0, "grad_norm": 1.8425936443029218, "language_loss": 0.78750086, "learning_rate": 1.933893135882124e-06, "loss": 0.80902565, "num_input_tokens_seen": 182619660, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 8492, "time_per_iteration": 2.6493279933929443 }, { "auxiliary_loss_clip": 0.01147093, "auxiliary_loss_mlp": 0.0104394, "balance_loss_clip": 1.03033268, "balance_loss_mlp": 1.04389715, "epoch": 0.5106267849090635, "flos": 22455768672000.0, "grad_norm": 16.636244852886367, "language_loss": 0.77778792, "learning_rate": 1.9335155211838083e-06, "loss": 0.79969823, "num_input_tokens_seen": 182639815, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76953125, "step": 8493, "time_per_iteration": 4.021355867385864 }, { "auxiliary_loss_clip": 0.01143933, "auxiliary_loss_mlp": 0.01035917, "balance_loss_clip": 1.02108717, "balance_loss_mlp": 1.04161751, "epoch": 0.5106869081617316, "flos": 23547093638400.0, "grad_norm": 1.7605721274940962, "language_loss": 0.83471501, "learning_rate": 1.9331379088581524e-06, "loss": 0.85651356, "num_input_tokens_seen": 182659655, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.75, "step": 8494, "time_per_iteration": 2.7008166313171387 }, { "auxiliary_loss_clip": 0.01120503, "auxiliary_loss_mlp": 0.01038291, "balance_loss_clip": 1.02566123, "balance_loss_mlp": 1.04391396, "epoch": 0.5107470314143995, "flos": 26792987080320.0, "grad_norm": 4.179748015451599, "language_loss": 0.78786564, "learning_rate": 1.932760298918633e-06, "loss": 0.80945361, "num_input_tokens_seen": 182677075, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.765625, "step": 8495, "time_per_iteration": 2.6565299034118652 }, { "auxiliary_loss_clip": 0.01141791, "auxiliary_loss_mlp": 0.01036528, "balance_loss_clip": 1.0227412, "balance_loss_mlp": 1.04060316, "epoch": 0.5108071546670675, "flos": 25739691638400.0, "grad_norm": 1.447529451420221, "language_loss": 0.78204334, "learning_rate": 1.9323826913787253e-06, "loss": 0.80382657, "num_input_tokens_seen": 182699625, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 8496, "time_per_iteration": 2.6296610832214355 }, { "auxiliary_loss_clip": 0.01120618, "auxiliary_loss_mlp": 0.01031642, "balance_loss_clip": 1.0190655, "balance_loss_mlp": 1.0392797, "epoch": 0.5108672779197354, "flos": 18697537589760.0, "grad_norm": 1.7366891080864821, "language_loss": 0.78308994, "learning_rate": 1.9320050862519054e-06, "loss": 0.80461252, "num_input_tokens_seen": 182717020, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 8497, "time_per_iteration": 2.6666250228881836 }, { "auxiliary_loss_clip": 0.0111344, "auxiliary_loss_mlp": 0.01038003, "balance_loss_clip": 1.02444839, "balance_loss_mlp": 1.03991866, "epoch": 0.5109274011724034, "flos": 26688164215680.0, "grad_norm": 1.593223464260621, "language_loss": 0.8189168, "learning_rate": 1.9316274835516494e-06, "loss": 0.84043121, "num_input_tokens_seen": 182736955, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 8498, "time_per_iteration": 2.548983573913574 }, { "auxiliary_loss_clip": 0.01131609, "auxiliary_loss_mlp": 0.01279449, "balance_loss_clip": 1.01890409, "balance_loss_mlp": 1.04054856, "epoch": 0.5109875244250714, "flos": 22784028088320.0, "grad_norm": 1.973451846896082, "language_loss": 0.70492685, "learning_rate": 1.9312498832914323e-06, "loss": 0.7290374, "num_input_tokens_seen": 182757620, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 8499, "time_per_iteration": 4.080793380737305 }, { "auxiliary_loss_clip": 0.01150643, "auxiliary_loss_mlp": 0.01036924, "balance_loss_clip": 1.02346492, "balance_loss_mlp": 1.04025042, "epoch": 0.5110476476777394, "flos": 35588515622400.0, "grad_norm": 1.6780051994524354, "language_loss": 0.72046971, "learning_rate": 1.9308722854847304e-06, "loss": 0.74234545, "num_input_tokens_seen": 182780195, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 8500, "time_per_iteration": 4.171503782272339 }, { "auxiliary_loss_clip": 0.0112868, "auxiliary_loss_mlp": 0.0103922, "balance_loss_clip": 1.02415228, "balance_loss_mlp": 1.03995919, "epoch": 0.5111077709304074, "flos": 19280798634240.0, "grad_norm": 2.8064180011257256, "language_loss": 0.62811965, "learning_rate": 1.930494690145019e-06, "loss": 0.64979863, "num_input_tokens_seen": 182795765, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 8501, "time_per_iteration": 2.511492967605591 }, { "auxiliary_loss_clip": 0.01114907, "auxiliary_loss_mlp": 0.01042708, "balance_loss_clip": 1.02890348, "balance_loss_mlp": 1.03790534, "epoch": 0.5111678941830753, "flos": 20668207409280.0, "grad_norm": 3.425962533645235, "language_loss": 0.87805402, "learning_rate": 1.930117097285773e-06, "loss": 0.89963019, "num_input_tokens_seen": 182813120, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76953125, "step": 8502, "time_per_iteration": 2.6003785133361816 }, { "auxiliary_loss_clip": 0.01120931, "auxiliary_loss_mlp": 0.01037507, "balance_loss_clip": 1.02479351, "balance_loss_mlp": 1.03743267, "epoch": 0.5112280174357433, "flos": 26287903987200.0, "grad_norm": 1.7855406045808515, "language_loss": 0.82257301, "learning_rate": 1.929739506920468e-06, "loss": 0.8441574, "num_input_tokens_seen": 182835745, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.74609375, "step": 8503, "time_per_iteration": 2.5967190265655518 }, { "auxiliary_loss_clip": 0.01056422, "auxiliary_loss_mlp": 0.01001494, "balance_loss_clip": 1.00000381, "balance_loss_mlp": 1.01512372, "epoch": 0.5112881406884112, "flos": 59474247707520.0, "grad_norm": 0.8536097993888073, "language_loss": 0.63882554, "learning_rate": 1.9293619190625785e-06, "loss": 0.6594047, "num_input_tokens_seen": 182892540, "router_z_loss_clip": 0.01489258, "router_z_loss_mlp": 0.23925781, "step": 8504, "time_per_iteration": 3.226865530014038 }, { "auxiliary_loss_clip": 0.01142399, "auxiliary_loss_mlp": 0.01281516, "balance_loss_clip": 1.02022123, "balance_loss_mlp": 1.04182851, "epoch": 0.5113482639410792, "flos": 26468857728000.0, "grad_norm": 1.825170933040037, "language_loss": 0.8421163, "learning_rate": 1.9289843337255814e-06, "loss": 0.86635548, "num_input_tokens_seen": 182911515, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.73828125, "step": 8505, "time_per_iteration": 2.7761106491088867 }, { "auxiliary_loss_clip": 0.01122599, "auxiliary_loss_mlp": 0.0103105, "balance_loss_clip": 1.01758552, "balance_loss_mlp": 1.04121494, "epoch": 0.5114083871937471, "flos": 29895848565120.0, "grad_norm": 1.9410584588210826, "language_loss": 0.75129527, "learning_rate": 1.92860675092295e-06, "loss": 0.77283174, "num_input_tokens_seen": 182930860, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 8506, "time_per_iteration": 2.6802730560302734 }, { "auxiliary_loss_clip": 0.01115513, "auxiliary_loss_mlp": 0.01034849, "balance_loss_clip": 1.01985204, "balance_loss_mlp": 1.04002547, "epoch": 0.5114685104464152, "flos": 24314576561280.0, "grad_norm": 1.7213460740994753, "language_loss": 0.58044302, "learning_rate": 1.928229170668161e-06, "loss": 0.60194665, "num_input_tokens_seen": 182949960, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.75390625, "step": 8507, "time_per_iteration": 2.6721513271331787 }, { "auxiliary_loss_clip": 0.01128375, "auxiliary_loss_mlp": 0.01042735, "balance_loss_clip": 1.02823329, "balance_loss_mlp": 1.04334927, "epoch": 0.5115286336990831, "flos": 17019288391680.0, "grad_norm": 2.056065851051335, "language_loss": 0.85452032, "learning_rate": 1.9278515929746875e-06, "loss": 0.87623149, "num_input_tokens_seen": 182968085, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.76171875, "step": 8508, "time_per_iteration": 2.5817935466766357 }, { "auxiliary_loss_clip": 0.01143292, "auxiliary_loss_mlp": 0.01287547, "balance_loss_clip": 1.02556372, "balance_loss_mlp": 1.04068279, "epoch": 0.5115887569517511, "flos": 23659386531840.0, "grad_norm": 1.7677841860872563, "language_loss": 0.72165322, "learning_rate": 1.9274740178560054e-06, "loss": 0.74596161, "num_input_tokens_seen": 182987275, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.76171875, "step": 8509, "time_per_iteration": 2.657230854034424 }, { "auxiliary_loss_clip": 0.01112557, "auxiliary_loss_mlp": 0.01281599, "balance_loss_clip": 1.02173293, "balance_loss_mlp": 1.03961062, "epoch": 0.511648880204419, "flos": 16107193313280.0, "grad_norm": 1.793106324197087, "language_loss": 0.76256168, "learning_rate": 1.9270964453255887e-06, "loss": 0.78650326, "num_input_tokens_seen": 183004700, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 8510, "time_per_iteration": 2.502779960632324 }, { "auxiliary_loss_clip": 0.01112154, "auxiliary_loss_mlp": 0.01036296, "balance_loss_clip": 1.02352905, "balance_loss_mlp": 1.03980327, "epoch": 0.511709003457087, "flos": 32634970974720.0, "grad_norm": 1.7493782575530203, "language_loss": 0.71103203, "learning_rate": 1.9267188753969125e-06, "loss": 0.73251653, "num_input_tokens_seen": 183025830, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 8511, "time_per_iteration": 2.62559175491333 }, { "auxiliary_loss_clip": 0.0113167, "auxiliary_loss_mlp": 0.01029636, "balance_loss_clip": 1.0170058, "balance_loss_mlp": 1.04209495, "epoch": 0.511769126709755, "flos": 21762082241280.0, "grad_norm": 2.2291062417198595, "language_loss": 0.66778338, "learning_rate": 1.9263413080834514e-06, "loss": 0.68939638, "num_input_tokens_seen": 183045140, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 8512, "time_per_iteration": 2.518279552459717 }, { "auxiliary_loss_clip": 0.01160661, "auxiliary_loss_mlp": 0.01037927, "balance_loss_clip": 1.02317512, "balance_loss_mlp": 1.0395329, "epoch": 0.511829249962423, "flos": 23915357827200.0, "grad_norm": 1.7082107228211008, "language_loss": 0.66468042, "learning_rate": 1.925963743398679e-06, "loss": 0.68666625, "num_input_tokens_seen": 183063935, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.765625, "step": 8513, "time_per_iteration": 2.6220040321350098 }, { "auxiliary_loss_clip": 0.01139985, "auxiliary_loss_mlp": 0.01038597, "balance_loss_clip": 1.02404153, "balance_loss_mlp": 1.0403918, "epoch": 0.511889373215091, "flos": 23727005884800.0, "grad_norm": 2.034473719521223, "language_loss": 0.6889897, "learning_rate": 1.9255861813560706e-06, "loss": 0.71077549, "num_input_tokens_seen": 183084135, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7265625, "step": 8514, "time_per_iteration": 2.5951602458953857 }, { "auxiliary_loss_clip": 0.01111367, "auxiliary_loss_mlp": 0.01032046, "balance_loss_clip": 1.01914799, "balance_loss_mlp": 1.03929853, "epoch": 0.5119494964677589, "flos": 28111519526400.0, "grad_norm": 1.6279059474054058, "language_loss": 0.65974915, "learning_rate": 1.9252086219691e-06, "loss": 0.68118322, "num_input_tokens_seen": 183104570, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 8515, "time_per_iteration": 2.679002285003662 }, { "auxiliary_loss_clip": 0.01121768, "auxiliary_loss_mlp": 0.01032007, "balance_loss_clip": 1.01929355, "balance_loss_mlp": 1.03966284, "epoch": 0.5120096197204269, "flos": 24973214296320.0, "grad_norm": 1.9021795224953295, "language_loss": 0.7531625, "learning_rate": 1.9248310652512415e-06, "loss": 0.77470028, "num_input_tokens_seen": 183123850, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 8516, "time_per_iteration": 2.5525455474853516 }, { "auxiliary_loss_clip": 0.01136913, "auxiliary_loss_mlp": 0.01040003, "balance_loss_clip": 1.02631211, "balance_loss_mlp": 1.04283834, "epoch": 0.5120697429730948, "flos": 17968012364160.0, "grad_norm": 1.9922919259748753, "language_loss": 0.7809189, "learning_rate": 1.924453511215969e-06, "loss": 0.80268806, "num_input_tokens_seen": 183141725, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 8517, "time_per_iteration": 2.609938144683838 }, { "auxiliary_loss_clip": 0.01127056, "auxiliary_loss_mlp": 0.01036025, "balance_loss_clip": 1.02202404, "balance_loss_mlp": 1.04079783, "epoch": 0.5121298662257628, "flos": 23292343405440.0, "grad_norm": 1.7069135722121551, "language_loss": 0.73763835, "learning_rate": 1.9240759598767554e-06, "loss": 0.75926924, "num_input_tokens_seen": 183161300, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 8518, "time_per_iteration": 2.5661394596099854 }, { "auxiliary_loss_clip": 0.01125468, "auxiliary_loss_mlp": 0.01039685, "balance_loss_clip": 1.02598763, "balance_loss_mlp": 1.0416007, "epoch": 0.5121899894784308, "flos": 17311062568320.0, "grad_norm": 1.7369534968281601, "language_loss": 0.78231621, "learning_rate": 1.9236984112470763e-06, "loss": 0.80396771, "num_input_tokens_seen": 183180495, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 8519, "time_per_iteration": 2.565110921859741 }, { "auxiliary_loss_clip": 0.01116079, "auxiliary_loss_mlp": 0.01036226, "balance_loss_clip": 1.02208161, "balance_loss_mlp": 1.04150891, "epoch": 0.5122501127310988, "flos": 24930085040640.0, "grad_norm": 1.8967619720399407, "language_loss": 0.79724783, "learning_rate": 1.923320865340405e-06, "loss": 0.81877089, "num_input_tokens_seen": 183200330, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.74609375, "step": 8520, "time_per_iteration": 2.5956122875213623 }, { "auxiliary_loss_clip": 0.01138699, "auxiliary_loss_mlp": 0.01040005, "balance_loss_clip": 1.02467525, "balance_loss_mlp": 1.04199982, "epoch": 0.5123102359837667, "flos": 18442859184000.0, "grad_norm": 2.4818000629158465, "language_loss": 0.81224644, "learning_rate": 1.9229433221702135e-06, "loss": 0.83403349, "num_input_tokens_seen": 183218230, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.79296875, "step": 8521, "time_per_iteration": 2.6264328956604004 }, { "auxiliary_loss_clip": 0.01120633, "auxiliary_loss_mlp": 0.0103019, "balance_loss_clip": 1.01618278, "balance_loss_mlp": 1.03680778, "epoch": 0.5123703592364347, "flos": 26684860164480.0, "grad_norm": 1.8077923804465812, "language_loss": 0.68243659, "learning_rate": 1.9225657817499773e-06, "loss": 0.70394486, "num_input_tokens_seen": 183236735, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 8522, "time_per_iteration": 2.6362040042877197 }, { "auxiliary_loss_clip": 0.0114811, "auxiliary_loss_mlp": 0.01037862, "balance_loss_clip": 1.02032602, "balance_loss_mlp": 1.04293442, "epoch": 0.5124304824891026, "flos": 28803948981120.0, "grad_norm": 1.6874594440436974, "language_loss": 0.61077362, "learning_rate": 1.922188244093169e-06, "loss": 0.63263339, "num_input_tokens_seen": 183257550, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.7890625, "step": 8523, "time_per_iteration": 2.668853998184204 }, { "auxiliary_loss_clip": 0.011431, "auxiliary_loss_mlp": 0.01037486, "balance_loss_clip": 1.02515364, "balance_loss_mlp": 1.04214847, "epoch": 0.5124906057417706, "flos": 21761830846080.0, "grad_norm": 8.073928977728517, "language_loss": 0.77928972, "learning_rate": 1.9218107092132623e-06, "loss": 0.8010956, "num_input_tokens_seen": 183275515, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7421875, "step": 8524, "time_per_iteration": 2.60373854637146 }, { "auxiliary_loss_clip": 0.01134161, "auxiliary_loss_mlp": 0.0103141, "balance_loss_clip": 1.01832032, "balance_loss_mlp": 1.04286432, "epoch": 0.5125507289944387, "flos": 18880538405760.0, "grad_norm": 1.914980902157273, "language_loss": 0.74963373, "learning_rate": 1.9214331771237307e-06, "loss": 0.77128941, "num_input_tokens_seen": 183293880, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 8525, "time_per_iteration": 2.590778112411499 }, { "auxiliary_loss_clip": 0.01136083, "auxiliary_loss_mlp": 0.01038212, "balance_loss_clip": 1.02369213, "balance_loss_mlp": 1.04145479, "epoch": 0.5126108522471066, "flos": 35627838036480.0, "grad_norm": 2.195717400559122, "language_loss": 0.74266148, "learning_rate": 1.9210556478380458e-06, "loss": 0.76440442, "num_input_tokens_seen": 183315860, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.765625, "step": 8526, "time_per_iteration": 2.7128446102142334 }, { "auxiliary_loss_clip": 0.01125478, "auxiliary_loss_mlp": 0.01039276, "balance_loss_clip": 1.02516127, "balance_loss_mlp": 1.04175532, "epoch": 0.5126709754997746, "flos": 20190918464640.0, "grad_norm": 1.639859199177956, "language_loss": 0.65247405, "learning_rate": 1.9206781213696827e-06, "loss": 0.67412156, "num_input_tokens_seen": 183335480, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 8527, "time_per_iteration": 2.566147565841675 }, { "auxiliary_loss_clip": 0.01131507, "auxiliary_loss_mlp": 0.01037777, "balance_loss_clip": 1.02534986, "balance_loss_mlp": 1.04363203, "epoch": 0.5127310987524425, "flos": 18588548747520.0, "grad_norm": 1.5606866337230292, "language_loss": 0.74620593, "learning_rate": 1.920300597732113e-06, "loss": 0.7678988, "num_input_tokens_seen": 183354395, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 8528, "time_per_iteration": 2.6208527088165283 }, { "auxiliary_loss_clip": 0.01121341, "auxiliary_loss_mlp": 0.01033932, "balance_loss_clip": 1.02003193, "balance_loss_mlp": 1.03895485, "epoch": 0.5127912220051105, "flos": 22454691264000.0, "grad_norm": 1.9177274560349578, "language_loss": 0.82771873, "learning_rate": 1.91992307693881e-06, "loss": 0.84927148, "num_input_tokens_seen": 183372980, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.734375, "step": 8529, "time_per_iteration": 2.5738680362701416 }, { "auxiliary_loss_clip": 0.01145775, "auxiliary_loss_mlp": 0.01034147, "balance_loss_clip": 1.02085543, "balance_loss_mlp": 1.03882933, "epoch": 0.5128513452577784, "flos": 19093703667840.0, "grad_norm": 1.8342694807991713, "language_loss": 0.74050993, "learning_rate": 1.919545559003247e-06, "loss": 0.76230913, "num_input_tokens_seen": 183390160, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 8530, "time_per_iteration": 3.972355604171753 }, { "auxiliary_loss_clip": 0.01063672, "auxiliary_loss_mlp": 0.0100283, "balance_loss_clip": 1.00124478, "balance_loss_mlp": 1.01289153, "epoch": 0.5129114685104464, "flos": 67892285243520.0, "grad_norm": 0.7559578173970273, "language_loss": 0.60804868, "learning_rate": 1.9191680439388954e-06, "loss": 0.62871373, "num_input_tokens_seen": 183455280, "router_z_loss_clip": 0.01586914, "router_z_loss_mlp": 0.24414062, "step": 8531, "time_per_iteration": 3.3129897117614746 }, { "auxiliary_loss_clip": 0.01134938, "auxiliary_loss_mlp": 0.01029694, "balance_loss_clip": 1.0165751, "balance_loss_mlp": 1.04171872, "epoch": 0.5129715917631144, "flos": 20449152316800.0, "grad_norm": 2.1218223128350022, "language_loss": 0.76889956, "learning_rate": 1.9187905317592285e-06, "loss": 0.79054588, "num_input_tokens_seen": 183473955, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 8532, "time_per_iteration": 2.533581256866455 }, { "auxiliary_loss_clip": 0.01130261, "auxiliary_loss_mlp": 0.01034069, "balance_loss_clip": 1.02014518, "balance_loss_mlp": 1.03938651, "epoch": 0.5130317150157824, "flos": 35116146840960.0, "grad_norm": 1.7643850469095512, "language_loss": 0.66867483, "learning_rate": 1.9184130224777183e-06, "loss": 0.69031817, "num_input_tokens_seen": 183497195, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 8533, "time_per_iteration": 2.7274324893951416 }, { "auxiliary_loss_clip": 0.01143017, "auxiliary_loss_mlp": 0.01041037, "balance_loss_clip": 1.02643442, "balance_loss_mlp": 1.040797, "epoch": 0.5130918382684503, "flos": 19791627903360.0, "grad_norm": 1.8705493102936468, "language_loss": 0.82310724, "learning_rate": 1.918035516107838e-06, "loss": 0.8449477, "num_input_tokens_seen": 183513675, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.76171875, "step": 8534, "time_per_iteration": 2.5105063915252686 }, { "auxiliary_loss_clip": 0.01126244, "auxiliary_loss_mlp": 0.01041996, "balance_loss_clip": 1.02711284, "balance_loss_mlp": 1.04148388, "epoch": 0.5131519615211183, "flos": 26323096337280.0, "grad_norm": 1.7243139412158326, "language_loss": 0.63744986, "learning_rate": 1.9176580126630587e-06, "loss": 0.65913224, "num_input_tokens_seen": 183535165, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7578125, "step": 8535, "time_per_iteration": 4.007023572921753 }, { "auxiliary_loss_clip": 0.01125377, "auxiliary_loss_mlp": 0.01032052, "balance_loss_clip": 1.01896238, "balance_loss_mlp": 1.04258275, "epoch": 0.5132120847737862, "flos": 19171917532800.0, "grad_norm": 1.5598806095959792, "language_loss": 0.69603992, "learning_rate": 1.917280512156854e-06, "loss": 0.71761417, "num_input_tokens_seen": 183553780, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 8536, "time_per_iteration": 2.5913970470428467 }, { "auxiliary_loss_clip": 0.01126435, "auxiliary_loss_mlp": 0.01038947, "balance_loss_clip": 1.02462971, "balance_loss_mlp": 1.04070401, "epoch": 0.5132722080264542, "flos": 20230420446720.0, "grad_norm": 2.0798582577016584, "language_loss": 0.71315503, "learning_rate": 1.9169030146026944e-06, "loss": 0.73480886, "num_input_tokens_seen": 183572285, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.76953125, "step": 8537, "time_per_iteration": 2.572932243347168 }, { "auxiliary_loss_clip": 0.01151482, "auxiliary_loss_mlp": 0.01037963, "balance_loss_clip": 1.02371764, "balance_loss_mlp": 1.03932154, "epoch": 0.5133323312791223, "flos": 16469459930880.0, "grad_norm": 4.474931371903545, "language_loss": 0.80258191, "learning_rate": 1.9165255200140523e-06, "loss": 0.82447636, "num_input_tokens_seen": 183589330, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76953125, "step": 8538, "time_per_iteration": 2.5926249027252197 }, { "auxiliary_loss_clip": 0.01122443, "auxiliary_loss_mlp": 0.0103407, "balance_loss_clip": 1.02053416, "balance_loss_mlp": 1.03857255, "epoch": 0.5133924545317902, "flos": 26068094709120.0, "grad_norm": 1.8424424768370649, "language_loss": 0.78622615, "learning_rate": 1.9161480284044e-06, "loss": 0.80779123, "num_input_tokens_seen": 183609205, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 8539, "time_per_iteration": 2.6167781352996826 }, { "auxiliary_loss_clip": 0.01132475, "auxiliary_loss_mlp": 0.0103606, "balance_loss_clip": 1.02226758, "balance_loss_mlp": 1.0394547, "epoch": 0.5134525777844582, "flos": 29131023248640.0, "grad_norm": 1.7005084703140985, "language_loss": 0.76194656, "learning_rate": 1.915770539787209e-06, "loss": 0.78363192, "num_input_tokens_seen": 183629985, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 8540, "time_per_iteration": 2.6755728721618652 }, { "auxiliary_loss_clip": 0.01154668, "auxiliary_loss_mlp": 0.01038956, "balance_loss_clip": 1.0245018, "balance_loss_mlp": 1.04131424, "epoch": 0.5135127010371261, "flos": 17454776883840.0, "grad_norm": 2.671506516164486, "language_loss": 0.74846935, "learning_rate": 1.9153930541759507e-06, "loss": 0.77040565, "num_input_tokens_seen": 183648220, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 8541, "time_per_iteration": 5.620535612106323 }, { "auxiliary_loss_clip": 0.01159696, "auxiliary_loss_mlp": 0.01040395, "balance_loss_clip": 1.02693617, "balance_loss_mlp": 1.03985167, "epoch": 0.5135728242897941, "flos": 21944975316480.0, "grad_norm": 1.8964175937032755, "language_loss": 0.70024073, "learning_rate": 1.9150155715840967e-06, "loss": 0.72224164, "num_input_tokens_seen": 183668230, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 8542, "time_per_iteration": 2.656627893447876 }, { "auxiliary_loss_clip": 0.01132975, "auxiliary_loss_mlp": 0.0103259, "balance_loss_clip": 1.01911318, "balance_loss_mlp": 1.04063153, "epoch": 0.513632947542462, "flos": 22674859678080.0, "grad_norm": 1.897988255933764, "language_loss": 0.79272985, "learning_rate": 1.914638092025118e-06, "loss": 0.81438541, "num_input_tokens_seen": 183687800, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 8543, "time_per_iteration": 2.5984909534454346 }, { "auxiliary_loss_clip": 0.01133716, "auxiliary_loss_mlp": 0.01041945, "balance_loss_clip": 1.02762187, "balance_loss_mlp": 1.04154682, "epoch": 0.51369307079513, "flos": 29457163762560.0, "grad_norm": 1.7993485502407467, "language_loss": 0.68102938, "learning_rate": 1.9142606155124863e-06, "loss": 0.70278597, "num_input_tokens_seen": 183709025, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.74609375, "step": 8544, "time_per_iteration": 2.611215353012085 }, { "auxiliary_loss_clip": 0.01145013, "auxiliary_loss_mlp": 0.01042874, "balance_loss_clip": 1.02917719, "balance_loss_mlp": 1.04115725, "epoch": 0.513753194047798, "flos": 18989347680000.0, "grad_norm": 2.1185563061748502, "language_loss": 0.72201669, "learning_rate": 1.9138831420596727e-06, "loss": 0.74389559, "num_input_tokens_seen": 183725740, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 8545, "time_per_iteration": 2.6143972873687744 }, { "auxiliary_loss_clip": 0.01127036, "auxiliary_loss_mlp": 0.01040636, "balance_loss_clip": 1.02534151, "balance_loss_mlp": 1.04032636, "epoch": 0.513813317300466, "flos": 17821855923840.0, "grad_norm": 2.377469022224117, "language_loss": 0.81941313, "learning_rate": 1.9135056716801487e-06, "loss": 0.84108984, "num_input_tokens_seen": 183743995, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.77734375, "step": 8546, "time_per_iteration": 2.5810790061950684 }, { "auxiliary_loss_clip": 0.01127289, "auxiliary_loss_mlp": 0.01035083, "balance_loss_clip": 1.02105784, "balance_loss_mlp": 1.04212558, "epoch": 0.5138734405531339, "flos": 24061191045120.0, "grad_norm": 2.1354345521154707, "language_loss": 0.73015153, "learning_rate": 1.9131282043873848e-06, "loss": 0.75177521, "num_input_tokens_seen": 183764150, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 8547, "time_per_iteration": 2.6381592750549316 }, { "auxiliary_loss_clip": 0.01123698, "auxiliary_loss_mlp": 0.01044915, "balance_loss_clip": 1.03093779, "balance_loss_mlp": 1.04016781, "epoch": 0.5139335638058019, "flos": 26097253574400.0, "grad_norm": 1.637818556123001, "language_loss": 0.73282373, "learning_rate": 1.912750740194851e-06, "loss": 0.75450981, "num_input_tokens_seen": 183783280, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 8548, "time_per_iteration": 2.6211471557617188 }, { "auxiliary_loss_clip": 0.01142579, "auxiliary_loss_mlp": 0.01034765, "balance_loss_clip": 1.02054977, "balance_loss_mlp": 1.03942204, "epoch": 0.5139936870584698, "flos": 18917095472640.0, "grad_norm": 2.3700544555781, "language_loss": 0.8202728, "learning_rate": 1.9123732791160196e-06, "loss": 0.84204626, "num_input_tokens_seen": 183800725, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76171875, "step": 8549, "time_per_iteration": 2.714864492416382 }, { "auxiliary_loss_clip": 0.01124977, "auxiliary_loss_mlp": 0.01034465, "balance_loss_clip": 1.02131629, "balance_loss_mlp": 1.04319882, "epoch": 0.5140538103111378, "flos": 16144001775360.0, "grad_norm": 1.8861671070249848, "language_loss": 0.72022307, "learning_rate": 1.91199582116436e-06, "loss": 0.74181747, "num_input_tokens_seen": 183818735, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 8550, "time_per_iteration": 2.5643584728240967 }, { "auxiliary_loss_clip": 0.01126416, "auxiliary_loss_mlp": 0.01035037, "balance_loss_clip": 1.02067208, "balance_loss_mlp": 1.03964472, "epoch": 0.5141139335638057, "flos": 22420145358720.0, "grad_norm": 2.6839324026323808, "language_loss": 0.75024992, "learning_rate": 1.9116183663533436e-06, "loss": 0.77186441, "num_input_tokens_seen": 183840015, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.77734375, "step": 8551, "time_per_iteration": 2.6551880836486816 }, { "auxiliary_loss_clip": 0.01131823, "auxiliary_loss_mlp": 0.01031131, "balance_loss_clip": 1.01719534, "balance_loss_mlp": 1.04132295, "epoch": 0.5141740568164738, "flos": 27089645506560.0, "grad_norm": 1.800671593427586, "language_loss": 0.68366289, "learning_rate": 1.9112409146964402e-06, "loss": 0.70529246, "num_input_tokens_seen": 183860145, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.72265625, "step": 8552, "time_per_iteration": 2.656290292739868 }, { "auxiliary_loss_clip": 0.01124766, "auxiliary_loss_mlp": 0.01033366, "balance_loss_clip": 1.01923394, "balance_loss_mlp": 1.04032958, "epoch": 0.5142341800691418, "flos": 24973250209920.0, "grad_norm": 1.8426436311502767, "language_loss": 0.74467623, "learning_rate": 1.9108634662071195e-06, "loss": 0.76625758, "num_input_tokens_seen": 183880540, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 8553, "time_per_iteration": 2.675692081451416 }, { "auxiliary_loss_clip": 0.01115699, "auxiliary_loss_mlp": 0.01037634, "balance_loss_clip": 1.02394271, "balance_loss_mlp": 1.04154897, "epoch": 0.5142943033218097, "flos": 20704513080960.0, "grad_norm": 1.7229046493876374, "language_loss": 0.67933464, "learning_rate": 1.9104860208988534e-06, "loss": 0.70086789, "num_input_tokens_seen": 183900895, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 8554, "time_per_iteration": 2.539245367050171 }, { "auxiliary_loss_clip": 0.01130637, "auxiliary_loss_mlp": 0.01042107, "balance_loss_clip": 1.02551317, "balance_loss_mlp": 1.04258943, "epoch": 0.5143544265744777, "flos": 22925479847040.0, "grad_norm": 2.0039837942519534, "language_loss": 0.73231304, "learning_rate": 1.9101085787851103e-06, "loss": 0.75404048, "num_input_tokens_seen": 183920335, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.7890625, "step": 8555, "time_per_iteration": 2.5930159091949463 }, { "auxiliary_loss_clip": 0.01145337, "auxiliary_loss_mlp": 0.01034769, "balance_loss_clip": 1.02069044, "balance_loss_mlp": 1.04317927, "epoch": 0.5144145498271456, "flos": 15921391236480.0, "grad_norm": 4.609509843426351, "language_loss": 0.75461173, "learning_rate": 1.9097311398793613e-06, "loss": 0.77641284, "num_input_tokens_seen": 183936220, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 8556, "time_per_iteration": 2.6072092056274414 }, { "auxiliary_loss_clip": 0.01139903, "auxiliary_loss_mlp": 0.01033648, "balance_loss_clip": 1.02076745, "balance_loss_mlp": 1.0408268, "epoch": 0.5144746730798136, "flos": 19681238430720.0, "grad_norm": 1.8493449408371643, "language_loss": 0.8634851, "learning_rate": 1.909353704195075e-06, "loss": 0.88522059, "num_input_tokens_seen": 183953250, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 8557, "time_per_iteration": 2.5541129112243652 }, { "auxiliary_loss_clip": 0.01122383, "auxiliary_loss_mlp": 0.01034861, "balance_loss_clip": 1.02228463, "balance_loss_mlp": 1.041255, "epoch": 0.5145347963324816, "flos": 23914711382400.0, "grad_norm": 1.6745448439099755, "language_loss": 0.89099884, "learning_rate": 1.9089762717457226e-06, "loss": 0.91257131, "num_input_tokens_seen": 183973865, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 8558, "time_per_iteration": 2.655407190322876 }, { "auxiliary_loss_clip": 0.01114093, "auxiliary_loss_mlp": 0.01280223, "balance_loss_clip": 1.0202378, "balance_loss_mlp": 1.04241264, "epoch": 0.5145949195851496, "flos": 18260002022400.0, "grad_norm": 1.8993411883008997, "language_loss": 0.65274215, "learning_rate": 1.908598842544773e-06, "loss": 0.67668533, "num_input_tokens_seen": 183992555, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 8559, "time_per_iteration": 2.5266354084014893 }, { "auxiliary_loss_clip": 0.01124656, "auxiliary_loss_mlp": 0.01283066, "balance_loss_clip": 1.02278566, "balance_loss_mlp": 1.04221952, "epoch": 0.5146550428378175, "flos": 26213425136640.0, "grad_norm": 1.7205572489342602, "language_loss": 0.63741738, "learning_rate": 1.908221416605695e-06, "loss": 0.66149461, "num_input_tokens_seen": 184010825, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 8560, "time_per_iteration": 2.5745725631713867 }, { "auxiliary_loss_clip": 0.01113394, "auxiliary_loss_mlp": 0.01033054, "balance_loss_clip": 1.01963675, "balance_loss_mlp": 1.04038453, "epoch": 0.5147151660904855, "flos": 22674177319680.0, "grad_norm": 1.6055255337370824, "language_loss": 0.7004292, "learning_rate": 1.9078439939419595e-06, "loss": 0.72189367, "num_input_tokens_seen": 184030155, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 8561, "time_per_iteration": 2.5887563228607178 }, { "auxiliary_loss_clip": 0.01133802, "auxiliary_loss_mlp": 0.01031004, "balance_loss_clip": 1.01860619, "balance_loss_mlp": 1.0410316, "epoch": 0.5147752893431534, "flos": 24972388283520.0, "grad_norm": 1.5316016664372907, "language_loss": 0.66438276, "learning_rate": 1.907466574567034e-06, "loss": 0.68603081, "num_input_tokens_seen": 184051440, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.74609375, "step": 8562, "time_per_iteration": 2.740692615509033 }, { "auxiliary_loss_clip": 0.01127572, "auxiliary_loss_mlp": 0.01030945, "balance_loss_clip": 1.01690841, "balance_loss_mlp": 1.04330182, "epoch": 0.5148354125958214, "flos": 22744669760640.0, "grad_norm": 1.6686965201116344, "language_loss": 0.77301359, "learning_rate": 1.9070891584943885e-06, "loss": 0.79459882, "num_input_tokens_seen": 184070205, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 8563, "time_per_iteration": 2.6079905033111572 }, { "auxiliary_loss_clip": 0.01129025, "auxiliary_loss_mlp": 0.01034467, "balance_loss_clip": 1.01863647, "balance_loss_mlp": 1.0411973, "epoch": 0.5148955358484893, "flos": 23068763199360.0, "grad_norm": 2.4944042383768936, "language_loss": 0.82380569, "learning_rate": 1.9067117457374921e-06, "loss": 0.84544063, "num_input_tokens_seen": 184087345, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.7890625, "step": 8564, "time_per_iteration": 2.632195234298706 }, { "auxiliary_loss_clip": 0.0113715, "auxiliary_loss_mlp": 0.01034081, "balance_loss_clip": 1.01930463, "balance_loss_mlp": 1.04193556, "epoch": 0.5149556591011574, "flos": 20340127560960.0, "grad_norm": 1.8820316550203822, "language_loss": 0.72971427, "learning_rate": 1.9063343363098132e-06, "loss": 0.75142658, "num_input_tokens_seen": 184107110, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.77734375, "step": 8565, "time_per_iteration": 2.5456454753875732 }, { "auxiliary_loss_clip": 0.0113059, "auxiliary_loss_mlp": 0.01032112, "balance_loss_clip": 1.01617408, "balance_loss_mlp": 1.04288065, "epoch": 0.5150157823538254, "flos": 22638230784000.0, "grad_norm": 12.531402996551405, "language_loss": 0.68291944, "learning_rate": 1.9059569302248213e-06, "loss": 0.70454645, "num_input_tokens_seen": 184127105, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.78515625, "step": 8566, "time_per_iteration": 2.5932776927948 }, { "auxiliary_loss_clip": 0.01118399, "auxiliary_loss_mlp": 0.01280705, "balance_loss_clip": 1.02004135, "balance_loss_mlp": 1.03946042, "epoch": 0.5150759056064933, "flos": 26067627832320.0, "grad_norm": 1.5841935489350518, "language_loss": 0.78030539, "learning_rate": 1.9055795274959841e-06, "loss": 0.80429637, "num_input_tokens_seen": 184148060, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7890625, "step": 8567, "time_per_iteration": 2.554222822189331 }, { "auxiliary_loss_clip": 0.01127719, "auxiliary_loss_mlp": 0.01033376, "balance_loss_clip": 1.01878476, "balance_loss_mlp": 1.04183674, "epoch": 0.5151360288591613, "flos": 25952641418880.0, "grad_norm": 2.049433896974165, "language_loss": 0.78578204, "learning_rate": 1.9052021281367711e-06, "loss": 0.80739295, "num_input_tokens_seen": 184166175, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.76953125, "step": 8568, "time_per_iteration": 2.5967094898223877 }, { "auxiliary_loss_clip": 0.01131335, "auxiliary_loss_mlp": 0.01031282, "balance_loss_clip": 1.01811504, "balance_loss_mlp": 1.03984571, "epoch": 0.5151961521118292, "flos": 18507246312960.0, "grad_norm": 2.1933383257435373, "language_loss": 0.90968394, "learning_rate": 1.9048247321606505e-06, "loss": 0.93131012, "num_input_tokens_seen": 184182600, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 8569, "time_per_iteration": 2.5252716541290283 }, { "auxiliary_loss_clip": 0.01119324, "auxiliary_loss_mlp": 0.01035211, "balance_loss_clip": 1.02107835, "balance_loss_mlp": 1.04275036, "epoch": 0.5152562753644973, "flos": 22233696837120.0, "grad_norm": 1.5868812122550058, "language_loss": 0.76652992, "learning_rate": 1.90444733958109e-06, "loss": 0.78807533, "num_input_tokens_seen": 184202020, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 8570, "time_per_iteration": 2.6018505096435547 }, { "auxiliary_loss_clip": 0.01125964, "auxiliary_loss_mlp": 0.01041883, "balance_loss_clip": 1.02661204, "balance_loss_mlp": 1.04096818, "epoch": 0.5153163986171652, "flos": 38436555047040.0, "grad_norm": 1.4872948443157272, "language_loss": 0.7375223, "learning_rate": 1.9040699504115584e-06, "loss": 0.75920081, "num_input_tokens_seen": 184224850, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.76171875, "step": 8571, "time_per_iteration": 2.6877171993255615 }, { "auxiliary_loss_clip": 0.0105585, "auxiliary_loss_mlp": 0.010061, "balance_loss_clip": 1.00464559, "balance_loss_mlp": 1.0138545, "epoch": 0.5153765218698332, "flos": 66384503015040.0, "grad_norm": 0.7798931414462574, "language_loss": 0.52968258, "learning_rate": 1.9036925646655231e-06, "loss": 0.55030209, "num_input_tokens_seen": 184288520, "router_z_loss_clip": 0.01452637, "router_z_loss_mlp": 0.24414062, "step": 8572, "time_per_iteration": 4.701510667800903 }, { "auxiliary_loss_clip": 0.01124035, "auxiliary_loss_mlp": 0.01036799, "balance_loss_clip": 1.02389419, "balance_loss_mlp": 1.04279113, "epoch": 0.5154366451225011, "flos": 24024669891840.0, "grad_norm": 1.7512252848367593, "language_loss": 0.76228005, "learning_rate": 1.9033151823564531e-06, "loss": 0.78388846, "num_input_tokens_seen": 184308565, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 8573, "time_per_iteration": 2.6927640438079834 }, { "auxiliary_loss_clip": 0.01123474, "auxiliary_loss_mlp": 0.01032015, "balance_loss_clip": 1.0183661, "balance_loss_mlp": 1.04102755, "epoch": 0.5154967683751691, "flos": 23468843859840.0, "grad_norm": 2.0602092461902557, "language_loss": 0.76885098, "learning_rate": 1.9029378034978153e-06, "loss": 0.79040587, "num_input_tokens_seen": 184326795, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 8574, "time_per_iteration": 2.4993093013763428 }, { "auxiliary_loss_clip": 0.01133806, "auxiliary_loss_mlp": 0.01035484, "balance_loss_clip": 1.02278209, "balance_loss_mlp": 1.04133129, "epoch": 0.515556891627837, "flos": 23805650712960.0, "grad_norm": 1.834377993852244, "language_loss": 0.85191298, "learning_rate": 1.9025604281030772e-06, "loss": 0.87360591, "num_input_tokens_seen": 184345990, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7421875, "step": 8575, "time_per_iteration": 2.6517810821533203 }, { "auxiliary_loss_clip": 0.01124116, "auxiliary_loss_mlp": 0.01032818, "balance_loss_clip": 1.01876378, "balance_loss_mlp": 1.03941143, "epoch": 0.515617014880505, "flos": 19828544106240.0, "grad_norm": 1.697014018657278, "language_loss": 0.77012098, "learning_rate": 1.9021830561857074e-06, "loss": 0.79169023, "num_input_tokens_seen": 184366300, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 8576, "time_per_iteration": 3.9468994140625 }, { "auxiliary_loss_clip": 0.01155639, "auxiliary_loss_mlp": 0.01279841, "balance_loss_clip": 1.01787198, "balance_loss_mlp": 1.04060519, "epoch": 0.515677138133173, "flos": 14245907385600.0, "grad_norm": 2.170880377749584, "language_loss": 0.75335306, "learning_rate": 1.9018056877591725e-06, "loss": 0.77770782, "num_input_tokens_seen": 184383030, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 8577, "time_per_iteration": 2.5488669872283936 }, { "auxiliary_loss_clip": 0.0113596, "auxiliary_loss_mlp": 0.01038491, "balance_loss_clip": 1.02328563, "balance_loss_mlp": 1.04113173, "epoch": 0.515737261385841, "flos": 28289707920000.0, "grad_norm": 1.9108033559661017, "language_loss": 0.81006688, "learning_rate": 1.9014283228369399e-06, "loss": 0.83181143, "num_input_tokens_seen": 184403410, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7734375, "step": 8578, "time_per_iteration": 2.6529879570007324 }, { "auxiliary_loss_clip": 0.01113238, "auxiliary_loss_mlp": 0.01033858, "balance_loss_clip": 1.02023196, "balance_loss_mlp": 1.03967106, "epoch": 0.515797384638509, "flos": 27891925729920.0, "grad_norm": 1.6976320263480904, "language_loss": 0.7602967, "learning_rate": 1.9010509614324766e-06, "loss": 0.78176761, "num_input_tokens_seen": 184423830, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 8579, "time_per_iteration": 2.5320394039154053 }, { "auxiliary_loss_clip": 0.01144237, "auxiliary_loss_mlp": 0.01031209, "balance_loss_clip": 1.01780415, "balance_loss_mlp": 1.04093111, "epoch": 0.5158575078911769, "flos": 23040071210880.0, "grad_norm": 1.701139535421083, "language_loss": 0.79174531, "learning_rate": 1.9006736035592505e-06, "loss": 0.81349981, "num_input_tokens_seen": 184445050, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.76171875, "step": 8580, "time_per_iteration": 2.6007182598114014 }, { "auxiliary_loss_clip": 0.01148302, "auxiliary_loss_mlp": 0.0104491, "balance_loss_clip": 1.02991915, "balance_loss_mlp": 1.04308593, "epoch": 0.5159176311438449, "flos": 12641346938880.0, "grad_norm": 1.959856517157245, "language_loss": 0.72955441, "learning_rate": 1.900296249230728e-06, "loss": 0.75148648, "num_input_tokens_seen": 184460775, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.78515625, "step": 8581, "time_per_iteration": 2.496346950531006 }, { "auxiliary_loss_clip": 0.0112416, "auxiliary_loss_mlp": 0.01030714, "balance_loss_clip": 1.01770282, "balance_loss_mlp": 1.04152536, "epoch": 0.5159777543965128, "flos": 15558154951680.0, "grad_norm": 2.1378058707473575, "language_loss": 0.7442292, "learning_rate": 1.8999188984603753e-06, "loss": 0.76577789, "num_input_tokens_seen": 184477365, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.73828125, "step": 8582, "time_per_iteration": 4.103499412536621 }, { "auxiliary_loss_clip": 0.01142875, "auxiliary_loss_mlp": 0.01032721, "balance_loss_clip": 1.01995385, "balance_loss_mlp": 1.04162407, "epoch": 0.5160378776491809, "flos": 23221671396480.0, "grad_norm": 4.119963431561096, "language_loss": 0.65917844, "learning_rate": 1.8995415512616602e-06, "loss": 0.68093443, "num_input_tokens_seen": 184497045, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.74609375, "step": 8583, "time_per_iteration": 4.12235164642334 }, { "auxiliary_loss_clip": 0.01128434, "auxiliary_loss_mlp": 0.01035351, "balance_loss_clip": 1.02053332, "balance_loss_mlp": 1.04331183, "epoch": 0.5160980009018488, "flos": 21944616180480.0, "grad_norm": 1.625596654267476, "language_loss": 0.76096702, "learning_rate": 1.8991642076480482e-06, "loss": 0.78260481, "num_input_tokens_seen": 184517675, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76171875, "step": 8584, "time_per_iteration": 2.5606889724731445 }, { "auxiliary_loss_clip": 0.01146583, "auxiliary_loss_mlp": 0.01045043, "balance_loss_clip": 1.03078544, "balance_loss_mlp": 1.04210234, "epoch": 0.5161581241545168, "flos": 22784064001920.0, "grad_norm": 2.3756510165273896, "language_loss": 0.78667527, "learning_rate": 1.8987868676330068e-06, "loss": 0.80859149, "num_input_tokens_seen": 184537745, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 8585, "time_per_iteration": 2.60621976852417 }, { "auxiliary_loss_clip": 0.01121528, "auxiliary_loss_mlp": 0.0103782, "balance_loss_clip": 1.02487373, "balance_loss_mlp": 1.04094458, "epoch": 0.5162182474071847, "flos": 19675384513920.0, "grad_norm": 2.5875837587210304, "language_loss": 0.80829376, "learning_rate": 1.8984095312300017e-06, "loss": 0.82988727, "num_input_tokens_seen": 184553630, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 8586, "time_per_iteration": 2.5239171981811523 }, { "auxiliary_loss_clip": 0.01136286, "auxiliary_loss_mlp": 0.01033464, "balance_loss_clip": 1.01961768, "balance_loss_mlp": 1.0424, "epoch": 0.5162783706598527, "flos": 20046198568320.0, "grad_norm": 1.7095101654422313, "language_loss": 0.71142668, "learning_rate": 1.8980321984524988e-06, "loss": 0.73312414, "num_input_tokens_seen": 184573530, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 8587, "time_per_iteration": 2.6175966262817383 }, { "auxiliary_loss_clip": 0.01049432, "auxiliary_loss_mlp": 0.01001832, "balance_loss_clip": 1.00030661, "balance_loss_mlp": 1.01652527, "epoch": 0.5163384939125206, "flos": 69959553713280.0, "grad_norm": 0.7671558150444975, "language_loss": 0.57819688, "learning_rate": 1.8976548693139648e-06, "loss": 0.59870952, "num_input_tokens_seen": 184637875, "router_z_loss_clip": 0.01525879, "router_z_loss_mlp": 0.23925781, "step": 8588, "time_per_iteration": 3.1691999435424805 }, { "auxiliary_loss_clip": 0.01137167, "auxiliary_loss_mlp": 0.01042276, "balance_loss_clip": 1.02752376, "balance_loss_mlp": 1.0425241, "epoch": 0.5163986171651886, "flos": 17417034668160.0, "grad_norm": 1.8034792846207048, "language_loss": 0.75479609, "learning_rate": 1.8972775438278646e-06, "loss": 0.77659059, "num_input_tokens_seen": 184656125, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.765625, "step": 8589, "time_per_iteration": 2.5537984371185303 }, { "auxiliary_loss_clip": 0.01127046, "auxiliary_loss_mlp": 0.01033454, "balance_loss_clip": 1.02015626, "balance_loss_mlp": 1.04215145, "epoch": 0.5164587404178566, "flos": 21322679166720.0, "grad_norm": 1.726217325766284, "language_loss": 0.67621768, "learning_rate": 1.8969002220076654e-06, "loss": 0.69782269, "num_input_tokens_seen": 184675920, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 8590, "time_per_iteration": 2.630624294281006 }, { "auxiliary_loss_clip": 0.01048296, "auxiliary_loss_mlp": 0.01002092, "balance_loss_clip": 1.0004828, "balance_loss_mlp": 1.01551259, "epoch": 0.5165188636705246, "flos": 68057149691520.0, "grad_norm": 0.778469816512835, "language_loss": 0.55886596, "learning_rate": 1.8965229038668323e-06, "loss": 0.57936984, "num_input_tokens_seen": 184730520, "router_z_loss_clip": 0.01611328, "router_z_loss_mlp": 0.24023438, "step": 8591, "time_per_iteration": 3.0575718879699707 }, { "auxiliary_loss_clip": 0.01128196, "auxiliary_loss_mlp": 0.01027825, "balance_loss_clip": 1.01539767, "balance_loss_mlp": 1.0408957, "epoch": 0.5165789869231926, "flos": 19385657412480.0, "grad_norm": 1.6437742477779451, "language_loss": 0.81272382, "learning_rate": 1.8961455894188297e-06, "loss": 0.83428407, "num_input_tokens_seen": 184748340, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 8592, "time_per_iteration": 2.584947347640991 }, { "auxiliary_loss_clip": 0.01124535, "auxiliary_loss_mlp": 0.01029522, "balance_loss_clip": 1.01624227, "balance_loss_mlp": 1.04338384, "epoch": 0.5166391101758605, "flos": 20960197067520.0, "grad_norm": 1.682628485311245, "language_loss": 0.83279073, "learning_rate": 1.8957682786771243e-06, "loss": 0.85433131, "num_input_tokens_seen": 184766615, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 8593, "time_per_iteration": 2.5592260360717773 }, { "auxiliary_loss_clip": 0.01136039, "auxiliary_loss_mlp": 0.01035475, "balance_loss_clip": 1.02217698, "balance_loss_mlp": 1.04349875, "epoch": 0.5166992334285285, "flos": 29462407148160.0, "grad_norm": 1.5594262007019668, "language_loss": 0.69001722, "learning_rate": 1.8953909716551807e-06, "loss": 0.71173227, "num_input_tokens_seen": 184788075, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.74609375, "step": 8594, "time_per_iteration": 2.8039145469665527 }, { "auxiliary_loss_clip": 0.01141016, "auxiliary_loss_mlp": 0.01029459, "balance_loss_clip": 1.01583362, "balance_loss_mlp": 1.0408535, "epoch": 0.5167593566811964, "flos": 20304360593280.0, "grad_norm": 1.5649472786257321, "language_loss": 0.77276272, "learning_rate": 1.8950136683664645e-06, "loss": 0.79446745, "num_input_tokens_seen": 184808710, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 8595, "time_per_iteration": 2.5333194732666016 }, { "auxiliary_loss_clip": 0.01116543, "auxiliary_loss_mlp": 0.01036017, "balance_loss_clip": 1.02264166, "balance_loss_mlp": 1.04313016, "epoch": 0.5168194799338645, "flos": 14611370313600.0, "grad_norm": 1.5128814693817092, "language_loss": 0.64912891, "learning_rate": 1.8946363688244405e-06, "loss": 0.67065448, "num_input_tokens_seen": 184826475, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 8596, "time_per_iteration": 2.640941858291626 }, { "auxiliary_loss_clip": 0.01145203, "auxiliary_loss_mlp": 0.01035818, "balance_loss_clip": 1.02046406, "balance_loss_mlp": 1.04431129, "epoch": 0.5168796031865324, "flos": 25007257411200.0, "grad_norm": 1.5141730312063155, "language_loss": 0.75527006, "learning_rate": 1.894259073042573e-06, "loss": 0.7770803, "num_input_tokens_seen": 184845245, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.73828125, "step": 8597, "time_per_iteration": 2.607604742050171 }, { "auxiliary_loss_clip": 0.01116549, "auxiliary_loss_mlp": 0.01026384, "balance_loss_clip": 1.01312757, "balance_loss_mlp": 1.04149675, "epoch": 0.5169397264392004, "flos": 26939969533440.0, "grad_norm": 1.6143924674625183, "language_loss": 0.81009793, "learning_rate": 1.8938817810343276e-06, "loss": 0.83152723, "num_input_tokens_seen": 184866605, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 8598, "time_per_iteration": 2.7065975666046143 }, { "auxiliary_loss_clip": 0.01150801, "auxiliary_loss_mlp": 0.01042266, "balance_loss_clip": 1.02829421, "balance_loss_mlp": 1.04156208, "epoch": 0.5169998496918683, "flos": 25407804948480.0, "grad_norm": 1.479527110058982, "language_loss": 0.7533741, "learning_rate": 1.8935044928131679e-06, "loss": 0.77530479, "num_input_tokens_seen": 184886945, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73828125, "step": 8599, "time_per_iteration": 2.6127512454986572 }, { "auxiliary_loss_clip": 0.01132888, "auxiliary_loss_mlp": 0.01033099, "balance_loss_clip": 1.01996207, "balance_loss_mlp": 1.04264808, "epoch": 0.5170599729445363, "flos": 24680793674880.0, "grad_norm": 3.5213526174310394, "language_loss": 0.72138512, "learning_rate": 1.8931272083925593e-06, "loss": 0.74304491, "num_input_tokens_seen": 184905590, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 8600, "time_per_iteration": 2.656111478805542 }, { "auxiliary_loss_clip": 0.01142598, "auxiliary_loss_mlp": 0.01031994, "balance_loss_clip": 1.01809967, "balance_loss_mlp": 1.04219294, "epoch": 0.5171200961972042, "flos": 20994455664000.0, "grad_norm": 1.5630436988654768, "language_loss": 0.74489915, "learning_rate": 1.8927499277859655e-06, "loss": 0.76664507, "num_input_tokens_seen": 184925555, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 8601, "time_per_iteration": 2.601581573486328 }, { "auxiliary_loss_clip": 0.01125512, "auxiliary_loss_mlp": 0.01039432, "balance_loss_clip": 1.02490592, "balance_loss_mlp": 1.04232311, "epoch": 0.5171802194498722, "flos": 22745639427840.0, "grad_norm": 2.9296482228757013, "language_loss": 0.83950567, "learning_rate": 1.8923726510068513e-06, "loss": 0.86115509, "num_input_tokens_seen": 184944490, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7421875, "step": 8602, "time_per_iteration": 2.5677876472473145 }, { "auxiliary_loss_clip": 0.01117901, "auxiliary_loss_mlp": 0.01033687, "balance_loss_clip": 1.01947737, "balance_loss_mlp": 1.04280925, "epoch": 0.5172403427025402, "flos": 28176732668160.0, "grad_norm": 1.894347428576515, "language_loss": 0.74595857, "learning_rate": 1.8919953780686804e-06, "loss": 0.76747441, "num_input_tokens_seen": 184963190, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75, "step": 8603, "time_per_iteration": 2.6359593868255615 }, { "auxiliary_loss_clip": 0.01137377, "auxiliary_loss_mlp": 0.01037597, "balance_loss_clip": 1.02410281, "balance_loss_mlp": 1.04433882, "epoch": 0.5173004659552082, "flos": 20337829090560.0, "grad_norm": 2.0877686158669446, "language_loss": 0.72705472, "learning_rate": 1.8916181089849162e-06, "loss": 0.74880445, "num_input_tokens_seen": 184981220, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 8604, "time_per_iteration": 2.5851783752441406 }, { "auxiliary_loss_clip": 0.01142438, "auxiliary_loss_mlp": 0.01036944, "balance_loss_clip": 1.02208424, "balance_loss_mlp": 1.04334784, "epoch": 0.5173605892078762, "flos": 19063323740160.0, "grad_norm": 2.1493413088638014, "language_loss": 0.85320008, "learning_rate": 1.8912408437690234e-06, "loss": 0.87499386, "num_input_tokens_seen": 184998810, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 8605, "time_per_iteration": 2.6869914531707764 }, { "auxiliary_loss_clip": 0.01125125, "auxiliary_loss_mlp": 0.01029937, "balance_loss_clip": 1.01715755, "balance_loss_mlp": 1.04151618, "epoch": 0.5174207124605441, "flos": 27995168396160.0, "grad_norm": 1.4815911434005717, "language_loss": 0.64634788, "learning_rate": 1.8908635824344648e-06, "loss": 0.66789854, "num_input_tokens_seen": 185021185, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7421875, "step": 8606, "time_per_iteration": 2.658813238143921 }, { "auxiliary_loss_clip": 0.01133601, "auxiliary_loss_mlp": 0.0103288, "balance_loss_clip": 1.01934958, "balance_loss_mlp": 1.04231286, "epoch": 0.5174808357132121, "flos": 19496657416320.0, "grad_norm": 1.4876167841206336, "language_loss": 0.77425134, "learning_rate": 1.8904863249947043e-06, "loss": 0.79591614, "num_input_tokens_seen": 185038465, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 8607, "time_per_iteration": 2.5685765743255615 }, { "auxiliary_loss_clip": 0.01127928, "auxiliary_loss_mlp": 0.01038958, "balance_loss_clip": 1.02424121, "balance_loss_mlp": 1.04359567, "epoch": 0.51754095896588, "flos": 22784171742720.0, "grad_norm": 2.0753854604952946, "language_loss": 0.72311425, "learning_rate": 1.8901090714632054e-06, "loss": 0.74478304, "num_input_tokens_seen": 185057340, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7578125, "step": 8608, "time_per_iteration": 2.6304802894592285 }, { "auxiliary_loss_clip": 0.01133067, "auxiliary_loss_mlp": 0.01032766, "balance_loss_clip": 1.01830626, "balance_loss_mlp": 1.04429114, "epoch": 0.5176010822185481, "flos": 22669257156480.0, "grad_norm": 1.9462183712280818, "language_loss": 0.86403221, "learning_rate": 1.8897318218534304e-06, "loss": 0.88569045, "num_input_tokens_seen": 185074935, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 8609, "time_per_iteration": 2.6189539432525635 }, { "auxiliary_loss_clip": 0.01124583, "auxiliary_loss_mlp": 0.01028375, "balance_loss_clip": 1.01532149, "balance_loss_mlp": 1.04132771, "epoch": 0.517661205471216, "flos": 23951196622080.0, "grad_norm": 1.5324887430840577, "language_loss": 0.73793566, "learning_rate": 1.8893545761788436e-06, "loss": 0.75946522, "num_input_tokens_seen": 185095050, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 8610, "time_per_iteration": 2.610154867172241 }, { "auxiliary_loss_clip": 0.01123949, "auxiliary_loss_mlp": 0.01037189, "balance_loss_clip": 1.02215695, "balance_loss_mlp": 1.04550302, "epoch": 0.517721328723884, "flos": 15596076735360.0, "grad_norm": 1.9657135873194065, "language_loss": 0.67069304, "learning_rate": 1.8889773344529068e-06, "loss": 0.69230443, "num_input_tokens_seen": 185112275, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.78515625, "step": 8611, "time_per_iteration": 2.5475382804870605 }, { "auxiliary_loss_clip": 0.01116266, "auxiliary_loss_mlp": 0.01034648, "balance_loss_clip": 1.02097487, "balance_loss_mlp": 1.04166961, "epoch": 0.5177814519765519, "flos": 20960197067520.0, "grad_norm": 2.085654652245792, "language_loss": 0.77334487, "learning_rate": 1.888600096689084e-06, "loss": 0.79485404, "num_input_tokens_seen": 185132165, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 8612, "time_per_iteration": 2.5767223834991455 }, { "auxiliary_loss_clip": 0.01128408, "auxiliary_loss_mlp": 0.01036993, "balance_loss_clip": 1.02361774, "balance_loss_mlp": 1.04366446, "epoch": 0.5178415752292199, "flos": 17967832796160.0, "grad_norm": 2.019089136086034, "language_loss": 0.816504, "learning_rate": 1.888222862900837e-06, "loss": 0.83815801, "num_input_tokens_seen": 185151025, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7578125, "step": 8613, "time_per_iteration": 4.009929180145264 }, { "auxiliary_loss_clip": 0.01121487, "auxiliary_loss_mlp": 0.01040545, "balance_loss_clip": 1.02569759, "balance_loss_mlp": 1.04578483, "epoch": 0.5179016984818878, "flos": 17821496787840.0, "grad_norm": 2.2592074098752026, "language_loss": 0.66590321, "learning_rate": 1.887845633101628e-06, "loss": 0.6875236, "num_input_tokens_seen": 185168455, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7578125, "step": 8614, "time_per_iteration": 2.546816110610962 }, { "auxiliary_loss_clip": 0.01139147, "auxiliary_loss_mlp": 0.010346, "balance_loss_clip": 1.02036643, "balance_loss_mlp": 1.04441214, "epoch": 0.5179618217345558, "flos": 17820455293440.0, "grad_norm": 2.1985338715000937, "language_loss": 0.86667317, "learning_rate": 1.8874684073049204e-06, "loss": 0.88841069, "num_input_tokens_seen": 185184415, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 8615, "time_per_iteration": 2.5690560340881348 }, { "auxiliary_loss_clip": 0.01126539, "auxiliary_loss_mlp": 0.01042125, "balance_loss_clip": 1.02882695, "balance_loss_mlp": 1.04367995, "epoch": 0.5180219449872238, "flos": 22522131048960.0, "grad_norm": 1.6555575772653057, "language_loss": 0.80866152, "learning_rate": 1.8870911855241755e-06, "loss": 0.83034813, "num_input_tokens_seen": 185202910, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 8616, "time_per_iteration": 2.581819772720337 }, { "auxiliary_loss_clip": 0.01136642, "auxiliary_loss_mlp": 0.01291224, "balance_loss_clip": 1.02950275, "balance_loss_mlp": 1.04909742, "epoch": 0.5180820682398918, "flos": 23915465568000.0, "grad_norm": 1.7030268273584812, "language_loss": 0.74685019, "learning_rate": 1.8867139677728564e-06, "loss": 0.77112883, "num_input_tokens_seen": 185223085, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 8617, "time_per_iteration": 2.6474502086639404 }, { "auxiliary_loss_clip": 0.01120414, "auxiliary_loss_mlp": 0.01040949, "balance_loss_clip": 1.0259943, "balance_loss_mlp": 1.04375648, "epoch": 0.5181421914925598, "flos": 16979930064000.0, "grad_norm": 1.6842745194900426, "language_loss": 0.70348871, "learning_rate": 1.886336754064424e-06, "loss": 0.72510231, "num_input_tokens_seen": 185241295, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.765625, "step": 8618, "time_per_iteration": 4.015296459197998 }, { "auxiliary_loss_clip": 0.01050266, "auxiliary_loss_mlp": 0.01005862, "balance_loss_clip": 1.00438929, "balance_loss_mlp": 1.01702464, "epoch": 0.5182023147452277, "flos": 66059870872320.0, "grad_norm": 0.9460122752369228, "language_loss": 0.67251444, "learning_rate": 1.8859595444123401e-06, "loss": 0.69307566, "num_input_tokens_seen": 185298295, "router_z_loss_clip": 0.01470947, "router_z_loss_mlp": 0.24414062, "step": 8619, "time_per_iteration": 3.1693480014801025 }, { "auxiliary_loss_clip": 0.01150312, "auxiliary_loss_mlp": 0.01031745, "balance_loss_clip": 1.0181967, "balance_loss_mlp": 1.04312229, "epoch": 0.5182624379978957, "flos": 18187749815040.0, "grad_norm": 2.3560707819139504, "language_loss": 0.79432911, "learning_rate": 1.8855823388300672e-06, "loss": 0.81614971, "num_input_tokens_seen": 185317000, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 8620, "time_per_iteration": 2.611499786376953 }, { "auxiliary_loss_clip": 0.01162345, "auxiliary_loss_mlp": 0.01285268, "balance_loss_clip": 1.02540767, "balance_loss_mlp": 1.04461205, "epoch": 0.5183225612505636, "flos": 14026708638720.0, "grad_norm": 2.085206808247366, "language_loss": 0.8213532, "learning_rate": 1.8852051373310665e-06, "loss": 0.84582925, "num_input_tokens_seen": 185331185, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.73046875, "step": 8621, "time_per_iteration": 2.624420642852783 }, { "auxiliary_loss_clip": 0.01136563, "auxiliary_loss_mlp": 0.01037678, "balance_loss_clip": 1.02483881, "balance_loss_mlp": 1.04377115, "epoch": 0.5183826845032317, "flos": 23659781581440.0, "grad_norm": 2.09123430514602, "language_loss": 0.65499127, "learning_rate": 1.8848279399287987e-06, "loss": 0.67673367, "num_input_tokens_seen": 185348955, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75, "step": 8622, "time_per_iteration": 2.6425693035125732 }, { "auxiliary_loss_clip": 0.01141222, "auxiliary_loss_mlp": 0.0104085, "balance_loss_clip": 1.02615774, "balance_loss_mlp": 1.04485297, "epoch": 0.5184428077558996, "flos": 15888605097600.0, "grad_norm": 1.7879477336639378, "language_loss": 0.60716325, "learning_rate": 1.8844507466367254e-06, "loss": 0.62898397, "num_input_tokens_seen": 185367330, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 8623, "time_per_iteration": 2.554845094680786 }, { "auxiliary_loss_clip": 0.01117172, "auxiliary_loss_mlp": 0.01032977, "balance_loss_clip": 1.019274, "balance_loss_mlp": 1.04170251, "epoch": 0.5185029310085676, "flos": 21030833162880.0, "grad_norm": 1.7587180665729092, "language_loss": 0.7641961, "learning_rate": 1.8840735574683082e-06, "loss": 0.78569758, "num_input_tokens_seen": 185385060, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75390625, "step": 8624, "time_per_iteration": 5.446946382522583 }, { "auxiliary_loss_clip": 0.01145364, "auxiliary_loss_mlp": 0.01031798, "balance_loss_clip": 1.01900077, "balance_loss_mlp": 1.04268777, "epoch": 0.5185630542612355, "flos": 26542690133760.0, "grad_norm": 1.708390159197013, "language_loss": 0.70591235, "learning_rate": 1.8836963724370074e-06, "loss": 0.72768402, "num_input_tokens_seen": 185403745, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.75390625, "step": 8625, "time_per_iteration": 2.6326651573181152 }, { "auxiliary_loss_clip": 0.01136507, "auxiliary_loss_mlp": 0.01032842, "balance_loss_clip": 1.02018762, "balance_loss_mlp": 1.04348254, "epoch": 0.5186231775139035, "flos": 20668422890880.0, "grad_norm": 1.8335614667937885, "language_loss": 0.68122649, "learning_rate": 1.8833191915562835e-06, "loss": 0.70291996, "num_input_tokens_seen": 185422620, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.75, "step": 8626, "time_per_iteration": 2.6120898723602295 }, { "auxiliary_loss_clip": 0.01126902, "auxiliary_loss_mlp": 0.01029585, "balance_loss_clip": 1.0162698, "balance_loss_mlp": 1.04416704, "epoch": 0.5186833007665714, "flos": 20885502735360.0, "grad_norm": 2.232485529026909, "language_loss": 0.70423424, "learning_rate": 1.8829420148395978e-06, "loss": 0.72579908, "num_input_tokens_seen": 185439380, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 8627, "time_per_iteration": 2.5869877338409424 }, { "auxiliary_loss_clip": 0.01121971, "auxiliary_loss_mlp": 0.01286632, "balance_loss_clip": 1.0263772, "balance_loss_mlp": 1.04435921, "epoch": 0.5187434240192395, "flos": 20886903365760.0, "grad_norm": 1.8988970819501798, "language_loss": 0.72896808, "learning_rate": 1.8825648423004101e-06, "loss": 0.75305408, "num_input_tokens_seen": 185458830, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7734375, "step": 8628, "time_per_iteration": 2.6638081073760986 }, { "auxiliary_loss_clip": 0.01136943, "auxiliary_loss_mlp": 0.0103626, "balance_loss_clip": 1.02303934, "balance_loss_mlp": 1.04501271, "epoch": 0.5188035472719074, "flos": 19859929614720.0, "grad_norm": 1.651381312911307, "language_loss": 0.77679187, "learning_rate": 1.8821876739521815e-06, "loss": 0.7985239, "num_input_tokens_seen": 185477270, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 8629, "time_per_iteration": 2.5840647220611572 }, { "auxiliary_loss_clip": 0.01130687, "auxiliary_loss_mlp": 0.01035284, "balance_loss_clip": 1.02067459, "balance_loss_mlp": 1.04515374, "epoch": 0.5188636705245754, "flos": 21138313633920.0, "grad_norm": 1.8852906474015692, "language_loss": 0.74411464, "learning_rate": 1.881810509808372e-06, "loss": 0.76577431, "num_input_tokens_seen": 185495795, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 8630, "time_per_iteration": 2.671940565109253 }, { "auxiliary_loss_clip": 0.01126631, "auxiliary_loss_mlp": 0.01039912, "balance_loss_clip": 1.02476025, "balance_loss_mlp": 1.04151201, "epoch": 0.5189237937772434, "flos": 22419786222720.0, "grad_norm": 1.8070760067076697, "language_loss": 0.80350327, "learning_rate": 1.8814333498824409e-06, "loss": 0.82516873, "num_input_tokens_seen": 185514885, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.765625, "step": 8631, "time_per_iteration": 2.6061084270477295 }, { "auxiliary_loss_clip": 0.0112962, "auxiliary_loss_mlp": 0.0103294, "balance_loss_clip": 1.01873589, "balance_loss_mlp": 1.04408765, "epoch": 0.5189839170299113, "flos": 25446696399360.0, "grad_norm": 1.629433853461044, "language_loss": 0.74114484, "learning_rate": 1.8810561941878488e-06, "loss": 0.76277047, "num_input_tokens_seen": 185537155, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 8632, "time_per_iteration": 2.648516893386841 }, { "auxiliary_loss_clip": 0.01139871, "auxiliary_loss_mlp": 0.01031744, "balance_loss_clip": 1.01821399, "balance_loss_mlp": 1.04163492, "epoch": 0.5190440402825793, "flos": 18587722734720.0, "grad_norm": 1.9125384736967623, "language_loss": 0.79331088, "learning_rate": 1.880679042738055e-06, "loss": 0.815027, "num_input_tokens_seen": 185555520, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 8633, "time_per_iteration": 2.531341791152954 }, { "auxiliary_loss_clip": 0.01125423, "auxiliary_loss_mlp": 0.01033022, "balance_loss_clip": 1.01962316, "balance_loss_mlp": 1.04163003, "epoch": 0.5191041635352472, "flos": 21908633731200.0, "grad_norm": 2.16122959184214, "language_loss": 0.80510139, "learning_rate": 1.8803018955465194e-06, "loss": 0.82668585, "num_input_tokens_seen": 185573855, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 8634, "time_per_iteration": 2.573850631713867 }, { "auxiliary_loss_clip": 0.01117146, "auxiliary_loss_mlp": 0.0103604, "balance_loss_clip": 1.02218175, "balance_loss_mlp": 1.04373622, "epoch": 0.5191642867879153, "flos": 27527971173120.0, "grad_norm": 1.5016655775548444, "language_loss": 0.68944311, "learning_rate": 1.8799247526267015e-06, "loss": 0.71097493, "num_input_tokens_seen": 185595145, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 8635, "time_per_iteration": 2.5954136848449707 }, { "auxiliary_loss_clip": 0.01140216, "auxiliary_loss_mlp": 0.01035587, "balance_loss_clip": 1.02196121, "balance_loss_mlp": 1.04523027, "epoch": 0.5192244100405832, "flos": 15705999331200.0, "grad_norm": 1.7536577728500469, "language_loss": 0.77382594, "learning_rate": 1.87954761399206e-06, "loss": 0.79558402, "num_input_tokens_seen": 185613320, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76953125, "step": 8636, "time_per_iteration": 2.5430097579956055 }, { "auxiliary_loss_clip": 0.01119904, "auxiliary_loss_mlp": 0.01034554, "balance_loss_clip": 1.02075565, "balance_loss_mlp": 1.04346585, "epoch": 0.5192845332932512, "flos": 12057080313600.0, "grad_norm": 2.012893579900223, "language_loss": 0.70981306, "learning_rate": 1.8791704796560547e-06, "loss": 0.73135763, "num_input_tokens_seen": 185630730, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.765625, "step": 8637, "time_per_iteration": 2.522867202758789 }, { "auxiliary_loss_clip": 0.01142762, "auxiliary_loss_mlp": 0.01031077, "balance_loss_clip": 1.0170939, "balance_loss_mlp": 1.04280436, "epoch": 0.5193446565459191, "flos": 18953185662720.0, "grad_norm": 1.9971561718877933, "language_loss": 0.76078373, "learning_rate": 1.8787933496321433e-06, "loss": 0.7825222, "num_input_tokens_seen": 185648515, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.734375, "step": 8638, "time_per_iteration": 2.5659759044647217 }, { "auxiliary_loss_clip": 0.01141941, "auxiliary_loss_mlp": 0.01032789, "balance_loss_clip": 1.01944983, "balance_loss_mlp": 1.04154885, "epoch": 0.5194047797985871, "flos": 20374960775040.0, "grad_norm": 1.9266707065382025, "language_loss": 0.74407911, "learning_rate": 1.8784162239337862e-06, "loss": 0.76582634, "num_input_tokens_seen": 185665220, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 8639, "time_per_iteration": 2.68285870552063 }, { "auxiliary_loss_clip": 0.01116365, "auxiliary_loss_mlp": 0.01032934, "balance_loss_clip": 1.01885545, "balance_loss_mlp": 1.04229522, "epoch": 0.519464903051255, "flos": 24353001135360.0, "grad_norm": 1.889546136323985, "language_loss": 0.77460527, "learning_rate": 1.8780391025744413e-06, "loss": 0.79609823, "num_input_tokens_seen": 185683750, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73828125, "step": 8640, "time_per_iteration": 2.63092041015625 }, { "auxiliary_loss_clip": 0.01142635, "auxiliary_loss_mlp": 0.0103507, "balance_loss_clip": 1.02195668, "balance_loss_mlp": 1.04260421, "epoch": 0.519525026303923, "flos": 14061829161600.0, "grad_norm": 1.887143242832047, "language_loss": 0.65602648, "learning_rate": 1.8776619855675666e-06, "loss": 0.67780352, "num_input_tokens_seen": 185700625, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 8641, "time_per_iteration": 2.561250925064087 }, { "auxiliary_loss_clip": 0.01115278, "auxiliary_loss_mlp": 0.01035776, "balance_loss_clip": 1.02250838, "balance_loss_mlp": 1.0418458, "epoch": 0.519585149556591, "flos": 28835873193600.0, "grad_norm": 1.7729473890242338, "language_loss": 0.76221013, "learning_rate": 1.8772848729266212e-06, "loss": 0.78372061, "num_input_tokens_seen": 185721155, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 8642, "time_per_iteration": 2.573829412460327 }, { "auxiliary_loss_clip": 0.0111502, "auxiliary_loss_mlp": 0.01032843, "balance_loss_clip": 1.01956284, "balance_loss_mlp": 1.0421046, "epoch": 0.519645272809259, "flos": 25373007648000.0, "grad_norm": 2.396210458785469, "language_loss": 0.82981944, "learning_rate": 1.8769077646650631e-06, "loss": 0.85129803, "num_input_tokens_seen": 185740990, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 8643, "time_per_iteration": 2.5838770866394043 }, { "auxiliary_loss_clip": 0.01119179, "auxiliary_loss_mlp": 0.0104401, "balance_loss_clip": 1.02971077, "balance_loss_mlp": 1.04278052, "epoch": 0.519705396061927, "flos": 25372863993600.0, "grad_norm": 1.58938505799837, "language_loss": 0.70335066, "learning_rate": 1.8765306607963503e-06, "loss": 0.72498262, "num_input_tokens_seen": 185762235, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 8644, "time_per_iteration": 2.605842351913452 }, { "auxiliary_loss_clip": 0.01122603, "auxiliary_loss_mlp": 0.01285225, "balance_loss_clip": 1.02491879, "balance_loss_mlp": 1.03923082, "epoch": 0.5197655193145949, "flos": 28476228268800.0, "grad_norm": 1.60385867367502, "language_loss": 0.80183786, "learning_rate": 1.8761535613339401e-06, "loss": 0.82591617, "num_input_tokens_seen": 185783415, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 8645, "time_per_iteration": 2.6729049682617188 }, { "auxiliary_loss_clip": 0.01115802, "auxiliary_loss_mlp": 0.01034443, "balance_loss_clip": 1.0206033, "balance_loss_mlp": 1.04109216, "epoch": 0.5198256425672629, "flos": 20009138711040.0, "grad_norm": 1.703404257798647, "language_loss": 0.7789107, "learning_rate": 1.8757764662912913e-06, "loss": 0.80041313, "num_input_tokens_seen": 185801345, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 8646, "time_per_iteration": 2.5191407203674316 }, { "auxiliary_loss_clip": 0.01126532, "auxiliary_loss_mlp": 0.01036314, "balance_loss_clip": 1.0231061, "balance_loss_mlp": 1.04327834, "epoch": 0.5198857658199308, "flos": 19828867328640.0, "grad_norm": 2.071914748888826, "language_loss": 0.6544531, "learning_rate": 1.875399375681861e-06, "loss": 0.67608154, "num_input_tokens_seen": 185820815, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 8647, "time_per_iteration": 2.579533815383911 }, { "auxiliary_loss_clip": 0.01121202, "auxiliary_loss_mlp": 0.01037216, "balance_loss_clip": 1.02211261, "balance_loss_mlp": 1.04352355, "epoch": 0.5199458890725989, "flos": 24461918150400.0, "grad_norm": 1.5872591161305767, "language_loss": 0.7146982, "learning_rate": 1.875022289519106e-06, "loss": 0.73628241, "num_input_tokens_seen": 185841450, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7734375, "step": 8648, "time_per_iteration": 2.5454554557800293 }, { "auxiliary_loss_clip": 0.01135857, "auxiliary_loss_mlp": 0.01031426, "balance_loss_clip": 1.01663232, "balance_loss_mlp": 1.04259062, "epoch": 0.5200060123252668, "flos": 23404779953280.0, "grad_norm": 1.9014500961466532, "language_loss": 0.6424861, "learning_rate": 1.8746452078164843e-06, "loss": 0.66415894, "num_input_tokens_seen": 185859935, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7578125, "step": 8649, "time_per_iteration": 2.5984482765197754 }, { "auxiliary_loss_clip": 0.01137474, "auxiliary_loss_mlp": 0.01033317, "balance_loss_clip": 1.01793885, "balance_loss_mlp": 1.04085875, "epoch": 0.5200661355779348, "flos": 17201355454080.0, "grad_norm": 1.6722400643699844, "language_loss": 0.70347595, "learning_rate": 1.8742681305874523e-06, "loss": 0.72518384, "num_input_tokens_seen": 185876795, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.78515625, "step": 8650, "time_per_iteration": 2.491457223892212 }, { "auxiliary_loss_clip": 0.01114217, "auxiliary_loss_mlp": 0.01031396, "balance_loss_clip": 1.01796722, "balance_loss_mlp": 1.03974998, "epoch": 0.5201262588306027, "flos": 18515075477760.0, "grad_norm": 1.5885642397530335, "language_loss": 0.77278423, "learning_rate": 1.873891057845468e-06, "loss": 0.79424036, "num_input_tokens_seen": 185895570, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 8651, "time_per_iteration": 2.587268829345703 }, { "auxiliary_loss_clip": 0.01164104, "auxiliary_loss_mlp": 0.0103879, "balance_loss_clip": 1.02412772, "balance_loss_mlp": 1.04348016, "epoch": 0.5201863820832707, "flos": 18619395552000.0, "grad_norm": 1.7114617152628915, "language_loss": 0.78718644, "learning_rate": 1.8735139896039874e-06, "loss": 0.80921543, "num_input_tokens_seen": 185913700, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 8652, "time_per_iteration": 2.5678822994232178 }, { "auxiliary_loss_clip": 0.01146085, "auxiliary_loss_mlp": 0.01034258, "balance_loss_clip": 1.01894617, "balance_loss_mlp": 1.04360485, "epoch": 0.5202465053359386, "flos": 22857142222080.0, "grad_norm": 2.0974910303474816, "language_loss": 0.70258307, "learning_rate": 1.8731369258764664e-06, "loss": 0.72438645, "num_input_tokens_seen": 185932460, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.7578125, "step": 8653, "time_per_iteration": 2.632506847381592 }, { "auxiliary_loss_clip": 0.01144184, "auxiliary_loss_mlp": 0.01042591, "balance_loss_clip": 1.02816701, "balance_loss_mlp": 1.04042578, "epoch": 0.5203066285886067, "flos": 21981532383360.0, "grad_norm": 1.7304678875325974, "language_loss": 0.78500319, "learning_rate": 1.8727598666763628e-06, "loss": 0.80687094, "num_input_tokens_seen": 185952030, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 8654, "time_per_iteration": 2.584665298461914 }, { "auxiliary_loss_clip": 0.011303, "auxiliary_loss_mlp": 0.01044522, "balance_loss_clip": 1.03004992, "balance_loss_mlp": 1.04133511, "epoch": 0.5203667518412746, "flos": 20233329448320.0, "grad_norm": 1.857249738755294, "language_loss": 0.84005314, "learning_rate": 1.8723828120171316e-06, "loss": 0.86180139, "num_input_tokens_seen": 185973130, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 8655, "time_per_iteration": 2.645411491394043 }, { "auxiliary_loss_clip": 0.01122819, "auxiliary_loss_mlp": 0.01038046, "balance_loss_clip": 1.02535582, "balance_loss_mlp": 1.04167986, "epoch": 0.5204268750939426, "flos": 15705460627200.0, "grad_norm": 2.2083202751896756, "language_loss": 0.65916234, "learning_rate": 1.8720057619122302e-06, "loss": 0.68077099, "num_input_tokens_seen": 185990200, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 8656, "time_per_iteration": 3.957585096359253 }, { "auxiliary_loss_clip": 0.01146082, "auxiliary_loss_mlp": 0.01037431, "balance_loss_clip": 1.02387047, "balance_loss_mlp": 1.04318213, "epoch": 0.5204869983466105, "flos": 27449469999360.0, "grad_norm": 2.58076392477436, "language_loss": 0.73019791, "learning_rate": 1.871628716375114e-06, "loss": 0.752033, "num_input_tokens_seen": 186009880, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7578125, "step": 8657, "time_per_iteration": 2.6556832790374756 }, { "auxiliary_loss_clip": 0.01141145, "auxiliary_loss_mlp": 0.01041962, "balance_loss_clip": 1.02683449, "balance_loss_mlp": 1.04027581, "epoch": 0.5205471215992785, "flos": 20595452411520.0, "grad_norm": 1.6862520409721442, "language_loss": 0.71541274, "learning_rate": 1.8712516754192382e-06, "loss": 0.73724383, "num_input_tokens_seen": 186026680, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7421875, "step": 8658, "time_per_iteration": 2.6829094886779785 }, { "auxiliary_loss_clip": 0.01113536, "auxiliary_loss_mlp": 0.01034486, "balance_loss_clip": 1.020926, "balance_loss_mlp": 1.03892422, "epoch": 0.5206072448519465, "flos": 22127904305280.0, "grad_norm": 2.1780366903163335, "language_loss": 0.83370113, "learning_rate": 1.8708746390580592e-06, "loss": 0.8551814, "num_input_tokens_seen": 186046920, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 8659, "time_per_iteration": 2.5789082050323486 }, { "auxiliary_loss_clip": 0.01140057, "auxiliary_loss_mlp": 0.01040831, "balance_loss_clip": 1.02488112, "balance_loss_mlp": 1.04146338, "epoch": 0.5206673681046144, "flos": 18330422636160.0, "grad_norm": 4.794575344404155, "language_loss": 0.75374973, "learning_rate": 1.8704976073050318e-06, "loss": 0.77555859, "num_input_tokens_seen": 186062090, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8046875, "step": 8660, "time_per_iteration": 3.9251420497894287 }, { "auxiliary_loss_clip": 0.01114154, "auxiliary_loss_mlp": 0.01039775, "balance_loss_clip": 1.02691245, "balance_loss_mlp": 1.04074275, "epoch": 0.5207274913572825, "flos": 20230240878720.0, "grad_norm": 1.8199643588705707, "language_loss": 0.77109748, "learning_rate": 1.8701205801736121e-06, "loss": 0.79263675, "num_input_tokens_seen": 186081135, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 8661, "time_per_iteration": 2.572361469268799 }, { "auxiliary_loss_clip": 0.01133932, "auxiliary_loss_mlp": 0.01034996, "balance_loss_clip": 1.02136481, "balance_loss_mlp": 1.03961957, "epoch": 0.5207876146099504, "flos": 22127042378880.0, "grad_norm": 1.8991601910640805, "language_loss": 0.70581943, "learning_rate": 1.8697435576772551e-06, "loss": 0.72750878, "num_input_tokens_seen": 186099700, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.765625, "step": 8662, "time_per_iteration": 2.6590261459350586 }, { "auxiliary_loss_clip": 0.0112443, "auxiliary_loss_mlp": 0.01035466, "balance_loss_clip": 1.02061248, "balance_loss_mlp": 1.04133999, "epoch": 0.5208477378626184, "flos": 23878908501120.0, "grad_norm": 1.8504669406462768, "language_loss": 0.69581097, "learning_rate": 1.8693665398294148e-06, "loss": 0.71740991, "num_input_tokens_seen": 186119740, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7421875, "step": 8663, "time_per_iteration": 2.6257503032684326 }, { "auxiliary_loss_clip": 0.01134316, "auxiliary_loss_mlp": 0.01280428, "balance_loss_clip": 1.02002764, "balance_loss_mlp": 1.04104495, "epoch": 0.5209078611152863, "flos": 20961525870720.0, "grad_norm": 1.5050900564765273, "language_loss": 0.76764023, "learning_rate": 1.868989526643547e-06, "loss": 0.79178768, "num_input_tokens_seen": 186140645, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 8664, "time_per_iteration": 2.589153289794922 }, { "auxiliary_loss_clip": 0.01133951, "auxiliary_loss_mlp": 0.01038772, "balance_loss_clip": 1.02498507, "balance_loss_mlp": 1.04087639, "epoch": 0.5209679843679543, "flos": 20667740532480.0, "grad_norm": 2.1985660435034142, "language_loss": 0.76090157, "learning_rate": 1.8686125181331056e-06, "loss": 0.78262889, "num_input_tokens_seen": 186160130, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 8665, "time_per_iteration": 2.5772640705108643 }, { "auxiliary_loss_clip": 0.01125307, "auxiliary_loss_mlp": 0.0103475, "balance_loss_clip": 1.02198231, "balance_loss_mlp": 1.04255962, "epoch": 0.5210281076206222, "flos": 20227295963520.0, "grad_norm": 1.9524753019375845, "language_loss": 0.72102356, "learning_rate": 1.8682355143115464e-06, "loss": 0.7426241, "num_input_tokens_seen": 186179485, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7421875, "step": 8666, "time_per_iteration": 4.073886156082153 }, { "auxiliary_loss_clip": 0.01140454, "auxiliary_loss_mlp": 0.01039052, "balance_loss_clip": 1.02245784, "balance_loss_mlp": 1.0411427, "epoch": 0.5210882308732903, "flos": 16069989801600.0, "grad_norm": 2.0573199950663206, "language_loss": 0.67839009, "learning_rate": 1.867858515192322e-06, "loss": 0.70018512, "num_input_tokens_seen": 186197140, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.81640625, "step": 8667, "time_per_iteration": 2.6908528804779053 }, { "auxiliary_loss_clip": 0.01147707, "auxiliary_loss_mlp": 0.01032774, "balance_loss_clip": 1.0192076, "balance_loss_mlp": 1.03964782, "epoch": 0.5211483541259582, "flos": 24825298089600.0, "grad_norm": 1.4947210783016003, "language_loss": 0.80994129, "learning_rate": 1.8674815207888875e-06, "loss": 0.8317461, "num_input_tokens_seen": 186216800, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 8668, "time_per_iteration": 2.656276226043701 }, { "auxiliary_loss_clip": 0.01133666, "auxiliary_loss_mlp": 0.01032325, "balance_loss_clip": 1.01886058, "balance_loss_mlp": 1.04133141, "epoch": 0.5212084773786262, "flos": 20370651143040.0, "grad_norm": 2.3952731034665655, "language_loss": 0.63567978, "learning_rate": 1.8671045311146966e-06, "loss": 0.65733969, "num_input_tokens_seen": 186235320, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 8669, "time_per_iteration": 2.656399726867676 }, { "auxiliary_loss_clip": 0.01134635, "auxiliary_loss_mlp": 0.01283505, "balance_loss_clip": 1.02314854, "balance_loss_mlp": 1.04315054, "epoch": 0.5212686006312941, "flos": 23145468693120.0, "grad_norm": 1.4943464494392271, "language_loss": 0.66494757, "learning_rate": 1.866727546183203e-06, "loss": 0.68912899, "num_input_tokens_seen": 186254460, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 8670, "time_per_iteration": 2.6228482723236084 }, { "auxiliary_loss_clip": 0.01136788, "auxiliary_loss_mlp": 0.01033883, "balance_loss_clip": 1.02076411, "balance_loss_mlp": 1.0396024, "epoch": 0.5213287238839621, "flos": 27774030314880.0, "grad_norm": 2.097395716700195, "language_loss": 0.7612105, "learning_rate": 1.8663505660078608e-06, "loss": 0.7829172, "num_input_tokens_seen": 186269465, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 8671, "time_per_iteration": 2.6108896732330322 }, { "auxiliary_loss_clip": 0.01147392, "auxiliary_loss_mlp": 0.010392, "balance_loss_clip": 1.0242393, "balance_loss_mlp": 1.04418111, "epoch": 0.5213888471366301, "flos": 19937676602880.0, "grad_norm": 2.256883451109958, "language_loss": 0.78321493, "learning_rate": 1.8659735906021226e-06, "loss": 0.80508077, "num_input_tokens_seen": 186288660, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.76953125, "step": 8672, "time_per_iteration": 2.6577861309051514 }, { "auxiliary_loss_clip": 0.01137713, "auxiliary_loss_mlp": 0.01032887, "balance_loss_clip": 1.02017939, "balance_loss_mlp": 1.03850818, "epoch": 0.521448970389298, "flos": 16982731324800.0, "grad_norm": 2.239837424349127, "language_loss": 0.72137928, "learning_rate": 1.8655966199794427e-06, "loss": 0.74308527, "num_input_tokens_seen": 186305760, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 8673, "time_per_iteration": 2.669767141342163 }, { "auxiliary_loss_clip": 0.01123848, "auxiliary_loss_mlp": 0.01032249, "balance_loss_clip": 1.01861095, "balance_loss_mlp": 1.04054952, "epoch": 0.5215090936419661, "flos": 18989706816000.0, "grad_norm": 1.8754376417820993, "language_loss": 0.75034684, "learning_rate": 1.8652196541532735e-06, "loss": 0.77190781, "num_input_tokens_seen": 186324135, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 8674, "time_per_iteration": 2.6328184604644775 }, { "auxiliary_loss_clip": 0.01134006, "auxiliary_loss_mlp": 0.01034709, "balance_loss_clip": 1.01979542, "balance_loss_mlp": 1.03982592, "epoch": 0.521569216894634, "flos": 16143427157760.0, "grad_norm": 2.1098186367318505, "language_loss": 0.86221784, "learning_rate": 1.8648426931370678e-06, "loss": 0.88390499, "num_input_tokens_seen": 186340205, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.76171875, "step": 8675, "time_per_iteration": 2.595895767211914 }, { "auxiliary_loss_clip": 0.01059273, "auxiliary_loss_mlp": 0.0100283, "balance_loss_clip": 1.00134611, "balance_loss_mlp": 1.01670039, "epoch": 0.521629340147302, "flos": 57579493282560.0, "grad_norm": 0.8689331218522075, "language_loss": 0.6320675, "learning_rate": 1.8644657369442794e-06, "loss": 0.65268856, "num_input_tokens_seen": 186396940, "router_z_loss_clip": 0.01483154, "router_z_loss_mlp": 0.24609375, "step": 8676, "time_per_iteration": 3.2048771381378174 }, { "auxiliary_loss_clip": 0.01121874, "auxiliary_loss_mlp": 0.01030847, "balance_loss_clip": 1.01724529, "balance_loss_mlp": 1.04046285, "epoch": 0.5216894633999699, "flos": 26796901662720.0, "grad_norm": 1.5868529355152365, "language_loss": 0.68750834, "learning_rate": 1.8640887855883594e-06, "loss": 0.70903558, "num_input_tokens_seen": 186418680, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 8677, "time_per_iteration": 2.6488680839538574 }, { "auxiliary_loss_clip": 0.0112277, "auxiliary_loss_mlp": 0.01030582, "balance_loss_clip": 1.01710534, "balance_loss_mlp": 1.04031825, "epoch": 0.5217495866526379, "flos": 26358719650560.0, "grad_norm": 1.7180856375394113, "language_loss": 0.65298229, "learning_rate": 1.8637118390827618e-06, "loss": 0.67451584, "num_input_tokens_seen": 186438265, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 8678, "time_per_iteration": 2.6836719512939453 }, { "auxiliary_loss_clip": 0.0113728, "auxiliary_loss_mlp": 0.01037103, "balance_loss_clip": 1.02186847, "balance_loss_mlp": 1.0417366, "epoch": 0.5218097099053058, "flos": 23584009841280.0, "grad_norm": 2.8131134832552913, "language_loss": 0.68124926, "learning_rate": 1.8633348974409377e-06, "loss": 0.70299315, "num_input_tokens_seen": 186456870, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7734375, "step": 8679, "time_per_iteration": 2.5993802547454834 }, { "auxiliary_loss_clip": 0.01117006, "auxiliary_loss_mlp": 0.01032871, "balance_loss_clip": 1.01949549, "balance_loss_mlp": 1.04254103, "epoch": 0.5218698331579739, "flos": 18077396256000.0, "grad_norm": 2.6312501126192998, "language_loss": 0.66898656, "learning_rate": 1.8629579606763395e-06, "loss": 0.6904853, "num_input_tokens_seen": 186476425, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.74609375, "step": 8680, "time_per_iteration": 2.5887880325317383 }, { "auxiliary_loss_clip": 0.0111765, "auxiliary_loss_mlp": 0.01035627, "balance_loss_clip": 1.02119124, "balance_loss_mlp": 1.0423969, "epoch": 0.5219299564106418, "flos": 19281121856640.0, "grad_norm": 1.8294538436105763, "language_loss": 0.83340162, "learning_rate": 1.86258102880242e-06, "loss": 0.85493439, "num_input_tokens_seen": 186492555, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75, "step": 8681, "time_per_iteration": 2.6797995567321777 }, { "auxiliary_loss_clip": 0.01130811, "auxiliary_loss_mlp": 0.01033078, "balance_loss_clip": 1.01947653, "balance_loss_mlp": 1.03946555, "epoch": 0.5219900796633098, "flos": 26651355753600.0, "grad_norm": 1.6410410992893296, "language_loss": 0.77639782, "learning_rate": 1.862204101832629e-06, "loss": 0.79803669, "num_input_tokens_seen": 186513190, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 8682, "time_per_iteration": 2.6502063274383545 }, { "auxiliary_loss_clip": 0.01144081, "auxiliary_loss_mlp": 0.01038597, "balance_loss_clip": 1.02575231, "balance_loss_mlp": 1.04318905, "epoch": 0.5220502029159777, "flos": 34312717382400.0, "grad_norm": 1.6245823887006674, "language_loss": 0.69273955, "learning_rate": 1.8618271797804197e-06, "loss": 0.71456629, "num_input_tokens_seen": 186534830, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73828125, "step": 8683, "time_per_iteration": 2.745871067047119 }, { "auxiliary_loss_clip": 0.0114367, "auxiliary_loss_mlp": 0.01041568, "balance_loss_clip": 1.02756071, "balance_loss_mlp": 1.04186761, "epoch": 0.5221103261686457, "flos": 22156488552960.0, "grad_norm": 1.5370827167833392, "language_loss": 0.75777221, "learning_rate": 1.861450262659243e-06, "loss": 0.77962458, "num_input_tokens_seen": 186554390, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 8684, "time_per_iteration": 2.6237545013427734 }, { "auxiliary_loss_clip": 0.01121171, "auxiliary_loss_mlp": 0.01277955, "balance_loss_clip": 1.01799846, "balance_loss_mlp": 1.03995299, "epoch": 0.5221704494213137, "flos": 19208402772480.0, "grad_norm": 34.28748534209772, "language_loss": 0.75991327, "learning_rate": 1.8610733504825495e-06, "loss": 0.78390449, "num_input_tokens_seen": 186572360, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 8685, "time_per_iteration": 2.639507532119751 }, { "auxiliary_loss_clip": 0.01136095, "auxiliary_loss_mlp": 0.01037548, "balance_loss_clip": 1.02427995, "balance_loss_mlp": 1.04345679, "epoch": 0.5222305726739817, "flos": 19354056422400.0, "grad_norm": 1.7577072578559025, "language_loss": 0.80846786, "learning_rate": 1.8606964432637912e-06, "loss": 0.83020425, "num_input_tokens_seen": 186590655, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 8686, "time_per_iteration": 2.6253840923309326 }, { "auxiliary_loss_clip": 0.01127445, "auxiliary_loss_mlp": 0.01033797, "balance_loss_clip": 1.02113128, "balance_loss_mlp": 1.03932202, "epoch": 0.5222906959266497, "flos": 27814789272960.0, "grad_norm": 1.9648649021787514, "language_loss": 0.70192587, "learning_rate": 1.8603195410164183e-06, "loss": 0.72353828, "num_input_tokens_seen": 186610345, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 8687, "time_per_iteration": 2.6988470554351807 }, { "auxiliary_loss_clip": 0.01111049, "auxiliary_loss_mlp": 0.01031485, "balance_loss_clip": 1.01881897, "balance_loss_mlp": 1.03931332, "epoch": 0.5223508191793176, "flos": 12712988615040.0, "grad_norm": 1.9168124638118076, "language_loss": 0.82927692, "learning_rate": 1.859942643753882e-06, "loss": 0.85070217, "num_input_tokens_seen": 186624360, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 8688, "time_per_iteration": 2.548830270767212 }, { "auxiliary_loss_clip": 0.01121862, "auxiliary_loss_mlp": 0.01281798, "balance_loss_clip": 1.02135825, "balance_loss_mlp": 1.04042196, "epoch": 0.5224109424319856, "flos": 15632238752640.0, "grad_norm": 4.01987540114344, "language_loss": 0.73798621, "learning_rate": 1.859565751489632e-06, "loss": 0.76202285, "num_input_tokens_seen": 186638680, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 8689, "time_per_iteration": 2.668980121612549 }, { "auxiliary_loss_clip": 0.01129491, "auxiliary_loss_mlp": 0.01032982, "balance_loss_clip": 1.0194037, "balance_loss_mlp": 1.0398736, "epoch": 0.5224710656846535, "flos": 15742233175680.0, "grad_norm": 1.890128510859122, "language_loss": 0.82568765, "learning_rate": 1.8591888642371194e-06, "loss": 0.84731233, "num_input_tokens_seen": 186655840, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 8690, "time_per_iteration": 2.57979679107666 }, { "auxiliary_loss_clip": 0.01141949, "auxiliary_loss_mlp": 0.01039466, "balance_loss_clip": 1.0250299, "balance_loss_mlp": 1.04097831, "epoch": 0.5225311889373215, "flos": 26030998938240.0, "grad_norm": 2.247475841123562, "language_loss": 0.78912574, "learning_rate": 1.858811982009794e-06, "loss": 0.81093991, "num_input_tokens_seen": 186674150, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7421875, "step": 8691, "time_per_iteration": 2.695011615753174 }, { "auxiliary_loss_clip": 0.01127336, "auxiliary_loss_mlp": 0.01039629, "balance_loss_clip": 1.02479374, "balance_loss_mlp": 1.0421443, "epoch": 0.5225913121899894, "flos": 18369278173440.0, "grad_norm": 2.5960620100144434, "language_loss": 0.76949823, "learning_rate": 1.8584351048211056e-06, "loss": 0.79116786, "num_input_tokens_seen": 186690675, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76171875, "step": 8692, "time_per_iteration": 2.569059371948242 }, { "auxiliary_loss_clip": 0.01119784, "auxiliary_loss_mlp": 0.01033342, "balance_loss_clip": 1.0206759, "balance_loss_mlp": 1.04062128, "epoch": 0.5226514354426575, "flos": 29273516501760.0, "grad_norm": 2.1123654515008488, "language_loss": 0.72710896, "learning_rate": 1.8580582326845044e-06, "loss": 0.74864018, "num_input_tokens_seen": 186710380, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 8693, "time_per_iteration": 2.7014946937561035 }, { "auxiliary_loss_clip": 0.01119602, "auxiliary_loss_mlp": 0.01042039, "balance_loss_clip": 1.02684593, "balance_loss_mlp": 1.0414691, "epoch": 0.5227115586953254, "flos": 22853299466880.0, "grad_norm": 2.3402119535912824, "language_loss": 0.81986505, "learning_rate": 1.8576813656134393e-06, "loss": 0.84148151, "num_input_tokens_seen": 186729135, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78125, "step": 8694, "time_per_iteration": 2.5623621940612793 }, { "auxiliary_loss_clip": 0.0106819, "auxiliary_loss_mlp": 0.0099991, "balance_loss_clip": 0.99854499, "balance_loss_mlp": 1.01716113, "epoch": 0.5227716819479934, "flos": 57474419022720.0, "grad_norm": 0.7894470237197927, "language_loss": 0.55686271, "learning_rate": 1.8573045036213608e-06, "loss": 0.57754368, "num_input_tokens_seen": 186791115, "router_z_loss_clip": 0.01367188, "router_z_loss_mlp": 0.24414062, "step": 8695, "time_per_iteration": 3.2576231956481934 }, { "auxiliary_loss_clip": 0.01161584, "auxiliary_loss_mlp": 0.01034494, "balance_loss_clip": 1.02049935, "balance_loss_mlp": 1.04354286, "epoch": 0.5228318052006613, "flos": 13808264077440.0, "grad_norm": 2.030604288328548, "language_loss": 0.7267381, "learning_rate": 1.8569276467217175e-06, "loss": 0.74869883, "num_input_tokens_seen": 186808660, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.734375, "step": 8696, "time_per_iteration": 2.6834754943847656 }, { "auxiliary_loss_clip": 0.01133196, "auxiliary_loss_mlp": 0.01032893, "balance_loss_clip": 1.0192076, "balance_loss_mlp": 1.04128695, "epoch": 0.5228919284533293, "flos": 15596184476160.0, "grad_norm": 1.5538019785871833, "language_loss": 0.71270037, "learning_rate": 1.8565507949279584e-06, "loss": 0.73436129, "num_input_tokens_seen": 186825900, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 8697, "time_per_iteration": 3.9612982273101807 }, { "auxiliary_loss_clip": 0.01129714, "auxiliary_loss_mlp": 0.01035353, "balance_loss_clip": 1.0228833, "balance_loss_mlp": 1.0413909, "epoch": 0.5229520517059973, "flos": 22491499726080.0, "grad_norm": 1.693876330305257, "language_loss": 0.80198705, "learning_rate": 1.8561739482535323e-06, "loss": 0.82363772, "num_input_tokens_seen": 186843735, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 8698, "time_per_iteration": 2.6526012420654297 }, { "auxiliary_loss_clip": 0.01124639, "auxiliary_loss_mlp": 0.01035027, "balance_loss_clip": 1.0221585, "balance_loss_mlp": 1.04261208, "epoch": 0.5230121749586653, "flos": 22090880361600.0, "grad_norm": 1.6750337908226736, "language_loss": 0.74205017, "learning_rate": 1.8557971067118877e-06, "loss": 0.76364684, "num_input_tokens_seen": 186862440, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 8699, "time_per_iteration": 2.5861284732818604 }, { "auxiliary_loss_clip": 0.01129566, "auxiliary_loss_mlp": 0.01279745, "balance_loss_clip": 1.01874971, "balance_loss_mlp": 1.04300141, "epoch": 0.5230722982113333, "flos": 22127150119680.0, "grad_norm": 1.6373447657585307, "language_loss": 0.73557699, "learning_rate": 1.8554202703164739e-06, "loss": 0.75967014, "num_input_tokens_seen": 186880940, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.77734375, "step": 8700, "time_per_iteration": 2.7074708938598633 }, { "auxiliary_loss_clip": 0.01138267, "auxiliary_loss_mlp": 0.01036528, "balance_loss_clip": 1.02202594, "balance_loss_mlp": 1.04368949, "epoch": 0.5231324214640012, "flos": 25009268572800.0, "grad_norm": 1.716354584362026, "language_loss": 0.6650548, "learning_rate": 1.8550434390807387e-06, "loss": 0.68680274, "num_input_tokens_seen": 186900785, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 8701, "time_per_iteration": 3.9929494857788086 }, { "auxiliary_loss_clip": 0.01130774, "auxiliary_loss_mlp": 0.01282222, "balance_loss_clip": 1.02184772, "balance_loss_mlp": 1.0403173, "epoch": 0.5231925447166692, "flos": 25740517651200.0, "grad_norm": 1.7300077402268053, "language_loss": 0.66643381, "learning_rate": 1.8546666130181298e-06, "loss": 0.69056374, "num_input_tokens_seen": 186920895, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 8702, "time_per_iteration": 2.570617914199829 }, { "auxiliary_loss_clip": 0.01133097, "auxiliary_loss_mlp": 0.01038131, "balance_loss_clip": 1.023844, "balance_loss_mlp": 1.04287803, "epoch": 0.5232526679693371, "flos": 21433930565760.0, "grad_norm": 1.7791765698036213, "language_loss": 0.76688159, "learning_rate": 1.8542897921420961e-06, "loss": 0.78859389, "num_input_tokens_seen": 186940605, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7265625, "step": 8703, "time_per_iteration": 2.599672555923462 }, { "auxiliary_loss_clip": 0.01129894, "auxiliary_loss_mlp": 0.01043108, "balance_loss_clip": 1.0268712, "balance_loss_mlp": 1.04174101, "epoch": 0.5233127912220051, "flos": 35298393471360.0, "grad_norm": 2.2060681754152127, "language_loss": 0.76554573, "learning_rate": 1.8539129764660845e-06, "loss": 0.78727579, "num_input_tokens_seen": 186960820, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.796875, "step": 8704, "time_per_iteration": 2.6480519771575928 }, { "auxiliary_loss_clip": 0.01131913, "auxiliary_loss_mlp": 0.01038773, "balance_loss_clip": 1.02570724, "balance_loss_mlp": 1.04268587, "epoch": 0.523372914474673, "flos": 17051320344960.0, "grad_norm": 1.7005097489806709, "language_loss": 0.78064859, "learning_rate": 1.8535361660035436e-06, "loss": 0.80235547, "num_input_tokens_seen": 186976240, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 8705, "time_per_iteration": 2.6759531497955322 }, { "auxiliary_loss_clip": 0.01136185, "auxiliary_loss_mlp": 0.01033123, "balance_loss_clip": 1.01914573, "balance_loss_mlp": 1.04268324, "epoch": 0.5234330377273411, "flos": 18406302117120.0, "grad_norm": 2.009869513374488, "language_loss": 0.70005357, "learning_rate": 1.8531593607679195e-06, "loss": 0.72174668, "num_input_tokens_seen": 186992855, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 8706, "time_per_iteration": 2.545964479446411 }, { "auxiliary_loss_clip": 0.01046737, "auxiliary_loss_mlp": 0.00999917, "balance_loss_clip": 0.99855852, "balance_loss_mlp": 1.01400197, "epoch": 0.523493160980009, "flos": 65850296970240.0, "grad_norm": 0.6743782257964989, "language_loss": 0.52471125, "learning_rate": 1.8527825607726606e-06, "loss": 0.54517776, "num_input_tokens_seen": 187051205, "router_z_loss_clip": 0.01361084, "router_z_loss_mlp": 0.24023438, "step": 8707, "time_per_iteration": 4.681773662567139 }, { "auxiliary_loss_clip": 0.01133993, "auxiliary_loss_mlp": 0.01040721, "balance_loss_clip": 1.02753043, "balance_loss_mlp": 1.04221261, "epoch": 0.523553284232677, "flos": 21872076664320.0, "grad_norm": 1.7141985956464107, "language_loss": 0.74891877, "learning_rate": 1.8524057660312134e-06, "loss": 0.77066588, "num_input_tokens_seen": 187070540, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 8708, "time_per_iteration": 4.02062726020813 }, { "auxiliary_loss_clip": 0.01132214, "auxiliary_loss_mlp": 0.01030917, "balance_loss_clip": 1.01791155, "balance_loss_mlp": 1.04238665, "epoch": 0.5236134074853449, "flos": 20848191482880.0, "grad_norm": 3.263619398212117, "language_loss": 0.77612787, "learning_rate": 1.8520289765570242e-06, "loss": 0.79775918, "num_input_tokens_seen": 187089975, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 8709, "time_per_iteration": 2.550785779953003 }, { "auxiliary_loss_clip": 0.01154815, "auxiliary_loss_mlp": 0.01034098, "balance_loss_clip": 1.01875567, "balance_loss_mlp": 1.03999746, "epoch": 0.5236735307380129, "flos": 25520421064320.0, "grad_norm": 2.0833945345891336, "language_loss": 0.82992971, "learning_rate": 1.8516521923635408e-06, "loss": 0.8518188, "num_input_tokens_seen": 187108775, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.79296875, "step": 8710, "time_per_iteration": 2.638979434967041 }, { "auxiliary_loss_clip": 0.01129278, "auxiliary_loss_mlp": 0.01027911, "balance_loss_clip": 1.01490486, "balance_loss_mlp": 1.04020524, "epoch": 0.523733653990681, "flos": 23583112001280.0, "grad_norm": 1.8388914826428984, "language_loss": 0.69641262, "learning_rate": 1.8512754134642092e-06, "loss": 0.71798456, "num_input_tokens_seen": 187128830, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 8711, "time_per_iteration": 2.6180782318115234 }, { "auxiliary_loss_clip": 0.01121026, "auxiliary_loss_mlp": 0.01034145, "balance_loss_clip": 1.02054942, "balance_loss_mlp": 1.03866172, "epoch": 0.5237937772433489, "flos": 21106245767040.0, "grad_norm": 1.8503899899538827, "language_loss": 0.82784057, "learning_rate": 1.8508986398724752e-06, "loss": 0.84939229, "num_input_tokens_seen": 187149570, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 8712, "time_per_iteration": 2.585505485534668 }, { "auxiliary_loss_clip": 0.01117028, "auxiliary_loss_mlp": 0.0103879, "balance_loss_clip": 1.02409208, "balance_loss_mlp": 1.0412097, "epoch": 0.5238539004960169, "flos": 19172887200000.0, "grad_norm": 1.7983244222665051, "language_loss": 0.69633341, "learning_rate": 1.8505218716017857e-06, "loss": 0.71789157, "num_input_tokens_seen": 187170575, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7578125, "step": 8713, "time_per_iteration": 2.6536288261413574 }, { "auxiliary_loss_clip": 0.01144401, "auxiliary_loss_mlp": 0.01037318, "balance_loss_clip": 1.02261913, "balance_loss_mlp": 1.03882229, "epoch": 0.5239140237486848, "flos": 17888218300800.0, "grad_norm": 2.4828750667981287, "language_loss": 0.7654351, "learning_rate": 1.8501451086655852e-06, "loss": 0.78725231, "num_input_tokens_seen": 187187190, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 8714, "time_per_iteration": 2.628368854522705 }, { "auxiliary_loss_clip": 0.01143889, "auxiliary_loss_mlp": 0.01033817, "balance_loss_clip": 1.01949954, "balance_loss_mlp": 1.04237151, "epoch": 0.5239741470013528, "flos": 17930413802880.0, "grad_norm": 2.3439476426861052, "language_loss": 0.75612158, "learning_rate": 1.8497683510773207e-06, "loss": 0.77789861, "num_input_tokens_seen": 187204350, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75, "step": 8715, "time_per_iteration": 2.603358030319214 }, { "auxiliary_loss_clip": 0.01128645, "auxiliary_loss_mlp": 0.01033651, "balance_loss_clip": 1.01998353, "balance_loss_mlp": 1.03966486, "epoch": 0.5240342702540207, "flos": 30993386584320.0, "grad_norm": 1.6214687153626481, "language_loss": 0.70929682, "learning_rate": 1.8493915988504372e-06, "loss": 0.73091972, "num_input_tokens_seen": 187225605, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 8716, "time_per_iteration": 2.684081554412842 }, { "auxiliary_loss_clip": 0.01113478, "auxiliary_loss_mlp": 0.01035833, "balance_loss_clip": 1.02357805, "balance_loss_mlp": 1.04036856, "epoch": 0.5240943935066887, "flos": 25005066681600.0, "grad_norm": 1.9888109826451266, "language_loss": 0.87051898, "learning_rate": 1.8490148519983804e-06, "loss": 0.89201212, "num_input_tokens_seen": 187241335, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.73046875, "step": 8717, "time_per_iteration": 2.668468475341797 }, { "auxiliary_loss_clip": 0.0112317, "auxiliary_loss_mlp": 0.01032525, "balance_loss_clip": 1.019454, "balance_loss_mlp": 1.04133511, "epoch": 0.5241545167593566, "flos": 23659099223040.0, "grad_norm": 2.211646463133658, "language_loss": 0.6099577, "learning_rate": 1.8486381105345953e-06, "loss": 0.63151461, "num_input_tokens_seen": 187259925, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 8718, "time_per_iteration": 2.6017701625823975 }, { "auxiliary_loss_clip": 0.01137698, "auxiliary_loss_mlp": 0.01033262, "balance_loss_clip": 1.01945114, "balance_loss_mlp": 1.04323697, "epoch": 0.5242146400120247, "flos": 23400398494080.0, "grad_norm": 1.771970026694157, "language_loss": 0.71489322, "learning_rate": 1.848261374472526e-06, "loss": 0.73660284, "num_input_tokens_seen": 187279035, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 8719, "time_per_iteration": 2.651350259780884 }, { "auxiliary_loss_clip": 0.01118273, "auxiliary_loss_mlp": 0.01029173, "balance_loss_clip": 1.01672769, "balance_loss_mlp": 1.03895998, "epoch": 0.5242747632646926, "flos": 17749065012480.0, "grad_norm": 2.1019772813847877, "language_loss": 0.72669125, "learning_rate": 1.8478846438256183e-06, "loss": 0.74816573, "num_input_tokens_seen": 187297555, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 8720, "time_per_iteration": 2.530259132385254 }, { "auxiliary_loss_clip": 0.0112311, "auxiliary_loss_mlp": 0.01033453, "balance_loss_clip": 1.019696, "balance_loss_mlp": 1.04038739, "epoch": 0.5243348865173606, "flos": 32597731549440.0, "grad_norm": 1.5962594939263763, "language_loss": 0.7028054, "learning_rate": 1.847507918607316e-06, "loss": 0.72437102, "num_input_tokens_seen": 187320265, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73828125, "step": 8721, "time_per_iteration": 2.705883026123047 }, { "auxiliary_loss_clip": 0.01147451, "auxiliary_loss_mlp": 0.01037409, "balance_loss_clip": 1.02392006, "balance_loss_mlp": 1.03936958, "epoch": 0.5243950097700285, "flos": 25484115392640.0, "grad_norm": 1.6289000054015348, "language_loss": 0.86848533, "learning_rate": 1.8471311988310646e-06, "loss": 0.89033389, "num_input_tokens_seen": 187338045, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 8722, "time_per_iteration": 2.6828694343566895 }, { "auxiliary_loss_clip": 0.01113396, "auxiliary_loss_mlp": 0.01030503, "balance_loss_clip": 1.01644254, "balance_loss_mlp": 1.04196572, "epoch": 0.5244551330226965, "flos": 15268391936640.0, "grad_norm": 1.7533438178051688, "language_loss": 0.79808414, "learning_rate": 1.8467544845103074e-06, "loss": 0.81952316, "num_input_tokens_seen": 187356040, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71484375, "step": 8723, "time_per_iteration": 2.5767159461975098 }, { "auxiliary_loss_clip": 0.01126791, "auxiliary_loss_mlp": 0.01038923, "balance_loss_clip": 1.02486277, "balance_loss_mlp": 1.0427469, "epoch": 0.5245152562753645, "flos": 22237108629120.0, "grad_norm": 1.9644198168620037, "language_loss": 0.74912417, "learning_rate": 1.8463777756584878e-06, "loss": 0.77078134, "num_input_tokens_seen": 187374185, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 8724, "time_per_iteration": 2.5790536403656006 }, { "auxiliary_loss_clip": 0.01118707, "auxiliary_loss_mlp": 0.01035541, "balance_loss_clip": 1.02029395, "balance_loss_mlp": 1.04113996, "epoch": 0.5245753795280325, "flos": 29426460612480.0, "grad_norm": 1.550326057244914, "language_loss": 0.70010102, "learning_rate": 1.8460010722890507e-06, "loss": 0.72164357, "num_input_tokens_seen": 187396640, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.77734375, "step": 8725, "time_per_iteration": 2.6985960006713867 }, { "auxiliary_loss_clip": 0.01062578, "auxiliary_loss_mlp": 0.01003266, "balance_loss_clip": 1.00171626, "balance_loss_mlp": 1.01272154, "epoch": 0.5246355027807005, "flos": 58834392785280.0, "grad_norm": 0.7559559574811995, "language_loss": 0.55630463, "learning_rate": 1.8456243744154392e-06, "loss": 0.57696307, "num_input_tokens_seen": 187455945, "router_z_loss_clip": 0.01544189, "router_z_loss_mlp": 0.23828125, "step": 8726, "time_per_iteration": 3.1899139881134033 }, { "auxiliary_loss_clip": 0.01054422, "auxiliary_loss_mlp": 0.01000942, "balance_loss_clip": 0.99950552, "balance_loss_mlp": 1.01237607, "epoch": 0.5246956260333684, "flos": 64526592965760.0, "grad_norm": 0.7891977292522709, "language_loss": 0.58398265, "learning_rate": 1.8452476820510967e-06, "loss": 0.60453629, "num_input_tokens_seen": 187519975, "router_z_loss_clip": 0.01434326, "router_z_loss_mlp": 0.24023438, "step": 8727, "time_per_iteration": 3.151855230331421 }, { "auxiliary_loss_clip": 0.01125007, "auxiliary_loss_mlp": 0.01033243, "balance_loss_clip": 1.01872361, "balance_loss_mlp": 1.03999281, "epoch": 0.5247557492860364, "flos": 24533631653760.0, "grad_norm": 1.4860542293222898, "language_loss": 0.70418394, "learning_rate": 1.844870995209466e-06, "loss": 0.72576642, "num_input_tokens_seen": 187541775, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76171875, "step": 8728, "time_per_iteration": 2.645876407623291 }, { "auxiliary_loss_clip": 0.01121638, "auxiliary_loss_mlp": 0.01028303, "balance_loss_clip": 1.01565456, "balance_loss_mlp": 1.03925037, "epoch": 0.5248158725387043, "flos": 18806131382400.0, "grad_norm": 1.382673249678368, "language_loss": 0.69557214, "learning_rate": 1.8444943139039907e-06, "loss": 0.71707153, "num_input_tokens_seen": 187560425, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 8729, "time_per_iteration": 2.579228401184082 }, { "auxiliary_loss_clip": 0.01143427, "auxiliary_loss_mlp": 0.01032125, "balance_loss_clip": 1.01898813, "balance_loss_mlp": 1.04070628, "epoch": 0.5248759957913723, "flos": 20955851521920.0, "grad_norm": 1.838332684157255, "language_loss": 0.83571458, "learning_rate": 1.8441176381481135e-06, "loss": 0.85747015, "num_input_tokens_seen": 187579930, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7578125, "step": 8730, "time_per_iteration": 2.6328728199005127 }, { "auxiliary_loss_clip": 0.01141331, "auxiliary_loss_mlp": 0.01034414, "balance_loss_clip": 1.02091932, "balance_loss_mlp": 1.04119718, "epoch": 0.5249361190440403, "flos": 18660980522880.0, "grad_norm": 2.276771438053782, "language_loss": 0.7901417, "learning_rate": 1.8437409679552762e-06, "loss": 0.81189907, "num_input_tokens_seen": 187595365, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 8731, "time_per_iteration": 2.5589640140533447 }, { "auxiliary_loss_clip": 0.01115962, "auxiliary_loss_mlp": 0.01030898, "balance_loss_clip": 1.01748645, "balance_loss_mlp": 1.03750145, "epoch": 0.5249962422967083, "flos": 24863327614080.0, "grad_norm": 1.659590296390246, "language_loss": 0.82828808, "learning_rate": 1.8433643033389227e-06, "loss": 0.84975666, "num_input_tokens_seen": 187614715, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 8732, "time_per_iteration": 2.6628739833831787 }, { "auxiliary_loss_clip": 0.0112021, "auxiliary_loss_mlp": 0.01031687, "balance_loss_clip": 1.01918209, "balance_loss_mlp": 1.03913713, "epoch": 0.5250563655493762, "flos": 15262681674240.0, "grad_norm": 1.4886079338542733, "language_loss": 0.77352929, "learning_rate": 1.8429876443124934e-06, "loss": 0.7950483, "num_input_tokens_seen": 187630745, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 8733, "time_per_iteration": 2.5386359691619873 }, { "auxiliary_loss_clip": 0.01116367, "auxiliary_loss_mlp": 0.01036897, "balance_loss_clip": 1.02323544, "balance_loss_mlp": 1.04056811, "epoch": 0.5251164888020442, "flos": 18625177641600.0, "grad_norm": 2.1187811898581512, "language_loss": 0.81763202, "learning_rate": 1.8426109908894316e-06, "loss": 0.83916461, "num_input_tokens_seen": 187648200, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 8734, "time_per_iteration": 2.5592124462127686 }, { "auxiliary_loss_clip": 0.01127731, "auxiliary_loss_mlp": 0.01029848, "balance_loss_clip": 1.01696789, "balance_loss_mlp": 1.03932643, "epoch": 0.5251766120547121, "flos": 29710764760320.0, "grad_norm": 1.3334701582834283, "language_loss": 0.76533973, "learning_rate": 1.8422343430831791e-06, "loss": 0.78691548, "num_input_tokens_seen": 187669205, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 8735, "time_per_iteration": 2.6791625022888184 }, { "auxiliary_loss_clip": 0.01124459, "auxiliary_loss_mlp": 0.01030193, "balance_loss_clip": 1.01822424, "balance_loss_mlp": 1.04185605, "epoch": 0.5252367353073801, "flos": 23440295525760.0, "grad_norm": 2.246065274760402, "language_loss": 0.79952163, "learning_rate": 1.8418577009071763e-06, "loss": 0.82106817, "num_input_tokens_seen": 187690890, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.73828125, "step": 8736, "time_per_iteration": 2.612586259841919 }, { "auxiliary_loss_clip": 0.01115495, "auxiliary_loss_mlp": 0.01031787, "balance_loss_clip": 1.01799464, "balance_loss_mlp": 1.04185653, "epoch": 0.5252968585600482, "flos": 30810708990720.0, "grad_norm": 2.000051038401203, "language_loss": 0.69423854, "learning_rate": 1.8414810643748656e-06, "loss": 0.71571136, "num_input_tokens_seen": 187713045, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73828125, "step": 8737, "time_per_iteration": 2.588575601577759 }, { "auxiliary_loss_clip": 0.01129381, "auxiliary_loss_mlp": 0.01032611, "balance_loss_clip": 1.01924181, "balance_loss_mlp": 1.03847885, "epoch": 0.5253569818127161, "flos": 20628274464000.0, "grad_norm": 1.4454647058798582, "language_loss": 0.77151871, "learning_rate": 1.841104433499688e-06, "loss": 0.79313862, "num_input_tokens_seen": 187733640, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 8738, "time_per_iteration": 3.985401153564453 }, { "auxiliary_loss_clip": 0.01053502, "auxiliary_loss_mlp": 0.01250242, "balance_loss_clip": 1.00235736, "balance_loss_mlp": 1.01203644, "epoch": 0.5254171050653841, "flos": 63428695810560.0, "grad_norm": 0.7419404807237311, "language_loss": 0.54469907, "learning_rate": 1.8407278082950846e-06, "loss": 0.56773651, "num_input_tokens_seen": 187792930, "router_z_loss_clip": 0.01708984, "router_z_loss_mlp": 0.23828125, "step": 8739, "time_per_iteration": 3.1580164432525635 }, { "auxiliary_loss_clip": 0.01131386, "auxiliary_loss_mlp": 0.01033652, "balance_loss_clip": 1.0210042, "balance_loss_mlp": 1.0407896, "epoch": 0.525477228318052, "flos": 34670782108800.0, "grad_norm": 2.134757320923389, "language_loss": 0.84840775, "learning_rate": 1.840351188774496e-06, "loss": 0.87005812, "num_input_tokens_seen": 187812495, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 8740, "time_per_iteration": 2.6725316047668457 }, { "auxiliary_loss_clip": 0.01121906, "auxiliary_loss_mlp": 0.01285482, "balance_loss_clip": 1.02518523, "balance_loss_mlp": 1.04057312, "epoch": 0.52553735157072, "flos": 17930844766080.0, "grad_norm": 1.9781350998982568, "language_loss": 0.69087553, "learning_rate": 1.8399745749513627e-06, "loss": 0.71494943, "num_input_tokens_seen": 187829685, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 8741, "time_per_iteration": 2.5644917488098145 }, { "auxiliary_loss_clip": 0.01135101, "auxiliary_loss_mlp": 0.01032429, "balance_loss_clip": 1.01906013, "balance_loss_mlp": 1.04211807, "epoch": 0.5255974748233879, "flos": 9940864584960.0, "grad_norm": 1.8252798306620537, "language_loss": 0.66267073, "learning_rate": 1.8395979668391256e-06, "loss": 0.68434602, "num_input_tokens_seen": 187846495, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 8742, "time_per_iteration": 4.009873867034912 }, { "auxiliary_loss_clip": 0.0114369, "auxiliary_loss_mlp": 0.01038229, "balance_loss_clip": 1.02359653, "balance_loss_mlp": 1.04025018, "epoch": 0.5256575980760559, "flos": 16868427269760.0, "grad_norm": 1.9398309359579624, "language_loss": 0.63032812, "learning_rate": 1.839221364451224e-06, "loss": 0.65214729, "num_input_tokens_seen": 187862010, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 8743, "time_per_iteration": 2.554211139678955 }, { "auxiliary_loss_clip": 0.01045329, "auxiliary_loss_mlp": 0.00999625, "balance_loss_clip": 0.99802732, "balance_loss_mlp": 1.01221323, "epoch": 0.5257177213287239, "flos": 62386210362240.0, "grad_norm": 0.7744036307627198, "language_loss": 0.54178298, "learning_rate": 1.8388447678010985e-06, "loss": 0.56223249, "num_input_tokens_seen": 187922730, "router_z_loss_clip": 0.01599121, "router_z_loss_mlp": 0.24023438, "step": 8744, "time_per_iteration": 3.2587311267852783 }, { "auxiliary_loss_clip": 0.01138712, "auxiliary_loss_mlp": 0.01037077, "balance_loss_clip": 1.02204514, "balance_loss_mlp": 1.04302001, "epoch": 0.5257778445813919, "flos": 20776908942720.0, "grad_norm": 2.510836153614025, "language_loss": 0.75464886, "learning_rate": 1.8384681769021888e-06, "loss": 0.77640676, "num_input_tokens_seen": 187940160, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.77734375, "step": 8745, "time_per_iteration": 2.572307586669922 }, { "auxiliary_loss_clip": 0.01123273, "auxiliary_loss_mlp": 0.01037261, "balance_loss_clip": 1.0245471, "balance_loss_mlp": 1.04081404, "epoch": 0.5258379678340598, "flos": 17018606033280.0, "grad_norm": 1.883163058119104, "language_loss": 0.80919427, "learning_rate": 1.8380915917679337e-06, "loss": 0.83079964, "num_input_tokens_seen": 187958625, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 8746, "time_per_iteration": 2.6025524139404297 }, { "auxiliary_loss_clip": 0.0112197, "auxiliary_loss_mlp": 0.01034555, "balance_loss_clip": 1.0212872, "balance_loss_mlp": 1.04078579, "epoch": 0.5258980910867278, "flos": 21068754946560.0, "grad_norm": 1.7978633228712384, "language_loss": 0.75285709, "learning_rate": 1.8377150124117739e-06, "loss": 0.77442235, "num_input_tokens_seen": 187977575, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 8747, "time_per_iteration": 2.555558681488037 }, { "auxiliary_loss_clip": 0.01153818, "auxiliary_loss_mlp": 0.01031545, "balance_loss_clip": 1.01809192, "balance_loss_mlp": 1.03936529, "epoch": 0.5259582143393957, "flos": 18004461690240.0, "grad_norm": 8.75891361805708, "language_loss": 0.82385838, "learning_rate": 1.8373384388471474e-06, "loss": 0.84571195, "num_input_tokens_seen": 187996650, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 8748, "time_per_iteration": 2.627295970916748 }, { "auxiliary_loss_clip": 0.01131784, "auxiliary_loss_mlp": 0.01037152, "balance_loss_clip": 1.02281165, "balance_loss_mlp": 1.03908861, "epoch": 0.5260183375920637, "flos": 22783848520320.0, "grad_norm": 2.0167978452309177, "language_loss": 0.80634344, "learning_rate": 1.836961871087494e-06, "loss": 0.82803279, "num_input_tokens_seen": 188013510, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75, "step": 8749, "time_per_iteration": 5.606698751449585 }, { "auxiliary_loss_clip": 0.01122245, "auxiliary_loss_mlp": 0.01035829, "balance_loss_clip": 1.02222729, "balance_loss_mlp": 1.04128027, "epoch": 0.5260784608447318, "flos": 27052406081280.0, "grad_norm": 1.7865680339621814, "language_loss": 0.72476912, "learning_rate": 1.8365853091462516e-06, "loss": 0.74634993, "num_input_tokens_seen": 188032085, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 8750, "time_per_iteration": 2.7344160079956055 }, { "auxiliary_loss_clip": 0.01130311, "auxiliary_loss_mlp": 0.01035939, "balance_loss_clip": 1.02248645, "balance_loss_mlp": 1.0389725, "epoch": 0.5261385840973997, "flos": 20662820369280.0, "grad_norm": 1.447460213613257, "language_loss": 0.7623648, "learning_rate": 1.8362087530368597e-06, "loss": 0.78402728, "num_input_tokens_seen": 188050590, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 8751, "time_per_iteration": 2.6521449089050293 }, { "auxiliary_loss_clip": 0.01109722, "auxiliary_loss_mlp": 0.01036774, "balance_loss_clip": 1.02405453, "balance_loss_mlp": 1.03960991, "epoch": 0.5261987073500677, "flos": 23622649896960.0, "grad_norm": 1.4003428521072747, "language_loss": 0.76061064, "learning_rate": 1.835832202772756e-06, "loss": 0.78207564, "num_input_tokens_seen": 188071620, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 8752, "time_per_iteration": 2.6234028339385986 }, { "auxiliary_loss_clip": 0.0112468, "auxiliary_loss_mlp": 0.01036056, "balance_loss_clip": 1.02179825, "balance_loss_mlp": 1.04060459, "epoch": 0.5262588306027356, "flos": 27636241743360.0, "grad_norm": 1.5969175196506005, "language_loss": 0.67774218, "learning_rate": 1.8354556583673782e-06, "loss": 0.69934952, "num_input_tokens_seen": 188091740, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75, "step": 8753, "time_per_iteration": 2.643406629562378 }, { "auxiliary_loss_clip": 0.01122459, "auxiliary_loss_mlp": 0.01039675, "balance_loss_clip": 1.02634192, "balance_loss_mlp": 1.04172754, "epoch": 0.5263189538554036, "flos": 21759711943680.0, "grad_norm": 1.5182802328662235, "language_loss": 0.8411746, "learning_rate": 1.835079119834165e-06, "loss": 0.86279595, "num_input_tokens_seen": 188111165, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 8754, "time_per_iteration": 2.586092710494995 }, { "auxiliary_loss_clip": 0.01124415, "auxiliary_loss_mlp": 0.01034549, "balance_loss_clip": 1.02096462, "balance_loss_mlp": 1.04055941, "epoch": 0.5263790771080715, "flos": 14276359140480.0, "grad_norm": 2.3646956893763997, "language_loss": 0.87185723, "learning_rate": 1.8347025871865537e-06, "loss": 0.89344692, "num_input_tokens_seen": 188127825, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75, "step": 8755, "time_per_iteration": 2.539015769958496 }, { "auxiliary_loss_clip": 0.01062973, "auxiliary_loss_mlp": 0.01016542, "balance_loss_clip": 1.01508141, "balance_loss_mlp": 1.01227975, "epoch": 0.5264392003607395, "flos": 65806413528960.0, "grad_norm": 0.7233430653215202, "language_loss": 0.58795476, "learning_rate": 1.834326060437982e-06, "loss": 0.60874993, "num_input_tokens_seen": 188194050, "router_z_loss_clip": 0.0145874, "router_z_loss_mlp": 0.2421875, "step": 8756, "time_per_iteration": 3.247037172317505 }, { "auxiliary_loss_clip": 0.01142855, "auxiliary_loss_mlp": 0.0103983, "balance_loss_clip": 1.02568614, "balance_loss_mlp": 1.04066873, "epoch": 0.5264993236134075, "flos": 21032413361280.0, "grad_norm": 1.6360000240093442, "language_loss": 0.70543134, "learning_rate": 1.8339495396018876e-06, "loss": 0.72725815, "num_input_tokens_seen": 188212565, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 8757, "time_per_iteration": 2.7095770835876465 }, { "auxiliary_loss_clip": 0.01124771, "auxiliary_loss_mlp": 0.01038098, "balance_loss_clip": 1.02471077, "balance_loss_mlp": 1.04168725, "epoch": 0.5265594468660755, "flos": 16618202150400.0, "grad_norm": 1.6627295874916506, "language_loss": 0.8783232, "learning_rate": 1.8335730246917063e-06, "loss": 0.89995193, "num_input_tokens_seen": 188229505, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 8758, "time_per_iteration": 2.620662212371826 }, { "auxiliary_loss_clip": 0.01161237, "auxiliary_loss_mlp": 0.01038017, "balance_loss_clip": 1.02331281, "balance_loss_mlp": 1.04101717, "epoch": 0.5266195701187434, "flos": 24134125610880.0, "grad_norm": 1.4223603820435937, "language_loss": 0.75984216, "learning_rate": 1.8331965157208757e-06, "loss": 0.78183472, "num_input_tokens_seen": 188250395, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7578125, "step": 8759, "time_per_iteration": 2.604226589202881 }, { "auxiliary_loss_clip": 0.01127913, "auxiliary_loss_mlp": 0.01027125, "balance_loss_clip": 1.01419687, "balance_loss_mlp": 1.03911138, "epoch": 0.5266796933714114, "flos": 15844111125120.0, "grad_norm": 1.9524874139978907, "language_loss": 0.71748316, "learning_rate": 1.8328200127028324e-06, "loss": 0.73903352, "num_input_tokens_seen": 188266785, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 8760, "time_per_iteration": 2.5642378330230713 }, { "auxiliary_loss_clip": 0.01138735, "auxiliary_loss_mlp": 0.0103594, "balance_loss_clip": 1.02151513, "balance_loss_mlp": 1.04281664, "epoch": 0.5267398166240793, "flos": 20951434149120.0, "grad_norm": 2.4424218501059816, "language_loss": 0.75502336, "learning_rate": 1.832443515651013e-06, "loss": 0.77677006, "num_input_tokens_seen": 188282525, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 8761, "time_per_iteration": 2.6105167865753174 }, { "auxiliary_loss_clip": 0.01122674, "auxiliary_loss_mlp": 0.01279092, "balance_loss_clip": 1.01945019, "balance_loss_mlp": 1.03973699, "epoch": 0.5267999398767473, "flos": 20996394998400.0, "grad_norm": 1.7306076083061817, "language_loss": 0.70445609, "learning_rate": 1.8320670245788534e-06, "loss": 0.72847378, "num_input_tokens_seen": 188301395, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73828125, "step": 8762, "time_per_iteration": 2.5364012718200684 }, { "auxiliary_loss_clip": 0.01053951, "auxiliary_loss_mlp": 0.01007168, "balance_loss_clip": 1.00553501, "balance_loss_mlp": 1.01196051, "epoch": 0.5268600631294152, "flos": 66849401767680.0, "grad_norm": 0.9095887157264221, "language_loss": 0.65471333, "learning_rate": 1.8316905394997895e-06, "loss": 0.67532456, "num_input_tokens_seen": 188357665, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.24023438, "step": 8763, "time_per_iteration": 3.0566089153289795 }, { "auxiliary_loss_clip": 0.01122088, "auxiliary_loss_mlp": 0.01025651, "balance_loss_clip": 1.01189423, "balance_loss_mlp": 1.03971505, "epoch": 0.5269201863820833, "flos": 17165552572800.0, "grad_norm": 1.73215898679669, "language_loss": 0.71110207, "learning_rate": 1.8313140604272577e-06, "loss": 0.73257947, "num_input_tokens_seen": 188376935, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 8764, "time_per_iteration": 2.5299670696258545 }, { "auxiliary_loss_clip": 0.01142249, "auxiliary_loss_mlp": 0.01033657, "balance_loss_clip": 1.01880395, "balance_loss_mlp": 1.04033899, "epoch": 0.5269803096347513, "flos": 20522589672960.0, "grad_norm": 3.611390482614295, "language_loss": 0.74459857, "learning_rate": 1.8309375873746926e-06, "loss": 0.76635766, "num_input_tokens_seen": 188394995, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.74609375, "step": 8765, "time_per_iteration": 2.628270387649536 }, { "auxiliary_loss_clip": 0.01132992, "auxiliary_loss_mlp": 0.01030759, "balance_loss_clip": 1.0159595, "balance_loss_mlp": 1.03833055, "epoch": 0.5270404328874192, "flos": 27230989524480.0, "grad_norm": 1.5311600733091886, "language_loss": 0.85369241, "learning_rate": 1.8305611203555307e-06, "loss": 0.87532997, "num_input_tokens_seen": 188415475, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.76953125, "step": 8766, "time_per_iteration": 2.55794095993042 }, { "auxiliary_loss_clip": 0.01118443, "auxiliary_loss_mlp": 0.01035978, "balance_loss_clip": 1.02148187, "balance_loss_mlp": 1.04229915, "epoch": 0.5271005561400872, "flos": 23110491824640.0, "grad_norm": 1.7319532573975085, "language_loss": 0.79008901, "learning_rate": 1.8301846593832064e-06, "loss": 0.81163323, "num_input_tokens_seen": 188435665, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76171875, "step": 8767, "time_per_iteration": 2.6466481685638428 }, { "auxiliary_loss_clip": 0.01127092, "auxiliary_loss_mlp": 0.01033819, "balance_loss_clip": 1.01940024, "balance_loss_mlp": 1.04242468, "epoch": 0.5271606793927551, "flos": 22564793427840.0, "grad_norm": 1.8763891112099156, "language_loss": 0.73753905, "learning_rate": 1.8298082044711544e-06, "loss": 0.75914812, "num_input_tokens_seen": 188455405, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 8768, "time_per_iteration": 2.520843267440796 }, { "auxiliary_loss_clip": 0.01131275, "auxiliary_loss_mlp": 0.01034225, "balance_loss_clip": 1.02204216, "balance_loss_mlp": 1.04116583, "epoch": 0.5272208026454231, "flos": 18764259102720.0, "grad_norm": 1.8215832934974556, "language_loss": 0.73630446, "learning_rate": 1.8294317556328102e-06, "loss": 0.75795949, "num_input_tokens_seen": 188472940, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.72265625, "step": 8769, "time_per_iteration": 2.5679538249969482 }, { "auxiliary_loss_clip": 0.01141372, "auxiliary_loss_mlp": 0.01036349, "balance_loss_clip": 1.02267003, "balance_loss_mlp": 1.04091299, "epoch": 0.5272809258980911, "flos": 20412164286720.0, "grad_norm": 2.3103677375932126, "language_loss": 0.7389099, "learning_rate": 1.8290553128816077e-06, "loss": 0.76068711, "num_input_tokens_seen": 188493035, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 8770, "time_per_iteration": 2.5281031131744385 }, { "auxiliary_loss_clip": 0.0111685, "auxiliary_loss_mlp": 0.01033616, "balance_loss_clip": 1.02038956, "balance_loss_mlp": 1.04279983, "epoch": 0.5273410491507591, "flos": 28256742213120.0, "grad_norm": 1.781252033821367, "language_loss": 0.68128884, "learning_rate": 1.8286788762309816e-06, "loss": 0.70279354, "num_input_tokens_seen": 188513860, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 8771, "time_per_iteration": 2.6107330322265625 }, { "auxiliary_loss_clip": 0.01122899, "auxiliary_loss_mlp": 0.01033926, "balance_loss_clip": 1.02051544, "balance_loss_mlp": 1.04123211, "epoch": 0.527401172403427, "flos": 22455158140800.0, "grad_norm": 1.8573769893946617, "language_loss": 0.76891375, "learning_rate": 1.8283024456943659e-06, "loss": 0.79048193, "num_input_tokens_seen": 188533345, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 8772, "time_per_iteration": 2.5068438053131104 }, { "auxiliary_loss_clip": 0.01128569, "auxiliary_loss_mlp": 0.01037753, "balance_loss_clip": 1.02350128, "balance_loss_mlp": 1.04199505, "epoch": 0.527461295656095, "flos": 21031084558080.0, "grad_norm": 1.9468127328564282, "language_loss": 0.65721518, "learning_rate": 1.8279260212851938e-06, "loss": 0.67887837, "num_input_tokens_seen": 188551550, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 8773, "time_per_iteration": 2.610044240951538 }, { "auxiliary_loss_clip": 0.01142528, "auxiliary_loss_mlp": 0.01039762, "balance_loss_clip": 1.02716148, "balance_loss_mlp": 1.04082036, "epoch": 0.5275214189087629, "flos": 26322018929280.0, "grad_norm": 1.701975298931846, "language_loss": 0.86121565, "learning_rate": 1.8275496030169e-06, "loss": 0.88303852, "num_input_tokens_seen": 188571615, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.75, "step": 8774, "time_per_iteration": 2.6804637908935547 }, { "auxiliary_loss_clip": 0.01132141, "auxiliary_loss_mlp": 0.01031435, "balance_loss_clip": 1.01862061, "balance_loss_mlp": 1.04044783, "epoch": 0.5275815421614309, "flos": 20047024581120.0, "grad_norm": 1.6973908492106586, "language_loss": 0.8003183, "learning_rate": 1.8271731909029164e-06, "loss": 0.82195401, "num_input_tokens_seen": 188591965, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 8775, "time_per_iteration": 2.63141131401062 }, { "auxiliary_loss_clip": 0.0113081, "auxiliary_loss_mlp": 0.01039797, "balance_loss_clip": 1.02590275, "balance_loss_mlp": 1.04032087, "epoch": 0.5276416654140988, "flos": 21432206712960.0, "grad_norm": 2.0347015917532167, "language_loss": 0.83253026, "learning_rate": 1.8267967849566776e-06, "loss": 0.85423625, "num_input_tokens_seen": 188610675, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 8776, "time_per_iteration": 2.542172431945801 }, { "auxiliary_loss_clip": 0.01133876, "auxiliary_loss_mlp": 0.01029423, "balance_loss_clip": 1.01549935, "balance_loss_mlp": 1.04058337, "epoch": 0.5277017886667669, "flos": 17165085696000.0, "grad_norm": 1.946394597499737, "language_loss": 0.67819631, "learning_rate": 1.8264203851916155e-06, "loss": 0.69982928, "num_input_tokens_seen": 188628235, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75390625, "step": 8777, "time_per_iteration": 2.5656650066375732 }, { "auxiliary_loss_clip": 0.01133706, "auxiliary_loss_mlp": 0.01038051, "balance_loss_clip": 1.0248009, "balance_loss_mlp": 1.04288054, "epoch": 0.5277619119194349, "flos": 20448146736000.0, "grad_norm": 1.5355935439437085, "language_loss": 0.81997728, "learning_rate": 1.826043991621164e-06, "loss": 0.84169483, "num_input_tokens_seen": 188648925, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 8778, "time_per_iteration": 2.65859317779541 }, { "auxiliary_loss_clip": 0.0112776, "auxiliary_loss_mlp": 0.01037447, "balance_loss_clip": 1.02304018, "balance_loss_mlp": 1.04290473, "epoch": 0.5278220351721028, "flos": 24061083304320.0, "grad_norm": 3.1508470370004735, "language_loss": 0.79029167, "learning_rate": 1.825667604258755e-06, "loss": 0.81194377, "num_input_tokens_seen": 188668125, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 8779, "time_per_iteration": 2.616138458251953 }, { "auxiliary_loss_clip": 0.01120787, "auxiliary_loss_mlp": 0.01032449, "balance_loss_clip": 1.01940119, "balance_loss_mlp": 1.03969443, "epoch": 0.5278821584247708, "flos": 24642907804800.0, "grad_norm": 2.624820057894263, "language_loss": 0.76112545, "learning_rate": 1.82529122311782e-06, "loss": 0.78265786, "num_input_tokens_seen": 188684410, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 8780, "time_per_iteration": 3.9441113471984863 }, { "auxiliary_loss_clip": 0.01148555, "auxiliary_loss_mlp": 0.01034579, "balance_loss_clip": 1.01997626, "balance_loss_mlp": 1.04465342, "epoch": 0.5279422816774387, "flos": 35408244240000.0, "grad_norm": 5.099937526772832, "language_loss": 0.69279259, "learning_rate": 1.8249148482117925e-06, "loss": 0.71462393, "num_input_tokens_seen": 188706130, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.77734375, "step": 8781, "time_per_iteration": 2.7585442066192627 }, { "auxiliary_loss_clip": 0.01121346, "auxiliary_loss_mlp": 0.0103851, "balance_loss_clip": 1.02624893, "balance_loss_mlp": 1.04022813, "epoch": 0.5280024049301068, "flos": 22967028904320.0, "grad_norm": 3.3669921205371787, "language_loss": 0.71902251, "learning_rate": 1.8245384795541033e-06, "loss": 0.74062109, "num_input_tokens_seen": 188725030, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.72265625, "step": 8782, "time_per_iteration": 2.6457202434539795 }, { "auxiliary_loss_clip": 0.01131324, "auxiliary_loss_mlp": 0.01030179, "balance_loss_clip": 1.01700091, "balance_loss_mlp": 1.0413537, "epoch": 0.5280625281827747, "flos": 21507619317120.0, "grad_norm": 1.937843856357643, "language_loss": 0.68344009, "learning_rate": 1.8241621171581846e-06, "loss": 0.70505512, "num_input_tokens_seen": 188744325, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 8783, "time_per_iteration": 2.532680034637451 }, { "auxiliary_loss_clip": 0.01132639, "auxiliary_loss_mlp": 0.01043015, "balance_loss_clip": 1.02692127, "balance_loss_mlp": 1.04347634, "epoch": 0.5281226514354427, "flos": 31208167958400.0, "grad_norm": 1.7565957982225198, "language_loss": 0.69450688, "learning_rate": 1.8237857610374678e-06, "loss": 0.71626341, "num_input_tokens_seen": 188765100, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8046875, "step": 8784, "time_per_iteration": 4.008503675460815 }, { "auxiliary_loss_clip": 0.01120085, "auxiliary_loss_mlp": 0.01033855, "balance_loss_clip": 1.01897192, "balance_loss_mlp": 1.04064703, "epoch": 0.5281827746881106, "flos": 25077821679360.0, "grad_norm": 3.202124291966952, "language_loss": 0.74745244, "learning_rate": 1.8234094112053836e-06, "loss": 0.76899189, "num_input_tokens_seen": 188783995, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.79296875, "step": 8785, "time_per_iteration": 2.5461795330047607 }, { "auxiliary_loss_clip": 0.01127087, "auxiliary_loss_mlp": 0.01034196, "balance_loss_clip": 1.01934862, "balance_loss_mlp": 1.04231882, "epoch": 0.5282428979407786, "flos": 20631255292800.0, "grad_norm": 1.704211947709369, "language_loss": 0.83050847, "learning_rate": 1.8230330676753637e-06, "loss": 0.85212129, "num_input_tokens_seen": 188803120, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7578125, "step": 8786, "time_per_iteration": 2.560488224029541 }, { "auxiliary_loss_clip": 0.01137005, "auxiliary_loss_mlp": 0.01024074, "balance_loss_clip": 1.01221859, "balance_loss_mlp": 1.04002237, "epoch": 0.5283030211934465, "flos": 22419391173120.0, "grad_norm": 4.466807337465933, "language_loss": 0.82860225, "learning_rate": 1.8226567304608383e-06, "loss": 0.85021305, "num_input_tokens_seen": 188820960, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.70703125, "step": 8787, "time_per_iteration": 2.570277452468872 }, { "auxiliary_loss_clip": 0.01148251, "auxiliary_loss_mlp": 0.01032009, "balance_loss_clip": 1.0196352, "balance_loss_mlp": 1.04026318, "epoch": 0.5283631444461145, "flos": 23615467176960.0, "grad_norm": 1.9361357723655992, "language_loss": 0.83332169, "learning_rate": 1.822280399575238e-06, "loss": 0.85512429, "num_input_tokens_seen": 188837165, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.72265625, "step": 8788, "time_per_iteration": 2.563563346862793 }, { "auxiliary_loss_clip": 0.01142491, "auxiliary_loss_mlp": 0.01042604, "balance_loss_clip": 1.02761996, "balance_loss_mlp": 1.04294372, "epoch": 0.5284232676987825, "flos": 32671994918400.0, "grad_norm": 1.605326184111669, "language_loss": 0.75122947, "learning_rate": 1.821904075031993e-06, "loss": 0.77308047, "num_input_tokens_seen": 188858555, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7265625, "step": 8789, "time_per_iteration": 2.719892978668213 }, { "auxiliary_loss_clip": 0.01127849, "auxiliary_loss_mlp": 0.01034926, "balance_loss_clip": 1.02104425, "balance_loss_mlp": 1.04292631, "epoch": 0.5284833909514505, "flos": 26760919213440.0, "grad_norm": 1.6470117659036638, "language_loss": 0.69606435, "learning_rate": 1.821527756844533e-06, "loss": 0.71769214, "num_input_tokens_seen": 188879050, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 8790, "time_per_iteration": 4.07992959022522 }, { "auxiliary_loss_clip": 0.01110955, "auxiliary_loss_mlp": 0.01029302, "balance_loss_clip": 1.01675534, "balance_loss_mlp": 1.03957653, "epoch": 0.5285435142041185, "flos": 22090700793600.0, "grad_norm": 1.4597217343661173, "language_loss": 0.78631926, "learning_rate": 1.821151445026289e-06, "loss": 0.80772185, "num_input_tokens_seen": 188898885, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 8791, "time_per_iteration": 4.1126182079315186 }, { "auxiliary_loss_clip": 0.01153118, "auxiliary_loss_mlp": 0.0102875, "balance_loss_clip": 1.01568508, "balance_loss_mlp": 1.04373896, "epoch": 0.5286036374567864, "flos": 20375463565440.0, "grad_norm": 2.2346141597330775, "language_loss": 0.66564536, "learning_rate": 1.820775139590689e-06, "loss": 0.68746406, "num_input_tokens_seen": 188917225, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 8792, "time_per_iteration": 2.657801389694214 }, { "auxiliary_loss_clip": 0.01129201, "auxiliary_loss_mlp": 0.01036937, "balance_loss_clip": 1.02474797, "balance_loss_mlp": 1.03891146, "epoch": 0.5286637607094544, "flos": 24352175122560.0, "grad_norm": 2.9371087299533567, "language_loss": 0.79894733, "learning_rate": 1.820398840551164e-06, "loss": 0.82060874, "num_input_tokens_seen": 188936120, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.72265625, "step": 8793, "time_per_iteration": 2.616136312484741 }, { "auxiliary_loss_clip": 0.01116081, "auxiliary_loss_mlp": 0.01044074, "balance_loss_clip": 1.02982879, "balance_loss_mlp": 1.04166675, "epoch": 0.5287238839621223, "flos": 17271165536640.0, "grad_norm": 2.0881937989869233, "language_loss": 0.85112059, "learning_rate": 1.8200225479211416e-06, "loss": 0.87272215, "num_input_tokens_seen": 188953405, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.74609375, "step": 8794, "time_per_iteration": 2.534052610397339 }, { "auxiliary_loss_clip": 0.01138335, "auxiliary_loss_mlp": 0.01037138, "balance_loss_clip": 1.02250457, "balance_loss_mlp": 1.04134893, "epoch": 0.5287840072147904, "flos": 17566890209280.0, "grad_norm": 2.326011418324513, "language_loss": 0.67962891, "learning_rate": 1.8196462617140525e-06, "loss": 0.70138365, "num_input_tokens_seen": 188971150, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 8795, "time_per_iteration": 2.569524049758911 }, { "auxiliary_loss_clip": 0.01135912, "auxiliary_loss_mlp": 0.01036229, "balance_loss_clip": 1.02269316, "balance_loss_mlp": 1.04172158, "epoch": 0.5288441304674583, "flos": 18552099421440.0, "grad_norm": 1.9859139386515627, "language_loss": 0.80356848, "learning_rate": 1.8192699819433242e-06, "loss": 0.82528985, "num_input_tokens_seen": 188989550, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76171875, "step": 8796, "time_per_iteration": 2.58189058303833 }, { "auxiliary_loss_clip": 0.01147324, "auxiliary_loss_mlp": 0.01036786, "balance_loss_clip": 1.02231383, "balance_loss_mlp": 1.04428124, "epoch": 0.5289042537201263, "flos": 20814507504000.0, "grad_norm": 1.5231942502777416, "language_loss": 0.690804, "learning_rate": 1.8188937086223847e-06, "loss": 0.71264511, "num_input_tokens_seen": 189008795, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76171875, "step": 8797, "time_per_iteration": 2.6093504428863525 }, { "auxiliary_loss_clip": 0.01120902, "auxiliary_loss_mlp": 0.01034894, "balance_loss_clip": 1.02212727, "balance_loss_mlp": 1.04068279, "epoch": 0.5289643769727942, "flos": 15735265937280.0, "grad_norm": 1.600480904958281, "language_loss": 0.82303751, "learning_rate": 1.8185174417646633e-06, "loss": 0.84459543, "num_input_tokens_seen": 189025540, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 8798, "time_per_iteration": 2.585615396499634 }, { "auxiliary_loss_clip": 0.01136583, "auxiliary_loss_mlp": 0.01287633, "balance_loss_clip": 1.02623415, "balance_loss_mlp": 1.04109335, "epoch": 0.5290245002254622, "flos": 19537308633600.0, "grad_norm": 1.737091402124393, "language_loss": 0.70585704, "learning_rate": 1.8181411813835873e-06, "loss": 0.7300992, "num_input_tokens_seen": 189044885, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 8799, "time_per_iteration": 2.619570016860962 }, { "auxiliary_loss_clip": 0.01121757, "auxiliary_loss_mlp": 0.01036535, "balance_loss_clip": 1.02391028, "balance_loss_mlp": 1.04267764, "epoch": 0.5290846234781301, "flos": 15815131827840.0, "grad_norm": 2.036472599969866, "language_loss": 0.69072092, "learning_rate": 1.8177649274925852e-06, "loss": 0.71230388, "num_input_tokens_seen": 189061280, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 8800, "time_per_iteration": 2.5067155361175537 }, { "auxiliary_loss_clip": 0.01065714, "auxiliary_loss_mlp": 0.01005676, "balance_loss_clip": 1.00404286, "balance_loss_mlp": 1.01486039, "epoch": 0.5291447467307981, "flos": 70057624821120.0, "grad_norm": 0.9263504263583986, "language_loss": 0.56963027, "learning_rate": 1.8173886801050842e-06, "loss": 0.59034407, "num_input_tokens_seen": 189114775, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.24414062, "step": 8801, "time_per_iteration": 3.046597719192505 }, { "auxiliary_loss_clip": 0.01136561, "auxiliary_loss_mlp": 0.01035414, "balance_loss_clip": 1.0217582, "balance_loss_mlp": 1.04301322, "epoch": 0.529204869983466, "flos": 28364186770560.0, "grad_norm": 1.5992878536929038, "language_loss": 0.63620305, "learning_rate": 1.8170124392345113e-06, "loss": 0.65792286, "num_input_tokens_seen": 189134700, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 8802, "time_per_iteration": 2.593573808670044 }, { "auxiliary_loss_clip": 0.01131187, "auxiliary_loss_mlp": 0.01030715, "balance_loss_clip": 1.01713657, "balance_loss_mlp": 1.03984976, "epoch": 0.5292649932361341, "flos": 33758830684800.0, "grad_norm": 1.5008006333014219, "language_loss": 0.68799973, "learning_rate": 1.8166362048942935e-06, "loss": 0.70961869, "num_input_tokens_seen": 189155365, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 8803, "time_per_iteration": 2.6543238162994385 }, { "auxiliary_loss_clip": 0.01120613, "auxiliary_loss_mlp": 0.01284879, "balance_loss_clip": 1.02426147, "balance_loss_mlp": 1.03997612, "epoch": 0.5293251164888021, "flos": 20447679859200.0, "grad_norm": 1.6696946179231875, "language_loss": 0.76492476, "learning_rate": 1.816259977097858e-06, "loss": 0.78897977, "num_input_tokens_seen": 189173885, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 8804, "time_per_iteration": 2.4964070320129395 }, { "auxiliary_loss_clip": 0.01048328, "auxiliary_loss_mlp": 0.0100337, "balance_loss_clip": 1.00177276, "balance_loss_mlp": 1.01511431, "epoch": 0.52938523974147, "flos": 66545312204160.0, "grad_norm": 0.7576060954190332, "language_loss": 0.52974164, "learning_rate": 1.8158837558586313e-06, "loss": 0.55025864, "num_input_tokens_seen": 189236515, "router_z_loss_clip": 0.01599121, "router_z_loss_mlp": 0.24121094, "step": 8805, "time_per_iteration": 3.260002613067627 }, { "auxiliary_loss_clip": 0.0104908, "auxiliary_loss_mlp": 0.01001845, "balance_loss_clip": 1.00023556, "balance_loss_mlp": 1.01583672, "epoch": 0.529445362994138, "flos": 67151734542720.0, "grad_norm": 0.740630492761266, "language_loss": 0.63769954, "learning_rate": 1.8155075411900398e-06, "loss": 0.65820879, "num_input_tokens_seen": 189300500, "router_z_loss_clip": 0.01611328, "router_z_loss_mlp": 0.24121094, "step": 8806, "time_per_iteration": 3.2082529067993164 }, { "auxiliary_loss_clip": 0.01128308, "auxiliary_loss_mlp": 0.01034119, "balance_loss_clip": 1.02080369, "balance_loss_mlp": 1.04162788, "epoch": 0.5295054862468059, "flos": 17749316407680.0, "grad_norm": 1.5607951152709114, "language_loss": 0.722543, "learning_rate": 1.8151313331055094e-06, "loss": 0.74416733, "num_input_tokens_seen": 189319745, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.77734375, "step": 8807, "time_per_iteration": 2.6224138736724854 }, { "auxiliary_loss_clip": 0.01140881, "auxiliary_loss_mlp": 0.01029567, "balance_loss_clip": 1.01732469, "balance_loss_mlp": 1.04181075, "epoch": 0.529565609499474, "flos": 11397401084160.0, "grad_norm": 3.5384555485319438, "language_loss": 0.68945128, "learning_rate": 1.8147551316184661e-06, "loss": 0.71115577, "num_input_tokens_seen": 189334550, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.72265625, "step": 8808, "time_per_iteration": 2.6652278900146484 }, { "auxiliary_loss_clip": 0.01124378, "auxiliary_loss_mlp": 0.01033322, "balance_loss_clip": 1.02030408, "balance_loss_mlp": 1.04203403, "epoch": 0.5296257327521419, "flos": 17196363463680.0, "grad_norm": 1.7974009485146232, "language_loss": 0.86850584, "learning_rate": 1.8143789367423356e-06, "loss": 0.89008284, "num_input_tokens_seen": 189351735, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 8809, "time_per_iteration": 2.4989848136901855 }, { "auxiliary_loss_clip": 0.01130278, "auxiliary_loss_mlp": 0.01034521, "balance_loss_clip": 1.01961994, "balance_loss_mlp": 1.04291296, "epoch": 0.5296858560048099, "flos": 39964086777600.0, "grad_norm": 15.952084218223678, "language_loss": 0.63989997, "learning_rate": 1.8140027484905438e-06, "loss": 0.66154802, "num_input_tokens_seen": 189373105, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7890625, "step": 8810, "time_per_iteration": 2.696634531021118 }, { "auxiliary_loss_clip": 0.01112694, "auxiliary_loss_mlp": 0.01036119, "balance_loss_clip": 1.02303028, "balance_loss_mlp": 1.04047632, "epoch": 0.5297459792574778, "flos": 20961418129920.0, "grad_norm": 1.561400225357745, "language_loss": 0.67794204, "learning_rate": 1.8136265668765153e-06, "loss": 0.69943017, "num_input_tokens_seen": 189394615, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 8811, "time_per_iteration": 2.587100028991699 }, { "auxiliary_loss_clip": 0.01073924, "auxiliary_loss_mlp": 0.01001961, "balance_loss_clip": 1.00041127, "balance_loss_mlp": 1.0140295, "epoch": 0.5298061025101458, "flos": 66523620389760.0, "grad_norm": 0.6629980695767747, "language_loss": 0.53357607, "learning_rate": 1.813250391913675e-06, "loss": 0.55433494, "num_input_tokens_seen": 189459750, "router_z_loss_clip": 0.01544189, "router_z_loss_mlp": 0.24023438, "step": 8812, "time_per_iteration": 3.299679756164551 }, { "auxiliary_loss_clip": 0.01053145, "auxiliary_loss_mlp": 0.01002432, "balance_loss_clip": 1.00083506, "balance_loss_mlp": 1.01207852, "epoch": 0.5298662257628137, "flos": 67662994775040.0, "grad_norm": 0.7323558187443331, "language_loss": 0.56325674, "learning_rate": 1.8128742236154482e-06, "loss": 0.58381248, "num_input_tokens_seen": 189527540, "router_z_loss_clip": 0.01599121, "router_z_loss_mlp": 0.24023438, "step": 8813, "time_per_iteration": 3.3031060695648193 }, { "auxiliary_loss_clip": 0.01147012, "auxiliary_loss_mlp": 0.01034118, "balance_loss_clip": 1.02111864, "balance_loss_mlp": 1.04144228, "epoch": 0.5299263490154817, "flos": 19646405216640.0, "grad_norm": 1.6923361655567928, "language_loss": 0.81501269, "learning_rate": 1.8124980619952585e-06, "loss": 0.836824, "num_input_tokens_seen": 189546900, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 8814, "time_per_iteration": 2.63775634765625 }, { "auxiliary_loss_clip": 0.0111877, "auxiliary_loss_mlp": 0.01028122, "balance_loss_clip": 1.01519346, "balance_loss_mlp": 1.0424751, "epoch": 0.5299864722681497, "flos": 22055005653120.0, "grad_norm": 1.4786129883208623, "language_loss": 0.84962857, "learning_rate": 1.8121219070665312e-06, "loss": 0.87109745, "num_input_tokens_seen": 189566490, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.765625, "step": 8815, "time_per_iteration": 2.5322000980377197 }, { "auxiliary_loss_clip": 0.01113737, "auxiliary_loss_mlp": 0.01035945, "balance_loss_clip": 1.02328515, "balance_loss_mlp": 1.04286635, "epoch": 0.5300465955208177, "flos": 21763698353280.0, "grad_norm": 1.9403494269509765, "language_loss": 0.65821511, "learning_rate": 1.8117457588426893e-06, "loss": 0.67971194, "num_input_tokens_seen": 189585580, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 8816, "time_per_iteration": 2.54081654548645 }, { "auxiliary_loss_clip": 0.01134103, "auxiliary_loss_mlp": 0.01035735, "balance_loss_clip": 1.0227232, "balance_loss_mlp": 1.04263186, "epoch": 0.5301067187734857, "flos": 42996491735040.0, "grad_norm": 1.7899552942517898, "language_loss": 0.72151947, "learning_rate": 1.8113696173371578e-06, "loss": 0.74321783, "num_input_tokens_seen": 189608485, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 8817, "time_per_iteration": 2.791555166244507 }, { "auxiliary_loss_clip": 0.0113426, "auxiliary_loss_mlp": 0.01033083, "balance_loss_clip": 1.01946378, "balance_loss_mlp": 1.0411787, "epoch": 0.5301668420261536, "flos": 20554298403840.0, "grad_norm": 1.7017891991770921, "language_loss": 0.65562356, "learning_rate": 1.810993482563359e-06, "loss": 0.677297, "num_input_tokens_seen": 189627815, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75, "step": 8818, "time_per_iteration": 2.589634656906128 }, { "auxiliary_loss_clip": 0.01153454, "auxiliary_loss_mlp": 0.01030576, "balance_loss_clip": 1.01612163, "balance_loss_mlp": 1.04288518, "epoch": 0.5302269652788216, "flos": 17486665182720.0, "grad_norm": 2.0981586170078765, "language_loss": 0.74932843, "learning_rate": 1.8106173545347164e-06, "loss": 0.77116871, "num_input_tokens_seen": 189644850, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.74609375, "step": 8819, "time_per_iteration": 2.5987114906311035 }, { "auxiliary_loss_clip": 0.01146539, "auxiliary_loss_mlp": 0.01035182, "balance_loss_clip": 1.02100241, "balance_loss_mlp": 1.04285312, "epoch": 0.5302870885314895, "flos": 14574202715520.0, "grad_norm": 2.2091591142659355, "language_loss": 0.82133174, "learning_rate": 1.8102412332646536e-06, "loss": 0.84314895, "num_input_tokens_seen": 189660945, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76953125, "step": 8820, "time_per_iteration": 2.6028945446014404 }, { "auxiliary_loss_clip": 0.01133239, "auxiliary_loss_mlp": 0.01033936, "balance_loss_clip": 1.02190137, "balance_loss_mlp": 1.04152441, "epoch": 0.5303472117841576, "flos": 23438032968960.0, "grad_norm": 1.4762758048322357, "language_loss": 0.72886813, "learning_rate": 1.8098651187665923e-06, "loss": 0.7505399, "num_input_tokens_seen": 189680425, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.7421875, "step": 8821, "time_per_iteration": 4.030205249786377 }, { "auxiliary_loss_clip": 0.01129531, "auxiliary_loss_mlp": 0.01028348, "balance_loss_clip": 1.01606965, "balance_loss_mlp": 1.04020882, "epoch": 0.5304073350368255, "flos": 22709010533760.0, "grad_norm": 1.4837621773232048, "language_loss": 0.74068463, "learning_rate": 1.8094890110539567e-06, "loss": 0.76226342, "num_input_tokens_seen": 189700375, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 8822, "time_per_iteration": 2.59328556060791 }, { "auxiliary_loss_clip": 0.01140988, "auxiliary_loss_mlp": 0.01035839, "balance_loss_clip": 1.02212954, "balance_loss_mlp": 1.04077911, "epoch": 0.5304674582894935, "flos": 27928554624000.0, "grad_norm": 1.9065016329967854, "language_loss": 0.6740737, "learning_rate": 1.809112910140168e-06, "loss": 0.69584191, "num_input_tokens_seen": 189721225, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 8823, "time_per_iteration": 2.6828930377960205 }, { "auxiliary_loss_clip": 0.01118301, "auxiliary_loss_mlp": 0.010378, "balance_loss_clip": 1.02254152, "balance_loss_mlp": 1.04109836, "epoch": 0.5305275815421614, "flos": 21250642440960.0, "grad_norm": 1.69123972638318, "language_loss": 0.69250977, "learning_rate": 1.8087368160386483e-06, "loss": 0.7140708, "num_input_tokens_seen": 189740170, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7734375, "step": 8824, "time_per_iteration": 2.5299770832061768 }, { "auxiliary_loss_clip": 0.01128579, "auxiliary_loss_mlp": 0.01278092, "balance_loss_clip": 1.01828635, "balance_loss_mlp": 1.04004872, "epoch": 0.5305877047948294, "flos": 17603088140160.0, "grad_norm": 2.102528870936022, "language_loss": 0.76084584, "learning_rate": 1.8083607287628198e-06, "loss": 0.78491259, "num_input_tokens_seen": 189757890, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 8825, "time_per_iteration": 3.934340000152588 }, { "auxiliary_loss_clip": 0.0116523, "auxiliary_loss_mlp": 0.01043742, "balance_loss_clip": 1.03006852, "balance_loss_mlp": 1.03889203, "epoch": 0.5306478280474973, "flos": 15195493284480.0, "grad_norm": 1.7907063183684477, "language_loss": 0.85208762, "learning_rate": 1.8079846483261035e-06, "loss": 0.87417734, "num_input_tokens_seen": 189775390, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 8826, "time_per_iteration": 2.645573616027832 }, { "auxiliary_loss_clip": 0.01110248, "auxiliary_loss_mlp": 0.01033231, "balance_loss_clip": 1.01974821, "balance_loss_mlp": 1.03906071, "epoch": 0.5307079513001653, "flos": 15341218761600.0, "grad_norm": 1.4117085155294657, "language_loss": 0.64357948, "learning_rate": 1.807608574741922e-06, "loss": 0.66501421, "num_input_tokens_seen": 189793975, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 8827, "time_per_iteration": 2.5360846519470215 }, { "auxiliary_loss_clip": 0.01136422, "auxiliary_loss_mlp": 0.01039633, "balance_loss_clip": 1.02582884, "balance_loss_mlp": 1.04345608, "epoch": 0.5307680745528333, "flos": 43544452688640.0, "grad_norm": 1.7955425131027676, "language_loss": 0.59803998, "learning_rate": 1.8072325080236951e-06, "loss": 0.61980057, "num_input_tokens_seen": 189817870, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75390625, "step": 8828, "time_per_iteration": 2.8463425636291504 }, { "auxiliary_loss_clip": 0.01125006, "auxiliary_loss_mlp": 0.01033708, "balance_loss_clip": 1.01979601, "balance_loss_mlp": 1.04098725, "epoch": 0.5308281978055013, "flos": 20048928001920.0, "grad_norm": 7.552026518433515, "language_loss": 0.81336808, "learning_rate": 1.806856448184844e-06, "loss": 0.83495522, "num_input_tokens_seen": 189837905, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 8829, "time_per_iteration": 2.579817533493042 }, { "auxiliary_loss_clip": 0.01122552, "auxiliary_loss_mlp": 0.01032848, "balance_loss_clip": 1.01915622, "balance_loss_mlp": 1.04117298, "epoch": 0.5308883210581693, "flos": 20703938463360.0, "grad_norm": 1.554445344084018, "language_loss": 0.78139436, "learning_rate": 1.80648039523879e-06, "loss": 0.80294842, "num_input_tokens_seen": 189856970, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 8830, "time_per_iteration": 2.6166133880615234 }, { "auxiliary_loss_clip": 0.01121871, "auxiliary_loss_mlp": 0.01030109, "balance_loss_clip": 1.01731205, "balance_loss_mlp": 1.04031515, "epoch": 0.5309484443108372, "flos": 14355506759040.0, "grad_norm": 1.967139471670703, "language_loss": 0.80528677, "learning_rate": 1.8061043491989523e-06, "loss": 0.82680655, "num_input_tokens_seen": 189872830, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 8831, "time_per_iteration": 2.4835195541381836 }, { "auxiliary_loss_clip": 0.01122399, "auxiliary_loss_mlp": 0.01031906, "balance_loss_clip": 1.01876307, "balance_loss_mlp": 1.04086339, "epoch": 0.5310085675635052, "flos": 20010503427840.0, "grad_norm": 1.9937499614711842, "language_loss": 0.73232955, "learning_rate": 1.8057283100787524e-06, "loss": 0.75387263, "num_input_tokens_seen": 189891635, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 8832, "time_per_iteration": 5.589354038238525 }, { "auxiliary_loss_clip": 0.0114447, "auxiliary_loss_mlp": 0.01033521, "balance_loss_clip": 1.0194366, "balance_loss_mlp": 1.04219973, "epoch": 0.5310686908161731, "flos": 22127293774080.0, "grad_norm": 1.962495439544104, "language_loss": 0.75518411, "learning_rate": 1.805352277891609e-06, "loss": 0.77696407, "num_input_tokens_seen": 189909050, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 8833, "time_per_iteration": 2.573326826095581 }, { "auxiliary_loss_clip": 0.01141733, "auxiliary_loss_mlp": 0.01030765, "balance_loss_clip": 1.01682901, "balance_loss_mlp": 1.04038429, "epoch": 0.5311288140688412, "flos": 24717889445760.0, "grad_norm": 2.7751330222865245, "language_loss": 0.73142397, "learning_rate": 1.8049762526509416e-06, "loss": 0.75314891, "num_input_tokens_seen": 189927405, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 8834, "time_per_iteration": 2.5678672790527344 }, { "auxiliary_loss_clip": 0.0111866, "auxiliary_loss_mlp": 0.01039908, "balance_loss_clip": 1.02445793, "balance_loss_mlp": 1.0403564, "epoch": 0.5311889373215091, "flos": 24097712198400.0, "grad_norm": 2.012449045322975, "language_loss": 0.7788707, "learning_rate": 1.8046002343701708e-06, "loss": 0.80045629, "num_input_tokens_seen": 189947740, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.78125, "step": 8835, "time_per_iteration": 2.552856922149658 }, { "auxiliary_loss_clip": 0.01132527, "auxiliary_loss_mlp": 0.01036889, "balance_loss_clip": 1.02257204, "balance_loss_mlp": 1.03954923, "epoch": 0.5312490605741771, "flos": 22017012042240.0, "grad_norm": 1.5707063624360351, "language_loss": 0.72365397, "learning_rate": 1.8042242230627142e-06, "loss": 0.7453481, "num_input_tokens_seen": 189966495, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75, "step": 8836, "time_per_iteration": 2.5408735275268555 }, { "auxiliary_loss_clip": 0.01132719, "auxiliary_loss_mlp": 0.01033766, "balance_loss_clip": 1.01942539, "balance_loss_mlp": 1.03984439, "epoch": 0.531309183826845, "flos": 19390541662080.0, "grad_norm": 11.759305985353, "language_loss": 0.80351651, "learning_rate": 1.8038482187419922e-06, "loss": 0.82518137, "num_input_tokens_seen": 189985325, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75390625, "step": 8837, "time_per_iteration": 2.5510244369506836 }, { "auxiliary_loss_clip": 0.01123095, "auxiliary_loss_mlp": 0.01029528, "balance_loss_clip": 1.01544952, "balance_loss_mlp": 1.04149556, "epoch": 0.531369307079513, "flos": 20190056538240.0, "grad_norm": 1.8085953252512965, "language_loss": 0.85791928, "learning_rate": 1.8034722214214223e-06, "loss": 0.87944549, "num_input_tokens_seen": 190003290, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 8838, "time_per_iteration": 2.5259592533111572 }, { "auxiliary_loss_clip": 0.0112843, "auxiliary_loss_mlp": 0.0103149, "balance_loss_clip": 1.0187223, "balance_loss_mlp": 1.03812194, "epoch": 0.5314294303321809, "flos": 18880143356160.0, "grad_norm": 1.7027355884814206, "language_loss": 0.72638279, "learning_rate": 1.8030962311144233e-06, "loss": 0.74798203, "num_input_tokens_seen": 190023260, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 8839, "time_per_iteration": 2.6083478927612305 }, { "auxiliary_loss_clip": 0.01131953, "auxiliary_loss_mlp": 0.01035175, "balance_loss_clip": 1.02142453, "balance_loss_mlp": 1.03886962, "epoch": 0.531489553584849, "flos": 23040035297280.0, "grad_norm": 1.6629808345876147, "language_loss": 0.76593292, "learning_rate": 1.8027202478344136e-06, "loss": 0.78760421, "num_input_tokens_seen": 190042035, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75390625, "step": 8840, "time_per_iteration": 2.611924886703491 }, { "auxiliary_loss_clip": 0.01142734, "auxiliary_loss_mlp": 0.01033467, "balance_loss_clip": 1.01955509, "balance_loss_mlp": 1.0396601, "epoch": 0.5315496768375169, "flos": 19790478668160.0, "grad_norm": 2.9714081694324377, "language_loss": 0.77018869, "learning_rate": 1.8023442715948105e-06, "loss": 0.7919507, "num_input_tokens_seen": 190057545, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 8841, "time_per_iteration": 2.644702672958374 }, { "auxiliary_loss_clip": 0.01055349, "auxiliary_loss_mlp": 0.01002172, "balance_loss_clip": 1.00074792, "balance_loss_mlp": 1.01391983, "epoch": 0.5316098000901849, "flos": 71023228185600.0, "grad_norm": 0.6885097998980471, "language_loss": 0.56789154, "learning_rate": 1.8019683024090323e-06, "loss": 0.58846676, "num_input_tokens_seen": 190123800, "router_z_loss_clip": 0.01422119, "router_z_loss_mlp": 0.23828125, "step": 8842, "time_per_iteration": 3.3939085006713867 }, { "auxiliary_loss_clip": 0.01134612, "auxiliary_loss_mlp": 0.01037106, "balance_loss_clip": 1.02245545, "balance_loss_mlp": 1.04084969, "epoch": 0.5316699233428529, "flos": 16435560470400.0, "grad_norm": 2.013098880184016, "language_loss": 0.73537397, "learning_rate": 1.8015923402904952e-06, "loss": 0.75709116, "num_input_tokens_seen": 190141625, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.76171875, "step": 8843, "time_per_iteration": 2.4962620735168457 }, { "auxiliary_loss_clip": 0.01053418, "auxiliary_loss_mlp": 0.01002142, "balance_loss_clip": 1.00075364, "balance_loss_mlp": 1.0126915, "epoch": 0.5317300465955208, "flos": 67420814302080.0, "grad_norm": 0.8926666062207237, "language_loss": 0.60984033, "learning_rate": 1.8012163852526179e-06, "loss": 0.63039595, "num_input_tokens_seen": 190198110, "router_z_loss_clip": 0.01391602, "router_z_loss_mlp": 0.23730469, "step": 8844, "time_per_iteration": 3.2452926635742188 }, { "auxiliary_loss_clip": 0.01044169, "auxiliary_loss_mlp": 0.01002109, "balance_loss_clip": 1.00070274, "balance_loss_mlp": 1.01147103, "epoch": 0.5317901698481888, "flos": 59508075340800.0, "grad_norm": 0.8370703765067542, "language_loss": 0.62175584, "learning_rate": 1.8008404373088164e-06, "loss": 0.64221859, "num_input_tokens_seen": 190259950, "router_z_loss_clip": 0.01403809, "router_z_loss_mlp": 0.23535156, "step": 8845, "time_per_iteration": 3.1635262966156006 }, { "auxiliary_loss_clip": 0.01141251, "auxiliary_loss_mlp": 0.01037864, "balance_loss_clip": 1.02312374, "balance_loss_mlp": 1.04000723, "epoch": 0.5318502931008567, "flos": 19129219240320.0, "grad_norm": 3.0986951390843185, "language_loss": 0.75423825, "learning_rate": 1.8004644964725069e-06, "loss": 0.77602947, "num_input_tokens_seen": 190278265, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.74609375, "step": 8846, "time_per_iteration": 2.5931406021118164 }, { "auxiliary_loss_clip": 0.011247, "auxiliary_loss_mlp": 0.01031296, "balance_loss_clip": 1.01896346, "balance_loss_mlp": 1.04152453, "epoch": 0.5319104163535248, "flos": 24681045070080.0, "grad_norm": 2.3504137272751593, "language_loss": 0.75338614, "learning_rate": 1.8000885627571072e-06, "loss": 0.77494609, "num_input_tokens_seen": 190298400, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7421875, "step": 8847, "time_per_iteration": 2.634521007537842 }, { "auxiliary_loss_clip": 0.0113189, "auxiliary_loss_mlp": 0.01030177, "balance_loss_clip": 1.01726687, "balance_loss_mlp": 1.04103172, "epoch": 0.5319705396061927, "flos": 19385513758080.0, "grad_norm": 2.9355955900006134, "language_loss": 0.87644786, "learning_rate": 1.7997126361760314e-06, "loss": 0.89806849, "num_input_tokens_seen": 190316235, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 8848, "time_per_iteration": 2.5130717754364014 }, { "auxiliary_loss_clip": 0.01144936, "auxiliary_loss_mlp": 0.01039059, "balance_loss_clip": 1.02365088, "balance_loss_mlp": 1.04250109, "epoch": 0.5320306628588607, "flos": 18259319664000.0, "grad_norm": 2.05909029632304, "language_loss": 0.7459501, "learning_rate": 1.7993367167426972e-06, "loss": 0.76778996, "num_input_tokens_seen": 190335060, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.7578125, "step": 8849, "time_per_iteration": 2.6111717224121094 }, { "auxiliary_loss_clip": 0.01125504, "auxiliary_loss_mlp": 0.01026978, "balance_loss_clip": 1.01286387, "balance_loss_mlp": 1.04002023, "epoch": 0.5320907861115286, "flos": 23732321097600.0, "grad_norm": 1.685672548496806, "language_loss": 0.79531151, "learning_rate": 1.7989608044705194e-06, "loss": 0.8168363, "num_input_tokens_seen": 190353265, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 8850, "time_per_iteration": 2.6387906074523926 }, { "auxiliary_loss_clip": 0.0111438, "auxiliary_loss_mlp": 0.01031366, "balance_loss_clip": 1.01783609, "balance_loss_mlp": 1.03986311, "epoch": 0.5321509093641966, "flos": 34495251321600.0, "grad_norm": 1.3629861409598598, "language_loss": 0.55031312, "learning_rate": 1.7985848993729124e-06, "loss": 0.57177055, "num_input_tokens_seen": 190376575, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.74609375, "step": 8851, "time_per_iteration": 2.653531074523926 }, { "auxiliary_loss_clip": 0.01127005, "auxiliary_loss_mlp": 0.0103114, "balance_loss_clip": 1.01786065, "balance_loss_mlp": 1.04145694, "epoch": 0.5322110326168645, "flos": 20010934391040.0, "grad_norm": 1.6043846937994746, "language_loss": 0.68322325, "learning_rate": 1.798209001463293e-06, "loss": 0.70480472, "num_input_tokens_seen": 190395185, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.765625, "step": 8852, "time_per_iteration": 2.4894461631774902 }, { "auxiliary_loss_clip": 0.0105305, "auxiliary_loss_mlp": 0.01248941, "balance_loss_clip": 1.00119674, "balance_loss_mlp": 1.01246095, "epoch": 0.5322711558695326, "flos": 64631164435200.0, "grad_norm": 0.8006754447477638, "language_loss": 0.6277194, "learning_rate": 1.797833110755075e-06, "loss": 0.65073931, "num_input_tokens_seen": 190452595, "router_z_loss_clip": 0.01623535, "router_z_loss_mlp": 0.234375, "step": 8853, "time_per_iteration": 3.148320436477661 }, { "auxiliary_loss_clip": 0.01124886, "auxiliary_loss_mlp": 0.01030804, "balance_loss_clip": 1.01699328, "balance_loss_mlp": 1.03977025, "epoch": 0.5323312791222005, "flos": 14939342421120.0, "grad_norm": 2.3557534587344575, "language_loss": 0.79641974, "learning_rate": 1.7974572272616736e-06, "loss": 0.81797659, "num_input_tokens_seen": 190469140, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 8854, "time_per_iteration": 2.4658117294311523 }, { "auxiliary_loss_clip": 0.01126816, "auxiliary_loss_mlp": 0.01028947, "balance_loss_clip": 1.01712728, "balance_loss_mlp": 1.03796029, "epoch": 0.5323914023748685, "flos": 23440834229760.0, "grad_norm": 2.1053496265998746, "language_loss": 0.73479319, "learning_rate": 1.7970813509965025e-06, "loss": 0.75635082, "num_input_tokens_seen": 190489015, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.70703125, "step": 8855, "time_per_iteration": 2.6935808658599854 }, { "auxiliary_loss_clip": 0.01130254, "auxiliary_loss_mlp": 0.01275225, "balance_loss_clip": 1.0161221, "balance_loss_mlp": 1.03929222, "epoch": 0.5324515256275365, "flos": 26286180134400.0, "grad_norm": 2.047826092170418, "language_loss": 0.6463967, "learning_rate": 1.796705481972976e-06, "loss": 0.67045152, "num_input_tokens_seen": 190508065, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.73046875, "step": 8856, "time_per_iteration": 2.625558614730835 }, { "auxiliary_loss_clip": 0.01138489, "auxiliary_loss_mlp": 0.01036533, "balance_loss_clip": 1.02167964, "balance_loss_mlp": 1.04331064, "epoch": 0.5325116488802044, "flos": 26870913636480.0, "grad_norm": 4.9288492647334845, "language_loss": 0.77588797, "learning_rate": 1.796329620204508e-06, "loss": 0.79763824, "num_input_tokens_seen": 190527045, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.77734375, "step": 8857, "time_per_iteration": 2.6055493354797363 }, { "auxiliary_loss_clip": 0.01141654, "auxiliary_loss_mlp": 0.0103491, "balance_loss_clip": 1.02095616, "balance_loss_mlp": 1.04068661, "epoch": 0.5325717721328724, "flos": 25884734757120.0, "grad_norm": 2.195319713557347, "language_loss": 0.7123422, "learning_rate": 1.7959537657045115e-06, "loss": 0.73410785, "num_input_tokens_seen": 190544075, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73828125, "step": 8858, "time_per_iteration": 2.594052791595459 }, { "auxiliary_loss_clip": 0.01127376, "auxiliary_loss_mlp": 0.01034544, "balance_loss_clip": 1.01983929, "balance_loss_mlp": 1.04304039, "epoch": 0.5326318953855403, "flos": 21799321666560.0, "grad_norm": 1.6565673710341418, "language_loss": 0.69401169, "learning_rate": 1.795577918486401e-06, "loss": 0.71563095, "num_input_tokens_seen": 190566030, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75, "step": 8859, "time_per_iteration": 2.641516923904419 }, { "auxiliary_loss_clip": 0.0111949, "auxiliary_loss_mlp": 0.01027087, "balance_loss_clip": 1.01535082, "balance_loss_mlp": 1.04143894, "epoch": 0.5326920186382084, "flos": 20922921728640.0, "grad_norm": 1.4682434104159894, "language_loss": 0.69861168, "learning_rate": 1.795202078563588e-06, "loss": 0.7200774, "num_input_tokens_seen": 190585605, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 8860, "time_per_iteration": 2.590507984161377 }, { "auxiliary_loss_clip": 0.01132337, "auxiliary_loss_mlp": 0.0103239, "balance_loss_clip": 1.01974845, "balance_loss_mlp": 1.04150176, "epoch": 0.5327521418908763, "flos": 21433427775360.0, "grad_norm": 1.8158402701478815, "language_loss": 0.77787811, "learning_rate": 1.7948262459494866e-06, "loss": 0.79952538, "num_input_tokens_seen": 190604625, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.73046875, "step": 8861, "time_per_iteration": 2.6393134593963623 }, { "auxiliary_loss_clip": 0.01150601, "auxiliary_loss_mlp": 0.01038578, "balance_loss_clip": 1.02435648, "balance_loss_mlp": 1.04115748, "epoch": 0.5328122651435443, "flos": 21760250647680.0, "grad_norm": 1.685340748912775, "language_loss": 0.85694575, "learning_rate": 1.794450420657509e-06, "loss": 0.87883753, "num_input_tokens_seen": 190625060, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7421875, "step": 8862, "time_per_iteration": 2.674189567565918 }, { "auxiliary_loss_clip": 0.01133714, "auxiliary_loss_mlp": 0.01037703, "balance_loss_clip": 1.0226227, "balance_loss_mlp": 1.0395813, "epoch": 0.5328723883962122, "flos": 18296487262080.0, "grad_norm": 2.09302694804879, "language_loss": 0.61629385, "learning_rate": 1.7940746027010664e-06, "loss": 0.638008, "num_input_tokens_seen": 190643150, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.765625, "step": 8863, "time_per_iteration": 3.920137643814087 }, { "auxiliary_loss_clip": 0.01133442, "auxiliary_loss_mlp": 0.0103385, "balance_loss_clip": 1.02152991, "balance_loss_mlp": 1.04351163, "epoch": 0.5329325116488802, "flos": 25374911068800.0, "grad_norm": 3.403100801316875, "language_loss": 0.73438579, "learning_rate": 1.793698792093572e-06, "loss": 0.75605869, "num_input_tokens_seen": 190662725, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71875, "step": 8864, "time_per_iteration": 2.7478296756744385 }, { "auxiliary_loss_clip": 0.01113356, "auxiliary_loss_mlp": 0.01035407, "balance_loss_clip": 1.0224607, "balance_loss_mlp": 1.04007936, "epoch": 0.5329926349015481, "flos": 25592098654080.0, "grad_norm": 3.262205178091583, "language_loss": 0.64412975, "learning_rate": 1.7933229888484367e-06, "loss": 0.66561735, "num_input_tokens_seen": 190683680, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 8865, "time_per_iteration": 2.5651209354400635 }, { "auxiliary_loss_clip": 0.01122792, "auxiliary_loss_mlp": 0.0103167, "balance_loss_clip": 1.01806211, "balance_loss_mlp": 1.03978324, "epoch": 0.5330527581542162, "flos": 22889605138560.0, "grad_norm": 2.004448927617124, "language_loss": 0.78440452, "learning_rate": 1.7929471929790726e-06, "loss": 0.80594915, "num_input_tokens_seen": 190703350, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 8866, "time_per_iteration": 2.603628635406494 }, { "auxiliary_loss_clip": 0.01119791, "auxiliary_loss_mlp": 0.01032411, "balance_loss_clip": 1.01950634, "balance_loss_mlp": 1.03822339, "epoch": 0.5331128814068841, "flos": 16026752805120.0, "grad_norm": 4.845843017570537, "language_loss": 0.73274541, "learning_rate": 1.7925714044988904e-06, "loss": 0.75426745, "num_input_tokens_seen": 190721170, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 8867, "time_per_iteration": 3.8514444828033447 }, { "auxiliary_loss_clip": 0.0113366, "auxiliary_loss_mlp": 0.0103997, "balance_loss_clip": 1.02525401, "balance_loss_mlp": 1.04035997, "epoch": 0.5331730046595521, "flos": 39344699629440.0, "grad_norm": 1.5538799670066712, "language_loss": 0.72111881, "learning_rate": 1.7921956234213011e-06, "loss": 0.74285519, "num_input_tokens_seen": 190743795, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7578125, "step": 8868, "time_per_iteration": 2.650583505630493 }, { "auxiliary_loss_clip": 0.01137837, "auxiliary_loss_mlp": 0.0103273, "balance_loss_clip": 1.01967609, "balance_loss_mlp": 1.03721189, "epoch": 0.5332331279122201, "flos": 24024382583040.0, "grad_norm": 1.6883378515642684, "language_loss": 0.78460872, "learning_rate": 1.7918198497597158e-06, "loss": 0.80631435, "num_input_tokens_seen": 190761560, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 8869, "time_per_iteration": 2.583324432373047 }, { "auxiliary_loss_clip": 0.01127286, "auxiliary_loss_mlp": 0.01039452, "balance_loss_clip": 1.02634525, "balance_loss_mlp": 1.04065406, "epoch": 0.533293251164888, "flos": 17129318728320.0, "grad_norm": 1.9458161034192742, "language_loss": 0.75526583, "learning_rate": 1.791444083527544e-06, "loss": 0.77693319, "num_input_tokens_seen": 190778875, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.78125, "step": 8870, "time_per_iteration": 2.5099527835845947 }, { "auxiliary_loss_clip": 0.01136722, "auxiliary_loss_mlp": 0.01034413, "balance_loss_clip": 1.01948857, "balance_loss_mlp": 1.04240346, "epoch": 0.533353374417556, "flos": 22126360020480.0, "grad_norm": 2.8011462752130636, "language_loss": 0.75793839, "learning_rate": 1.7910683247381968e-06, "loss": 0.77964973, "num_input_tokens_seen": 190799830, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.765625, "step": 8871, "time_per_iteration": 2.63668155670166 }, { "auxiliary_loss_clip": 0.01132528, "auxiliary_loss_mlp": 0.01031738, "balance_loss_clip": 1.01848793, "balance_loss_mlp": 1.0408752, "epoch": 0.533413497670224, "flos": 15011091838080.0, "grad_norm": 2.4070146276806677, "language_loss": 0.71897894, "learning_rate": 1.7906925734050837e-06, "loss": 0.74062163, "num_input_tokens_seen": 190817155, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 8872, "time_per_iteration": 2.545227289199829 }, { "auxiliary_loss_clip": 0.0113459, "auxiliary_loss_mlp": 0.01042136, "balance_loss_clip": 1.02830195, "balance_loss_mlp": 1.04367304, "epoch": 0.533473620922892, "flos": 19609955890560.0, "grad_norm": 1.7107764548324365, "language_loss": 0.64984244, "learning_rate": 1.7903168295416138e-06, "loss": 0.67160964, "num_input_tokens_seen": 190835240, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 8873, "time_per_iteration": 4.117762804031372 }, { "auxiliary_loss_clip": 0.01130627, "auxiliary_loss_mlp": 0.01038394, "balance_loss_clip": 1.02545989, "balance_loss_mlp": 1.0400672, "epoch": 0.5335337441755599, "flos": 14282644020480.0, "grad_norm": 2.173399823666832, "language_loss": 0.79820919, "learning_rate": 1.7899410931611972e-06, "loss": 0.81989944, "num_input_tokens_seen": 190851620, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 8874, "time_per_iteration": 4.128478765487671 }, { "auxiliary_loss_clip": 0.01122067, "auxiliary_loss_mlp": 0.01033471, "balance_loss_clip": 1.02026868, "balance_loss_mlp": 1.03997183, "epoch": 0.5335938674282279, "flos": 20814830726400.0, "grad_norm": 1.7344515462385433, "language_loss": 0.6986599, "learning_rate": 1.7895653642772425e-06, "loss": 0.72021532, "num_input_tokens_seen": 190870545, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 8875, "time_per_iteration": 2.5790178775787354 }, { "auxiliary_loss_clip": 0.0104027, "auxiliary_loss_mlp": 0.01016119, "balance_loss_clip": 1.01460481, "balance_loss_mlp": 1.01651669, "epoch": 0.5336539906808958, "flos": 71396448451200.0, "grad_norm": 0.7300376364398581, "language_loss": 0.59701502, "learning_rate": 1.789189642903159e-06, "loss": 0.61757886, "num_input_tokens_seen": 190931995, "router_z_loss_clip": 0.01513672, "router_z_loss_mlp": 0.23828125, "step": 8876, "time_per_iteration": 3.28798508644104 }, { "auxiliary_loss_clip": 0.01121303, "auxiliary_loss_mlp": 0.01032452, "balance_loss_clip": 1.01970863, "balance_loss_mlp": 1.0398078, "epoch": 0.5337141139335638, "flos": 20152996680960.0, "grad_norm": 1.8061313429367492, "language_loss": 0.74617147, "learning_rate": 1.7888139290523555e-06, "loss": 0.76770902, "num_input_tokens_seen": 190949890, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 8877, "time_per_iteration": 2.6064341068267822 }, { "auxiliary_loss_clip": 0.01048312, "auxiliary_loss_mlp": 0.01008062, "balance_loss_clip": 1.00636888, "balance_loss_mlp": 1.01588571, "epoch": 0.5337742371862317, "flos": 67728387484800.0, "grad_norm": 0.7509249394091166, "language_loss": 0.57168663, "learning_rate": 1.7884382227382384e-06, "loss": 0.59225047, "num_input_tokens_seen": 191008480, "router_z_loss_clip": 0.01696777, "router_z_loss_mlp": 0.23632812, "step": 8878, "time_per_iteration": 3.025916814804077 }, { "auxiliary_loss_clip": 0.01117154, "auxiliary_loss_mlp": 0.01028337, "balance_loss_clip": 1.01474714, "balance_loss_mlp": 1.04073286, "epoch": 0.5338343604388998, "flos": 25008909436800.0, "grad_norm": 1.6447725908114526, "language_loss": 0.72299623, "learning_rate": 1.7880625239742175e-06, "loss": 0.74445117, "num_input_tokens_seen": 191028995, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76171875, "step": 8879, "time_per_iteration": 2.5964982509613037 }, { "auxiliary_loss_clip": 0.01116096, "auxiliary_loss_mlp": 0.01031104, "balance_loss_clip": 1.01718044, "balance_loss_mlp": 1.04127061, "epoch": 0.5338944836915677, "flos": 17601256546560.0, "grad_norm": 2.125548253478279, "language_loss": 0.83246034, "learning_rate": 1.7876868327736995e-06, "loss": 0.85393232, "num_input_tokens_seen": 191045285, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.74609375, "step": 8880, "time_per_iteration": 2.4736883640289307 }, { "auxiliary_loss_clip": 0.01154743, "auxiliary_loss_mlp": 0.0102881, "balance_loss_clip": 1.01608443, "balance_loss_mlp": 1.03832281, "epoch": 0.5339546069442357, "flos": 21724124544000.0, "grad_norm": 1.4223737261427156, "language_loss": 0.79421639, "learning_rate": 1.7873111491500927e-06, "loss": 0.81605196, "num_input_tokens_seen": 191066105, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 8881, "time_per_iteration": 2.622321128845215 }, { "auxiliary_loss_clip": 0.01057081, "auxiliary_loss_mlp": 0.01004748, "balance_loss_clip": 1.00319791, "balance_loss_mlp": 1.01607776, "epoch": 0.5340147301969036, "flos": 69723583315200.0, "grad_norm": 0.7867487743260859, "language_loss": 0.59279251, "learning_rate": 1.7869354731168035e-06, "loss": 0.61341083, "num_input_tokens_seen": 191126315, "router_z_loss_clip": 0.01550293, "router_z_loss_mlp": 0.24023438, "step": 8882, "time_per_iteration": 3.094515562057495 }, { "auxiliary_loss_clip": 0.0105819, "auxiliary_loss_mlp": 0.01004588, "balance_loss_clip": 1.00313938, "balance_loss_mlp": 1.01620102, "epoch": 0.5340748534495716, "flos": 63880701580800.0, "grad_norm": 0.8645399154412156, "language_loss": 0.63654244, "learning_rate": 1.7865598046872396e-06, "loss": 0.6571703, "num_input_tokens_seen": 191174240, "router_z_loss_clip": 0.01446533, "router_z_loss_mlp": 0.23925781, "step": 8883, "time_per_iteration": 2.9882354736328125 }, { "auxiliary_loss_clip": 0.01152543, "auxiliary_loss_mlp": 0.01028817, "balance_loss_clip": 1.01500666, "balance_loss_mlp": 1.04065657, "epoch": 0.5341349767022396, "flos": 28694313694080.0, "grad_norm": 2.0326190646236024, "language_loss": 0.82137609, "learning_rate": 1.7861841438748073e-06, "loss": 0.84318972, "num_input_tokens_seen": 191193335, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 8884, "time_per_iteration": 2.7260117530822754 }, { "auxiliary_loss_clip": 0.01127674, "auxiliary_loss_mlp": 0.01029358, "balance_loss_clip": 1.01678109, "balance_loss_mlp": 1.04008937, "epoch": 0.5341950999549075, "flos": 16289691338880.0, "grad_norm": 1.5731823055845944, "language_loss": 0.72141916, "learning_rate": 1.7858084906929126e-06, "loss": 0.74298948, "num_input_tokens_seen": 191210900, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 8885, "time_per_iteration": 2.578179359436035 }, { "auxiliary_loss_clip": 0.01135886, "auxiliary_loss_mlp": 0.01030456, "balance_loss_clip": 1.01660371, "balance_loss_mlp": 1.04111981, "epoch": 0.5342552232075756, "flos": 14355650413440.0, "grad_norm": 1.9968619192290933, "language_loss": 0.78688073, "learning_rate": 1.785432845154962e-06, "loss": 0.8085441, "num_input_tokens_seen": 191226730, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 8886, "time_per_iteration": 2.609187602996826 }, { "auxiliary_loss_clip": 0.01140717, "auxiliary_loss_mlp": 0.01278488, "balance_loss_clip": 1.01706028, "balance_loss_mlp": 1.03959131, "epoch": 0.5343153464602435, "flos": 30297976300800.0, "grad_norm": 1.6340531837156194, "language_loss": 0.74922502, "learning_rate": 1.7850572072743611e-06, "loss": 0.77341706, "num_input_tokens_seen": 191250435, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.74609375, "step": 8887, "time_per_iteration": 2.712186336517334 }, { "auxiliary_loss_clip": 0.01128654, "auxiliary_loss_mlp": 0.01029866, "balance_loss_clip": 1.01695001, "balance_loss_mlp": 1.03832769, "epoch": 0.5343754697129115, "flos": 15596292216960.0, "grad_norm": 2.213757905295401, "language_loss": 0.69067037, "learning_rate": 1.7846815770645158e-06, "loss": 0.71225554, "num_input_tokens_seen": 191268315, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 8888, "time_per_iteration": 2.585627794265747 }, { "auxiliary_loss_clip": 0.01130052, "auxiliary_loss_mlp": 0.01033965, "balance_loss_clip": 1.02056026, "balance_loss_mlp": 1.04129541, "epoch": 0.5344355929655794, "flos": 16909617191040.0, "grad_norm": 3.2782374742816702, "language_loss": 0.77558994, "learning_rate": 1.7843059545388313e-06, "loss": 0.79723012, "num_input_tokens_seen": 191287000, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.796875, "step": 8889, "time_per_iteration": 2.539630651473999 }, { "auxiliary_loss_clip": 0.01114039, "auxiliary_loss_mlp": 0.01037435, "balance_loss_clip": 1.023803, "balance_loss_mlp": 1.04080486, "epoch": 0.5344957162182474, "flos": 16798186224000.0, "grad_norm": 1.86574086374147, "language_loss": 0.69258833, "learning_rate": 1.783930339710712e-06, "loss": 0.71410298, "num_input_tokens_seen": 191304565, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 8890, "time_per_iteration": 2.5505363941192627 }, { "auxiliary_loss_clip": 0.01118575, "auxiliary_loss_mlp": 0.01040284, "balance_loss_clip": 1.02522171, "balance_loss_mlp": 1.04076719, "epoch": 0.5345558394709153, "flos": 12705590413440.0, "grad_norm": 2.4414035515594312, "language_loss": 0.76874864, "learning_rate": 1.7835547325935633e-06, "loss": 0.79033732, "num_input_tokens_seen": 191318300, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.77734375, "step": 8891, "time_per_iteration": 2.490082025527954 }, { "auxiliary_loss_clip": 0.01118363, "auxiliary_loss_mlp": 0.01033524, "balance_loss_clip": 1.02140069, "balance_loss_mlp": 1.03857505, "epoch": 0.5346159627235834, "flos": 22455050400000.0, "grad_norm": 1.537369208113951, "language_loss": 0.74628329, "learning_rate": 1.7831791332007897e-06, "loss": 0.76780218, "num_input_tokens_seen": 191337925, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.70703125, "step": 8892, "time_per_iteration": 2.6653831005096436 }, { "auxiliary_loss_clip": 0.01130403, "auxiliary_loss_mlp": 0.01033126, "balance_loss_clip": 1.02024519, "balance_loss_mlp": 1.03884721, "epoch": 0.5346760859762513, "flos": 22415763899520.0, "grad_norm": 1.5149289814951532, "language_loss": 0.87721324, "learning_rate": 1.782803541545795e-06, "loss": 0.89884853, "num_input_tokens_seen": 191357120, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 8893, "time_per_iteration": 2.587378978729248 }, { "auxiliary_loss_clip": 0.01108623, "auxiliary_loss_mlp": 0.01030064, "balance_loss_clip": 1.01808369, "balance_loss_mlp": 1.03749323, "epoch": 0.5347362092289193, "flos": 22816131868800.0, "grad_norm": 1.473509371252771, "language_loss": 0.72785634, "learning_rate": 1.7824279576419832e-06, "loss": 0.74924326, "num_input_tokens_seen": 191375395, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.7109375, "step": 8894, "time_per_iteration": 2.63140869140625 }, { "auxiliary_loss_clip": 0.01120061, "auxiliary_loss_mlp": 0.01031159, "balance_loss_clip": 1.01787353, "balance_loss_mlp": 1.03892493, "epoch": 0.5347963324815872, "flos": 23219480666880.0, "grad_norm": 2.090838460314983, "language_loss": 0.74540716, "learning_rate": 1.7820523815027575e-06, "loss": 0.76691937, "num_input_tokens_seen": 191395595, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 8895, "time_per_iteration": 2.5972416400909424 }, { "auxiliary_loss_clip": 0.01128495, "auxiliary_loss_mlp": 0.01030354, "balance_loss_clip": 1.01748514, "balance_loss_mlp": 1.03799915, "epoch": 0.5348564557342552, "flos": 22601350494720.0, "grad_norm": 1.6412378209528655, "language_loss": 0.7652598, "learning_rate": 1.7816768131415221e-06, "loss": 0.78684831, "num_input_tokens_seen": 191413730, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 8896, "time_per_iteration": 2.680748462677002 }, { "auxiliary_loss_clip": 0.01128927, "auxiliary_loss_mlp": 0.01027913, "balance_loss_clip": 1.01490164, "balance_loss_mlp": 1.03943467, "epoch": 0.5349165789869232, "flos": 18002378701440.0, "grad_norm": 1.8974081422417939, "language_loss": 0.78461927, "learning_rate": 1.7813012525716794e-06, "loss": 0.80618763, "num_input_tokens_seen": 191432400, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 8897, "time_per_iteration": 2.60506010055542 }, { "auxiliary_loss_clip": 0.01123699, "auxiliary_loss_mlp": 0.0103097, "balance_loss_clip": 1.01784456, "balance_loss_mlp": 1.04061842, "epoch": 0.5349767022395912, "flos": 17159770483200.0, "grad_norm": 2.423848304005189, "language_loss": 0.75726342, "learning_rate": 1.7809256998066323e-06, "loss": 0.77881014, "num_input_tokens_seen": 191448855, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.74609375, "step": 8898, "time_per_iteration": 2.6795482635498047 }, { "auxiliary_loss_clip": 0.01049514, "auxiliary_loss_mlp": 0.01002836, "balance_loss_clip": 1.00138748, "balance_loss_mlp": 1.01614928, "epoch": 0.5350368254922592, "flos": 70992058158720.0, "grad_norm": 0.8442426370883044, "language_loss": 0.58004308, "learning_rate": 1.7805501548597842e-06, "loss": 0.60056657, "num_input_tokens_seen": 191519690, "router_z_loss_clip": 0.01446533, "router_z_loss_mlp": 0.2421875, "step": 8899, "time_per_iteration": 3.350546360015869 }, { "auxiliary_loss_clip": 0.01126992, "auxiliary_loss_mlp": 0.0103054, "balance_loss_clip": 1.0181129, "balance_loss_mlp": 1.03783059, "epoch": 0.5350969487449271, "flos": 27417833095680.0, "grad_norm": 1.6276997263814317, "language_loss": 0.6999144, "learning_rate": 1.7801746177445357e-06, "loss": 0.72148973, "num_input_tokens_seen": 191539380, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 8900, "time_per_iteration": 2.661454439163208 }, { "auxiliary_loss_clip": 0.01131674, "auxiliary_loss_mlp": 0.01031771, "balance_loss_clip": 1.01887882, "balance_loss_mlp": 1.03906226, "epoch": 0.5351570719975951, "flos": 19316134638720.0, "grad_norm": 1.63058967137789, "language_loss": 0.71435344, "learning_rate": 1.7797990884742901e-06, "loss": 0.73598784, "num_input_tokens_seen": 191557400, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.74609375, "step": 8901, "time_per_iteration": 2.6596767902374268 }, { "auxiliary_loss_clip": 0.01140789, "auxiliary_loss_mlp": 0.01032256, "balance_loss_clip": 1.01858258, "balance_loss_mlp": 1.03950477, "epoch": 0.535217195250263, "flos": 19828580019840.0, "grad_norm": 2.2056129768140718, "language_loss": 0.77739942, "learning_rate": 1.7794235670624482e-06, "loss": 0.79912984, "num_input_tokens_seen": 191575860, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 8902, "time_per_iteration": 2.5951218605041504 }, { "auxiliary_loss_clip": 0.01127546, "auxiliary_loss_mlp": 0.01034803, "balance_loss_clip": 1.02297115, "balance_loss_mlp": 1.0396843, "epoch": 0.535277318502931, "flos": 22127868391680.0, "grad_norm": 3.191450219177557, "language_loss": 0.6994406, "learning_rate": 1.7790480535224122e-06, "loss": 0.72106409, "num_input_tokens_seen": 191595775, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 8903, "time_per_iteration": 2.6866769790649414 }, { "auxiliary_loss_clip": 0.0112395, "auxiliary_loss_mlp": 0.01283125, "balance_loss_clip": 1.02300072, "balance_loss_mlp": 1.03974867, "epoch": 0.5353374417555989, "flos": 25045897466880.0, "grad_norm": 1.766013698744949, "language_loss": 0.72433305, "learning_rate": 1.7786725478675827e-06, "loss": 0.74840379, "num_input_tokens_seen": 191617785, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75390625, "step": 8904, "time_per_iteration": 2.6060829162597656 }, { "auxiliary_loss_clip": 0.01132718, "auxiliary_loss_mlp": 0.01039918, "balance_loss_clip": 1.02645946, "balance_loss_mlp": 1.04082179, "epoch": 0.535397565008267, "flos": 19388710068480.0, "grad_norm": 2.0013452973930375, "language_loss": 0.73595423, "learning_rate": 1.7782970501113606e-06, "loss": 0.75768054, "num_input_tokens_seen": 191636900, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 8905, "time_per_iteration": 3.9883599281311035 }, { "auxiliary_loss_clip": 0.01114886, "auxiliary_loss_mlp": 0.01035299, "balance_loss_clip": 1.02418256, "balance_loss_mlp": 1.03700328, "epoch": 0.5354576882609349, "flos": 21471205904640.0, "grad_norm": 1.6664597639521141, "language_loss": 0.83561337, "learning_rate": 1.7779215602671466e-06, "loss": 0.85711527, "num_input_tokens_seen": 191656720, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.69140625, "step": 8906, "time_per_iteration": 2.6467533111572266 }, { "auxiliary_loss_clip": 0.01124168, "auxiliary_loss_mlp": 0.0128402, "balance_loss_clip": 1.0235194, "balance_loss_mlp": 1.04063129, "epoch": 0.5355178115136029, "flos": 20777519473920.0, "grad_norm": 1.8763986821650664, "language_loss": 0.73681748, "learning_rate": 1.7775460783483412e-06, "loss": 0.76089942, "num_input_tokens_seen": 191674445, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 8907, "time_per_iteration": 2.6215176582336426 }, { "auxiliary_loss_clip": 0.01120941, "auxiliary_loss_mlp": 0.01034189, "balance_loss_clip": 1.02127242, "balance_loss_mlp": 1.0386256, "epoch": 0.5355779347662708, "flos": 23514020190720.0, "grad_norm": 1.8272530322553262, "language_loss": 0.76352972, "learning_rate": 1.7771706043683437e-06, "loss": 0.78508103, "num_input_tokens_seen": 191695000, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 8908, "time_per_iteration": 3.9802470207214355 }, { "auxiliary_loss_clip": 0.01131566, "auxiliary_loss_mlp": 0.01283058, "balance_loss_clip": 1.0230453, "balance_loss_mlp": 1.03858542, "epoch": 0.5356380580189388, "flos": 20303211358080.0, "grad_norm": 1.953800964771016, "language_loss": 0.74182022, "learning_rate": 1.7767951383405539e-06, "loss": 0.76596642, "num_input_tokens_seen": 191713295, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 8909, "time_per_iteration": 2.5427446365356445 }, { "auxiliary_loss_clip": 0.01140132, "auxiliary_loss_mlp": 0.0103794, "balance_loss_clip": 1.02569103, "balance_loss_mlp": 1.03998315, "epoch": 0.5356981812716068, "flos": 21361642444800.0, "grad_norm": 2.4807233661659547, "language_loss": 0.84025192, "learning_rate": 1.7764196802783717e-06, "loss": 0.86203253, "num_input_tokens_seen": 191732725, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.73046875, "step": 8910, "time_per_iteration": 2.645521640777588 }, { "auxiliary_loss_clip": 0.01130854, "auxiliary_loss_mlp": 0.01033898, "balance_loss_clip": 1.02055883, "balance_loss_mlp": 1.0381732, "epoch": 0.5357583045242748, "flos": 23111246010240.0, "grad_norm": 1.4540610005489643, "language_loss": 0.81313682, "learning_rate": 1.7760442301951962e-06, "loss": 0.83478439, "num_input_tokens_seen": 191753765, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 8911, "time_per_iteration": 2.5305848121643066 }, { "auxiliary_loss_clip": 0.01138085, "auxiliary_loss_mlp": 0.01037593, "balance_loss_clip": 1.02553523, "balance_loss_mlp": 1.03879499, "epoch": 0.5358184277769428, "flos": 21141761339520.0, "grad_norm": 2.013276490734868, "language_loss": 0.68725812, "learning_rate": 1.7756687881044255e-06, "loss": 0.70901489, "num_input_tokens_seen": 191773560, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.7265625, "step": 8912, "time_per_iteration": 2.5800888538360596 }, { "auxiliary_loss_clip": 0.01128762, "auxiliary_loss_mlp": 0.01034892, "balance_loss_clip": 1.02204752, "balance_loss_mlp": 1.0386281, "epoch": 0.5358785510296107, "flos": 16282400878080.0, "grad_norm": 1.9248839807045492, "language_loss": 0.71309137, "learning_rate": 1.7752933540194593e-06, "loss": 0.73472786, "num_input_tokens_seen": 191791255, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 8913, "time_per_iteration": 2.511784553527832 }, { "auxiliary_loss_clip": 0.01144699, "auxiliary_loss_mlp": 0.01033914, "balance_loss_clip": 1.02052712, "balance_loss_mlp": 1.04130805, "epoch": 0.5359386742822787, "flos": 16976877408000.0, "grad_norm": 1.6244242969045306, "language_loss": 0.72040945, "learning_rate": 1.7749179279536946e-06, "loss": 0.74219561, "num_input_tokens_seen": 191809325, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 8914, "time_per_iteration": 2.601043701171875 }, { "auxiliary_loss_clip": 0.01138831, "auxiliary_loss_mlp": 0.01041661, "balance_loss_clip": 1.02750504, "balance_loss_mlp": 1.04241669, "epoch": 0.5359987975349466, "flos": 20812927305600.0, "grad_norm": 1.820697167208082, "language_loss": 0.70368594, "learning_rate": 1.7745425099205305e-06, "loss": 0.72549087, "num_input_tokens_seen": 191829795, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78125, "step": 8915, "time_per_iteration": 4.806511878967285 }, { "auxiliary_loss_clip": 0.01129758, "auxiliary_loss_mlp": 0.01040263, "balance_loss_clip": 1.02679849, "balance_loss_mlp": 1.03936744, "epoch": 0.5360589207876146, "flos": 22199941031040.0, "grad_norm": 1.7561009532929237, "language_loss": 0.75105166, "learning_rate": 1.7741670999333645e-06, "loss": 0.77275193, "num_input_tokens_seen": 191850840, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 8916, "time_per_iteration": 2.610947608947754 }, { "auxiliary_loss_clip": 0.01150603, "auxiliary_loss_mlp": 0.01278848, "balance_loss_clip": 1.01891994, "balance_loss_mlp": 1.03960097, "epoch": 0.5361190440402825, "flos": 31394365084800.0, "grad_norm": 2.2724215801633254, "language_loss": 0.72610021, "learning_rate": 1.7737916980055932e-06, "loss": 0.7503947, "num_input_tokens_seen": 191869520, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75390625, "step": 8917, "time_per_iteration": 2.7147746086120605 }, { "auxiliary_loss_clip": 0.0113157, "auxiliary_loss_mlp": 0.01035471, "balance_loss_clip": 1.0223217, "balance_loss_mlp": 1.04126072, "epoch": 0.5361791672929506, "flos": 16069882060800.0, "grad_norm": 1.922521129657802, "language_loss": 0.71436155, "learning_rate": 1.7734163041506146e-06, "loss": 0.73603195, "num_input_tokens_seen": 191887240, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 8918, "time_per_iteration": 2.553438425064087 }, { "auxiliary_loss_clip": 0.01132685, "auxiliary_loss_mlp": 0.01035409, "balance_loss_clip": 1.02258193, "balance_loss_mlp": 1.04195654, "epoch": 0.5362392905456185, "flos": 20740926493440.0, "grad_norm": 1.4365362326529232, "language_loss": 0.75119579, "learning_rate": 1.773040918381825e-06, "loss": 0.77287674, "num_input_tokens_seen": 191905690, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73046875, "step": 8919, "time_per_iteration": 2.6597189903259277 }, { "auxiliary_loss_clip": 0.01131481, "auxiliary_loss_mlp": 0.01036622, "balance_loss_clip": 1.02390885, "balance_loss_mlp": 1.03880453, "epoch": 0.5362994137982865, "flos": 17340077779200.0, "grad_norm": 2.0192366006142413, "language_loss": 0.71252954, "learning_rate": 1.7726655407126219e-06, "loss": 0.73421061, "num_input_tokens_seen": 191920725, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.74609375, "step": 8920, "time_per_iteration": 2.5656731128692627 }, { "auxiliary_loss_clip": 0.01122808, "auxiliary_loss_mlp": 0.01032179, "balance_loss_clip": 1.01926231, "balance_loss_mlp": 1.04034007, "epoch": 0.5363595370509544, "flos": 42813957795840.0, "grad_norm": 2.0021768307994905, "language_loss": 0.68647838, "learning_rate": 1.7722901711564006e-06, "loss": 0.7080282, "num_input_tokens_seen": 191944645, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 8921, "time_per_iteration": 2.843339443206787 }, { "auxiliary_loss_clip": 0.01125786, "auxiliary_loss_mlp": 0.01033239, "balance_loss_clip": 1.02054358, "balance_loss_mlp": 1.04306066, "epoch": 0.5364196603036224, "flos": 19171953446400.0, "grad_norm": 3.8944780268699875, "language_loss": 0.8201741, "learning_rate": 1.7719148097265575e-06, "loss": 0.84176433, "num_input_tokens_seen": 191962265, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73828125, "step": 8922, "time_per_iteration": 2.6176247596740723 }, { "auxiliary_loss_clip": 0.01121928, "auxiliary_loss_mlp": 0.01028043, "balance_loss_clip": 1.01597309, "balance_loss_mlp": 1.03895664, "epoch": 0.5364797835562904, "flos": 17931060247680.0, "grad_norm": 2.6544311967748127, "language_loss": 0.76528329, "learning_rate": 1.771539456436488e-06, "loss": 0.78678304, "num_input_tokens_seen": 191978850, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.7421875, "step": 8923, "time_per_iteration": 2.502145290374756 }, { "auxiliary_loss_clip": 0.01137345, "auxiliary_loss_mlp": 0.01035964, "balance_loss_clip": 1.02241635, "balance_loss_mlp": 1.04226208, "epoch": 0.5365399068089584, "flos": 30228058477440.0, "grad_norm": 1.5843806867595052, "language_loss": 0.70156944, "learning_rate": 1.771164111299587e-06, "loss": 0.72330248, "num_input_tokens_seen": 192002000, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76953125, "step": 8924, "time_per_iteration": 2.6168196201324463 }, { "auxiliary_loss_clip": 0.01145083, "auxiliary_loss_mlp": 0.01036509, "balance_loss_clip": 1.02162623, "balance_loss_mlp": 1.04241586, "epoch": 0.5366000300616264, "flos": 24891696380160.0, "grad_norm": 1.8768397840842077, "language_loss": 0.86943316, "learning_rate": 1.770788774329251e-06, "loss": 0.89124912, "num_input_tokens_seen": 192019100, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.765625, "step": 8925, "time_per_iteration": 2.5770888328552246 }, { "auxiliary_loss_clip": 0.01114172, "auxiliary_loss_mlp": 0.01031444, "balance_loss_clip": 1.01852798, "balance_loss_mlp": 1.03855968, "epoch": 0.5366601533142943, "flos": 29826649013760.0, "grad_norm": 1.9809380752576278, "language_loss": 0.78228039, "learning_rate": 1.7704134455388737e-06, "loss": 0.80373651, "num_input_tokens_seen": 192041660, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7578125, "step": 8926, "time_per_iteration": 2.664010763168335 }, { "auxiliary_loss_clip": 0.01140837, "auxiliary_loss_mlp": 0.01030337, "balance_loss_clip": 1.01858306, "balance_loss_mlp": 1.04281509, "epoch": 0.5367202765669623, "flos": 27199352620800.0, "grad_norm": 1.6789340272119433, "language_loss": 0.66871309, "learning_rate": 1.77003812494185e-06, "loss": 0.69042486, "num_input_tokens_seen": 192063540, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.7109375, "step": 8927, "time_per_iteration": 2.6235034465789795 }, { "auxiliary_loss_clip": 0.01123121, "auxiliary_loss_mlp": 0.01028736, "balance_loss_clip": 1.01567698, "balance_loss_mlp": 1.03930473, "epoch": 0.5367803998196302, "flos": 20229953569920.0, "grad_norm": 1.6825657677866162, "language_loss": 0.73472977, "learning_rate": 1.7696628125515745e-06, "loss": 0.75624835, "num_input_tokens_seen": 192081760, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 8928, "time_per_iteration": 2.68682599067688 }, { "auxiliary_loss_clip": 0.01131396, "auxiliary_loss_mlp": 0.01030944, "balance_loss_clip": 1.01821291, "balance_loss_mlp": 1.0392406, "epoch": 0.5368405230722982, "flos": 32154629374080.0, "grad_norm": 1.7242241911564056, "language_loss": 0.63447058, "learning_rate": 1.76928750838144e-06, "loss": 0.65609401, "num_input_tokens_seen": 192101620, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.74609375, "step": 8929, "time_per_iteration": 2.626587390899658 }, { "auxiliary_loss_clip": 0.01124106, "auxiliary_loss_mlp": 0.01032271, "balance_loss_clip": 1.0193193, "balance_loss_mlp": 1.04026747, "epoch": 0.5369006463249661, "flos": 26247935128320.0, "grad_norm": 1.848621280978325, "language_loss": 0.66571057, "learning_rate": 1.7689122124448415e-06, "loss": 0.68727434, "num_input_tokens_seen": 192121805, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75, "step": 8930, "time_per_iteration": 2.657687187194824 }, { "auxiliary_loss_clip": 0.01145464, "auxiliary_loss_mlp": 0.01029942, "balance_loss_clip": 1.01641774, "balance_loss_mlp": 1.03805923, "epoch": 0.5369607695776342, "flos": 26211306234240.0, "grad_norm": 1.4351792092355729, "language_loss": 0.67128694, "learning_rate": 1.7685369247551712e-06, "loss": 0.69304103, "num_input_tokens_seen": 192141765, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 8931, "time_per_iteration": 2.6031651496887207 }, { "auxiliary_loss_clip": 0.01151238, "auxiliary_loss_mlp": 0.01031348, "balance_loss_clip": 1.01749015, "balance_loss_mlp": 1.04264045, "epoch": 0.5370208928303021, "flos": 25009017177600.0, "grad_norm": 1.5443799735823782, "language_loss": 0.75836968, "learning_rate": 1.768161645325823e-06, "loss": 0.78019553, "num_input_tokens_seen": 192161560, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 8932, "time_per_iteration": 2.6466591358184814 }, { "auxiliary_loss_clip": 0.01143512, "auxiliary_loss_mlp": 0.01035532, "balance_loss_clip": 1.02189493, "balance_loss_mlp": 1.04294741, "epoch": 0.5370810160829701, "flos": 31792147274880.0, "grad_norm": 2.319938014094419, "language_loss": 0.66056567, "learning_rate": 1.7677863741701892e-06, "loss": 0.68235612, "num_input_tokens_seen": 192180190, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 8933, "time_per_iteration": 2.592771291732788 }, { "auxiliary_loss_clip": 0.01121064, "auxiliary_loss_mlp": 0.01282781, "balance_loss_clip": 1.02368498, "balance_loss_mlp": 1.0414927, "epoch": 0.537141139335638, "flos": 23842602829440.0, "grad_norm": 1.5750372778716568, "language_loss": 0.82529151, "learning_rate": 1.767411111301662e-06, "loss": 0.84932995, "num_input_tokens_seen": 192198855, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 8934, "time_per_iteration": 2.5811767578125 }, { "auxiliary_loss_clip": 0.01137286, "auxiliary_loss_mlp": 0.01032374, "balance_loss_clip": 1.01922524, "balance_loss_mlp": 1.03915989, "epoch": 0.537201262588306, "flos": 18508826511360.0, "grad_norm": 1.7409928240447512, "language_loss": 0.79602599, "learning_rate": 1.7670358567336347e-06, "loss": 0.81772262, "num_input_tokens_seen": 192216555, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 8935, "time_per_iteration": 2.5191965103149414 }, { "auxiliary_loss_clip": 0.01122775, "auxiliary_loss_mlp": 0.0103668, "balance_loss_clip": 1.02330422, "balance_loss_mlp": 1.03965902, "epoch": 0.537261385840974, "flos": 25662950231040.0, "grad_norm": 1.8331730675558657, "language_loss": 0.83685333, "learning_rate": 1.766660610479498e-06, "loss": 0.85844785, "num_input_tokens_seen": 192236910, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 8936, "time_per_iteration": 2.6045589447021484 }, { "auxiliary_loss_clip": 0.0113917, "auxiliary_loss_mlp": 0.01030189, "balance_loss_clip": 1.01782703, "balance_loss_mlp": 1.04165292, "epoch": 0.537321509093642, "flos": 40735017406080.0, "grad_norm": 1.2571690636363937, "language_loss": 0.72971797, "learning_rate": 1.7662853725526443e-06, "loss": 0.75141156, "num_input_tokens_seen": 192260790, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7109375, "step": 8937, "time_per_iteration": 2.724682092666626 }, { "auxiliary_loss_clip": 0.01132842, "auxiliary_loss_mlp": 0.01030612, "balance_loss_clip": 1.01696825, "balance_loss_mlp": 1.03820419, "epoch": 0.53738163234631, "flos": 17238487138560.0, "grad_norm": 2.540448522901236, "language_loss": 0.8184638, "learning_rate": 1.7659101429664642e-06, "loss": 0.84009838, "num_input_tokens_seen": 192277230, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 8938, "time_per_iteration": 2.5685858726501465 }, { "auxiliary_loss_clip": 0.01118043, "auxiliary_loss_mlp": 0.01035187, "balance_loss_clip": 1.02231848, "balance_loss_mlp": 1.04052234, "epoch": 0.5374417555989779, "flos": 12821977457280.0, "grad_norm": 2.439234344707651, "language_loss": 0.80852628, "learning_rate": 1.7655349217343488e-06, "loss": 0.83005857, "num_input_tokens_seen": 192292840, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.77734375, "step": 8939, "time_per_iteration": 2.4872727394104004 }, { "auxiliary_loss_clip": 0.01067785, "auxiliary_loss_mlp": 0.01003107, "balance_loss_clip": 1.00167084, "balance_loss_mlp": 1.01712215, "epoch": 0.5375018788516459, "flos": 67256018703360.0, "grad_norm": 0.7033705739995938, "language_loss": 0.52480876, "learning_rate": 1.765159708869689e-06, "loss": 0.54551768, "num_input_tokens_seen": 192358240, "router_z_loss_clip": 0.01434326, "router_z_loss_mlp": 0.24023438, "step": 8940, "time_per_iteration": 3.2475533485412598 }, { "auxiliary_loss_clip": 0.01139633, "auxiliary_loss_mlp": 0.010275, "balance_loss_clip": 1.01493561, "balance_loss_mlp": 1.04072094, "epoch": 0.5375620021043138, "flos": 18114168804480.0, "grad_norm": 2.423338340177046, "language_loss": 0.71178865, "learning_rate": 1.764784504385875e-06, "loss": 0.73345995, "num_input_tokens_seen": 192377370, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 8941, "time_per_iteration": 2.785154342651367 }, { "auxiliary_loss_clip": 0.01128891, "auxiliary_loss_mlp": 0.01028137, "balance_loss_clip": 1.0161624, "balance_loss_mlp": 1.04040635, "epoch": 0.5376221253569818, "flos": 23149383275520.0, "grad_norm": 1.6503906198040255, "language_loss": 0.79377204, "learning_rate": 1.7644093082962969e-06, "loss": 0.81534231, "num_input_tokens_seen": 192396450, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.7109375, "step": 8942, "time_per_iteration": 2.5896599292755127 }, { "auxiliary_loss_clip": 0.01117413, "auxiliary_loss_mlp": 0.0103594, "balance_loss_clip": 1.02258873, "balance_loss_mlp": 1.04095137, "epoch": 0.5376822486096497, "flos": 29972302663680.0, "grad_norm": 2.346246375602564, "language_loss": 0.69940794, "learning_rate": 1.7640341206143452e-06, "loss": 0.72094148, "num_input_tokens_seen": 192417390, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 8943, "time_per_iteration": 2.6594343185424805 }, { "auxiliary_loss_clip": 0.01047628, "auxiliary_loss_mlp": 0.01004232, "balance_loss_clip": 1.00270581, "balance_loss_mlp": 1.01520157, "epoch": 0.5377423718623178, "flos": 54168950874240.0, "grad_norm": 0.8229207370975369, "language_loss": 0.59641707, "learning_rate": 1.763658941353408e-06, "loss": 0.61693561, "num_input_tokens_seen": 192478060, "router_z_loss_clip": 0.01525879, "router_z_loss_mlp": 0.23828125, "step": 8944, "time_per_iteration": 3.1535398960113525 }, { "auxiliary_loss_clip": 0.01154682, "auxiliary_loss_mlp": 0.01033974, "balance_loss_clip": 1.01979959, "balance_loss_mlp": 1.04094553, "epoch": 0.5378024951149857, "flos": 23257079228160.0, "grad_norm": 2.3267958015037515, "language_loss": 0.78492326, "learning_rate": 1.7632837705268758e-06, "loss": 0.80680978, "num_input_tokens_seen": 192495985, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78125, "step": 8945, "time_per_iteration": 2.6754045486450195 }, { "auxiliary_loss_clip": 0.01121955, "auxiliary_loss_mlp": 0.01035226, "balance_loss_clip": 1.02244651, "balance_loss_mlp": 1.03829896, "epoch": 0.5378626183676537, "flos": 24024095274240.0, "grad_norm": 1.7102947469385652, "language_loss": 0.68533242, "learning_rate": 1.7629086081481363e-06, "loss": 0.70690417, "num_input_tokens_seen": 192515445, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.75, "step": 8946, "time_per_iteration": 4.041644334793091 }, { "auxiliary_loss_clip": 0.01146575, "auxiliary_loss_mlp": 0.01039136, "balance_loss_clip": 1.02649379, "balance_loss_mlp": 1.03929114, "epoch": 0.5379227416203216, "flos": 27161789973120.0, "grad_norm": 1.6057070419855701, "language_loss": 0.77012873, "learning_rate": 1.7625334542305792e-06, "loss": 0.79198581, "num_input_tokens_seen": 192536530, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 8947, "time_per_iteration": 2.6420624256134033 }, { "auxiliary_loss_clip": 0.01120185, "auxiliary_loss_mlp": 0.01032443, "balance_loss_clip": 1.02023005, "balance_loss_mlp": 1.039222, "epoch": 0.5379828648729896, "flos": 24681619687680.0, "grad_norm": 1.5292348583050979, "language_loss": 0.59888953, "learning_rate": 1.762158308787592e-06, "loss": 0.62041581, "num_input_tokens_seen": 192556075, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71875, "step": 8948, "time_per_iteration": 2.6187222003936768 }, { "auxiliary_loss_clip": 0.01142874, "auxiliary_loss_mlp": 0.01030985, "balance_loss_clip": 1.01880157, "balance_loss_mlp": 1.04249656, "epoch": 0.5380429881256577, "flos": 22523280284160.0, "grad_norm": 1.5875578496316154, "language_loss": 0.79610169, "learning_rate": 1.7617831718325634e-06, "loss": 0.81784028, "num_input_tokens_seen": 192575535, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.734375, "step": 8949, "time_per_iteration": 2.611583948135376 }, { "auxiliary_loss_clip": 0.01137537, "auxiliary_loss_mlp": 0.0103546, "balance_loss_clip": 1.02291882, "balance_loss_mlp": 1.0403198, "epoch": 0.5381031113783256, "flos": 26979543342720.0, "grad_norm": 1.6716881077772492, "language_loss": 0.77522385, "learning_rate": 1.7614080433788802e-06, "loss": 0.79695374, "num_input_tokens_seen": 192594490, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 8950, "time_per_iteration": 4.045056343078613 }, { "auxiliary_loss_clip": 0.01129794, "auxiliary_loss_mlp": 0.01032281, "balance_loss_clip": 1.02066982, "balance_loss_mlp": 1.04102647, "epoch": 0.5381632346309936, "flos": 24754087376640.0, "grad_norm": 1.614545612025281, "language_loss": 0.72520506, "learning_rate": 1.7610329234399301e-06, "loss": 0.74682581, "num_input_tokens_seen": 192615650, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.7109375, "step": 8951, "time_per_iteration": 2.623640775680542 }, { "auxiliary_loss_clip": 0.01112213, "auxiliary_loss_mlp": 0.01275054, "balance_loss_clip": 1.01543915, "balance_loss_mlp": 1.03775668, "epoch": 0.5382233578836615, "flos": 15560058372480.0, "grad_norm": 1.7137609619887664, "language_loss": 0.75034899, "learning_rate": 1.7606578120291013e-06, "loss": 0.77422166, "num_input_tokens_seen": 192633840, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7421875, "step": 8952, "time_per_iteration": 2.5249035358428955 }, { "auxiliary_loss_clip": 0.01131481, "auxiliary_loss_mlp": 0.01028203, "balance_loss_clip": 1.01508427, "balance_loss_mlp": 1.04077506, "epoch": 0.5382834811363295, "flos": 25084501608960.0, "grad_norm": 1.567742515367377, "language_loss": 0.79622489, "learning_rate": 1.7602827091597785e-06, "loss": 0.81782174, "num_input_tokens_seen": 192655890, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 8953, "time_per_iteration": 2.667086124420166 }, { "auxiliary_loss_clip": 0.01137907, "auxiliary_loss_mlp": 0.0102785, "balance_loss_clip": 1.0150584, "balance_loss_mlp": 1.0405966, "epoch": 0.5383436043889974, "flos": 13297901685120.0, "grad_norm": 1.8103574643383389, "language_loss": 0.80703509, "learning_rate": 1.7599076148453496e-06, "loss": 0.82869267, "num_input_tokens_seen": 192673025, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 8954, "time_per_iteration": 2.5874617099761963 }, { "auxiliary_loss_clip": 0.01115576, "auxiliary_loss_mlp": 0.01030545, "balance_loss_clip": 1.01725352, "balance_loss_mlp": 1.04208016, "epoch": 0.5384037276416654, "flos": 23039388852480.0, "grad_norm": 2.1559514662685695, "language_loss": 0.76481104, "learning_rate": 1.7595325290992003e-06, "loss": 0.78627217, "num_input_tokens_seen": 192692190, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 8955, "time_per_iteration": 2.6676723957061768 }, { "auxiliary_loss_clip": 0.01129978, "auxiliary_loss_mlp": 0.01037178, "balance_loss_clip": 1.02421975, "balance_loss_mlp": 1.04067647, "epoch": 0.5384638508943334, "flos": 20631147552000.0, "grad_norm": 1.5368714347164807, "language_loss": 0.78113186, "learning_rate": 1.759157451934716e-06, "loss": 0.8028034, "num_input_tokens_seen": 192710380, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 8956, "time_per_iteration": 4.1733691692352295 }, { "auxiliary_loss_clip": 0.01054813, "auxiliary_loss_mlp": 0.01000749, "balance_loss_clip": 0.9992348, "balance_loss_mlp": 1.01296353, "epoch": 0.5385239741470014, "flos": 66737683491840.0, "grad_norm": 0.8630577178588452, "language_loss": 0.6348778, "learning_rate": 1.7587823833652833e-06, "loss": 0.65543342, "num_input_tokens_seen": 192768995, "router_z_loss_clip": 0.01513672, "router_z_loss_mlp": 0.23828125, "step": 8957, "time_per_iteration": 4.538283348083496 }, { "auxiliary_loss_clip": 0.01071742, "auxiliary_loss_mlp": 0.01001773, "balance_loss_clip": 1.0002234, "balance_loss_mlp": 1.01292539, "epoch": 0.5385840973996693, "flos": 64716058229760.0, "grad_norm": 0.7271863566864377, "language_loss": 0.51671511, "learning_rate": 1.7584073234042865e-06, "loss": 0.53745025, "num_input_tokens_seen": 192825585, "router_z_loss_clip": 0.01544189, "router_z_loss_mlp": 0.23828125, "step": 8958, "time_per_iteration": 3.2320196628570557 }, { "auxiliary_loss_clip": 0.01127432, "auxiliary_loss_mlp": 0.01030193, "balance_loss_clip": 1.01688969, "balance_loss_mlp": 1.04319859, "epoch": 0.5386442206523373, "flos": 26141783460480.0, "grad_norm": 1.8302668747106556, "language_loss": 0.77147603, "learning_rate": 1.7580322720651111e-06, "loss": 0.79305226, "num_input_tokens_seen": 192847335, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75390625, "step": 8959, "time_per_iteration": 2.632315158843994 }, { "auxiliary_loss_clip": 0.01148625, "auxiliary_loss_mlp": 0.01029717, "balance_loss_clip": 1.01693749, "balance_loss_mlp": 1.03858948, "epoch": 0.5387043439050052, "flos": 18251849635200.0, "grad_norm": 2.3531301943176754, "language_loss": 0.83677173, "learning_rate": 1.7576572293611413e-06, "loss": 0.85855508, "num_input_tokens_seen": 192862205, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.74609375, "step": 8960, "time_per_iteration": 2.6093568801879883 }, { "auxiliary_loss_clip": 0.01122135, "auxiliary_loss_mlp": 0.01030674, "balance_loss_clip": 1.01821053, "balance_loss_mlp": 1.04083622, "epoch": 0.5387644671576732, "flos": 29788296266880.0, "grad_norm": 1.5291491192386084, "language_loss": 0.78614628, "learning_rate": 1.7572821953057615e-06, "loss": 0.80767441, "num_input_tokens_seen": 192883695, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 8961, "time_per_iteration": 2.6960668563842773 }, { "auxiliary_loss_clip": 0.01139237, "auxiliary_loss_mlp": 0.01034699, "balance_loss_clip": 1.02214074, "balance_loss_mlp": 1.04011941, "epoch": 0.5388245904103413, "flos": 22374466237440.0, "grad_norm": 2.264814844878189, "language_loss": 0.8441608, "learning_rate": 1.7569071699123563e-06, "loss": 0.86590016, "num_input_tokens_seen": 192900190, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 8962, "time_per_iteration": 2.5626437664031982 }, { "auxiliary_loss_clip": 0.01053653, "auxiliary_loss_mlp": 0.01003831, "balance_loss_clip": 1.0024842, "balance_loss_mlp": 1.01242411, "epoch": 0.5388847136630092, "flos": 69807794751360.0, "grad_norm": 0.7299178489202303, "language_loss": 0.5414719, "learning_rate": 1.7565321531943082e-06, "loss": 0.56204677, "num_input_tokens_seen": 192958675, "router_z_loss_clip": 0.01348877, "router_z_loss_mlp": 0.23925781, "step": 8963, "time_per_iteration": 3.0943663120269775 }, { "auxiliary_loss_clip": 0.01054013, "auxiliary_loss_mlp": 0.01002455, "balance_loss_clip": 1.00106001, "balance_loss_mlp": 1.01262724, "epoch": 0.5389448369156772, "flos": 69822303845760.0, "grad_norm": 0.8002195268185467, "language_loss": 0.63307422, "learning_rate": 1.756157145165002e-06, "loss": 0.6536389, "num_input_tokens_seen": 193033135, "router_z_loss_clip": 0.01397705, "router_z_loss_mlp": 0.23925781, "step": 8964, "time_per_iteration": 3.259160280227661 }, { "auxiliary_loss_clip": 0.01161392, "auxiliary_loss_mlp": 0.01033233, "balance_loss_clip": 1.02000058, "balance_loss_mlp": 1.04001689, "epoch": 0.5390049601683451, "flos": 31722444933120.0, "grad_norm": 1.4430937527493302, "language_loss": 0.69895828, "learning_rate": 1.7557821458378197e-06, "loss": 0.72090447, "num_input_tokens_seen": 193055570, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.765625, "step": 8965, "time_per_iteration": 2.676685333251953 }, { "auxiliary_loss_clip": 0.01135636, "auxiliary_loss_mlp": 0.01279299, "balance_loss_clip": 1.0187453, "balance_loss_mlp": 1.04179621, "epoch": 0.5390650834210131, "flos": 18113486446080.0, "grad_norm": 2.091981363412146, "language_loss": 0.81984377, "learning_rate": 1.7554071552261442e-06, "loss": 0.84399307, "num_input_tokens_seen": 193073120, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7578125, "step": 8966, "time_per_iteration": 2.5168306827545166 }, { "auxiliary_loss_clip": 0.01113895, "auxiliary_loss_mlp": 0.0103418, "balance_loss_clip": 1.0204885, "balance_loss_mlp": 1.04183447, "epoch": 0.539125206673681, "flos": 17416711445760.0, "grad_norm": 3.5209795418090732, "language_loss": 0.720667, "learning_rate": 1.755032173343359e-06, "loss": 0.74214768, "num_input_tokens_seen": 193090105, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 8967, "time_per_iteration": 2.506854772567749 }, { "auxiliary_loss_clip": 0.01129205, "auxiliary_loss_mlp": 0.01033543, "balance_loss_clip": 1.02143097, "balance_loss_mlp": 1.03851461, "epoch": 0.539185329926349, "flos": 22198935450240.0, "grad_norm": 1.767345595634833, "language_loss": 0.81782675, "learning_rate": 1.7546572002028446e-06, "loss": 0.83945417, "num_input_tokens_seen": 193109325, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.73046875, "step": 8968, "time_per_iteration": 2.5537145137786865 }, { "auxiliary_loss_clip": 0.01139529, "auxiliary_loss_mlp": 0.01032816, "balance_loss_clip": 1.01954222, "balance_loss_mlp": 1.03910089, "epoch": 0.539245453179017, "flos": 21434397442560.0, "grad_norm": 1.547245824261693, "language_loss": 0.74096441, "learning_rate": 1.754282235817984e-06, "loss": 0.76268792, "num_input_tokens_seen": 193130595, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 8969, "time_per_iteration": 2.6044955253601074 }, { "auxiliary_loss_clip": 0.01142246, "auxiliary_loss_mlp": 0.01278037, "balance_loss_clip": 1.01753044, "balance_loss_mlp": 1.03897822, "epoch": 0.539305576431685, "flos": 20735000749440.0, "grad_norm": 1.7628976948372048, "language_loss": 0.82176763, "learning_rate": 1.753907280202158e-06, "loss": 0.84597045, "num_input_tokens_seen": 193148930, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.765625, "step": 8970, "time_per_iteration": 2.649996280670166 }, { "auxiliary_loss_clip": 0.0112758, "auxiliary_loss_mlp": 0.01032588, "balance_loss_clip": 1.02038121, "balance_loss_mlp": 1.03930688, "epoch": 0.5393656996843529, "flos": 30920452018560.0, "grad_norm": 1.3920681851066188, "language_loss": 0.75419056, "learning_rate": 1.7535323333687485e-06, "loss": 0.77579224, "num_input_tokens_seen": 193170140, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 8971, "time_per_iteration": 2.68819260597229 }, { "auxiliary_loss_clip": 0.01044611, "auxiliary_loss_mlp": 0.01000805, "balance_loss_clip": 0.99937421, "balance_loss_mlp": 1.01190889, "epoch": 0.5394258229370209, "flos": 50317781351040.0, "grad_norm": 0.875666822220868, "language_loss": 0.60291803, "learning_rate": 1.7531573953311358e-06, "loss": 0.6233722, "num_input_tokens_seen": 193227235, "router_z_loss_clip": 0.01428223, "router_z_loss_mlp": 0.23632812, "step": 8972, "time_per_iteration": 3.152282953262329 }, { "auxiliary_loss_clip": 0.0111939, "auxiliary_loss_mlp": 0.0103865, "balance_loss_clip": 1.02523255, "balance_loss_mlp": 1.03751814, "epoch": 0.5394859461896888, "flos": 25411935012480.0, "grad_norm": 1.421269098699104, "language_loss": 0.78278029, "learning_rate": 1.7527824661027007e-06, "loss": 0.80436069, "num_input_tokens_seen": 193248435, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 8973, "time_per_iteration": 2.5291898250579834 }, { "auxiliary_loss_clip": 0.01125749, "auxiliary_loss_mlp": 0.01038327, "balance_loss_clip": 1.02406371, "balance_loss_mlp": 1.03854275, "epoch": 0.5395460694423568, "flos": 25478476957440.0, "grad_norm": 2.2889085357225087, "language_loss": 0.74118173, "learning_rate": 1.7524075456968234e-06, "loss": 0.76282251, "num_input_tokens_seen": 193267490, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 8974, "time_per_iteration": 2.584960460662842 }, { "auxiliary_loss_clip": 0.0103326, "auxiliary_loss_mlp": 0.01001552, "balance_loss_clip": 1.00006235, "balance_loss_mlp": 1.00965512, "epoch": 0.5396061926950249, "flos": 53249493507840.0, "grad_norm": 0.7313160052019994, "language_loss": 0.51086104, "learning_rate": 1.7520326341268838e-06, "loss": 0.53120911, "num_input_tokens_seen": 193326050, "router_z_loss_clip": 0.01489258, "router_z_loss_mlp": 0.23632812, "step": 8975, "time_per_iteration": 3.1070008277893066 }, { "auxiliary_loss_clip": 0.01121384, "auxiliary_loss_mlp": 0.0103613, "balance_loss_clip": 1.02287364, "balance_loss_mlp": 1.040344, "epoch": 0.5396663159476928, "flos": 26725080418560.0, "grad_norm": 1.603878041951499, "language_loss": 0.71999872, "learning_rate": 1.7516577314062622e-06, "loss": 0.74157381, "num_input_tokens_seen": 193348785, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 8976, "time_per_iteration": 2.6900689601898193 }, { "auxiliary_loss_clip": 0.01121828, "auxiliary_loss_mlp": 0.01281669, "balance_loss_clip": 1.02231646, "balance_loss_mlp": 1.03952885, "epoch": 0.5397264392003608, "flos": 23253380127360.0, "grad_norm": 1.7597969778095222, "language_loss": 0.69835603, "learning_rate": 1.7512828375483371e-06, "loss": 0.72239101, "num_input_tokens_seen": 193367080, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 8977, "time_per_iteration": 2.525243043899536 }, { "auxiliary_loss_clip": 0.01159285, "auxiliary_loss_mlp": 0.01035605, "balance_loss_clip": 1.02192557, "balance_loss_mlp": 1.0412426, "epoch": 0.5397865624530287, "flos": 18294188791680.0, "grad_norm": 1.7244749733294888, "language_loss": 0.72655433, "learning_rate": 1.7509079525664875e-06, "loss": 0.74850321, "num_input_tokens_seen": 193383715, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 8978, "time_per_iteration": 2.5982017517089844 }, { "auxiliary_loss_clip": 0.0114759, "auxiliary_loss_mlp": 0.01038581, "balance_loss_clip": 1.02365637, "balance_loss_mlp": 1.04190123, "epoch": 0.5398466857056967, "flos": 15297514888320.0, "grad_norm": 1.9511584213031132, "language_loss": 0.73901713, "learning_rate": 1.7505330764740927e-06, "loss": 0.7608788, "num_input_tokens_seen": 193400560, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.78125, "step": 8979, "time_per_iteration": 2.5717709064483643 }, { "auxiliary_loss_clip": 0.011638, "auxiliary_loss_mlp": 0.01047096, "balance_loss_clip": 1.03365541, "balance_loss_mlp": 1.04183388, "epoch": 0.5399068089583646, "flos": 17821748183040.0, "grad_norm": 2.0654231835864194, "language_loss": 0.77170587, "learning_rate": 1.75015820928453e-06, "loss": 0.79381484, "num_input_tokens_seen": 193418680, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7734375, "step": 8980, "time_per_iteration": 2.662954568862915 }, { "auxiliary_loss_clip": 0.0111282, "auxiliary_loss_mlp": 0.01029674, "balance_loss_clip": 1.01754475, "balance_loss_mlp": 1.03862786, "epoch": 0.5399669322110326, "flos": 27381635164800.0, "grad_norm": 1.6605352726927018, "language_loss": 0.82011604, "learning_rate": 1.7497833510111787e-06, "loss": 0.84154105, "num_input_tokens_seen": 193439310, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7421875, "step": 8981, "time_per_iteration": 2.5892179012298584 }, { "auxiliary_loss_clip": 0.01123287, "auxiliary_loss_mlp": 0.01029884, "balance_loss_clip": 1.01628852, "balance_loss_mlp": 1.0381906, "epoch": 0.5400270554637006, "flos": 20449116403200.0, "grad_norm": 1.768491423177125, "language_loss": 0.66498119, "learning_rate": 1.7494085016674162e-06, "loss": 0.68651283, "num_input_tokens_seen": 193458115, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76171875, "step": 8982, "time_per_iteration": 2.625757932662964 }, { "auxiliary_loss_clip": 0.01140325, "auxiliary_loss_mlp": 0.01283201, "balance_loss_clip": 1.02416182, "balance_loss_mlp": 1.04025722, "epoch": 0.5400871787163686, "flos": 21689578638720.0, "grad_norm": 2.316631761241538, "language_loss": 0.82724071, "learning_rate": 1.7490336612666196e-06, "loss": 0.85147595, "num_input_tokens_seen": 193477365, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.73046875, "step": 8983, "time_per_iteration": 2.7287745475769043 }, { "auxiliary_loss_clip": 0.01146961, "auxiliary_loss_mlp": 0.01038979, "balance_loss_clip": 1.0249958, "balance_loss_mlp": 1.03858519, "epoch": 0.5401473019690365, "flos": 19204739585280.0, "grad_norm": 1.801223439362752, "language_loss": 0.70433056, "learning_rate": 1.748658829822166e-06, "loss": 0.72618991, "num_input_tokens_seen": 193495595, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7265625, "step": 8984, "time_per_iteration": 2.5870978832244873 }, { "auxiliary_loss_clip": 0.01117508, "auxiliary_loss_mlp": 0.010328, "balance_loss_clip": 1.01803005, "balance_loss_mlp": 1.0406003, "epoch": 0.5402074252217045, "flos": 20627376624000.0, "grad_norm": 1.7358380175652155, "language_loss": 0.8002553, "learning_rate": 1.748284007347432e-06, "loss": 0.82175839, "num_input_tokens_seen": 193514035, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.76953125, "step": 8985, "time_per_iteration": 2.5354156494140625 }, { "auxiliary_loss_clip": 0.01134126, "auxiliary_loss_mlp": 0.01027692, "balance_loss_clip": 1.01472783, "balance_loss_mlp": 1.03837252, "epoch": 0.5402675484743724, "flos": 24973465691520.0, "grad_norm": 1.9914683885630426, "language_loss": 0.78732377, "learning_rate": 1.7479091938557945e-06, "loss": 0.80894196, "num_input_tokens_seen": 193535445, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 8986, "time_per_iteration": 2.6408329010009766 }, { "auxiliary_loss_clip": 0.01120231, "auxiliary_loss_mlp": 0.01031621, "balance_loss_clip": 1.01934838, "balance_loss_mlp": 1.03777671, "epoch": 0.5403276717270404, "flos": 19459022941440.0, "grad_norm": 1.585789999032829, "language_loss": 0.76739025, "learning_rate": 1.7475343893606293e-06, "loss": 0.78890872, "num_input_tokens_seen": 193554780, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.734375, "step": 8987, "time_per_iteration": 2.5906848907470703 }, { "auxiliary_loss_clip": 0.01129898, "auxiliary_loss_mlp": 0.01031201, "balance_loss_clip": 1.01785588, "balance_loss_mlp": 1.03861785, "epoch": 0.5403877949797083, "flos": 18442140912000.0, "grad_norm": 1.9133327588010527, "language_loss": 0.70959401, "learning_rate": 1.747159593875312e-06, "loss": 0.73120505, "num_input_tokens_seen": 193573580, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 8988, "time_per_iteration": 3.9288995265960693 }, { "auxiliary_loss_clip": 0.01147155, "auxiliary_loss_mlp": 0.01036263, "balance_loss_clip": 1.02264905, "balance_loss_mlp": 1.03791213, "epoch": 0.5404479182323764, "flos": 28292868316800.0, "grad_norm": 3.5401613065307416, "language_loss": 0.66971231, "learning_rate": 1.746784807413219e-06, "loss": 0.69154656, "num_input_tokens_seen": 193590490, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 8989, "time_per_iteration": 2.625823497772217 }, { "auxiliary_loss_clip": 0.01123877, "auxiliary_loss_mlp": 0.01037692, "balance_loss_clip": 1.02406001, "balance_loss_mlp": 1.04000282, "epoch": 0.5405080414850444, "flos": 23367325046400.0, "grad_norm": 1.5454745638664786, "language_loss": 0.77793485, "learning_rate": 1.7464100299877243e-06, "loss": 0.79955053, "num_input_tokens_seen": 193609900, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 8990, "time_per_iteration": 2.533130168914795 }, { "auxiliary_loss_clip": 0.01117156, "auxiliary_loss_mlp": 0.01025438, "balance_loss_clip": 1.01347589, "balance_loss_mlp": 1.03660524, "epoch": 0.5405681647377123, "flos": 21106425335040.0, "grad_norm": 1.7770435271693354, "language_loss": 0.69002414, "learning_rate": 1.7460352616122039e-06, "loss": 0.71145004, "num_input_tokens_seen": 193629775, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.71875, "step": 8991, "time_per_iteration": 2.5477309226989746 }, { "auxiliary_loss_clip": 0.01156401, "auxiliary_loss_mlp": 0.01030041, "balance_loss_clip": 1.0169394, "balance_loss_mlp": 1.03788936, "epoch": 0.5406282879903803, "flos": 20449188230400.0, "grad_norm": 1.7230918659339334, "language_loss": 0.762371, "learning_rate": 1.745660502300031e-06, "loss": 0.78423542, "num_input_tokens_seen": 193648070, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 8992, "time_per_iteration": 4.027724266052246 }, { "auxiliary_loss_clip": 0.01132146, "auxiliary_loss_mlp": 0.01030859, "balance_loss_clip": 1.01772773, "balance_loss_mlp": 1.03829992, "epoch": 0.5406884112430482, "flos": 14209493973120.0, "grad_norm": 3.0381372721213715, "language_loss": 0.76376581, "learning_rate": 1.7452857520645812e-06, "loss": 0.78539592, "num_input_tokens_seen": 193665060, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7578125, "step": 8993, "time_per_iteration": 2.526089906692505 }, { "auxiliary_loss_clip": 0.01117912, "auxiliary_loss_mlp": 0.0103105, "balance_loss_clip": 1.01836586, "balance_loss_mlp": 1.03767109, "epoch": 0.5407485344957162, "flos": 23875568536320.0, "grad_norm": 1.7387150034559593, "language_loss": 0.70411289, "learning_rate": 1.7449110109192278e-06, "loss": 0.72560251, "num_input_tokens_seen": 193683620, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 8994, "time_per_iteration": 2.6832151412963867 }, { "auxiliary_loss_clip": 0.01123048, "auxiliary_loss_mlp": 0.01033224, "balance_loss_clip": 1.01971161, "balance_loss_mlp": 1.0388782, "epoch": 0.5408086577483842, "flos": 23148485435520.0, "grad_norm": 2.0197055598374045, "language_loss": 0.75159067, "learning_rate": 1.7445362788773435e-06, "loss": 0.77315342, "num_input_tokens_seen": 193702990, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 8995, "time_per_iteration": 2.5508944988250732 }, { "auxiliary_loss_clip": 0.0111887, "auxiliary_loss_mlp": 0.01032286, "balance_loss_clip": 1.02047229, "balance_loss_mlp": 1.03867078, "epoch": 0.5408687810010522, "flos": 18771046773120.0, "grad_norm": 1.762096322002008, "language_loss": 0.7337178, "learning_rate": 1.7441615559523028e-06, "loss": 0.75522935, "num_input_tokens_seen": 193721785, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.7109375, "step": 8996, "time_per_iteration": 2.5930066108703613 }, { "auxiliary_loss_clip": 0.0114452, "auxiliary_loss_mlp": 0.0103271, "balance_loss_clip": 1.02048528, "balance_loss_mlp": 1.03912723, "epoch": 0.5409289042537201, "flos": 13881557779200.0, "grad_norm": 1.7916061582916725, "language_loss": 0.72656047, "learning_rate": 1.7437868421574783e-06, "loss": 0.7483328, "num_input_tokens_seen": 193740315, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 8997, "time_per_iteration": 2.5208899974823 }, { "auxiliary_loss_clip": 0.01112923, "auxiliary_loss_mlp": 0.01032539, "balance_loss_clip": 1.01962876, "balance_loss_mlp": 1.03942728, "epoch": 0.5409890275063881, "flos": 14465357527680.0, "grad_norm": 2.1112880975535435, "language_loss": 0.71726978, "learning_rate": 1.7434121375062424e-06, "loss": 0.73872441, "num_input_tokens_seen": 193757580, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 8998, "time_per_iteration": 3.9552266597747803 }, { "auxiliary_loss_clip": 0.01135278, "auxiliary_loss_mlp": 0.01036761, "balance_loss_clip": 1.0235343, "balance_loss_mlp": 1.03767955, "epoch": 0.541049150759056, "flos": 48977449349760.0, "grad_norm": 1.4222722746264338, "language_loss": 0.70555425, "learning_rate": 1.7430374420119668e-06, "loss": 0.72727466, "num_input_tokens_seen": 193780965, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 8999, "time_per_iteration": 4.384413719177246 }, { "auxiliary_loss_clip": 0.01129849, "auxiliary_loss_mlp": 0.0102837, "balance_loss_clip": 1.01479793, "balance_loss_mlp": 1.04009628, "epoch": 0.541109274011724, "flos": 18147601388160.0, "grad_norm": 3.417975438194879, "language_loss": 0.80014145, "learning_rate": 1.7426627556880238e-06, "loss": 0.82172364, "num_input_tokens_seen": 193797855, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71484375, "step": 9000, "time_per_iteration": 2.5654163360595703 }, { "auxiliary_loss_clip": 0.01146506, "auxiliary_loss_mlp": 0.01030184, "balance_loss_clip": 1.01704741, "balance_loss_mlp": 1.03905916, "epoch": 0.541169397264392, "flos": 20522553759360.0, "grad_norm": 2.0523427520475197, "language_loss": 0.72693729, "learning_rate": 1.7422880785477855e-06, "loss": 0.7487042, "num_input_tokens_seen": 193817375, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 9001, "time_per_iteration": 2.6277921199798584 }, { "auxiliary_loss_clip": 0.01143217, "auxiliary_loss_mlp": 0.0103572, "balance_loss_clip": 1.02156341, "balance_loss_mlp": 1.03988433, "epoch": 0.54122952051706, "flos": 20044043752320.0, "grad_norm": 2.2276016548899826, "language_loss": 0.85114217, "learning_rate": 1.7419134106046224e-06, "loss": 0.87293154, "num_input_tokens_seen": 193832205, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 9002, "time_per_iteration": 2.546907663345337 }, { "auxiliary_loss_clip": 0.01145846, "auxiliary_loss_mlp": 0.01030777, "balance_loss_clip": 1.01935065, "balance_loss_mlp": 1.03965163, "epoch": 0.541289643769728, "flos": 19062246332160.0, "grad_norm": 2.1097533976086016, "language_loss": 0.78216469, "learning_rate": 1.7415387518719063e-06, "loss": 0.80393094, "num_input_tokens_seen": 193849830, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.7109375, "step": 9003, "time_per_iteration": 2.610776662826538 }, { "auxiliary_loss_clip": 0.0111918, "auxiliary_loss_mlp": 0.01033361, "balance_loss_clip": 1.01972401, "balance_loss_mlp": 1.0382266, "epoch": 0.5413497670223959, "flos": 22382295402240.0, "grad_norm": 2.247172123044415, "language_loss": 0.69087225, "learning_rate": 1.741164102363007e-06, "loss": 0.71239763, "num_input_tokens_seen": 193869945, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 9004, "time_per_iteration": 2.5911011695861816 }, { "auxiliary_loss_clip": 0.01043766, "auxiliary_loss_mlp": 0.01002034, "balance_loss_clip": 1.00058556, "balance_loss_mlp": 1.01106858, "epoch": 0.5414098902750639, "flos": 70031734093440.0, "grad_norm": 0.9463467420477415, "language_loss": 0.59062994, "learning_rate": 1.740789462091295e-06, "loss": 0.61108792, "num_input_tokens_seen": 193930860, "router_z_loss_clip": 0.01446533, "router_z_loss_mlp": 0.24023438, "step": 9005, "time_per_iteration": 3.2438292503356934 }, { "auxiliary_loss_clip": 0.01116727, "auxiliary_loss_mlp": 0.01277855, "balance_loss_clip": 1.01886594, "balance_loss_mlp": 1.04249763, "epoch": 0.5414700135277318, "flos": 21798962530560.0, "grad_norm": 1.6507726114481265, "language_loss": 0.77911675, "learning_rate": 1.7404148310701405e-06, "loss": 0.80306256, "num_input_tokens_seen": 193949075, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7421875, "step": 9006, "time_per_iteration": 2.593378782272339 }, { "auxiliary_loss_clip": 0.0112859, "auxiliary_loss_mlp": 0.01034516, "balance_loss_clip": 1.0226959, "balance_loss_mlp": 1.03924274, "epoch": 0.5415301367803999, "flos": 16907929251840.0, "grad_norm": 1.7202752758087896, "language_loss": 0.83175337, "learning_rate": 1.7400402093129125e-06, "loss": 0.85338449, "num_input_tokens_seen": 193967630, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.71484375, "step": 9007, "time_per_iteration": 2.5834615230560303 }, { "auxiliary_loss_clip": 0.01124567, "auxiliary_loss_mlp": 0.01033711, "balance_loss_clip": 1.01992989, "balance_loss_mlp": 1.04149044, "epoch": 0.5415902600330678, "flos": 25704176065920.0, "grad_norm": 1.8322715316238123, "language_loss": 0.66936886, "learning_rate": 1.7396655968329813e-06, "loss": 0.6909517, "num_input_tokens_seen": 193988730, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73828125, "step": 9008, "time_per_iteration": 2.70639705657959 }, { "auxiliary_loss_clip": 0.01119036, "auxiliary_loss_mlp": 0.01033351, "balance_loss_clip": 1.01861703, "balance_loss_mlp": 1.04149663, "epoch": 0.5416503832857358, "flos": 19208151377280.0, "grad_norm": 2.10641803744804, "language_loss": 0.73776543, "learning_rate": 1.7392909936437152e-06, "loss": 0.75928932, "num_input_tokens_seen": 194005160, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.77734375, "step": 9009, "time_per_iteration": 2.5612714290618896 }, { "auxiliary_loss_clip": 0.01150438, "auxiliary_loss_mlp": 0.01035867, "balance_loss_clip": 1.02228308, "balance_loss_mlp": 1.03999949, "epoch": 0.5417105065384037, "flos": 12713706887040.0, "grad_norm": 3.309860806703195, "language_loss": 0.8763963, "learning_rate": 1.7389163997584825e-06, "loss": 0.89825934, "num_input_tokens_seen": 194021700, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75, "step": 9010, "time_per_iteration": 2.6136250495910645 }, { "auxiliary_loss_clip": 0.01112691, "auxiliary_loss_mlp": 0.0104065, "balance_loss_clip": 1.02775168, "balance_loss_mlp": 1.0395, "epoch": 0.5417706297910717, "flos": 30335933998080.0, "grad_norm": 1.836831225619865, "language_loss": 0.64947814, "learning_rate": 1.7385418151906524e-06, "loss": 0.67101157, "num_input_tokens_seen": 194042620, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 9011, "time_per_iteration": 2.5407917499542236 }, { "auxiliary_loss_clip": 0.01124722, "auxiliary_loss_mlp": 0.01036399, "balance_loss_clip": 1.02347696, "balance_loss_mlp": 1.04237485, "epoch": 0.5418307530437396, "flos": 29020992912000.0, "grad_norm": 2.1808864476387955, "language_loss": 0.79055363, "learning_rate": 1.7381672399535918e-06, "loss": 0.8121649, "num_input_tokens_seen": 194061800, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 9012, "time_per_iteration": 2.575674295425415 }, { "auxiliary_loss_clip": 0.01113815, "auxiliary_loss_mlp": 0.01031957, "balance_loss_clip": 1.018767, "balance_loss_mlp": 1.03968549, "epoch": 0.5418908762964076, "flos": 16873455173760.0, "grad_norm": 1.9491239067123365, "language_loss": 0.74368286, "learning_rate": 1.73779267406067e-06, "loss": 0.76514053, "num_input_tokens_seen": 194079890, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 9013, "time_per_iteration": 2.4741902351379395 }, { "auxiliary_loss_clip": 0.0114207, "auxiliary_loss_mlp": 0.01035316, "balance_loss_clip": 1.02198792, "balance_loss_mlp": 1.03949547, "epoch": 0.5419509995490756, "flos": 18949702043520.0, "grad_norm": 2.460106396661865, "language_loss": 0.72733885, "learning_rate": 1.7374181175252522e-06, "loss": 0.74911273, "num_input_tokens_seen": 194097625, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 9014, "time_per_iteration": 2.570540189743042 }, { "auxiliary_loss_clip": 0.0113151, "auxiliary_loss_mlp": 0.01032974, "balance_loss_clip": 1.01980138, "balance_loss_mlp": 1.0398674, "epoch": 0.5420111228017436, "flos": 18077719478400.0, "grad_norm": 1.5470074235631548, "language_loss": 0.80274653, "learning_rate": 1.7370435703607068e-06, "loss": 0.82439137, "num_input_tokens_seen": 194116055, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 9015, "time_per_iteration": 2.501988649368286 }, { "auxiliary_loss_clip": 0.01126421, "auxiliary_loss_mlp": 0.01038982, "balance_loss_clip": 1.02540445, "balance_loss_mlp": 1.04104996, "epoch": 0.5420712460544116, "flos": 19061779455360.0, "grad_norm": 2.0467569712387403, "language_loss": 0.80907309, "learning_rate": 1.7366690325803998e-06, "loss": 0.83072722, "num_input_tokens_seen": 194130365, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.765625, "step": 9016, "time_per_iteration": 2.6157310009002686 }, { "auxiliary_loss_clip": 0.01112229, "auxiliary_loss_mlp": 0.01031394, "balance_loss_clip": 1.01819205, "balance_loss_mlp": 1.03920758, "epoch": 0.5421313693070795, "flos": 18187103370240.0, "grad_norm": 1.5866056599625922, "language_loss": 0.81327331, "learning_rate": 1.7362945041976972e-06, "loss": 0.83470953, "num_input_tokens_seen": 194148975, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73046875, "step": 9017, "time_per_iteration": 2.4716739654541016 }, { "auxiliary_loss_clip": 0.0111165, "auxiliary_loss_mlp": 0.01037252, "balance_loss_clip": 1.02382898, "balance_loss_mlp": 1.03979015, "epoch": 0.5421914925597475, "flos": 13005947940480.0, "grad_norm": 1.6311441808495393, "language_loss": 0.77751052, "learning_rate": 1.7359199852259663e-06, "loss": 0.79899955, "num_input_tokens_seen": 194167185, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 9018, "time_per_iteration": 2.5278055667877197 }, { "auxiliary_loss_clip": 0.011125, "auxiliary_loss_mlp": 0.01039057, "balance_loss_clip": 1.02518106, "balance_loss_mlp": 1.03667951, "epoch": 0.5422516158124154, "flos": 46758457831680.0, "grad_norm": 1.701420601785434, "language_loss": 0.66291505, "learning_rate": 1.735545475678571e-06, "loss": 0.6844306, "num_input_tokens_seen": 194192840, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 9019, "time_per_iteration": 2.750391721725464 }, { "auxiliary_loss_clip": 0.01118499, "auxiliary_loss_mlp": 0.01027808, "balance_loss_clip": 1.01574981, "balance_loss_mlp": 1.03851092, "epoch": 0.5423117390650835, "flos": 31758642864000.0, "grad_norm": 1.6992134807840897, "language_loss": 0.69629455, "learning_rate": 1.735170975568878e-06, "loss": 0.71775752, "num_input_tokens_seen": 194213150, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 9020, "time_per_iteration": 2.6317224502563477 }, { "auxiliary_loss_clip": 0.01134239, "auxiliary_loss_mlp": 0.01038091, "balance_loss_clip": 1.02566385, "balance_loss_mlp": 1.03817225, "epoch": 0.5423718623177514, "flos": 27201974313600.0, "grad_norm": 1.4419839640115042, "language_loss": 0.80405074, "learning_rate": 1.7347964849102517e-06, "loss": 0.82577407, "num_input_tokens_seen": 194234665, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 9021, "time_per_iteration": 2.6868093013763428 }, { "auxiliary_loss_clip": 0.01148473, "auxiliary_loss_mlp": 0.01034442, "balance_loss_clip": 1.02146578, "balance_loss_mlp": 1.04063034, "epoch": 0.5424319855704194, "flos": 23546447193600.0, "grad_norm": 1.5541727622861992, "language_loss": 0.7880038, "learning_rate": 1.734422003716056e-06, "loss": 0.80983293, "num_input_tokens_seen": 194253790, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 9022, "time_per_iteration": 2.572622060775757 }, { "auxiliary_loss_clip": 0.01130629, "auxiliary_loss_mlp": 0.01286139, "balance_loss_clip": 1.02697372, "balance_loss_mlp": 1.03931844, "epoch": 0.5424921088230873, "flos": 26615624699520.0, "grad_norm": 1.699450353995961, "language_loss": 0.82136625, "learning_rate": 1.7340475319996564e-06, "loss": 0.84553391, "num_input_tokens_seen": 194274950, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.73046875, "step": 9023, "time_per_iteration": 2.6414895057678223 }, { "auxiliary_loss_clip": 0.01120928, "auxiliary_loss_mlp": 0.01032392, "balance_loss_clip": 1.01977348, "balance_loss_mlp": 1.03943503, "epoch": 0.5425522320757553, "flos": 23586811102080.0, "grad_norm": 1.561140377192307, "language_loss": 0.71338332, "learning_rate": 1.733673069774416e-06, "loss": 0.73491657, "num_input_tokens_seen": 194296155, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 9024, "time_per_iteration": 2.629605770111084 }, { "auxiliary_loss_clip": 0.01121404, "auxiliary_loss_mlp": 0.01033839, "balance_loss_clip": 1.02141702, "balance_loss_mlp": 1.03853965, "epoch": 0.5426123553284232, "flos": 30592264429440.0, "grad_norm": 1.7142264540272742, "language_loss": 0.65573347, "learning_rate": 1.7332986170536987e-06, "loss": 0.67728585, "num_input_tokens_seen": 194318025, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7421875, "step": 9025, "time_per_iteration": 2.748445749282837 }, { "auxiliary_loss_clip": 0.01142002, "auxiliary_loss_mlp": 0.01280013, "balance_loss_clip": 1.02014446, "balance_loss_mlp": 1.04103923, "epoch": 0.5426724785810912, "flos": 12495118671360.0, "grad_norm": 1.7587135734043615, "language_loss": 0.73764062, "learning_rate": 1.732924173850868e-06, "loss": 0.76186079, "num_input_tokens_seen": 194336150, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.74609375, "step": 9026, "time_per_iteration": 2.531012535095215 }, { "auxiliary_loss_clip": 0.01131449, "auxiliary_loss_mlp": 0.01042161, "balance_loss_clip": 1.02747416, "balance_loss_mlp": 1.03816867, "epoch": 0.5427326018337592, "flos": 26064611089920.0, "grad_norm": 1.9202132802065335, "language_loss": 0.78731972, "learning_rate": 1.7325497401792861e-06, "loss": 0.80905581, "num_input_tokens_seen": 194355980, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75390625, "step": 9027, "time_per_iteration": 2.5930685997009277 }, { "auxiliary_loss_clip": 0.01062683, "auxiliary_loss_mlp": 0.01001167, "balance_loss_clip": 0.999695, "balance_loss_mlp": 1.01180661, "epoch": 0.5427927250864272, "flos": 65984745576960.0, "grad_norm": 0.7414841360061564, "language_loss": 0.5655635, "learning_rate": 1.732175316052317e-06, "loss": 0.58620203, "num_input_tokens_seen": 194422660, "router_z_loss_clip": 0.01470947, "router_z_loss_mlp": 0.23828125, "step": 9028, "time_per_iteration": 3.174752950668335 }, { "auxiliary_loss_clip": 0.01151081, "auxiliary_loss_mlp": 0.01032135, "balance_loss_clip": 1.01794326, "balance_loss_mlp": 1.04041791, "epoch": 0.5428528483390952, "flos": 19975382904960.0, "grad_norm": 1.997762349969015, "language_loss": 0.77525342, "learning_rate": 1.7318009014833209e-06, "loss": 0.79708552, "num_input_tokens_seen": 194438545, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75, "step": 9029, "time_per_iteration": 4.003287315368652 }, { "auxiliary_loss_clip": 0.01146948, "auxiliary_loss_mlp": 0.01028798, "balance_loss_clip": 1.01540506, "balance_loss_mlp": 1.03940058, "epoch": 0.5429129715917631, "flos": 21832323287040.0, "grad_norm": 1.464605008673385, "language_loss": 0.83162075, "learning_rate": 1.731426496485661e-06, "loss": 0.8533783, "num_input_tokens_seen": 194458060, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 9030, "time_per_iteration": 2.5895166397094727 }, { "auxiliary_loss_clip": 0.01112088, "auxiliary_loss_mlp": 0.01033366, "balance_loss_clip": 1.02092052, "balance_loss_mlp": 1.04002953, "epoch": 0.5429730948444311, "flos": 27782685492480.0, "grad_norm": 1.4993294507045103, "language_loss": 0.74758607, "learning_rate": 1.7310521010726988e-06, "loss": 0.76904058, "num_input_tokens_seen": 194477405, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.71875, "step": 9031, "time_per_iteration": 2.521379232406616 }, { "auxiliary_loss_clip": 0.01116963, "auxiliary_loss_mlp": 0.01031536, "balance_loss_clip": 1.0199846, "balance_loss_mlp": 1.03846788, "epoch": 0.543033218097099, "flos": 26760452336640.0, "grad_norm": 1.7011844568947994, "language_loss": 0.85240674, "learning_rate": 1.7306777152577949e-06, "loss": 0.87389171, "num_input_tokens_seen": 194497085, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6953125, "step": 9032, "time_per_iteration": 2.5773696899414062 }, { "auxiliary_loss_clip": 0.01132837, "auxiliary_loss_mlp": 0.01033881, "balance_loss_clip": 1.02079177, "balance_loss_mlp": 1.03942978, "epoch": 0.5430933413497671, "flos": 22675254727680.0, "grad_norm": 1.7376865807292952, "language_loss": 0.7389046, "learning_rate": 1.7303033390543108e-06, "loss": 0.76057172, "num_input_tokens_seen": 194516785, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75390625, "step": 9033, "time_per_iteration": 3.9366626739501953 }, { "auxiliary_loss_clip": 0.01139099, "auxiliary_loss_mlp": 0.01036552, "balance_loss_clip": 1.02336693, "balance_loss_mlp": 1.03979182, "epoch": 0.543153464602435, "flos": 24607499973120.0, "grad_norm": 1.6240023630567086, "language_loss": 0.75816035, "learning_rate": 1.7299289724756065e-06, "loss": 0.77991688, "num_input_tokens_seen": 194536475, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 9034, "time_per_iteration": 2.6135470867156982 }, { "auxiliary_loss_clip": 0.01138142, "auxiliary_loss_mlp": 0.01036582, "balance_loss_clip": 1.02410066, "balance_loss_mlp": 1.04025245, "epoch": 0.543213587855103, "flos": 19025725178880.0, "grad_norm": 1.589902186056156, "language_loss": 0.84371299, "learning_rate": 1.7295546155350431e-06, "loss": 0.86546022, "num_input_tokens_seen": 194554495, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 9035, "time_per_iteration": 2.540916681289673 }, { "auxiliary_loss_clip": 0.01062809, "auxiliary_loss_mlp": 0.01001559, "balance_loss_clip": 1.0000813, "balance_loss_mlp": 1.01314449, "epoch": 0.5432737111077709, "flos": 65686435125120.0, "grad_norm": 0.7141647705942405, "language_loss": 0.55882764, "learning_rate": 1.729180268245979e-06, "loss": 0.57947135, "num_input_tokens_seen": 194617620, "router_z_loss_clip": 0.01477051, "router_z_loss_mlp": 0.24023438, "step": 9036, "time_per_iteration": 3.1510376930236816 }, { "auxiliary_loss_clip": 0.01157782, "auxiliary_loss_mlp": 0.01034049, "balance_loss_clip": 1.02061975, "balance_loss_mlp": 1.03883314, "epoch": 0.5433338343604389, "flos": 22091670460800.0, "grad_norm": 1.7066374427900592, "language_loss": 0.74893779, "learning_rate": 1.7288059306217751e-06, "loss": 0.77085614, "num_input_tokens_seen": 194637690, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.74609375, "step": 9037, "time_per_iteration": 2.598867177963257 }, { "auxiliary_loss_clip": 0.0113011, "auxiliary_loss_mlp": 0.01037212, "balance_loss_clip": 1.02408719, "balance_loss_mlp": 1.03859913, "epoch": 0.5433939576131068, "flos": 34672649616000.0, "grad_norm": 1.5331658906767702, "language_loss": 0.66570771, "learning_rate": 1.72843160267579e-06, "loss": 0.68738091, "num_input_tokens_seen": 194659520, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 9038, "time_per_iteration": 2.6760048866271973 }, { "auxiliary_loss_clip": 0.01118099, "auxiliary_loss_mlp": 0.0103356, "balance_loss_clip": 1.02162695, "balance_loss_mlp": 1.03771901, "epoch": 0.5434540808657748, "flos": 20303355012480.0, "grad_norm": 2.84351659686087, "language_loss": 0.78023887, "learning_rate": 1.7280572844213818e-06, "loss": 0.80175543, "num_input_tokens_seen": 194677645, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.71484375, "step": 9039, "time_per_iteration": 2.575859785079956 }, { "auxiliary_loss_clip": 0.01139796, "auxiliary_loss_mlp": 0.01034278, "balance_loss_clip": 1.02189863, "balance_loss_mlp": 1.04011881, "epoch": 0.5435142041184428, "flos": 23112790295040.0, "grad_norm": 1.8142226043948053, "language_loss": 0.76474547, "learning_rate": 1.7276829758719103e-06, "loss": 0.78648627, "num_input_tokens_seen": 194697400, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7265625, "step": 9040, "time_per_iteration": 4.1357715129852295 }, { "auxiliary_loss_clip": 0.01054726, "auxiliary_loss_mlp": 0.0099998, "balance_loss_clip": 0.99833453, "balance_loss_mlp": 1.01329279, "epoch": 0.5435743273711108, "flos": 64012746954240.0, "grad_norm": 0.8477098255030245, "language_loss": 0.5244621, "learning_rate": 1.7273086770407323e-06, "loss": 0.54500914, "num_input_tokens_seen": 194761205, "router_z_loss_clip": 0.01647949, "router_z_loss_mlp": 0.24023438, "step": 9041, "time_per_iteration": 3.1284735202789307 }, { "auxiliary_loss_clip": 0.01130354, "auxiliary_loss_mlp": 0.01035074, "balance_loss_clip": 1.02216339, "balance_loss_mlp": 1.0398922, "epoch": 0.5436344506237788, "flos": 25118903859840.0, "grad_norm": 1.8339307780272194, "language_loss": 0.7607463, "learning_rate": 1.7269343879412065e-06, "loss": 0.78240061, "num_input_tokens_seen": 194782445, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 9042, "time_per_iteration": 2.640868902206421 }, { "auxiliary_loss_clip": 0.01135105, "auxiliary_loss_mlp": 0.01031118, "balance_loss_clip": 1.01895261, "balance_loss_mlp": 1.03843558, "epoch": 0.5436945738764467, "flos": 19572967860480.0, "grad_norm": 1.6251496668620755, "language_loss": 0.67432165, "learning_rate": 1.7265601085866909e-06, "loss": 0.69598389, "num_input_tokens_seen": 194800325, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.703125, "step": 9043, "time_per_iteration": 2.5158658027648926 }, { "auxiliary_loss_clip": 0.01134906, "auxiliary_loss_mlp": 0.0103205, "balance_loss_clip": 1.01990867, "balance_loss_mlp": 1.03852224, "epoch": 0.5437546971291147, "flos": 21142515525120.0, "grad_norm": 1.5579792143547244, "language_loss": 0.84301108, "learning_rate": 1.7261858389905402e-06, "loss": 0.86468071, "num_input_tokens_seen": 194818675, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 9044, "time_per_iteration": 2.5825369358062744 }, { "auxiliary_loss_clip": 0.01116588, "auxiliary_loss_mlp": 0.01031604, "balance_loss_clip": 1.0183841, "balance_loss_mlp": 1.04043424, "epoch": 0.5438148203817826, "flos": 25118688378240.0, "grad_norm": 1.7692539172044361, "language_loss": 0.61949915, "learning_rate": 1.7258115791661134e-06, "loss": 0.64098102, "num_input_tokens_seen": 194836595, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.76171875, "step": 9045, "time_per_iteration": 2.518336772918701 }, { "auxiliary_loss_clip": 0.01113508, "auxiliary_loss_mlp": 0.0103145, "balance_loss_clip": 1.01847434, "balance_loss_mlp": 1.04027057, "epoch": 0.5438749436344507, "flos": 23002939526400.0, "grad_norm": 1.4637131153548872, "language_loss": 0.6981318, "learning_rate": 1.7254373291267655e-06, "loss": 0.71958143, "num_input_tokens_seen": 194857520, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 9046, "time_per_iteration": 2.613178253173828 }, { "auxiliary_loss_clip": 0.01109229, "auxiliary_loss_mlp": 0.01030731, "balance_loss_clip": 1.01821995, "balance_loss_mlp": 1.03869319, "epoch": 0.5439350668871186, "flos": 15487016065920.0, "grad_norm": 1.6058136427561143, "language_loss": 0.77404529, "learning_rate": 1.7250630888858533e-06, "loss": 0.79544485, "num_input_tokens_seen": 194876020, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 9047, "time_per_iteration": 2.5460400581359863 }, { "auxiliary_loss_clip": 0.01131639, "auxiliary_loss_mlp": 0.01037329, "balance_loss_clip": 1.02484238, "balance_loss_mlp": 1.04113674, "epoch": 0.5439951901397866, "flos": 17238415311360.0, "grad_norm": 1.660368193878636, "language_loss": 0.72515339, "learning_rate": 1.7246888584567325e-06, "loss": 0.74684304, "num_input_tokens_seen": 194894650, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 9048, "time_per_iteration": 2.5671582221984863 }, { "auxiliary_loss_clip": 0.01149805, "auxiliary_loss_mlp": 0.01032754, "balance_loss_clip": 1.01900935, "balance_loss_mlp": 1.04275703, "epoch": 0.5440553133924545, "flos": 18661016436480.0, "grad_norm": 2.06238431435721, "language_loss": 0.93676984, "learning_rate": 1.7243146378527576e-06, "loss": 0.95859545, "num_input_tokens_seen": 194911935, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7109375, "step": 9049, "time_per_iteration": 2.5705153942108154 }, { "auxiliary_loss_clip": 0.0111738, "auxiliary_loss_mlp": 0.01034319, "balance_loss_clip": 1.02255285, "balance_loss_mlp": 1.03760993, "epoch": 0.5441154366451225, "flos": 27122934435840.0, "grad_norm": 1.7217797245799018, "language_loss": 0.74012375, "learning_rate": 1.7239404270872846e-06, "loss": 0.76164073, "num_input_tokens_seen": 194931620, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.70703125, "step": 9050, "time_per_iteration": 2.676034927368164 }, { "auxiliary_loss_clip": 0.01122573, "auxiliary_loss_mlp": 0.01031077, "balance_loss_clip": 1.01855421, "balance_loss_mlp": 1.04134989, "epoch": 0.5441755598977904, "flos": 25993867253760.0, "grad_norm": 1.726487179219819, "language_loss": 0.67493582, "learning_rate": 1.7235662261736672e-06, "loss": 0.69647229, "num_input_tokens_seen": 194952560, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 9051, "time_per_iteration": 2.559619426727295 }, { "auxiliary_loss_clip": 0.01135461, "auxiliary_loss_mlp": 0.01031122, "balance_loss_clip": 1.01873589, "balance_loss_mlp": 1.03734148, "epoch": 0.5442356831504584, "flos": 32380041173760.0, "grad_norm": 1.7091605290978849, "language_loss": 0.67634815, "learning_rate": 1.7231920351252604e-06, "loss": 0.69801402, "num_input_tokens_seen": 194973915, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 9052, "time_per_iteration": 2.660240411758423 }, { "auxiliary_loss_clip": 0.01112112, "auxiliary_loss_mlp": 0.01031067, "balance_loss_clip": 1.01815033, "balance_loss_mlp": 1.03851521, "epoch": 0.5442958064031264, "flos": 24164290056960.0, "grad_norm": 1.6877158665749807, "language_loss": 0.92751062, "learning_rate": 1.7228178539554181e-06, "loss": 0.94894242, "num_input_tokens_seen": 194990170, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 9053, "time_per_iteration": 2.5063064098358154 }, { "auxiliary_loss_clip": 0.01130553, "auxiliary_loss_mlp": 0.01033963, "balance_loss_clip": 1.02108216, "balance_loss_mlp": 1.03880858, "epoch": 0.5443559296557944, "flos": 18764690065920.0, "grad_norm": 3.0544603324591195, "language_loss": 0.83519953, "learning_rate": 1.722443682677493e-06, "loss": 0.85684478, "num_input_tokens_seen": 195006395, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 9054, "time_per_iteration": 2.59287166595459 }, { "auxiliary_loss_clip": 0.01132391, "auxiliary_loss_mlp": 0.01032704, "balance_loss_clip": 1.02029991, "balance_loss_mlp": 1.04076374, "epoch": 0.5444160529084624, "flos": 22632556435200.0, "grad_norm": 2.0200041074281607, "language_loss": 0.6810922, "learning_rate": 1.7220695213048396e-06, "loss": 0.70274317, "num_input_tokens_seen": 195025080, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.734375, "step": 9055, "time_per_iteration": 2.534114360809326 }, { "auxiliary_loss_clip": 0.01062576, "auxiliary_loss_mlp": 0.01004986, "balance_loss_clip": 1.00350785, "balance_loss_mlp": 1.01193416, "epoch": 0.5444761761611303, "flos": 69671909600640.0, "grad_norm": 0.7341581936679089, "language_loss": 0.57668376, "learning_rate": 1.7216953698508092e-06, "loss": 0.59735936, "num_input_tokens_seen": 195085725, "router_z_loss_clip": 0.01477051, "router_z_loss_mlp": 0.23925781, "step": 9056, "time_per_iteration": 3.167842149734497 }, { "auxiliary_loss_clip": 0.01138222, "auxiliary_loss_mlp": 0.01031807, "balance_loss_clip": 1.0186404, "balance_loss_mlp": 1.03799248, "epoch": 0.5445362994137983, "flos": 14278442129280.0, "grad_norm": 2.4642255718135693, "language_loss": 0.69635719, "learning_rate": 1.721321228328756e-06, "loss": 0.71805751, "num_input_tokens_seen": 195102585, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 9057, "time_per_iteration": 2.6267826557159424 }, { "auxiliary_loss_clip": 0.01117751, "auxiliary_loss_mlp": 0.01037268, "balance_loss_clip": 1.02423286, "balance_loss_mlp": 1.03808832, "epoch": 0.5445964226664662, "flos": 28986195611520.0, "grad_norm": 1.9623505720934233, "language_loss": 0.75312245, "learning_rate": 1.720947096752031e-06, "loss": 0.77467263, "num_input_tokens_seen": 195120055, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 9058, "time_per_iteration": 2.5991649627685547 }, { "auxiliary_loss_clip": 0.01062195, "auxiliary_loss_mlp": 0.01000899, "balance_loss_clip": 0.99946862, "balance_loss_mlp": 1.01167619, "epoch": 0.5446565459191343, "flos": 68620230270720.0, "grad_norm": 0.7994568745005469, "language_loss": 0.62704015, "learning_rate": 1.7205729751339864e-06, "loss": 0.6476711, "num_input_tokens_seen": 195181045, "router_z_loss_clip": 0.01428223, "router_z_loss_mlp": 0.23632812, "step": 9059, "time_per_iteration": 3.2554428577423096 }, { "auxiliary_loss_clip": 0.01117862, "auxiliary_loss_mlp": 0.01029994, "balance_loss_clip": 1.01761389, "balance_loss_mlp": 1.03825212, "epoch": 0.5447166691718022, "flos": 16216469464320.0, "grad_norm": 1.834331853292114, "language_loss": 0.79446048, "learning_rate": 1.7201988634879736e-06, "loss": 0.81593901, "num_input_tokens_seen": 195198840, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 9060, "time_per_iteration": 2.4996252059936523 }, { "auxiliary_loss_clip": 0.01120801, "auxiliary_loss_mlp": 0.01032739, "balance_loss_clip": 1.01992977, "balance_loss_mlp": 1.03913283, "epoch": 0.5447767924244702, "flos": 25849039616640.0, "grad_norm": 1.8345309818996065, "language_loss": 0.797189, "learning_rate": 1.7198247618273432e-06, "loss": 0.81872445, "num_input_tokens_seen": 195218720, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 9061, "time_per_iteration": 2.613917112350464 }, { "auxiliary_loss_clip": 0.01119717, "auxiliary_loss_mlp": 0.0102609, "balance_loss_clip": 1.01430655, "balance_loss_mlp": 1.03997636, "epoch": 0.5448369156771381, "flos": 19677718897920.0, "grad_norm": 1.6930673035700154, "language_loss": 0.86988652, "learning_rate": 1.7194506701654467e-06, "loss": 0.89134467, "num_input_tokens_seen": 195235770, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.7109375, "step": 9062, "time_per_iteration": 2.517529249191284 }, { "auxiliary_loss_clip": 0.01133809, "auxiliary_loss_mlp": 0.01033351, "balance_loss_clip": 1.01966596, "balance_loss_mlp": 1.03953815, "epoch": 0.5448970389298061, "flos": 19281804215040.0, "grad_norm": 1.683498498583607, "language_loss": 0.82000816, "learning_rate": 1.7190765885156338e-06, "loss": 0.84167981, "num_input_tokens_seen": 195254870, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76171875, "step": 9063, "time_per_iteration": 2.588992118835449 }, { "auxiliary_loss_clip": 0.01110164, "auxiliary_loss_mlp": 0.01030846, "balance_loss_clip": 1.01719666, "balance_loss_mlp": 1.03787935, "epoch": 0.544957162182474, "flos": 20991690316800.0, "grad_norm": 2.6481874109409933, "language_loss": 0.63656741, "learning_rate": 1.718702516891255e-06, "loss": 0.65797746, "num_input_tokens_seen": 195273390, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 9064, "time_per_iteration": 2.533271551132202 }, { "auxiliary_loss_clip": 0.01113019, "auxiliary_loss_mlp": 0.01034718, "balance_loss_clip": 1.0212115, "balance_loss_mlp": 1.03889906, "epoch": 0.545017285435142, "flos": 25374587846400.0, "grad_norm": 1.6648016021723198, "language_loss": 0.79755098, "learning_rate": 1.71832845530566e-06, "loss": 0.81902832, "num_input_tokens_seen": 195295635, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 9065, "time_per_iteration": 2.615724563598633 }, { "auxiliary_loss_clip": 0.01134143, "auxiliary_loss_mlp": 0.01034193, "balance_loss_clip": 1.02317238, "balance_loss_mlp": 1.03771329, "epoch": 0.54507740868781, "flos": 19134749934720.0, "grad_norm": 2.7582341760853053, "language_loss": 0.77975744, "learning_rate": 1.7179544037721976e-06, "loss": 0.80144083, "num_input_tokens_seen": 195312545, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.703125, "step": 9066, "time_per_iteration": 2.5733885765075684 }, { "auxiliary_loss_clip": 0.0112174, "auxiliary_loss_mlp": 0.01030453, "balance_loss_clip": 1.01715541, "balance_loss_mlp": 1.03719711, "epoch": 0.545137531940478, "flos": 26249802635520.0, "grad_norm": 1.7572602484543518, "language_loss": 0.75566757, "learning_rate": 1.7175803623042174e-06, "loss": 0.77718949, "num_input_tokens_seen": 195332955, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75390625, "step": 9067, "time_per_iteration": 2.652663230895996 }, { "auxiliary_loss_clip": 0.01144535, "auxiliary_loss_mlp": 0.01033052, "balance_loss_clip": 1.01798439, "balance_loss_mlp": 1.04009318, "epoch": 0.545197655193146, "flos": 37555629995520.0, "grad_norm": 1.9975543336519377, "language_loss": 0.63437891, "learning_rate": 1.7172063309150668e-06, "loss": 0.65615481, "num_input_tokens_seen": 195355930, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.77734375, "step": 9068, "time_per_iteration": 2.7000396251678467 }, { "auxiliary_loss_clip": 0.01137172, "auxiliary_loss_mlp": 0.01039211, "balance_loss_clip": 1.02732027, "balance_loss_mlp": 1.03966236, "epoch": 0.5452577784458139, "flos": 26031250333440.0, "grad_norm": 1.4781221168151877, "language_loss": 0.72015166, "learning_rate": 1.7168323096180956e-06, "loss": 0.74191546, "num_input_tokens_seen": 195376445, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 9069, "time_per_iteration": 2.7054412364959717 }, { "auxiliary_loss_clip": 0.01108643, "auxiliary_loss_mlp": 0.01027269, "balance_loss_clip": 1.01593852, "balance_loss_mlp": 1.03973126, "epoch": 0.5453179016984819, "flos": 17639034675840.0, "grad_norm": 1.6815639353026837, "language_loss": 0.74275988, "learning_rate": 1.7164582984266508e-06, "loss": 0.76411903, "num_input_tokens_seen": 195393725, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6875, "step": 9070, "time_per_iteration": 2.4708008766174316 }, { "auxiliary_loss_clip": 0.01118992, "auxiliary_loss_mlp": 0.01032426, "balance_loss_clip": 1.01955187, "balance_loss_mlp": 1.03756952, "epoch": 0.5453780249511498, "flos": 23216679406080.0, "grad_norm": 1.8712295432391175, "language_loss": 0.60724229, "learning_rate": 1.7160842973540798e-06, "loss": 0.62875652, "num_input_tokens_seen": 195411380, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 9071, "time_per_iteration": 3.9771337509155273 }, { "auxiliary_loss_clip": 0.01034751, "auxiliary_loss_mlp": 0.01003587, "balance_loss_clip": 1.00216246, "balance_loss_mlp": 1.01116776, "epoch": 0.5454381482038179, "flos": 68696504801280.0, "grad_norm": 0.6982847397941471, "language_loss": 0.5713191, "learning_rate": 1.71571030641373e-06, "loss": 0.59170246, "num_input_tokens_seen": 195482015, "router_z_loss_clip": 0.01422119, "router_z_loss_mlp": 0.23632812, "step": 9072, "time_per_iteration": 3.279466152191162 }, { "auxiliary_loss_clip": 0.01123892, "auxiliary_loss_mlp": 0.01031185, "balance_loss_clip": 1.01923418, "balance_loss_mlp": 1.03689754, "epoch": 0.5454982714564858, "flos": 13260626346240.0, "grad_norm": 1.5722173609202736, "language_loss": 0.70117813, "learning_rate": 1.715336325618948e-06, "loss": 0.72272897, "num_input_tokens_seen": 195500440, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69140625, "step": 9073, "time_per_iteration": 2.512681245803833 }, { "auxiliary_loss_clip": 0.01124011, "auxiliary_loss_mlp": 0.01038561, "balance_loss_clip": 1.02674747, "balance_loss_mlp": 1.03695858, "epoch": 0.5455583947091538, "flos": 21835878733440.0, "grad_norm": 1.591751195497768, "language_loss": 0.71193373, "learning_rate": 1.7149623549830805e-06, "loss": 0.73355949, "num_input_tokens_seen": 195520860, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 9074, "time_per_iteration": 2.5922489166259766 }, { "auxiliary_loss_clip": 0.01117585, "auxiliary_loss_mlp": 0.0103841, "balance_loss_clip": 1.02613115, "balance_loss_mlp": 1.03729832, "epoch": 0.5456185179618217, "flos": 17817438551040.0, "grad_norm": 1.9437461367343405, "language_loss": 0.68186998, "learning_rate": 1.7145883945194731e-06, "loss": 0.70342994, "num_input_tokens_seen": 195538615, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71484375, "step": 9075, "time_per_iteration": 3.8829410076141357 }, { "auxiliary_loss_clip": 0.01115849, "auxiliary_loss_mlp": 0.01030605, "balance_loss_clip": 1.01924443, "balance_loss_mlp": 1.03781855, "epoch": 0.5456786412144897, "flos": 21069401391360.0, "grad_norm": 1.5955821086693873, "language_loss": 0.80498707, "learning_rate": 1.7142144442414716e-06, "loss": 0.8264516, "num_input_tokens_seen": 195557460, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6953125, "step": 9076, "time_per_iteration": 2.582051992416382 }, { "auxiliary_loss_clip": 0.01118509, "auxiliary_loss_mlp": 0.0103427, "balance_loss_clip": 1.021878, "balance_loss_mlp": 1.03783429, "epoch": 0.5457387644671576, "flos": 23294965098240.0, "grad_norm": 1.6291568438157844, "language_loss": 0.80325025, "learning_rate": 1.713840504162422e-06, "loss": 0.82477808, "num_input_tokens_seen": 195577985, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 9077, "time_per_iteration": 2.572849750518799 }, { "auxiliary_loss_clip": 0.01117315, "auxiliary_loss_mlp": 0.01032592, "balance_loss_clip": 1.02116585, "balance_loss_mlp": 1.03583026, "epoch": 0.5457988877198257, "flos": 21617039122560.0, "grad_norm": 1.9032476817597188, "language_loss": 0.67522418, "learning_rate": 1.713466574295668e-06, "loss": 0.69672322, "num_input_tokens_seen": 195597620, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.72265625, "step": 9078, "time_per_iteration": 2.613682746887207 }, { "auxiliary_loss_clip": 0.01119962, "auxiliary_loss_mlp": 0.01032178, "balance_loss_clip": 1.01941085, "balance_loss_mlp": 1.03736258, "epoch": 0.5458590109724936, "flos": 23762485543680.0, "grad_norm": 1.830347121549261, "language_loss": 0.81018782, "learning_rate": 1.7130926546545555e-06, "loss": 0.83170927, "num_input_tokens_seen": 195615910, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73828125, "step": 9079, "time_per_iteration": 2.592512607574463 }, { "auxiliary_loss_clip": 0.0111313, "auxiliary_loss_mlp": 0.0103588, "balance_loss_clip": 1.02201557, "balance_loss_mlp": 1.03724027, "epoch": 0.5459191342251616, "flos": 24424283675520.0, "grad_norm": 1.6482703510140193, "language_loss": 0.7588222, "learning_rate": 1.7127187452524275e-06, "loss": 0.7803123, "num_input_tokens_seen": 195635620, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 9080, "time_per_iteration": 2.5586841106414795 }, { "auxiliary_loss_clip": 0.01138833, "auxiliary_loss_mlp": 0.01031348, "balance_loss_clip": 1.01791894, "balance_loss_mlp": 1.03843641, "epoch": 0.5459792574778296, "flos": 23623009032960.0, "grad_norm": 2.1454780033117378, "language_loss": 0.8296746, "learning_rate": 1.712344846102629e-06, "loss": 0.85137641, "num_input_tokens_seen": 195652495, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 9081, "time_per_iteration": 5.545758485794067 }, { "auxiliary_loss_clip": 0.01114013, "auxiliary_loss_mlp": 0.01030116, "balance_loss_clip": 1.01742625, "balance_loss_mlp": 1.03869843, "epoch": 0.5460393807304975, "flos": 19135540033920.0, "grad_norm": 1.5029483536465007, "language_loss": 0.69641447, "learning_rate": 1.7119709572185032e-06, "loss": 0.71785575, "num_input_tokens_seen": 195671965, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.75390625, "step": 9082, "time_per_iteration": 2.5735270977020264 }, { "auxiliary_loss_clip": 0.01105961, "auxiliary_loss_mlp": 0.01028776, "balance_loss_clip": 1.01582408, "balance_loss_mlp": 1.03598714, "epoch": 0.5460995039831655, "flos": 35918534805120.0, "grad_norm": 1.8949512551940275, "language_loss": 0.66138792, "learning_rate": 1.7115970786133925e-06, "loss": 0.68273532, "num_input_tokens_seen": 195694725, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 9083, "time_per_iteration": 2.670642852783203 }, { "auxiliary_loss_clip": 0.0112499, "auxiliary_loss_mlp": 0.01027256, "balance_loss_clip": 1.01530576, "balance_loss_mlp": 1.03707194, "epoch": 0.5461596272358334, "flos": 26759231274240.0, "grad_norm": 2.1366699951107804, "language_loss": 0.7876265, "learning_rate": 1.7112232103006405e-06, "loss": 0.80914897, "num_input_tokens_seen": 195714090, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.703125, "step": 9084, "time_per_iteration": 2.640772819519043 }, { "auxiliary_loss_clip": 0.01119045, "auxiliary_loss_mlp": 0.0103125, "balance_loss_clip": 1.01876283, "balance_loss_mlp": 1.03671861, "epoch": 0.5462197504885015, "flos": 20886580143360.0, "grad_norm": 1.820931673263686, "language_loss": 0.75048018, "learning_rate": 1.710849352293589e-06, "loss": 0.77198315, "num_input_tokens_seen": 195733585, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.734375, "step": 9085, "time_per_iteration": 2.5064597129821777 }, { "auxiliary_loss_clip": 0.01129475, "auxiliary_loss_mlp": 0.01029812, "balance_loss_clip": 1.01716435, "balance_loss_mlp": 1.03773892, "epoch": 0.5462798737411694, "flos": 25804976607360.0, "grad_norm": 1.919932056546634, "language_loss": 0.74749678, "learning_rate": 1.7104755046055808e-06, "loss": 0.7690897, "num_input_tokens_seen": 195752820, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73828125, "step": 9086, "time_per_iteration": 2.611419439315796 }, { "auxiliary_loss_clip": 0.01116973, "auxiliary_loss_mlp": 0.01028998, "balance_loss_clip": 1.0164876, "balance_loss_mlp": 1.03577352, "epoch": 0.5463399969938374, "flos": 25775027642880.0, "grad_norm": 1.651315310475349, "language_loss": 0.76884484, "learning_rate": 1.710101667249957e-06, "loss": 0.7903046, "num_input_tokens_seen": 195773740, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 9087, "time_per_iteration": 2.5535874366760254 }, { "auxiliary_loss_clip": 0.01122231, "auxiliary_loss_mlp": 0.01039393, "balance_loss_clip": 1.02562475, "balance_loss_mlp": 1.03948641, "epoch": 0.5464001202465053, "flos": 18843298980480.0, "grad_norm": 1.8069866249304207, "language_loss": 0.77563751, "learning_rate": 1.7097278402400592e-06, "loss": 0.79725373, "num_input_tokens_seen": 195792125, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73828125, "step": 9088, "time_per_iteration": 2.51029896736145 }, { "auxiliary_loss_clip": 0.01088264, "auxiliary_loss_mlp": 0.00999527, "balance_loss_clip": 0.99795324, "balance_loss_mlp": 1.0118947, "epoch": 0.5464602434991733, "flos": 69049541623680.0, "grad_norm": 0.7246367819852519, "language_loss": 0.57754123, "learning_rate": 1.709354023589228e-06, "loss": 0.59841919, "num_input_tokens_seen": 195854935, "router_z_loss_clip": 0.01574707, "router_z_loss_mlp": 0.23535156, "step": 9089, "time_per_iteration": 3.2539408206939697 }, { "auxiliary_loss_clip": 0.0113659, "auxiliary_loss_mlp": 0.01034999, "balance_loss_clip": 1.02217817, "balance_loss_mlp": 1.03651142, "epoch": 0.5465203667518412, "flos": 27560039040000.0, "grad_norm": 2.009662912001411, "language_loss": 0.76741827, "learning_rate": 1.7089802173108035e-06, "loss": 0.7891342, "num_input_tokens_seen": 195874715, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 9090, "time_per_iteration": 2.5643625259399414 }, { "auxiliary_loss_clip": 0.01122486, "auxiliary_loss_mlp": 0.01034662, "balance_loss_clip": 1.0206964, "balance_loss_mlp": 1.03697133, "epoch": 0.5465804900045093, "flos": 21210206705280.0, "grad_norm": 5.080533693181854, "language_loss": 0.73429829, "learning_rate": 1.7086064214181267e-06, "loss": 0.75586975, "num_input_tokens_seen": 195892610, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 9091, "time_per_iteration": 2.565791606903076 }, { "auxiliary_loss_clip": 0.01129417, "auxiliary_loss_mlp": 0.0103806, "balance_loss_clip": 1.02480423, "balance_loss_mlp": 1.0381453, "epoch": 0.5466406132571772, "flos": 22488949860480.0, "grad_norm": 1.9230644427470338, "language_loss": 0.77951825, "learning_rate": 1.7082326359245376e-06, "loss": 0.801193, "num_input_tokens_seen": 195911085, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 9092, "time_per_iteration": 2.5456526279449463 }, { "auxiliary_loss_clip": 0.01112352, "auxiliary_loss_mlp": 0.01032977, "balance_loss_clip": 1.01967883, "balance_loss_mlp": 1.03783083, "epoch": 0.5467007365098452, "flos": 17675843137920.0, "grad_norm": 1.9332538509941959, "language_loss": 0.75328803, "learning_rate": 1.7078588608433747e-06, "loss": 0.77474129, "num_input_tokens_seen": 195929845, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.74609375, "step": 9093, "time_per_iteration": 2.588658332824707 }, { "auxiliary_loss_clip": 0.01121141, "auxiliary_loss_mlp": 0.01036623, "balance_loss_clip": 1.02182877, "balance_loss_mlp": 1.03704071, "epoch": 0.5467608597625132, "flos": 15698852524800.0, "grad_norm": 2.0372816610180053, "language_loss": 0.68966967, "learning_rate": 1.7074850961879779e-06, "loss": 0.71124732, "num_input_tokens_seen": 195946350, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.75390625, "step": 9094, "time_per_iteration": 2.5314457416534424 }, { "auxiliary_loss_clip": 0.01128623, "auxiliary_loss_mlp": 0.01032365, "balance_loss_clip": 1.02022946, "balance_loss_mlp": 1.03877008, "epoch": 0.5468209830151811, "flos": 24312816794880.0, "grad_norm": 2.8187059066739577, "language_loss": 0.67951095, "learning_rate": 1.7071113419716852e-06, "loss": 0.70112085, "num_input_tokens_seen": 195959840, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.71875, "step": 9095, "time_per_iteration": 2.5359275341033936 }, { "auxiliary_loss_clip": 0.01139274, "auxiliary_loss_mlp": 0.01032752, "balance_loss_clip": 1.02056861, "balance_loss_mlp": 1.0407362, "epoch": 0.5468811062678491, "flos": 29166323339520.0, "grad_norm": 1.5721771391180974, "language_loss": 0.66191006, "learning_rate": 1.7067375982078355e-06, "loss": 0.68363035, "num_input_tokens_seen": 195981125, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71875, "step": 9096, "time_per_iteration": 2.5791261196136475 }, { "auxiliary_loss_clip": 0.01062088, "auxiliary_loss_mlp": 0.01006153, "balance_loss_clip": 1.00464523, "balance_loss_mlp": 1.01122117, "epoch": 0.546941229520517, "flos": 67867037982720.0, "grad_norm": 0.7355554761627684, "language_loss": 0.5755136, "learning_rate": 1.7063638649097668e-06, "loss": 0.596196, "num_input_tokens_seen": 196038880, "router_z_loss_clip": 0.01507568, "router_z_loss_mlp": 0.23828125, "step": 9097, "time_per_iteration": 3.2995190620422363 }, { "auxiliary_loss_clip": 0.0104384, "auxiliary_loss_mlp": 0.0100445, "balance_loss_clip": 1.00297785, "balance_loss_mlp": 1.01095891, "epoch": 0.5470013527731851, "flos": 58270306625280.0, "grad_norm": 0.9077357041889543, "language_loss": 0.64792144, "learning_rate": 1.705990142090816e-06, "loss": 0.66840434, "num_input_tokens_seen": 196099215, "router_z_loss_clip": 0.01470947, "router_z_loss_mlp": 0.23828125, "step": 9098, "time_per_iteration": 3.0861124992370605 }, { "auxiliary_loss_clip": 0.01121342, "auxiliary_loss_mlp": 0.0104271, "balance_loss_clip": 1.02917993, "balance_loss_mlp": 1.03805196, "epoch": 0.547061476025853, "flos": 22965915582720.0, "grad_norm": 1.4971774417388752, "language_loss": 0.72829181, "learning_rate": 1.7056164297643213e-06, "loss": 0.74993229, "num_input_tokens_seen": 196120370, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 9099, "time_per_iteration": 2.5668067932128906 }, { "auxiliary_loss_clip": 0.01126172, "auxiliary_loss_mlp": 0.01031113, "balance_loss_clip": 1.0192163, "balance_loss_mlp": 1.03692198, "epoch": 0.547121599278521, "flos": 29968244426880.0, "grad_norm": 2.3632628429654727, "language_loss": 0.7218321, "learning_rate": 1.7052427279436183e-06, "loss": 0.74340492, "num_input_tokens_seen": 196139075, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.71484375, "step": 9100, "time_per_iteration": 2.6540236473083496 }, { "auxiliary_loss_clip": 0.01116862, "auxiliary_loss_mlp": 0.01277432, "balance_loss_clip": 1.0169096, "balance_loss_mlp": 1.0364325, "epoch": 0.5471817225311889, "flos": 36535443914880.0, "grad_norm": 2.9466473853467834, "language_loss": 0.68323261, "learning_rate": 1.7048690366420447e-06, "loss": 0.70717555, "num_input_tokens_seen": 196159990, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71484375, "step": 9101, "time_per_iteration": 2.676544666290283 }, { "auxiliary_loss_clip": 0.01110568, "auxiliary_loss_mlp": 0.01028022, "balance_loss_clip": 1.01533198, "balance_loss_mlp": 1.03706193, "epoch": 0.5472418457838569, "flos": 25775243124480.0, "grad_norm": 2.0894594134012916, "language_loss": 0.77942169, "learning_rate": 1.7044953558729356e-06, "loss": 0.8008076, "num_input_tokens_seen": 196180570, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 9102, "time_per_iteration": 2.5854697227478027 }, { "auxiliary_loss_clip": 0.01137484, "auxiliary_loss_mlp": 0.01036662, "balance_loss_clip": 1.02326322, "balance_loss_mlp": 1.03958416, "epoch": 0.5473019690365248, "flos": 27887687925120.0, "grad_norm": 1.5362517443652688, "language_loss": 0.72278422, "learning_rate": 1.7041216856496278e-06, "loss": 0.74452573, "num_input_tokens_seen": 196200300, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 9103, "time_per_iteration": 2.6147851943969727 }, { "auxiliary_loss_clip": 0.01127069, "auxiliary_loss_mlp": 0.01030349, "balance_loss_clip": 1.01774907, "balance_loss_mlp": 1.03704119, "epoch": 0.5473620922891929, "flos": 57631490219520.0, "grad_norm": 1.3407628287811122, "language_loss": 0.65372473, "learning_rate": 1.7037480259854558e-06, "loss": 0.67529893, "num_input_tokens_seen": 196228525, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 9104, "time_per_iteration": 2.8983516693115234 }, { "auxiliary_loss_clip": 0.01122106, "auxiliary_loss_mlp": 0.01031993, "balance_loss_clip": 1.01820636, "balance_loss_mlp": 1.03804028, "epoch": 0.5474222155418608, "flos": 19354056422400.0, "grad_norm": 1.9486940614295916, "language_loss": 0.81583077, "learning_rate": 1.703374376893754e-06, "loss": 0.83737177, "num_input_tokens_seen": 196247690, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 9105, "time_per_iteration": 2.5672624111175537 }, { "auxiliary_loss_clip": 0.01107279, "auxiliary_loss_mlp": 0.01027179, "balance_loss_clip": 1.01486433, "balance_loss_mlp": 1.03629565, "epoch": 0.5474823387945288, "flos": 25120448144640.0, "grad_norm": 1.4356745258900854, "language_loss": 0.80244476, "learning_rate": 1.7030007383878583e-06, "loss": 0.82378936, "num_input_tokens_seen": 196268555, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 9106, "time_per_iteration": 2.5849342346191406 }, { "auxiliary_loss_clip": 0.01118311, "auxiliary_loss_mlp": 0.01035064, "balance_loss_clip": 1.0227437, "balance_loss_mlp": 1.0376631, "epoch": 0.5475424620471967, "flos": 18004174381440.0, "grad_norm": 2.029390221744832, "language_loss": 0.6993472, "learning_rate": 1.7026271104811017e-06, "loss": 0.72088099, "num_input_tokens_seen": 196285585, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71875, "step": 9107, "time_per_iteration": 2.5244181156158447 }, { "auxiliary_loss_clip": 0.01119914, "auxiliary_loss_mlp": 0.01028699, "balance_loss_clip": 1.0158658, "balance_loss_mlp": 1.03621626, "epoch": 0.5476025852998647, "flos": 22309324922880.0, "grad_norm": 1.7857317178489593, "language_loss": 0.63248217, "learning_rate": 1.702253493186819e-06, "loss": 0.65396833, "num_input_tokens_seen": 196305085, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.75, "step": 9108, "time_per_iteration": 2.557551145553589 }, { "auxiliary_loss_clip": 0.01108068, "auxiliary_loss_mlp": 0.01025887, "balance_loss_clip": 1.01444864, "balance_loss_mlp": 1.03629923, "epoch": 0.5476627085525327, "flos": 20120497850880.0, "grad_norm": 1.558993199960173, "language_loss": 0.75201291, "learning_rate": 1.7018798865183436e-06, "loss": 0.7733525, "num_input_tokens_seen": 196323945, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.71875, "step": 9109, "time_per_iteration": 2.472883462905884 }, { "auxiliary_loss_clip": 0.01126305, "auxiliary_loss_mlp": 0.01028889, "balance_loss_clip": 1.01528764, "balance_loss_mlp": 1.03733611, "epoch": 0.5477228318052006, "flos": 17712579772800.0, "grad_norm": 2.134783446803743, "language_loss": 0.77528763, "learning_rate": 1.7015062904890072e-06, "loss": 0.79683954, "num_input_tokens_seen": 196342200, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 9110, "time_per_iteration": 2.546743631362915 }, { "auxiliary_loss_clip": 0.01116177, "auxiliary_loss_mlp": 0.01030913, "balance_loss_clip": 1.01890874, "balance_loss_mlp": 1.03699279, "epoch": 0.5477829550578687, "flos": 25848895962240.0, "grad_norm": 1.5567704896297196, "language_loss": 0.71023822, "learning_rate": 1.7011327051121443e-06, "loss": 0.73170912, "num_input_tokens_seen": 196362940, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 9111, "time_per_iteration": 2.553619623184204 }, { "auxiliary_loss_clip": 0.01119081, "auxiliary_loss_mlp": 0.01036648, "balance_loss_clip": 1.02395833, "balance_loss_mlp": 1.03763056, "epoch": 0.5478430783105366, "flos": 23039676161280.0, "grad_norm": 1.4730044714468138, "language_loss": 0.70972496, "learning_rate": 1.7007591304010858e-06, "loss": 0.73128223, "num_input_tokens_seen": 196383070, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 9112, "time_per_iteration": 3.9309051036834717 }, { "auxiliary_loss_clip": 0.01146266, "auxiliary_loss_mlp": 0.01031717, "balance_loss_clip": 1.01891422, "balance_loss_mlp": 1.03701901, "epoch": 0.5479032015632046, "flos": 16071210864000.0, "grad_norm": 1.8356492547987742, "language_loss": 0.87600678, "learning_rate": 1.7003855663691647e-06, "loss": 0.89778662, "num_input_tokens_seen": 196398485, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 9113, "time_per_iteration": 2.5354197025299072 }, { "auxiliary_loss_clip": 0.01156743, "auxiliary_loss_mlp": 0.01030887, "balance_loss_clip": 1.01791143, "balance_loss_mlp": 1.03878915, "epoch": 0.5479633248158725, "flos": 24278701852800.0, "grad_norm": 1.4898069525463615, "language_loss": 0.72917271, "learning_rate": 1.7000120130297119e-06, "loss": 0.75104904, "num_input_tokens_seen": 196417725, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 9114, "time_per_iteration": 2.613945722579956 }, { "auxiliary_loss_clip": 0.01123488, "auxiliary_loss_mlp": 0.01275906, "balance_loss_clip": 1.01756358, "balance_loss_mlp": 1.03444743, "epoch": 0.5480234480685405, "flos": 26358216860160.0, "grad_norm": 1.6040100651610423, "language_loss": 0.72130162, "learning_rate": 1.6996384703960584e-06, "loss": 0.74529552, "num_input_tokens_seen": 196437840, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.7109375, "step": 9115, "time_per_iteration": 2.6149775981903076 }, { "auxiliary_loss_clip": 0.01139914, "auxiliary_loss_mlp": 0.01282013, "balance_loss_clip": 1.02154171, "balance_loss_mlp": 1.03748512, "epoch": 0.5480835713212084, "flos": 22055077480320.0, "grad_norm": 1.8926877312467834, "language_loss": 0.71933371, "learning_rate": 1.6992649384815355e-06, "loss": 0.74355298, "num_input_tokens_seen": 196457300, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7578125, "step": 9116, "time_per_iteration": 3.9711177349090576 }, { "auxiliary_loss_clip": 0.01129505, "auxiliary_loss_mlp": 0.0103729, "balance_loss_clip": 1.02447462, "balance_loss_mlp": 1.03860593, "epoch": 0.5481436945738765, "flos": 25301042749440.0, "grad_norm": 2.146141849083401, "language_loss": 0.76541436, "learning_rate": 1.6988914172994732e-06, "loss": 0.78708231, "num_input_tokens_seen": 196476720, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73046875, "step": 9117, "time_per_iteration": 2.7201907634735107 }, { "auxiliary_loss_clip": 0.01131023, "auxiliary_loss_mlp": 0.01031099, "balance_loss_clip": 1.01927316, "balance_loss_mlp": 1.03441262, "epoch": 0.5482038178265444, "flos": 33580857772800.0, "grad_norm": 1.5349741761124263, "language_loss": 0.62854397, "learning_rate": 1.6985179068632025e-06, "loss": 0.6501652, "num_input_tokens_seen": 196496765, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 9118, "time_per_iteration": 2.7043230533599854 }, { "auxiliary_loss_clip": 0.01114095, "auxiliary_loss_mlp": 0.01031051, "balance_loss_clip": 1.01753294, "balance_loss_mlp": 1.03982306, "epoch": 0.5482639410792124, "flos": 19026192055680.0, "grad_norm": 1.6527340596991233, "language_loss": 0.78711975, "learning_rate": 1.6981444071860518e-06, "loss": 0.80857122, "num_input_tokens_seen": 196516220, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 9119, "time_per_iteration": 2.5597496032714844 }, { "auxiliary_loss_clip": 0.01127326, "auxiliary_loss_mlp": 0.01283398, "balance_loss_clip": 1.02490449, "balance_loss_mlp": 1.03892016, "epoch": 0.5483240643318803, "flos": 25410318900480.0, "grad_norm": 1.6715200280567342, "language_loss": 0.82648355, "learning_rate": 1.6977709182813503e-06, "loss": 0.85059077, "num_input_tokens_seen": 196533860, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.70703125, "step": 9120, "time_per_iteration": 2.5723373889923096 }, { "auxiliary_loss_clip": 0.01140026, "auxiliary_loss_mlp": 0.01036558, "balance_loss_clip": 1.02305174, "balance_loss_mlp": 1.03936684, "epoch": 0.5483841875845483, "flos": 21466896272640.0, "grad_norm": 3.6743034623157294, "language_loss": 0.8012383, "learning_rate": 1.6973974401624273e-06, "loss": 0.82300419, "num_input_tokens_seen": 196551305, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 9121, "time_per_iteration": 2.5660908222198486 }, { "auxiliary_loss_clip": 0.01146358, "auxiliary_loss_mlp": 0.01032487, "balance_loss_clip": 1.01866484, "balance_loss_mlp": 1.03586757, "epoch": 0.5484443108372163, "flos": 24747263792640.0, "grad_norm": 1.6669922503666799, "language_loss": 0.6126973, "learning_rate": 1.6970239728426114e-06, "loss": 0.63448572, "num_input_tokens_seen": 196569420, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 9122, "time_per_iteration": 2.6723382472991943 }, { "auxiliary_loss_clip": 0.01125466, "auxiliary_loss_mlp": 0.01036034, "balance_loss_clip": 1.02472699, "balance_loss_mlp": 1.03683126, "epoch": 0.5485044340898843, "flos": 25375377945600.0, "grad_norm": 1.651213647975434, "language_loss": 0.71662879, "learning_rate": 1.6966505163352307e-06, "loss": 0.7382437, "num_input_tokens_seen": 196590610, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.7109375, "step": 9123, "time_per_iteration": 5.567425489425659 }, { "auxiliary_loss_clip": 0.01111277, "auxiliary_loss_mlp": 0.01029868, "balance_loss_clip": 1.01800072, "balance_loss_mlp": 1.03798342, "epoch": 0.5485645573425523, "flos": 12641167370880.0, "grad_norm": 1.8267306739453355, "language_loss": 0.83191431, "learning_rate": 1.6962770706536126e-06, "loss": 0.85332572, "num_input_tokens_seen": 196606495, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.734375, "step": 9124, "time_per_iteration": 2.5142977237701416 }, { "auxiliary_loss_clip": 0.01129328, "auxiliary_loss_mlp": 0.01033138, "balance_loss_clip": 1.02031708, "balance_loss_mlp": 1.03983974, "epoch": 0.5486246805952202, "flos": 28329425383680.0, "grad_norm": 1.6532507591238106, "language_loss": 0.80014652, "learning_rate": 1.6959036358110845e-06, "loss": 0.82177114, "num_input_tokens_seen": 196626365, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 9125, "time_per_iteration": 2.66089129447937 }, { "auxiliary_loss_clip": 0.01122254, "auxiliary_loss_mlp": 0.01034002, "balance_loss_clip": 1.02066803, "balance_loss_mlp": 1.03861344, "epoch": 0.5486848038478882, "flos": 16800017817600.0, "grad_norm": 1.9462473020224904, "language_loss": 0.74396282, "learning_rate": 1.6955302118209737e-06, "loss": 0.7655254, "num_input_tokens_seen": 196644465, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 9126, "time_per_iteration": 2.491523027420044 }, { "auxiliary_loss_clip": 0.01123129, "auxiliary_loss_mlp": 0.01036348, "balance_loss_clip": 1.02158952, "balance_loss_mlp": 1.03728247, "epoch": 0.5487449271005561, "flos": 17236224581760.0, "grad_norm": 2.1955473345181673, "language_loss": 0.66742164, "learning_rate": 1.6951567986966061e-06, "loss": 0.68901646, "num_input_tokens_seen": 196659160, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.765625, "step": 9127, "time_per_iteration": 2.56191086769104 }, { "auxiliary_loss_clip": 0.01138754, "auxiliary_loss_mlp": 0.01036279, "balance_loss_clip": 1.02253389, "balance_loss_mlp": 1.03834307, "epoch": 0.5488050503532241, "flos": 17340867878400.0, "grad_norm": 1.626480446158249, "language_loss": 0.8321104, "learning_rate": 1.6947833964513087e-06, "loss": 0.85386074, "num_input_tokens_seen": 196677410, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 9128, "time_per_iteration": 2.636003255844116 }, { "auxiliary_loss_clip": 0.01136312, "auxiliary_loss_mlp": 0.01032625, "balance_loss_clip": 1.01925015, "balance_loss_mlp": 1.03676093, "epoch": 0.548865173605892, "flos": 17239169496960.0, "grad_norm": 1.8332285010531364, "language_loss": 0.74468017, "learning_rate": 1.6944100050984062e-06, "loss": 0.76636958, "num_input_tokens_seen": 196696765, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 9129, "time_per_iteration": 2.583225965499878 }, { "auxiliary_loss_clip": 0.01144181, "auxiliary_loss_mlp": 0.01034123, "balance_loss_clip": 1.01937127, "balance_loss_mlp": 1.04002905, "epoch": 0.5489252968585601, "flos": 17456716218240.0, "grad_norm": 2.5219296820104553, "language_loss": 0.62913871, "learning_rate": 1.694036624651225e-06, "loss": 0.65092176, "num_input_tokens_seen": 196714895, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.765625, "step": 9130, "time_per_iteration": 2.590268611907959 }, { "auxiliary_loss_clip": 0.01113599, "auxiliary_loss_mlp": 0.01283678, "balance_loss_clip": 1.02351058, "balance_loss_mlp": 1.03824019, "epoch": 0.548985420111228, "flos": 26323383646080.0, "grad_norm": 1.9710851872414135, "language_loss": 0.62938267, "learning_rate": 1.6936632551230895e-06, "loss": 0.65335548, "num_input_tokens_seen": 196735510, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75390625, "step": 9131, "time_per_iteration": 2.589641571044922 }, { "auxiliary_loss_clip": 0.01122762, "auxiliary_loss_mlp": 0.01037402, "balance_loss_clip": 1.02241707, "balance_loss_mlp": 1.03845346, "epoch": 0.549045543363896, "flos": 18693730748160.0, "grad_norm": 1.8810229974013917, "language_loss": 0.74648988, "learning_rate": 1.6932898965273243e-06, "loss": 0.76809156, "num_input_tokens_seen": 196752855, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.75390625, "step": 9132, "time_per_iteration": 2.547124147415161 }, { "auxiliary_loss_clip": 0.01118771, "auxiliary_loss_mlp": 0.01029916, "balance_loss_clip": 1.01742911, "balance_loss_mlp": 1.03551531, "epoch": 0.5491056666165639, "flos": 24717386655360.0, "grad_norm": 1.5064172286041249, "language_loss": 0.81090891, "learning_rate": 1.6929165488772545e-06, "loss": 0.83239579, "num_input_tokens_seen": 196772230, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7421875, "step": 9133, "time_per_iteration": 2.6334617137908936 }, { "auxiliary_loss_clip": 0.01129044, "auxiliary_loss_mlp": 0.01283012, "balance_loss_clip": 1.02258492, "balance_loss_mlp": 1.03846765, "epoch": 0.5491657898692319, "flos": 21576926609280.0, "grad_norm": 1.8050798153052965, "language_loss": 0.69888377, "learning_rate": 1.6925432121862021e-06, "loss": 0.72300434, "num_input_tokens_seen": 196790405, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 9134, "time_per_iteration": 2.591858148574829 }, { "auxiliary_loss_clip": 0.01139273, "auxiliary_loss_mlp": 0.01034895, "balance_loss_clip": 1.02205038, "balance_loss_mlp": 1.0370822, "epoch": 0.5492259131219, "flos": 50476432746240.0, "grad_norm": 2.1190105679183335, "language_loss": 0.61347866, "learning_rate": 1.6921698864674922e-06, "loss": 0.63522035, "num_input_tokens_seen": 196813785, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75, "step": 9135, "time_per_iteration": 2.870081901550293 }, { "auxiliary_loss_clip": 0.01123684, "auxiliary_loss_mlp": 0.01033133, "balance_loss_clip": 1.01864314, "balance_loss_mlp": 1.03869748, "epoch": 0.5492860363745679, "flos": 25119262995840.0, "grad_norm": 1.9345243153772933, "language_loss": 0.723809, "learning_rate": 1.691796571734447e-06, "loss": 0.74537718, "num_input_tokens_seen": 196834390, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76171875, "step": 9136, "time_per_iteration": 2.5796496868133545 }, { "auxiliary_loss_clip": 0.01160694, "auxiliary_loss_mlp": 0.01037218, "balance_loss_clip": 1.02297854, "balance_loss_mlp": 1.03720677, "epoch": 0.5493461596272359, "flos": 22633777497600.0, "grad_norm": 2.0443935969351394, "language_loss": 0.67875624, "learning_rate": 1.6914232680003894e-06, "loss": 0.70073533, "num_input_tokens_seen": 196853290, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 9137, "time_per_iteration": 2.670854091644287 }, { "auxiliary_loss_clip": 0.01043249, "auxiliary_loss_mlp": 0.01002553, "balance_loss_clip": 1.00106907, "balance_loss_mlp": 1.01102734, "epoch": 0.5494062828799038, "flos": 66151800754560.0, "grad_norm": 0.7486819980959019, "language_loss": 0.65303302, "learning_rate": 1.6910499752786416e-06, "loss": 0.67349106, "num_input_tokens_seen": 196913120, "router_z_loss_clip": 0.01483154, "router_z_loss_mlp": 0.234375, "step": 9138, "time_per_iteration": 3.167689561843872 }, { "auxiliary_loss_clip": 0.01122203, "auxiliary_loss_mlp": 0.01030615, "balance_loss_clip": 1.01749623, "balance_loss_mlp": 1.03895438, "epoch": 0.5494664061325718, "flos": 21105958458240.0, "grad_norm": 1.9233102808388067, "language_loss": 0.75183702, "learning_rate": 1.6906766935825251e-06, "loss": 0.7733652, "num_input_tokens_seen": 196931530, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.74609375, "step": 9139, "time_per_iteration": 2.583491802215576 }, { "auxiliary_loss_clip": 0.01119912, "auxiliary_loss_mlp": 0.01030223, "balance_loss_clip": 1.01703894, "balance_loss_mlp": 1.03758335, "epoch": 0.5495265293852397, "flos": 14392566616320.0, "grad_norm": 1.9514308572085848, "language_loss": 0.71218282, "learning_rate": 1.6903034229253624e-06, "loss": 0.73368418, "num_input_tokens_seen": 196949430, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 9140, "time_per_iteration": 2.586566209793091 }, { "auxiliary_loss_clip": 0.01127962, "auxiliary_loss_mlp": 0.01037114, "balance_loss_clip": 1.02339923, "balance_loss_mlp": 1.03639293, "epoch": 0.5495866526379077, "flos": 25549148966400.0, "grad_norm": 1.7537806369962023, "language_loss": 0.76696718, "learning_rate": 1.6899301633204736e-06, "loss": 0.78861797, "num_input_tokens_seen": 196968265, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 9141, "time_per_iteration": 2.5569212436676025 }, { "auxiliary_loss_clip": 0.01136099, "auxiliary_loss_mlp": 0.01030933, "balance_loss_clip": 1.0180707, "balance_loss_mlp": 1.03935671, "epoch": 0.5496467758905756, "flos": 21317256213120.0, "grad_norm": 2.684429964859972, "language_loss": 0.74465132, "learning_rate": 1.6895569147811794e-06, "loss": 0.76632166, "num_input_tokens_seen": 196984930, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 9142, "time_per_iteration": 2.5614216327667236 }, { "auxiliary_loss_clip": 0.01133593, "auxiliary_loss_mlp": 0.01038992, "balance_loss_clip": 1.02437735, "balance_loss_mlp": 1.03957045, "epoch": 0.5497068991432437, "flos": 22233086305920.0, "grad_norm": 1.8752071513915323, "language_loss": 0.76920199, "learning_rate": 1.6891836773208009e-06, "loss": 0.79092789, "num_input_tokens_seen": 197002320, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.76171875, "step": 9143, "time_per_iteration": 2.6101269721984863 }, { "auxiliary_loss_clip": 0.01128504, "auxiliary_loss_mlp": 0.01031291, "balance_loss_clip": 1.01878583, "balance_loss_mlp": 1.03786671, "epoch": 0.5497670223959116, "flos": 18479093028480.0, "grad_norm": 3.484996970879381, "language_loss": 0.79716456, "learning_rate": 1.688810450952657e-06, "loss": 0.81876248, "num_input_tokens_seen": 197020825, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 9144, "time_per_iteration": 2.5698463916778564 }, { "auxiliary_loss_clip": 0.01122786, "auxiliary_loss_mlp": 0.01034925, "balance_loss_clip": 1.02160358, "balance_loss_mlp": 1.03945148, "epoch": 0.5498271456485796, "flos": 29205107049600.0, "grad_norm": 1.7286135710388921, "language_loss": 0.71077567, "learning_rate": 1.6884372356900679e-06, "loss": 0.73235273, "num_input_tokens_seen": 197040450, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 9145, "time_per_iteration": 2.5685391426086426 }, { "auxiliary_loss_clip": 0.01122266, "auxiliary_loss_mlp": 0.01033805, "balance_loss_clip": 1.02126968, "balance_loss_mlp": 1.03903961, "epoch": 0.5498872689012475, "flos": 34824372664320.0, "grad_norm": 1.6169558270958608, "language_loss": 0.70387161, "learning_rate": 1.688064031546352e-06, "loss": 0.72543228, "num_input_tokens_seen": 197063930, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7421875, "step": 9146, "time_per_iteration": 2.693450689315796 }, { "auxiliary_loss_clip": 0.01124447, "auxiliary_loss_mlp": 0.01029601, "balance_loss_clip": 1.01738191, "balance_loss_mlp": 1.03628659, "epoch": 0.5499473921539155, "flos": 25921938268800.0, "grad_norm": 1.6876233505502811, "language_loss": 0.63970762, "learning_rate": 1.6876908385348288e-06, "loss": 0.66124815, "num_input_tokens_seen": 197082660, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 9147, "time_per_iteration": 2.5566952228546143 }, { "auxiliary_loss_clip": 0.0111318, "auxiliary_loss_mlp": 0.01033325, "balance_loss_clip": 1.02012849, "balance_loss_mlp": 1.04009426, "epoch": 0.5500075154065835, "flos": 22273701609600.0, "grad_norm": 2.252481281410991, "language_loss": 0.80584151, "learning_rate": 1.6873176566688168e-06, "loss": 0.82730657, "num_input_tokens_seen": 197100675, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73046875, "step": 9148, "time_per_iteration": 2.578512668609619 }, { "auxiliary_loss_clip": 0.01129353, "auxiliary_loss_mlp": 0.01038302, "balance_loss_clip": 1.02494454, "balance_loss_mlp": 1.03820086, "epoch": 0.5500676386592515, "flos": 28037507552640.0, "grad_norm": 1.7405571252267653, "language_loss": 0.79111838, "learning_rate": 1.6869444859616323e-06, "loss": 0.81279492, "num_input_tokens_seen": 197121320, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 9149, "time_per_iteration": 2.584547519683838 }, { "auxiliary_loss_clip": 0.01136836, "auxiliary_loss_mlp": 0.01029897, "balance_loss_clip": 1.01664078, "balance_loss_mlp": 1.03671324, "epoch": 0.5501277619119195, "flos": 23914819123200.0, "grad_norm": 1.8960402824874054, "language_loss": 0.71927679, "learning_rate": 1.6865713264265944e-06, "loss": 0.74094409, "num_input_tokens_seen": 197138965, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 9150, "time_per_iteration": 2.6694326400756836 }, { "auxiliary_loss_clip": 0.01131542, "auxiliary_loss_mlp": 0.01033036, "balance_loss_clip": 1.01882601, "balance_loss_mlp": 1.03733706, "epoch": 0.5501878851645874, "flos": 20923783655040.0, "grad_norm": 2.0075509411611048, "language_loss": 0.75394726, "learning_rate": 1.686198178077019e-06, "loss": 0.77559298, "num_input_tokens_seen": 197156460, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76171875, "step": 9151, "time_per_iteration": 2.6389050483703613 }, { "auxiliary_loss_clip": 0.01130783, "auxiliary_loss_mlp": 0.01029108, "balance_loss_clip": 1.01690078, "balance_loss_mlp": 1.03888059, "epoch": 0.5502480084172554, "flos": 20665298407680.0, "grad_norm": 1.6880759794363094, "language_loss": 0.76016897, "learning_rate": 1.685825040926224e-06, "loss": 0.7817679, "num_input_tokens_seen": 197175140, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.73828125, "step": 9152, "time_per_iteration": 2.6673853397369385 }, { "auxiliary_loss_clip": 0.01126698, "auxiliary_loss_mlp": 0.01032081, "balance_loss_clip": 1.01970124, "balance_loss_mlp": 1.03722942, "epoch": 0.5503081316699233, "flos": 26432552056320.0, "grad_norm": 3.142805850483852, "language_loss": 0.82660264, "learning_rate": 1.6854519149875253e-06, "loss": 0.84819043, "num_input_tokens_seen": 197194345, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 9153, "time_per_iteration": 2.63273286819458 }, { "auxiliary_loss_clip": 0.0111928, "auxiliary_loss_mlp": 0.01033306, "balance_loss_clip": 1.02069914, "balance_loss_mlp": 1.03813851, "epoch": 0.5503682549225913, "flos": 30629144718720.0, "grad_norm": 1.8266005918628818, "language_loss": 0.74149561, "learning_rate": 1.6850788002742379e-06, "loss": 0.76302147, "num_input_tokens_seen": 197215535, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 9154, "time_per_iteration": 4.018104553222656 }, { "auxiliary_loss_clip": 0.01134334, "auxiliary_loss_mlp": 0.01035843, "balance_loss_clip": 1.02185392, "balance_loss_mlp": 1.03899479, "epoch": 0.5504283781752592, "flos": 22565439872640.0, "grad_norm": 1.6313978231964879, "language_loss": 0.72373629, "learning_rate": 1.6847056967996786e-06, "loss": 0.7454381, "num_input_tokens_seen": 197234945, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7734375, "step": 9155, "time_per_iteration": 2.609862804412842 }, { "auxiliary_loss_clip": 0.01043014, "auxiliary_loss_mlp": 0.0100334, "balance_loss_clip": 1.00196886, "balance_loss_mlp": 1.01019025, "epoch": 0.5504885014279273, "flos": 67901009270400.0, "grad_norm": 0.7516716960576573, "language_loss": 0.55394965, "learning_rate": 1.6843326045771615e-06, "loss": 0.57441312, "num_input_tokens_seen": 197302285, "router_z_loss_clip": 0.01373291, "router_z_loss_mlp": 0.23535156, "step": 9156, "time_per_iteration": 3.251997470855713 }, { "auxiliary_loss_clip": 0.01118718, "auxiliary_loss_mlp": 0.01033355, "balance_loss_clip": 1.0203259, "balance_loss_mlp": 1.0371629, "epoch": 0.5505486246805952, "flos": 22450058409600.0, "grad_norm": 1.613186717315904, "language_loss": 0.82554495, "learning_rate": 1.6839595236200022e-06, "loss": 0.84706569, "num_input_tokens_seen": 197321575, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 9157, "time_per_iteration": 2.5609986782073975 }, { "auxiliary_loss_clip": 0.01117295, "auxiliary_loss_mlp": 0.0103285, "balance_loss_clip": 1.01818705, "balance_loss_mlp": 1.03990841, "epoch": 0.5506087479332632, "flos": 26906896085760.0, "grad_norm": 2.570586566446852, "language_loss": 0.75538433, "learning_rate": 1.6835864539415145e-06, "loss": 0.77688581, "num_input_tokens_seen": 197340255, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 9158, "time_per_iteration": 4.01320219039917 }, { "auxiliary_loss_clip": 0.01144009, "auxiliary_loss_mlp": 0.01032045, "balance_loss_clip": 1.01966548, "balance_loss_mlp": 1.03857434, "epoch": 0.5506688711859311, "flos": 22930256355840.0, "grad_norm": 1.6274284187665697, "language_loss": 0.69686168, "learning_rate": 1.683213395555012e-06, "loss": 0.71862221, "num_input_tokens_seen": 197360360, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 9159, "time_per_iteration": 2.609358787536621 }, { "auxiliary_loss_clip": 0.01150172, "auxiliary_loss_mlp": 0.01282612, "balance_loss_clip": 1.02224469, "balance_loss_mlp": 1.03953266, "epoch": 0.5507289944385991, "flos": 29606408772480.0, "grad_norm": 1.6039169209250825, "language_loss": 0.68229115, "learning_rate": 1.6828403484738089e-06, "loss": 0.70661902, "num_input_tokens_seen": 197381905, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 9160, "time_per_iteration": 2.707907199859619 }, { "auxiliary_loss_clip": 0.01108733, "auxiliary_loss_mlp": 0.01032366, "balance_loss_clip": 1.01964068, "balance_loss_mlp": 1.0374918, "epoch": 0.5507891176912671, "flos": 15334431091200.0, "grad_norm": 1.9399765726808738, "language_loss": 0.7175051, "learning_rate": 1.6824673127112178e-06, "loss": 0.73891616, "num_input_tokens_seen": 197398555, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 9161, "time_per_iteration": 2.5068767070770264 }, { "auxiliary_loss_clip": 0.01119445, "auxiliary_loss_mlp": 0.01038534, "balance_loss_clip": 1.02536774, "balance_loss_mlp": 1.0378046, "epoch": 0.5508492409439351, "flos": 26578313447040.0, "grad_norm": 1.6499138707354295, "language_loss": 0.69475615, "learning_rate": 1.6820942882805515e-06, "loss": 0.71633601, "num_input_tokens_seen": 197419630, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73046875, "step": 9162, "time_per_iteration": 2.625178575515747 }, { "auxiliary_loss_clip": 0.0111043, "auxiliary_loss_mlp": 0.01040026, "balance_loss_clip": 1.02710366, "balance_loss_mlp": 1.03765404, "epoch": 0.5509093641966031, "flos": 25443428261760.0, "grad_norm": 1.6695876199260957, "language_loss": 0.85867625, "learning_rate": 1.681721275195123e-06, "loss": 0.88018084, "num_input_tokens_seen": 197438480, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 9163, "time_per_iteration": 2.5569887161254883 }, { "auxiliary_loss_clip": 0.01137572, "auxiliary_loss_mlp": 0.01030514, "balance_loss_clip": 1.01808631, "balance_loss_mlp": 1.03883147, "epoch": 0.550969487449271, "flos": 18698543170560.0, "grad_norm": 1.6035450259219748, "language_loss": 0.80248469, "learning_rate": 1.6813482734682426e-06, "loss": 0.82416558, "num_input_tokens_seen": 197456755, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.71875, "step": 9164, "time_per_iteration": 4.148115873336792 }, { "auxiliary_loss_clip": 0.01135089, "auxiliary_loss_mlp": 0.01027661, "balance_loss_clip": 1.01388609, "balance_loss_mlp": 1.04223883, "epoch": 0.551029610701939, "flos": 22708723224960.0, "grad_norm": 1.7204519611445823, "language_loss": 0.73157227, "learning_rate": 1.680975283113223e-06, "loss": 0.75319982, "num_input_tokens_seen": 197475530, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 9165, "time_per_iteration": 4.000838756561279 }, { "auxiliary_loss_clip": 0.01117113, "auxiliary_loss_mlp": 0.01029224, "balance_loss_clip": 1.01704717, "balance_loss_mlp": 1.03646159, "epoch": 0.5510897339546069, "flos": 12420496166400.0, "grad_norm": 2.137074747223024, "language_loss": 0.78632879, "learning_rate": 1.6806023041433745e-06, "loss": 0.80779219, "num_input_tokens_seen": 197490835, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.71484375, "step": 9166, "time_per_iteration": 2.4994335174560547 }, { "auxiliary_loss_clip": 0.01139014, "auxiliary_loss_mlp": 0.01034017, "balance_loss_clip": 1.02094603, "balance_loss_mlp": 1.03753304, "epoch": 0.5511498572072749, "flos": 18770579896320.0, "grad_norm": 1.983838611937219, "language_loss": 0.76088256, "learning_rate": 1.6802293365720087e-06, "loss": 0.78261292, "num_input_tokens_seen": 197508770, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 9167, "time_per_iteration": 2.5836169719696045 }, { "auxiliary_loss_clip": 0.01117172, "auxiliary_loss_mlp": 0.01029038, "balance_loss_clip": 1.01524544, "balance_loss_mlp": 1.03629231, "epoch": 0.5512099804599428, "flos": 19573326996480.0, "grad_norm": 1.9515924541872969, "language_loss": 0.80205238, "learning_rate": 1.679856380412435e-06, "loss": 0.82351446, "num_input_tokens_seen": 197527340, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 9168, "time_per_iteration": 2.539504289627075 }, { "auxiliary_loss_clip": 0.01122599, "auxiliary_loss_mlp": 0.01035221, "balance_loss_clip": 1.02193546, "balance_loss_mlp": 1.03879988, "epoch": 0.5512701037126109, "flos": 26245600744320.0, "grad_norm": 1.669346559522416, "language_loss": 0.68714857, "learning_rate": 1.6794834356779634e-06, "loss": 0.70872676, "num_input_tokens_seen": 197547280, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 9169, "time_per_iteration": 2.5723352432250977 }, { "auxiliary_loss_clip": 0.01115078, "auxiliary_loss_mlp": 0.01026778, "balance_loss_clip": 1.01453543, "balance_loss_mlp": 1.03640068, "epoch": 0.5513302269652788, "flos": 21945406279680.0, "grad_norm": 2.674415633545242, "language_loss": 0.85048115, "learning_rate": 1.6791105023819042e-06, "loss": 0.87189972, "num_input_tokens_seen": 197565045, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69921875, "step": 9170, "time_per_iteration": 2.539700746536255 }, { "auxiliary_loss_clip": 0.01050997, "auxiliary_loss_mlp": 0.01001108, "balance_loss_clip": 0.99959987, "balance_loss_mlp": 1.01010823, "epoch": 0.5513903502179468, "flos": 68235948616320.0, "grad_norm": 0.7829430268966657, "language_loss": 0.5991745, "learning_rate": 1.678737580537565e-06, "loss": 0.61969554, "num_input_tokens_seen": 197625005, "router_z_loss_clip": 0.01507568, "router_z_loss_mlp": 0.234375, "step": 9171, "time_per_iteration": 3.177942991256714 }, { "auxiliary_loss_clip": 0.01135521, "auxiliary_loss_mlp": 0.01033795, "balance_loss_clip": 1.02070022, "balance_loss_mlp": 1.03710592, "epoch": 0.5514504734706147, "flos": 18734238311040.0, "grad_norm": 1.776732009852988, "language_loss": 0.70119482, "learning_rate": 1.6783646701582557e-06, "loss": 0.72288799, "num_input_tokens_seen": 197645050, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 9172, "time_per_iteration": 2.6359803676605225 }, { "auxiliary_loss_clip": 0.01109268, "auxiliary_loss_mlp": 0.01035716, "balance_loss_clip": 1.02282357, "balance_loss_mlp": 1.03674519, "epoch": 0.5515105967232827, "flos": 22270972176000.0, "grad_norm": 1.6615401700740708, "language_loss": 0.75941777, "learning_rate": 1.6779917712572833e-06, "loss": 0.78086758, "num_input_tokens_seen": 197663910, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 9173, "time_per_iteration": 2.517136573791504 }, { "auxiliary_loss_clip": 0.01134387, "auxiliary_loss_mlp": 0.0103272, "balance_loss_clip": 1.01983976, "balance_loss_mlp": 1.03749049, "epoch": 0.5515707199759508, "flos": 22557682535040.0, "grad_norm": 1.7828566752403523, "language_loss": 0.74852991, "learning_rate": 1.677618883847957e-06, "loss": 0.77020103, "num_input_tokens_seen": 197681580, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 9174, "time_per_iteration": 2.563791036605835 }, { "auxiliary_loss_clip": 0.01129316, "auxiliary_loss_mlp": 0.01031208, "balance_loss_clip": 1.01775479, "balance_loss_mlp": 1.03790724, "epoch": 0.5516308432286187, "flos": 28291072636800.0, "grad_norm": 1.6716505588571662, "language_loss": 0.72470391, "learning_rate": 1.6772460079435832e-06, "loss": 0.74630916, "num_input_tokens_seen": 197702095, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73828125, "step": 9175, "time_per_iteration": 2.62040638923645 }, { "auxiliary_loss_clip": 0.01121501, "auxiliary_loss_mlp": 0.01033384, "balance_loss_clip": 1.02079511, "balance_loss_mlp": 1.03872013, "epoch": 0.5516909664812867, "flos": 18764474584320.0, "grad_norm": 1.7168678731949218, "language_loss": 0.69239181, "learning_rate": 1.676873143557469e-06, "loss": 0.71394062, "num_input_tokens_seen": 197720720, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7421875, "step": 9176, "time_per_iteration": 2.59969425201416 }, { "auxiliary_loss_clip": 0.01118896, "auxiliary_loss_mlp": 0.01030839, "balance_loss_clip": 1.01835203, "balance_loss_mlp": 1.03807974, "epoch": 0.5517510897339546, "flos": 27740346336000.0, "grad_norm": 1.5812162731279407, "language_loss": 0.71053803, "learning_rate": 1.6765002907029215e-06, "loss": 0.73203528, "num_input_tokens_seen": 197741820, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 9177, "time_per_iteration": 2.5840110778808594 }, { "auxiliary_loss_clip": 0.01121469, "auxiliary_loss_mlp": 0.01028538, "balance_loss_clip": 1.01604521, "balance_loss_mlp": 1.03968906, "epoch": 0.5518112129866226, "flos": 18404470523520.0, "grad_norm": 1.579593707250422, "language_loss": 0.80212545, "learning_rate": 1.6761274493932466e-06, "loss": 0.82362556, "num_input_tokens_seen": 197759160, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.734375, "step": 9178, "time_per_iteration": 2.5444626808166504 }, { "auxiliary_loss_clip": 0.0115827, "auxiliary_loss_mlp": 0.01040439, "balance_loss_clip": 1.02628279, "balance_loss_mlp": 1.03886664, "epoch": 0.5518713362392905, "flos": 25082670015360.0, "grad_norm": 1.494757614624269, "language_loss": 0.74714875, "learning_rate": 1.6757546196417496e-06, "loss": 0.76913583, "num_input_tokens_seen": 197779760, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75, "step": 9179, "time_per_iteration": 2.589895486831665 }, { "auxiliary_loss_clip": 0.01120493, "auxiliary_loss_mlp": 0.01034432, "balance_loss_clip": 1.02150393, "balance_loss_mlp": 1.03896844, "epoch": 0.5519314594919585, "flos": 36538999361280.0, "grad_norm": 4.0532993749370085, "language_loss": 0.70011055, "learning_rate": 1.6753818014617363e-06, "loss": 0.72165978, "num_input_tokens_seen": 197801545, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 9180, "time_per_iteration": 2.681367874145508 }, { "auxiliary_loss_clip": 0.01120653, "auxiliary_loss_mlp": 0.01038036, "balance_loss_clip": 1.02405286, "balance_loss_mlp": 1.03795123, "epoch": 0.5519915827446265, "flos": 20448613612800.0, "grad_norm": 2.132328373460664, "language_loss": 0.67173809, "learning_rate": 1.6750089948665112e-06, "loss": 0.69332498, "num_input_tokens_seen": 197820760, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.734375, "step": 9181, "time_per_iteration": 2.5186431407928467 }, { "auxiliary_loss_clip": 0.01114154, "auxiliary_loss_mlp": 0.01034504, "balance_loss_clip": 1.02100992, "balance_loss_mlp": 1.03844774, "epoch": 0.5520517059972945, "flos": 23768052151680.0, "grad_norm": 2.2804532059575626, "language_loss": 0.79246277, "learning_rate": 1.6746361998693793e-06, "loss": 0.81394935, "num_input_tokens_seen": 197840195, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 9182, "time_per_iteration": 2.5551581382751465 }, { "auxiliary_loss_clip": 0.01138625, "auxiliary_loss_mlp": 0.01031323, "balance_loss_clip": 1.01846623, "balance_loss_mlp": 1.03770185, "epoch": 0.5521118292499624, "flos": 22196457411840.0, "grad_norm": 2.024846591666777, "language_loss": 0.83117616, "learning_rate": 1.6742634164836442e-06, "loss": 0.85287565, "num_input_tokens_seen": 197859475, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 9183, "time_per_iteration": 2.521808624267578 }, { "auxiliary_loss_clip": 0.01128712, "auxiliary_loss_mlp": 0.01025063, "balance_loss_clip": 1.01189613, "balance_loss_mlp": 1.03864372, "epoch": 0.5521719525026304, "flos": 23583291569280.0, "grad_norm": 1.4338323943724351, "language_loss": 0.67295647, "learning_rate": 1.6738906447226103e-06, "loss": 0.69449425, "num_input_tokens_seen": 197879395, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 9184, "time_per_iteration": 2.628094434738159 }, { "auxiliary_loss_clip": 0.01119514, "auxiliary_loss_mlp": 0.01022991, "balance_loss_clip": 1.01007497, "balance_loss_mlp": 1.0377481, "epoch": 0.5522320757552983, "flos": 26137617482880.0, "grad_norm": 1.6615331276723386, "language_loss": 0.76505792, "learning_rate": 1.6735178845995803e-06, "loss": 0.78648305, "num_input_tokens_seen": 197900815, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 9185, "time_per_iteration": 2.554236888885498 }, { "auxiliary_loss_clip": 0.0113926, "auxiliary_loss_mlp": 0.01034687, "balance_loss_clip": 1.02134109, "balance_loss_mlp": 1.03870058, "epoch": 0.5522921990079663, "flos": 24676160820480.0, "grad_norm": 1.634468719933328, "language_loss": 0.74208343, "learning_rate": 1.673145136127857e-06, "loss": 0.76382291, "num_input_tokens_seen": 197918985, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73828125, "step": 9186, "time_per_iteration": 2.8261473178863525 }, { "auxiliary_loss_clip": 0.0114153, "auxiliary_loss_mlp": 0.0103399, "balance_loss_clip": 1.0213474, "balance_loss_mlp": 1.03912258, "epoch": 0.5523523222606344, "flos": 22748153379840.0, "grad_norm": 1.945773799721147, "language_loss": 0.66266894, "learning_rate": 1.6727723993207432e-06, "loss": 0.68442416, "num_input_tokens_seen": 197937725, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7578125, "step": 9187, "time_per_iteration": 2.7892327308654785 }, { "auxiliary_loss_clip": 0.01128762, "auxiliary_loss_mlp": 0.01028767, "balance_loss_clip": 1.01563597, "balance_loss_mlp": 1.03830743, "epoch": 0.5524124455133023, "flos": 19755825022080.0, "grad_norm": 1.757375605065587, "language_loss": 0.77716029, "learning_rate": 1.6723996741915406e-06, "loss": 0.79873556, "num_input_tokens_seen": 197955635, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 9188, "time_per_iteration": 2.554544448852539 }, { "auxiliary_loss_clip": 0.01130015, "auxiliary_loss_mlp": 0.01033726, "balance_loss_clip": 1.01997519, "balance_loss_mlp": 1.03920853, "epoch": 0.5524725687659703, "flos": 23294821443840.0, "grad_norm": 1.901265486575991, "language_loss": 0.81130344, "learning_rate": 1.672026960753551e-06, "loss": 0.83294088, "num_input_tokens_seen": 197974490, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 9189, "time_per_iteration": 2.539689302444458 }, { "auxiliary_loss_clip": 0.01120046, "auxiliary_loss_mlp": 0.01275563, "balance_loss_clip": 1.01562238, "balance_loss_mlp": 1.03945863, "epoch": 0.5525326920186382, "flos": 24862178378880.0, "grad_norm": 1.3697483546718805, "language_loss": 0.76777023, "learning_rate": 1.6716542590200753e-06, "loss": 0.79172629, "num_input_tokens_seen": 197995735, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 9190, "time_per_iteration": 2.584990978240967 }, { "auxiliary_loss_clip": 0.01115291, "auxiliary_loss_mlp": 0.01035363, "balance_loss_clip": 1.02077734, "balance_loss_mlp": 1.037938, "epoch": 0.5525928152713062, "flos": 13735580906880.0, "grad_norm": 2.304302284647976, "language_loss": 0.78603399, "learning_rate": 1.671281569004415e-06, "loss": 0.80754054, "num_input_tokens_seen": 198009685, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7734375, "step": 9191, "time_per_iteration": 2.5080811977386475 }, { "auxiliary_loss_clip": 0.01157403, "auxiliary_loss_mlp": 0.010321, "balance_loss_clip": 1.01942158, "balance_loss_mlp": 1.03985715, "epoch": 0.5526529385239741, "flos": 13071592045440.0, "grad_norm": 1.6780638936911527, "language_loss": 0.68508768, "learning_rate": 1.6709088907198698e-06, "loss": 0.70698273, "num_input_tokens_seen": 198026845, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 9192, "time_per_iteration": 2.561836004257202 }, { "auxiliary_loss_clip": 0.0111896, "auxiliary_loss_mlp": 0.01032874, "balance_loss_clip": 1.02105451, "balance_loss_mlp": 1.03924823, "epoch": 0.5527130617766421, "flos": 23148377694720.0, "grad_norm": 1.4692189197902725, "language_loss": 0.77460468, "learning_rate": 1.6705362241797398e-06, "loss": 0.79612303, "num_input_tokens_seen": 198045275, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.70703125, "step": 9193, "time_per_iteration": 2.5842394828796387 }, { "auxiliary_loss_clip": 0.01138691, "auxiliary_loss_mlp": 0.01037941, "balance_loss_clip": 1.02484012, "balance_loss_mlp": 1.03889048, "epoch": 0.55277318502931, "flos": 21285547482240.0, "grad_norm": 4.096439751675797, "language_loss": 0.78055489, "learning_rate": 1.6701635693973245e-06, "loss": 0.8023212, "num_input_tokens_seen": 198065760, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 9194, "time_per_iteration": 2.5680739879608154 }, { "auxiliary_loss_clip": 0.01142299, "auxiliary_loss_mlp": 0.01036128, "balance_loss_clip": 1.02244854, "balance_loss_mlp": 1.03749537, "epoch": 0.5528333082819781, "flos": 38324549462400.0, "grad_norm": 5.139580678999246, "language_loss": 0.69621491, "learning_rate": 1.6697909263859226e-06, "loss": 0.71799922, "num_input_tokens_seen": 198087595, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 9195, "time_per_iteration": 2.758127450942993 }, { "auxiliary_loss_clip": 0.01137572, "auxiliary_loss_mlp": 0.0103383, "balance_loss_clip": 1.01956105, "balance_loss_mlp": 1.04053438, "epoch": 0.552893431534646, "flos": 13553621585280.0, "grad_norm": 2.272812678742069, "language_loss": 0.7404207, "learning_rate": 1.6694182951588335e-06, "loss": 0.76213479, "num_input_tokens_seen": 198104620, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.79296875, "step": 9196, "time_per_iteration": 3.858572244644165 }, { "auxiliary_loss_clip": 0.01122123, "auxiliary_loss_mlp": 0.01034619, "balance_loss_clip": 1.02194095, "balance_loss_mlp": 1.03972697, "epoch": 0.552953554787314, "flos": 21939408708480.0, "grad_norm": 1.7210161990770414, "language_loss": 0.77039385, "learning_rate": 1.669045675729355e-06, "loss": 0.79196131, "num_input_tokens_seen": 198123565, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 9197, "time_per_iteration": 2.5483744144439697 }, { "auxiliary_loss_clip": 0.01127474, "auxiliary_loss_mlp": 0.01028151, "balance_loss_clip": 1.01623631, "balance_loss_mlp": 1.03689289, "epoch": 0.5530136780399819, "flos": 43658002558080.0, "grad_norm": 1.536953527945337, "language_loss": 0.76394975, "learning_rate": 1.6686730681107849e-06, "loss": 0.78550595, "num_input_tokens_seen": 198148270, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7265625, "step": 9198, "time_per_iteration": 2.7587811946868896 }, { "auxiliary_loss_clip": 0.01109079, "auxiliary_loss_mlp": 0.01026877, "balance_loss_clip": 1.0148133, "balance_loss_mlp": 1.03644514, "epoch": 0.5530738012926499, "flos": 25045502417280.0, "grad_norm": 1.6763776679100082, "language_loss": 0.79298478, "learning_rate": 1.6683004723164208e-06, "loss": 0.81434435, "num_input_tokens_seen": 198168810, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.7265625, "step": 9199, "time_per_iteration": 2.601065158843994 }, { "auxiliary_loss_clip": 0.0113751, "auxiliary_loss_mlp": 0.01034874, "balance_loss_clip": 1.02292943, "balance_loss_mlp": 1.03697252, "epoch": 0.553133924545318, "flos": 16472081623680.0, "grad_norm": 2.1629599718937493, "language_loss": 0.63972437, "learning_rate": 1.6679278883595592e-06, "loss": 0.66144824, "num_input_tokens_seen": 198186200, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.734375, "step": 9200, "time_per_iteration": 3.9544003009796143 }, { "auxiliary_loss_clip": 0.01134701, "auxiliary_loss_mlp": 0.01034685, "balance_loss_clip": 1.02021337, "balance_loss_mlp": 1.04011893, "epoch": 0.5531940477979859, "flos": 24606207083520.0, "grad_norm": 6.4796185563425635, "language_loss": 0.66029024, "learning_rate": 1.6675553162534977e-06, "loss": 0.68198407, "num_input_tokens_seen": 198207050, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76953125, "step": 9201, "time_per_iteration": 2.5542349815368652 }, { "auxiliary_loss_clip": 0.0112236, "auxiliary_loss_mlp": 0.01034678, "balance_loss_clip": 1.02114773, "balance_loss_mlp": 1.0408206, "epoch": 0.5532541710506539, "flos": 22159577122560.0, "grad_norm": 2.2813548765047615, "language_loss": 0.60854852, "learning_rate": 1.667182756011532e-06, "loss": 0.63011897, "num_input_tokens_seen": 198224565, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 9202, "time_per_iteration": 2.5548441410064697 }, { "auxiliary_loss_clip": 0.01117756, "auxiliary_loss_mlp": 0.01277836, "balance_loss_clip": 1.01828635, "balance_loss_mlp": 1.03809345, "epoch": 0.5533142943033218, "flos": 21397265758080.0, "grad_norm": 1.5003402543675797, "language_loss": 0.63906014, "learning_rate": 1.6668102076469567e-06, "loss": 0.66301608, "num_input_tokens_seen": 198244790, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 9203, "time_per_iteration": 2.535998821258545 }, { "auxiliary_loss_clip": 0.01119137, "auxiliary_loss_mlp": 0.01029724, "balance_loss_clip": 1.01744556, "balance_loss_mlp": 1.03690302, "epoch": 0.5533744175559898, "flos": 23550541344000.0, "grad_norm": 1.5464912644961768, "language_loss": 0.6376856, "learning_rate": 1.6664376711730687e-06, "loss": 0.6591742, "num_input_tokens_seen": 198264375, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.734375, "step": 9204, "time_per_iteration": 2.596857786178589 }, { "auxiliary_loss_clip": 0.01116409, "auxiliary_loss_mlp": 0.01029989, "balance_loss_clip": 1.01793694, "balance_loss_mlp": 1.03574729, "epoch": 0.5534345408086577, "flos": 24061514267520.0, "grad_norm": 1.6735928267098943, "language_loss": 0.77155012, "learning_rate": 1.6660651466031616e-06, "loss": 0.79301411, "num_input_tokens_seen": 198283895, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.71875, "step": 9205, "time_per_iteration": 2.505406141281128 }, { "auxiliary_loss_clip": 0.01145098, "auxiliary_loss_mlp": 0.01036223, "balance_loss_clip": 1.02362275, "balance_loss_mlp": 1.03791976, "epoch": 0.5534946640613257, "flos": 33771831408000.0, "grad_norm": 1.3912243223526137, "language_loss": 0.72345805, "learning_rate": 1.6656926339505311e-06, "loss": 0.74527133, "num_input_tokens_seen": 198310035, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 9206, "time_per_iteration": 5.695149660110474 }, { "auxiliary_loss_clip": 0.01146917, "auxiliary_loss_mlp": 0.0103262, "balance_loss_clip": 1.02050209, "balance_loss_mlp": 1.03868818, "epoch": 0.5535547873139937, "flos": 15159223526400.0, "grad_norm": 1.8352013752929894, "language_loss": 0.7552346, "learning_rate": 1.6653201332284705e-06, "loss": 0.77702999, "num_input_tokens_seen": 198327810, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.72265625, "step": 9207, "time_per_iteration": 2.5584821701049805 }, { "auxiliary_loss_clip": 0.01143191, "auxiliary_loss_mlp": 0.01032424, "balance_loss_clip": 1.01829791, "balance_loss_mlp": 1.04021013, "epoch": 0.5536149105666617, "flos": 16980863817600.0, "grad_norm": 2.646586381276633, "language_loss": 0.60768306, "learning_rate": 1.6649476444502734e-06, "loss": 0.62943918, "num_input_tokens_seen": 198343150, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 9208, "time_per_iteration": 2.5239040851593018 }, { "auxiliary_loss_clip": 0.0111145, "auxiliary_loss_mlp": 0.01033952, "balance_loss_clip": 1.02139294, "balance_loss_mlp": 1.03734493, "epoch": 0.5536750338193296, "flos": 18149935772160.0, "grad_norm": 3.3998805236113134, "language_loss": 0.64241308, "learning_rate": 1.664575167629233e-06, "loss": 0.66386712, "num_input_tokens_seen": 198360925, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73828125, "step": 9209, "time_per_iteration": 2.530482769012451 }, { "auxiliary_loss_clip": 0.01137179, "auxiliary_loss_mlp": 0.01036228, "balance_loss_clip": 1.02284718, "balance_loss_mlp": 1.03814781, "epoch": 0.5537351570719976, "flos": 22747794243840.0, "grad_norm": 1.8538649471817727, "language_loss": 0.82653713, "learning_rate": 1.6642027027786415e-06, "loss": 0.84827119, "num_input_tokens_seen": 198379265, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 9210, "time_per_iteration": 2.544599771499634 }, { "auxiliary_loss_clip": 0.01124029, "auxiliary_loss_mlp": 0.01029971, "balance_loss_clip": 1.01734114, "balance_loss_mlp": 1.03570247, "epoch": 0.5537952803246655, "flos": 26356026130560.0, "grad_norm": 1.6954372363359598, "language_loss": 0.72861648, "learning_rate": 1.6638302499117924e-06, "loss": 0.75015652, "num_input_tokens_seen": 198399490, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 9211, "time_per_iteration": 2.5979275703430176 }, { "auxiliary_loss_clip": 0.01140724, "auxiliary_loss_mlp": 0.01037447, "balance_loss_clip": 1.0233748, "balance_loss_mlp": 1.03829789, "epoch": 0.5538554035773335, "flos": 18037427397120.0, "grad_norm": 3.136684262453476, "language_loss": 0.66724014, "learning_rate": 1.6634578090419766e-06, "loss": 0.68902189, "num_input_tokens_seen": 198419110, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 9212, "time_per_iteration": 2.533236265182495 }, { "auxiliary_loss_clip": 0.01138776, "auxiliary_loss_mlp": 0.01033171, "balance_loss_clip": 1.01862717, "balance_loss_mlp": 1.03660977, "epoch": 0.5539155268300014, "flos": 31686247002240.0, "grad_norm": 1.5182259749315865, "language_loss": 0.51768845, "learning_rate": 1.663085380182486e-06, "loss": 0.53940791, "num_input_tokens_seen": 198441360, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7578125, "step": 9213, "time_per_iteration": 2.6279690265655518 }, { "auxiliary_loss_clip": 0.01141595, "auxiliary_loss_mlp": 0.01037225, "balance_loss_clip": 1.02402246, "balance_loss_mlp": 1.03980684, "epoch": 0.5539756500826695, "flos": 15193769431680.0, "grad_norm": 2.1620256895552785, "language_loss": 0.8571378, "learning_rate": 1.6627129633466117e-06, "loss": 0.87892598, "num_input_tokens_seen": 198459835, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 9214, "time_per_iteration": 2.5644328594207764 }, { "auxiliary_loss_clip": 0.01118419, "auxiliary_loss_mlp": 0.01033757, "balance_loss_clip": 1.02188396, "balance_loss_mlp": 1.03824031, "epoch": 0.5540357733353375, "flos": 26353117128960.0, "grad_norm": 1.6769170362465105, "language_loss": 0.70017844, "learning_rate": 1.6623405585476438e-06, "loss": 0.72170019, "num_input_tokens_seen": 198478955, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.7109375, "step": 9215, "time_per_iteration": 2.571159839630127 }, { "auxiliary_loss_clip": 0.01130812, "auxiliary_loss_mlp": 0.0102997, "balance_loss_clip": 1.01660693, "balance_loss_mlp": 1.03839195, "epoch": 0.5540958965880054, "flos": 21323684747520.0, "grad_norm": 1.5662470208018775, "language_loss": 0.72974682, "learning_rate": 1.6619681657988732e-06, "loss": 0.75135463, "num_input_tokens_seen": 198499030, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 9216, "time_per_iteration": 2.568647861480713 }, { "auxiliary_loss_clip": 0.01125954, "auxiliary_loss_mlp": 0.0103504, "balance_loss_clip": 1.02124774, "balance_loss_mlp": 1.03772855, "epoch": 0.5541560198406734, "flos": 25666828899840.0, "grad_norm": 1.8547949857513355, "language_loss": 0.71870852, "learning_rate": 1.661595785113589e-06, "loss": 0.74031848, "num_input_tokens_seen": 198520265, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7109375, "step": 9217, "time_per_iteration": 2.6308465003967285 }, { "auxiliary_loss_clip": 0.01115387, "auxiliary_loss_mlp": 0.01028241, "balance_loss_clip": 1.01555717, "balance_loss_mlp": 1.03544033, "epoch": 0.5542161430933413, "flos": 21939624190080.0, "grad_norm": 2.1318201981283647, "language_loss": 0.7801066, "learning_rate": 1.6612234165050808e-06, "loss": 0.80154288, "num_input_tokens_seen": 198539645, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 9218, "time_per_iteration": 2.5764124393463135 }, { "auxiliary_loss_clip": 0.0113931, "auxiliary_loss_mlp": 0.01036451, "balance_loss_clip": 1.02204418, "balance_loss_mlp": 1.03636932, "epoch": 0.5542762663460093, "flos": 19571459489280.0, "grad_norm": 1.6176972437261619, "language_loss": 0.72514784, "learning_rate": 1.6608510599866374e-06, "loss": 0.74690539, "num_input_tokens_seen": 198558710, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 9219, "time_per_iteration": 2.5262374877929688 }, { "auxiliary_loss_clip": 0.01124359, "auxiliary_loss_mlp": 0.01038993, "balance_loss_clip": 1.02555251, "balance_loss_mlp": 1.0403893, "epoch": 0.5543363895986773, "flos": 19499063627520.0, "grad_norm": 1.5537347966535398, "language_loss": 0.71503693, "learning_rate": 1.6604787155715471e-06, "loss": 0.73667043, "num_input_tokens_seen": 198577050, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 9220, "time_per_iteration": 2.5942461490631104 }, { "auxiliary_loss_clip": 0.01109435, "auxiliary_loss_mlp": 0.01027462, "balance_loss_clip": 1.01517713, "balance_loss_mlp": 1.03795958, "epoch": 0.5543965128513453, "flos": 22635609091200.0, "grad_norm": 1.6332050640184814, "language_loss": 0.79428518, "learning_rate": 1.6601063832730984e-06, "loss": 0.8156541, "num_input_tokens_seen": 198595290, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71484375, "step": 9221, "time_per_iteration": 2.4988396167755127 }, { "auxiliary_loss_clip": 0.01138509, "auxiliary_loss_mlp": 0.01033926, "balance_loss_clip": 1.02099741, "balance_loss_mlp": 1.03917205, "epoch": 0.5544566361040132, "flos": 25989952671360.0, "grad_norm": 1.7894889352909988, "language_loss": 0.83586746, "learning_rate": 1.6597340631045783e-06, "loss": 0.85759181, "num_input_tokens_seen": 198614110, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 9222, "time_per_iteration": 2.6978774070739746 }, { "auxiliary_loss_clip": 0.0114102, "auxiliary_loss_mlp": 0.01283051, "balance_loss_clip": 1.02208614, "balance_loss_mlp": 1.03861666, "epoch": 0.5545167593566812, "flos": 28257568225920.0, "grad_norm": 8.105663632341587, "language_loss": 0.75451279, "learning_rate": 1.6593617550792749e-06, "loss": 0.77875352, "num_input_tokens_seen": 198633880, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 9223, "time_per_iteration": 2.6251378059387207 }, { "auxiliary_loss_clip": 0.01128578, "auxiliary_loss_mlp": 0.01030143, "balance_loss_clip": 1.01710796, "balance_loss_mlp": 1.03883123, "epoch": 0.5545768826093491, "flos": 28476551491200.0, "grad_norm": 1.5941860796922003, "language_loss": 0.8252601, "learning_rate": 1.6589894592104738e-06, "loss": 0.8468473, "num_input_tokens_seen": 198653505, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 9224, "time_per_iteration": 2.6619064807891846 }, { "auxiliary_loss_clip": 0.01123041, "auxiliary_loss_mlp": 0.01040282, "balance_loss_clip": 1.02716875, "balance_loss_mlp": 1.03893495, "epoch": 0.5546370058620171, "flos": 18478051534080.0, "grad_norm": 2.0754375952987196, "language_loss": 0.57149255, "learning_rate": 1.6586171755114614e-06, "loss": 0.59312582, "num_input_tokens_seen": 198671890, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 9225, "time_per_iteration": 2.525219440460205 }, { "auxiliary_loss_clip": 0.01120658, "auxiliary_loss_mlp": 0.01035995, "balance_loss_clip": 1.02273273, "balance_loss_mlp": 1.03752255, "epoch": 0.554697129114685, "flos": 22930507751040.0, "grad_norm": 1.6108227664385013, "language_loss": 0.67969692, "learning_rate": 1.6582449039955242e-06, "loss": 0.70126343, "num_input_tokens_seen": 198691995, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 9226, "time_per_iteration": 2.6156203746795654 }, { "auxiliary_loss_clip": 0.01137752, "auxiliary_loss_mlp": 0.01033219, "balance_loss_clip": 1.01992691, "balance_loss_mlp": 1.03745019, "epoch": 0.5547572523673531, "flos": 21797166850560.0, "grad_norm": 1.4459611004570667, "language_loss": 0.74561453, "learning_rate": 1.657872644675947e-06, "loss": 0.76732433, "num_input_tokens_seen": 198712440, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 9227, "time_per_iteration": 2.560969114303589 }, { "auxiliary_loss_clip": 0.01141887, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.02107167, "balance_loss_mlp": 1.04102826, "epoch": 0.5548173756200211, "flos": 22342829333760.0, "grad_norm": 1.9838216919762568, "language_loss": 0.73326445, "learning_rate": 1.6575003975660154e-06, "loss": 0.75502789, "num_input_tokens_seen": 198731515, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.74609375, "step": 9228, "time_per_iteration": 2.6488053798675537 }, { "auxiliary_loss_clip": 0.01112527, "auxiliary_loss_mlp": 0.01034519, "balance_loss_clip": 1.02133477, "balance_loss_mlp": 1.03835607, "epoch": 0.554877498872689, "flos": 17858736213120.0, "grad_norm": 1.6593878501773378, "language_loss": 0.75938928, "learning_rate": 1.657128162679013e-06, "loss": 0.78085983, "num_input_tokens_seen": 198749750, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 9229, "time_per_iteration": 2.4679605960845947 }, { "auxiliary_loss_clip": 0.01120257, "auxiliary_loss_mlp": 0.0128649, "balance_loss_clip": 1.02604973, "balance_loss_mlp": 1.03934348, "epoch": 0.554937622125357, "flos": 17238343484160.0, "grad_norm": 1.501665695215213, "language_loss": 0.68904531, "learning_rate": 1.6567559400282248e-06, "loss": 0.71311283, "num_input_tokens_seen": 198768320, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 9230, "time_per_iteration": 2.6829583644866943 }, { "auxiliary_loss_clip": 0.01146926, "auxiliary_loss_mlp": 0.01036328, "balance_loss_clip": 1.02244568, "balance_loss_mlp": 1.0367465, "epoch": 0.5549977453780249, "flos": 25368087484800.0, "grad_norm": 2.1259396791319567, "language_loss": 0.68060768, "learning_rate": 1.6563837296269347e-06, "loss": 0.7024402, "num_input_tokens_seen": 198787230, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 9231, "time_per_iteration": 2.5978293418884277 }, { "auxiliary_loss_clip": 0.0111543, "auxiliary_loss_mlp": 0.0103522, "balance_loss_clip": 1.02089131, "balance_loss_mlp": 1.03906107, "epoch": 0.555057868630693, "flos": 25079114568960.0, "grad_norm": 1.6491831949907225, "language_loss": 0.78209758, "learning_rate": 1.6560115314884247e-06, "loss": 0.80360407, "num_input_tokens_seen": 198806720, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76171875, "step": 9232, "time_per_iteration": 2.6098082065582275 }, { "auxiliary_loss_clip": 0.01136717, "auxiliary_loss_mlp": 0.01033351, "balance_loss_clip": 1.02139425, "balance_loss_mlp": 1.03729224, "epoch": 0.5551179918833609, "flos": 26104220812800.0, "grad_norm": 1.6190428174108902, "language_loss": 0.82815254, "learning_rate": 1.6556393456259787e-06, "loss": 0.84985316, "num_input_tokens_seen": 198826235, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.72265625, "step": 9233, "time_per_iteration": 2.5969173908233643 }, { "auxiliary_loss_clip": 0.0112408, "auxiliary_loss_mlp": 0.01034105, "balance_loss_clip": 1.02120686, "balance_loss_mlp": 1.03835869, "epoch": 0.5551781151360289, "flos": 19384759572480.0, "grad_norm": 1.991139174754361, "language_loss": 0.75023925, "learning_rate": 1.6552671720528783e-06, "loss": 0.77182114, "num_input_tokens_seen": 198842655, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.765625, "step": 9234, "time_per_iteration": 2.602895975112915 }, { "auxiliary_loss_clip": 0.01126125, "auxiliary_loss_mlp": 0.01028984, "balance_loss_clip": 1.01689625, "balance_loss_mlp": 1.03690004, "epoch": 0.5552382383886968, "flos": 21725956137600.0, "grad_norm": 1.9812786328582492, "language_loss": 0.65085781, "learning_rate": 1.6548950107824062e-06, "loss": 0.67240894, "num_input_tokens_seen": 198861210, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.71875, "step": 9235, "time_per_iteration": 2.5491108894348145 }, { "auxiliary_loss_clip": 0.01146977, "auxiliary_loss_mlp": 0.01031068, "balance_loss_clip": 1.01778817, "balance_loss_mlp": 1.0365212, "epoch": 0.5552983616413648, "flos": 14356189117440.0, "grad_norm": 1.87002731704967, "language_loss": 0.6842221, "learning_rate": 1.6545228618278434e-06, "loss": 0.70600259, "num_input_tokens_seen": 198880045, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.74609375, "step": 9236, "time_per_iteration": 2.5761804580688477 }, { "auxiliary_loss_clip": 0.01106326, "auxiliary_loss_mlp": 0.01027674, "balance_loss_clip": 1.01468658, "balance_loss_mlp": 1.03651345, "epoch": 0.5553584848940327, "flos": 25478548784640.0, "grad_norm": 1.829135266692625, "language_loss": 0.86040235, "learning_rate": 1.6541507252024706e-06, "loss": 0.8817423, "num_input_tokens_seen": 198900210, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 9237, "time_per_iteration": 3.9128682613372803 }, { "auxiliary_loss_clip": 0.01115741, "auxiliary_loss_mlp": 0.01033577, "balance_loss_clip": 1.02070236, "balance_loss_mlp": 1.03595102, "epoch": 0.5554186081467007, "flos": 22163850840960.0, "grad_norm": 4.273757237929603, "language_loss": 0.73167336, "learning_rate": 1.6537786009195695e-06, "loss": 0.75316656, "num_input_tokens_seen": 198919055, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 9238, "time_per_iteration": 2.502485513687134 }, { "auxiliary_loss_clip": 0.0112052, "auxiliary_loss_mlp": 0.01031328, "balance_loss_clip": 1.01835251, "balance_loss_mlp": 1.03677154, "epoch": 0.5554787313993687, "flos": 49746656125440.0, "grad_norm": 1.5413303252130328, "language_loss": 0.78579557, "learning_rate": 1.653406488992419e-06, "loss": 0.80731404, "num_input_tokens_seen": 198943505, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.74609375, "step": 9239, "time_per_iteration": 2.871868848800659 }, { "auxiliary_loss_clip": 0.0112948, "auxiliary_loss_mlp": 0.01027748, "balance_loss_clip": 1.01470077, "balance_loss_mlp": 1.03821397, "epoch": 0.5555388546520367, "flos": 22127365601280.0, "grad_norm": 1.6305106004401277, "language_loss": 0.79782104, "learning_rate": 1.6530343894342994e-06, "loss": 0.81939334, "num_input_tokens_seen": 198963590, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 9240, "time_per_iteration": 2.551053524017334 }, { "auxiliary_loss_clip": 0.01127503, "auxiliary_loss_mlp": 0.01030883, "balance_loss_clip": 1.01764452, "balance_loss_mlp": 1.03699613, "epoch": 0.5555989779047047, "flos": 24682122478080.0, "grad_norm": 1.6652881321850568, "language_loss": 0.65261942, "learning_rate": 1.6526623022584902e-06, "loss": 0.67420328, "num_input_tokens_seen": 198982680, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 9241, "time_per_iteration": 4.003798484802246 }, { "auxiliary_loss_clip": 0.01120605, "auxiliary_loss_mlp": 0.01277878, "balance_loss_clip": 1.01823068, "balance_loss_mlp": 1.03771567, "epoch": 0.5556591011573726, "flos": 16106510954880.0, "grad_norm": 1.9029126082497243, "language_loss": 0.72326821, "learning_rate": 1.6522902274782696e-06, "loss": 0.747253, "num_input_tokens_seen": 199000185, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7421875, "step": 9242, "time_per_iteration": 2.570317506790161 }, { "auxiliary_loss_clip": 0.01128946, "auxiliary_loss_mlp": 0.01034608, "balance_loss_clip": 1.0203445, "balance_loss_mlp": 1.03749514, "epoch": 0.5557192244100406, "flos": 12933695733120.0, "grad_norm": 1.9226530926623004, "language_loss": 0.63814497, "learning_rate": 1.6519181651069167e-06, "loss": 0.6597805, "num_input_tokens_seen": 199018380, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.734375, "step": 9243, "time_per_iteration": 2.5588667392730713 }, { "auxiliary_loss_clip": 0.01130009, "auxiliary_loss_mlp": 0.01278849, "balance_loss_clip": 1.01906037, "balance_loss_mlp": 1.03956223, "epoch": 0.5557793476627085, "flos": 23111712887040.0, "grad_norm": 5.491067349329589, "language_loss": 0.75885606, "learning_rate": 1.6515461151577085e-06, "loss": 0.78294456, "num_input_tokens_seen": 199037115, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 9244, "time_per_iteration": 2.6175405979156494 }, { "auxiliary_loss_clip": 0.01122592, "auxiliary_loss_mlp": 0.01032415, "balance_loss_clip": 1.02011871, "balance_loss_mlp": 1.03542233, "epoch": 0.5558394709153766, "flos": 21428040735360.0, "grad_norm": 1.9048462066046947, "language_loss": 0.74510813, "learning_rate": 1.6511740776439238e-06, "loss": 0.76665825, "num_input_tokens_seen": 199053375, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 9245, "time_per_iteration": 2.5394794940948486 }, { "auxiliary_loss_clip": 0.01132549, "auxiliary_loss_mlp": 0.01030706, "balance_loss_clip": 1.01673436, "balance_loss_mlp": 1.03789473, "epoch": 0.5558995941680445, "flos": 25078324469760.0, "grad_norm": 2.0660758044392566, "language_loss": 0.79751575, "learning_rate": 1.6508020525788388e-06, "loss": 0.8191483, "num_input_tokens_seen": 199070930, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 9246, "time_per_iteration": 2.603508472442627 }, { "auxiliary_loss_clip": 0.01110961, "auxiliary_loss_mlp": 0.01036276, "balance_loss_clip": 1.02328861, "balance_loss_mlp": 1.03833103, "epoch": 0.5559597174207125, "flos": 20011149872640.0, "grad_norm": 2.1119591874084693, "language_loss": 0.74001944, "learning_rate": 1.65043003997573e-06, "loss": 0.76149189, "num_input_tokens_seen": 199088675, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 9247, "time_per_iteration": 4.1681342124938965 }, { "auxiliary_loss_clip": 0.0112899, "auxiliary_loss_mlp": 0.01035736, "balance_loss_clip": 1.02155614, "balance_loss_mlp": 1.03748941, "epoch": 0.5560198406733804, "flos": 16835677044480.0, "grad_norm": 2.5237195390531593, "language_loss": 0.75545722, "learning_rate": 1.6500580398478743e-06, "loss": 0.7771045, "num_input_tokens_seen": 199103075, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.734375, "step": 9248, "time_per_iteration": 4.00269889831543 }, { "auxiliary_loss_clip": 0.01069371, "auxiliary_loss_mlp": 0.01008851, "balance_loss_clip": 1.00739086, "balance_loss_mlp": 1.01091146, "epoch": 0.5560799639260484, "flos": 70697051758080.0, "grad_norm": 0.8421404328151493, "language_loss": 0.59508902, "learning_rate": 1.6496860522085466e-06, "loss": 0.61587125, "num_input_tokens_seen": 199160325, "router_z_loss_clip": 0.0145874, "router_z_loss_mlp": 0.23046875, "step": 9249, "time_per_iteration": 3.2859935760498047 }, { "auxiliary_loss_clip": 0.0110674, "auxiliary_loss_mlp": 0.01027819, "balance_loss_clip": 1.01456308, "balance_loss_mlp": 1.03375566, "epoch": 0.5561400871787163, "flos": 23148593176320.0, "grad_norm": 1.9913398024390683, "language_loss": 0.79553074, "learning_rate": 1.6493140770710228e-06, "loss": 0.81687635, "num_input_tokens_seen": 199179760, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 9250, "time_per_iteration": 2.5213332176208496 }, { "auxiliary_loss_clip": 0.01128453, "auxiliary_loss_mlp": 0.0103517, "balance_loss_clip": 1.02122831, "balance_loss_mlp": 1.03550327, "epoch": 0.5562002104313843, "flos": 17566423332480.0, "grad_norm": 2.0291843285388738, "language_loss": 0.69368172, "learning_rate": 1.6489421144485773e-06, "loss": 0.71531796, "num_input_tokens_seen": 199196695, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 9251, "time_per_iteration": 2.568192720413208 }, { "auxiliary_loss_clip": 0.01118911, "auxiliary_loss_mlp": 0.01033833, "balance_loss_clip": 1.02102947, "balance_loss_mlp": 1.03741574, "epoch": 0.5562603336840523, "flos": 25045430590080.0, "grad_norm": 1.6894635610797877, "language_loss": 0.7515555, "learning_rate": 1.6485701643544852e-06, "loss": 0.77308297, "num_input_tokens_seen": 199217845, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 9252, "time_per_iteration": 2.6614584922790527 }, { "auxiliary_loss_clip": 0.01122975, "auxiliary_loss_mlp": 0.01036543, "balance_loss_clip": 1.02337646, "balance_loss_mlp": 1.03811419, "epoch": 0.5563204569367203, "flos": 29059022436480.0, "grad_norm": 1.5675686842089955, "language_loss": 0.72597659, "learning_rate": 1.6481982268020196e-06, "loss": 0.74757171, "num_input_tokens_seen": 199239250, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7578125, "step": 9253, "time_per_iteration": 2.5827836990356445 }, { "auxiliary_loss_clip": 0.01137707, "auxiliary_loss_mlp": 0.01029151, "balance_loss_clip": 1.01699805, "balance_loss_mlp": 1.03899145, "epoch": 0.5563805801893883, "flos": 22090449398400.0, "grad_norm": 1.7185093739042492, "language_loss": 0.82960647, "learning_rate": 1.6478263018044546e-06, "loss": 0.85127503, "num_input_tokens_seen": 199258320, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71875, "step": 9254, "time_per_iteration": 2.6096882820129395 }, { "auxiliary_loss_clip": 0.01052082, "auxiliary_loss_mlp": 0.01002679, "balance_loss_clip": 1.00121295, "balance_loss_mlp": 1.01068282, "epoch": 0.5564407034420562, "flos": 58636128689280.0, "grad_norm": 0.8620421069765599, "language_loss": 0.64820737, "learning_rate": 1.647454389375063e-06, "loss": 0.66875499, "num_input_tokens_seen": 199314840, "router_z_loss_clip": 0.01464844, "router_z_loss_mlp": 0.23046875, "step": 9255, "time_per_iteration": 2.9835453033447266 }, { "auxiliary_loss_clip": 0.01122067, "auxiliary_loss_mlp": 0.01037733, "balance_loss_clip": 1.02509069, "balance_loss_mlp": 1.04222417, "epoch": 0.5565008266947242, "flos": 23112323418240.0, "grad_norm": 1.9174074021292271, "language_loss": 0.6912533, "learning_rate": 1.6470824895271168e-06, "loss": 0.71285129, "num_input_tokens_seen": 199335405, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 9256, "time_per_iteration": 2.594890832901001 }, { "auxiliary_loss_clip": 0.01134016, "auxiliary_loss_mlp": 0.01031698, "balance_loss_clip": 1.01983666, "balance_loss_mlp": 1.03709626, "epoch": 0.5565609499473921, "flos": 21578399066880.0, "grad_norm": 1.6375684245699977, "language_loss": 0.75619024, "learning_rate": 1.6467106022738896e-06, "loss": 0.77784741, "num_input_tokens_seen": 199354345, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.70703125, "step": 9257, "time_per_iteration": 2.529348373413086 }, { "auxiliary_loss_clip": 0.01147306, "auxiliary_loss_mlp": 0.01035421, "balance_loss_clip": 1.02285624, "balance_loss_mlp": 1.0369432, "epoch": 0.5566210732000602, "flos": 18369637309440.0, "grad_norm": 2.0565099088445407, "language_loss": 0.61194623, "learning_rate": 1.6463387276286518e-06, "loss": 0.63377357, "num_input_tokens_seen": 199372250, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.75, "step": 9258, "time_per_iteration": 2.524381637573242 }, { "auxiliary_loss_clip": 0.01137719, "auxiliary_loss_mlp": 0.01034791, "balance_loss_clip": 1.02059364, "balance_loss_mlp": 1.03790975, "epoch": 0.5566811964527281, "flos": 25703350053120.0, "grad_norm": 1.6497686956094333, "language_loss": 0.78969848, "learning_rate": 1.6459668656046746e-06, "loss": 0.8114236, "num_input_tokens_seen": 199392815, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.73046875, "step": 9259, "time_per_iteration": 2.5452563762664795 }, { "auxiliary_loss_clip": 0.01127549, "auxiliary_loss_mlp": 0.01032106, "balance_loss_clip": 1.01896334, "balance_loss_mlp": 1.03639758, "epoch": 0.5567413197053961, "flos": 26943991856640.0, "grad_norm": 1.925781438792092, "language_loss": 0.81413317, "learning_rate": 1.64559501621523e-06, "loss": 0.83572972, "num_input_tokens_seen": 199412375, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 9260, "time_per_iteration": 2.586026430130005 }, { "auxiliary_loss_clip": 0.01110657, "auxiliary_loss_mlp": 0.01037337, "balance_loss_clip": 1.02402163, "balance_loss_mlp": 1.03785598, "epoch": 0.556801442958064, "flos": 20850597694080.0, "grad_norm": 1.4695653021777637, "language_loss": 0.68824375, "learning_rate": 1.6452231794735872e-06, "loss": 0.70972371, "num_input_tokens_seen": 199431490, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 9261, "time_per_iteration": 2.472789764404297 }, { "auxiliary_loss_clip": 0.01060184, "auxiliary_loss_mlp": 0.01001664, "balance_loss_clip": 1.00012612, "balance_loss_mlp": 1.01080704, "epoch": 0.556861566210732, "flos": 70498213044480.0, "grad_norm": 0.7526320688600137, "language_loss": 0.6113835, "learning_rate": 1.6448513553930167e-06, "loss": 0.632002, "num_input_tokens_seen": 199495855, "router_z_loss_clip": 0.01531982, "router_z_loss_mlp": 0.23144531, "step": 9262, "time_per_iteration": 3.1662309169769287 }, { "auxiliary_loss_clip": 0.01122105, "auxiliary_loss_mlp": 0.01036994, "balance_loss_clip": 1.02425623, "balance_loss_mlp": 1.03900313, "epoch": 0.5569216894633999, "flos": 25337276593920.0, "grad_norm": 1.533840683314835, "language_loss": 0.64542675, "learning_rate": 1.644479543986788e-06, "loss": 0.6670177, "num_input_tokens_seen": 199515870, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7421875, "step": 9263, "time_per_iteration": 2.5491721630096436 }, { "auxiliary_loss_clip": 0.01128481, "auxiliary_loss_mlp": 0.01034025, "balance_loss_clip": 1.02143693, "balance_loss_mlp": 1.03756618, "epoch": 0.556981812716068, "flos": 22638733574400.0, "grad_norm": 1.7601577057270013, "language_loss": 0.73016876, "learning_rate": 1.6441077452681693e-06, "loss": 0.7517938, "num_input_tokens_seen": 199535745, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 9264, "time_per_iteration": 2.5731449127197266 }, { "auxiliary_loss_clip": 0.01140041, "auxiliary_loss_mlp": 0.01033086, "balance_loss_clip": 1.0204612, "balance_loss_mlp": 1.04041934, "epoch": 0.5570419359687359, "flos": 11035852738560.0, "grad_norm": 1.9797721920990987, "language_loss": 0.76145434, "learning_rate": 1.64373595925043e-06, "loss": 0.7831856, "num_input_tokens_seen": 199554035, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 9265, "time_per_iteration": 2.548940420150757 }, { "auxiliary_loss_clip": 0.01125045, "auxiliary_loss_mlp": 0.01032628, "balance_loss_clip": 1.02105832, "balance_loss_mlp": 1.03794575, "epoch": 0.5571020592214039, "flos": 22823135020800.0, "grad_norm": 1.4283010499771909, "language_loss": 0.70635641, "learning_rate": 1.643364185946838e-06, "loss": 0.72793311, "num_input_tokens_seen": 199576120, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6875, "step": 9266, "time_per_iteration": 2.614532232284546 }, { "auxiliary_loss_clip": 0.01126737, "auxiliary_loss_mlp": 0.01035746, "balance_loss_clip": 1.02313352, "balance_loss_mlp": 1.03667617, "epoch": 0.5571621824740719, "flos": 22927778317440.0, "grad_norm": 2.082229484785824, "language_loss": 0.6793865, "learning_rate": 1.642992425370661e-06, "loss": 0.7010113, "num_input_tokens_seen": 199593780, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 9267, "time_per_iteration": 2.5690877437591553 }, { "auxiliary_loss_clip": 0.01117595, "auxiliary_loss_mlp": 0.01039529, "balance_loss_clip": 1.02767324, "balance_loss_mlp": 1.03887212, "epoch": 0.5572223057267398, "flos": 22966705681920.0, "grad_norm": 1.7562344212423464, "language_loss": 0.74514294, "learning_rate": 1.6426206775351657e-06, "loss": 0.76671422, "num_input_tokens_seen": 199613220, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69921875, "step": 9268, "time_per_iteration": 2.6037983894348145 }, { "auxiliary_loss_clip": 0.011178, "auxiliary_loss_mlp": 0.01276178, "balance_loss_clip": 1.01758146, "balance_loss_mlp": 1.0375917, "epoch": 0.5572824289794078, "flos": 20960053413120.0, "grad_norm": 2.4225314448879858, "language_loss": 0.74797636, "learning_rate": 1.6422489424536192e-06, "loss": 0.77191615, "num_input_tokens_seen": 199632085, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 9269, "time_per_iteration": 2.583969831466675 }, { "auxiliary_loss_clip": 0.01136902, "auxiliary_loss_mlp": 0.01034127, "balance_loss_clip": 1.02175879, "balance_loss_mlp": 1.03719807, "epoch": 0.5573425522320757, "flos": 25042413847680.0, "grad_norm": 1.6859673544953222, "language_loss": 0.83037788, "learning_rate": 1.6418772201392879e-06, "loss": 0.85208821, "num_input_tokens_seen": 199649295, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.73046875, "step": 9270, "time_per_iteration": 2.6218678951263428 }, { "auxiliary_loss_clip": 0.01148369, "auxiliary_loss_mlp": 0.01033899, "balance_loss_clip": 1.02078629, "balance_loss_mlp": 1.0375458, "epoch": 0.5574026754847438, "flos": 23659637927040.0, "grad_norm": 1.9721976361450144, "language_loss": 0.80226552, "learning_rate": 1.6415055106054369e-06, "loss": 0.82408822, "num_input_tokens_seen": 199668870, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 9271, "time_per_iteration": 2.7921364307403564 }, { "auxiliary_loss_clip": 0.01128973, "auxiliary_loss_mlp": 0.01032901, "balance_loss_clip": 1.01986551, "balance_loss_mlp": 1.03749347, "epoch": 0.5574627987374117, "flos": 24782240661120.0, "grad_norm": 1.7161698362995497, "language_loss": 0.90213358, "learning_rate": 1.6411338138653327e-06, "loss": 0.92375231, "num_input_tokens_seen": 199684870, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 9272, "time_per_iteration": 2.586395502090454 }, { "auxiliary_loss_clip": 0.0113528, "auxiliary_loss_mlp": 0.01034674, "balance_loss_clip": 1.02201402, "balance_loss_mlp": 1.03810287, "epoch": 0.5575229219900797, "flos": 21834944979840.0, "grad_norm": 1.9188201119261143, "language_loss": 0.83942938, "learning_rate": 1.6407621299322387e-06, "loss": 0.86112887, "num_input_tokens_seen": 199701975, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 9273, "time_per_iteration": 2.629164218902588 }, { "auxiliary_loss_clip": 0.01121386, "auxiliary_loss_mlp": 0.01036816, "balance_loss_clip": 1.02235579, "balance_loss_mlp": 1.03944981, "epoch": 0.5575830452427476, "flos": 27815148408960.0, "grad_norm": 2.0048688505541437, "language_loss": 0.73742646, "learning_rate": 1.640390458819421e-06, "loss": 0.75900853, "num_input_tokens_seen": 199721865, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.73046875, "step": 9274, "time_per_iteration": 2.6085281372070312 }, { "auxiliary_loss_clip": 0.01135876, "auxiliary_loss_mlp": 0.01034912, "balance_loss_clip": 1.02058923, "balance_loss_mlp": 1.03940749, "epoch": 0.5576431684954156, "flos": 17812805696640.0, "grad_norm": 2.757557270342339, "language_loss": 0.77970475, "learning_rate": 1.6400188005401427e-06, "loss": 0.80141264, "num_input_tokens_seen": 199736455, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7890625, "step": 9275, "time_per_iteration": 2.5601494312286377 }, { "auxiliary_loss_clip": 0.01128041, "auxiliary_loss_mlp": 0.0103626, "balance_loss_clip": 1.02339745, "balance_loss_mlp": 1.03812289, "epoch": 0.5577032917480835, "flos": 15486872411520.0, "grad_norm": 1.5676305462329259, "language_loss": 0.74707353, "learning_rate": 1.6396471551076672e-06, "loss": 0.76871651, "num_input_tokens_seen": 199753125, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 9276, "time_per_iteration": 2.5620336532592773 }, { "auxiliary_loss_clip": 0.01124673, "auxiliary_loss_mlp": 0.01034641, "balance_loss_clip": 1.02236307, "balance_loss_mlp": 1.03616035, "epoch": 0.5577634150007516, "flos": 21579763783680.0, "grad_norm": 1.6631987412809235, "language_loss": 0.75542521, "learning_rate": 1.639275522535258e-06, "loss": 0.77701837, "num_input_tokens_seen": 199771365, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.70703125, "step": 9277, "time_per_iteration": 2.5993244647979736 }, { "auxiliary_loss_clip": 0.01105836, "auxiliary_loss_mlp": 0.01035243, "balance_loss_clip": 1.023, "balance_loss_mlp": 1.03532588, "epoch": 0.5578235382534195, "flos": 21139750177920.0, "grad_norm": 1.5286111966775637, "language_loss": 0.71781468, "learning_rate": 1.638903902836177e-06, "loss": 0.73922551, "num_input_tokens_seen": 199790035, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 9278, "time_per_iteration": 2.519861936569214 }, { "auxiliary_loss_clip": 0.01139625, "auxiliary_loss_mlp": 0.0103558, "balance_loss_clip": 1.02152526, "balance_loss_mlp": 1.03732085, "epoch": 0.5578836615060875, "flos": 26505199313280.0, "grad_norm": 1.615201512964871, "language_loss": 0.75756896, "learning_rate": 1.6385322960236874e-06, "loss": 0.77932096, "num_input_tokens_seen": 199811125, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 9279, "time_per_iteration": 3.97151255607605 }, { "auxiliary_loss_clip": 0.01126673, "auxiliary_loss_mlp": 0.01033931, "balance_loss_clip": 1.02139568, "balance_loss_mlp": 1.0357188, "epoch": 0.5579437847587555, "flos": 20153786780160.0, "grad_norm": 1.5312961724959389, "language_loss": 0.67347491, "learning_rate": 1.6381607021110505e-06, "loss": 0.69508094, "num_input_tokens_seen": 199829915, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.734375, "step": 9280, "time_per_iteration": 2.566946029663086 }, { "auxiliary_loss_clip": 0.01121716, "auxiliary_loss_mlp": 0.01038215, "balance_loss_clip": 1.0253818, "balance_loss_mlp": 1.03775477, "epoch": 0.5580039080114234, "flos": 26102281478400.0, "grad_norm": 1.5239683750747566, "language_loss": 0.73331845, "learning_rate": 1.6377891211115268e-06, "loss": 0.75491774, "num_input_tokens_seen": 199850670, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75, "step": 9281, "time_per_iteration": 2.5719075202941895 }, { "auxiliary_loss_clip": 0.01135243, "auxiliary_loss_mlp": 0.0128221, "balance_loss_clip": 1.02274776, "balance_loss_mlp": 1.03734732, "epoch": 0.5580640312640914, "flos": 13771671096960.0, "grad_norm": 2.779547339335242, "language_loss": 0.7532711, "learning_rate": 1.6374175530383778e-06, "loss": 0.77744567, "num_input_tokens_seen": 199867645, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 9282, "time_per_iteration": 2.5415537357330322 }, { "auxiliary_loss_clip": 0.01152981, "auxiliary_loss_mlp": 0.01029269, "balance_loss_clip": 1.0159657, "balance_loss_mlp": 1.03715301, "epoch": 0.5581241545167593, "flos": 17675986792320.0, "grad_norm": 1.6865585947435644, "language_loss": 0.66346657, "learning_rate": 1.6370459979048642e-06, "loss": 0.68528914, "num_input_tokens_seen": 199886320, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 9283, "time_per_iteration": 3.9252209663391113 }, { "auxiliary_loss_clip": 0.01136612, "auxiliary_loss_mlp": 0.01027754, "balance_loss_clip": 1.01645255, "balance_loss_mlp": 1.03881681, "epoch": 0.5581842777694274, "flos": 19569161018880.0, "grad_norm": 1.7729732009972985, "language_loss": 0.83257216, "learning_rate": 1.6366744557242448e-06, "loss": 0.8542158, "num_input_tokens_seen": 199904895, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.71875, "step": 9284, "time_per_iteration": 2.5997395515441895 }, { "auxiliary_loss_clip": 0.01127017, "auxiliary_loss_mlp": 0.01030717, "balance_loss_clip": 1.01787817, "balance_loss_mlp": 1.03660548, "epoch": 0.5582444010220953, "flos": 20595165102720.0, "grad_norm": 2.3242829007766237, "language_loss": 0.85458207, "learning_rate": 1.63630292650978e-06, "loss": 0.87615943, "num_input_tokens_seen": 199921090, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73046875, "step": 9285, "time_per_iteration": 2.5508663654327393 }, { "auxiliary_loss_clip": 0.01128349, "auxiliary_loss_mlp": 0.0103359, "balance_loss_clip": 1.02129972, "balance_loss_mlp": 1.03744817, "epoch": 0.5583045242747633, "flos": 19135504120320.0, "grad_norm": 2.3618268924258405, "language_loss": 0.73380148, "learning_rate": 1.6359314102747272e-06, "loss": 0.75542092, "num_input_tokens_seen": 199939925, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7265625, "step": 9286, "time_per_iteration": 2.635845422744751 }, { "auxiliary_loss_clip": 0.01110934, "auxiliary_loss_mlp": 0.01032735, "balance_loss_clip": 1.02041471, "balance_loss_mlp": 1.0378747, "epoch": 0.5583646475274312, "flos": 27454569730560.0, "grad_norm": 1.583251416419682, "language_loss": 0.73873019, "learning_rate": 1.6355599070323467e-06, "loss": 0.76016688, "num_input_tokens_seen": 199960015, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.73046875, "step": 9287, "time_per_iteration": 2.570131778717041 }, { "auxiliary_loss_clip": 0.01140511, "auxiliary_loss_mlp": 0.01035311, "balance_loss_clip": 1.02212071, "balance_loss_mlp": 1.03850102, "epoch": 0.5584247707800992, "flos": 23653784010240.0, "grad_norm": 1.6502186294465846, "language_loss": 0.75249118, "learning_rate": 1.6351884167958952e-06, "loss": 0.77424937, "num_input_tokens_seen": 199980505, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75, "step": 9288, "time_per_iteration": 2.6302874088287354 }, { "auxiliary_loss_clip": 0.01130793, "auxiliary_loss_mlp": 0.01037273, "balance_loss_clip": 1.02409458, "balance_loss_mlp": 1.0400331, "epoch": 0.5584848940327671, "flos": 13698880185600.0, "grad_norm": 1.7859207203100067, "language_loss": 0.7808724, "learning_rate": 1.634816939578631e-06, "loss": 0.80255306, "num_input_tokens_seen": 199999020, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73046875, "step": 9289, "time_per_iteration": 5.638872146606445 }, { "auxiliary_loss_clip": 0.01143265, "auxiliary_loss_mlp": 0.01032839, "balance_loss_clip": 1.01919508, "balance_loss_mlp": 1.03987145, "epoch": 0.5585450172854352, "flos": 27016208150400.0, "grad_norm": 1.9342742460419613, "language_loss": 0.6110177, "learning_rate": 1.634445475393811e-06, "loss": 0.63277876, "num_input_tokens_seen": 200019020, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 9290, "time_per_iteration": 2.5742294788360596 }, { "auxiliary_loss_clip": 0.01146856, "auxiliary_loss_mlp": 0.01026961, "balance_loss_clip": 1.01387191, "balance_loss_mlp": 1.03616714, "epoch": 0.5586051405381031, "flos": 23185653033600.0, "grad_norm": 2.3134549262298894, "language_loss": 0.67975968, "learning_rate": 1.6340740242546911e-06, "loss": 0.70149785, "num_input_tokens_seen": 200038110, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.74609375, "step": 9291, "time_per_iteration": 2.564610242843628 }, { "auxiliary_loss_clip": 0.01139858, "auxiliary_loss_mlp": 0.01033371, "balance_loss_clip": 1.01982236, "balance_loss_mlp": 1.03797317, "epoch": 0.5586652637907711, "flos": 20775544225920.0, "grad_norm": 2.9810133092317046, "language_loss": 0.84076881, "learning_rate": 1.6337025861745286e-06, "loss": 0.86250114, "num_input_tokens_seen": 200056210, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75390625, "step": 9292, "time_per_iteration": 2.661156177520752 }, { "auxiliary_loss_clip": 0.01126797, "auxiliary_loss_mlp": 0.01034584, "balance_loss_clip": 1.022228, "balance_loss_mlp": 1.03796637, "epoch": 0.5587253870434391, "flos": 28219897837440.0, "grad_norm": 1.9722939600126033, "language_loss": 0.73483902, "learning_rate": 1.6333311611665779e-06, "loss": 0.75645286, "num_input_tokens_seen": 200075620, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.70703125, "step": 9293, "time_per_iteration": 2.729597806930542 }, { "auxiliary_loss_clip": 0.01125378, "auxiliary_loss_mlp": 0.01032013, "balance_loss_clip": 1.01990175, "balance_loss_mlp": 1.03636873, "epoch": 0.558785510296107, "flos": 26615732440320.0, "grad_norm": 1.7242404113332712, "language_loss": 0.72772288, "learning_rate": 1.6329597492440957e-06, "loss": 0.74929678, "num_input_tokens_seen": 200095945, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 9294, "time_per_iteration": 2.6135759353637695 }, { "auxiliary_loss_clip": 0.0112843, "auxiliary_loss_mlp": 0.01277831, "balance_loss_clip": 1.018453, "balance_loss_mlp": 1.03812122, "epoch": 0.558845633548775, "flos": 20156767608960.0, "grad_norm": 2.21483999126963, "language_loss": 0.68234253, "learning_rate": 1.632588350420335e-06, "loss": 0.7064051, "num_input_tokens_seen": 200114185, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 9295, "time_per_iteration": 2.569645404815674 }, { "auxiliary_loss_clip": 0.01111314, "auxiliary_loss_mlp": 0.01033183, "balance_loss_clip": 1.02021861, "balance_loss_mlp": 1.03909838, "epoch": 0.5589057568014429, "flos": 24350774492160.0, "grad_norm": 1.5395577282070143, "language_loss": 0.80460656, "learning_rate": 1.6322169647085517e-06, "loss": 0.82605147, "num_input_tokens_seen": 200135030, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 9296, "time_per_iteration": 2.629098892211914 }, { "auxiliary_loss_clip": 0.01118212, "auxiliary_loss_mlp": 0.01033072, "balance_loss_clip": 1.0200541, "balance_loss_mlp": 1.03774381, "epoch": 0.558965880054111, "flos": 21105168359040.0, "grad_norm": 2.2684712344600046, "language_loss": 0.65617299, "learning_rate": 1.6318455921219988e-06, "loss": 0.67768586, "num_input_tokens_seen": 200154290, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 9297, "time_per_iteration": 2.522986650466919 }, { "auxiliary_loss_clip": 0.01138545, "auxiliary_loss_mlp": 0.01040586, "balance_loss_clip": 1.02726996, "balance_loss_mlp": 1.03795671, "epoch": 0.5590260033067789, "flos": 18436071513600.0, "grad_norm": 1.6321326126671318, "language_loss": 0.75180757, "learning_rate": 1.6314742326739291e-06, "loss": 0.77359879, "num_input_tokens_seen": 200171555, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 9298, "time_per_iteration": 2.6029012203216553 }, { "auxiliary_loss_clip": 0.01052235, "auxiliary_loss_mlp": 0.01002278, "balance_loss_clip": 1.00072813, "balance_loss_mlp": 1.01098931, "epoch": 0.5590861265594469, "flos": 70577432490240.0, "grad_norm": 0.6808660131432902, "language_loss": 0.52384424, "learning_rate": 1.6311028863775974e-06, "loss": 0.54438937, "num_input_tokens_seen": 200237010, "router_z_loss_clip": 0.01544189, "router_z_loss_mlp": 0.234375, "step": 9299, "time_per_iteration": 3.2107386589050293 }, { "auxiliary_loss_clip": 0.01116817, "auxiliary_loss_mlp": 0.01030748, "balance_loss_clip": 1.01873732, "balance_loss_mlp": 1.03736949, "epoch": 0.5591462498121148, "flos": 30664408896000.0, "grad_norm": 1.9520752130257746, "language_loss": 0.6859479, "learning_rate": 1.6307315532462536e-06, "loss": 0.70742351, "num_input_tokens_seen": 200260820, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 9300, "time_per_iteration": 2.757021188735962 }, { "auxiliary_loss_clip": 0.0114598, "auxiliary_loss_mlp": 0.01038201, "balance_loss_clip": 1.02535665, "balance_loss_mlp": 1.03667092, "epoch": 0.5592063730647828, "flos": 18150438562560.0, "grad_norm": 1.8411948434969954, "language_loss": 0.81696784, "learning_rate": 1.6303602332931513e-06, "loss": 0.83880961, "num_input_tokens_seen": 200278035, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 9301, "time_per_iteration": 2.588932991027832 }, { "auxiliary_loss_clip": 0.0110793, "auxiliary_loss_mlp": 0.01028873, "balance_loss_clip": 1.01574254, "balance_loss_mlp": 1.03563285, "epoch": 0.5592664963174507, "flos": 24060400945920.0, "grad_norm": 2.017494866549308, "language_loss": 0.67866182, "learning_rate": 1.6299889265315415e-06, "loss": 0.70002985, "num_input_tokens_seen": 200297255, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 9302, "time_per_iteration": 2.6632003784179688 }, { "auxiliary_loss_clip": 0.01140177, "auxiliary_loss_mlp": 0.01025827, "balance_loss_clip": 1.01264262, "balance_loss_mlp": 1.03797328, "epoch": 0.5593266195701188, "flos": 19827897661440.0, "grad_norm": 2.8990335473081794, "language_loss": 0.70975941, "learning_rate": 1.6296176329746745e-06, "loss": 0.7314195, "num_input_tokens_seen": 200317505, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75, "step": 9303, "time_per_iteration": 2.6065011024475098 }, { "auxiliary_loss_clip": 0.01152067, "auxiliary_loss_mlp": 0.01027552, "balance_loss_clip": 1.0164057, "balance_loss_mlp": 1.03872061, "epoch": 0.5593867428227867, "flos": 25300755440640.0, "grad_norm": 1.5690701045379514, "language_loss": 0.72537315, "learning_rate": 1.629246352635802e-06, "loss": 0.74716938, "num_input_tokens_seen": 200338350, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6796875, "step": 9304, "time_per_iteration": 2.6525235176086426 }, { "auxiliary_loss_clip": 0.01115294, "auxiliary_loss_mlp": 0.01027631, "balance_loss_clip": 1.01594901, "balance_loss_mlp": 1.03486371, "epoch": 0.5594468660754547, "flos": 12933013374720.0, "grad_norm": 2.0167639295892617, "language_loss": 0.77705616, "learning_rate": 1.628875085528173e-06, "loss": 0.7984854, "num_input_tokens_seen": 200353965, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.71484375, "step": 9305, "time_per_iteration": 2.6223232746124268 }, { "auxiliary_loss_clip": 0.01141778, "auxiliary_loss_mlp": 0.01027376, "balance_loss_clip": 1.01584864, "balance_loss_mlp": 1.03708518, "epoch": 0.5595069893281227, "flos": 19062713208960.0, "grad_norm": 2.029615525964659, "language_loss": 0.69386709, "learning_rate": 1.628503831665038e-06, "loss": 0.71555865, "num_input_tokens_seen": 200373595, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.69140625, "step": 9306, "time_per_iteration": 2.561699151992798 }, { "auxiliary_loss_clip": 0.01051531, "auxiliary_loss_mlp": 0.01004329, "balance_loss_clip": 1.00282693, "balance_loss_mlp": 1.01025116, "epoch": 0.5595671125807906, "flos": 70273375862400.0, "grad_norm": 0.9146899963515901, "language_loss": 0.60314929, "learning_rate": 1.6281325910596456e-06, "loss": 0.62370789, "num_input_tokens_seen": 200429155, "router_z_loss_clip": 0.01501465, "router_z_loss_mlp": 0.23242188, "step": 9307, "time_per_iteration": 3.3018720149993896 }, { "auxiliary_loss_clip": 0.01119126, "auxiliary_loss_mlp": 0.01031237, "balance_loss_clip": 1.01890445, "balance_loss_mlp": 1.03830862, "epoch": 0.5596272358334586, "flos": 20665513889280.0, "grad_norm": 2.0561017200405827, "language_loss": 0.74229681, "learning_rate": 1.627761363725244e-06, "loss": 0.76380044, "num_input_tokens_seen": 200448290, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71484375, "step": 9308, "time_per_iteration": 2.5442943572998047 }, { "auxiliary_loss_clip": 0.01155008, "auxiliary_loss_mlp": 0.01033156, "balance_loss_clip": 1.01995349, "balance_loss_mlp": 1.0363133, "epoch": 0.5596873590861265, "flos": 25041013217280.0, "grad_norm": 1.961753617591214, "language_loss": 0.69485056, "learning_rate": 1.6273901496750823e-06, "loss": 0.7167322, "num_input_tokens_seen": 200466555, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 9309, "time_per_iteration": 2.648482084274292 }, { "auxiliary_loss_clip": 0.01130419, "auxiliary_loss_mlp": 0.01032889, "balance_loss_clip": 1.01869166, "balance_loss_mlp": 1.03840029, "epoch": 0.5597474823387946, "flos": 25958387594880.0, "grad_norm": 2.3843699820712265, "language_loss": 0.75133574, "learning_rate": 1.6270189489224074e-06, "loss": 0.77296889, "num_input_tokens_seen": 200485980, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7421875, "step": 9310, "time_per_iteration": 2.56184458732605 }, { "auxiliary_loss_clip": 0.01122829, "auxiliary_loss_mlp": 0.01032237, "balance_loss_clip": 1.01886749, "balance_loss_mlp": 1.03958869, "epoch": 0.5598076055914625, "flos": 26177442687360.0, "grad_norm": 2.377279713729929, "language_loss": 0.6981799, "learning_rate": 1.6266477614804673e-06, "loss": 0.7197305, "num_input_tokens_seen": 200504555, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.74609375, "step": 9311, "time_per_iteration": 2.576627016067505 }, { "auxiliary_loss_clip": 0.01117117, "auxiliary_loss_mlp": 0.01027566, "balance_loss_clip": 1.01457798, "balance_loss_mlp": 1.03615999, "epoch": 0.5598677288441305, "flos": 11655778590720.0, "grad_norm": 1.8131292666815513, "language_loss": 0.71778333, "learning_rate": 1.626276587362508e-06, "loss": 0.73923016, "num_input_tokens_seen": 200522700, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 9312, "time_per_iteration": 2.486891508102417 }, { "auxiliary_loss_clip": 0.01136735, "auxiliary_loss_mlp": 0.01033498, "balance_loss_clip": 1.02061176, "balance_loss_mlp": 1.03901947, "epoch": 0.5599278520967984, "flos": 22966597941120.0, "grad_norm": 4.585485288244966, "language_loss": 0.89357948, "learning_rate": 1.6259054265817756e-06, "loss": 0.91528177, "num_input_tokens_seen": 200541910, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 9313, "time_per_iteration": 2.6255640983581543 }, { "auxiliary_loss_clip": 0.0113658, "auxiliary_loss_mlp": 0.01036357, "balance_loss_clip": 1.0239234, "balance_loss_mlp": 1.03883088, "epoch": 0.5599879753494664, "flos": 21215557831680.0, "grad_norm": 1.3478607708959465, "language_loss": 0.77411544, "learning_rate": 1.625534279151517e-06, "loss": 0.79584479, "num_input_tokens_seen": 200562600, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 9314, "time_per_iteration": 2.5559985637664795 }, { "auxiliary_loss_clip": 0.01134726, "auxiliary_loss_mlp": 0.01027808, "balance_loss_clip": 1.01482046, "balance_loss_mlp": 1.03595972, "epoch": 0.5600480986021343, "flos": 31903219105920.0, "grad_norm": 1.9114642513878757, "language_loss": 0.70151448, "learning_rate": 1.6251631450849758e-06, "loss": 0.72313988, "num_input_tokens_seen": 200584795, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 9315, "time_per_iteration": 2.677420139312744 }, { "auxiliary_loss_clip": 0.01131504, "auxiliary_loss_mlp": 0.01035736, "balance_loss_clip": 1.02243197, "balance_loss_mlp": 1.03868318, "epoch": 0.5601082218548024, "flos": 28476048700800.0, "grad_norm": 1.7748153171925027, "language_loss": 0.66298378, "learning_rate": 1.6247920243953983e-06, "loss": 0.6846562, "num_input_tokens_seen": 200606945, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 9316, "time_per_iteration": 2.5933279991149902 }, { "auxiliary_loss_clip": 0.01136578, "auxiliary_loss_mlp": 0.01032831, "balance_loss_clip": 1.01936603, "balance_loss_mlp": 1.03835595, "epoch": 0.5601683451074703, "flos": 24097173494400.0, "grad_norm": 1.9147872573695144, "language_loss": 0.86957139, "learning_rate": 1.6244209170960282e-06, "loss": 0.89126551, "num_input_tokens_seen": 200626340, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 9317, "time_per_iteration": 2.5880126953125 }, { "auxiliary_loss_clip": 0.01121053, "auxiliary_loss_mlp": 0.01039827, "balance_loss_clip": 1.02542615, "balance_loss_mlp": 1.03707612, "epoch": 0.5602284683601383, "flos": 26356205698560.0, "grad_norm": 1.7908273173909999, "language_loss": 0.77146614, "learning_rate": 1.6240498232001094e-06, "loss": 0.79307497, "num_input_tokens_seen": 200644520, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75390625, "step": 9318, "time_per_iteration": 2.564034938812256 }, { "auxiliary_loss_clip": 0.01115819, "auxiliary_loss_mlp": 0.01033494, "balance_loss_clip": 1.02143002, "balance_loss_mlp": 1.0352037, "epoch": 0.5602885916128063, "flos": 24496392228480.0, "grad_norm": 1.840181385724973, "language_loss": 0.76022601, "learning_rate": 1.6236787427208856e-06, "loss": 0.78171909, "num_input_tokens_seen": 200664845, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.71875, "step": 9319, "time_per_iteration": 2.7044012546539307 }, { "auxiliary_loss_clip": 0.01116281, "auxiliary_loss_mlp": 0.01034256, "balance_loss_clip": 1.02205503, "balance_loss_mlp": 1.03705072, "epoch": 0.5603487148654742, "flos": 27345006270720.0, "grad_norm": 1.36943141408522, "language_loss": 0.84902483, "learning_rate": 1.623307675671599e-06, "loss": 0.87053025, "num_input_tokens_seen": 200686535, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 9320, "time_per_iteration": 3.9822139739990234 }, { "auxiliary_loss_clip": 0.01042197, "auxiliary_loss_mlp": 0.01002032, "balance_loss_clip": 1.000458, "balance_loss_mlp": 1.00978684, "epoch": 0.5604088381181422, "flos": 54087756180480.0, "grad_norm": 0.7607276739980225, "language_loss": 0.52605122, "learning_rate": 1.622936622065493e-06, "loss": 0.54649353, "num_input_tokens_seen": 200736965, "router_z_loss_clip": 0.01574707, "router_z_loss_mlp": 0.23632812, "step": 9321, "time_per_iteration": 3.012937307357788 }, { "auxiliary_loss_clip": 0.01126072, "auxiliary_loss_mlp": 0.01032493, "balance_loss_clip": 1.02017879, "balance_loss_mlp": 1.03840494, "epoch": 0.5604689613708101, "flos": 22236390357120.0, "grad_norm": 2.560611858423504, "language_loss": 0.74515212, "learning_rate": 1.6225655819158083e-06, "loss": 0.76673782, "num_input_tokens_seen": 200757420, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 9322, "time_per_iteration": 2.628664970397949 }, { "auxiliary_loss_clip": 0.0110856, "auxiliary_loss_mlp": 0.01032077, "balance_loss_clip": 1.01835012, "balance_loss_mlp": 1.03642654, "epoch": 0.5605290846234782, "flos": 35297782940160.0, "grad_norm": 2.207830446914137, "language_loss": 0.73986423, "learning_rate": 1.6221945552357879e-06, "loss": 0.76127064, "num_input_tokens_seen": 200779520, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 9323, "time_per_iteration": 2.6017563343048096 }, { "auxiliary_loss_clip": 0.01130694, "auxiliary_loss_mlp": 0.01032222, "balance_loss_clip": 1.01908517, "balance_loss_mlp": 1.03846502, "epoch": 0.5605892078761461, "flos": 20263314326400.0, "grad_norm": 1.694923382857834, "language_loss": 0.61470103, "learning_rate": 1.6218235420386716e-06, "loss": 0.63633013, "num_input_tokens_seen": 200799485, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 9324, "time_per_iteration": 3.959331750869751 }, { "auxiliary_loss_clip": 0.01144917, "auxiliary_loss_mlp": 0.01034494, "balance_loss_clip": 1.02118421, "balance_loss_mlp": 1.036443, "epoch": 0.5606493311288141, "flos": 17308333134720.0, "grad_norm": 1.8515735265976365, "language_loss": 0.87760812, "learning_rate": 1.6214525423377e-06, "loss": 0.89940226, "num_input_tokens_seen": 200817540, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 9325, "time_per_iteration": 2.6315600872039795 }, { "auxiliary_loss_clip": 0.01128244, "auxiliary_loss_mlp": 0.01031963, "balance_loss_clip": 1.01862931, "balance_loss_mlp": 1.03625488, "epoch": 0.560709454381482, "flos": 21652985658240.0, "grad_norm": 1.954590691203123, "language_loss": 0.73436213, "learning_rate": 1.6210815561461143e-06, "loss": 0.75596422, "num_input_tokens_seen": 200838380, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 9326, "time_per_iteration": 2.5374937057495117 }, { "auxiliary_loss_clip": 0.0113111, "auxiliary_loss_mlp": 0.01029795, "balance_loss_clip": 1.01613379, "balance_loss_mlp": 1.03726137, "epoch": 0.56076957763415, "flos": 20303355012480.0, "grad_norm": 1.7517268243221538, "language_loss": 0.78042942, "learning_rate": 1.6207105834771523e-06, "loss": 0.80203855, "num_input_tokens_seen": 200855640, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76171875, "step": 9327, "time_per_iteration": 2.6084630489349365 }, { "auxiliary_loss_clip": 0.01121143, "auxiliary_loss_mlp": 0.01031044, "balance_loss_clip": 1.01720333, "balance_loss_mlp": 1.03989565, "epoch": 0.5608297008868179, "flos": 25045897466880.0, "grad_norm": 1.5431238798036984, "language_loss": 0.78580576, "learning_rate": 1.6203396243440543e-06, "loss": 0.80732763, "num_input_tokens_seen": 200876585, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 9328, "time_per_iteration": 2.5349531173706055 }, { "auxiliary_loss_clip": 0.0110898, "auxiliary_loss_mlp": 0.01029788, "balance_loss_clip": 1.01674652, "balance_loss_mlp": 1.03689158, "epoch": 0.560889824139486, "flos": 19866825025920.0, "grad_norm": 1.593243436395513, "language_loss": 0.73387885, "learning_rate": 1.6199686787600592e-06, "loss": 0.75526655, "num_input_tokens_seen": 200898175, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 9329, "time_per_iteration": 2.600780725479126 }, { "auxiliary_loss_clip": 0.01130767, "auxiliary_loss_mlp": 0.01282144, "balance_loss_clip": 1.02072835, "balance_loss_mlp": 1.03745604, "epoch": 0.5609499473921539, "flos": 22929394429440.0, "grad_norm": 1.886298551125299, "language_loss": 0.83506662, "learning_rate": 1.6195977467384035e-06, "loss": 0.85919571, "num_input_tokens_seen": 200917515, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7578125, "step": 9330, "time_per_iteration": 4.174026012420654 }, { "auxiliary_loss_clip": 0.01032733, "auxiliary_loss_mlp": 0.00998125, "balance_loss_clip": 0.99653929, "balance_loss_mlp": 1.00927293, "epoch": 0.5610100706448219, "flos": 53035825455360.0, "grad_norm": 0.7146752073534928, "language_loss": 0.57867944, "learning_rate": 1.6192268282923261e-06, "loss": 0.59898806, "num_input_tokens_seen": 200978615, "router_z_loss_clip": 0.01586914, "router_z_loss_mlp": 0.234375, "step": 9331, "time_per_iteration": 4.559318542480469 }, { "auxiliary_loss_clip": 0.01145373, "auxiliary_loss_mlp": 0.01031664, "balance_loss_clip": 1.01867652, "balance_loss_mlp": 1.03726459, "epoch": 0.5610701938974898, "flos": 21834944979840.0, "grad_norm": 1.876051391238004, "language_loss": 0.81489909, "learning_rate": 1.6188559234350632e-06, "loss": 0.83666945, "num_input_tokens_seen": 200997745, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.73046875, "step": 9332, "time_per_iteration": 2.6030080318450928 }, { "auxiliary_loss_clip": 0.01124275, "auxiliary_loss_mlp": 0.01040225, "balance_loss_clip": 1.02547908, "balance_loss_mlp": 1.03839624, "epoch": 0.5611303171501578, "flos": 17457183095040.0, "grad_norm": 2.3957786426034855, "language_loss": 0.82106996, "learning_rate": 1.6184850321798524e-06, "loss": 0.84271491, "num_input_tokens_seen": 201016370, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.765625, "step": 9333, "time_per_iteration": 2.522883176803589 }, { "auxiliary_loss_clip": 0.0110918, "auxiliary_loss_mlp": 0.01029975, "balance_loss_clip": 1.01729703, "balance_loss_mlp": 1.03640342, "epoch": 0.5611904404028258, "flos": 22637799820800.0, "grad_norm": 1.5446936087943846, "language_loss": 0.73006713, "learning_rate": 1.6181141545399294e-06, "loss": 0.7514587, "num_input_tokens_seen": 201034310, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 9334, "time_per_iteration": 2.5901856422424316 }, { "auxiliary_loss_clip": 0.01130928, "auxiliary_loss_mlp": 0.0103694, "balance_loss_clip": 1.02370739, "balance_loss_mlp": 1.0399437, "epoch": 0.5612505636554938, "flos": 14316327999360.0, "grad_norm": 1.843741081028126, "language_loss": 0.71594352, "learning_rate": 1.6177432905285296e-06, "loss": 0.73762214, "num_input_tokens_seen": 201052030, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 9335, "time_per_iteration": 2.683187246322632 }, { "auxiliary_loss_clip": 0.0112906, "auxiliary_loss_mlp": 0.01031727, "balance_loss_clip": 1.01892352, "balance_loss_mlp": 1.03741527, "epoch": 0.5613106869081618, "flos": 16508279554560.0, "grad_norm": 2.1214807642950175, "language_loss": 0.76839197, "learning_rate": 1.617372440158889e-06, "loss": 0.78999984, "num_input_tokens_seen": 201068445, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73828125, "step": 9336, "time_per_iteration": 2.5419538021087646 }, { "auxiliary_loss_clip": 0.01118404, "auxiliary_loss_mlp": 0.01032097, "balance_loss_clip": 1.01958609, "balance_loss_mlp": 1.03786087, "epoch": 0.5613708101608297, "flos": 24058569352320.0, "grad_norm": 2.3432953441397943, "language_loss": 0.64397216, "learning_rate": 1.6170016034442412e-06, "loss": 0.66547722, "num_input_tokens_seen": 201082140, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 9337, "time_per_iteration": 2.6036219596862793 }, { "auxiliary_loss_clip": 0.0104982, "auxiliary_loss_mlp": 0.01002035, "balance_loss_clip": 1.00042617, "balance_loss_mlp": 1.00921953, "epoch": 0.5614309334134977, "flos": 64905735997440.0, "grad_norm": 0.8823106507371555, "language_loss": 0.62588227, "learning_rate": 1.6166307803978213e-06, "loss": 0.64640087, "num_input_tokens_seen": 201137245, "router_z_loss_clip": 0.01611328, "router_z_loss_mlp": 0.23046875, "step": 9338, "time_per_iteration": 3.1191654205322266 }, { "auxiliary_loss_clip": 0.01138734, "auxiliary_loss_mlp": 0.01032691, "balance_loss_clip": 1.01976323, "balance_loss_mlp": 1.03764892, "epoch": 0.5614910566661656, "flos": 32919849740160.0, "grad_norm": 1.8755842735298511, "language_loss": 0.65655923, "learning_rate": 1.6162599710328624e-06, "loss": 0.6782735, "num_input_tokens_seen": 201157270, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 9339, "time_per_iteration": 2.689419984817505 }, { "auxiliary_loss_clip": 0.01117527, "auxiliary_loss_mlp": 0.01032467, "balance_loss_clip": 1.01925266, "balance_loss_mlp": 1.03744984, "epoch": 0.5615511799188336, "flos": 18588871969920.0, "grad_norm": 2.8852332145700763, "language_loss": 0.69761193, "learning_rate": 1.6158891753625986e-06, "loss": 0.71911192, "num_input_tokens_seen": 201174530, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 9340, "time_per_iteration": 2.5382447242736816 }, { "auxiliary_loss_clip": 0.01135479, "auxiliary_loss_mlp": 0.01028575, "balance_loss_clip": 1.01633179, "balance_loss_mlp": 1.03903627, "epoch": 0.5616113031715015, "flos": 22820010537600.0, "grad_norm": 1.5617610414775405, "language_loss": 0.76834583, "learning_rate": 1.6155183934002618e-06, "loss": 0.78998637, "num_input_tokens_seen": 201194905, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 9341, "time_per_iteration": 2.7469441890716553 }, { "auxiliary_loss_clip": 0.01146117, "auxiliary_loss_mlp": 0.0103311, "balance_loss_clip": 1.01953244, "balance_loss_mlp": 1.03688002, "epoch": 0.5616714264241696, "flos": 22345702421760.0, "grad_norm": 1.6147818974023354, "language_loss": 0.79415464, "learning_rate": 1.6151476251590843e-06, "loss": 0.81594694, "num_input_tokens_seen": 201213715, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 9342, "time_per_iteration": 2.5670788288116455 }, { "auxiliary_loss_clip": 0.01140916, "auxiliary_loss_mlp": 0.01032785, "balance_loss_clip": 1.01970148, "balance_loss_mlp": 1.03896821, "epoch": 0.5617315496768375, "flos": 18807783408000.0, "grad_norm": 1.760211744420738, "language_loss": 0.7613036, "learning_rate": 1.6147768706522983e-06, "loss": 0.78304058, "num_input_tokens_seen": 201231415, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75390625, "step": 9343, "time_per_iteration": 2.5746071338653564 }, { "auxiliary_loss_clip": 0.0114751, "auxiliary_loss_mlp": 0.01040532, "balance_loss_clip": 1.02720404, "balance_loss_mlp": 1.04040742, "epoch": 0.5617916729295055, "flos": 18369314087040.0, "grad_norm": 1.709946250896849, "language_loss": 0.68971753, "learning_rate": 1.614406129893135e-06, "loss": 0.71159792, "num_input_tokens_seen": 201249625, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 9344, "time_per_iteration": 2.5602447986602783 }, { "auxiliary_loss_clip": 0.01111784, "auxiliary_loss_mlp": 0.01037644, "balance_loss_clip": 1.02376771, "balance_loss_mlp": 1.03849638, "epoch": 0.5618517961821734, "flos": 28179964892160.0, "grad_norm": 2.6291164149656283, "language_loss": 0.66085243, "learning_rate": 1.6140354028948253e-06, "loss": 0.6823467, "num_input_tokens_seen": 201271205, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 9345, "time_per_iteration": 2.6262624263763428 }, { "auxiliary_loss_clip": 0.01130942, "auxiliary_loss_mlp": 0.01028612, "balance_loss_clip": 1.01584494, "balance_loss_mlp": 1.0399915, "epoch": 0.5619119194348414, "flos": 15486872411520.0, "grad_norm": 2.1650086748179014, "language_loss": 0.87408835, "learning_rate": 1.613664689670599e-06, "loss": 0.89568383, "num_input_tokens_seen": 201287700, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 9346, "time_per_iteration": 2.5715997219085693 }, { "auxiliary_loss_clip": 0.01128242, "auxiliary_loss_mlp": 0.01035564, "balance_loss_clip": 1.0212115, "balance_loss_mlp": 1.03765082, "epoch": 0.5619720426875094, "flos": 29128652951040.0, "grad_norm": 2.4201079335788305, "language_loss": 0.59605169, "learning_rate": 1.6132939902336857e-06, "loss": 0.61768973, "num_input_tokens_seen": 201307530, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7265625, "step": 9347, "time_per_iteration": 2.6757619380950928 }, { "auxiliary_loss_clip": 0.01121625, "auxiliary_loss_mlp": 0.01035127, "balance_loss_clip": 1.02060103, "balance_loss_mlp": 1.03886354, "epoch": 0.5620321659401774, "flos": 18003743418240.0, "grad_norm": 2.2463044690588436, "language_loss": 0.69037157, "learning_rate": 1.6129233045973159e-06, "loss": 0.71193904, "num_input_tokens_seen": 201326210, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.73828125, "step": 9348, "time_per_iteration": 2.5381534099578857 }, { "auxiliary_loss_clip": 0.0114864, "auxiliary_loss_mlp": 0.01032243, "balance_loss_clip": 1.01827121, "balance_loss_mlp": 1.03653657, "epoch": 0.5620922891928454, "flos": 20594518657920.0, "grad_norm": 2.804967350302927, "language_loss": 0.78862023, "learning_rate": 1.612552632774717e-06, "loss": 0.8104291, "num_input_tokens_seen": 201346120, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.76953125, "step": 9349, "time_per_iteration": 2.6143813133239746 }, { "auxiliary_loss_clip": 0.01157894, "auxiliary_loss_mlp": 0.01031999, "balance_loss_clip": 1.01805735, "balance_loss_mlp": 1.03903592, "epoch": 0.5621524124455133, "flos": 26287006147200.0, "grad_norm": 1.994050908520936, "language_loss": 0.67393017, "learning_rate": 1.6121819747791183e-06, "loss": 0.69582915, "num_input_tokens_seen": 201365700, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.74609375, "step": 9350, "time_per_iteration": 2.626056671142578 }, { "auxiliary_loss_clip": 0.01149295, "auxiliary_loss_mlp": 0.0103902, "balance_loss_clip": 1.02520335, "balance_loss_mlp": 1.03773808, "epoch": 0.5622125356981813, "flos": 12750299867520.0, "grad_norm": 2.0836706772482887, "language_loss": 0.78551972, "learning_rate": 1.6118113306237474e-06, "loss": 0.80740285, "num_input_tokens_seen": 201382795, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 9351, "time_per_iteration": 2.5876448154449463 }, { "auxiliary_loss_clip": 0.01132265, "auxiliary_loss_mlp": 0.01279042, "balance_loss_clip": 1.01911688, "balance_loss_mlp": 1.03823626, "epoch": 0.5622726589508492, "flos": 23805327490560.0, "grad_norm": 1.480075782574648, "language_loss": 0.5915761, "learning_rate": 1.6114407003218314e-06, "loss": 0.61568916, "num_input_tokens_seen": 201402780, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.76171875, "step": 9352, "time_per_iteration": 2.6956844329833984 }, { "auxiliary_loss_clip": 0.01116321, "auxiliary_loss_mlp": 0.01031637, "balance_loss_clip": 1.0194056, "balance_loss_mlp": 1.03816402, "epoch": 0.5623327822035172, "flos": 24718212668160.0, "grad_norm": 1.3216205539615629, "language_loss": 0.72020572, "learning_rate": 1.6110700838865976e-06, "loss": 0.74168527, "num_input_tokens_seen": 201424140, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 9353, "time_per_iteration": 2.634449005126953 }, { "auxiliary_loss_clip": 0.01130453, "auxiliary_loss_mlp": 0.01031552, "balance_loss_clip": 1.01855814, "balance_loss_mlp": 1.03867328, "epoch": 0.5623929054561851, "flos": 14019274523520.0, "grad_norm": 2.608865492608692, "language_loss": 0.76356852, "learning_rate": 1.6106994813312716e-06, "loss": 0.78518856, "num_input_tokens_seen": 201439645, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.74609375, "step": 9354, "time_per_iteration": 2.520359992980957 }, { "auxiliary_loss_clip": 0.01139914, "auxiliary_loss_mlp": 0.01035314, "balance_loss_clip": 1.02206945, "balance_loss_mlp": 1.0380218, "epoch": 0.5624530287088532, "flos": 20704405340160.0, "grad_norm": 1.9493922918995492, "language_loss": 0.73050141, "learning_rate": 1.61032889266908e-06, "loss": 0.75225365, "num_input_tokens_seen": 201459970, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 9355, "time_per_iteration": 2.5678412914276123 }, { "auxiliary_loss_clip": 0.01119791, "auxiliary_loss_mlp": 0.01033997, "balance_loss_clip": 1.02075291, "balance_loss_mlp": 1.03742933, "epoch": 0.5625131519615211, "flos": 21470918595840.0, "grad_norm": 1.4624233987077935, "language_loss": 0.73632705, "learning_rate": 1.6099583179132482e-06, "loss": 0.75786489, "num_input_tokens_seen": 201480055, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 9356, "time_per_iteration": 2.555262804031372 }, { "auxiliary_loss_clip": 0.01110134, "auxiliary_loss_mlp": 0.0103736, "balance_loss_clip": 1.02453351, "balance_loss_mlp": 1.03913116, "epoch": 0.5625732752141891, "flos": 18698004466560.0, "grad_norm": 1.8473806819649619, "language_loss": 0.83199447, "learning_rate": 1.609587757077e-06, "loss": 0.85346943, "num_input_tokens_seen": 201497645, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 9357, "time_per_iteration": 2.513003349304199 }, { "auxiliary_loss_clip": 0.01120049, "auxiliary_loss_mlp": 0.01037646, "balance_loss_clip": 1.02523589, "balance_loss_mlp": 1.03848004, "epoch": 0.562633398466857, "flos": 16216900427520.0, "grad_norm": 2.3566104757554567, "language_loss": 0.72305453, "learning_rate": 1.609217210173561e-06, "loss": 0.74463153, "num_input_tokens_seen": 201515455, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7265625, "step": 9358, "time_per_iteration": 2.582667589187622 }, { "auxiliary_loss_clip": 0.01108255, "auxiliary_loss_mlp": 0.01042854, "balance_loss_clip": 1.02998519, "balance_loss_mlp": 1.0371573, "epoch": 0.562693521719525, "flos": 22491930689280.0, "grad_norm": 1.6142874407125578, "language_loss": 0.77565414, "learning_rate": 1.6088466772161547e-06, "loss": 0.79716527, "num_input_tokens_seen": 201534500, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 9359, "time_per_iteration": 2.55043363571167 }, { "auxiliary_loss_clip": 0.01127009, "auxiliary_loss_mlp": 0.01032877, "balance_loss_clip": 1.02071142, "balance_loss_mlp": 1.03683209, "epoch": 0.562753644972193, "flos": 25331171281920.0, "grad_norm": 1.730529036381401, "language_loss": 0.70028663, "learning_rate": 1.6084761582180039e-06, "loss": 0.7218855, "num_input_tokens_seen": 201553280, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7265625, "step": 9360, "time_per_iteration": 2.6738688945770264 }, { "auxiliary_loss_clip": 0.01070985, "auxiliary_loss_mlp": 0.01010569, "balance_loss_clip": 1.00904357, "balance_loss_mlp": 1.0116117, "epoch": 0.562813768224861, "flos": 67392622126080.0, "grad_norm": 0.7776923061771104, "language_loss": 0.55561864, "learning_rate": 1.6081056531923321e-06, "loss": 0.57643414, "num_input_tokens_seen": 201610030, "router_z_loss_clip": 0.01525879, "router_z_loss_mlp": 0.23242188, "step": 9361, "time_per_iteration": 3.1776750087738037 }, { "auxiliary_loss_clip": 0.01126427, "auxiliary_loss_mlp": 0.01032859, "balance_loss_clip": 1.0204556, "balance_loss_mlp": 1.03917241, "epoch": 0.562873891477529, "flos": 23331163029120.0, "grad_norm": 1.4101529608878194, "language_loss": 0.8180778, "learning_rate": 1.6077351621523615e-06, "loss": 0.83967072, "num_input_tokens_seen": 201628370, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 9362, "time_per_iteration": 3.9831602573394775 }, { "auxiliary_loss_clip": 0.01135401, "auxiliary_loss_mlp": 0.0128003, "balance_loss_clip": 1.02044082, "balance_loss_mlp": 1.03786063, "epoch": 0.5629340147301969, "flos": 38472824805120.0, "grad_norm": 1.6212407328694904, "language_loss": 0.74631685, "learning_rate": 1.6073646851113139e-06, "loss": 0.77047116, "num_input_tokens_seen": 201649790, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 9363, "time_per_iteration": 2.7680928707122803 }, { "auxiliary_loss_clip": 0.01119896, "auxiliary_loss_mlp": 0.01034928, "balance_loss_clip": 1.0215174, "balance_loss_mlp": 1.03709853, "epoch": 0.5629941379828649, "flos": 29242023252480.0, "grad_norm": 1.6185126771291256, "language_loss": 0.83049512, "learning_rate": 1.6069942220824104e-06, "loss": 0.85204339, "num_input_tokens_seen": 201669175, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 9364, "time_per_iteration": 2.5546157360076904 }, { "auxiliary_loss_clip": 0.01136879, "auxiliary_loss_mlp": 0.01032102, "balance_loss_clip": 1.0202651, "balance_loss_mlp": 1.03912735, "epoch": 0.5630542612355328, "flos": 19420885676160.0, "grad_norm": 2.5499625659757035, "language_loss": 0.64831203, "learning_rate": 1.6066237730788725e-06, "loss": 0.67000186, "num_input_tokens_seen": 201687000, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.70703125, "step": 9365, "time_per_iteration": 2.6243703365325928 }, { "auxiliary_loss_clip": 0.0113438, "auxiliary_loss_mlp": 0.01034421, "balance_loss_clip": 1.02083755, "balance_loss_mlp": 1.03805459, "epoch": 0.5631143844882008, "flos": 22266303408000.0, "grad_norm": 3.002673109546307, "language_loss": 0.8098793, "learning_rate": 1.6062533381139201e-06, "loss": 0.83156729, "num_input_tokens_seen": 201703335, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.78125, "step": 9366, "time_per_iteration": 4.020461559295654 }, { "auxiliary_loss_clip": 0.01108606, "auxiliary_loss_mlp": 0.01032755, "balance_loss_clip": 1.02058375, "balance_loss_mlp": 1.03813481, "epoch": 0.5631745077408687, "flos": 22965305051520.0, "grad_norm": 1.9359039262474282, "language_loss": 0.73384869, "learning_rate": 1.6058829172007732e-06, "loss": 0.75526237, "num_input_tokens_seen": 201723495, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 9367, "time_per_iteration": 2.5874319076538086 }, { "auxiliary_loss_clip": 0.01049806, "auxiliary_loss_mlp": 0.0100242, "balance_loss_clip": 1.00087011, "balance_loss_mlp": 1.00941789, "epoch": 0.5632346309935368, "flos": 65080515576960.0, "grad_norm": 0.6270153914761607, "language_loss": 0.53479463, "learning_rate": 1.6055125103526518e-06, "loss": 0.55531687, "num_input_tokens_seen": 201792615, "router_z_loss_clip": 0.01550293, "router_z_loss_mlp": 0.23046875, "step": 9368, "time_per_iteration": 3.324033260345459 }, { "auxiliary_loss_clip": 0.01110183, "auxiliary_loss_mlp": 0.01036756, "balance_loss_clip": 1.02352393, "balance_loss_mlp": 1.03897309, "epoch": 0.5632947542462047, "flos": 23002903612800.0, "grad_norm": 1.676499728117625, "language_loss": 0.69284189, "learning_rate": 1.6051421175827734e-06, "loss": 0.7143113, "num_input_tokens_seen": 201812520, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 9369, "time_per_iteration": 2.550734758377075 }, { "auxiliary_loss_clip": 0.01134896, "auxiliary_loss_mlp": 0.01033785, "balance_loss_clip": 1.02100015, "balance_loss_mlp": 1.03589892, "epoch": 0.5633548774988727, "flos": 30482593228800.0, "grad_norm": 1.69654382429107, "language_loss": 0.75954401, "learning_rate": 1.6047717389043574e-06, "loss": 0.78123093, "num_input_tokens_seen": 201834185, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 9370, "time_per_iteration": 2.712671995162964 }, { "auxiliary_loss_clip": 0.01130757, "auxiliary_loss_mlp": 0.01035363, "balance_loss_clip": 1.02092648, "balance_loss_mlp": 1.03764963, "epoch": 0.5634150007515406, "flos": 18515039564160.0, "grad_norm": 2.065082467943443, "language_loss": 0.75804776, "learning_rate": 1.604401374330621e-06, "loss": 0.77970898, "num_input_tokens_seen": 201851305, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75, "step": 9371, "time_per_iteration": 2.5337185859680176 }, { "auxiliary_loss_clip": 0.0111635, "auxiliary_loss_mlp": 0.01031749, "balance_loss_clip": 1.01899338, "balance_loss_mlp": 1.03623438, "epoch": 0.5634751240042086, "flos": 19244672530560.0, "grad_norm": 1.870799816078916, "language_loss": 0.76186407, "learning_rate": 1.6040310238747826e-06, "loss": 0.7833451, "num_input_tokens_seen": 201870350, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 9372, "time_per_iteration": 4.100172758102417 }, { "auxiliary_loss_clip": 0.01125847, "auxiliary_loss_mlp": 0.01031933, "balance_loss_clip": 1.01905847, "balance_loss_mlp": 1.03715563, "epoch": 0.5635352472568766, "flos": 12020630987520.0, "grad_norm": 2.1940697944667753, "language_loss": 0.71219897, "learning_rate": 1.6036606875500583e-06, "loss": 0.73377681, "num_input_tokens_seen": 201886800, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 9373, "time_per_iteration": 4.137161731719971 }, { "auxiliary_loss_clip": 0.01137563, "auxiliary_loss_mlp": 0.010336, "balance_loss_clip": 1.01911616, "balance_loss_mlp": 1.03854728, "epoch": 0.5635953705095446, "flos": 21871645701120.0, "grad_norm": 2.330538281540088, "language_loss": 0.82995713, "learning_rate": 1.6032903653696645e-06, "loss": 0.85166872, "num_input_tokens_seen": 201904730, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7265625, "step": 9374, "time_per_iteration": 2.5852549076080322 }, { "auxiliary_loss_clip": 0.01115916, "auxiliary_loss_mlp": 0.01028738, "balance_loss_clip": 1.01642919, "balance_loss_mlp": 1.03599358, "epoch": 0.5636554937622126, "flos": 27126166659840.0, "grad_norm": 2.1132884719181395, "language_loss": 0.66137612, "learning_rate": 1.6029200573468172e-06, "loss": 0.6828227, "num_input_tokens_seen": 201924850, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 9375, "time_per_iteration": 2.6468968391418457 }, { "auxiliary_loss_clip": 0.01137624, "auxiliary_loss_mlp": 0.01286387, "balance_loss_clip": 1.02682006, "balance_loss_mlp": 1.0373292, "epoch": 0.5637156170148805, "flos": 12926405272320.0, "grad_norm": 4.312559190122786, "language_loss": 0.80276704, "learning_rate": 1.602549763494731e-06, "loss": 0.82700717, "num_input_tokens_seen": 201939500, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 9376, "time_per_iteration": 2.579212188720703 }, { "auxiliary_loss_clip": 0.01131424, "auxiliary_loss_mlp": 0.01035085, "balance_loss_clip": 1.02132249, "balance_loss_mlp": 1.03793538, "epoch": 0.5637757402675485, "flos": 45551033130240.0, "grad_norm": 1.3771789654110786, "language_loss": 0.68987215, "learning_rate": 1.6021794838266223e-06, "loss": 0.71153724, "num_input_tokens_seen": 201963000, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7578125, "step": 9377, "time_per_iteration": 2.8560686111450195 }, { "auxiliary_loss_clip": 0.01134388, "auxiliary_loss_mlp": 0.01028857, "balance_loss_clip": 1.01640558, "balance_loss_mlp": 1.0389781, "epoch": 0.5638358635202164, "flos": 20886041439360.0, "grad_norm": 1.4437898727994625, "language_loss": 0.74723387, "learning_rate": 1.601809218355704e-06, "loss": 0.7688663, "num_input_tokens_seen": 201983145, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.68359375, "step": 9378, "time_per_iteration": 2.581470251083374 }, { "auxiliary_loss_clip": 0.01114334, "auxiliary_loss_mlp": 0.01033989, "balance_loss_clip": 1.02013671, "balance_loss_mlp": 1.03636527, "epoch": 0.5638959867728844, "flos": 18806562345600.0, "grad_norm": 2.2133224704685213, "language_loss": 0.82214189, "learning_rate": 1.6014389670951902e-06, "loss": 0.84362519, "num_input_tokens_seen": 202000335, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 9379, "time_per_iteration": 2.5218393802642822 }, { "auxiliary_loss_clip": 0.01129242, "auxiliary_loss_mlp": 0.01034431, "balance_loss_clip": 1.0208292, "balance_loss_mlp": 1.03730977, "epoch": 0.5639561100255523, "flos": 27490336698240.0, "grad_norm": 2.118690870160227, "language_loss": 0.71226144, "learning_rate": 1.6010687300582948e-06, "loss": 0.73389816, "num_input_tokens_seen": 202018275, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 9380, "time_per_iteration": 2.695521831512451 }, { "auxiliary_loss_clip": 0.0114753, "auxiliary_loss_mlp": 0.01034587, "balance_loss_clip": 1.02060962, "balance_loss_mlp": 1.03839517, "epoch": 0.5640162332782204, "flos": 18076570243200.0, "grad_norm": 1.9615802801387128, "language_loss": 0.73655176, "learning_rate": 1.60069850725823e-06, "loss": 0.75837302, "num_input_tokens_seen": 202034330, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7421875, "step": 9381, "time_per_iteration": 2.6009480953216553 }, { "auxiliary_loss_clip": 0.01148451, "auxiliary_loss_mlp": 0.01033247, "balance_loss_clip": 1.02037215, "balance_loss_mlp": 1.03860009, "epoch": 0.5640763565308883, "flos": 20884856290560.0, "grad_norm": 1.9825805728147325, "language_loss": 0.72085023, "learning_rate": 1.6003282987082086e-06, "loss": 0.74266726, "num_input_tokens_seen": 202053100, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 9382, "time_per_iteration": 2.5959692001342773 }, { "auxiliary_loss_clip": 0.01033841, "auxiliary_loss_mlp": 0.01001848, "balance_loss_clip": 1.00027454, "balance_loss_mlp": 1.0107969, "epoch": 0.5641364797835563, "flos": 64447912224000.0, "grad_norm": 0.7367365619419781, "language_loss": 0.54350352, "learning_rate": 1.5999581044214417e-06, "loss": 0.56386042, "num_input_tokens_seen": 202120125, "router_z_loss_clip": 0.01574707, "router_z_loss_mlp": 0.23046875, "step": 9383, "time_per_iteration": 3.2339086532592773 }, { "auxiliary_loss_clip": 0.01123306, "auxiliary_loss_mlp": 0.0102765, "balance_loss_clip": 1.01434588, "balance_loss_mlp": 1.03780746, "epoch": 0.5641966030362242, "flos": 18660944609280.0, "grad_norm": 2.4048348396240953, "language_loss": 0.70242429, "learning_rate": 1.5995879244111417e-06, "loss": 0.72393382, "num_input_tokens_seen": 202138030, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.76171875, "step": 9384, "time_per_iteration": 2.5661258697509766 }, { "auxiliary_loss_clip": 0.01129246, "auxiliary_loss_mlp": 0.01032686, "balance_loss_clip": 1.01893568, "balance_loss_mlp": 1.03775406, "epoch": 0.5642567262888922, "flos": 22492325738880.0, "grad_norm": 1.8485903520757574, "language_loss": 0.76224631, "learning_rate": 1.5992177586905185e-06, "loss": 0.78386557, "num_input_tokens_seen": 202155580, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73828125, "step": 9385, "time_per_iteration": 2.5666089057922363 }, { "auxiliary_loss_clip": 0.01117625, "auxiliary_loss_mlp": 0.01034284, "balance_loss_clip": 1.02112317, "balance_loss_mlp": 1.03683913, "epoch": 0.5643168495415603, "flos": 13003972692480.0, "grad_norm": 2.498047915917434, "language_loss": 0.8223688, "learning_rate": 1.598847607272782e-06, "loss": 0.84388787, "num_input_tokens_seen": 202170365, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 9386, "time_per_iteration": 2.546473741531372 }, { "auxiliary_loss_clip": 0.01126163, "auxiliary_loss_mlp": 0.01036302, "balance_loss_clip": 1.02206266, "balance_loss_mlp": 1.04005623, "epoch": 0.5643769727942282, "flos": 18588297352320.0, "grad_norm": 2.0393247826337024, "language_loss": 0.70032382, "learning_rate": 1.5984774701711433e-06, "loss": 0.7219485, "num_input_tokens_seen": 202189095, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76953125, "step": 9387, "time_per_iteration": 2.4989540576934814 }, { "auxiliary_loss_clip": 0.01123126, "auxiliary_loss_mlp": 0.01033749, "balance_loss_clip": 1.02068377, "balance_loss_mlp": 1.0393517, "epoch": 0.5644370960468962, "flos": 33806269572480.0, "grad_norm": 1.4458362111753562, "language_loss": 0.74584705, "learning_rate": 1.59810734739881e-06, "loss": 0.76741588, "num_input_tokens_seen": 202213500, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 9388, "time_per_iteration": 2.6867172718048096 }, { "auxiliary_loss_clip": 0.01118247, "auxiliary_loss_mlp": 0.01031759, "balance_loss_clip": 1.02002847, "balance_loss_mlp": 1.03857231, "epoch": 0.5644972192995641, "flos": 21214911386880.0, "grad_norm": 1.5744064121457124, "language_loss": 0.81945586, "learning_rate": 1.5977372389689927e-06, "loss": 0.84095585, "num_input_tokens_seen": 202231920, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.70703125, "step": 9389, "time_per_iteration": 2.4947848320007324 }, { "auxiliary_loss_clip": 0.01142022, "auxiliary_loss_mlp": 0.01036188, "balance_loss_clip": 1.02174044, "balance_loss_mlp": 1.03988814, "epoch": 0.5645573425522321, "flos": 18587722734720.0, "grad_norm": 2.1550995350851467, "language_loss": 0.64115846, "learning_rate": 1.5973671448948981e-06, "loss": 0.66294056, "num_input_tokens_seen": 202247600, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75390625, "step": 9390, "time_per_iteration": 2.688225746154785 }, { "auxiliary_loss_clip": 0.01129454, "auxiliary_loss_mlp": 0.01029708, "balance_loss_clip": 1.01794839, "balance_loss_mlp": 1.04005694, "epoch": 0.5646174658049, "flos": 18113809668480.0, "grad_norm": 1.6969696834644072, "language_loss": 0.9229424, "learning_rate": 1.5969970651897343e-06, "loss": 0.94453406, "num_input_tokens_seen": 202265350, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.71875, "step": 9391, "time_per_iteration": 2.499772548675537 }, { "auxiliary_loss_clip": 0.01127822, "auxiliary_loss_mlp": 0.01036217, "balance_loss_clip": 1.02152491, "balance_loss_mlp": 1.04039633, "epoch": 0.564677589057568, "flos": 28329964087680.0, "grad_norm": 1.8470890230469192, "language_loss": 0.60218847, "learning_rate": 1.5966269998667088e-06, "loss": 0.62382883, "num_input_tokens_seen": 202284285, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78515625, "step": 9392, "time_per_iteration": 2.643993854522705 }, { "auxiliary_loss_clip": 0.01145927, "auxiliary_loss_mlp": 0.01029793, "balance_loss_clip": 1.01577425, "balance_loss_mlp": 1.03753257, "epoch": 0.564737712310236, "flos": 22163743100160.0, "grad_norm": 2.2106832648357333, "language_loss": 0.81243998, "learning_rate": 1.5962569489390277e-06, "loss": 0.83419716, "num_input_tokens_seen": 202303450, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73046875, "step": 9393, "time_per_iteration": 2.566570520401001 }, { "auxiliary_loss_clip": 0.01132047, "auxiliary_loss_mlp": 0.01029645, "balance_loss_clip": 1.01646018, "balance_loss_mlp": 1.0407908, "epoch": 0.564797835562904, "flos": 20959011918720.0, "grad_norm": 1.5989463655591303, "language_loss": 0.87231535, "learning_rate": 1.595886912419898e-06, "loss": 0.89393222, "num_input_tokens_seen": 202322315, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 9394, "time_per_iteration": 2.6306562423706055 }, { "auxiliary_loss_clip": 0.01123101, "auxiliary_loss_mlp": 0.0103223, "balance_loss_clip": 1.01914144, "balance_loss_mlp": 1.04253006, "epoch": 0.5648579588155719, "flos": 17420302805760.0, "grad_norm": 2.2506503035875425, "language_loss": 0.84967244, "learning_rate": 1.5955168903225246e-06, "loss": 0.87122577, "num_input_tokens_seen": 202339905, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 9395, "time_per_iteration": 2.504864454269409 }, { "auxiliary_loss_clip": 0.01042456, "auxiliary_loss_mlp": 0.01004646, "balance_loss_clip": 1.0032692, "balance_loss_mlp": 1.01087952, "epoch": 0.5649180820682399, "flos": 69929568835200.0, "grad_norm": 0.8291462387649328, "language_loss": 0.58283794, "learning_rate": 1.5951468826601127e-06, "loss": 0.60330904, "num_input_tokens_seen": 202397320, "router_z_loss_clip": 0.01379395, "router_z_loss_mlp": 0.22851562, "step": 9396, "time_per_iteration": 3.176413059234619 }, { "auxiliary_loss_clip": 0.0113204, "auxiliary_loss_mlp": 0.01031146, "balance_loss_clip": 1.01760364, "balance_loss_mlp": 1.03891873, "epoch": 0.5649782053209078, "flos": 24973070641920.0, "grad_norm": 1.847741674355939, "language_loss": 0.69759029, "learning_rate": 1.5947768894458674e-06, "loss": 0.71922219, "num_input_tokens_seen": 202416865, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 9397, "time_per_iteration": 2.6320323944091797 }, { "auxiliary_loss_clip": 0.01141012, "auxiliary_loss_mlp": 0.01035471, "balance_loss_clip": 1.02245879, "balance_loss_mlp": 1.03952765, "epoch": 0.5650383285735758, "flos": 21726602582400.0, "grad_norm": 1.6676446010448482, "language_loss": 0.6684773, "learning_rate": 1.5944069106929924e-06, "loss": 0.69024211, "num_input_tokens_seen": 202436210, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.75, "step": 9398, "time_per_iteration": 2.5866856575012207 }, { "auxiliary_loss_clip": 0.01052103, "auxiliary_loss_mlp": 0.01006461, "balance_loss_clip": 1.00495875, "balance_loss_mlp": 1.01107359, "epoch": 0.5650984518262439, "flos": 65904484636800.0, "grad_norm": 0.7672928953529411, "language_loss": 0.58185232, "learning_rate": 1.594036946414692e-06, "loss": 0.60243797, "num_input_tokens_seen": 202492925, "router_z_loss_clip": 0.01501465, "router_z_loss_mlp": 0.23046875, "step": 9399, "time_per_iteration": 3.108772039413452 }, { "auxiliary_loss_clip": 0.01149576, "auxiliary_loss_mlp": 0.01035921, "balance_loss_clip": 1.02097201, "balance_loss_mlp": 1.03789544, "epoch": 0.5651585750789118, "flos": 21032592929280.0, "grad_norm": 1.9397077831584413, "language_loss": 0.73444796, "learning_rate": 1.5936669966241684e-06, "loss": 0.75630283, "num_input_tokens_seen": 202511905, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.76171875, "step": 9400, "time_per_iteration": 2.569592237472534 }, { "auxiliary_loss_clip": 0.01142508, "auxiliary_loss_mlp": 0.01036559, "balance_loss_clip": 1.02181888, "balance_loss_mlp": 1.03935254, "epoch": 0.5652186983315798, "flos": 18551919853440.0, "grad_norm": 1.9798890718408373, "language_loss": 0.60572147, "learning_rate": 1.593297061334624e-06, "loss": 0.6275121, "num_input_tokens_seen": 202529815, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.765625, "step": 9401, "time_per_iteration": 2.6307899951934814 }, { "auxiliary_loss_clip": 0.01137791, "auxiliary_loss_mlp": 0.01028483, "balance_loss_clip": 1.01563215, "balance_loss_mlp": 1.03764343, "epoch": 0.5652788215842477, "flos": 18478662065280.0, "grad_norm": 1.7505419815285346, "language_loss": 0.8113212, "learning_rate": 1.5929271405592622e-06, "loss": 0.83298397, "num_input_tokens_seen": 202547710, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 9402, "time_per_iteration": 2.5451462268829346 }, { "auxiliary_loss_clip": 0.01146014, "auxiliary_loss_mlp": 0.01030887, "balance_loss_clip": 1.01824474, "balance_loss_mlp": 1.03819394, "epoch": 0.5653389448369157, "flos": 30044052080640.0, "grad_norm": 1.492653172717246, "language_loss": 0.77581465, "learning_rate": 1.592557234311283e-06, "loss": 0.7975837, "num_input_tokens_seen": 202568835, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 9403, "time_per_iteration": 2.6813149452209473 }, { "auxiliary_loss_clip": 0.01149034, "auxiliary_loss_mlp": 0.01028994, "balance_loss_clip": 1.01558936, "balance_loss_mlp": 1.0389204, "epoch": 0.5653990680895836, "flos": 16727550128640.0, "grad_norm": 1.5634566323735744, "language_loss": 0.69003153, "learning_rate": 1.5921873426038888e-06, "loss": 0.71181184, "num_input_tokens_seen": 202587385, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 9404, "time_per_iteration": 4.00436544418335 }, { "auxiliary_loss_clip": 0.01127114, "auxiliary_loss_mlp": 0.01032557, "balance_loss_clip": 1.01957464, "balance_loss_mlp": 1.03754854, "epoch": 0.5654591913422516, "flos": 14538256179840.0, "grad_norm": 2.044942250398568, "language_loss": 0.67060971, "learning_rate": 1.5918174654502784e-06, "loss": 0.69220644, "num_input_tokens_seen": 202604815, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 9405, "time_per_iteration": 2.5281383991241455 }, { "auxiliary_loss_clip": 0.01127721, "auxiliary_loss_mlp": 0.01029828, "balance_loss_clip": 1.01753783, "balance_loss_mlp": 1.04095316, "epoch": 0.5655193145949196, "flos": 26209905603840.0, "grad_norm": 2.150057919445825, "language_loss": 0.74268258, "learning_rate": 1.5914476028636532e-06, "loss": 0.76425803, "num_input_tokens_seen": 202623775, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 9406, "time_per_iteration": 2.6604983806610107 }, { "auxiliary_loss_clip": 0.01150698, "auxiliary_loss_mlp": 0.01033902, "balance_loss_clip": 1.01853609, "balance_loss_mlp": 1.03838897, "epoch": 0.5655794378475876, "flos": 25046579825280.0, "grad_norm": 1.9287399449873373, "language_loss": 0.79380554, "learning_rate": 1.591077754857212e-06, "loss": 0.8156516, "num_input_tokens_seen": 202643375, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.7578125, "step": 9407, "time_per_iteration": 4.058988809585571 }, { "auxiliary_loss_clip": 0.0113544, "auxiliary_loss_mlp": 0.01033422, "balance_loss_clip": 1.02068412, "balance_loss_mlp": 1.03909016, "epoch": 0.5656395611002555, "flos": 31032852652800.0, "grad_norm": 1.8005023716185673, "language_loss": 0.70539165, "learning_rate": 1.5907079214441537e-06, "loss": 0.72708029, "num_input_tokens_seen": 202668400, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 9408, "time_per_iteration": 2.6865928173065186 }, { "auxiliary_loss_clip": 0.01148002, "auxiliary_loss_mlp": 0.01033772, "balance_loss_clip": 1.02026606, "balance_loss_mlp": 1.03966749, "epoch": 0.5656996843529235, "flos": 20229522606720.0, "grad_norm": 1.9236097784632145, "language_loss": 0.81177092, "learning_rate": 1.5903381026376769e-06, "loss": 0.83358866, "num_input_tokens_seen": 202685125, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 9409, "time_per_iteration": 2.5991690158843994 }, { "auxiliary_loss_clip": 0.0114011, "auxiliary_loss_mlp": 0.01031355, "balance_loss_clip": 1.01786613, "balance_loss_mlp": 1.03952157, "epoch": 0.5657598076055914, "flos": 20996251344000.0, "grad_norm": 1.6101864253198543, "language_loss": 0.7838704, "learning_rate": 1.5899682984509794e-06, "loss": 0.80558503, "num_input_tokens_seen": 202703830, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 9410, "time_per_iteration": 2.5746536254882812 }, { "auxiliary_loss_clip": 0.01112488, "auxiliary_loss_mlp": 0.01036453, "balance_loss_clip": 1.02339911, "balance_loss_mlp": 1.03871906, "epoch": 0.5658199308582594, "flos": 11545999649280.0, "grad_norm": 2.208380320029195, "language_loss": 0.83182406, "learning_rate": 1.589598508897259e-06, "loss": 0.85331351, "num_input_tokens_seen": 202719835, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 9411, "time_per_iteration": 2.536926507949829 }, { "auxiliary_loss_clip": 0.01124953, "auxiliary_loss_mlp": 0.01031838, "balance_loss_clip": 1.01703262, "balance_loss_mlp": 1.03888965, "epoch": 0.5658800541109275, "flos": 14172146807040.0, "grad_norm": 2.5025730304607623, "language_loss": 0.66884446, "learning_rate": 1.589228733989712e-06, "loss": 0.6904124, "num_input_tokens_seen": 202736795, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 9412, "time_per_iteration": 2.541268825531006 }, { "auxiliary_loss_clip": 0.01118125, "auxiliary_loss_mlp": 0.01031256, "balance_loss_clip": 1.01935935, "balance_loss_mlp": 1.03935659, "epoch": 0.5659401773635954, "flos": 27305073325440.0, "grad_norm": 1.5442297707661776, "language_loss": 0.58014333, "learning_rate": 1.5888589737415342e-06, "loss": 0.60163713, "num_input_tokens_seen": 202756900, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69921875, "step": 9413, "time_per_iteration": 2.631624937057495 }, { "auxiliary_loss_clip": 0.0116308, "auxiliary_loss_mlp": 0.01030849, "balance_loss_clip": 1.01817143, "balance_loss_mlp": 1.03864241, "epoch": 0.5660003006162634, "flos": 16728196573440.0, "grad_norm": 1.594463428351133, "language_loss": 0.69503206, "learning_rate": 1.588489228165923e-06, "loss": 0.7169714, "num_input_tokens_seen": 202775145, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 9414, "time_per_iteration": 5.572661399841309 }, { "auxiliary_loss_clip": 0.01161102, "auxiliary_loss_mlp": 0.01033466, "balance_loss_clip": 1.02144408, "balance_loss_mlp": 1.03799677, "epoch": 0.5660604238689313, "flos": 21653452535040.0, "grad_norm": 1.5791190577553536, "language_loss": 0.78440952, "learning_rate": 1.588119497276072e-06, "loss": 0.80635512, "num_input_tokens_seen": 202794505, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 9415, "time_per_iteration": 2.579829454421997 }, { "auxiliary_loss_clip": 0.01053129, "auxiliary_loss_mlp": 0.01003586, "balance_loss_clip": 1.00194097, "balance_loss_mlp": 1.0125792, "epoch": 0.5661205471215993, "flos": 68824022083200.0, "grad_norm": 0.684169561482108, "language_loss": 0.49201289, "learning_rate": 1.587749781085177e-06, "loss": 0.51258004, "num_input_tokens_seen": 202858580, "router_z_loss_clip": 0.01647949, "router_z_loss_mlp": 0.23144531, "step": 9416, "time_per_iteration": 3.2978081703186035 }, { "auxiliary_loss_clip": 0.0112135, "auxiliary_loss_mlp": 0.01283599, "balance_loss_clip": 1.02292573, "balance_loss_mlp": 1.03853559, "epoch": 0.5661806703742672, "flos": 28621774177920.0, "grad_norm": 1.8276671131874365, "language_loss": 0.62937307, "learning_rate": 1.587380079606432e-06, "loss": 0.65342259, "num_input_tokens_seen": 202878565, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73828125, "step": 9417, "time_per_iteration": 2.6553518772125244 }, { "auxiliary_loss_clip": 0.01130859, "auxiliary_loss_mlp": 0.01035422, "balance_loss_clip": 1.02181387, "balance_loss_mlp": 1.03797412, "epoch": 0.5662407936269352, "flos": 21397948116480.0, "grad_norm": 1.9146809327105438, "language_loss": 0.69107866, "learning_rate": 1.5870103928530302e-06, "loss": 0.71274149, "num_input_tokens_seen": 202897350, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 9418, "time_per_iteration": 2.603806734085083 }, { "auxiliary_loss_clip": 0.01120463, "auxiliary_loss_mlp": 0.01034527, "balance_loss_clip": 1.02002478, "balance_loss_mlp": 1.03815067, "epoch": 0.5663009168796032, "flos": 25660005315840.0, "grad_norm": 1.755008028649505, "language_loss": 0.64553726, "learning_rate": 1.5866407208381659e-06, "loss": 0.66708714, "num_input_tokens_seen": 202916745, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.734375, "step": 9419, "time_per_iteration": 2.6097469329833984 }, { "auxiliary_loss_clip": 0.01087831, "auxiliary_loss_mlp": 0.01004664, "balance_loss_clip": 1.00315571, "balance_loss_mlp": 1.01206243, "epoch": 0.5663610401322712, "flos": 67930458422400.0, "grad_norm": 0.7379051039933285, "language_loss": 0.59733474, "learning_rate": 1.58627106357503e-06, "loss": 0.61825973, "num_input_tokens_seen": 202982375, "router_z_loss_clip": 0.01507568, "router_z_loss_mlp": 0.23046875, "step": 9420, "time_per_iteration": 3.3279287815093994 }, { "auxiliary_loss_clip": 0.01125111, "auxiliary_loss_mlp": 0.01275472, "balance_loss_clip": 1.01683164, "balance_loss_mlp": 1.03706646, "epoch": 0.5664211633849391, "flos": 24609367480320.0, "grad_norm": 1.8518770533860556, "language_loss": 0.74699152, "learning_rate": 1.5859014210768163e-06, "loss": 0.77099735, "num_input_tokens_seen": 203002430, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 9421, "time_per_iteration": 2.6254751682281494 }, { "auxiliary_loss_clip": 0.01141012, "auxiliary_loss_mlp": 0.01033387, "balance_loss_clip": 1.02016664, "balance_loss_mlp": 1.03953838, "epoch": 0.5664812866376071, "flos": 11648811352320.0, "grad_norm": 2.120983557099997, "language_loss": 0.7272706, "learning_rate": 1.5855317933567156e-06, "loss": 0.74901462, "num_input_tokens_seen": 203019425, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 9422, "time_per_iteration": 2.5601649284362793 }, { "auxiliary_loss_clip": 0.01118947, "auxiliary_loss_mlp": 0.01037283, "balance_loss_clip": 1.02533269, "balance_loss_mlp": 1.04001653, "epoch": 0.566541409890275, "flos": 24643985212800.0, "grad_norm": 1.6151969052827468, "language_loss": 0.82020855, "learning_rate": 1.5851621804279186e-06, "loss": 0.84177083, "num_input_tokens_seen": 203039035, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69921875, "step": 9423, "time_per_iteration": 2.5748555660247803 }, { "auxiliary_loss_clip": 0.01126786, "auxiliary_loss_mlp": 0.01037066, "balance_loss_clip": 1.0237205, "balance_loss_mlp": 1.03744268, "epoch": 0.566601533142943, "flos": 22270577126400.0, "grad_norm": 1.7202254624543707, "language_loss": 0.80640113, "learning_rate": 1.5847925823036169e-06, "loss": 0.82803965, "num_input_tokens_seen": 203059320, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 9424, "time_per_iteration": 2.590801239013672 }, { "auxiliary_loss_clip": 0.01116889, "auxiliary_loss_mlp": 0.01281442, "balance_loss_clip": 1.01964068, "balance_loss_mlp": 1.04050589, "epoch": 0.5666616563956111, "flos": 29971656218880.0, "grad_norm": 1.9370162357907565, "language_loss": 0.79044807, "learning_rate": 1.584422998996999e-06, "loss": 0.81443143, "num_input_tokens_seen": 203078490, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76171875, "step": 9425, "time_per_iteration": 2.6289851665496826 }, { "auxiliary_loss_clip": 0.01130198, "auxiliary_loss_mlp": 0.01035599, "balance_loss_clip": 1.02218211, "balance_loss_mlp": 1.04009771, "epoch": 0.566721779648279, "flos": 17781456101760.0, "grad_norm": 1.8482674530363647, "language_loss": 0.58563244, "learning_rate": 1.584053430521256e-06, "loss": 0.60729039, "num_input_tokens_seen": 203096065, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 9426, "time_per_iteration": 2.6029133796691895 }, { "auxiliary_loss_clip": 0.01124768, "auxiliary_loss_mlp": 0.01031828, "balance_loss_clip": 1.01904869, "balance_loss_mlp": 1.04021215, "epoch": 0.566781902900947, "flos": 21033490769280.0, "grad_norm": 1.8577705441975196, "language_loss": 0.81590039, "learning_rate": 1.5836838768895757e-06, "loss": 0.83746636, "num_input_tokens_seen": 203115270, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.75390625, "step": 9427, "time_per_iteration": 2.5730907917022705 }, { "auxiliary_loss_clip": 0.01110045, "auxiliary_loss_mlp": 0.01034904, "balance_loss_clip": 1.02187443, "balance_loss_mlp": 1.04004562, "epoch": 0.5668420261536149, "flos": 23148593176320.0, "grad_norm": 2.827565810941965, "language_loss": 0.85993397, "learning_rate": 1.5833143381151474e-06, "loss": 0.88138342, "num_input_tokens_seen": 203134290, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 9428, "time_per_iteration": 2.563234806060791 }, { "auxiliary_loss_clip": 0.01136363, "auxiliary_loss_mlp": 0.01035902, "balance_loss_clip": 1.02235961, "balance_loss_mlp": 1.04229558, "epoch": 0.5669021494062829, "flos": 22601601889920.0, "grad_norm": 2.123425108783727, "language_loss": 0.73654807, "learning_rate": 1.5829448142111586e-06, "loss": 0.75827074, "num_input_tokens_seen": 203152935, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76171875, "step": 9429, "time_per_iteration": 2.6689553260803223 }, { "auxiliary_loss_clip": 0.01132013, "auxiliary_loss_mlp": 0.01282614, "balance_loss_clip": 1.02158129, "balance_loss_mlp": 1.03926504, "epoch": 0.5669622726589508, "flos": 17381231786880.0, "grad_norm": 2.066691402846244, "language_loss": 0.75705647, "learning_rate": 1.582575305190796e-06, "loss": 0.78120273, "num_input_tokens_seen": 203170110, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75, "step": 9430, "time_per_iteration": 2.5554239749908447 }, { "auxiliary_loss_clip": 0.01112536, "auxiliary_loss_mlp": 0.01034442, "balance_loss_clip": 1.02152002, "balance_loss_mlp": 1.03883314, "epoch": 0.5670223959116188, "flos": 18763253521920.0, "grad_norm": 1.8963958337023787, "language_loss": 0.72781265, "learning_rate": 1.5822058110672475e-06, "loss": 0.74928236, "num_input_tokens_seen": 203188825, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 9431, "time_per_iteration": 2.567532777786255 }, { "auxiliary_loss_clip": 0.01140932, "auxiliary_loss_mlp": 0.01030103, "balance_loss_clip": 1.01861715, "balance_loss_mlp": 1.03898358, "epoch": 0.5670825191642868, "flos": 13553334276480.0, "grad_norm": 1.7044154972054595, "language_loss": 0.73428369, "learning_rate": 1.5818363318536985e-06, "loss": 0.75599402, "num_input_tokens_seen": 203206860, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 9432, "time_per_iteration": 2.554109811782837 }, { "auxiliary_loss_clip": 0.01147779, "auxiliary_loss_mlp": 0.01030946, "balance_loss_clip": 1.01868534, "balance_loss_mlp": 1.03868234, "epoch": 0.5671426424169548, "flos": 22054035985920.0, "grad_norm": 1.7404698471486704, "language_loss": 0.77741504, "learning_rate": 1.5814668675633356e-06, "loss": 0.79920226, "num_input_tokens_seen": 203225625, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.73046875, "step": 9433, "time_per_iteration": 2.6087427139282227 }, { "auxiliary_loss_clip": 0.01115597, "auxiliary_loss_mlp": 0.01032512, "balance_loss_clip": 1.01722991, "balance_loss_mlp": 1.04008627, "epoch": 0.5672027656696227, "flos": 21323972056320.0, "grad_norm": 2.2307286876373116, "language_loss": 0.63462698, "learning_rate": 1.581097418209344e-06, "loss": 0.65610802, "num_input_tokens_seen": 203242920, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7578125, "step": 9434, "time_per_iteration": 2.576451539993286 }, { "auxiliary_loss_clip": 0.01118546, "auxiliary_loss_mlp": 0.01032061, "balance_loss_clip": 1.01945448, "balance_loss_mlp": 1.03863478, "epoch": 0.5672628889222907, "flos": 23514056104320.0, "grad_norm": 1.6185982306053686, "language_loss": 0.66293383, "learning_rate": 1.580727983804907e-06, "loss": 0.6844399, "num_input_tokens_seen": 203261995, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 9435, "time_per_iteration": 2.6301639080047607 }, { "auxiliary_loss_clip": 0.0112698, "auxiliary_loss_mlp": 0.01276411, "balance_loss_clip": 1.01714134, "balance_loss_mlp": 1.03820586, "epoch": 0.5673230121749586, "flos": 27121928855040.0, "grad_norm": 1.3557662211133674, "language_loss": 0.7161029, "learning_rate": 1.5803585643632102e-06, "loss": 0.7401368, "num_input_tokens_seen": 203280670, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 9436, "time_per_iteration": 2.6511690616607666 }, { "auxiliary_loss_clip": 0.01114598, "auxiliary_loss_mlp": 0.01031487, "balance_loss_clip": 1.01812387, "balance_loss_mlp": 1.03981543, "epoch": 0.5673831354276266, "flos": 31141985149440.0, "grad_norm": 1.496802555210907, "language_loss": 0.7404505, "learning_rate": 1.5799891598974366e-06, "loss": 0.76191133, "num_input_tokens_seen": 203304800, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 9437, "time_per_iteration": 2.7067952156066895 }, { "auxiliary_loss_clip": 0.01126292, "auxiliary_loss_mlp": 0.01034816, "balance_loss_clip": 1.02083302, "balance_loss_mlp": 1.04360068, "epoch": 0.5674432586802945, "flos": 27673193859840.0, "grad_norm": 2.077749591384099, "language_loss": 0.60912013, "learning_rate": 1.5796197704207698e-06, "loss": 0.63073123, "num_input_tokens_seen": 203324060, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73828125, "step": 9438, "time_per_iteration": 2.68259596824646 }, { "auxiliary_loss_clip": 0.01129268, "auxiliary_loss_mlp": 0.01028343, "balance_loss_clip": 1.01574826, "balance_loss_mlp": 1.03814292, "epoch": 0.5675033819329626, "flos": 26615157822720.0, "grad_norm": 1.506501510755135, "language_loss": 0.74943864, "learning_rate": 1.579250395946392e-06, "loss": 0.77101475, "num_input_tokens_seen": 203344360, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 9439, "time_per_iteration": 2.6070499420166016 }, { "auxiliary_loss_clip": 0.01131336, "auxiliary_loss_mlp": 0.01033038, "balance_loss_clip": 1.01844633, "balance_loss_mlp": 1.04143322, "epoch": 0.5675635051856306, "flos": 19098372435840.0, "grad_norm": 2.0239572473785548, "language_loss": 0.83427167, "learning_rate": 1.5788810364874849e-06, "loss": 0.85591543, "num_input_tokens_seen": 203362115, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.72265625, "step": 9440, "time_per_iteration": 2.648127794265747 }, { "auxiliary_loss_clip": 0.01142417, "auxiliary_loss_mlp": 0.01037719, "balance_loss_clip": 1.02352142, "balance_loss_mlp": 1.03865218, "epoch": 0.5676236284382985, "flos": 17566315591680.0, "grad_norm": 1.990507031066767, "language_loss": 0.75874507, "learning_rate": 1.5785116920572307e-06, "loss": 0.78054643, "num_input_tokens_seen": 203380550, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 9441, "time_per_iteration": 2.5969784259796143 }, { "auxiliary_loss_clip": 0.01147551, "auxiliary_loss_mlp": 0.01036271, "balance_loss_clip": 1.02295542, "balance_loss_mlp": 1.03965724, "epoch": 0.5676837516909665, "flos": 15954069634560.0, "grad_norm": 1.8523792882322556, "language_loss": 0.82778549, "learning_rate": 1.5781423626688097e-06, "loss": 0.8496238, "num_input_tokens_seen": 203396590, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 9442, "time_per_iteration": 2.565505027770996 }, { "auxiliary_loss_clip": 0.01134569, "auxiliary_loss_mlp": 0.01029879, "balance_loss_clip": 1.01714206, "balance_loss_mlp": 1.04088211, "epoch": 0.5677438749436344, "flos": 18295912644480.0, "grad_norm": 1.8200371008154346, "language_loss": 0.74571395, "learning_rate": 1.5777730483354033e-06, "loss": 0.76735842, "num_input_tokens_seen": 203414280, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.671875, "step": 9443, "time_per_iteration": 2.547659397125244 }, { "auxiliary_loss_clip": 0.01130538, "auxiliary_loss_mlp": 0.01033429, "balance_loss_clip": 1.02011967, "balance_loss_mlp": 1.03918457, "epoch": 0.5678039981963025, "flos": 17931311642880.0, "grad_norm": 1.8030569980475821, "language_loss": 0.77420306, "learning_rate": 1.5774037490701903e-06, "loss": 0.79584271, "num_input_tokens_seen": 203433280, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 9444, "time_per_iteration": 2.665226936340332 }, { "auxiliary_loss_clip": 0.0114192, "auxiliary_loss_mlp": 0.01039933, "balance_loss_clip": 1.02637887, "balance_loss_mlp": 1.04077375, "epoch": 0.5678641214489704, "flos": 19316350120320.0, "grad_norm": 1.578810020986299, "language_loss": 0.81134146, "learning_rate": 1.57703446488635e-06, "loss": 0.83315998, "num_input_tokens_seen": 203449935, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 9445, "time_per_iteration": 3.961398124694824 }, { "auxiliary_loss_clip": 0.01108732, "auxiliary_loss_mlp": 0.01026799, "balance_loss_clip": 1.01402545, "balance_loss_mlp": 1.03653002, "epoch": 0.5679242447016384, "flos": 27751084502400.0, "grad_norm": 1.3293673288974837, "language_loss": 0.70944566, "learning_rate": 1.5766651957970624e-06, "loss": 0.73080093, "num_input_tokens_seen": 203473025, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 9446, "time_per_iteration": 2.6935529708862305 }, { "auxiliary_loss_clip": 0.01141505, "auxiliary_loss_mlp": 0.01032284, "balance_loss_clip": 1.01903379, "balance_loss_mlp": 1.03935146, "epoch": 0.5679843679543063, "flos": 23769093646080.0, "grad_norm": 1.8373477128251698, "language_loss": 0.73568296, "learning_rate": 1.5762959418155043e-06, "loss": 0.7574209, "num_input_tokens_seen": 203492895, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 9447, "time_per_iteration": 2.7079215049743652 }, { "auxiliary_loss_clip": 0.01133458, "auxiliary_loss_mlp": 0.01033854, "balance_loss_clip": 1.02050877, "balance_loss_mlp": 1.04033947, "epoch": 0.5680444912069743, "flos": 25591883172480.0, "grad_norm": 2.0419820404331803, "language_loss": 0.74857378, "learning_rate": 1.5759267029548548e-06, "loss": 0.77024686, "num_input_tokens_seen": 203513710, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 9448, "time_per_iteration": 2.709153413772583 }, { "auxiliary_loss_clip": 0.01152518, "auxiliary_loss_mlp": 0.01038576, "balance_loss_clip": 1.02471209, "balance_loss_mlp": 1.04080224, "epoch": 0.5681046144596422, "flos": 23695799944320.0, "grad_norm": 2.125261336782834, "language_loss": 0.76555926, "learning_rate": 1.5755574792282902e-06, "loss": 0.78747016, "num_input_tokens_seen": 203531630, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 9449, "time_per_iteration": 4.066070556640625 }, { "auxiliary_loss_clip": 0.01130095, "auxiliary_loss_mlp": 0.01033216, "balance_loss_clip": 1.02000737, "balance_loss_mlp": 1.0389607, "epoch": 0.5681647377123102, "flos": 28000770917760.0, "grad_norm": 1.5485246715581256, "language_loss": 0.74834335, "learning_rate": 1.5751882706489875e-06, "loss": 0.7699765, "num_input_tokens_seen": 203551885, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 9450, "time_per_iteration": 2.5878794193267822 }, { "auxiliary_loss_clip": 0.01134667, "auxiliary_loss_mlp": 0.01037362, "balance_loss_clip": 1.02463603, "balance_loss_mlp": 1.0425477, "epoch": 0.5682248609649782, "flos": 22747758330240.0, "grad_norm": 1.6105896862265938, "language_loss": 0.6720258, "learning_rate": 1.5748190772301228e-06, "loss": 0.69374609, "num_input_tokens_seen": 203572250, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.74609375, "step": 9451, "time_per_iteration": 2.595792055130005 }, { "auxiliary_loss_clip": 0.01138462, "auxiliary_loss_mlp": 0.0103268, "balance_loss_clip": 1.01772523, "balance_loss_mlp": 1.03991055, "epoch": 0.5682849842176462, "flos": 21288600138240.0, "grad_norm": 2.198438250482466, "language_loss": 0.72039497, "learning_rate": 1.574449898984871e-06, "loss": 0.74210644, "num_input_tokens_seen": 203590605, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8046875, "step": 9452, "time_per_iteration": 2.5840303897857666 }, { "auxiliary_loss_clip": 0.01135329, "auxiliary_loss_mlp": 0.01031114, "balance_loss_clip": 1.01862121, "balance_loss_mlp": 1.03752422, "epoch": 0.5683451074703142, "flos": 21141689512320.0, "grad_norm": 1.4579541299558276, "language_loss": 0.70327836, "learning_rate": 1.5740807359264082e-06, "loss": 0.7249428, "num_input_tokens_seen": 203610080, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 9453, "time_per_iteration": 2.5631349086761475 }, { "auxiliary_loss_clip": 0.01147972, "auxiliary_loss_mlp": 0.01284134, "balance_loss_clip": 1.02488756, "balance_loss_mlp": 1.04031062, "epoch": 0.5684052307229821, "flos": 22344481359360.0, "grad_norm": 1.6489272415539096, "language_loss": 0.69600809, "learning_rate": 1.5737115880679074e-06, "loss": 0.72032917, "num_input_tokens_seen": 203630060, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 9454, "time_per_iteration": 2.5798213481903076 }, { "auxiliary_loss_clip": 0.01124613, "auxiliary_loss_mlp": 0.01031635, "balance_loss_clip": 1.01982176, "balance_loss_mlp": 1.03577352, "epoch": 0.5684653539756501, "flos": 21798639308160.0, "grad_norm": 2.016274928236637, "language_loss": 0.6090982, "learning_rate": 1.5733424554225443e-06, "loss": 0.63066065, "num_input_tokens_seen": 203649065, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.7109375, "step": 9455, "time_per_iteration": 4.092095851898193 }, { "auxiliary_loss_clip": 0.01153511, "auxiliary_loss_mlp": 0.01037329, "balance_loss_clip": 1.02382278, "balance_loss_mlp": 1.04171586, "epoch": 0.568525477228318, "flos": 22999635475200.0, "grad_norm": 1.7885651017897606, "language_loss": 0.73058558, "learning_rate": 1.5729733380034915e-06, "loss": 0.75249398, "num_input_tokens_seen": 203667545, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.765625, "step": 9456, "time_per_iteration": 4.159726142883301 }, { "auxiliary_loss_clip": 0.01128404, "auxiliary_loss_mlp": 0.01032017, "balance_loss_clip": 1.01995921, "balance_loss_mlp": 1.03880632, "epoch": 0.568585600480986, "flos": 21392489249280.0, "grad_norm": 1.566195459984674, "language_loss": 0.77223623, "learning_rate": 1.5726042358239212e-06, "loss": 0.79384041, "num_input_tokens_seen": 203686025, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.72265625, "step": 9457, "time_per_iteration": 2.578078031539917 }, { "auxiliary_loss_clip": 0.01122226, "auxiliary_loss_mlp": 0.01034676, "balance_loss_clip": 1.02023339, "balance_loss_mlp": 1.03849268, "epoch": 0.568645723733654, "flos": 30007351359360.0, "grad_norm": 1.7629950811854596, "language_loss": 0.66019821, "learning_rate": 1.5722351488970072e-06, "loss": 0.68176717, "num_input_tokens_seen": 203705540, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75, "step": 9458, "time_per_iteration": 2.572816848754883 }, { "auxiliary_loss_clip": 0.01122576, "auxiliary_loss_mlp": 0.01030223, "balance_loss_clip": 1.01638842, "balance_loss_mlp": 1.03868282, "epoch": 0.568705846986322, "flos": 20412667077120.0, "grad_norm": 2.1620370656205563, "language_loss": 0.677127, "learning_rate": 1.5718660772359197e-06, "loss": 0.69865501, "num_input_tokens_seen": 203723670, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 9459, "time_per_iteration": 2.608822822570801 }, { "auxiliary_loss_clip": 0.01143117, "auxiliary_loss_mlp": 0.01033289, "balance_loss_clip": 1.01940107, "balance_loss_mlp": 1.04046392, "epoch": 0.5687659702389899, "flos": 17456752131840.0, "grad_norm": 2.2161964567162835, "language_loss": 0.7678324, "learning_rate": 1.571497020853831e-06, "loss": 0.78959644, "num_input_tokens_seen": 203739705, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 9460, "time_per_iteration": 2.494868755340576 }, { "auxiliary_loss_clip": 0.01157278, "auxiliary_loss_mlp": 0.01043398, "balance_loss_clip": 1.02973068, "balance_loss_mlp": 1.03840303, "epoch": 0.5688260934916579, "flos": 25406081095680.0, "grad_norm": 1.90128986764168, "language_loss": 0.72237736, "learning_rate": 1.571127979763911e-06, "loss": 0.74438411, "num_input_tokens_seen": 203759000, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 9461, "time_per_iteration": 2.6096975803375244 }, { "auxiliary_loss_clip": 0.01120914, "auxiliary_loss_mlp": 0.01034857, "balance_loss_clip": 1.02102876, "balance_loss_mlp": 1.03855443, "epoch": 0.5688862167443258, "flos": 21608024808960.0, "grad_norm": 2.5257843910002746, "language_loss": 0.7329669, "learning_rate": 1.5707589539793305e-06, "loss": 0.75452459, "num_input_tokens_seen": 203774295, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 9462, "time_per_iteration": 2.530252456665039 }, { "auxiliary_loss_clip": 0.0111627, "auxiliary_loss_mlp": 0.01029247, "balance_loss_clip": 1.01721883, "balance_loss_mlp": 1.0373702, "epoch": 0.5689463399969938, "flos": 22418996123520.0, "grad_norm": 2.208603392605044, "language_loss": 0.72157824, "learning_rate": 1.5703899435132588e-06, "loss": 0.74303341, "num_input_tokens_seen": 203792710, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69921875, "step": 9463, "time_per_iteration": 2.547083854675293 }, { "auxiliary_loss_clip": 0.01118884, "auxiliary_loss_mlp": 0.01033166, "balance_loss_clip": 1.0204699, "balance_loss_mlp": 1.03713322, "epoch": 0.5690064632496618, "flos": 18296810484480.0, "grad_norm": 2.1128542788135842, "language_loss": 0.74292016, "learning_rate": 1.570020948378865e-06, "loss": 0.76444066, "num_input_tokens_seen": 203811645, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73046875, "step": 9464, "time_per_iteration": 2.4843835830688477 }, { "auxiliary_loss_clip": 0.01120847, "auxiliary_loss_mlp": 0.0128373, "balance_loss_clip": 1.0238322, "balance_loss_mlp": 1.03819036, "epoch": 0.5690665865023298, "flos": 21579260993280.0, "grad_norm": 1.7039142714914697, "language_loss": 0.84162998, "learning_rate": 1.5696519685893175e-06, "loss": 0.86567581, "num_input_tokens_seen": 203830040, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 9465, "time_per_iteration": 2.543729782104492 }, { "auxiliary_loss_clip": 0.01125071, "auxiliary_loss_mlp": 0.0103196, "balance_loss_clip": 1.01937699, "balance_loss_mlp": 1.0367676, "epoch": 0.5691267097549978, "flos": 24421446501120.0, "grad_norm": 1.9292040845287082, "language_loss": 0.71980512, "learning_rate": 1.5692830041577842e-06, "loss": 0.74137545, "num_input_tokens_seen": 203851245, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 9466, "time_per_iteration": 2.5775349140167236 }, { "auxiliary_loss_clip": 0.01063972, "auxiliary_loss_mlp": 0.0100132, "balance_loss_clip": 0.99978274, "balance_loss_mlp": 1.01425016, "epoch": 0.5691868330076657, "flos": 61657906199040.0, "grad_norm": 0.7313907541709973, "language_loss": 0.55376214, "learning_rate": 1.5689140550974323e-06, "loss": 0.57441515, "num_input_tokens_seen": 203916400, "router_z_loss_clip": 0.01531982, "router_z_loss_mlp": 0.22949219, "step": 9467, "time_per_iteration": 3.236898899078369 }, { "auxiliary_loss_clip": 0.01124021, "auxiliary_loss_mlp": 0.01038085, "balance_loss_clip": 1.02314806, "balance_loss_mlp": 1.03889847, "epoch": 0.5692469562603337, "flos": 21325193118720.0, "grad_norm": 2.1042735332132914, "language_loss": 0.63927847, "learning_rate": 1.5685451214214292e-06, "loss": 0.66089952, "num_input_tokens_seen": 203935870, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7578125, "step": 9468, "time_per_iteration": 2.594567060470581 }, { "auxiliary_loss_clip": 0.01145938, "auxiliary_loss_mlp": 0.01034184, "balance_loss_clip": 1.02085054, "balance_loss_mlp": 1.0377183, "epoch": 0.5693070795130016, "flos": 23367899664000.0, "grad_norm": 1.7933613853958992, "language_loss": 0.79391837, "learning_rate": 1.5681762031429405e-06, "loss": 0.8157196, "num_input_tokens_seen": 203954950, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 9469, "time_per_iteration": 2.633419990539551 }, { "auxiliary_loss_clip": 0.01141969, "auxiliary_loss_mlp": 0.01041307, "balance_loss_clip": 1.02744341, "balance_loss_mlp": 1.03859985, "epoch": 0.5693672027656697, "flos": 18697250280960.0, "grad_norm": 2.088204303819715, "language_loss": 0.69779217, "learning_rate": 1.5678073002751329e-06, "loss": 0.719625, "num_input_tokens_seen": 203972715, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 9470, "time_per_iteration": 2.537109851837158 }, { "auxiliary_loss_clip": 0.01128808, "auxiliary_loss_mlp": 0.01033826, "balance_loss_clip": 1.02027154, "balance_loss_mlp": 1.03726339, "epoch": 0.5694273260183376, "flos": 20449188230400.0, "grad_norm": 1.6960819672251353, "language_loss": 0.74612296, "learning_rate": 1.5674384128311702e-06, "loss": 0.76774931, "num_input_tokens_seen": 203990775, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 9471, "time_per_iteration": 2.562830686569214 }, { "auxiliary_loss_clip": 0.01137296, "auxiliary_loss_mlp": 0.0103461, "balance_loss_clip": 1.02185416, "balance_loss_mlp": 1.03921974, "epoch": 0.5694874492710056, "flos": 17603195880960.0, "grad_norm": 1.874392684458139, "language_loss": 0.57546103, "learning_rate": 1.5670695408242186e-06, "loss": 0.59718013, "num_input_tokens_seen": 204008845, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 9472, "time_per_iteration": 2.54060959815979 }, { "auxiliary_loss_clip": 0.01134648, "auxiliary_loss_mlp": 0.01031586, "balance_loss_clip": 1.01914668, "balance_loss_mlp": 1.03852487, "epoch": 0.5695475725236735, "flos": 13370836250880.0, "grad_norm": 1.8764346790485067, "language_loss": 0.73980808, "learning_rate": 1.5667006842674412e-06, "loss": 0.76147044, "num_input_tokens_seen": 204023755, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 9473, "time_per_iteration": 2.539980411529541 }, { "auxiliary_loss_clip": 0.01063644, "auxiliary_loss_mlp": 0.0100029, "balance_loss_clip": 0.99877566, "balance_loss_mlp": 1.01381278, "epoch": 0.5696076957763415, "flos": 68174398661760.0, "grad_norm": 0.7184252613500818, "language_loss": 0.5764432, "learning_rate": 1.5663318431740017e-06, "loss": 0.59708256, "num_input_tokens_seen": 204091255, "router_z_loss_clip": 0.01513672, "router_z_loss_mlp": 0.22851562, "step": 9474, "time_per_iteration": 3.192512035369873 }, { "auxiliary_loss_clip": 0.01120006, "auxiliary_loss_mlp": 0.01035902, "balance_loss_clip": 1.0219723, "balance_loss_mlp": 1.04008937, "epoch": 0.5696678190290094, "flos": 33838301525760.0, "grad_norm": 2.0444834591832772, "language_loss": 0.53996241, "learning_rate": 1.5659630175570634e-06, "loss": 0.56152147, "num_input_tokens_seen": 204113285, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7109375, "step": 9475, "time_per_iteration": 2.620384454727173 }, { "auxiliary_loss_clip": 0.01112461, "auxiliary_loss_mlp": 0.01037712, "balance_loss_clip": 1.0238781, "balance_loss_mlp": 1.03885579, "epoch": 0.5697279422816774, "flos": 26356600748160.0, "grad_norm": 1.6385434881310963, "language_loss": 0.7961781, "learning_rate": 1.565594207429788e-06, "loss": 0.81767976, "num_input_tokens_seen": 204133045, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 9476, "time_per_iteration": 2.6419527530670166 }, { "auxiliary_loss_clip": 0.01131398, "auxiliary_loss_mlp": 0.01038063, "balance_loss_clip": 1.02486086, "balance_loss_mlp": 1.03983438, "epoch": 0.5697880655343454, "flos": 22930507751040.0, "grad_norm": 1.892844680634997, "language_loss": 0.66608369, "learning_rate": 1.5652254128053385e-06, "loss": 0.68777829, "num_input_tokens_seen": 204152590, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 9477, "time_per_iteration": 2.548814058303833 }, { "auxiliary_loss_clip": 0.01137182, "auxiliary_loss_mlp": 0.01030785, "balance_loss_clip": 1.01743913, "balance_loss_mlp": 1.03753114, "epoch": 0.5698481887870134, "flos": 26761314263040.0, "grad_norm": 2.8338338919539847, "language_loss": 0.70862556, "learning_rate": 1.5648566336968758e-06, "loss": 0.73030519, "num_input_tokens_seen": 204171815, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 9478, "time_per_iteration": 2.616849184036255 }, { "auxiliary_loss_clip": 0.01133424, "auxiliary_loss_mlp": 0.01032699, "balance_loss_clip": 1.0207659, "balance_loss_mlp": 1.03876448, "epoch": 0.5699083120396814, "flos": 15742269089280.0, "grad_norm": 1.7626976160005725, "language_loss": 0.69712031, "learning_rate": 1.5644878701175604e-06, "loss": 0.71878159, "num_input_tokens_seen": 204188535, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 9479, "time_per_iteration": 2.6003386974334717 }, { "auxiliary_loss_clip": 0.011327, "auxiliary_loss_mlp": 0.0102899, "balance_loss_clip": 1.01678872, "balance_loss_mlp": 1.03740573, "epoch": 0.5699684352923493, "flos": 19537272720000.0, "grad_norm": 1.4002378475906707, "language_loss": 0.71397913, "learning_rate": 1.5641191220805525e-06, "loss": 0.73559606, "num_input_tokens_seen": 204208365, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 9480, "time_per_iteration": 2.6251416206359863 }, { "auxiliary_loss_clip": 0.011373, "auxiliary_loss_mlp": 0.01275454, "balance_loss_clip": 1.01496744, "balance_loss_mlp": 1.03825688, "epoch": 0.5700285585450173, "flos": 16253349753600.0, "grad_norm": 2.356653384588232, "language_loss": 0.7194128, "learning_rate": 1.5637503895990116e-06, "loss": 0.74354029, "num_input_tokens_seen": 204226560, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.72265625, "step": 9481, "time_per_iteration": 2.630147695541382 }, { "auxiliary_loss_clip": 0.01120476, "auxiliary_loss_mlp": 0.01033061, "balance_loss_clip": 1.0198884, "balance_loss_mlp": 1.03901815, "epoch": 0.5700886817976852, "flos": 19864993432320.0, "grad_norm": 1.7214712741139608, "language_loss": 0.78267378, "learning_rate": 1.5633816726860975e-06, "loss": 0.80420911, "num_input_tokens_seen": 204245410, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 9482, "time_per_iteration": 2.6862099170684814 }, { "auxiliary_loss_clip": 0.01127327, "auxiliary_loss_mlp": 0.01026895, "balance_loss_clip": 1.01411557, "balance_loss_mlp": 1.0380677, "epoch": 0.5701488050503533, "flos": 23841704989440.0, "grad_norm": 1.5543292122446073, "language_loss": 0.77759039, "learning_rate": 1.5630129713549685e-06, "loss": 0.79913259, "num_input_tokens_seen": 204264840, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 9483, "time_per_iteration": 2.5921897888183594 }, { "auxiliary_loss_clip": 0.01127074, "auxiliary_loss_mlp": 0.010353, "balance_loss_clip": 1.02340293, "balance_loss_mlp": 1.03756475, "epoch": 0.5702089283030212, "flos": 23659673840640.0, "grad_norm": 1.6462919200491057, "language_loss": 0.81324077, "learning_rate": 1.562644285618782e-06, "loss": 0.8348645, "num_input_tokens_seen": 204284335, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.71875, "step": 9484, "time_per_iteration": 2.651155710220337 }, { "auxiliary_loss_clip": 0.01111496, "auxiliary_loss_mlp": 0.01280765, "balance_loss_clip": 1.02026892, "balance_loss_mlp": 1.03818059, "epoch": 0.5702690515556892, "flos": 27891171544320.0, "grad_norm": 1.9793054729391582, "language_loss": 0.6023798, "learning_rate": 1.5622756154906964e-06, "loss": 0.62630236, "num_input_tokens_seen": 204302590, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 9485, "time_per_iteration": 2.5565171241760254 }, { "auxiliary_loss_clip": 0.0111926, "auxiliary_loss_mlp": 0.01031294, "balance_loss_clip": 1.01844943, "balance_loss_mlp": 1.03759289, "epoch": 0.5703291748083571, "flos": 24023951619840.0, "grad_norm": 1.6993858976019, "language_loss": 0.65437353, "learning_rate": 1.5619069609838676e-06, "loss": 0.67587906, "num_input_tokens_seen": 204323055, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 9486, "time_per_iteration": 2.658953905105591 }, { "auxiliary_loss_clip": 0.01053187, "auxiliary_loss_mlp": 0.0100068, "balance_loss_clip": 0.99910605, "balance_loss_mlp": 1.01227522, "epoch": 0.5703892980610251, "flos": 57023382919680.0, "grad_norm": 0.6672399526821123, "language_loss": 0.48000723, "learning_rate": 1.5615383221114531e-06, "loss": 0.50054586, "num_input_tokens_seen": 204386160, "router_z_loss_clip": 0.01574707, "router_z_loss_mlp": 0.22949219, "step": 9487, "time_per_iteration": 4.615994215011597 }, { "auxiliary_loss_clip": 0.011372, "auxiliary_loss_mlp": 0.01281566, "balance_loss_clip": 1.02133965, "balance_loss_mlp": 1.03887939, "epoch": 0.570449421313693, "flos": 24351025887360.0, "grad_norm": 1.5106666668434943, "language_loss": 0.85902339, "learning_rate": 1.5611696988866076e-06, "loss": 0.88321102, "num_input_tokens_seen": 204406315, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 9488, "time_per_iteration": 2.5698158740997314 }, { "auxiliary_loss_clip": 0.01139846, "auxiliary_loss_mlp": 0.01032395, "balance_loss_clip": 1.01893044, "balance_loss_mlp": 1.0377568, "epoch": 0.570509544566361, "flos": 24828566227200.0, "grad_norm": 1.444796740927311, "language_loss": 0.79224467, "learning_rate": 1.5608010913224868e-06, "loss": 0.81396705, "num_input_tokens_seen": 204427645, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 9489, "time_per_iteration": 2.619420289993286 }, { "auxiliary_loss_clip": 0.01127948, "auxiliary_loss_mlp": 0.01029051, "balance_loss_clip": 1.01711774, "balance_loss_mlp": 1.03784466, "epoch": 0.570569667819029, "flos": 21397301671680.0, "grad_norm": 1.7161759188585428, "language_loss": 0.70042479, "learning_rate": 1.5604324994322453e-06, "loss": 0.72199476, "num_input_tokens_seen": 204445910, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7265625, "step": 9490, "time_per_iteration": 2.5504705905914307 }, { "auxiliary_loss_clip": 0.0113692, "auxiliary_loss_mlp": 0.01033874, "balance_loss_clip": 1.02132106, "balance_loss_mlp": 1.03867054, "epoch": 0.570629791071697, "flos": 23216751233280.0, "grad_norm": 1.5268667961444569, "language_loss": 0.75945461, "learning_rate": 1.560063923229037e-06, "loss": 0.78116256, "num_input_tokens_seen": 204464680, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 9491, "time_per_iteration": 3.9048516750335693 }, { "auxiliary_loss_clip": 0.01061204, "auxiliary_loss_mlp": 0.01248887, "balance_loss_clip": 1.00139487, "balance_loss_mlp": 1.01286888, "epoch": 0.570689914324365, "flos": 65284666525440.0, "grad_norm": 0.7916550188283729, "language_loss": 0.57399619, "learning_rate": 1.559695362726016e-06, "loss": 0.5970971, "num_input_tokens_seen": 204525580, "router_z_loss_clip": 0.01538086, "router_z_loss_mlp": 0.22753906, "step": 9492, "time_per_iteration": 3.2192258834838867 }, { "auxiliary_loss_clip": 0.01129411, "auxiliary_loss_mlp": 0.01281156, "balance_loss_clip": 1.02093995, "balance_loss_mlp": 1.03940749, "epoch": 0.5707500375770329, "flos": 21141904993920.0, "grad_norm": 1.7309520891932395, "language_loss": 0.72026354, "learning_rate": 1.5593268179363346e-06, "loss": 0.74436921, "num_input_tokens_seen": 204541320, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 9493, "time_per_iteration": 2.5333573818206787 }, { "auxiliary_loss_clip": 0.01113556, "auxiliary_loss_mlp": 0.01031357, "balance_loss_clip": 1.01818466, "balance_loss_mlp": 1.03849387, "epoch": 0.5708101608297009, "flos": 20812747737600.0, "grad_norm": 1.5767066058612194, "language_loss": 0.78070474, "learning_rate": 1.5589582888731462e-06, "loss": 0.80215383, "num_input_tokens_seen": 204560275, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75, "step": 9494, "time_per_iteration": 2.5590462684631348 }, { "auxiliary_loss_clip": 0.01114602, "auxiliary_loss_mlp": 0.01034776, "balance_loss_clip": 1.02078092, "balance_loss_mlp": 1.03756893, "epoch": 0.5708702840823688, "flos": 25812338895360.0, "grad_norm": 2.439025315096797, "language_loss": 0.80009949, "learning_rate": 1.5585897755496016e-06, "loss": 0.82159323, "num_input_tokens_seen": 204579430, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76953125, "step": 9495, "time_per_iteration": 2.5539934635162354 }, { "auxiliary_loss_clip": 0.01155982, "auxiliary_loss_mlp": 0.0104305, "balance_loss_clip": 1.02717733, "balance_loss_mlp": 1.04090726, "epoch": 0.5709304073350369, "flos": 23651916503040.0, "grad_norm": 1.8890299862965665, "language_loss": 0.66008353, "learning_rate": 1.558221277978852e-06, "loss": 0.68207389, "num_input_tokens_seen": 204597710, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.7890625, "step": 9496, "time_per_iteration": 4.267963647842407 }, { "auxiliary_loss_clip": 0.01115542, "auxiliary_loss_mlp": 0.01037933, "balance_loss_clip": 1.02397919, "balance_loss_mlp": 1.0391283, "epoch": 0.5709905305877048, "flos": 16107552449280.0, "grad_norm": 2.077610781900154, "language_loss": 0.69209796, "learning_rate": 1.557852796174049e-06, "loss": 0.7136327, "num_input_tokens_seen": 204616140, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 9497, "time_per_iteration": 2.590345621109009 }, { "auxiliary_loss_clip": 0.01122153, "auxiliary_loss_mlp": 0.01276348, "balance_loss_clip": 1.01554835, "balance_loss_mlp": 1.03807652, "epoch": 0.5710506538403728, "flos": 24750819239040.0, "grad_norm": 2.311899813188526, "language_loss": 0.81490344, "learning_rate": 1.5574843301483422e-06, "loss": 0.83888847, "num_input_tokens_seen": 204636470, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 9498, "time_per_iteration": 4.3886449337005615 }, { "auxiliary_loss_clip": 0.01112443, "auxiliary_loss_mlp": 0.01033942, "balance_loss_clip": 1.01907659, "balance_loss_mlp": 1.03832936, "epoch": 0.5711107770930407, "flos": 21982250655360.0, "grad_norm": 3.5641174855093425, "language_loss": 0.6639806, "learning_rate": 1.5571158799148815e-06, "loss": 0.68544441, "num_input_tokens_seen": 204656640, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7421875, "step": 9499, "time_per_iteration": 2.570399761199951 }, { "auxiliary_loss_clip": 0.01129493, "auxiliary_loss_mlp": 0.0103333, "balance_loss_clip": 1.02036047, "balance_loss_mlp": 1.03899014, "epoch": 0.5711709003457087, "flos": 19574009354880.0, "grad_norm": 1.4870131227932104, "language_loss": 0.71458584, "learning_rate": 1.556747445486816e-06, "loss": 0.73621404, "num_input_tokens_seen": 204675475, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 9500, "time_per_iteration": 2.5602548122406006 }, { "auxiliary_loss_clip": 0.01140324, "auxiliary_loss_mlp": 0.01033134, "balance_loss_clip": 1.01924634, "balance_loss_mlp": 1.03685522, "epoch": 0.5712310235983766, "flos": 24242683489920.0, "grad_norm": 2.9687385411813585, "language_loss": 0.76034182, "learning_rate": 1.5563790268772934e-06, "loss": 0.78207642, "num_input_tokens_seen": 204695385, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 9501, "time_per_iteration": 2.5701587200164795 }, { "auxiliary_loss_clip": 0.01131368, "auxiliary_loss_mlp": 0.01031855, "balance_loss_clip": 1.01897454, "balance_loss_mlp": 1.03803837, "epoch": 0.5712911468510447, "flos": 20996143603200.0, "grad_norm": 1.556804736183545, "language_loss": 0.75158894, "learning_rate": 1.5560106240994629e-06, "loss": 0.77322114, "num_input_tokens_seen": 204714730, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75390625, "step": 9502, "time_per_iteration": 2.6247599124908447 }, { "auxiliary_loss_clip": 0.01129799, "auxiliary_loss_mlp": 0.01025431, "balance_loss_clip": 1.01264548, "balance_loss_mlp": 1.03858995, "epoch": 0.5713512701037126, "flos": 18916987731840.0, "grad_norm": 2.519256958380799, "language_loss": 0.82007146, "learning_rate": 1.5556422371664705e-06, "loss": 0.84162378, "num_input_tokens_seen": 204735025, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 9503, "time_per_iteration": 2.587096691131592 }, { "auxiliary_loss_clip": 0.01136588, "auxiliary_loss_mlp": 0.01032316, "balance_loss_clip": 1.0196557, "balance_loss_mlp": 1.03786218, "epoch": 0.5714113933563806, "flos": 17413443308160.0, "grad_norm": 2.4480470717133738, "language_loss": 0.85834908, "learning_rate": 1.555273866091464e-06, "loss": 0.8800382, "num_input_tokens_seen": 204751365, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 9504, "time_per_iteration": 2.5710301399230957 }, { "auxiliary_loss_clip": 0.0113458, "auxiliary_loss_mlp": 0.01025574, "balance_loss_clip": 1.01274681, "balance_loss_mlp": 1.03757024, "epoch": 0.5714715166090486, "flos": 20193360589440.0, "grad_norm": 1.8640085342548514, "language_loss": 0.74968839, "learning_rate": 1.5549055108875895e-06, "loss": 0.77128994, "num_input_tokens_seen": 204768980, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 9505, "time_per_iteration": 2.5457332134246826 }, { "auxiliary_loss_clip": 0.01110278, "auxiliary_loss_mlp": 0.01031939, "balance_loss_clip": 1.01998854, "balance_loss_mlp": 1.03791499, "epoch": 0.5715316398617165, "flos": 18551668458240.0, "grad_norm": 1.6731810537554699, "language_loss": 0.81497824, "learning_rate": 1.5545371715679919e-06, "loss": 0.83640039, "num_input_tokens_seen": 204788110, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.7265625, "step": 9506, "time_per_iteration": 2.6028521060943604 }, { "auxiliary_loss_clip": 0.01134717, "auxiliary_loss_mlp": 0.01273979, "balance_loss_clip": 1.0146538, "balance_loss_mlp": 1.03675961, "epoch": 0.5715917631143845, "flos": 18478195188480.0, "grad_norm": 1.7291832898712864, "language_loss": 0.77202857, "learning_rate": 1.5541688481458169e-06, "loss": 0.79611552, "num_input_tokens_seen": 204807240, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 9507, "time_per_iteration": 2.5803818702697754 }, { "auxiliary_loss_clip": 0.01129971, "auxiliary_loss_mlp": 0.01039765, "balance_loss_clip": 1.02559125, "balance_loss_mlp": 1.03785408, "epoch": 0.5716518863670524, "flos": 24020037037440.0, "grad_norm": 1.4466434618623631, "language_loss": 0.68350387, "learning_rate": 1.5538005406342088e-06, "loss": 0.70520121, "num_input_tokens_seen": 204826415, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7421875, "step": 9508, "time_per_iteration": 2.5632991790771484 }, { "auxiliary_loss_clip": 0.01111336, "auxiliary_loss_mlp": 0.01027393, "balance_loss_clip": 1.0139339, "balance_loss_mlp": 1.03639126, "epoch": 0.5717120096197205, "flos": 17819485626240.0, "grad_norm": 2.012101247370008, "language_loss": 0.79700708, "learning_rate": 1.5534322490463124e-06, "loss": 0.81839442, "num_input_tokens_seen": 204844305, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 9509, "time_per_iteration": 2.4638938903808594 }, { "auxiliary_loss_clip": 0.0111698, "auxiliary_loss_mlp": 0.0102979, "balance_loss_clip": 1.01673651, "balance_loss_mlp": 1.03745914, "epoch": 0.5717721328723884, "flos": 21866043179520.0, "grad_norm": 2.868601780056298, "language_loss": 0.71518695, "learning_rate": 1.5530639733952697e-06, "loss": 0.73665464, "num_input_tokens_seen": 204861765, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 9510, "time_per_iteration": 2.5456533432006836 }, { "auxiliary_loss_clip": 0.0112823, "auxiliary_loss_mlp": 0.01028611, "balance_loss_clip": 1.0163976, "balance_loss_mlp": 1.03769279, "epoch": 0.5718322561250564, "flos": 28437624126720.0, "grad_norm": 1.349474802778481, "language_loss": 0.69182241, "learning_rate": 1.552695713694224e-06, "loss": 0.71339077, "num_input_tokens_seen": 204882505, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7265625, "step": 9511, "time_per_iteration": 2.6301918029785156 }, { "auxiliary_loss_clip": 0.0112714, "auxiliary_loss_mlp": 0.01035205, "balance_loss_clip": 1.0224731, "balance_loss_mlp": 1.03762686, "epoch": 0.5718923793777243, "flos": 13551825905280.0, "grad_norm": 1.793877941648849, "language_loss": 0.61572337, "learning_rate": 1.552327469956318e-06, "loss": 0.6373468, "num_input_tokens_seen": 204899830, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 9512, "time_per_iteration": 2.6202850341796875 }, { "auxiliary_loss_clip": 0.01127287, "auxiliary_loss_mlp": 0.01028916, "balance_loss_clip": 1.01653624, "balance_loss_mlp": 1.03821015, "epoch": 0.5719525026303923, "flos": 17822035491840.0, "grad_norm": 2.2972787902649037, "language_loss": 0.76088226, "learning_rate": 1.5519592421946925e-06, "loss": 0.7824443, "num_input_tokens_seen": 204918100, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 9513, "time_per_iteration": 2.510746717453003 }, { "auxiliary_loss_clip": 0.01045288, "auxiliary_loss_mlp": 0.01000951, "balance_loss_clip": 0.99948502, "balance_loss_mlp": 1.01311994, "epoch": 0.5720126258830602, "flos": 61298042814720.0, "grad_norm": 0.9038186282211911, "language_loss": 0.66844535, "learning_rate": 1.5515910304224898e-06, "loss": 0.68890774, "num_input_tokens_seen": 204972925, "router_z_loss_clip": 0.01464844, "router_z_loss_mlp": 0.23144531, "step": 9514, "time_per_iteration": 3.129070520401001 }, { "auxiliary_loss_clip": 0.01124117, "auxiliary_loss_mlp": 0.01039299, "balance_loss_clip": 1.02593517, "balance_loss_mlp": 1.03993833, "epoch": 0.5720727491357283, "flos": 23988040997760.0, "grad_norm": 1.945959098078944, "language_loss": 0.81254989, "learning_rate": 1.551222834652849e-06, "loss": 0.83418405, "num_input_tokens_seen": 204990910, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 9515, "time_per_iteration": 2.60109543800354 }, { "auxiliary_loss_clip": 0.01117662, "auxiliary_loss_mlp": 0.01029938, "balance_loss_clip": 1.01676595, "balance_loss_mlp": 1.0379802, "epoch": 0.5721328723883962, "flos": 23405426398080.0, "grad_norm": 1.5263744340799321, "language_loss": 0.85582274, "learning_rate": 1.5508546548989117e-06, "loss": 0.87729871, "num_input_tokens_seen": 205010500, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 9516, "time_per_iteration": 2.5527002811431885 }, { "auxiliary_loss_clip": 0.01127448, "auxiliary_loss_mlp": 0.01029219, "balance_loss_clip": 1.01619554, "balance_loss_mlp": 1.03746462, "epoch": 0.5721929956410642, "flos": 18804910320000.0, "grad_norm": 1.6870116404395474, "language_loss": 0.87494981, "learning_rate": 1.5504864911738163e-06, "loss": 0.8965165, "num_input_tokens_seen": 205028560, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 9517, "time_per_iteration": 2.5719499588012695 }, { "auxiliary_loss_clip": 0.01145591, "auxiliary_loss_mlp": 0.01031478, "balance_loss_clip": 1.01798964, "balance_loss_mlp": 1.03846526, "epoch": 0.5722531188937322, "flos": 27196659100800.0, "grad_norm": 1.5636487023962982, "language_loss": 0.8526032, "learning_rate": 1.5501183434907012e-06, "loss": 0.87437391, "num_input_tokens_seen": 205048650, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 9518, "time_per_iteration": 2.62888503074646 }, { "auxiliary_loss_clip": 0.01116332, "auxiliary_loss_mlp": 0.01031186, "balance_loss_clip": 1.01903284, "balance_loss_mlp": 1.03863418, "epoch": 0.5723132421464001, "flos": 15195672852480.0, "grad_norm": 1.8203708577478366, "language_loss": 0.7855795, "learning_rate": 1.5497502118627057e-06, "loss": 0.8070547, "num_input_tokens_seen": 205066480, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 9519, "time_per_iteration": 2.564656972885132 }, { "auxiliary_loss_clip": 0.01110362, "auxiliary_loss_mlp": 0.01029813, "balance_loss_clip": 1.01696837, "balance_loss_mlp": 1.03930128, "epoch": 0.5723733653990681, "flos": 27599433281280.0, "grad_norm": 1.4038800899766024, "language_loss": 0.82893777, "learning_rate": 1.5493820963029665e-06, "loss": 0.85033959, "num_input_tokens_seen": 205087475, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 9520, "time_per_iteration": 2.6281540393829346 }, { "auxiliary_loss_clip": 0.01130395, "auxiliary_loss_mlp": 0.01039774, "balance_loss_clip": 1.02611828, "balance_loss_mlp": 1.03920174, "epoch": 0.572433488651736, "flos": 18222870337920.0, "grad_norm": 1.8156397029721405, "language_loss": 0.72141039, "learning_rate": 1.5490139968246214e-06, "loss": 0.74311203, "num_input_tokens_seen": 205106495, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 9521, "time_per_iteration": 2.52748703956604 }, { "auxiliary_loss_clip": 0.01108734, "auxiliary_loss_mlp": 0.01278934, "balance_loss_clip": 1.01979291, "balance_loss_mlp": 1.03712213, "epoch": 0.5724936119044041, "flos": 31249106484480.0, "grad_norm": 1.6383831633644985, "language_loss": 0.78008157, "learning_rate": 1.548645913440807e-06, "loss": 0.8039583, "num_input_tokens_seen": 205128285, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 9522, "time_per_iteration": 2.60013747215271 }, { "auxiliary_loss_clip": 0.01124452, "auxiliary_loss_mlp": 0.01031616, "balance_loss_clip": 1.01998127, "balance_loss_mlp": 1.03764617, "epoch": 0.572553735157072, "flos": 19202189719680.0, "grad_norm": 1.5960817783901526, "language_loss": 0.71726859, "learning_rate": 1.5482778461646583e-06, "loss": 0.73882926, "num_input_tokens_seen": 205146595, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 9523, "time_per_iteration": 2.5783650875091553 }, { "auxiliary_loss_clip": 0.01126061, "auxiliary_loss_mlp": 0.01026721, "balance_loss_clip": 1.01472831, "balance_loss_mlp": 1.03787839, "epoch": 0.57261385840974, "flos": 21214911386880.0, "grad_norm": 2.185395582170315, "language_loss": 0.70078129, "learning_rate": 1.5479097950093124e-06, "loss": 0.72230911, "num_input_tokens_seen": 205164295, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 9524, "time_per_iteration": 2.564089298248291 }, { "auxiliary_loss_clip": 0.01121379, "auxiliary_loss_mlp": 0.01031987, "balance_loss_clip": 1.01963711, "balance_loss_mlp": 1.04124165, "epoch": 0.5726739816624079, "flos": 33984529793280.0, "grad_norm": 1.3750567565036038, "language_loss": 0.64847624, "learning_rate": 1.5475417599879017e-06, "loss": 0.67000991, "num_input_tokens_seen": 205185380, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71484375, "step": 9525, "time_per_iteration": 2.670581817626953 }, { "auxiliary_loss_clip": 0.01124358, "auxiliary_loss_mlp": 0.01029484, "balance_loss_clip": 1.01740766, "balance_loss_mlp": 1.03690565, "epoch": 0.5727341049150759, "flos": 24275972419200.0, "grad_norm": 1.7031459104666997, "language_loss": 0.72085285, "learning_rate": 1.5471737411135623e-06, "loss": 0.74239123, "num_input_tokens_seen": 205204895, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 9526, "time_per_iteration": 2.529210329055786 }, { "auxiliary_loss_clip": 0.01117352, "auxiliary_loss_mlp": 0.01031785, "balance_loss_clip": 1.01953614, "balance_loss_mlp": 1.03922272, "epoch": 0.5727942281677438, "flos": 28400564269440.0, "grad_norm": 1.6768361805981264, "language_loss": 0.80106515, "learning_rate": 1.5468057383994275e-06, "loss": 0.8225565, "num_input_tokens_seen": 205223440, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 9527, "time_per_iteration": 2.643245220184326 }, { "auxiliary_loss_clip": 0.01117568, "auxiliary_loss_mlp": 0.0103525, "balance_loss_clip": 1.02222061, "balance_loss_mlp": 1.03872132, "epoch": 0.5728543514204119, "flos": 19536769929600.0, "grad_norm": 1.595237957298884, "language_loss": 0.72285891, "learning_rate": 1.5464377518586296e-06, "loss": 0.74438715, "num_input_tokens_seen": 205242800, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 9528, "time_per_iteration": 3.994231700897217 }, { "auxiliary_loss_clip": 0.01110785, "auxiliary_loss_mlp": 0.01032944, "balance_loss_clip": 1.0200758, "balance_loss_mlp": 1.03963566, "epoch": 0.5729144746730798, "flos": 21506757390720.0, "grad_norm": 2.0925727714223483, "language_loss": 0.85361612, "learning_rate": 1.5460697815043021e-06, "loss": 0.87505341, "num_input_tokens_seen": 205259465, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 9529, "time_per_iteration": 2.520939350128174 }, { "auxiliary_loss_clip": 0.01055321, "auxiliary_loss_mlp": 0.0100517, "balance_loss_clip": 1.00369787, "balance_loss_mlp": 1.01463377, "epoch": 0.5729745979257478, "flos": 58681628242560.0, "grad_norm": 0.7620025379647021, "language_loss": 0.56145144, "learning_rate": 1.5457018273495758e-06, "loss": 0.5820564, "num_input_tokens_seen": 205314100, "router_z_loss_clip": 0.01470947, "router_z_loss_mlp": 0.23144531, "step": 9530, "time_per_iteration": 3.024583339691162 }, { "auxiliary_loss_clip": 0.01130923, "auxiliary_loss_mlp": 0.01029313, "balance_loss_clip": 1.0171833, "balance_loss_mlp": 1.04090881, "epoch": 0.5730347211784158, "flos": 18552099421440.0, "grad_norm": 1.8876292418835439, "language_loss": 0.66373503, "learning_rate": 1.5453338894075834e-06, "loss": 0.68533742, "num_input_tokens_seen": 205333420, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7265625, "step": 9531, "time_per_iteration": 2.5179755687713623 }, { "auxiliary_loss_clip": 0.01119715, "auxiliary_loss_mlp": 0.01031786, "balance_loss_clip": 1.01879215, "balance_loss_mlp": 1.03890014, "epoch": 0.5730948444310837, "flos": 38031482396160.0, "grad_norm": 1.9818003980642107, "language_loss": 0.75616133, "learning_rate": 1.5449659676914547e-06, "loss": 0.77767634, "num_input_tokens_seen": 205350995, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 9532, "time_per_iteration": 4.088069677352905 }, { "auxiliary_loss_clip": 0.01109108, "auxiliary_loss_mlp": 0.01028756, "balance_loss_clip": 1.01614964, "balance_loss_mlp": 1.03853393, "epoch": 0.5731549676837517, "flos": 25227066689280.0, "grad_norm": 1.2581115250227077, "language_loss": 0.78880453, "learning_rate": 1.54459806221432e-06, "loss": 0.81018317, "num_input_tokens_seen": 205372675, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 9533, "time_per_iteration": 2.6087276935577393 }, { "auxiliary_loss_clip": 0.01118013, "auxiliary_loss_mlp": 0.01030107, "balance_loss_clip": 1.01786411, "balance_loss_mlp": 1.03993773, "epoch": 0.5732150909364196, "flos": 23368222886400.0, "grad_norm": 1.6970112613592214, "language_loss": 0.85323286, "learning_rate": 1.5442301729893092e-06, "loss": 0.87471402, "num_input_tokens_seen": 205392590, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 9534, "time_per_iteration": 2.6046266555786133 }, { "auxiliary_loss_clip": 0.01127923, "auxiliary_loss_mlp": 0.01034621, "balance_loss_clip": 1.02110898, "balance_loss_mlp": 1.03870928, "epoch": 0.5732752141890877, "flos": 23079357711360.0, "grad_norm": 1.5910361685575334, "language_loss": 0.74846256, "learning_rate": 1.543862300029551e-06, "loss": 0.77008796, "num_input_tokens_seen": 205414885, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 9535, "time_per_iteration": 2.5548946857452393 }, { "auxiliary_loss_clip": 0.01133812, "auxiliary_loss_mlp": 0.01032789, "balance_loss_clip": 1.01990867, "balance_loss_mlp": 1.04151928, "epoch": 0.5733353374417556, "flos": 24352282863360.0, "grad_norm": 1.520309529577766, "language_loss": 0.71258605, "learning_rate": 1.543494443348174e-06, "loss": 0.7342521, "num_input_tokens_seen": 205434440, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 9536, "time_per_iteration": 2.589115858078003 }, { "auxiliary_loss_clip": 0.01134672, "auxiliary_loss_mlp": 0.01029357, "balance_loss_clip": 1.01681638, "balance_loss_mlp": 1.03796148, "epoch": 0.5733954606944236, "flos": 27198849830400.0, "grad_norm": 1.7932335481378616, "language_loss": 0.69874722, "learning_rate": 1.5431266029583058e-06, "loss": 0.72038752, "num_input_tokens_seen": 205454225, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 9537, "time_per_iteration": 2.6460981369018555 }, { "auxiliary_loss_clip": 0.01109787, "auxiliary_loss_mlp": 0.01028844, "balance_loss_clip": 1.01647043, "balance_loss_mlp": 1.03884959, "epoch": 0.5734555839470915, "flos": 28765129357440.0, "grad_norm": 4.024420457696834, "language_loss": 0.62588513, "learning_rate": 1.5427587788730744e-06, "loss": 0.64727145, "num_input_tokens_seen": 205474750, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 9538, "time_per_iteration": 4.000261306762695 }, { "auxiliary_loss_clip": 0.01117892, "auxiliary_loss_mlp": 0.01032124, "balance_loss_clip": 1.01876104, "balance_loss_mlp": 1.0388279, "epoch": 0.5735157071997595, "flos": 22966813422720.0, "grad_norm": 1.6283070739840977, "language_loss": 0.8326661, "learning_rate": 1.5423909711056062e-06, "loss": 0.85416627, "num_input_tokens_seen": 205495495, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 9539, "time_per_iteration": 4.048799991607666 }, { "auxiliary_loss_clip": 0.01147571, "auxiliary_loss_mlp": 0.01033529, "balance_loss_clip": 1.02009463, "balance_loss_mlp": 1.0386498, "epoch": 0.5735758304524274, "flos": 18989455420800.0, "grad_norm": 1.8066767480015338, "language_loss": 0.72518098, "learning_rate": 1.5420231796690268e-06, "loss": 0.74699199, "num_input_tokens_seen": 205510070, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 9540, "time_per_iteration": 2.6475462913513184 }, { "auxiliary_loss_clip": 0.01116102, "auxiliary_loss_mlp": 0.01273222, "balance_loss_clip": 1.01483583, "balance_loss_mlp": 1.03695965, "epoch": 0.5736359537050955, "flos": 28397942576640.0, "grad_norm": 1.9939118563801392, "language_loss": 0.79783106, "learning_rate": 1.5416554045764623e-06, "loss": 0.8217243, "num_input_tokens_seen": 205530190, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.703125, "step": 9541, "time_per_iteration": 2.5538482666015625 }, { "auxiliary_loss_clip": 0.01132298, "auxiliary_loss_mlp": 0.01031567, "balance_loss_clip": 1.01839495, "balance_loss_mlp": 1.04065514, "epoch": 0.5736960769577634, "flos": 15627210848640.0, "grad_norm": 2.2604720606547803, "language_loss": 0.64803737, "learning_rate": 1.541287645841037e-06, "loss": 0.66967607, "num_input_tokens_seen": 205547380, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 9542, "time_per_iteration": 2.5179240703582764 }, { "auxiliary_loss_clip": 0.01126668, "auxiliary_loss_mlp": 0.01029257, "balance_loss_clip": 1.01716876, "balance_loss_mlp": 1.03836691, "epoch": 0.5737562002104314, "flos": 18003994813440.0, "grad_norm": 2.116465557345704, "language_loss": 0.83340615, "learning_rate": 1.540919903475876e-06, "loss": 0.85496545, "num_input_tokens_seen": 205566540, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7109375, "step": 9543, "time_per_iteration": 2.5010831356048584 }, { "auxiliary_loss_clip": 0.01142179, "auxiliary_loss_mlp": 0.01030761, "balance_loss_clip": 1.0168972, "balance_loss_mlp": 1.0407207, "epoch": 0.5738163234630994, "flos": 20698192287360.0, "grad_norm": 1.8393263356109726, "language_loss": 0.7306838, "learning_rate": 1.5405521774941027e-06, "loss": 0.75241315, "num_input_tokens_seen": 205584200, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 9544, "time_per_iteration": 2.591122627258301 }, { "auxiliary_loss_clip": 0.01149874, "auxiliary_loss_mlp": 0.01029758, "balance_loss_clip": 1.01679397, "balance_loss_mlp": 1.03968549, "epoch": 0.5738764467157673, "flos": 23149311448320.0, "grad_norm": 1.8573064664136891, "language_loss": 0.76179194, "learning_rate": 1.5401844679088399e-06, "loss": 0.78358829, "num_input_tokens_seen": 205604675, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7421875, "step": 9545, "time_per_iteration": 2.5861475467681885 }, { "auxiliary_loss_clip": 0.0113001, "auxiliary_loss_mlp": 0.01032576, "balance_loss_clip": 1.01941562, "balance_loss_mlp": 1.03988981, "epoch": 0.5739365699684353, "flos": 29492930730240.0, "grad_norm": 1.8235145710925662, "language_loss": 0.56697369, "learning_rate": 1.539816774733211e-06, "loss": 0.58859956, "num_input_tokens_seen": 205624680, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 9546, "time_per_iteration": 2.6506574153900146 }, { "auxiliary_loss_clip": 0.01129269, "auxiliary_loss_mlp": 0.01030401, "balance_loss_clip": 1.01753795, "balance_loss_mlp": 1.03809142, "epoch": 0.5739966932211032, "flos": 14027247342720.0, "grad_norm": 2.2112177189310316, "language_loss": 0.76433396, "learning_rate": 1.5394490979803374e-06, "loss": 0.78593063, "num_input_tokens_seen": 205641950, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 9547, "time_per_iteration": 2.523252248764038 }, { "auxiliary_loss_clip": 0.01121672, "auxiliary_loss_mlp": 0.01029066, "balance_loss_clip": 1.01598859, "balance_loss_mlp": 1.04009366, "epoch": 0.5740568164737713, "flos": 19062030850560.0, "grad_norm": 1.9586256843216718, "language_loss": 0.7425589, "learning_rate": 1.5390814376633413e-06, "loss": 0.76406628, "num_input_tokens_seen": 205660130, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 9548, "time_per_iteration": 2.562840223312378 }, { "auxiliary_loss_clip": 0.01137884, "auxiliary_loss_mlp": 0.01036698, "balance_loss_clip": 1.02357936, "balance_loss_mlp": 1.03841805, "epoch": 0.5741169397264392, "flos": 22127832478080.0, "grad_norm": 3.769166883627711, "language_loss": 0.69509137, "learning_rate": 1.538713793795343e-06, "loss": 0.71683717, "num_input_tokens_seen": 205678895, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 9549, "time_per_iteration": 2.543860912322998 }, { "auxiliary_loss_clip": 0.01148423, "auxiliary_loss_mlp": 0.01028135, "balance_loss_clip": 1.01624954, "balance_loss_mlp": 1.04212713, "epoch": 0.5741770629791072, "flos": 24936836797440.0, "grad_norm": 1.4466847391546829, "language_loss": 0.79357928, "learning_rate": 1.5383461663894623e-06, "loss": 0.81534487, "num_input_tokens_seen": 205698450, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.70703125, "step": 9550, "time_per_iteration": 2.6548991203308105 }, { "auxiliary_loss_clip": 0.01128711, "auxiliary_loss_mlp": 0.01032011, "balance_loss_clip": 1.01897597, "balance_loss_mlp": 1.03801918, "epoch": 0.5742371862317751, "flos": 18801462614400.0, "grad_norm": 2.2125352100960494, "language_loss": 0.67915171, "learning_rate": 1.53797855545882e-06, "loss": 0.70075893, "num_input_tokens_seen": 205714870, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 9551, "time_per_iteration": 2.504040002822876 }, { "auxiliary_loss_clip": 0.0113879, "auxiliary_loss_mlp": 0.01033268, "balance_loss_clip": 1.02036905, "balance_loss_mlp": 1.03913248, "epoch": 0.5742973094844431, "flos": 24460661174400.0, "grad_norm": 1.6724198111770041, "language_loss": 0.71726155, "learning_rate": 1.537610961016534e-06, "loss": 0.73898214, "num_input_tokens_seen": 205736045, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 9552, "time_per_iteration": 2.6070525646209717 }, { "auxiliary_loss_clip": 0.01119837, "auxiliary_loss_mlp": 0.01033262, "balance_loss_clip": 1.02090001, "balance_loss_mlp": 1.03945804, "epoch": 0.574357432737111, "flos": 21652770176640.0, "grad_norm": 2.2277736082084236, "language_loss": 0.79749173, "learning_rate": 1.537243383075724e-06, "loss": 0.81902266, "num_input_tokens_seen": 205754445, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7109375, "step": 9553, "time_per_iteration": 2.494096517562866 }, { "auxiliary_loss_clip": 0.01121237, "auxiliary_loss_mlp": 0.0103073, "balance_loss_clip": 1.01827264, "balance_loss_mlp": 1.03793871, "epoch": 0.5744175559897791, "flos": 16544728880640.0, "grad_norm": 2.1469168033489345, "language_loss": 0.83797264, "learning_rate": 1.536875821649507e-06, "loss": 0.85949236, "num_input_tokens_seen": 205770595, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7421875, "step": 9554, "time_per_iteration": 2.569016695022583 }, { "auxiliary_loss_clip": 0.01124401, "auxiliary_loss_mlp": 0.01282798, "balance_loss_clip": 1.02254081, "balance_loss_mlp": 1.04080057, "epoch": 0.574477679242447, "flos": 24207598880640.0, "grad_norm": 1.3527030239317126, "language_loss": 0.71098274, "learning_rate": 1.536508276751001e-06, "loss": 0.73505473, "num_input_tokens_seen": 205791935, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 9555, "time_per_iteration": 2.544684410095215 }, { "auxiliary_loss_clip": 0.01127429, "auxiliary_loss_mlp": 0.01027312, "balance_loss_clip": 1.01430583, "balance_loss_mlp": 1.03816152, "epoch": 0.574537802495115, "flos": 14903000835840.0, "grad_norm": 2.308680427536463, "language_loss": 0.72992837, "learning_rate": 1.5361407483933223e-06, "loss": 0.75147581, "num_input_tokens_seen": 205807260, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 9556, "time_per_iteration": 2.598719835281372 }, { "auxiliary_loss_clip": 0.01120798, "auxiliary_loss_mlp": 0.01031456, "balance_loss_clip": 1.01826572, "balance_loss_mlp": 1.0398922, "epoch": 0.5745979257477829, "flos": 24934969290240.0, "grad_norm": 1.527254920271786, "language_loss": 0.73949444, "learning_rate": 1.5357732365895863e-06, "loss": 0.76101702, "num_input_tokens_seen": 205826885, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 9557, "time_per_iteration": 2.5190107822418213 }, { "auxiliary_loss_clip": 0.0111927, "auxiliary_loss_mlp": 0.0103519, "balance_loss_clip": 1.02270293, "balance_loss_mlp": 1.0388788, "epoch": 0.5746580490004509, "flos": 17235757704960.0, "grad_norm": 1.535711350447953, "language_loss": 0.67865741, "learning_rate": 1.5354057413529103e-06, "loss": 0.70020199, "num_input_tokens_seen": 205844630, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 9558, "time_per_iteration": 2.619702100753784 }, { "auxiliary_loss_clip": 0.01135956, "auxiliary_loss_mlp": 0.01041149, "balance_loss_clip": 1.02577078, "balance_loss_mlp": 1.04056382, "epoch": 0.5747181722531189, "flos": 13187871348480.0, "grad_norm": 2.2321121869194367, "language_loss": 0.71160227, "learning_rate": 1.5350382626964076e-06, "loss": 0.73337334, "num_input_tokens_seen": 205860960, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.76953125, "step": 9559, "time_per_iteration": 2.616391897201538 }, { "auxiliary_loss_clip": 0.01128509, "auxiliary_loss_mlp": 0.01276555, "balance_loss_clip": 1.01877642, "balance_loss_mlp": 1.03960824, "epoch": 0.5747782955057869, "flos": 22963006581120.0, "grad_norm": 1.7788210172784042, "language_loss": 0.79325289, "learning_rate": 1.5346708006331936e-06, "loss": 0.81730354, "num_input_tokens_seen": 205880675, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.7109375, "step": 9560, "time_per_iteration": 2.649515390396118 }, { "auxiliary_loss_clip": 0.011555, "auxiliary_loss_mlp": 0.01030378, "balance_loss_clip": 1.0177958, "balance_loss_mlp": 1.03804505, "epoch": 0.5748384187584549, "flos": 23403235668480.0, "grad_norm": 1.5823529894276607, "language_loss": 0.64239132, "learning_rate": 1.534303355176382e-06, "loss": 0.66425008, "num_input_tokens_seen": 205900050, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.73046875, "step": 9561, "time_per_iteration": 2.6129310131073 }, { "auxiliary_loss_clip": 0.01128705, "auxiliary_loss_mlp": 0.01037911, "balance_loss_clip": 1.02374911, "balance_loss_mlp": 1.04228187, "epoch": 0.5748985420111228, "flos": 17785514338560.0, "grad_norm": 1.705015325053328, "language_loss": 0.71402001, "learning_rate": 1.5339359263390852e-06, "loss": 0.73568618, "num_input_tokens_seen": 205918855, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7734375, "step": 9562, "time_per_iteration": 2.5388479232788086 }, { "auxiliary_loss_clip": 0.01146097, "auxiliary_loss_mlp": 0.0103809, "balance_loss_clip": 1.02643108, "balance_loss_mlp": 1.03927565, "epoch": 0.5749586652637908, "flos": 19866250408320.0, "grad_norm": 1.4917994186095673, "language_loss": 0.84134507, "learning_rate": 1.5335685141344169e-06, "loss": 0.86318696, "num_input_tokens_seen": 205936970, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.7109375, "step": 9563, "time_per_iteration": 2.6269309520721436 }, { "auxiliary_loss_clip": 0.01159957, "auxiliary_loss_mlp": 0.01037635, "balance_loss_clip": 1.02402163, "balance_loss_mlp": 1.04008341, "epoch": 0.5750187885164587, "flos": 21287235421440.0, "grad_norm": 2.9477677425440807, "language_loss": 0.57335281, "learning_rate": 1.5332011185754878e-06, "loss": 0.59532869, "num_input_tokens_seen": 205954630, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75390625, "step": 9564, "time_per_iteration": 2.702709913253784 }, { "auxiliary_loss_clip": 0.01124441, "auxiliary_loss_mlp": 0.01030071, "balance_loss_clip": 1.01763177, "balance_loss_mlp": 1.0372932, "epoch": 0.5750789117691267, "flos": 18804658924800.0, "grad_norm": 1.6929648434734745, "language_loss": 0.76054096, "learning_rate": 1.5328337396754108e-06, "loss": 0.78208607, "num_input_tokens_seen": 205971510, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 9565, "time_per_iteration": 2.545072317123413 }, { "auxiliary_loss_clip": 0.01074266, "auxiliary_loss_mlp": 0.01003555, "balance_loss_clip": 1.00180233, "balance_loss_mlp": 1.01600814, "epoch": 0.5751390350217946, "flos": 70663224124800.0, "grad_norm": 0.7550233587670618, "language_loss": 0.60774612, "learning_rate": 1.5324663774472955e-06, "loss": 0.6285243, "num_input_tokens_seen": 206035125, "router_z_loss_clip": 0.01757812, "router_z_loss_mlp": 0.22949219, "step": 9566, "time_per_iteration": 3.257270336151123 }, { "auxiliary_loss_clip": 0.01131101, "auxiliary_loss_mlp": 0.01034026, "balance_loss_clip": 1.02162874, "balance_loss_mlp": 1.04130244, "epoch": 0.5751991582744627, "flos": 14246338348800.0, "grad_norm": 1.8592475617384598, "language_loss": 0.75402689, "learning_rate": 1.5320990319042525e-06, "loss": 0.77567816, "num_input_tokens_seen": 206052075, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 9567, "time_per_iteration": 2.543949604034424 }, { "auxiliary_loss_clip": 0.01120103, "auxiliary_loss_mlp": 0.01035684, "balance_loss_clip": 1.0231967, "balance_loss_mlp": 1.03934193, "epoch": 0.5752592815271306, "flos": 18328160079360.0, "grad_norm": 1.4754133692892417, "language_loss": 0.7453562, "learning_rate": 1.5317317030593916e-06, "loss": 0.76691407, "num_input_tokens_seen": 206069970, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 9568, "time_per_iteration": 2.600287675857544 }, { "auxiliary_loss_clip": 0.01113135, "auxiliary_loss_mlp": 0.01035176, "balance_loss_clip": 1.02274823, "balance_loss_mlp": 1.03989148, "epoch": 0.5753194047797986, "flos": 20922742160640.0, "grad_norm": 3.687401786603734, "language_loss": 0.71130383, "learning_rate": 1.5313643909258217e-06, "loss": 0.73278695, "num_input_tokens_seen": 206088950, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.73046875, "step": 9569, "time_per_iteration": 3.9841206073760986 }, { "auxiliary_loss_clip": 0.0112281, "auxiliary_loss_mlp": 0.0104478, "balance_loss_clip": 1.03103495, "balance_loss_mlp": 1.04035568, "epoch": 0.5753795280324665, "flos": 19281804215040.0, "grad_norm": 2.38350793524412, "language_loss": 0.55508178, "learning_rate": 1.5309970955166515e-06, "loss": 0.57675773, "num_input_tokens_seen": 206107780, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 9570, "time_per_iteration": 2.5688047409057617 }, { "auxiliary_loss_clip": 0.01109693, "auxiliary_loss_mlp": 0.01038331, "balance_loss_clip": 1.02618361, "balance_loss_mlp": 1.03875911, "epoch": 0.5754396512851345, "flos": 21652877917440.0, "grad_norm": 1.4564769721816815, "language_loss": 0.64267069, "learning_rate": 1.5306298168449888e-06, "loss": 0.66415089, "num_input_tokens_seen": 206127445, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7109375, "step": 9571, "time_per_iteration": 2.5459389686584473 }, { "auxiliary_loss_clip": 0.01118751, "auxiliary_loss_mlp": 0.01034554, "balance_loss_clip": 1.02130938, "balance_loss_mlp": 1.03723979, "epoch": 0.5754997745378025, "flos": 51021700179840.0, "grad_norm": 1.5755353337960167, "language_loss": 0.6680758, "learning_rate": 1.5302625549239396e-06, "loss": 0.68960881, "num_input_tokens_seen": 206152005, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 9572, "time_per_iteration": 2.8667542934417725 }, { "auxiliary_loss_clip": 0.01131515, "auxiliary_loss_mlp": 0.01033713, "balance_loss_clip": 1.02073693, "balance_loss_mlp": 1.04127359, "epoch": 0.5755598977904705, "flos": 22856890826880.0, "grad_norm": 1.6467122858392353, "language_loss": 0.72114086, "learning_rate": 1.529895309766612e-06, "loss": 0.74279314, "num_input_tokens_seen": 206169875, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 9573, "time_per_iteration": 2.579690456390381 }, { "auxiliary_loss_clip": 0.01107325, "auxiliary_loss_mlp": 0.01034521, "balance_loss_clip": 1.02282119, "balance_loss_mlp": 1.03998983, "epoch": 0.5756200210431385, "flos": 38472824805120.0, "grad_norm": 1.7195395440429455, "language_loss": 0.76364017, "learning_rate": 1.5295280813861111e-06, "loss": 0.78505862, "num_input_tokens_seen": 206192635, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 9574, "time_per_iteration": 4.244813442230225 }, { "auxiliary_loss_clip": 0.01154349, "auxiliary_loss_mlp": 0.01037282, "balance_loss_clip": 1.02272618, "balance_loss_mlp": 1.04171944, "epoch": 0.5756801442958064, "flos": 23910006700800.0, "grad_norm": 2.642432727309744, "language_loss": 0.66910493, "learning_rate": 1.5291608697955434e-06, "loss": 0.69102132, "num_input_tokens_seen": 206211485, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7734375, "step": 9575, "time_per_iteration": 2.66739559173584 }, { "auxiliary_loss_clip": 0.01127623, "auxiliary_loss_mlp": 0.01036527, "balance_loss_clip": 1.02386713, "balance_loss_mlp": 1.0402807, "epoch": 0.5757402675484744, "flos": 21105276099840.0, "grad_norm": 1.434094414708295, "language_loss": 0.80171806, "learning_rate": 1.528793675008013e-06, "loss": 0.82335961, "num_input_tokens_seen": 206231740, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 9576, "time_per_iteration": 2.5785152912139893 }, { "auxiliary_loss_clip": 0.0112905, "auxiliary_loss_mlp": 0.01029947, "balance_loss_clip": 1.01767421, "balance_loss_mlp": 1.0403645, "epoch": 0.5758003908011423, "flos": 20559110826240.0, "grad_norm": 1.6357030924308604, "language_loss": 0.6964491, "learning_rate": 1.528426497036624e-06, "loss": 0.71803904, "num_input_tokens_seen": 206250975, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 9577, "time_per_iteration": 2.666574716567993 }, { "auxiliary_loss_clip": 0.01122735, "auxiliary_loss_mlp": 0.0103662, "balance_loss_clip": 1.02408552, "balance_loss_mlp": 1.0396049, "epoch": 0.5758605140538103, "flos": 16473015377280.0, "grad_norm": 1.7272339154795573, "language_loss": 0.66653657, "learning_rate": 1.5280593358944804e-06, "loss": 0.68813014, "num_input_tokens_seen": 206268800, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7421875, "step": 9578, "time_per_iteration": 2.528740406036377 }, { "auxiliary_loss_clip": 0.01126913, "auxiliary_loss_mlp": 0.01030639, "balance_loss_clip": 1.01850343, "balance_loss_mlp": 1.03863072, "epoch": 0.5759206373064782, "flos": 21287558643840.0, "grad_norm": 1.5225752554755068, "language_loss": 0.72416335, "learning_rate": 1.527692191594685e-06, "loss": 0.74573892, "num_input_tokens_seen": 206287190, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.703125, "step": 9579, "time_per_iteration": 2.5928401947021484 }, { "auxiliary_loss_clip": 0.01134624, "auxiliary_loss_mlp": 0.0103658, "balance_loss_clip": 1.0221678, "balance_loss_mlp": 1.04019618, "epoch": 0.5759807605591463, "flos": 26067879227520.0, "grad_norm": 2.2683128479713517, "language_loss": 0.64457405, "learning_rate": 1.5273250641503406e-06, "loss": 0.66628611, "num_input_tokens_seen": 206307020, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76953125, "step": 9580, "time_per_iteration": 4.0954389572143555 }, { "auxiliary_loss_clip": 0.01130283, "auxiliary_loss_mlp": 0.01036295, "balance_loss_clip": 1.02257979, "balance_loss_mlp": 1.03889406, "epoch": 0.5760408838118142, "flos": 18873068376960.0, "grad_norm": 1.7234790716450255, "language_loss": 0.85219145, "learning_rate": 1.5269579535745486e-06, "loss": 0.87385714, "num_input_tokens_seen": 206324095, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 9581, "time_per_iteration": 4.8200414180755615 }, { "auxiliary_loss_clip": 0.01120258, "auxiliary_loss_mlp": 0.01041731, "balance_loss_clip": 1.02814734, "balance_loss_mlp": 1.04311109, "epoch": 0.5761010070644822, "flos": 15378134964480.0, "grad_norm": 2.309751891101898, "language_loss": 0.67289567, "learning_rate": 1.5265908598804104e-06, "loss": 0.69451553, "num_input_tokens_seen": 206343210, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7734375, "step": 9582, "time_per_iteration": 2.5392918586730957 }, { "auxiliary_loss_clip": 0.01055905, "auxiliary_loss_mlp": 0.01000901, "balance_loss_clip": 0.99923211, "balance_loss_mlp": 1.01513493, "epoch": 0.5761611303171501, "flos": 71471932882560.0, "grad_norm": 0.6502431591111775, "language_loss": 0.57199526, "learning_rate": 1.526223783081027e-06, "loss": 0.59256327, "num_input_tokens_seen": 206415935, "router_z_loss_clip": 0.01672363, "router_z_loss_mlp": 0.22949219, "step": 9583, "time_per_iteration": 3.356153726577759 }, { "auxiliary_loss_clip": 0.0113224, "auxiliary_loss_mlp": 0.01030687, "balance_loss_clip": 1.01794362, "balance_loss_mlp": 1.04068434, "epoch": 0.5762212535698181, "flos": 16246167033600.0, "grad_norm": 1.9156649130896826, "language_loss": 0.82355833, "learning_rate": 1.5258567231894977e-06, "loss": 0.8451876, "num_input_tokens_seen": 206431900, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 9584, "time_per_iteration": 2.5413870811462402 }, { "auxiliary_loss_clip": 0.01037652, "auxiliary_loss_mlp": 0.01000518, "balance_loss_clip": 0.99881381, "balance_loss_mlp": 1.01481307, "epoch": 0.5762813768224861, "flos": 70185504216960.0, "grad_norm": 0.6243995401088245, "language_loss": 0.49530485, "learning_rate": 1.525489680218923e-06, "loss": 0.51568651, "num_input_tokens_seen": 206501200, "router_z_loss_clip": 0.01708984, "router_z_loss_mlp": 0.22851562, "step": 9585, "time_per_iteration": 3.207455635070801 }, { "auxiliary_loss_clip": 0.01126831, "auxiliary_loss_mlp": 0.01029218, "balance_loss_clip": 1.01665354, "balance_loss_mlp": 1.03810501, "epoch": 0.5763415000751541, "flos": 20518028645760.0, "grad_norm": 1.8246164928082567, "language_loss": 0.84820879, "learning_rate": 1.5251226541824003e-06, "loss": 0.86976922, "num_input_tokens_seen": 206520575, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 9586, "time_per_iteration": 2.5264689922332764 }, { "auxiliary_loss_clip": 0.01121681, "auxiliary_loss_mlp": 0.01031855, "balance_loss_clip": 1.01792502, "balance_loss_mlp": 1.03982377, "epoch": 0.5764016233278221, "flos": 15815526877440.0, "grad_norm": 1.6330795337216386, "language_loss": 0.7979278, "learning_rate": 1.5247556450930287e-06, "loss": 0.81946313, "num_input_tokens_seen": 206538060, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73046875, "step": 9587, "time_per_iteration": 2.5633292198181152 }, { "auxiliary_loss_clip": 0.01129424, "auxiliary_loss_mlp": 0.01035332, "balance_loss_clip": 1.02164125, "balance_loss_mlp": 1.0399195, "epoch": 0.57646174658049, "flos": 20772312001920.0, "grad_norm": 1.8890144890632963, "language_loss": 0.65779221, "learning_rate": 1.524388652963906e-06, "loss": 0.67943978, "num_input_tokens_seen": 206557320, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 9588, "time_per_iteration": 2.5673646926879883 }, { "auxiliary_loss_clip": 0.01132411, "auxiliary_loss_mlp": 0.01034514, "balance_loss_clip": 1.0201019, "balance_loss_mlp": 1.04081082, "epoch": 0.576521869833158, "flos": 23549930812800.0, "grad_norm": 1.7138207256686289, "language_loss": 0.78752732, "learning_rate": 1.5240216778081282e-06, "loss": 0.80919659, "num_input_tokens_seen": 206575780, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.734375, "step": 9589, "time_per_iteration": 2.6477537155151367 }, { "auxiliary_loss_clip": 0.01107408, "auxiliary_loss_mlp": 0.01028171, "balance_loss_clip": 1.01545739, "balance_loss_mlp": 1.03695989, "epoch": 0.5765819930858259, "flos": 20266582464000.0, "grad_norm": 2.2128617788533242, "language_loss": 0.793154, "learning_rate": 1.523654719638793e-06, "loss": 0.81450981, "num_input_tokens_seen": 206594100, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 9590, "time_per_iteration": 2.514841079711914 }, { "auxiliary_loss_clip": 0.01155305, "auxiliary_loss_mlp": 0.01278076, "balance_loss_clip": 1.01907349, "balance_loss_mlp": 1.03980744, "epoch": 0.5766421163384939, "flos": 23148772744320.0, "grad_norm": 1.7217659523347166, "language_loss": 0.63030243, "learning_rate": 1.5232877784689947e-06, "loss": 0.65463626, "num_input_tokens_seen": 206613325, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7109375, "step": 9591, "time_per_iteration": 2.634876012802124 }, { "auxiliary_loss_clip": 0.01115809, "auxiliary_loss_mlp": 0.01284548, "balance_loss_clip": 1.02464628, "balance_loss_mlp": 1.04272509, "epoch": 0.5767022395911618, "flos": 25848895962240.0, "grad_norm": 1.5475618661361075, "language_loss": 0.77896845, "learning_rate": 1.5229208543118302e-06, "loss": 0.80297196, "num_input_tokens_seen": 206634265, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 9592, "time_per_iteration": 2.550189733505249 }, { "auxiliary_loss_clip": 0.01083003, "auxiliary_loss_mlp": 0.01000568, "balance_loss_clip": 0.99872077, "balance_loss_mlp": 1.01552022, "epoch": 0.5767623628438299, "flos": 68293299657600.0, "grad_norm": 0.7314468674462824, "language_loss": 0.59664214, "learning_rate": 1.522553947180393e-06, "loss": 0.61747789, "num_input_tokens_seen": 206696990, "router_z_loss_clip": 0.01843262, "router_z_loss_mlp": 0.22851562, "step": 9593, "time_per_iteration": 3.302013635635376 }, { "auxiliary_loss_clip": 0.01124236, "auxiliary_loss_mlp": 0.01041023, "balance_loss_clip": 1.02688456, "balance_loss_mlp": 1.04034829, "epoch": 0.5768224860964978, "flos": 30188448754560.0, "grad_norm": 1.7855889925470187, "language_loss": 0.71032894, "learning_rate": 1.5221870570877771e-06, "loss": 0.73198158, "num_input_tokens_seen": 206717815, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75, "step": 9594, "time_per_iteration": 2.6412947177886963 }, { "auxiliary_loss_clip": 0.01139213, "auxiliary_loss_mlp": 0.01030917, "balance_loss_clip": 1.01872778, "balance_loss_mlp": 1.0399735, "epoch": 0.5768826093491658, "flos": 17895041884800.0, "grad_norm": 1.6718901853853336, "language_loss": 0.70638919, "learning_rate": 1.5218201840470761e-06, "loss": 0.72809047, "num_input_tokens_seen": 206735985, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7265625, "step": 9595, "time_per_iteration": 2.5396220684051514 }, { "auxiliary_loss_clip": 0.01113001, "auxiliary_loss_mlp": 0.01029599, "balance_loss_clip": 1.01605725, "balance_loss_mlp": 1.04030764, "epoch": 0.5769427326018337, "flos": 17457183095040.0, "grad_norm": 2.12685479538788, "language_loss": 0.70118332, "learning_rate": 1.5214533280713827e-06, "loss": 0.7226094, "num_input_tokens_seen": 206753370, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 9596, "time_per_iteration": 2.546778917312622 }, { "auxiliary_loss_clip": 0.0113526, "auxiliary_loss_mlp": 0.01040611, "balance_loss_clip": 1.02544725, "balance_loss_mlp": 1.04075611, "epoch": 0.5770028558545017, "flos": 39421728345600.0, "grad_norm": 2.365544553060537, "language_loss": 0.67549658, "learning_rate": 1.521086489173789e-06, "loss": 0.69725525, "num_input_tokens_seen": 206777645, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.76953125, "step": 9597, "time_per_iteration": 2.72259783744812 }, { "auxiliary_loss_clip": 0.01130564, "auxiliary_loss_mlp": 0.01036695, "balance_loss_clip": 1.02364182, "balance_loss_mlp": 1.04015422, "epoch": 0.5770629791071697, "flos": 21536383132800.0, "grad_norm": 4.474703275783943, "language_loss": 0.81528378, "learning_rate": 1.520719667367387e-06, "loss": 0.83695638, "num_input_tokens_seen": 206794865, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 9598, "time_per_iteration": 2.6110165119171143 }, { "auxiliary_loss_clip": 0.0114124, "auxiliary_loss_mlp": 0.01031993, "balance_loss_clip": 1.01814723, "balance_loss_mlp": 1.04060602, "epoch": 0.5771231023598377, "flos": 20886795624960.0, "grad_norm": 1.656296960289047, "language_loss": 0.72967166, "learning_rate": 1.5203528626652666e-06, "loss": 0.75140399, "num_input_tokens_seen": 206814095, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 9599, "time_per_iteration": 2.561380386352539 }, { "auxiliary_loss_clip": 0.01108835, "auxiliary_loss_mlp": 0.01034733, "balance_loss_clip": 1.02231789, "balance_loss_mlp": 1.03894722, "epoch": 0.5771832256125057, "flos": 18077216688000.0, "grad_norm": 1.831914766222018, "language_loss": 0.78055012, "learning_rate": 1.5199860750805196e-06, "loss": 0.80198574, "num_input_tokens_seen": 206832245, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 9600, "time_per_iteration": 2.5426583290100098 }, { "auxiliary_loss_clip": 0.01123402, "auxiliary_loss_mlp": 0.01285702, "balance_loss_clip": 1.02489591, "balance_loss_mlp": 1.04110241, "epoch": 0.5772433488651736, "flos": 26359078786560.0, "grad_norm": 1.9319083605191139, "language_loss": 0.72404242, "learning_rate": 1.519619304626234e-06, "loss": 0.74813342, "num_input_tokens_seen": 206851535, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 9601, "time_per_iteration": 2.627204656600952 }, { "auxiliary_loss_clip": 0.0115809, "auxiliary_loss_mlp": 0.01037619, "balance_loss_clip": 1.0243988, "balance_loss_mlp": 1.04133308, "epoch": 0.5773034721178416, "flos": 19680987035520.0, "grad_norm": 1.6475931720155208, "language_loss": 0.68334252, "learning_rate": 1.5192525513155e-06, "loss": 0.70529962, "num_input_tokens_seen": 206870595, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 9602, "time_per_iteration": 2.609839677810669 }, { "auxiliary_loss_clip": 0.01127042, "auxiliary_loss_mlp": 0.01032086, "balance_loss_clip": 1.02000451, "balance_loss_mlp": 1.04024458, "epoch": 0.5773635953705095, "flos": 25082885496960.0, "grad_norm": 1.508115853482432, "language_loss": 0.73312706, "learning_rate": 1.5188858151614056e-06, "loss": 0.7547183, "num_input_tokens_seen": 206892320, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 9603, "time_per_iteration": 2.5671732425689697 }, { "auxiliary_loss_clip": 0.01120528, "auxiliary_loss_mlp": 0.01029008, "balance_loss_clip": 1.01590133, "balance_loss_mlp": 1.04045701, "epoch": 0.5774237186231775, "flos": 21032987978880.0, "grad_norm": 1.7009166718073883, "language_loss": 0.76513016, "learning_rate": 1.5185190961770394e-06, "loss": 0.7866255, "num_input_tokens_seen": 206912485, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 9604, "time_per_iteration": 2.623897075653076 }, { "auxiliary_loss_clip": 0.01139425, "auxiliary_loss_mlp": 0.0103793, "balance_loss_clip": 1.02379203, "balance_loss_mlp": 1.03931642, "epoch": 0.5774838418758454, "flos": 15231727128960.0, "grad_norm": 2.1782347839038425, "language_loss": 0.83281171, "learning_rate": 1.5181523943754878e-06, "loss": 0.85458523, "num_input_tokens_seen": 206929100, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.734375, "step": 9605, "time_per_iteration": 2.622370958328247 }, { "auxiliary_loss_clip": 0.01146222, "auxiliary_loss_mlp": 0.01033546, "balance_loss_clip": 1.01829278, "balance_loss_mlp": 1.04190195, "epoch": 0.5775439651285135, "flos": 23582609210880.0, "grad_norm": 1.6805946621102499, "language_loss": 0.78390211, "learning_rate": 1.5177857097698378e-06, "loss": 0.80569977, "num_input_tokens_seen": 206947020, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.77734375, "step": 9606, "time_per_iteration": 2.5841681957244873 }, { "auxiliary_loss_clip": 0.01110302, "auxiliary_loss_mlp": 0.01036097, "balance_loss_clip": 1.0236398, "balance_loss_mlp": 1.03943515, "epoch": 0.5776040883811814, "flos": 18040515966720.0, "grad_norm": 1.661568392246642, "language_loss": 0.73790675, "learning_rate": 1.5174190423731755e-06, "loss": 0.75937074, "num_input_tokens_seen": 206964065, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.70703125, "step": 9607, "time_per_iteration": 2.4641807079315186 }, { "auxiliary_loss_clip": 0.01141456, "auxiliary_loss_mlp": 0.01036268, "balance_loss_clip": 1.02308345, "balance_loss_mlp": 1.0403415, "epoch": 0.5776642116338494, "flos": 18624638937600.0, "grad_norm": 1.5567717880105267, "language_loss": 0.69313323, "learning_rate": 1.517052392198586e-06, "loss": 0.71491051, "num_input_tokens_seen": 206981940, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 9608, "time_per_iteration": 2.5847747325897217 }, { "auxiliary_loss_clip": 0.01140121, "auxiliary_loss_mlp": 0.010374, "balance_loss_clip": 1.02373302, "balance_loss_mlp": 1.04033327, "epoch": 0.5777243348865173, "flos": 28402539517440.0, "grad_norm": 3.850558416938362, "language_loss": 0.76521915, "learning_rate": 1.5166857592591547e-06, "loss": 0.78699434, "num_input_tokens_seen": 207002365, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 9609, "time_per_iteration": 2.589211940765381 }, { "auxiliary_loss_clip": 0.01140973, "auxiliary_loss_mlp": 0.01034863, "balance_loss_clip": 1.0216186, "balance_loss_mlp": 1.04035616, "epoch": 0.5777844581391853, "flos": 24024705805440.0, "grad_norm": 1.9959642691795385, "language_loss": 0.77176821, "learning_rate": 1.5163191435679651e-06, "loss": 0.79352653, "num_input_tokens_seen": 207021195, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 9610, "time_per_iteration": 2.6517767906188965 }, { "auxiliary_loss_clip": 0.01139081, "auxiliary_loss_mlp": 0.01033147, "balance_loss_clip": 1.01956272, "balance_loss_mlp": 1.03971386, "epoch": 0.5778445813918534, "flos": 17777361951360.0, "grad_norm": 2.831376686572291, "language_loss": 0.68872845, "learning_rate": 1.5159525451381012e-06, "loss": 0.71045077, "num_input_tokens_seen": 207037465, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 9611, "time_per_iteration": 3.9333839416503906 }, { "auxiliary_loss_clip": 0.0112014, "auxiliary_loss_mlp": 0.010281, "balance_loss_clip": 1.01477861, "balance_loss_mlp": 1.03928506, "epoch": 0.5779047046445213, "flos": 22233194046720.0, "grad_norm": 1.7410838665944444, "language_loss": 0.83071089, "learning_rate": 1.515585963982646e-06, "loss": 0.8521933, "num_input_tokens_seen": 207054230, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 9612, "time_per_iteration": 2.5640320777893066 }, { "auxiliary_loss_clip": 0.01111904, "auxiliary_loss_mlp": 0.01034133, "balance_loss_clip": 1.02056718, "balance_loss_mlp": 1.03969455, "epoch": 0.5779648278971893, "flos": 21434361528960.0, "grad_norm": 1.7749759573473796, "language_loss": 0.79295266, "learning_rate": 1.5152194001146813e-06, "loss": 0.81441301, "num_input_tokens_seen": 207073150, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 9613, "time_per_iteration": 2.5659754276275635 }, { "auxiliary_loss_clip": 0.01118146, "auxiliary_loss_mlp": 0.01034057, "balance_loss_clip": 1.022035, "balance_loss_mlp": 1.03801346, "epoch": 0.5780249511498572, "flos": 19026120228480.0, "grad_norm": 1.7209696790635154, "language_loss": 0.77311003, "learning_rate": 1.5148528535472894e-06, "loss": 0.79463208, "num_input_tokens_seen": 207090375, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 9614, "time_per_iteration": 2.519958019256592 }, { "auxiliary_loss_clip": 0.01140933, "auxiliary_loss_mlp": 0.01034627, "balance_loss_clip": 1.02142453, "balance_loss_mlp": 1.03961599, "epoch": 0.5780850744025252, "flos": 12124663752960.0, "grad_norm": 2.318545659310001, "language_loss": 0.81303287, "learning_rate": 1.514486324293552e-06, "loss": 0.83478844, "num_input_tokens_seen": 207106030, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75, "step": 9615, "time_per_iteration": 2.525862455368042 }, { "auxiliary_loss_clip": 0.01121966, "auxiliary_loss_mlp": 0.01033035, "balance_loss_clip": 1.01934433, "balance_loss_mlp": 1.03897536, "epoch": 0.5781451976551931, "flos": 25044425009280.0, "grad_norm": 1.557203006117818, "language_loss": 0.67163789, "learning_rate": 1.5141198123665477e-06, "loss": 0.69318789, "num_input_tokens_seen": 207125435, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 9616, "time_per_iteration": 3.9315943717956543 }, { "auxiliary_loss_clip": 0.01119738, "auxiliary_loss_mlp": 0.01030776, "balance_loss_clip": 1.01754999, "balance_loss_mlp": 1.04019773, "epoch": 0.5782053209078611, "flos": 19245606284160.0, "grad_norm": 1.871212524472863, "language_loss": 0.77775514, "learning_rate": 1.513753317779358e-06, "loss": 0.79926026, "num_input_tokens_seen": 207145095, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 9617, "time_per_iteration": 2.520787477493286 }, { "auxiliary_loss_clip": 0.01129016, "auxiliary_loss_mlp": 0.01033854, "balance_loss_clip": 1.02020407, "balance_loss_mlp": 1.03990877, "epoch": 0.578265444160529, "flos": 25993831340160.0, "grad_norm": 1.4417013015471838, "language_loss": 0.75054395, "learning_rate": 1.5133868405450611e-06, "loss": 0.77217269, "num_input_tokens_seen": 207166045, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 9618, "time_per_iteration": 2.590916633605957 }, { "auxiliary_loss_clip": 0.0111001, "auxiliary_loss_mlp": 0.01029024, "balance_loss_clip": 1.01653671, "balance_loss_mlp": 1.03870094, "epoch": 0.5783255674131971, "flos": 21798603394560.0, "grad_norm": 1.572221412279327, "language_loss": 0.81439912, "learning_rate": 1.5130203806767367e-06, "loss": 0.83578944, "num_input_tokens_seen": 207185290, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 9619, "time_per_iteration": 2.7104368209838867 }, { "auxiliary_loss_clip": 0.01130725, "auxiliary_loss_mlp": 0.01033345, "balance_loss_clip": 1.02082849, "balance_loss_mlp": 1.04011452, "epoch": 0.578385690665865, "flos": 24789746603520.0, "grad_norm": 1.638443674254882, "language_loss": 0.72682881, "learning_rate": 1.512653938187462e-06, "loss": 0.74846953, "num_input_tokens_seen": 207205505, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73046875, "step": 9620, "time_per_iteration": 2.6455891132354736 }, { "auxiliary_loss_clip": 0.01131112, "auxiliary_loss_mlp": 0.01030939, "balance_loss_clip": 1.01804662, "balance_loss_mlp": 1.04054618, "epoch": 0.578445813918533, "flos": 21212864311680.0, "grad_norm": 1.4821016573331762, "language_loss": 0.76772571, "learning_rate": 1.5122875130903147e-06, "loss": 0.78934616, "num_input_tokens_seen": 207225315, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 9621, "time_per_iteration": 4.207132577896118 }, { "auxiliary_loss_clip": 0.01158379, "auxiliary_loss_mlp": 0.01034822, "balance_loss_clip": 1.0213511, "balance_loss_mlp": 1.03917003, "epoch": 0.5785059371712009, "flos": 25046795306880.0, "grad_norm": 1.4616596608646086, "language_loss": 0.70412052, "learning_rate": 1.5119211053983715e-06, "loss": 0.72605258, "num_input_tokens_seen": 207247690, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 9622, "time_per_iteration": 2.6848318576812744 }, { "auxiliary_loss_clip": 0.01054865, "auxiliary_loss_mlp": 0.01004495, "balance_loss_clip": 1.00282598, "balance_loss_mlp": 1.01468372, "epoch": 0.5785660604238689, "flos": 70843172284800.0, "grad_norm": 0.7717281345349601, "language_loss": 0.56019074, "learning_rate": 1.5115547151247082e-06, "loss": 0.58078432, "num_input_tokens_seen": 207301735, "router_z_loss_clip": 0.01672363, "router_z_loss_mlp": 0.22851562, "step": 9623, "time_per_iteration": 4.685261964797974 }, { "auxiliary_loss_clip": 0.01140922, "auxiliary_loss_mlp": 0.01033845, "balance_loss_clip": 1.01895523, "balance_loss_mlp": 1.03918993, "epoch": 0.578626183676537, "flos": 31649977244160.0, "grad_norm": 2.799469660211144, "language_loss": 0.71147227, "learning_rate": 1.5111883422824013e-06, "loss": 0.73321992, "num_input_tokens_seen": 207321240, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.75390625, "step": 9624, "time_per_iteration": 2.6393814086914062 }, { "auxiliary_loss_clip": 0.01139915, "auxiliary_loss_mlp": 0.01041539, "balance_loss_clip": 1.02755547, "balance_loss_mlp": 1.04129744, "epoch": 0.5786863069292049, "flos": 21865181253120.0, "grad_norm": 1.9729422474171907, "language_loss": 0.82497221, "learning_rate": 1.5108219868845247e-06, "loss": 0.84678674, "num_input_tokens_seen": 207339540, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71875, "step": 9625, "time_per_iteration": 2.6182901859283447 }, { "auxiliary_loss_clip": 0.01108451, "auxiliary_loss_mlp": 0.01035364, "balance_loss_clip": 1.02157187, "balance_loss_mlp": 1.03702497, "epoch": 0.5787464301818729, "flos": 23364954748800.0, "grad_norm": 1.5092933667802013, "language_loss": 0.70199966, "learning_rate": 1.5104556489441534e-06, "loss": 0.72343779, "num_input_tokens_seen": 207360470, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71484375, "step": 9626, "time_per_iteration": 2.556654930114746 }, { "auxiliary_loss_clip": 0.01127756, "auxiliary_loss_mlp": 0.01039294, "balance_loss_clip": 1.02641356, "balance_loss_mlp": 1.03854585, "epoch": 0.5788065534345408, "flos": 30004011394560.0, "grad_norm": 1.412192877212368, "language_loss": 0.71195173, "learning_rate": 1.5100893284743605e-06, "loss": 0.73362219, "num_input_tokens_seen": 207383080, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 9627, "time_per_iteration": 2.663417100906372 }, { "auxiliary_loss_clip": 0.01117417, "auxiliary_loss_mlp": 0.01025131, "balance_loss_clip": 1.01241708, "balance_loss_mlp": 1.03768492, "epoch": 0.5788666766872088, "flos": 24527849564160.0, "grad_norm": 1.505190143308105, "language_loss": 0.83626997, "learning_rate": 1.509723025488219e-06, "loss": 0.85769546, "num_input_tokens_seen": 207401000, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 9628, "time_per_iteration": 2.5584216117858887 }, { "auxiliary_loss_clip": 0.01128685, "auxiliary_loss_mlp": 0.01031047, "balance_loss_clip": 1.01753449, "balance_loss_mlp": 1.03821111, "epoch": 0.5789267999398767, "flos": 23732823888000.0, "grad_norm": 1.5228431030344634, "language_loss": 0.72273135, "learning_rate": 1.5093567399988022e-06, "loss": 0.74432868, "num_input_tokens_seen": 207419230, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 9629, "time_per_iteration": 2.5863375663757324 }, { "auxiliary_loss_clip": 0.01130702, "auxiliary_loss_mlp": 0.01037385, "balance_loss_clip": 1.02347326, "balance_loss_mlp": 1.04049802, "epoch": 0.5789869231925447, "flos": 21135045496320.0, "grad_norm": 2.1318041680617, "language_loss": 0.74534076, "learning_rate": 1.5089904720191809e-06, "loss": 0.7670216, "num_input_tokens_seen": 207437615, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7265625, "step": 9630, "time_per_iteration": 2.5498921871185303 }, { "auxiliary_loss_clip": 0.01133862, "auxiliary_loss_mlp": 0.0127854, "balance_loss_clip": 1.01898527, "balance_loss_mlp": 1.03740788, "epoch": 0.5790470464452127, "flos": 21209632087680.0, "grad_norm": 1.5236606544208033, "language_loss": 0.7911256, "learning_rate": 1.5086242215624268e-06, "loss": 0.81524962, "num_input_tokens_seen": 207457270, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 9631, "time_per_iteration": 2.5792815685272217 }, { "auxiliary_loss_clip": 0.01110303, "auxiliary_loss_mlp": 0.01026404, "balance_loss_clip": 1.01315975, "balance_loss_mlp": 1.03781748, "epoch": 0.5791071696978807, "flos": 23404384903680.0, "grad_norm": 1.6534618177740106, "language_loss": 0.74737656, "learning_rate": 1.5082579886416102e-06, "loss": 0.76874363, "num_input_tokens_seen": 207477890, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 9632, "time_per_iteration": 2.5859646797180176 }, { "auxiliary_loss_clip": 0.01129309, "auxiliary_loss_mlp": 0.01030098, "balance_loss_clip": 1.01576364, "balance_loss_mlp": 1.03900754, "epoch": 0.5791672929505486, "flos": 24206521472640.0, "grad_norm": 2.0606851929082746, "language_loss": 0.79259157, "learning_rate": 1.5078917732698009e-06, "loss": 0.81418562, "num_input_tokens_seen": 207497670, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.72265625, "step": 9633, "time_per_iteration": 2.625809907913208 }, { "auxiliary_loss_clip": 0.01147242, "auxiliary_loss_mlp": 0.01035902, "balance_loss_clip": 1.02162063, "balance_loss_mlp": 1.04143655, "epoch": 0.5792274162032166, "flos": 24348871071360.0, "grad_norm": 1.7828978866800458, "language_loss": 0.77590734, "learning_rate": 1.5075255754600686e-06, "loss": 0.79773879, "num_input_tokens_seen": 207516105, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 9634, "time_per_iteration": 2.6157636642456055 }, { "auxiliary_loss_clip": 0.01132843, "auxiliary_loss_mlp": 0.01034463, "balance_loss_clip": 1.02050328, "balance_loss_mlp": 1.03963101, "epoch": 0.5792875394558845, "flos": 20449403712000.0, "grad_norm": 1.9717078937313393, "language_loss": 0.63020098, "learning_rate": 1.5071593952254814e-06, "loss": 0.65187395, "num_input_tokens_seen": 207533685, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 9635, "time_per_iteration": 2.611727714538574 }, { "auxiliary_loss_clip": 0.0112926, "auxiliary_loss_mlp": 0.01036038, "balance_loss_clip": 1.02301478, "balance_loss_mlp": 1.04058111, "epoch": 0.5793476627085525, "flos": 24060329118720.0, "grad_norm": 1.4702554425205414, "language_loss": 0.76742303, "learning_rate": 1.5067932325791077e-06, "loss": 0.78907597, "num_input_tokens_seen": 207552840, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 9636, "time_per_iteration": 2.6095707416534424 }, { "auxiliary_loss_clip": 0.01124551, "auxiliary_loss_mlp": 0.01029421, "balance_loss_clip": 1.01685071, "balance_loss_mlp": 1.03772938, "epoch": 0.5794077859612206, "flos": 22054287381120.0, "grad_norm": 1.6414844017073058, "language_loss": 0.68524706, "learning_rate": 1.5064270875340153e-06, "loss": 0.70678675, "num_input_tokens_seen": 207572095, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 9637, "time_per_iteration": 2.506049394607544 }, { "auxiliary_loss_clip": 0.01147727, "auxiliary_loss_mlp": 0.01031252, "balance_loss_clip": 1.01763868, "balance_loss_mlp": 1.03765988, "epoch": 0.5794679092138885, "flos": 11434855991040.0, "grad_norm": 2.4185013491649294, "language_loss": 0.72465998, "learning_rate": 1.50606096010327e-06, "loss": 0.74644971, "num_input_tokens_seen": 207587495, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.74609375, "step": 9638, "time_per_iteration": 2.533843517303467 }, { "auxiliary_loss_clip": 0.01121508, "auxiliary_loss_mlp": 0.01035369, "balance_loss_clip": 1.02270293, "balance_loss_mlp": 1.04061437, "epoch": 0.5795280324665565, "flos": 18880215183360.0, "grad_norm": 1.6893668893344707, "language_loss": 0.72159338, "learning_rate": 1.5056948502999386e-06, "loss": 0.74316216, "num_input_tokens_seen": 207606795, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 9639, "time_per_iteration": 2.48288893699646 }, { "auxiliary_loss_clip": 0.01131585, "auxiliary_loss_mlp": 0.01034251, "balance_loss_clip": 1.02075636, "balance_loss_mlp": 1.04017091, "epoch": 0.5795881557192244, "flos": 13005947940480.0, "grad_norm": 2.1054748742958957, "language_loss": 0.69782168, "learning_rate": 1.5053287581370863e-06, "loss": 0.71948004, "num_input_tokens_seen": 207623620, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 9640, "time_per_iteration": 2.5024447441101074 }, { "auxiliary_loss_clip": 0.01137371, "auxiliary_loss_mlp": 0.01038188, "balance_loss_clip": 1.02418685, "balance_loss_mlp": 1.04130936, "epoch": 0.5796482789718924, "flos": 19932397303680.0, "grad_norm": 2.9730162202948813, "language_loss": 0.77494049, "learning_rate": 1.5049626836277787e-06, "loss": 0.79669613, "num_input_tokens_seen": 207639380, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78515625, "step": 9641, "time_per_iteration": 2.5219616889953613 }, { "auxiliary_loss_clip": 0.01112145, "auxiliary_loss_mlp": 0.01037414, "balance_loss_clip": 1.02480197, "balance_loss_mlp": 1.03959584, "epoch": 0.5797084022245603, "flos": 21650794928640.0, "grad_norm": 1.7719456283877677, "language_loss": 0.73766106, "learning_rate": 1.5045966267850793e-06, "loss": 0.7591567, "num_input_tokens_seen": 207657915, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 9642, "time_per_iteration": 2.6103246212005615 }, { "auxiliary_loss_clip": 0.01129189, "auxiliary_loss_mlp": 0.01036875, "balance_loss_clip": 1.02342188, "balance_loss_mlp": 1.03920317, "epoch": 0.5797685254772283, "flos": 26031573555840.0, "grad_norm": 1.5078192511125184, "language_loss": 0.73146462, "learning_rate": 1.5042305876220515e-06, "loss": 0.75312531, "num_input_tokens_seen": 207678620, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 9643, "time_per_iteration": 2.58919358253479 }, { "auxiliary_loss_clip": 0.01115301, "auxiliary_loss_mlp": 0.01033193, "balance_loss_clip": 1.019508, "balance_loss_mlp": 1.03980613, "epoch": 0.5798286487298963, "flos": 22705167778560.0, "grad_norm": 2.1135691856220484, "language_loss": 0.67375302, "learning_rate": 1.5038645661517594e-06, "loss": 0.69523799, "num_input_tokens_seen": 207696980, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 9644, "time_per_iteration": 2.522745132446289 }, { "auxiliary_loss_clip": 0.01127256, "auxiliary_loss_mlp": 0.01035611, "balance_loss_clip": 1.02085853, "balance_loss_mlp": 1.04180408, "epoch": 0.5798887719825643, "flos": 23148988225920.0, "grad_norm": 2.355554004963052, "language_loss": 0.8520897, "learning_rate": 1.5034985623872647e-06, "loss": 0.87371838, "num_input_tokens_seen": 207714065, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.765625, "step": 9645, "time_per_iteration": 2.493328332901001 }, { "auxiliary_loss_clip": 0.01115869, "auxiliary_loss_mlp": 0.01032179, "balance_loss_clip": 1.01915526, "balance_loss_mlp": 1.04213071, "epoch": 0.5799488952352322, "flos": 24426043441920.0, "grad_norm": 1.8648973209631252, "language_loss": 0.75418711, "learning_rate": 1.5031325763416292e-06, "loss": 0.77566755, "num_input_tokens_seen": 207734720, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 9646, "time_per_iteration": 2.5750155448913574 }, { "auxiliary_loss_clip": 0.01134137, "auxiliary_loss_mlp": 0.01034108, "balance_loss_clip": 1.01982069, "balance_loss_mlp": 1.04125309, "epoch": 0.5800090184879002, "flos": 38395903829760.0, "grad_norm": 1.883651281270512, "language_loss": 0.59289086, "learning_rate": 1.5027666080279134e-06, "loss": 0.6145733, "num_input_tokens_seen": 207755435, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75390625, "step": 9647, "time_per_iteration": 2.679267644882202 }, { "auxiliary_loss_clip": 0.01152253, "auxiliary_loss_mlp": 0.0104062, "balance_loss_clip": 1.02623153, "balance_loss_mlp": 1.04114914, "epoch": 0.5800691417405681, "flos": 19784840232960.0, "grad_norm": 2.123202194583209, "language_loss": 0.8417182, "learning_rate": 1.5024006574591788e-06, "loss": 0.86364692, "num_input_tokens_seen": 207773570, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75390625, "step": 9648, "time_per_iteration": 2.5967512130737305 }, { "auxiliary_loss_clip": 0.0104548, "auxiliary_loss_mlp": 0.01004526, "balance_loss_clip": 1.00284505, "balance_loss_mlp": 1.01354396, "epoch": 0.5801292649932361, "flos": 70314565783680.0, "grad_norm": 0.9561155584469386, "language_loss": 0.62985671, "learning_rate": 1.5020347246484848e-06, "loss": 0.65035677, "num_input_tokens_seen": 207830095, "router_z_loss_clip": 0.0168457, "router_z_loss_mlp": 0.23046875, "step": 9649, "time_per_iteration": 3.281379461288452 }, { "auxiliary_loss_clip": 0.01136725, "auxiliary_loss_mlp": 0.01030145, "balance_loss_clip": 1.0173955, "balance_loss_mlp": 1.03847325, "epoch": 0.5801893882459042, "flos": 18734812928640.0, "grad_norm": 1.881871134481788, "language_loss": 0.81677604, "learning_rate": 1.50166880960889e-06, "loss": 0.83844483, "num_input_tokens_seen": 207848555, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 9650, "time_per_iteration": 2.6552987098693848 }, { "auxiliary_loss_clip": 0.01143448, "auxiliary_loss_mlp": 0.01035712, "balance_loss_clip": 1.02250922, "balance_loss_mlp": 1.04067087, "epoch": 0.5802495114985721, "flos": 15596507698560.0, "grad_norm": 2.626015705993401, "language_loss": 0.77626264, "learning_rate": 1.501302912353454e-06, "loss": 0.79805422, "num_input_tokens_seen": 207867060, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.765625, "step": 9651, "time_per_iteration": 2.5184497833251953 }, { "auxiliary_loss_clip": 0.01134271, "auxiliary_loss_mlp": 0.01038685, "balance_loss_clip": 1.02465391, "balance_loss_mlp": 1.0417136, "epoch": 0.5803096347512401, "flos": 18255405081600.0, "grad_norm": 2.273849655777149, "language_loss": 0.74525291, "learning_rate": 1.500937032895234e-06, "loss": 0.76698244, "num_input_tokens_seen": 207884520, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 9652, "time_per_iteration": 3.8506548404693604 }, { "auxiliary_loss_clip": 0.0113301, "auxiliary_loss_mlp": 0.01033778, "balance_loss_clip": 1.01991987, "balance_loss_mlp": 1.03982401, "epoch": 0.580369758003908, "flos": 22893160584960.0, "grad_norm": 1.8754157808896428, "language_loss": 0.76753569, "learning_rate": 1.5005711712472877e-06, "loss": 0.78920352, "num_input_tokens_seen": 207905370, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75390625, "step": 9653, "time_per_iteration": 2.6767959594726562 }, { "auxiliary_loss_clip": 0.01114451, "auxiliary_loss_mlp": 0.0103619, "balance_loss_clip": 1.02139616, "balance_loss_mlp": 1.03999615, "epoch": 0.580429881256576, "flos": 18697681244160.0, "grad_norm": 2.6954213957847686, "language_loss": 0.74090248, "learning_rate": 1.5002053274226718e-06, "loss": 0.76240891, "num_input_tokens_seen": 207923790, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7421875, "step": 9654, "time_per_iteration": 2.490525245666504 }, { "auxiliary_loss_clip": 0.01156701, "auxiliary_loss_mlp": 0.01035699, "balance_loss_clip": 1.0227294, "balance_loss_mlp": 1.04165423, "epoch": 0.5804900045092439, "flos": 24681978823680.0, "grad_norm": 2.9062334841648645, "language_loss": 0.69863993, "learning_rate": 1.4998395014344416e-06, "loss": 0.72056395, "num_input_tokens_seen": 207942335, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 9655, "time_per_iteration": 2.654451608657837 }, { "auxiliary_loss_clip": 0.01117386, "auxiliary_loss_mlp": 0.01035231, "balance_loss_clip": 1.02124739, "balance_loss_mlp": 1.03908992, "epoch": 0.580550127761912, "flos": 23112790295040.0, "grad_norm": 3.5455172604774683, "language_loss": 0.68998456, "learning_rate": 1.4994736932956536e-06, "loss": 0.71151072, "num_input_tokens_seen": 207961975, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 9656, "time_per_iteration": 2.546970844268799 }, { "auxiliary_loss_clip": 0.0111844, "auxiliary_loss_mlp": 0.01032195, "balance_loss_clip": 1.01959467, "balance_loss_mlp": 1.0400033, "epoch": 0.5806102510145799, "flos": 18475681236480.0, "grad_norm": 1.578190037076967, "language_loss": 0.71675241, "learning_rate": 1.4991079030193614e-06, "loss": 0.73825872, "num_input_tokens_seen": 207979520, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 9657, "time_per_iteration": 3.987595319747925 }, { "auxiliary_loss_clip": 0.01123451, "auxiliary_loss_mlp": 0.01037362, "balance_loss_clip": 1.02238905, "balance_loss_mlp": 1.03829098, "epoch": 0.5806703742672479, "flos": 23915645136000.0, "grad_norm": 1.9290314741154564, "language_loss": 0.70998293, "learning_rate": 1.4987421306186202e-06, "loss": 0.73159111, "num_input_tokens_seen": 207998375, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.765625, "step": 9658, "time_per_iteration": 2.543842315673828 }, { "auxiliary_loss_clip": 0.01045125, "auxiliary_loss_mlp": 0.01000993, "balance_loss_clip": 0.99934834, "balance_loss_mlp": 1.01311672, "epoch": 0.5807304975199158, "flos": 66311999412480.0, "grad_norm": 0.6465548694370779, "language_loss": 0.53605908, "learning_rate": 1.498376376106483e-06, "loss": 0.55652028, "num_input_tokens_seen": 208060605, "router_z_loss_clip": 0.01647949, "router_z_loss_mlp": 0.23242188, "step": 9659, "time_per_iteration": 3.148644208908081 }, { "auxiliary_loss_clip": 0.01124835, "auxiliary_loss_mlp": 0.01041511, "balance_loss_clip": 1.02713442, "balance_loss_mlp": 1.04038668, "epoch": 0.5807906207725838, "flos": 31722444933120.0, "grad_norm": 1.8513470783718353, "language_loss": 0.61993337, "learning_rate": 1.4980106394960026e-06, "loss": 0.64159685, "num_input_tokens_seen": 208080320, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7578125, "step": 9660, "time_per_iteration": 2.6279795169830322 }, { "auxiliary_loss_clip": 0.01123029, "auxiliary_loss_mlp": 0.01036216, "balance_loss_clip": 1.02276897, "balance_loss_mlp": 1.03789806, "epoch": 0.5808507440252517, "flos": 23801161512960.0, "grad_norm": 1.5039521099587934, "language_loss": 0.64869273, "learning_rate": 1.4976449208002312e-06, "loss": 0.67028522, "num_input_tokens_seen": 208099305, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76171875, "step": 9661, "time_per_iteration": 2.533719301223755 }, { "auxiliary_loss_clip": 0.01143628, "auxiliary_loss_mlp": 0.0102877, "balance_loss_clip": 1.0163362, "balance_loss_mlp": 1.03883266, "epoch": 0.5809108672779197, "flos": 13698449222400.0, "grad_norm": 1.9700164133499742, "language_loss": 0.74726427, "learning_rate": 1.4972792200322197e-06, "loss": 0.76898819, "num_input_tokens_seen": 208116960, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 9662, "time_per_iteration": 2.558314085006714 }, { "auxiliary_loss_clip": 0.01036388, "auxiliary_loss_mlp": 0.01001527, "balance_loss_clip": 0.99984568, "balance_loss_mlp": 1.01321852, "epoch": 0.5809709905305876, "flos": 69134866381440.0, "grad_norm": 0.8830849270447234, "language_loss": 0.58378136, "learning_rate": 1.4969135372050204e-06, "loss": 0.60416043, "num_input_tokens_seen": 208182190, "router_z_loss_clip": 0.0168457, "router_z_loss_mlp": 0.23242188, "step": 9663, "time_per_iteration": 4.775710105895996 }, { "auxiliary_loss_clip": 0.01131137, "auxiliary_loss_mlp": 0.01032591, "balance_loss_clip": 1.01893556, "balance_loss_mlp": 1.04116273, "epoch": 0.5810311137832557, "flos": 19827538525440.0, "grad_norm": 2.250527955155243, "language_loss": 0.81998837, "learning_rate": 1.4965478723316826e-06, "loss": 0.84162569, "num_input_tokens_seen": 208197015, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 9664, "time_per_iteration": 4.407526731491089 }, { "auxiliary_loss_clip": 0.01131397, "auxiliary_loss_mlp": 0.01274462, "balance_loss_clip": 1.01429224, "balance_loss_mlp": 1.03701007, "epoch": 0.5810912370359237, "flos": 29238503719680.0, "grad_norm": 1.7874648563601392, "language_loss": 0.81303883, "learning_rate": 1.496182225425256e-06, "loss": 0.83709741, "num_input_tokens_seen": 208215795, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.76171875, "step": 9665, "time_per_iteration": 2.626577377319336 }, { "auxiliary_loss_clip": 0.01135682, "auxiliary_loss_mlp": 0.0103765, "balance_loss_clip": 1.02196777, "balance_loss_mlp": 1.04194868, "epoch": 0.5811513602885916, "flos": 22785572373120.0, "grad_norm": 2.4635223682215064, "language_loss": 0.80891299, "learning_rate": 1.4958165964987904e-06, "loss": 0.83064634, "num_input_tokens_seen": 208234655, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.76171875, "step": 9666, "time_per_iteration": 2.5591440200805664 }, { "auxiliary_loss_clip": 0.01120132, "auxiliary_loss_mlp": 0.01035978, "balance_loss_clip": 1.02316284, "balance_loss_mlp": 1.0395087, "epoch": 0.5812114835412596, "flos": 18734346051840.0, "grad_norm": 1.6943012361618754, "language_loss": 0.8001914, "learning_rate": 1.4954509855653328e-06, "loss": 0.82175255, "num_input_tokens_seen": 208251300, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 9667, "time_per_iteration": 2.5555813312530518 }, { "auxiliary_loss_clip": 0.01127472, "auxiliary_loss_mlp": 0.01039518, "balance_loss_clip": 1.02456975, "balance_loss_mlp": 1.03944051, "epoch": 0.5812716067939275, "flos": 26431295080320.0, "grad_norm": 1.5308856822788897, "language_loss": 0.78689229, "learning_rate": 1.4950853926379323e-06, "loss": 0.80856222, "num_input_tokens_seen": 208272685, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.79296875, "step": 9668, "time_per_iteration": 2.5867366790771484 }, { "auxiliary_loss_clip": 0.01132301, "auxiliary_loss_mlp": 0.01033502, "balance_loss_clip": 1.02115798, "balance_loss_mlp": 1.04036307, "epoch": 0.5813317300465956, "flos": 43397865285120.0, "grad_norm": 1.900211609875385, "language_loss": 0.6468643, "learning_rate": 1.4947198177296347e-06, "loss": 0.66852236, "num_input_tokens_seen": 208294315, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7421875, "step": 9669, "time_per_iteration": 2.7027535438537598 }, { "auxiliary_loss_clip": 0.01149064, "auxiliary_loss_mlp": 0.01038842, "balance_loss_clip": 1.02513885, "balance_loss_mlp": 1.03993273, "epoch": 0.5813918532992635, "flos": 24785472885120.0, "grad_norm": 1.7510294258616226, "language_loss": 0.72963357, "learning_rate": 1.4943542608534877e-06, "loss": 0.75151265, "num_input_tokens_seen": 208315610, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 9670, "time_per_iteration": 2.5981109142303467 }, { "auxiliary_loss_clip": 0.01140878, "auxiliary_loss_mlp": 0.01042425, "balance_loss_clip": 1.02886534, "balance_loss_mlp": 1.03898966, "epoch": 0.5814519765519315, "flos": 22857357703680.0, "grad_norm": 2.0326068742723127, "language_loss": 0.79106605, "learning_rate": 1.4939887220225361e-06, "loss": 0.81289911, "num_input_tokens_seen": 208334725, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75390625, "step": 9671, "time_per_iteration": 2.547889471054077 }, { "auxiliary_loss_clip": 0.01154055, "auxiliary_loss_mlp": 0.01038245, "balance_loss_clip": 1.02306914, "balance_loss_mlp": 1.04147637, "epoch": 0.5815120998045994, "flos": 24060831909120.0, "grad_norm": 2.4417008561822167, "language_loss": 0.61653924, "learning_rate": 1.4936232012498256e-06, "loss": 0.63846231, "num_input_tokens_seen": 208353825, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.76953125, "step": 9672, "time_per_iteration": 2.595712184906006 }, { "auxiliary_loss_clip": 0.01147453, "auxiliary_loss_mlp": 0.01034522, "balance_loss_clip": 1.02215385, "balance_loss_mlp": 1.04019713, "epoch": 0.5815722230572674, "flos": 24279491952000.0, "grad_norm": 1.6678558555626075, "language_loss": 0.80942786, "learning_rate": 1.4932576985484005e-06, "loss": 0.83124757, "num_input_tokens_seen": 208374160, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7109375, "step": 9673, "time_per_iteration": 2.6018972396850586 }, { "auxiliary_loss_clip": 0.01144691, "auxiliary_loss_mlp": 0.01040519, "balance_loss_clip": 1.02604699, "balance_loss_mlp": 1.04144228, "epoch": 0.5816323463099353, "flos": 22200371994240.0, "grad_norm": 2.036274460442838, "language_loss": 0.87944996, "learning_rate": 1.492892213931304e-06, "loss": 0.90130204, "num_input_tokens_seen": 208392105, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 9674, "time_per_iteration": 2.617283821105957 }, { "auxiliary_loss_clip": 0.01112656, "auxiliary_loss_mlp": 0.01283123, "balance_loss_clip": 1.02345562, "balance_loss_mlp": 1.03943849, "epoch": 0.5816924695626033, "flos": 24134448833280.0, "grad_norm": 1.4242553433967655, "language_loss": 0.78825355, "learning_rate": 1.4925267474115812e-06, "loss": 0.81221139, "num_input_tokens_seen": 208411755, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 9675, "time_per_iteration": 2.534726142883301 }, { "auxiliary_loss_clip": 0.01121917, "auxiliary_loss_mlp": 0.01036479, "balance_loss_clip": 1.02265668, "balance_loss_mlp": 1.04000068, "epoch": 0.5817525928152713, "flos": 21324223451520.0, "grad_norm": 2.52633278332263, "language_loss": 0.70547086, "learning_rate": 1.492161299002273e-06, "loss": 0.72705483, "num_input_tokens_seen": 208429995, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 9676, "time_per_iteration": 2.6210086345672607 }, { "auxiliary_loss_clip": 0.01159023, "auxiliary_loss_mlp": 0.01032057, "balance_loss_clip": 1.01823497, "balance_loss_mlp": 1.04001057, "epoch": 0.5818127160679393, "flos": 26934510666240.0, "grad_norm": 2.574176770903761, "language_loss": 0.63892037, "learning_rate": 1.4917958687164212e-06, "loss": 0.66083121, "num_input_tokens_seen": 208443655, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 9677, "time_per_iteration": 2.5530457496643066 }, { "auxiliary_loss_clip": 0.010528, "auxiliary_loss_mlp": 0.01007183, "balance_loss_clip": 1.00557351, "balance_loss_mlp": 1.01214671, "epoch": 0.5818728393206073, "flos": 63918626342400.0, "grad_norm": 0.8038120660685824, "language_loss": 0.54213393, "learning_rate": 1.491430456567068e-06, "loss": 0.56273365, "num_input_tokens_seen": 208498405, "router_z_loss_clip": 0.01611328, "router_z_loss_mlp": 0.22753906, "step": 9678, "time_per_iteration": 3.2142446041107178 }, { "auxiliary_loss_clip": 0.01053483, "auxiliary_loss_mlp": 0.01006546, "balance_loss_clip": 1.00486505, "balance_loss_mlp": 1.01276064, "epoch": 0.5819329625732752, "flos": 64954108638720.0, "grad_norm": 0.7421633958927687, "language_loss": 0.5617547, "learning_rate": 1.491065062567253e-06, "loss": 0.58235496, "num_input_tokens_seen": 208559075, "router_z_loss_clip": 0.0168457, "router_z_loss_mlp": 0.22851562, "step": 9679, "time_per_iteration": 3.057006597518921 }, { "auxiliary_loss_clip": 0.01109978, "auxiliary_loss_mlp": 0.01033067, "balance_loss_clip": 1.02086008, "balance_loss_mlp": 1.03747988, "epoch": 0.5819930858259432, "flos": 21215270522880.0, "grad_norm": 1.6873424798907646, "language_loss": 0.65572393, "learning_rate": 1.4906996867300174e-06, "loss": 0.67715442, "num_input_tokens_seen": 208577770, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7265625, "step": 9680, "time_per_iteration": 2.5156307220458984 }, { "auxiliary_loss_clip": 0.01128558, "auxiliary_loss_mlp": 0.01028419, "balance_loss_clip": 1.0156877, "balance_loss_mlp": 1.03811097, "epoch": 0.5820532090786111, "flos": 19458520151040.0, "grad_norm": 2.0944235766208896, "language_loss": 0.83054042, "learning_rate": 1.4903343290683999e-06, "loss": 0.85211021, "num_input_tokens_seen": 208595110, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 9681, "time_per_iteration": 2.5944244861602783 }, { "auxiliary_loss_clip": 0.01112062, "auxiliary_loss_mlp": 0.01031327, "balance_loss_clip": 1.01817179, "balance_loss_mlp": 1.03870702, "epoch": 0.5821133323312792, "flos": 17712615686400.0, "grad_norm": 1.7464895514680485, "language_loss": 0.75332558, "learning_rate": 1.4899689895954385e-06, "loss": 0.77475947, "num_input_tokens_seen": 208612080, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 9682, "time_per_iteration": 2.470064163208008 }, { "auxiliary_loss_clip": 0.01159528, "auxiliary_loss_mlp": 0.01029901, "balance_loss_clip": 1.01650834, "balance_loss_mlp": 1.03961968, "epoch": 0.5821734555839471, "flos": 24571804832640.0, "grad_norm": 1.874403544447957, "language_loss": 0.7476036, "learning_rate": 1.4896036683241727e-06, "loss": 0.76949787, "num_input_tokens_seen": 208630235, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 9683, "time_per_iteration": 2.682823419570923 }, { "auxiliary_loss_clip": 0.01132447, "auxiliary_loss_mlp": 0.01032151, "balance_loss_clip": 1.01822114, "balance_loss_mlp": 1.03856874, "epoch": 0.5822335788366151, "flos": 22382259488640.0, "grad_norm": 1.729996018057234, "language_loss": 0.74106151, "learning_rate": 1.4892383652676385e-06, "loss": 0.76270747, "num_input_tokens_seen": 208647925, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 9684, "time_per_iteration": 2.5553677082061768 }, { "auxiliary_loss_clip": 0.01131637, "auxiliary_loss_mlp": 0.01035381, "balance_loss_clip": 1.02188659, "balance_loss_mlp": 1.03918529, "epoch": 0.582293702089283, "flos": 26722494639360.0, "grad_norm": 2.130733469674565, "language_loss": 0.78488111, "learning_rate": 1.4888730804388736e-06, "loss": 0.80655122, "num_input_tokens_seen": 208666180, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 9685, "time_per_iteration": 2.590561628341675 }, { "auxiliary_loss_clip": 0.01113245, "auxiliary_loss_mlp": 0.01037393, "balance_loss_clip": 1.02468526, "balance_loss_mlp": 1.03951108, "epoch": 0.582353825341951, "flos": 17348661129600.0, "grad_norm": 1.6640876673105358, "language_loss": 0.75524706, "learning_rate": 1.4885078138509137e-06, "loss": 0.77675343, "num_input_tokens_seen": 208684240, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 9686, "time_per_iteration": 2.532423734664917 }, { "auxiliary_loss_clip": 0.01044331, "auxiliary_loss_mlp": 0.01001804, "balance_loss_clip": 1.00024271, "balance_loss_mlp": 1.01278138, "epoch": 0.5824139485946189, "flos": 55473261534720.0, "grad_norm": 0.8116274321113858, "language_loss": 0.57425606, "learning_rate": 1.4881425655167936e-06, "loss": 0.59471738, "num_input_tokens_seen": 208736090, "router_z_loss_clip": 0.015625, "router_z_loss_mlp": 0.22753906, "step": 9687, "time_per_iteration": 2.993882179260254 }, { "auxiliary_loss_clip": 0.0113857, "auxiliary_loss_mlp": 0.0103155, "balance_loss_clip": 1.01758468, "balance_loss_mlp": 1.03788292, "epoch": 0.582474071847287, "flos": 20303031790080.0, "grad_norm": 1.902593975432022, "language_loss": 0.6967274, "learning_rate": 1.4877773354495496e-06, "loss": 0.71842861, "num_input_tokens_seen": 208754600, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73828125, "step": 9688, "time_per_iteration": 2.5145821571350098 }, { "auxiliary_loss_clip": 0.01121157, "auxiliary_loss_mlp": 0.01033979, "balance_loss_clip": 1.02112269, "balance_loss_mlp": 1.03887665, "epoch": 0.5825341950999549, "flos": 23878010661120.0, "grad_norm": 2.209156831802734, "language_loss": 0.65645266, "learning_rate": 1.4874121236622141e-06, "loss": 0.67800403, "num_input_tokens_seen": 208773140, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 9689, "time_per_iteration": 2.552584648132324 }, { "auxiliary_loss_clip": 0.01129557, "auxiliary_loss_mlp": 0.01280835, "balance_loss_clip": 1.02018166, "balance_loss_mlp": 1.04031754, "epoch": 0.5825943183526229, "flos": 23113041690240.0, "grad_norm": 1.754847937201231, "language_loss": 0.73009837, "learning_rate": 1.4870469301678223e-06, "loss": 0.75420225, "num_input_tokens_seen": 208793410, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 9690, "time_per_iteration": 2.5223565101623535 }, { "auxiliary_loss_clip": 0.01131621, "auxiliary_loss_mlp": 0.0103684, "balance_loss_clip": 1.02191556, "balance_loss_mlp": 1.03890276, "epoch": 0.5826544416052909, "flos": 22857429530880.0, "grad_norm": 1.803605662150971, "language_loss": 0.75654638, "learning_rate": 1.4866817549794053e-06, "loss": 0.77823102, "num_input_tokens_seen": 208811920, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.75, "step": 9691, "time_per_iteration": 2.5825936794281006 }, { "auxiliary_loss_clip": 0.01120262, "auxiliary_loss_mlp": 0.01035329, "balance_loss_clip": 1.02222157, "balance_loss_mlp": 1.03753281, "epoch": 0.5827145648579588, "flos": 31501845555840.0, "grad_norm": 1.7755150682589604, "language_loss": 0.80571592, "learning_rate": 1.4863165981099963e-06, "loss": 0.82727182, "num_input_tokens_seen": 208834720, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 9692, "time_per_iteration": 2.619002342224121 }, { "auxiliary_loss_clip": 0.01043391, "auxiliary_loss_mlp": 0.01001669, "balance_loss_clip": 1.0, "balance_loss_mlp": 1.01204324, "epoch": 0.5827746881106268, "flos": 69811817074560.0, "grad_norm": 0.7745605245926882, "language_loss": 0.56879288, "learning_rate": 1.4859514595726267e-06, "loss": 0.58924347, "num_input_tokens_seen": 208898415, "router_z_loss_clip": 0.01672363, "router_z_loss_mlp": 0.22558594, "step": 9693, "time_per_iteration": 3.0850002765655518 }, { "auxiliary_loss_clip": 0.01121104, "auxiliary_loss_mlp": 0.01282097, "balance_loss_clip": 1.02273524, "balance_loss_mlp": 1.03826809, "epoch": 0.5828348113632947, "flos": 23112395245440.0, "grad_norm": 1.941400195015301, "language_loss": 0.79520881, "learning_rate": 1.485586339380327e-06, "loss": 0.81924087, "num_input_tokens_seen": 208919045, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73828125, "step": 9694, "time_per_iteration": 3.998852491378784 }, { "auxiliary_loss_clip": 0.01125659, "auxiliary_loss_mlp": 0.01034085, "balance_loss_clip": 1.02208102, "balance_loss_mlp": 1.03827167, "epoch": 0.5828949346159628, "flos": 21873082245120.0, "grad_norm": 1.3783113604483104, "language_loss": 0.76245797, "learning_rate": 1.4852212375461277e-06, "loss": 0.78405547, "num_input_tokens_seen": 208939375, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 9695, "time_per_iteration": 2.584401845932007 }, { "auxiliary_loss_clip": 0.01042422, "auxiliary_loss_mlp": 0.01001585, "balance_loss_clip": 0.99998754, "balance_loss_mlp": 1.01093888, "epoch": 0.5829550578686307, "flos": 65962553950080.0, "grad_norm": 0.7686373478304528, "language_loss": 0.54985487, "learning_rate": 1.4848561540830579e-06, "loss": 0.57029498, "num_input_tokens_seen": 209004760, "router_z_loss_clip": 0.01599121, "router_z_loss_mlp": 0.2265625, "step": 9696, "time_per_iteration": 3.278583526611328 }, { "auxiliary_loss_clip": 0.01139698, "auxiliary_loss_mlp": 0.01031194, "balance_loss_clip": 1.0179615, "balance_loss_mlp": 1.03979564, "epoch": 0.5830151811212987, "flos": 16289799079680.0, "grad_norm": 1.9862320236563982, "language_loss": 0.76162791, "learning_rate": 1.4844910890041474e-06, "loss": 0.78333688, "num_input_tokens_seen": 209022930, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 9697, "time_per_iteration": 2.5873465538024902 }, { "auxiliary_loss_clip": 0.01114276, "auxiliary_loss_mlp": 0.01034785, "balance_loss_clip": 1.02053952, "balance_loss_mlp": 1.03828514, "epoch": 0.5830753043739666, "flos": 24168851084160.0, "grad_norm": 1.7547443421911, "language_loss": 0.77879554, "learning_rate": 1.4841260423224239e-06, "loss": 0.80028617, "num_input_tokens_seen": 209043740, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7578125, "step": 9698, "time_per_iteration": 2.5390405654907227 }, { "auxiliary_loss_clip": 0.01148206, "auxiliary_loss_mlp": 0.01039046, "balance_loss_clip": 1.02528906, "balance_loss_mlp": 1.03998089, "epoch": 0.5831354276266346, "flos": 27059050097280.0, "grad_norm": 1.6340677594322928, "language_loss": 0.83622301, "learning_rate": 1.4837610140509144e-06, "loss": 0.85809553, "num_input_tokens_seen": 209068885, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 9699, "time_per_iteration": 4.116661071777344 }, { "auxiliary_loss_clip": 0.01130941, "auxiliary_loss_mlp": 0.01030651, "balance_loss_clip": 1.01770449, "balance_loss_mlp": 1.03825736, "epoch": 0.5831955508793025, "flos": 23623475909760.0, "grad_norm": 2.2941183261197313, "language_loss": 0.66580063, "learning_rate": 1.4833960042026467e-06, "loss": 0.68741655, "num_input_tokens_seen": 209087340, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.74609375, "step": 9700, "time_per_iteration": 2.519625425338745 }, { "auxiliary_loss_clip": 0.01120429, "auxiliary_loss_mlp": 0.01034917, "balance_loss_clip": 1.02142239, "balance_loss_mlp": 1.03745103, "epoch": 0.5832556741319705, "flos": 24973250209920.0, "grad_norm": 1.6679186129128107, "language_loss": 0.71715784, "learning_rate": 1.4830310127906467e-06, "loss": 0.7387113, "num_input_tokens_seen": 209108840, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 9701, "time_per_iteration": 2.597747325897217 }, { "auxiliary_loss_clip": 0.01127457, "auxiliary_loss_mlp": 0.01037216, "balance_loss_clip": 1.02372122, "balance_loss_mlp": 1.03768647, "epoch": 0.5833157973846385, "flos": 23221563655680.0, "grad_norm": 1.632467219320568, "language_loss": 0.85532188, "learning_rate": 1.48266603982794e-06, "loss": 0.87696862, "num_input_tokens_seen": 209127985, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 9702, "time_per_iteration": 2.513369083404541 }, { "auxiliary_loss_clip": 0.01114502, "auxiliary_loss_mlp": 0.01035193, "balance_loss_clip": 1.02311707, "balance_loss_mlp": 1.03518796, "epoch": 0.5833759206373065, "flos": 21977941023360.0, "grad_norm": 1.4485474274776071, "language_loss": 0.78150082, "learning_rate": 1.482301085327552e-06, "loss": 0.80299777, "num_input_tokens_seen": 209146885, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.703125, "step": 9703, "time_per_iteration": 2.612187385559082 }, { "auxiliary_loss_clip": 0.01114773, "auxiliary_loss_mlp": 0.01037633, "balance_loss_clip": 1.02325082, "balance_loss_mlp": 1.039361, "epoch": 0.5834360438899745, "flos": 21762405463680.0, "grad_norm": 2.1391417677681797, "language_loss": 0.71402812, "learning_rate": 1.481936149302506e-06, "loss": 0.73555219, "num_input_tokens_seen": 209166130, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75390625, "step": 9704, "time_per_iteration": 2.5387587547302246 }, { "auxiliary_loss_clip": 0.01129127, "auxiliary_loss_mlp": 0.01031553, "balance_loss_clip": 1.01926804, "balance_loss_mlp": 1.03859305, "epoch": 0.5834961671426424, "flos": 15992566035840.0, "grad_norm": 2.7630300355457367, "language_loss": 0.81264794, "learning_rate": 1.4815712317658271e-06, "loss": 0.83425474, "num_input_tokens_seen": 209183350, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7265625, "step": 9705, "time_per_iteration": 5.566835403442383 }, { "auxiliary_loss_clip": 0.01136085, "auxiliary_loss_mlp": 0.01028184, "balance_loss_clip": 1.01579809, "balance_loss_mlp": 1.03619218, "epoch": 0.5835562903953104, "flos": 22818322598400.0, "grad_norm": 2.8388141036709844, "language_loss": 0.80527174, "learning_rate": 1.4812063327305367e-06, "loss": 0.82691443, "num_input_tokens_seen": 209203945, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7265625, "step": 9706, "time_per_iteration": 2.5397794246673584 }, { "auxiliary_loss_clip": 0.01118908, "auxiliary_loss_mlp": 0.01035153, "balance_loss_clip": 1.02130151, "balance_loss_mlp": 1.03812099, "epoch": 0.5836164136479783, "flos": 48468056624640.0, "grad_norm": 1.917869430670049, "language_loss": 0.74936128, "learning_rate": 1.480841452209658e-06, "loss": 0.77090192, "num_input_tokens_seen": 209227080, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 9707, "time_per_iteration": 2.8099310398101807 }, { "auxiliary_loss_clip": 0.01134537, "auxiliary_loss_mlp": 0.01031302, "balance_loss_clip": 1.01891065, "balance_loss_mlp": 1.03725517, "epoch": 0.5836765369006464, "flos": 26905998245760.0, "grad_norm": 1.7065365488276882, "language_loss": 0.81301248, "learning_rate": 1.4804765902162122e-06, "loss": 0.8346709, "num_input_tokens_seen": 209248170, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 9708, "time_per_iteration": 2.6225101947784424 }, { "auxiliary_loss_clip": 0.01120054, "auxiliary_loss_mlp": 0.0102988, "balance_loss_clip": 1.01671302, "balance_loss_mlp": 1.03992438, "epoch": 0.5837366601533143, "flos": 20084048524800.0, "grad_norm": 1.3780919703100647, "language_loss": 0.78027207, "learning_rate": 1.4801117467632204e-06, "loss": 0.8017714, "num_input_tokens_seen": 209267730, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 9709, "time_per_iteration": 2.60245943069458 }, { "auxiliary_loss_clip": 0.01052624, "auxiliary_loss_mlp": 0.01003055, "balance_loss_clip": 1.00148153, "balance_loss_mlp": 1.01192999, "epoch": 0.5837967834059823, "flos": 65363885971200.0, "grad_norm": 0.7741176413173952, "language_loss": 0.5656991, "learning_rate": 1.4797469218637035e-06, "loss": 0.58625591, "num_input_tokens_seen": 209332510, "router_z_loss_clip": 0.01574707, "router_z_loss_mlp": 0.2265625, "step": 9710, "time_per_iteration": 3.2012014389038086 }, { "auxiliary_loss_clip": 0.01135901, "auxiliary_loss_mlp": 0.01033922, "balance_loss_clip": 1.02174461, "balance_loss_mlp": 1.03790891, "epoch": 0.5838569066586502, "flos": 25338641310720.0, "grad_norm": 1.3472857197892676, "language_loss": 0.6505999, "learning_rate": 1.4793821155306803e-06, "loss": 0.67229807, "num_input_tokens_seen": 209353355, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 9711, "time_per_iteration": 2.622271776199341 }, { "auxiliary_loss_clip": 0.01114995, "auxiliary_loss_mlp": 0.0103646, "balance_loss_clip": 1.02229762, "balance_loss_mlp": 1.03842092, "epoch": 0.5839170299113182, "flos": 22229243550720.0, "grad_norm": 1.781831274455313, "language_loss": 0.78419864, "learning_rate": 1.4790173277771705e-06, "loss": 0.80571318, "num_input_tokens_seen": 209370960, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 9712, "time_per_iteration": 2.541923761367798 }, { "auxiliary_loss_clip": 0.01137589, "auxiliary_loss_mlp": 0.01276546, "balance_loss_clip": 1.01635885, "balance_loss_mlp": 1.0376482, "epoch": 0.5839771531639861, "flos": 22200012858240.0, "grad_norm": 1.5016879553391067, "language_loss": 0.73170114, "learning_rate": 1.4786525586161917e-06, "loss": 0.75584257, "num_input_tokens_seen": 209390955, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 9713, "time_per_iteration": 2.5853593349456787 }, { "auxiliary_loss_clip": 0.01131899, "auxiliary_loss_mlp": 0.01033004, "balance_loss_clip": 1.01891935, "balance_loss_mlp": 1.03771019, "epoch": 0.5840372764166541, "flos": 22419355259520.0, "grad_norm": 1.8417645041878772, "language_loss": 0.6927222, "learning_rate": 1.4782878080607627e-06, "loss": 0.7143712, "num_input_tokens_seen": 209410260, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 9714, "time_per_iteration": 2.618467092514038 }, { "auxiliary_loss_clip": 0.01112085, "auxiliary_loss_mlp": 0.01034853, "balance_loss_clip": 1.02156091, "balance_loss_mlp": 1.0371933, "epoch": 0.5840973996693221, "flos": 19828256797440.0, "grad_norm": 1.6224659196393365, "language_loss": 0.80124223, "learning_rate": 1.4779230761238997e-06, "loss": 0.82271165, "num_input_tokens_seen": 209429920, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 9715, "time_per_iteration": 2.5387628078460693 }, { "auxiliary_loss_clip": 0.01128248, "auxiliary_loss_mlp": 0.01035445, "balance_loss_clip": 1.02133679, "balance_loss_mlp": 1.03783226, "epoch": 0.5841575229219901, "flos": 21142982401920.0, "grad_norm": 1.9090993693971428, "language_loss": 0.72735417, "learning_rate": 1.4775583628186184e-06, "loss": 0.74899113, "num_input_tokens_seen": 209449470, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7265625, "step": 9716, "time_per_iteration": 2.507636070251465 }, { "auxiliary_loss_clip": 0.01125053, "auxiliary_loss_mlp": 0.01031252, "balance_loss_clip": 1.01869309, "balance_loss_mlp": 1.03686929, "epoch": 0.5842176461746581, "flos": 24640322025600.0, "grad_norm": 1.642822375803166, "language_loss": 0.75023246, "learning_rate": 1.477193668157936e-06, "loss": 0.77179545, "num_input_tokens_seen": 209467695, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 9717, "time_per_iteration": 2.5959103107452393 }, { "auxiliary_loss_clip": 0.01118435, "auxiliary_loss_mlp": 0.01036713, "balance_loss_clip": 1.02339101, "balance_loss_mlp": 1.03710866, "epoch": 0.584277769427326, "flos": 19131158574720.0, "grad_norm": 2.1858904421729832, "language_loss": 0.79992461, "learning_rate": 1.4768289921548665e-06, "loss": 0.8214761, "num_input_tokens_seen": 209484250, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 9718, "time_per_iteration": 2.4765915870666504 }, { "auxiliary_loss_clip": 0.01128635, "auxiliary_loss_mlp": 0.01034877, "balance_loss_clip": 1.02147746, "balance_loss_mlp": 1.03898478, "epoch": 0.584337892679994, "flos": 22675111073280.0, "grad_norm": 1.5679749308710325, "language_loss": 0.67336464, "learning_rate": 1.4764643348224247e-06, "loss": 0.69499975, "num_input_tokens_seen": 209502830, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 9719, "time_per_iteration": 2.68032169342041 }, { "auxiliary_loss_clip": 0.01111653, "auxiliary_loss_mlp": 0.01031314, "balance_loss_clip": 1.01812971, "balance_loss_mlp": 1.03744864, "epoch": 0.5843980159326619, "flos": 31284083352960.0, "grad_norm": 1.8052399729265756, "language_loss": 0.75572336, "learning_rate": 1.4760996961736245e-06, "loss": 0.77715302, "num_input_tokens_seen": 209525995, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 9720, "time_per_iteration": 2.6136868000030518 }, { "auxiliary_loss_clip": 0.0113678, "auxiliary_loss_mlp": 0.01036211, "balance_loss_clip": 1.02362275, "balance_loss_mlp": 1.03625929, "epoch": 0.58445813918533, "flos": 22748117466240.0, "grad_norm": 1.599138411546241, "language_loss": 0.83116788, "learning_rate": 1.4757350762214778e-06, "loss": 0.85289776, "num_input_tokens_seen": 209545895, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 9721, "time_per_iteration": 2.6088979244232178 }, { "auxiliary_loss_clip": 0.01035108, "auxiliary_loss_mlp": 0.01006464, "balance_loss_clip": 1.00477111, "balance_loss_mlp": 1.01248431, "epoch": 0.5845182624379979, "flos": 60686556658560.0, "grad_norm": 0.9588529885970328, "language_loss": 0.71284342, "learning_rate": 1.4753704749789976e-06, "loss": 0.7332592, "num_input_tokens_seen": 209602315, "router_z_loss_clip": 0.01696777, "router_z_loss_mlp": 0.2265625, "step": 9722, "time_per_iteration": 3.1667723655700684 }, { "auxiliary_loss_clip": 0.0111755, "auxiliary_loss_mlp": 0.01034532, "balance_loss_clip": 1.02205122, "balance_loss_mlp": 1.03658342, "epoch": 0.5845783856906659, "flos": 16362446336640.0, "grad_norm": 2.134249972148915, "language_loss": 0.88796788, "learning_rate": 1.4750058924591957e-06, "loss": 0.90948874, "num_input_tokens_seen": 209617615, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 9723, "time_per_iteration": 2.4927845001220703 }, { "auxiliary_loss_clip": 0.01128359, "auxiliary_loss_mlp": 0.01034572, "balance_loss_clip": 1.02228141, "balance_loss_mlp": 1.03919482, "epoch": 0.5846385089433338, "flos": 20083402080000.0, "grad_norm": 1.4394278227527366, "language_loss": 0.68436885, "learning_rate": 1.4746413286750836e-06, "loss": 0.70599812, "num_input_tokens_seen": 209637005, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71484375, "step": 9724, "time_per_iteration": 2.574505090713501 }, { "auxiliary_loss_clip": 0.01123187, "auxiliary_loss_mlp": 0.01034326, "balance_loss_clip": 1.02087915, "balance_loss_mlp": 1.040133, "epoch": 0.5846986321960018, "flos": 17311062568320.0, "grad_norm": 1.8167062975069517, "language_loss": 0.86022794, "learning_rate": 1.474276783639671e-06, "loss": 0.88180304, "num_input_tokens_seen": 209653170, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 9725, "time_per_iteration": 2.5599679946899414 }, { "auxiliary_loss_clip": 0.01132979, "auxiliary_loss_mlp": 0.01038327, "balance_loss_clip": 1.02548182, "balance_loss_mlp": 1.03971362, "epoch": 0.5847587554486697, "flos": 17197907748480.0, "grad_norm": 1.739212134114045, "language_loss": 0.83015919, "learning_rate": 1.473912257365967e-06, "loss": 0.85187221, "num_input_tokens_seen": 209671275, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7578125, "step": 9726, "time_per_iteration": 2.6008875370025635 }, { "auxiliary_loss_clip": 0.01156723, "auxiliary_loss_mlp": 0.0127962, "balance_loss_clip": 1.02032995, "balance_loss_mlp": 1.03940475, "epoch": 0.5848188787013378, "flos": 24529106540160.0, "grad_norm": 1.8025712912802898, "language_loss": 0.6681304, "learning_rate": 1.4735477498669817e-06, "loss": 0.69249386, "num_input_tokens_seen": 209690380, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7265625, "step": 9727, "time_per_iteration": 2.668973684310913 }, { "auxiliary_loss_clip": 0.01139561, "auxiliary_loss_mlp": 0.01042195, "balance_loss_clip": 1.028844, "balance_loss_mlp": 1.03968978, "epoch": 0.5848790019540057, "flos": 20628382204800.0, "grad_norm": 1.8580204277676784, "language_loss": 0.81542718, "learning_rate": 1.4731832611557229e-06, "loss": 0.83724475, "num_input_tokens_seen": 209708845, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 9728, "time_per_iteration": 2.6209423542022705 }, { "auxiliary_loss_clip": 0.01117412, "auxiliary_loss_mlp": 0.01035981, "balance_loss_clip": 1.02372074, "balance_loss_mlp": 1.0377388, "epoch": 0.5849391252066737, "flos": 22418852469120.0, "grad_norm": 2.03396005221417, "language_loss": 0.779396, "learning_rate": 1.4728187912451987e-06, "loss": 0.8009299, "num_input_tokens_seen": 209729000, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 9729, "time_per_iteration": 2.5563900470733643 }, { "auxiliary_loss_clip": 0.01121044, "auxiliary_loss_mlp": 0.01031078, "balance_loss_clip": 1.01741064, "balance_loss_mlp": 1.03791916, "epoch": 0.5849992484593417, "flos": 25410929431680.0, "grad_norm": 1.8475713759573988, "language_loss": 0.70340359, "learning_rate": 1.4724543401484155e-06, "loss": 0.7249248, "num_input_tokens_seen": 209747435, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 9730, "time_per_iteration": 2.6098382472991943 }, { "auxiliary_loss_clip": 0.01115862, "auxiliary_loss_mlp": 0.01035516, "balance_loss_clip": 1.02092445, "balance_loss_mlp": 1.03948212, "epoch": 0.5850593717120096, "flos": 21065163586560.0, "grad_norm": 2.018919004388271, "language_loss": 0.7843796, "learning_rate": 1.4720899078783797e-06, "loss": 0.80589336, "num_input_tokens_seen": 209764910, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 9731, "time_per_iteration": 2.535062551498413 }, { "auxiliary_loss_clip": 0.01119292, "auxiliary_loss_mlp": 0.01036305, "balance_loss_clip": 1.02270317, "balance_loss_mlp": 1.03759241, "epoch": 0.5851194949646776, "flos": 25301545539840.0, "grad_norm": 1.953379314726703, "language_loss": 0.70137548, "learning_rate": 1.4717254944480978e-06, "loss": 0.72293139, "num_input_tokens_seen": 209786115, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 9732, "time_per_iteration": 2.567009449005127 }, { "auxiliary_loss_clip": 0.01151444, "auxiliary_loss_mlp": 0.01035061, "balance_loss_clip": 1.0204823, "balance_loss_mlp": 1.03941703, "epoch": 0.5851796182173455, "flos": 23587242065280.0, "grad_norm": 4.41340980722562, "language_loss": 0.52844542, "learning_rate": 1.471361099870573e-06, "loss": 0.55031049, "num_input_tokens_seen": 209806095, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 9733, "time_per_iteration": 2.5498218536376953 }, { "auxiliary_loss_clip": 0.01121029, "auxiliary_loss_mlp": 0.0103264, "balance_loss_clip": 1.01928234, "balance_loss_mlp": 1.03866768, "epoch": 0.5852397414700136, "flos": 24822712310400.0, "grad_norm": 1.9536722766265042, "language_loss": 0.87671781, "learning_rate": 1.4709967241588116e-06, "loss": 0.89825451, "num_input_tokens_seen": 209823650, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 9734, "time_per_iteration": 2.5707502365112305 }, { "auxiliary_loss_clip": 0.01125633, "auxiliary_loss_mlp": 0.01036221, "balance_loss_clip": 1.02397251, "balance_loss_mlp": 1.03923607, "epoch": 0.5852998647226815, "flos": 19937784343680.0, "grad_norm": 1.6349370934776666, "language_loss": 0.72109151, "learning_rate": 1.4706323673258165e-06, "loss": 0.74271005, "num_input_tokens_seen": 209843220, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 9735, "time_per_iteration": 3.887098550796509 }, { "auxiliary_loss_clip": 0.01123822, "auxiliary_loss_mlp": 0.01043397, "balance_loss_clip": 1.02902007, "balance_loss_mlp": 1.03899002, "epoch": 0.5853599879753495, "flos": 16720367408640.0, "grad_norm": 3.9073308013329195, "language_loss": 0.74383664, "learning_rate": 1.4702680293845901e-06, "loss": 0.76550877, "num_input_tokens_seen": 209854880, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.76171875, "step": 9736, "time_per_iteration": 2.5726265907287598 }, { "auxiliary_loss_clip": 0.01135585, "auxiliary_loss_mlp": 0.01032462, "balance_loss_clip": 1.01952147, "balance_loss_mlp": 1.03691483, "epoch": 0.5854201112280174, "flos": 44456583680640.0, "grad_norm": 23.04207590087794, "language_loss": 0.70255995, "learning_rate": 1.4699037103481356e-06, "loss": 0.7242403, "num_input_tokens_seen": 209877870, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 9737, "time_per_iteration": 2.809699773788452 }, { "auxiliary_loss_clip": 0.01135486, "auxiliary_loss_mlp": 0.01039319, "balance_loss_clip": 1.0267005, "balance_loss_mlp": 1.0381366, "epoch": 0.5854802344806854, "flos": 20339193807360.0, "grad_norm": 1.6990872167058924, "language_loss": 0.82809496, "learning_rate": 1.469539410229453e-06, "loss": 0.84984303, "num_input_tokens_seen": 209896690, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 9738, "time_per_iteration": 2.574111223220825 }, { "auxiliary_loss_clip": 0.01115879, "auxiliary_loss_mlp": 0.0128562, "balance_loss_clip": 1.02441859, "balance_loss_mlp": 1.03947914, "epoch": 0.5855403577333533, "flos": 20921054221440.0, "grad_norm": 1.762303087886391, "language_loss": 0.68248498, "learning_rate": 1.4691751290415454e-06, "loss": 0.70649993, "num_input_tokens_seen": 209914640, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 9739, "time_per_iteration": 2.5473718643188477 }, { "auxiliary_loss_clip": 0.01111386, "auxiliary_loss_mlp": 0.01028014, "balance_loss_clip": 1.01422191, "balance_loss_mlp": 1.03622913, "epoch": 0.5856004809860214, "flos": 20448649526400.0, "grad_norm": 2.4124637487550715, "language_loss": 0.59063184, "learning_rate": 1.4688108667974115e-06, "loss": 0.61202586, "num_input_tokens_seen": 209933375, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 9740, "time_per_iteration": 3.9209585189819336 }, { "auxiliary_loss_clip": 0.01127474, "auxiliary_loss_mlp": 0.01030282, "balance_loss_clip": 1.01709795, "balance_loss_mlp": 1.03655231, "epoch": 0.5856606042386893, "flos": 19640766781440.0, "grad_norm": 1.7268561724068403, "language_loss": 0.75281489, "learning_rate": 1.4684466235100517e-06, "loss": 0.77439243, "num_input_tokens_seen": 209952055, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 9741, "time_per_iteration": 2.5377936363220215 }, { "auxiliary_loss_clip": 0.01140043, "auxiliary_loss_mlp": 0.01034885, "balance_loss_clip": 1.0222429, "balance_loss_mlp": 1.03945374, "epoch": 0.5857207274913573, "flos": 21686166846720.0, "grad_norm": 3.1473464466367562, "language_loss": 0.75858396, "learning_rate": 1.4680823991924645e-06, "loss": 0.78033322, "num_input_tokens_seen": 209971190, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73828125, "step": 9742, "time_per_iteration": 2.525479555130005 }, { "auxiliary_loss_clip": 0.01127066, "auxiliary_loss_mlp": 0.01029341, "balance_loss_clip": 1.01665163, "balance_loss_mlp": 1.03789163, "epoch": 0.5857808507440253, "flos": 23182708118400.0, "grad_norm": 1.6511074565060067, "language_loss": 0.75037551, "learning_rate": 1.4677181938576477e-06, "loss": 0.77193952, "num_input_tokens_seen": 209990695, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 9743, "time_per_iteration": 2.62455153465271 }, { "auxiliary_loss_clip": 0.01125519, "auxiliary_loss_mlp": 0.010293, "balance_loss_clip": 1.01690805, "balance_loss_mlp": 1.03801465, "epoch": 0.5858409739966932, "flos": 27235299156480.0, "grad_norm": 2.110758432819296, "language_loss": 0.81280255, "learning_rate": 1.4673540075186002e-06, "loss": 0.83435065, "num_input_tokens_seen": 210010210, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 9744, "time_per_iteration": 2.578939914703369 }, { "auxiliary_loss_clip": 0.01133467, "auxiliary_loss_mlp": 0.01030927, "balance_loss_clip": 1.01805878, "balance_loss_mlp": 1.03943491, "epoch": 0.5859010972493612, "flos": 27855512317440.0, "grad_norm": 1.6038689613713344, "language_loss": 0.72005928, "learning_rate": 1.4669898401883171e-06, "loss": 0.74170321, "num_input_tokens_seen": 210030030, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7578125, "step": 9745, "time_per_iteration": 2.5910556316375732 }, { "auxiliary_loss_clip": 0.01053138, "auxiliary_loss_mlp": 0.01004482, "balance_loss_clip": 1.00289679, "balance_loss_mlp": 1.01288259, "epoch": 0.5859612205020291, "flos": 70007064428160.0, "grad_norm": 0.7312386698329127, "language_loss": 0.53284442, "learning_rate": 1.4666256918797964e-06, "loss": 0.5534206, "num_input_tokens_seen": 210094840, "router_z_loss_clip": 0.01586914, "router_z_loss_mlp": 0.22851562, "step": 9746, "time_per_iteration": 4.695799350738525 }, { "auxiliary_loss_clip": 0.01131593, "auxiliary_loss_mlp": 0.01035102, "balance_loss_clip": 1.02112448, "balance_loss_mlp": 1.03965461, "epoch": 0.5860213437546972, "flos": 24056019486720.0, "grad_norm": 1.9717560882312652, "language_loss": 0.73223811, "learning_rate": 1.4662615626060325e-06, "loss": 0.75390506, "num_input_tokens_seen": 210114660, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7421875, "step": 9747, "time_per_iteration": 4.0150673389434814 }, { "auxiliary_loss_clip": 0.01131915, "auxiliary_loss_mlp": 0.01032584, "balance_loss_clip": 1.01922679, "balance_loss_mlp": 1.04044127, "epoch": 0.5860814670073651, "flos": 18947583141120.0, "grad_norm": 2.360085468392693, "language_loss": 0.81195247, "learning_rate": 1.4658974523800202e-06, "loss": 0.83359754, "num_input_tokens_seen": 210132770, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 9748, "time_per_iteration": 2.557753801345825 }, { "auxiliary_loss_clip": 0.0112144, "auxiliary_loss_mlp": 0.01033634, "balance_loss_clip": 1.0201993, "balance_loss_mlp": 1.03821087, "epoch": 0.5861415902600331, "flos": 22561848512640.0, "grad_norm": 1.6683770945236371, "language_loss": 0.71860647, "learning_rate": 1.4655333612147542e-06, "loss": 0.74015725, "num_input_tokens_seen": 210151895, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 9749, "time_per_iteration": 2.7005996704101562 }, { "auxiliary_loss_clip": 0.01128503, "auxiliary_loss_mlp": 0.01031047, "balance_loss_clip": 1.01797533, "balance_loss_mlp": 1.04015338, "epoch": 0.586201713512701, "flos": 14392027912320.0, "grad_norm": 2.2693615469584296, "language_loss": 0.7482419, "learning_rate": 1.4651692891232279e-06, "loss": 0.76983738, "num_input_tokens_seen": 210168040, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 9750, "time_per_iteration": 2.525620222091675 }, { "auxiliary_loss_clip": 0.01137096, "auxiliary_loss_mlp": 0.01036885, "balance_loss_clip": 1.02269888, "balance_loss_mlp": 1.0381825, "epoch": 0.586261836765369, "flos": 19498560837120.0, "grad_norm": 1.6258183491559948, "language_loss": 0.70722169, "learning_rate": 1.4648052361184337e-06, "loss": 0.72896147, "num_input_tokens_seen": 210187720, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7265625, "step": 9751, "time_per_iteration": 2.6604413986206055 }, { "auxiliary_loss_clip": 0.01114487, "auxiliary_loss_mlp": 0.01039923, "balance_loss_clip": 1.02526069, "balance_loss_mlp": 1.03922069, "epoch": 0.5863219600180369, "flos": 20701819560960.0, "grad_norm": 1.7255279830794852, "language_loss": 0.74449265, "learning_rate": 1.4644412022133637e-06, "loss": 0.76603675, "num_input_tokens_seen": 210206080, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75390625, "step": 9752, "time_per_iteration": 2.539868116378784 }, { "auxiliary_loss_clip": 0.0111949, "auxiliary_loss_mlp": 0.01033628, "balance_loss_clip": 1.02003241, "balance_loss_mlp": 1.03779697, "epoch": 0.586382083270705, "flos": 19792130693760.0, "grad_norm": 1.8453308445954135, "language_loss": 0.77319926, "learning_rate": 1.4640771874210101e-06, "loss": 0.79473042, "num_input_tokens_seen": 210225660, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 9753, "time_per_iteration": 2.566762685775757 }, { "auxiliary_loss_clip": 0.01108786, "auxiliary_loss_mlp": 0.010275, "balance_loss_clip": 1.01496458, "balance_loss_mlp": 1.03947353, "epoch": 0.5864422065233729, "flos": 16500558130560.0, "grad_norm": 1.6887201697801584, "language_loss": 0.7120471, "learning_rate": 1.4637131917543628e-06, "loss": 0.73340994, "num_input_tokens_seen": 210242725, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 9754, "time_per_iteration": 2.4488558769226074 }, { "auxiliary_loss_clip": 0.01127929, "auxiliary_loss_mlp": 0.01032202, "balance_loss_clip": 1.01844501, "balance_loss_mlp": 1.03750467, "epoch": 0.5865023297760409, "flos": 20413277608320.0, "grad_norm": 1.8931228763774177, "language_loss": 0.72249913, "learning_rate": 1.4633492152264123e-06, "loss": 0.74410045, "num_input_tokens_seen": 210263225, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 9755, "time_per_iteration": 2.6184206008911133 }, { "auxiliary_loss_clip": 0.01061489, "auxiliary_loss_mlp": 0.01005554, "balance_loss_clip": 1.00400436, "balance_loss_mlp": 1.01273513, "epoch": 0.5865624530287089, "flos": 63350769254400.0, "grad_norm": 0.7417594970215596, "language_loss": 0.56893748, "learning_rate": 1.462985257850148e-06, "loss": 0.58960789, "num_input_tokens_seen": 210322310, "router_z_loss_clip": 0.01550293, "router_z_loss_mlp": 0.22851562, "step": 9756, "time_per_iteration": 3.1594290733337402 }, { "auxiliary_loss_clip": 0.01107768, "auxiliary_loss_mlp": 0.01035666, "balance_loss_clip": 1.02320242, "balance_loss_mlp": 1.03699636, "epoch": 0.5866225762813768, "flos": 27016279977600.0, "grad_norm": 1.8240625700706923, "language_loss": 0.76125586, "learning_rate": 1.4626213196385577e-06, "loss": 0.78269017, "num_input_tokens_seen": 210340845, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.70703125, "step": 9757, "time_per_iteration": 2.6081271171569824 }, { "auxiliary_loss_clip": 0.01053509, "auxiliary_loss_mlp": 0.01002083, "balance_loss_clip": 1.0004977, "balance_loss_mlp": 1.01258731, "epoch": 0.5866826995340448, "flos": 72987038507520.0, "grad_norm": 0.8482455842992024, "language_loss": 0.60525155, "learning_rate": 1.462257400604631e-06, "loss": 0.62580746, "num_input_tokens_seen": 210397815, "router_z_loss_clip": 0.01586914, "router_z_loss_mlp": 0.23046875, "step": 9758, "time_per_iteration": 3.1624491214752197 }, { "auxiliary_loss_clip": 0.01114926, "auxiliary_loss_mlp": 0.01033579, "balance_loss_clip": 1.01985765, "balance_loss_mlp": 1.03838563, "epoch": 0.5867428227867127, "flos": 21285727050240.0, "grad_norm": 2.0420430047497553, "language_loss": 0.71519589, "learning_rate": 1.4618935007613544e-06, "loss": 0.73668087, "num_input_tokens_seen": 210413900, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 9759, "time_per_iteration": 2.5276403427124023 }, { "auxiliary_loss_clip": 0.01129504, "auxiliary_loss_mlp": 0.01034695, "balance_loss_clip": 1.02205849, "balance_loss_mlp": 1.03915405, "epoch": 0.5868029460393808, "flos": 33468852188160.0, "grad_norm": 1.4917091080839582, "language_loss": 0.72758383, "learning_rate": 1.461529620121714e-06, "loss": 0.7492258, "num_input_tokens_seen": 210434110, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 9760, "time_per_iteration": 2.661268711090088 }, { "auxiliary_loss_clip": 0.01117953, "auxiliary_loss_mlp": 0.01030059, "balance_loss_clip": 1.01670706, "balance_loss_mlp": 1.03680968, "epoch": 0.5868630692920487, "flos": 17889475276800.0, "grad_norm": 2.1536664317530807, "language_loss": 0.72860157, "learning_rate": 1.461165758698697e-06, "loss": 0.75008166, "num_input_tokens_seen": 210451685, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 9761, "time_per_iteration": 2.5748484134674072 }, { "auxiliary_loss_clip": 0.01129253, "auxiliary_loss_mlp": 0.0103338, "balance_loss_clip": 1.01844263, "balance_loss_mlp": 1.03616393, "epoch": 0.5869231925447167, "flos": 21035035054080.0, "grad_norm": 1.6879959129399555, "language_loss": 0.74653172, "learning_rate": 1.4608019165052876e-06, "loss": 0.76815796, "num_input_tokens_seen": 210470825, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.75390625, "step": 9762, "time_per_iteration": 2.6094043254852295 }, { "auxiliary_loss_clip": 0.01137084, "auxiliary_loss_mlp": 0.01032641, "balance_loss_clip": 1.01883638, "balance_loss_mlp": 1.03936541, "epoch": 0.5869833157973846, "flos": 74738219293440.0, "grad_norm": 1.3060469926923333, "language_loss": 0.72001886, "learning_rate": 1.4604380935544712e-06, "loss": 0.74171615, "num_input_tokens_seen": 210500075, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7109375, "step": 9763, "time_per_iteration": 2.9799153804779053 }, { "auxiliary_loss_clip": 0.01118857, "auxiliary_loss_mlp": 0.01031257, "balance_loss_clip": 1.01940215, "balance_loss_mlp": 1.03788912, "epoch": 0.5870434390500526, "flos": 17638998762240.0, "grad_norm": 1.5145233104859734, "language_loss": 0.80061889, "learning_rate": 1.4600742898592313e-06, "loss": 0.82212007, "num_input_tokens_seen": 210518150, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.71875, "step": 9764, "time_per_iteration": 2.6246676445007324 }, { "auxiliary_loss_clip": 0.01131402, "auxiliary_loss_mlp": 0.0103551, "balance_loss_clip": 1.02236128, "balance_loss_mlp": 1.04057372, "epoch": 0.5871035623027205, "flos": 21506146859520.0, "grad_norm": 1.567174150959763, "language_loss": 0.79032892, "learning_rate": 1.4597105054325512e-06, "loss": 0.81199801, "num_input_tokens_seen": 210537760, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 9765, "time_per_iteration": 2.5480446815490723 }, { "auxiliary_loss_clip": 0.01127752, "auxiliary_loss_mlp": 0.01034234, "balance_loss_clip": 1.02094197, "balance_loss_mlp": 1.03896332, "epoch": 0.5871636855553886, "flos": 13551861818880.0, "grad_norm": 1.6526122074375156, "language_loss": 0.83522934, "learning_rate": 1.4593467402874132e-06, "loss": 0.85684919, "num_input_tokens_seen": 210555515, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 9766, "time_per_iteration": 2.5648059844970703 }, { "auxiliary_loss_clip": 0.01122634, "auxiliary_loss_mlp": 0.0103692, "balance_loss_clip": 1.02387285, "balance_loss_mlp": 1.03805292, "epoch": 0.5872238088080565, "flos": 26212922346240.0, "grad_norm": 1.564684839075645, "language_loss": 0.69661987, "learning_rate": 1.4589829944367989e-06, "loss": 0.71821541, "num_input_tokens_seen": 210575000, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7578125, "step": 9767, "time_per_iteration": 2.567537307739258 }, { "auxiliary_loss_clip": 0.01120895, "auxiliary_loss_mlp": 0.01282326, "balance_loss_clip": 1.02151275, "balance_loss_mlp": 1.0369153, "epoch": 0.5872839320607245, "flos": 30665198995200.0, "grad_norm": 2.0384336270423637, "language_loss": 0.6298542, "learning_rate": 1.4586192678936903e-06, "loss": 0.65388632, "num_input_tokens_seen": 210595185, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 9768, "time_per_iteration": 2.6326842308044434 }, { "auxiliary_loss_clip": 0.01035055, "auxiliary_loss_mlp": 0.01248122, "balance_loss_clip": 1.00078666, "balance_loss_mlp": 1.01202714, "epoch": 0.5873440553133924, "flos": 60303570871680.0, "grad_norm": 0.718604355143901, "language_loss": 0.53873301, "learning_rate": 1.4582555606710676e-06, "loss": 0.5615648, "num_input_tokens_seen": 210653210, "router_z_loss_clip": 0.01538086, "router_z_loss_mlp": 0.23046875, "step": 9769, "time_per_iteration": 3.1507058143615723 }, { "auxiliary_loss_clip": 0.0113525, "auxiliary_loss_mlp": 0.01036015, "balance_loss_clip": 1.0229969, "balance_loss_mlp": 1.03756571, "epoch": 0.5874041785660604, "flos": 21539292134400.0, "grad_norm": 1.4262393219452196, "language_loss": 0.70947069, "learning_rate": 1.4578918727819099e-06, "loss": 0.73118329, "num_input_tokens_seen": 210673750, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 9770, "time_per_iteration": 2.619047164916992 }, { "auxiliary_loss_clip": 0.01144285, "auxiliary_loss_mlp": 0.01034601, "balance_loss_clip": 1.02086782, "balance_loss_mlp": 1.03729403, "epoch": 0.5874643018187284, "flos": 24388947671040.0, "grad_norm": 1.8916672596767006, "language_loss": 0.67535055, "learning_rate": 1.457528204239197e-06, "loss": 0.69713938, "num_input_tokens_seen": 210692960, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 9771, "time_per_iteration": 2.662313222885132 }, { "auxiliary_loss_clip": 0.01146392, "auxiliary_loss_mlp": 0.01036899, "balance_loss_clip": 1.0224328, "balance_loss_mlp": 1.03620481, "epoch": 0.5875244250713964, "flos": 28147717457280.0, "grad_norm": 1.4118516626388584, "language_loss": 0.65856957, "learning_rate": 1.4571645550559068e-06, "loss": 0.68040252, "num_input_tokens_seen": 210714040, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.74609375, "step": 9772, "time_per_iteration": 2.6726789474487305 }, { "auxiliary_loss_clip": 0.01115496, "auxiliary_loss_mlp": 0.01044374, "balance_loss_clip": 1.02923429, "balance_loss_mlp": 1.03980613, "epoch": 0.5875845483240644, "flos": 25812410722560.0, "grad_norm": 1.7209785775157682, "language_loss": 0.74554062, "learning_rate": 1.4568009252450177e-06, "loss": 0.76713932, "num_input_tokens_seen": 210733710, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7578125, "step": 9773, "time_per_iteration": 2.599635124206543 }, { "auxiliary_loss_clip": 0.01138782, "auxiliary_loss_mlp": 0.01041594, "balance_loss_clip": 1.0270977, "balance_loss_mlp": 1.03865623, "epoch": 0.5876446715767323, "flos": 26906572863360.0, "grad_norm": 1.9322275757168081, "language_loss": 0.52859783, "learning_rate": 1.456437314819506e-06, "loss": 0.55040157, "num_input_tokens_seen": 210753580, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.73828125, "step": 9774, "time_per_iteration": 2.57893967628479 }, { "auxiliary_loss_clip": 0.01121607, "auxiliary_loss_mlp": 0.0103949, "balance_loss_clip": 1.02479136, "balance_loss_mlp": 1.0394454, "epoch": 0.5877047948294003, "flos": 36684832579200.0, "grad_norm": 4.06692378920988, "language_loss": 0.6486969, "learning_rate": 1.456073723792349e-06, "loss": 0.67030787, "num_input_tokens_seen": 210773495, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.734375, "step": 9775, "time_per_iteration": 2.685518503189087 }, { "auxiliary_loss_clip": 0.01120422, "auxiliary_loss_mlp": 0.01033002, "balance_loss_clip": 1.01948977, "balance_loss_mlp": 1.03806221, "epoch": 0.5877649180820682, "flos": 26724721282560.0, "grad_norm": 1.697830507431523, "language_loss": 0.73205924, "learning_rate": 1.4557101521765211e-06, "loss": 0.75359356, "num_input_tokens_seen": 210793645, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 9776, "time_per_iteration": 2.5312695503234863 }, { "auxiliary_loss_clip": 0.01110897, "auxiliary_loss_mlp": 0.01033863, "balance_loss_clip": 1.02056527, "balance_loss_mlp": 1.03763998, "epoch": 0.5878250413347362, "flos": 21032197879680.0, "grad_norm": 1.6944424390943118, "language_loss": 0.7400468, "learning_rate": 1.4553465999849977e-06, "loss": 0.7614944, "num_input_tokens_seen": 210813415, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 9777, "time_per_iteration": 3.940661668777466 }, { "auxiliary_loss_clip": 0.0113512, "auxiliary_loss_mlp": 0.01035104, "balance_loss_clip": 1.02256322, "balance_loss_mlp": 1.03923273, "epoch": 0.5878851645874041, "flos": 25484259047040.0, "grad_norm": 1.461938960063886, "language_loss": 0.74418545, "learning_rate": 1.4549830672307533e-06, "loss": 0.76588774, "num_input_tokens_seen": 210833850, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 9778, "time_per_iteration": 2.693740129470825 }, { "auxiliary_loss_clip": 0.01135345, "auxiliary_loss_mlp": 0.01030741, "balance_loss_clip": 1.0179317, "balance_loss_mlp": 1.03634632, "epoch": 0.5879452878400722, "flos": 23769129559680.0, "grad_norm": 2.0176641954252754, "language_loss": 0.70069873, "learning_rate": 1.454619553926761e-06, "loss": 0.72235954, "num_input_tokens_seen": 210853115, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 9779, "time_per_iteration": 2.6387197971343994 }, { "auxiliary_loss_clip": 0.01112669, "auxiliary_loss_mlp": 0.01281787, "balance_loss_clip": 1.02143383, "balance_loss_mlp": 1.03770661, "epoch": 0.5880054110927401, "flos": 17824513530240.0, "grad_norm": 2.223153806299893, "language_loss": 0.6670562, "learning_rate": 1.4542560600859949e-06, "loss": 0.69100082, "num_input_tokens_seen": 210872090, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 9780, "time_per_iteration": 2.561459541320801 }, { "auxiliary_loss_clip": 0.01130848, "auxiliary_loss_mlp": 0.01037577, "balance_loss_clip": 1.0238322, "balance_loss_mlp": 1.03961587, "epoch": 0.5880655343454081, "flos": 19463404400640.0, "grad_norm": 1.930134486105735, "language_loss": 0.72320211, "learning_rate": 1.4538925857214256e-06, "loss": 0.74488634, "num_input_tokens_seen": 210888490, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 9781, "time_per_iteration": 2.562002420425415 }, { "auxiliary_loss_clip": 0.01127274, "auxiliary_loss_mlp": 0.01278892, "balance_loss_clip": 1.01851952, "balance_loss_mlp": 1.03661811, "epoch": 0.588125657598076, "flos": 21397588980480.0, "grad_norm": 1.3632145557534063, "language_loss": 0.70478904, "learning_rate": 1.453529130846025e-06, "loss": 0.72885072, "num_input_tokens_seen": 210908220, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 9782, "time_per_iteration": 4.0728864669799805 }, { "auxiliary_loss_clip": 0.01121366, "auxiliary_loss_mlp": 0.01032358, "balance_loss_clip": 1.01890564, "balance_loss_mlp": 1.03669381, "epoch": 0.588185780850744, "flos": 16034653797120.0, "grad_norm": 2.7524738783844915, "language_loss": 0.70164216, "learning_rate": 1.4531656954727641e-06, "loss": 0.72317946, "num_input_tokens_seen": 210923945, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 9783, "time_per_iteration": 2.4809489250183105 }, { "auxiliary_loss_clip": 0.01139682, "auxiliary_loss_mlp": 0.0103551, "balance_loss_clip": 1.0210619, "balance_loss_mlp": 1.03824925, "epoch": 0.588245904103412, "flos": 23728226947200.0, "grad_norm": 1.8546414649653404, "language_loss": 0.68991691, "learning_rate": 1.4528022796146128e-06, "loss": 0.71166885, "num_input_tokens_seen": 210941955, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.74609375, "step": 9784, "time_per_iteration": 2.6676228046417236 }, { "auxiliary_loss_clip": 0.011476, "auxiliary_loss_mlp": 0.01033413, "balance_loss_clip": 1.02007985, "balance_loss_mlp": 1.03674531, "epoch": 0.58830602735608, "flos": 33802534558080.0, "grad_norm": 2.1356224013474754, "language_loss": 0.69234717, "learning_rate": 1.452438883284541e-06, "loss": 0.71415722, "num_input_tokens_seen": 210963105, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 9785, "time_per_iteration": 2.6735713481903076 }, { "auxiliary_loss_clip": 0.01117055, "auxiliary_loss_mlp": 0.01027904, "balance_loss_clip": 1.01541734, "balance_loss_mlp": 1.03725457, "epoch": 0.588366150608748, "flos": 17090714586240.0, "grad_norm": 1.9531317436520634, "language_loss": 0.77280545, "learning_rate": 1.4520755064955165e-06, "loss": 0.79425508, "num_input_tokens_seen": 210978720, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 9786, "time_per_iteration": 2.6028568744659424 }, { "auxiliary_loss_clip": 0.01130345, "auxiliary_loss_mlp": 0.01034144, "balance_loss_clip": 1.02063727, "balance_loss_mlp": 1.03801394, "epoch": 0.5884262738614159, "flos": 22127186033280.0, "grad_norm": 1.3651001023220155, "language_loss": 0.79015368, "learning_rate": 1.4517121492605075e-06, "loss": 0.81179857, "num_input_tokens_seen": 210998750, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 9787, "time_per_iteration": 2.6071290969848633 }, { "auxiliary_loss_clip": 0.0113642, "auxiliary_loss_mlp": 0.01033121, "balance_loss_clip": 1.02049696, "balance_loss_mlp": 1.03688323, "epoch": 0.5884863971140839, "flos": 21031838743680.0, "grad_norm": 1.7119761708541104, "language_loss": 0.66227967, "learning_rate": 1.4513488115924823e-06, "loss": 0.6839751, "num_input_tokens_seen": 211017550, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 9788, "time_per_iteration": 5.546133995056152 }, { "auxiliary_loss_clip": 0.01125517, "auxiliary_loss_mlp": 0.01033539, "balance_loss_clip": 1.02061069, "balance_loss_mlp": 1.03685999, "epoch": 0.5885465203667518, "flos": 23805112008960.0, "grad_norm": 2.257530333744255, "language_loss": 0.80375952, "learning_rate": 1.450985493504406e-06, "loss": 0.82535005, "num_input_tokens_seen": 211034135, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 9789, "time_per_iteration": 2.597121238708496 }, { "auxiliary_loss_clip": 0.01124945, "auxiliary_loss_mlp": 0.01278099, "balance_loss_clip": 1.01827061, "balance_loss_mlp": 1.03627384, "epoch": 0.5886066436194198, "flos": 18880574319360.0, "grad_norm": 1.8446266815686994, "language_loss": 0.70292181, "learning_rate": 1.4506221950092457e-06, "loss": 0.7269522, "num_input_tokens_seen": 211053850, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 9790, "time_per_iteration": 2.5478873252868652 }, { "auxiliary_loss_clip": 0.01135462, "auxiliary_loss_mlp": 0.01032198, "balance_loss_clip": 1.01899576, "balance_loss_mlp": 1.03753102, "epoch": 0.5886667668720877, "flos": 24790141653120.0, "grad_norm": 1.979258316859851, "language_loss": 0.8347801, "learning_rate": 1.450258916119966e-06, "loss": 0.85645676, "num_input_tokens_seen": 211072165, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 9791, "time_per_iteration": 2.5780816078186035 }, { "auxiliary_loss_clip": 0.01120286, "auxiliary_loss_mlp": 0.01035536, "balance_loss_clip": 1.02214313, "balance_loss_mlp": 1.03825915, "epoch": 0.5887268901247558, "flos": 21614381516160.0, "grad_norm": 1.5894265568778347, "language_loss": 0.76360983, "learning_rate": 1.4498956568495313e-06, "loss": 0.78516805, "num_input_tokens_seen": 211089630, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 9792, "time_per_iteration": 2.6087417602539062 }, { "auxiliary_loss_clip": 0.01147216, "auxiliary_loss_mlp": 0.01035153, "balance_loss_clip": 1.02180791, "balance_loss_mlp": 1.03619349, "epoch": 0.5887870133774237, "flos": 20481722974080.0, "grad_norm": 1.9435915852349166, "language_loss": 0.68531758, "learning_rate": 1.4495324172109057e-06, "loss": 0.70714128, "num_input_tokens_seen": 211106120, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 9793, "time_per_iteration": 2.554888963699341 }, { "auxiliary_loss_clip": 0.01106806, "auxiliary_loss_mlp": 0.01033321, "balance_loss_clip": 1.01976728, "balance_loss_mlp": 1.03697848, "epoch": 0.5888471366300917, "flos": 19206283870080.0, "grad_norm": 2.0147680547552906, "language_loss": 0.61171395, "learning_rate": 1.449169197217052e-06, "loss": 0.63311523, "num_input_tokens_seen": 211122450, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.69921875, "step": 9794, "time_per_iteration": 2.570415735244751 }, { "auxiliary_loss_clip": 0.01138462, "auxiliary_loss_mlp": 0.01035844, "balance_loss_clip": 1.0218066, "balance_loss_mlp": 1.03846335, "epoch": 0.5889072598827596, "flos": 19972904866560.0, "grad_norm": 1.536026535729638, "language_loss": 0.64788073, "learning_rate": 1.4488059968809335e-06, "loss": 0.66962373, "num_input_tokens_seen": 211141765, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 9795, "time_per_iteration": 2.5372872352600098 }, { "auxiliary_loss_clip": 0.01135956, "auxiliary_loss_mlp": 0.01038318, "balance_loss_clip": 1.02615321, "balance_loss_mlp": 1.03745651, "epoch": 0.5889673831354276, "flos": 20741249715840.0, "grad_norm": 1.6200609081749549, "language_loss": 0.74148262, "learning_rate": 1.4484428162155102e-06, "loss": 0.76322544, "num_input_tokens_seen": 211160475, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71484375, "step": 9796, "time_per_iteration": 2.5980827808380127 }, { "auxiliary_loss_clip": 0.01127872, "auxiliary_loss_mlp": 0.01028945, "balance_loss_clip": 1.01630282, "balance_loss_mlp": 1.03819668, "epoch": 0.5890275063880956, "flos": 25300935008640.0, "grad_norm": 1.5216569837020428, "language_loss": 0.82783031, "learning_rate": 1.4480796552337444e-06, "loss": 0.84939849, "num_input_tokens_seen": 211180480, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 9797, "time_per_iteration": 2.553595542907715 }, { "auxiliary_loss_clip": 0.01123872, "auxiliary_loss_mlp": 0.01032304, "balance_loss_clip": 1.01943588, "balance_loss_mlp": 1.03605247, "epoch": 0.5890876296407636, "flos": 11765377964160.0, "grad_norm": 2.0373462708168417, "language_loss": 0.79116213, "learning_rate": 1.4477165139485962e-06, "loss": 0.81272388, "num_input_tokens_seen": 211198000, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 9798, "time_per_iteration": 2.5714662075042725 }, { "auxiliary_loss_clip": 0.01107446, "auxiliary_loss_mlp": 0.01034046, "balance_loss_clip": 1.02168965, "balance_loss_mlp": 1.03676891, "epoch": 0.5891477528934316, "flos": 13589460380160.0, "grad_norm": 2.534909714071863, "language_loss": 0.73833686, "learning_rate": 1.4473533923730244e-06, "loss": 0.7597518, "num_input_tokens_seen": 211214765, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.70703125, "step": 9799, "time_per_iteration": 2.4974374771118164 }, { "auxiliary_loss_clip": 0.0113896, "auxiliary_loss_mlp": 0.01033921, "balance_loss_clip": 1.02018249, "balance_loss_mlp": 1.03802013, "epoch": 0.5892078761460995, "flos": 15049193189760.0, "grad_norm": 2.1052308563010764, "language_loss": 0.76273102, "learning_rate": 1.4469902905199889e-06, "loss": 0.78445983, "num_input_tokens_seen": 211232335, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 9800, "time_per_iteration": 2.6115307807922363 }, { "auxiliary_loss_clip": 0.01146698, "auxiliary_loss_mlp": 0.01039656, "balance_loss_clip": 1.02667391, "balance_loss_mlp": 1.03766215, "epoch": 0.5892679993987675, "flos": 15778215624960.0, "grad_norm": 1.78726671342475, "language_loss": 0.78630316, "learning_rate": 1.446627208402447e-06, "loss": 0.80816668, "num_input_tokens_seen": 211249985, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 9801, "time_per_iteration": 2.6404762268066406 }, { "auxiliary_loss_clip": 0.01139697, "auxiliary_loss_mlp": 0.01030679, "balance_loss_clip": 1.0175724, "balance_loss_mlp": 1.03787208, "epoch": 0.5893281226514354, "flos": 25265203954560.0, "grad_norm": 1.7263993695631357, "language_loss": 0.65816665, "learning_rate": 1.4462641460333572e-06, "loss": 0.67987043, "num_input_tokens_seen": 211268425, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.74609375, "step": 9802, "time_per_iteration": 2.6555378437042236 }, { "auxiliary_loss_clip": 0.01116506, "auxiliary_loss_mlp": 0.01028242, "balance_loss_clip": 1.01529551, "balance_loss_mlp": 1.03681231, "epoch": 0.5893882459041034, "flos": 19458232842240.0, "grad_norm": 1.6434874224554692, "language_loss": 0.78207672, "learning_rate": 1.4459011034256752e-06, "loss": 0.80352414, "num_input_tokens_seen": 211286680, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 9803, "time_per_iteration": 2.559640407562256 }, { "auxiliary_loss_clip": 0.01119124, "auxiliary_loss_mlp": 0.01037138, "balance_loss_clip": 1.02393532, "balance_loss_mlp": 1.03735793, "epoch": 0.5894483691567713, "flos": 20634056553600.0, "grad_norm": 1.470331290137403, "language_loss": 0.73120654, "learning_rate": 1.4455380805923573e-06, "loss": 0.75276911, "num_input_tokens_seen": 211307700, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 9804, "time_per_iteration": 2.601527214050293 }, { "auxiliary_loss_clip": 0.01127461, "auxiliary_loss_mlp": 0.01031054, "balance_loss_clip": 1.01816177, "balance_loss_mlp": 1.03846478, "epoch": 0.5895084924094394, "flos": 17778223877760.0, "grad_norm": 1.5110358378195454, "language_loss": 0.74380493, "learning_rate": 1.4451750775463596e-06, "loss": 0.76539016, "num_input_tokens_seen": 211324835, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 9805, "time_per_iteration": 2.5716445446014404 }, { "auxiliary_loss_clip": 0.01125397, "auxiliary_loss_mlp": 0.01280171, "balance_loss_clip": 1.01921237, "balance_loss_mlp": 1.0390836, "epoch": 0.5895686156621073, "flos": 20121072468480.0, "grad_norm": 1.9921239005235225, "language_loss": 0.77846634, "learning_rate": 1.4448120943006359e-06, "loss": 0.802522, "num_input_tokens_seen": 211344130, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7734375, "step": 9806, "time_per_iteration": 2.588677167892456 }, { "auxiliary_loss_clip": 0.01109753, "auxiliary_loss_mlp": 0.01033446, "balance_loss_clip": 1.01982605, "balance_loss_mlp": 1.03718042, "epoch": 0.5896287389147753, "flos": 20850058990080.0, "grad_norm": 1.716970697228718, "language_loss": 0.76499808, "learning_rate": 1.4444491308681404e-06, "loss": 0.78643, "num_input_tokens_seen": 211362915, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 9807, "time_per_iteration": 2.5494275093078613 }, { "auxiliary_loss_clip": 0.01122196, "auxiliary_loss_mlp": 0.01031192, "balance_loss_clip": 1.01747727, "balance_loss_mlp": 1.03883052, "epoch": 0.5896888621674432, "flos": 14537897043840.0, "grad_norm": 1.7595235336327213, "language_loss": 0.7452817, "learning_rate": 1.4440861872618268e-06, "loss": 0.76681554, "num_input_tokens_seen": 211380700, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 9808, "time_per_iteration": 2.533498764038086 }, { "auxiliary_loss_clip": 0.01148771, "auxiliary_loss_mlp": 0.01031764, "balance_loss_clip": 1.0177331, "balance_loss_mlp": 1.0384115, "epoch": 0.5897489854201112, "flos": 20886759711360.0, "grad_norm": 1.933388000658081, "language_loss": 0.72276479, "learning_rate": 1.4437232634946465e-06, "loss": 0.74457014, "num_input_tokens_seen": 211400095, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 9809, "time_per_iteration": 2.5801424980163574 }, { "auxiliary_loss_clip": 0.01130982, "auxiliary_loss_mlp": 0.010371, "balance_loss_clip": 1.02356946, "balance_loss_mlp": 1.03789532, "epoch": 0.5898091086727792, "flos": 20011149872640.0, "grad_norm": 1.9550623004125058, "language_loss": 0.82487422, "learning_rate": 1.4433603595795525e-06, "loss": 0.84655505, "num_input_tokens_seen": 211417810, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 9810, "time_per_iteration": 2.5851120948791504 }, { "auxiliary_loss_clip": 0.01119207, "auxiliary_loss_mlp": 0.01030707, "balance_loss_clip": 1.01779056, "balance_loss_mlp": 1.03920197, "epoch": 0.5898692319254472, "flos": 16253242012800.0, "grad_norm": 1.4776390375175867, "language_loss": 0.80605656, "learning_rate": 1.4429974755294956e-06, "loss": 0.82755572, "num_input_tokens_seen": 211436020, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 9811, "time_per_iteration": 2.5132157802581787 }, { "auxiliary_loss_clip": 0.01122125, "auxiliary_loss_mlp": 0.01030126, "balance_loss_clip": 1.01598191, "balance_loss_mlp": 1.03833079, "epoch": 0.5899293551781152, "flos": 20448541785600.0, "grad_norm": 1.930077818641239, "language_loss": 0.76843059, "learning_rate": 1.442634611357426e-06, "loss": 0.78995311, "num_input_tokens_seen": 211454335, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75, "step": 9812, "time_per_iteration": 2.5476434230804443 }, { "auxiliary_loss_clip": 0.01136537, "auxiliary_loss_mlp": 0.01033881, "balance_loss_clip": 1.0219301, "balance_loss_mlp": 1.03841114, "epoch": 0.5899894784307831, "flos": 13881701433600.0, "grad_norm": 1.8892029573581677, "language_loss": 0.70709294, "learning_rate": 1.4422717670762932e-06, "loss": 0.72879708, "num_input_tokens_seen": 211472775, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.71875, "step": 9813, "time_per_iteration": 2.6302032470703125 }, { "auxiliary_loss_clip": 0.01128494, "auxiliary_loss_mlp": 0.01036692, "balance_loss_clip": 1.02347803, "balance_loss_mlp": 1.03741384, "epoch": 0.5900496016834511, "flos": 20083797129600.0, "grad_norm": 1.464043102783705, "language_loss": 0.71841335, "learning_rate": 1.441908942699046e-06, "loss": 0.74006522, "num_input_tokens_seen": 211492195, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 9814, "time_per_iteration": 2.5865066051483154 }, { "auxiliary_loss_clip": 0.01128992, "auxiliary_loss_mlp": 0.01031328, "balance_loss_clip": 1.01819694, "balance_loss_mlp": 1.03788197, "epoch": 0.590109724936119, "flos": 20259148348800.0, "grad_norm": 2.3978593778405304, "language_loss": 0.78478277, "learning_rate": 1.4415461382386335e-06, "loss": 0.80638599, "num_input_tokens_seen": 211510220, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 9815, "time_per_iteration": 2.576956272125244 }, { "auxiliary_loss_clip": 0.01144042, "auxiliary_loss_mlp": 0.01288028, "balance_loss_clip": 1.02666426, "balance_loss_mlp": 1.0397402, "epoch": 0.590169848188787, "flos": 24235069806720.0, "grad_norm": 3.1344898021322605, "language_loss": 0.75860476, "learning_rate": 1.4411833537080026e-06, "loss": 0.78292549, "num_input_tokens_seen": 211526260, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 9816, "time_per_iteration": 2.5663928985595703 }, { "auxiliary_loss_clip": 0.01111771, "auxiliary_loss_mlp": 0.01035698, "balance_loss_clip": 1.02222168, "balance_loss_mlp": 1.03989267, "epoch": 0.590229971441455, "flos": 17784724239360.0, "grad_norm": 1.7907602906786513, "language_loss": 0.80978799, "learning_rate": 1.4408205891201005e-06, "loss": 0.83126271, "num_input_tokens_seen": 211542890, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 9817, "time_per_iteration": 2.523033380508423 }, { "auxiliary_loss_clip": 0.01148195, "auxiliary_loss_mlp": 0.010388, "balance_loss_clip": 1.02594376, "balance_loss_mlp": 1.03903437, "epoch": 0.590290094694123, "flos": 22236893147520.0, "grad_norm": 1.7245384086552682, "language_loss": 0.76397771, "learning_rate": 1.4404578444878727e-06, "loss": 0.78584766, "num_input_tokens_seen": 211562685, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 9818, "time_per_iteration": 2.5700957775115967 }, { "auxiliary_loss_clip": 0.01134252, "auxiliary_loss_mlp": 0.01033801, "balance_loss_clip": 1.02130187, "balance_loss_mlp": 1.0364449, "epoch": 0.5903502179467909, "flos": 19098623831040.0, "grad_norm": 1.8140147203359813, "language_loss": 0.66728342, "learning_rate": 1.440095119824266e-06, "loss": 0.68896395, "num_input_tokens_seen": 211579960, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 9819, "time_per_iteration": 3.997105836868286 }, { "auxiliary_loss_clip": 0.01112942, "auxiliary_loss_mlp": 0.01033289, "balance_loss_clip": 1.01999128, "balance_loss_mlp": 1.03966439, "epoch": 0.5904103411994589, "flos": 24235500769920.0, "grad_norm": 1.5672145264412245, "language_loss": 0.77699959, "learning_rate": 1.439732415142224e-06, "loss": 0.79846191, "num_input_tokens_seen": 211599310, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 9820, "time_per_iteration": 2.5703420639038086 }, { "auxiliary_loss_clip": 0.01065164, "auxiliary_loss_mlp": 0.01003779, "balance_loss_clip": 1.00234878, "balance_loss_mlp": 1.01587617, "epoch": 0.5904704644521268, "flos": 64876613045760.0, "grad_norm": 0.9364637218754198, "language_loss": 0.65106606, "learning_rate": 1.43936973045469e-06, "loss": 0.67175543, "num_input_tokens_seen": 211658790, "router_z_loss_clip": 0.01428223, "router_z_loss_mlp": 0.22851562, "step": 9821, "time_per_iteration": 3.162571668624878 }, { "auxiliary_loss_clip": 0.01136735, "auxiliary_loss_mlp": 0.01036458, "balance_loss_clip": 1.02342248, "balance_loss_mlp": 1.03845704, "epoch": 0.5905305877047948, "flos": 19609991804160.0, "grad_norm": 2.0049390354459558, "language_loss": 0.61071885, "learning_rate": 1.4390070657746093e-06, "loss": 0.63245082, "num_input_tokens_seen": 211677240, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 9822, "time_per_iteration": 2.6153790950775146 }, { "auxiliary_loss_clip": 0.01112171, "auxiliary_loss_mlp": 0.01040177, "balance_loss_clip": 1.02653933, "balance_loss_mlp": 1.0379318, "epoch": 0.5905907109574628, "flos": 18989634988800.0, "grad_norm": 1.858697446270588, "language_loss": 0.82746518, "learning_rate": 1.4386444211149226e-06, "loss": 0.84898865, "num_input_tokens_seen": 211695485, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 9823, "time_per_iteration": 2.5163168907165527 }, { "auxiliary_loss_clip": 0.01126737, "auxiliary_loss_mlp": 0.01030963, "balance_loss_clip": 1.01771283, "balance_loss_mlp": 1.03746474, "epoch": 0.5906508342101308, "flos": 22200407907840.0, "grad_norm": 1.8375709326852248, "language_loss": 0.73154724, "learning_rate": 1.4382817964885731e-06, "loss": 0.75312418, "num_input_tokens_seen": 211713090, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 9824, "time_per_iteration": 3.9326906204223633 }, { "auxiliary_loss_clip": 0.01146022, "auxiliary_loss_mlp": 0.01287549, "balance_loss_clip": 1.02534771, "balance_loss_mlp": 1.03940511, "epoch": 0.5907109574627988, "flos": 20886687884160.0, "grad_norm": 2.2052568388408442, "language_loss": 0.8302874, "learning_rate": 1.4379191919085014e-06, "loss": 0.85462308, "num_input_tokens_seen": 211732510, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.796875, "step": 9825, "time_per_iteration": 2.531511068344116 }, { "auxiliary_loss_clip": 0.0111523, "auxiliary_loss_mlp": 0.01035361, "balance_loss_clip": 1.0239526, "balance_loss_mlp": 1.03715134, "epoch": 0.5907710807154667, "flos": 21506649649920.0, "grad_norm": 1.8403715532276477, "language_loss": 0.76935333, "learning_rate": 1.4375566073876478e-06, "loss": 0.79085916, "num_input_tokens_seen": 211748695, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6953125, "step": 9826, "time_per_iteration": 2.592585802078247 }, { "auxiliary_loss_clip": 0.01127507, "auxiliary_loss_mlp": 0.01029746, "balance_loss_clip": 1.0172348, "balance_loss_mlp": 1.03800583, "epoch": 0.5908312039681347, "flos": 22018376759040.0, "grad_norm": 1.9763943796086731, "language_loss": 0.72261059, "learning_rate": 1.4371940429389523e-06, "loss": 0.74418312, "num_input_tokens_seen": 211768545, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 9827, "time_per_iteration": 2.5478031635284424 }, { "auxiliary_loss_clip": 0.01145021, "auxiliary_loss_mlp": 0.0103069, "balance_loss_clip": 1.01815486, "balance_loss_mlp": 1.03642452, "epoch": 0.5908913272208026, "flos": 18479523991680.0, "grad_norm": 1.7410304257185998, "language_loss": 0.79309493, "learning_rate": 1.4368314985753531e-06, "loss": 0.81485206, "num_input_tokens_seen": 211786665, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.73046875, "step": 9828, "time_per_iteration": 2.6213150024414062 }, { "auxiliary_loss_clip": 0.01148521, "auxiliary_loss_mlp": 0.01035744, "balance_loss_clip": 1.02204108, "balance_loss_mlp": 1.03846383, "epoch": 0.5909514504734706, "flos": 12312189682560.0, "grad_norm": 2.182582836384756, "language_loss": 0.88270169, "learning_rate": 1.4364689743097892e-06, "loss": 0.90454435, "num_input_tokens_seen": 211801215, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 9829, "time_per_iteration": 4.077269554138184 }, { "auxiliary_loss_clip": 0.01054766, "auxiliary_loss_mlp": 0.01003126, "balance_loss_clip": 1.00161839, "balance_loss_mlp": 1.01444769, "epoch": 0.5910115737261386, "flos": 70213262451840.0, "grad_norm": 0.7590903214826188, "language_loss": 0.57871777, "learning_rate": 1.4361064701551985e-06, "loss": 0.59929669, "num_input_tokens_seen": 211857005, "router_z_loss_clip": 0.01507568, "router_z_loss_mlp": 0.23242188, "step": 9830, "time_per_iteration": 4.604274272918701 }, { "auxiliary_loss_clip": 0.01127096, "auxiliary_loss_mlp": 0.01281723, "balance_loss_clip": 1.01983833, "balance_loss_mlp": 1.03954375, "epoch": 0.5910716969788066, "flos": 22017766227840.0, "grad_norm": 3.797225801907107, "language_loss": 0.75754815, "learning_rate": 1.4357439861245168e-06, "loss": 0.78163636, "num_input_tokens_seen": 211876675, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 9831, "time_per_iteration": 2.592823028564453 }, { "auxiliary_loss_clip": 0.01124858, "auxiliary_loss_mlp": 0.0103204, "balance_loss_clip": 1.01913524, "balance_loss_mlp": 1.03808618, "epoch": 0.5911318202314745, "flos": 21251648021760.0, "grad_norm": 1.6646989771949015, "language_loss": 0.77358937, "learning_rate": 1.4353815222306813e-06, "loss": 0.79515839, "num_input_tokens_seen": 211895725, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 9832, "time_per_iteration": 2.6935081481933594 }, { "auxiliary_loss_clip": 0.01118114, "auxiliary_loss_mlp": 0.01031355, "balance_loss_clip": 1.01805079, "balance_loss_mlp": 1.03699279, "epoch": 0.5911919434841425, "flos": 17821604528640.0, "grad_norm": 1.7135529200612543, "language_loss": 0.83562136, "learning_rate": 1.4350190784866266e-06, "loss": 0.8571161, "num_input_tokens_seen": 211913860, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 9833, "time_per_iteration": 2.5888960361480713 }, { "auxiliary_loss_clip": 0.01120193, "auxiliary_loss_mlp": 0.01035788, "balance_loss_clip": 1.02240705, "balance_loss_mlp": 1.03866518, "epoch": 0.5912520667368104, "flos": 20374781207040.0, "grad_norm": 2.332299227221201, "language_loss": 0.74298054, "learning_rate": 1.4346566549052877e-06, "loss": 0.76454037, "num_input_tokens_seen": 211932880, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 9834, "time_per_iteration": 2.593564510345459 }, { "auxiliary_loss_clip": 0.01109375, "auxiliary_loss_mlp": 0.01033124, "balance_loss_clip": 1.01997566, "balance_loss_mlp": 1.0366863, "epoch": 0.5913121899894784, "flos": 17930557457280.0, "grad_norm": 2.1076380410788498, "language_loss": 0.77827221, "learning_rate": 1.4342942514995989e-06, "loss": 0.79969722, "num_input_tokens_seen": 211948625, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 9835, "time_per_iteration": 2.5824263095855713 }, { "auxiliary_loss_clip": 0.01130158, "auxiliary_loss_mlp": 0.0103463, "balance_loss_clip": 1.02142763, "balance_loss_mlp": 1.03830683, "epoch": 0.5913723132421465, "flos": 22126934638080.0, "grad_norm": 1.9012076883421851, "language_loss": 0.73830211, "learning_rate": 1.4339318682824924e-06, "loss": 0.75994998, "num_input_tokens_seen": 211965355, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 9836, "time_per_iteration": 2.7124125957489014 }, { "auxiliary_loss_clip": 0.01139137, "auxiliary_loss_mlp": 0.01029256, "balance_loss_clip": 1.01671565, "balance_loss_mlp": 1.03825426, "epoch": 0.5914324364948144, "flos": 15697918771200.0, "grad_norm": 2.010557058820627, "language_loss": 0.81707525, "learning_rate": 1.433569505266902e-06, "loss": 0.83875924, "num_input_tokens_seen": 211982245, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7421875, "step": 9837, "time_per_iteration": 2.5836896896362305 }, { "auxiliary_loss_clip": 0.01138835, "auxiliary_loss_mlp": 0.01033748, "balance_loss_clip": 1.01967573, "balance_loss_mlp": 1.0380578, "epoch": 0.5914925597474824, "flos": 22747327367040.0, "grad_norm": 1.6382016315197623, "language_loss": 0.7933706, "learning_rate": 1.4332071624657585e-06, "loss": 0.81509644, "num_input_tokens_seen": 212000250, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73828125, "step": 9838, "time_per_iteration": 2.5975422859191895 }, { "auxiliary_loss_clip": 0.01147133, "auxiliary_loss_mlp": 0.01039561, "balance_loss_clip": 1.02588165, "balance_loss_mlp": 1.03787327, "epoch": 0.5915526830001503, "flos": 18292788161280.0, "grad_norm": 2.03137871147108, "language_loss": 0.69478095, "learning_rate": 1.4328448398919937e-06, "loss": 0.71664786, "num_input_tokens_seen": 212017505, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 9839, "time_per_iteration": 2.5334532260894775 }, { "auxiliary_loss_clip": 0.01131554, "auxiliary_loss_mlp": 0.01042894, "balance_loss_clip": 1.02929795, "balance_loss_mlp": 1.0380255, "epoch": 0.5916128062528183, "flos": 17019072910080.0, "grad_norm": 2.333103215750299, "language_loss": 0.65532142, "learning_rate": 1.4324825375585379e-06, "loss": 0.67706591, "num_input_tokens_seen": 212034595, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75, "step": 9840, "time_per_iteration": 2.5555412769317627 }, { "auxiliary_loss_clip": 0.01130424, "auxiliary_loss_mlp": 0.01030144, "balance_loss_clip": 1.01651859, "balance_loss_mlp": 1.03754759, "epoch": 0.5916729295054862, "flos": 24754231031040.0, "grad_norm": 1.661989604167741, "language_loss": 0.81452179, "learning_rate": 1.43212025547832e-06, "loss": 0.83612752, "num_input_tokens_seen": 212055775, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 9841, "time_per_iteration": 2.5433554649353027 }, { "auxiliary_loss_clip": 0.01125701, "auxiliary_loss_mlp": 0.01032506, "balance_loss_clip": 1.01936913, "balance_loss_mlp": 1.03651655, "epoch": 0.5917330527581542, "flos": 15958199698560.0, "grad_norm": 1.65523822793089, "language_loss": 0.69099993, "learning_rate": 1.4317579936642701e-06, "loss": 0.71258199, "num_input_tokens_seen": 212074000, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 9842, "time_per_iteration": 2.6473469734191895 }, { "auxiliary_loss_clip": 0.01128385, "auxiliary_loss_mlp": 0.0103354, "balance_loss_clip": 1.01995623, "balance_loss_mlp": 1.03649521, "epoch": 0.5917931760108222, "flos": 23800730549760.0, "grad_norm": 4.12972649607819, "language_loss": 0.82501578, "learning_rate": 1.431395752129315e-06, "loss": 0.84663498, "num_input_tokens_seen": 212091415, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 9843, "time_per_iteration": 2.50604248046875 }, { "auxiliary_loss_clip": 0.01128544, "auxiliary_loss_mlp": 0.01031221, "balance_loss_clip": 1.01732075, "balance_loss_mlp": 1.03683162, "epoch": 0.5918532992634902, "flos": 23249609199360.0, "grad_norm": 2.248466127209088, "language_loss": 0.81420368, "learning_rate": 1.431033530886383e-06, "loss": 0.83580136, "num_input_tokens_seen": 212105255, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 9844, "time_per_iteration": 2.569683790206909 }, { "auxiliary_loss_clip": 0.01149081, "auxiliary_loss_mlp": 0.01034121, "balance_loss_clip": 1.0207634, "balance_loss_mlp": 1.03959131, "epoch": 0.5919134225161581, "flos": 19499853726720.0, "grad_norm": 2.012325081764217, "language_loss": 0.74013311, "learning_rate": 1.4306713299484008e-06, "loss": 0.76196516, "num_input_tokens_seen": 212122765, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 9845, "time_per_iteration": 2.5457310676574707 }, { "auxiliary_loss_clip": 0.01139791, "auxiliary_loss_mlp": 0.01029093, "balance_loss_clip": 1.01506245, "balance_loss_mlp": 1.03801513, "epoch": 0.5919735457688261, "flos": 38800940567040.0, "grad_norm": 1.759474929225025, "language_loss": 0.63992727, "learning_rate": 1.4303091493282944e-06, "loss": 0.66161621, "num_input_tokens_seen": 212143960, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 9846, "time_per_iteration": 2.7714650630950928 }, { "auxiliary_loss_clip": 0.01131752, "auxiliary_loss_mlp": 0.0127204, "balance_loss_clip": 1.01356316, "balance_loss_mlp": 1.03674698, "epoch": 0.592033669021494, "flos": 22163994495360.0, "grad_norm": 1.9633614805710657, "language_loss": 0.76505232, "learning_rate": 1.4299469890389893e-06, "loss": 0.78909022, "num_input_tokens_seen": 212162005, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 9847, "time_per_iteration": 2.5824532508850098 }, { "auxiliary_loss_clip": 0.0112938, "auxiliary_loss_mlp": 0.01031268, "balance_loss_clip": 1.01801801, "balance_loss_mlp": 1.03779602, "epoch": 0.592093792274162, "flos": 22710985781760.0, "grad_norm": 1.9126124666171118, "language_loss": 0.62674302, "learning_rate": 1.4295848490934093e-06, "loss": 0.64834952, "num_input_tokens_seen": 212181635, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 9848, "time_per_iteration": 2.647526741027832 }, { "auxiliary_loss_clip": 0.01121858, "auxiliary_loss_mlp": 0.010396, "balance_loss_clip": 1.02673173, "balance_loss_mlp": 1.0390141, "epoch": 0.59215391552683, "flos": 22528954632960.0, "grad_norm": 2.242564811954476, "language_loss": 0.75728774, "learning_rate": 1.4292227295044793e-06, "loss": 0.77890235, "num_input_tokens_seen": 212201615, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 9849, "time_per_iteration": 2.5983359813690186 }, { "auxiliary_loss_clip": 0.01130842, "auxiliary_loss_mlp": 0.01035837, "balance_loss_clip": 1.02234864, "balance_loss_mlp": 1.03867042, "epoch": 0.592214038779498, "flos": 24499013921280.0, "grad_norm": 1.7390882544759, "language_loss": 0.75156844, "learning_rate": 1.4288606302851211e-06, "loss": 0.7732352, "num_input_tokens_seen": 212219355, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 9850, "time_per_iteration": 2.6033682823181152 }, { "auxiliary_loss_clip": 0.01135707, "auxiliary_loss_mlp": 0.01032666, "balance_loss_clip": 1.0194757, "balance_loss_mlp": 1.03680491, "epoch": 0.592274162032166, "flos": 21831353619840.0, "grad_norm": 1.938954078817583, "language_loss": 0.75399208, "learning_rate": 1.4284985514482584e-06, "loss": 0.77567583, "num_input_tokens_seen": 212236710, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 9851, "time_per_iteration": 2.5737037658691406 }, { "auxiliary_loss_clip": 0.0113, "auxiliary_loss_mlp": 0.0103303, "balance_loss_clip": 1.01983321, "balance_loss_mlp": 1.03866696, "epoch": 0.5923342852848339, "flos": 24608146417920.0, "grad_norm": 2.223520305240533, "language_loss": 0.70738864, "learning_rate": 1.4281364930068125e-06, "loss": 0.72901887, "num_input_tokens_seen": 212256195, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 9852, "time_per_iteration": 2.5746631622314453 }, { "auxiliary_loss_clip": 0.01133735, "auxiliary_loss_mlp": 0.01286775, "balance_loss_clip": 1.02611423, "balance_loss_mlp": 1.03950429, "epoch": 0.5923944085375019, "flos": 19938143479680.0, "grad_norm": 1.6847839819493327, "language_loss": 0.80041838, "learning_rate": 1.4277744549737035e-06, "loss": 0.82462347, "num_input_tokens_seen": 212274085, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 9853, "time_per_iteration": 2.56538724899292 }, { "auxiliary_loss_clip": 0.01125255, "auxiliary_loss_mlp": 0.01029835, "balance_loss_clip": 1.01773512, "balance_loss_mlp": 1.0365622, "epoch": 0.5924545317901698, "flos": 28658510812800.0, "grad_norm": 1.6824769651098523, "language_loss": 0.67583144, "learning_rate": 1.427412437361853e-06, "loss": 0.69738233, "num_input_tokens_seen": 212295530, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7109375, "step": 9854, "time_per_iteration": 2.667051315307617 }, { "auxiliary_loss_clip": 0.01137813, "auxiliary_loss_mlp": 0.01026069, "balance_loss_clip": 1.01337302, "balance_loss_mlp": 1.03794694, "epoch": 0.5925146550428378, "flos": 19864885691520.0, "grad_norm": 1.6826876159127155, "language_loss": 0.89271462, "learning_rate": 1.4270504401841791e-06, "loss": 0.91435349, "num_input_tokens_seen": 212313770, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 9855, "time_per_iteration": 2.5851166248321533 }, { "auxiliary_loss_clip": 0.01117071, "auxiliary_loss_mlp": 0.01033843, "balance_loss_clip": 1.02121258, "balance_loss_mlp": 1.03781116, "epoch": 0.5925747782955058, "flos": 15122989681920.0, "grad_norm": 2.803876873413598, "language_loss": 0.86687636, "learning_rate": 1.426688463453602e-06, "loss": 0.88838553, "num_input_tokens_seen": 212331525, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 9856, "time_per_iteration": 2.567316770553589 }, { "auxiliary_loss_clip": 0.01120739, "auxiliary_loss_mlp": 0.01033371, "balance_loss_clip": 1.02062154, "balance_loss_mlp": 1.03769994, "epoch": 0.5926349015481738, "flos": 18405440190720.0, "grad_norm": 1.8223758639226793, "language_loss": 0.77692759, "learning_rate": 1.4263265071830387e-06, "loss": 0.79846865, "num_input_tokens_seen": 212347295, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7421875, "step": 9857, "time_per_iteration": 2.497725248336792 }, { "auxiliary_loss_clip": 0.01140839, "auxiliary_loss_mlp": 0.01035932, "balance_loss_clip": 1.02245557, "balance_loss_mlp": 1.0403825, "epoch": 0.5926950248008417, "flos": 23111138269440.0, "grad_norm": 1.9323220247050357, "language_loss": 0.64331758, "learning_rate": 1.425964571385406e-06, "loss": 0.66508532, "num_input_tokens_seen": 212365750, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 9858, "time_per_iteration": 2.5573630332946777 }, { "auxiliary_loss_clip": 0.01118174, "auxiliary_loss_mlp": 0.01035187, "balance_loss_clip": 1.02236581, "balance_loss_mlp": 1.03864574, "epoch": 0.5927551480535097, "flos": 28033916192640.0, "grad_norm": 1.7036610052546532, "language_loss": 0.76960701, "learning_rate": 1.4256026560736218e-06, "loss": 0.79114062, "num_input_tokens_seen": 212385300, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 9859, "time_per_iteration": 2.698195695877075 }, { "auxiliary_loss_clip": 0.01149249, "auxiliary_loss_mlp": 0.01283835, "balance_loss_clip": 1.02286899, "balance_loss_mlp": 1.03983963, "epoch": 0.5928152713061776, "flos": 21798675221760.0, "grad_norm": 2.591680217164458, "language_loss": 0.74931514, "learning_rate": 1.4252407612606008e-06, "loss": 0.773646, "num_input_tokens_seen": 212402140, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.74609375, "step": 9860, "time_per_iteration": 4.007298946380615 }, { "auxiliary_loss_clip": 0.01137729, "auxiliary_loss_mlp": 0.01034454, "balance_loss_clip": 1.02227688, "balance_loss_mlp": 1.03936183, "epoch": 0.5928753945588456, "flos": 24316839118080.0, "grad_norm": 1.6392349640921862, "language_loss": 0.76149911, "learning_rate": 1.4248788869592589e-06, "loss": 0.78322095, "num_input_tokens_seen": 212421790, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71875, "step": 9861, "time_per_iteration": 2.6009750366210938 }, { "auxiliary_loss_clip": 0.01133032, "auxiliary_loss_mlp": 0.01026282, "balance_loss_clip": 1.01417065, "balance_loss_mlp": 1.03555775, "epoch": 0.5929355178115137, "flos": 26464619923200.0, "grad_norm": 1.4985860048516748, "language_loss": 0.705935, "learning_rate": 1.42451703318251e-06, "loss": 0.72752821, "num_input_tokens_seen": 212442115, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7109375, "step": 9862, "time_per_iteration": 2.6945693492889404 }, { "auxiliary_loss_clip": 0.01147095, "auxiliary_loss_mlp": 0.01031329, "balance_loss_clip": 1.01893163, "balance_loss_mlp": 1.03789437, "epoch": 0.5929956410641816, "flos": 24965995662720.0, "grad_norm": 1.8056038581534426, "language_loss": 0.77792692, "learning_rate": 1.424155199943268e-06, "loss": 0.79971117, "num_input_tokens_seen": 212459535, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.73046875, "step": 9863, "time_per_iteration": 2.7114481925964355 }, { "auxiliary_loss_clip": 0.01131136, "auxiliary_loss_mlp": 0.01039736, "balance_loss_clip": 1.02649772, "balance_loss_mlp": 1.03946257, "epoch": 0.5930557643168496, "flos": 26208325405440.0, "grad_norm": 2.5717847107165643, "language_loss": 0.70329154, "learning_rate": 1.4237933872544456e-06, "loss": 0.72500032, "num_input_tokens_seen": 212479385, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 9864, "time_per_iteration": 2.643174648284912 }, { "auxiliary_loss_clip": 0.01131814, "auxiliary_loss_mlp": 0.01036572, "balance_loss_clip": 1.02268386, "balance_loss_mlp": 1.0399648, "epoch": 0.5931158875695175, "flos": 27854937699840.0, "grad_norm": 1.4241243366844072, "language_loss": 0.67277873, "learning_rate": 1.4234315951289548e-06, "loss": 0.69446254, "num_input_tokens_seen": 212500060, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 9865, "time_per_iteration": 4.013397216796875 }, { "auxiliary_loss_clip": 0.01129853, "auxiliary_loss_mlp": 0.01036619, "balance_loss_clip": 1.02283859, "balance_loss_mlp": 1.03704619, "epoch": 0.5931760108221855, "flos": 15413650536960.0, "grad_norm": 2.100316417133479, "language_loss": 0.77834117, "learning_rate": 1.4230698235797073e-06, "loss": 0.80000585, "num_input_tokens_seen": 212518590, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 9866, "time_per_iteration": 2.5383481979370117 }, { "auxiliary_loss_clip": 0.01126548, "auxiliary_loss_mlp": 0.01030043, "balance_loss_clip": 1.01785994, "balance_loss_mlp": 1.03803527, "epoch": 0.5932361340748534, "flos": 30188520581760.0, "grad_norm": 2.031836651084381, "language_loss": 0.71903455, "learning_rate": 1.422708072619614e-06, "loss": 0.74060047, "num_input_tokens_seen": 212538190, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 9867, "time_per_iteration": 2.57039213180542 }, { "auxiliary_loss_clip": 0.01129225, "auxiliary_loss_mlp": 0.01034361, "balance_loss_clip": 1.0208075, "balance_loss_mlp": 1.03930545, "epoch": 0.5932962573275214, "flos": 20157557708160.0, "grad_norm": 1.7608658787607423, "language_loss": 0.66433877, "learning_rate": 1.4223463422615844e-06, "loss": 0.6859746, "num_input_tokens_seen": 212557820, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.72265625, "step": 9868, "time_per_iteration": 2.534026861190796 }, { "auxiliary_loss_clip": 0.01157477, "auxiliary_loss_mlp": 0.01283814, "balance_loss_clip": 1.02304697, "balance_loss_mlp": 1.03866565, "epoch": 0.5933563805801894, "flos": 25445906300160.0, "grad_norm": 1.4777495021702327, "language_loss": 0.75084329, "learning_rate": 1.4219846325185282e-06, "loss": 0.77525622, "num_input_tokens_seen": 212577645, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.74609375, "step": 9869, "time_per_iteration": 2.594717025756836 }, { "auxiliary_loss_clip": 0.01150585, "auxiliary_loss_mlp": 0.01038098, "balance_loss_clip": 1.02456784, "balance_loss_mlp": 1.0402987, "epoch": 0.5934165038328574, "flos": 59995740337920.0, "grad_norm": 1.721219496210408, "language_loss": 0.74088347, "learning_rate": 1.4216229434033533e-06, "loss": 0.7627703, "num_input_tokens_seen": 212603430, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.74609375, "step": 9870, "time_per_iteration": 2.927541971206665 }, { "auxiliary_loss_clip": 0.01149648, "auxiliary_loss_mlp": 0.01029454, "balance_loss_clip": 1.0162636, "balance_loss_mlp": 1.04018521, "epoch": 0.5934766270855253, "flos": 24420548661120.0, "grad_norm": 1.932929603279424, "language_loss": 0.71522808, "learning_rate": 1.4212612749289687e-06, "loss": 0.73701906, "num_input_tokens_seen": 212620730, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.74609375, "step": 9871, "time_per_iteration": 4.117949962615967 }, { "auxiliary_loss_clip": 0.01142791, "auxiliary_loss_mlp": 0.01037195, "balance_loss_clip": 1.02298522, "balance_loss_mlp": 1.04109454, "epoch": 0.5935367503381933, "flos": 23513158264320.0, "grad_norm": 2.165747523554999, "language_loss": 0.74810529, "learning_rate": 1.4208996271082794e-06, "loss": 0.76990521, "num_input_tokens_seen": 212639745, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75390625, "step": 9872, "time_per_iteration": 4.24219012260437 }, { "auxiliary_loss_clip": 0.01124539, "auxiliary_loss_mlp": 0.01035421, "balance_loss_clip": 1.02111602, "balance_loss_mlp": 1.03935933, "epoch": 0.5935968735908612, "flos": 18948337326720.0, "grad_norm": 2.0915292272326975, "language_loss": 0.79925168, "learning_rate": 1.4205379999541935e-06, "loss": 0.82085127, "num_input_tokens_seen": 212655915, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76171875, "step": 9873, "time_per_iteration": 2.4927194118499756 }, { "auxiliary_loss_clip": 0.01121011, "auxiliary_loss_mlp": 0.01036078, "balance_loss_clip": 1.02222013, "balance_loss_mlp": 1.03838146, "epoch": 0.5936569968435292, "flos": 25483433034240.0, "grad_norm": 1.5948833223668346, "language_loss": 0.84917742, "learning_rate": 1.4201763934796157e-06, "loss": 0.87074828, "num_input_tokens_seen": 212676115, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73828125, "step": 9874, "time_per_iteration": 2.6023290157318115 }, { "auxiliary_loss_clip": 0.01054968, "auxiliary_loss_mlp": 0.01001006, "balance_loss_clip": 0.99942058, "balance_loss_mlp": 1.01409554, "epoch": 0.5937171200961973, "flos": 66378361789440.0, "grad_norm": 0.7093093391894654, "language_loss": 0.60072553, "learning_rate": 1.4198148076974503e-06, "loss": 0.6212852, "num_input_tokens_seen": 212737560, "router_z_loss_clip": 0.01586914, "router_z_loss_mlp": 0.23046875, "step": 9875, "time_per_iteration": 3.3043861389160156 }, { "auxiliary_loss_clip": 0.01133404, "auxiliary_loss_mlp": 0.01031658, "balance_loss_clip": 1.01674449, "balance_loss_mlp": 1.03809714, "epoch": 0.5937772433488652, "flos": 14903467712640.0, "grad_norm": 1.9236785546722284, "language_loss": 0.77557254, "learning_rate": 1.4194532426206028e-06, "loss": 0.79722321, "num_input_tokens_seen": 212755365, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7734375, "step": 9876, "time_per_iteration": 2.5796072483062744 }, { "auxiliary_loss_clip": 0.01114589, "auxiliary_loss_mlp": 0.01030771, "balance_loss_clip": 1.01825428, "balance_loss_mlp": 1.03571105, "epoch": 0.5938373666015332, "flos": 22561489376640.0, "grad_norm": 1.4171861113575097, "language_loss": 0.7553665, "learning_rate": 1.4190916982619749e-06, "loss": 0.77682006, "num_input_tokens_seen": 212773875, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 9877, "time_per_iteration": 2.551347255706787 }, { "auxiliary_loss_clip": 0.0112006, "auxiliary_loss_mlp": 0.01031134, "balance_loss_clip": 1.01777029, "balance_loss_mlp": 1.03592825, "epoch": 0.5938974898542011, "flos": 18440883936000.0, "grad_norm": 2.2726975620609333, "language_loss": 0.81628263, "learning_rate": 1.418730174634471e-06, "loss": 0.83779454, "num_input_tokens_seen": 212790590, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75390625, "step": 9878, "time_per_iteration": 2.5607850551605225 }, { "auxiliary_loss_clip": 0.01129676, "auxiliary_loss_mlp": 0.01286607, "balance_loss_clip": 1.02652037, "balance_loss_mlp": 1.03692055, "epoch": 0.5939576131068691, "flos": 45586728270720.0, "grad_norm": 1.6171708389349038, "language_loss": 0.70843834, "learning_rate": 1.4183686717509913e-06, "loss": 0.73260117, "num_input_tokens_seen": 212812265, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.74609375, "step": 9879, "time_per_iteration": 2.7923007011413574 }, { "auxiliary_loss_clip": 0.01119091, "auxiliary_loss_mlp": 0.01037274, "balance_loss_clip": 1.02430367, "balance_loss_mlp": 1.03785992, "epoch": 0.594017736359537, "flos": 23587708942080.0, "grad_norm": 1.432655746832579, "language_loss": 0.57556009, "learning_rate": 1.4180071896244375e-06, "loss": 0.59712374, "num_input_tokens_seen": 212831915, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 9880, "time_per_iteration": 2.5574424266815186 }, { "auxiliary_loss_clip": 0.01118371, "auxiliary_loss_mlp": 0.01038513, "balance_loss_clip": 1.02433348, "balance_loss_mlp": 1.03621125, "epoch": 0.594077859612205, "flos": 29457235589760.0, "grad_norm": 1.717117329791074, "language_loss": 0.77308816, "learning_rate": 1.4176457282677103e-06, "loss": 0.79465699, "num_input_tokens_seen": 212851350, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.734375, "step": 9881, "time_per_iteration": 2.609910011291504 }, { "auxiliary_loss_clip": 0.01117895, "auxiliary_loss_mlp": 0.01029558, "balance_loss_clip": 1.01676059, "balance_loss_mlp": 1.03577137, "epoch": 0.594137982864873, "flos": 16800089644800.0, "grad_norm": 2.354541913375512, "language_loss": 0.82780892, "learning_rate": 1.4172842876937088e-06, "loss": 0.84928346, "num_input_tokens_seen": 212867995, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 9882, "time_per_iteration": 2.5221023559570312 }, { "auxiliary_loss_clip": 0.01119401, "auxiliary_loss_mlp": 0.01035993, "balance_loss_clip": 1.02338052, "balance_loss_mlp": 1.0378511, "epoch": 0.594198106117541, "flos": 12750263953920.0, "grad_norm": 2.1577868678129977, "language_loss": 0.7957083, "learning_rate": 1.4169228679153324e-06, "loss": 0.81726229, "num_input_tokens_seen": 212885220, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 9883, "time_per_iteration": 2.5632219314575195 }, { "auxiliary_loss_clip": 0.01143097, "auxiliary_loss_mlp": 0.01281174, "balance_loss_clip": 1.01958382, "balance_loss_mlp": 1.0398438, "epoch": 0.5942582293702089, "flos": 20996538652800.0, "grad_norm": 1.8225660531107581, "language_loss": 0.75010538, "learning_rate": 1.4165614689454788e-06, "loss": 0.77434802, "num_input_tokens_seen": 212903195, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.765625, "step": 9884, "time_per_iteration": 2.577498197555542 }, { "auxiliary_loss_clip": 0.01112186, "auxiliary_loss_mlp": 0.01030816, "balance_loss_clip": 1.01797163, "balance_loss_mlp": 1.03917766, "epoch": 0.5943183526228769, "flos": 28291431772800.0, "grad_norm": 2.374796113038007, "language_loss": 0.66560256, "learning_rate": 1.416200090797046e-06, "loss": 0.68703258, "num_input_tokens_seen": 212923340, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 9885, "time_per_iteration": 2.6824100017547607 }, { "auxiliary_loss_clip": 0.01128081, "auxiliary_loss_mlp": 0.01038884, "balance_loss_clip": 1.02561593, "balance_loss_mlp": 1.03876686, "epoch": 0.5943784758755448, "flos": 26614619118720.0, "grad_norm": 3.0235460129687906, "language_loss": 0.76884973, "learning_rate": 1.4158387334829304e-06, "loss": 0.79051936, "num_input_tokens_seen": 212942755, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 9886, "time_per_iteration": 2.577589511871338 }, { "auxiliary_loss_clip": 0.01046154, "auxiliary_loss_mlp": 0.01001417, "balance_loss_clip": 0.99972385, "balance_loss_mlp": 1.01424193, "epoch": 0.5944385991282128, "flos": 64190935347840.0, "grad_norm": 0.8628898255476263, "language_loss": 0.64375353, "learning_rate": 1.4154773970160272e-06, "loss": 0.66422927, "num_input_tokens_seen": 212999355, "router_z_loss_clip": 0.01696777, "router_z_loss_mlp": 0.23144531, "step": 9887, "time_per_iteration": 3.0906364917755127 }, { "auxiliary_loss_clip": 0.01118418, "auxiliary_loss_mlp": 0.01029244, "balance_loss_clip": 1.01615441, "balance_loss_mlp": 1.03704584, "epoch": 0.5944987223808808, "flos": 19571998193280.0, "grad_norm": 1.9738720499245723, "language_loss": 0.6931954, "learning_rate": 1.4151160814092325e-06, "loss": 0.71467203, "num_input_tokens_seen": 213018570, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 9888, "time_per_iteration": 2.528585433959961 }, { "auxiliary_loss_clip": 0.01128265, "auxiliary_loss_mlp": 0.01032683, "balance_loss_clip": 1.01871789, "balance_loss_mlp": 1.03793418, "epoch": 0.5945588456335488, "flos": 26177586341760.0, "grad_norm": 1.588179957218177, "language_loss": 0.79360265, "learning_rate": 1.4147547866754396e-06, "loss": 0.81521213, "num_input_tokens_seen": 213037735, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7265625, "step": 9889, "time_per_iteration": 2.5981357097625732 }, { "auxiliary_loss_clip": 0.01135786, "auxiliary_loss_mlp": 0.01033834, "balance_loss_clip": 1.02115643, "balance_loss_mlp": 1.03699958, "epoch": 0.5946189688862168, "flos": 20446494710400.0, "grad_norm": 2.575052311678589, "language_loss": 0.70402926, "learning_rate": 1.414393512827544e-06, "loss": 0.72572553, "num_input_tokens_seen": 213057160, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 9890, "time_per_iteration": 2.624068021774292 }, { "auxiliary_loss_clip": 0.01137379, "auxiliary_loss_mlp": 0.01034382, "balance_loss_clip": 1.0204705, "balance_loss_mlp": 1.03703737, "epoch": 0.5946790921388847, "flos": 13437521850240.0, "grad_norm": 2.8869484078837684, "language_loss": 0.69332814, "learning_rate": 1.414032259878437e-06, "loss": 0.71504575, "num_input_tokens_seen": 213073630, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73828125, "step": 9891, "time_per_iteration": 2.603860378265381 }, { "auxiliary_loss_clip": 0.01116637, "auxiliary_loss_mlp": 0.01032044, "balance_loss_clip": 1.0190208, "balance_loss_mlp": 1.03665018, "epoch": 0.5947392153915527, "flos": 20412272027520.0, "grad_norm": 1.9565309123460572, "language_loss": 0.53049874, "learning_rate": 1.4136710278410111e-06, "loss": 0.55198556, "num_input_tokens_seen": 213092450, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 9892, "time_per_iteration": 2.6235971450805664 }, { "auxiliary_loss_clip": 0.01062112, "auxiliary_loss_mlp": 0.01000648, "balance_loss_clip": 0.99884808, "balance_loss_mlp": 1.01252902, "epoch": 0.5947993386442206, "flos": 65619138994560.0, "grad_norm": 0.66241678549833, "language_loss": 0.54612017, "learning_rate": 1.4133098167281583e-06, "loss": 0.56674778, "num_input_tokens_seen": 213155465, "router_z_loss_clip": 0.01794434, "router_z_loss_mlp": 0.23046875, "step": 9893, "time_per_iteration": 3.2426795959472656 }, { "auxiliary_loss_clip": 0.01134206, "auxiliary_loss_mlp": 0.01032598, "balance_loss_clip": 1.02001572, "balance_loss_mlp": 1.03726614, "epoch": 0.5948594618968887, "flos": 23183103168000.0, "grad_norm": 1.6577854111650274, "language_loss": 0.74900055, "learning_rate": 1.4129486265527689e-06, "loss": 0.77066863, "num_input_tokens_seen": 213174875, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 9894, "time_per_iteration": 2.5813465118408203 }, { "auxiliary_loss_clip": 0.0113494, "auxiliary_loss_mlp": 0.01031815, "balance_loss_clip": 1.018821, "balance_loss_mlp": 1.03644419, "epoch": 0.5949195851495566, "flos": 13626771632640.0, "grad_norm": 1.8269033644926094, "language_loss": 0.77851737, "learning_rate": 1.4125874573277333e-06, "loss": 0.80018491, "num_input_tokens_seen": 213192695, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 9895, "time_per_iteration": 2.529982566833496 }, { "auxiliary_loss_clip": 0.01148244, "auxiliary_loss_mlp": 0.0103413, "balance_loss_clip": 1.02043235, "balance_loss_mlp": 1.03813159, "epoch": 0.5949797084022246, "flos": 19751012599680.0, "grad_norm": 2.0887715836186938, "language_loss": 0.79201573, "learning_rate": 1.41222630906594e-06, "loss": 0.81383944, "num_input_tokens_seen": 213211195, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 9896, "time_per_iteration": 2.640233278274536 }, { "auxiliary_loss_clip": 0.01119458, "auxiliary_loss_mlp": 0.01033335, "balance_loss_clip": 1.02080047, "balance_loss_mlp": 1.03734887, "epoch": 0.5950398316548925, "flos": 25773878407680.0, "grad_norm": 1.6941491542109006, "language_loss": 0.83333039, "learning_rate": 1.4118651817802776e-06, "loss": 0.85485834, "num_input_tokens_seen": 213231975, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73046875, "step": 9897, "time_per_iteration": 2.6877379417419434 }, { "auxiliary_loss_clip": 0.01129842, "auxiliary_loss_mlp": 0.01035783, "balance_loss_clip": 1.0229857, "balance_loss_mlp": 1.03848374, "epoch": 0.5950999549075605, "flos": 23039029716480.0, "grad_norm": 1.9584216691166447, "language_loss": 0.70885837, "learning_rate": 1.4115040754836344e-06, "loss": 0.73051465, "num_input_tokens_seen": 213249760, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73046875, "step": 9898, "time_per_iteration": 2.6287808418273926 }, { "auxiliary_loss_clip": 0.01124076, "auxiliary_loss_mlp": 0.01041187, "balance_loss_clip": 1.02644038, "balance_loss_mlp": 1.03728199, "epoch": 0.5951600781602284, "flos": 32446367637120.0, "grad_norm": 2.877188406768367, "language_loss": 0.63849652, "learning_rate": 1.4111429901888964e-06, "loss": 0.6601491, "num_input_tokens_seen": 213269890, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.77734375, "step": 9899, "time_per_iteration": 2.601468563079834 }, { "auxiliary_loss_clip": 0.01108851, "auxiliary_loss_mlp": 0.01027817, "balance_loss_clip": 1.01657557, "balance_loss_mlp": 1.03774536, "epoch": 0.5952202014128964, "flos": 23800874204160.0, "grad_norm": 1.621087234185613, "language_loss": 0.72134161, "learning_rate": 1.4107819259089514e-06, "loss": 0.74270833, "num_input_tokens_seen": 213289400, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.7109375, "step": 9900, "time_per_iteration": 2.5626518726348877 }, { "auxiliary_loss_clip": 0.01107707, "auxiliary_loss_mlp": 0.01033803, "balance_loss_clip": 1.02082658, "balance_loss_mlp": 1.0374018, "epoch": 0.5952803246655644, "flos": 22492182084480.0, "grad_norm": 1.6803716722466722, "language_loss": 0.84238207, "learning_rate": 1.4104208826566835e-06, "loss": 0.86379719, "num_input_tokens_seen": 213308040, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 9901, "time_per_iteration": 2.5285043716430664 }, { "auxiliary_loss_clip": 0.01068319, "auxiliary_loss_mlp": 0.01003258, "balance_loss_clip": 1.00172043, "balance_loss_mlp": 1.01080942, "epoch": 0.5953404479182324, "flos": 51234688851840.0, "grad_norm": 0.7989476777190091, "language_loss": 0.58215433, "learning_rate": 1.4100598604449773e-06, "loss": 0.60287011, "num_input_tokens_seen": 213358585, "router_z_loss_clip": 0.01538086, "router_z_loss_mlp": 0.2265625, "step": 9902, "time_per_iteration": 4.3641743659973145 }, { "auxiliary_loss_clip": 0.01155001, "auxiliary_loss_mlp": 0.01034466, "balance_loss_clip": 1.02091837, "balance_loss_mlp": 1.03712082, "epoch": 0.5954005711709004, "flos": 23112682554240.0, "grad_norm": 1.6223885612797138, "language_loss": 0.77015197, "learning_rate": 1.4096988592867173e-06, "loss": 0.79204667, "num_input_tokens_seen": 213379585, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 9903, "time_per_iteration": 2.6335501670837402 }, { "auxiliary_loss_clip": 0.01138421, "auxiliary_loss_mlp": 0.01034633, "balance_loss_clip": 1.02228332, "balance_loss_mlp": 1.03865552, "epoch": 0.5954606944235683, "flos": 35954732736000.0, "grad_norm": 1.7265509742290328, "language_loss": 0.78076386, "learning_rate": 1.4093378791947863e-06, "loss": 0.80249435, "num_input_tokens_seen": 213401465, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.73046875, "step": 9904, "time_per_iteration": 2.6858673095703125 }, { "auxiliary_loss_clip": 0.01120296, "auxiliary_loss_mlp": 0.01033645, "balance_loss_clip": 1.02024555, "balance_loss_mlp": 1.03612399, "epoch": 0.5955208176762363, "flos": 30443665864320.0, "grad_norm": 1.4162496401814566, "language_loss": 0.72681737, "learning_rate": 1.4089769201820673e-06, "loss": 0.74835682, "num_input_tokens_seen": 213422720, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 9905, "time_per_iteration": 2.5997188091278076 }, { "auxiliary_loss_clip": 0.01120043, "auxiliary_loss_mlp": 0.01030963, "balance_loss_clip": 1.0178082, "balance_loss_mlp": 1.03770113, "epoch": 0.5955809409289042, "flos": 17640112083840.0, "grad_norm": 1.7254647950944197, "language_loss": 0.69783413, "learning_rate": 1.4086159822614417e-06, "loss": 0.7193442, "num_input_tokens_seen": 213439480, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 9906, "time_per_iteration": 2.5152366161346436 }, { "auxiliary_loss_clip": 0.0111899, "auxiliary_loss_mlp": 0.01031681, "balance_loss_clip": 1.01834154, "balance_loss_mlp": 1.03718221, "epoch": 0.5956410641815723, "flos": 24279887001600.0, "grad_norm": 2.4778389652098403, "language_loss": 0.75179672, "learning_rate": 1.4082550654457906e-06, "loss": 0.77330351, "num_input_tokens_seen": 213458895, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 9907, "time_per_iteration": 3.9396915435791016 }, { "auxiliary_loss_clip": 0.01128363, "auxiliary_loss_mlp": 0.01033811, "balance_loss_clip": 1.02138984, "balance_loss_mlp": 1.03751206, "epoch": 0.5957011874342402, "flos": 35734277013120.0, "grad_norm": 1.569211255127407, "language_loss": 0.66751957, "learning_rate": 1.407894169747994e-06, "loss": 0.68914127, "num_input_tokens_seen": 213481730, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.73046875, "step": 9908, "time_per_iteration": 2.6677303314208984 }, { "auxiliary_loss_clip": 0.01162851, "auxiliary_loss_mlp": 0.01028927, "balance_loss_clip": 1.0167501, "balance_loss_mlp": 1.0388453, "epoch": 0.5957613106869082, "flos": 21245004005760.0, "grad_norm": 2.6761486970153707, "language_loss": 0.76377618, "learning_rate": 1.4075332951809312e-06, "loss": 0.785694, "num_input_tokens_seen": 213497225, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 9909, "time_per_iteration": 2.6790366172790527 }, { "auxiliary_loss_clip": 0.01032942, "auxiliary_loss_mlp": 0.01001961, "balance_loss_clip": 1.0003041, "balance_loss_mlp": 1.01043677, "epoch": 0.5958214339395761, "flos": 65940969876480.0, "grad_norm": 0.9219161385716661, "language_loss": 0.73450243, "learning_rate": 1.4071724417574814e-06, "loss": 0.75485146, "num_input_tokens_seen": 213556890, "router_z_loss_clip": 0.01660156, "router_z_loss_mlp": 0.22460938, "step": 9910, "time_per_iteration": 3.204436779022217 }, { "auxiliary_loss_clip": 0.01127248, "auxiliary_loss_mlp": 0.01035607, "balance_loss_clip": 1.02209473, "balance_loss_mlp": 1.03598118, "epoch": 0.5958815571922441, "flos": 23218690567680.0, "grad_norm": 1.6817204587141923, "language_loss": 0.69755816, "learning_rate": 1.4068116094905218e-06, "loss": 0.71918666, "num_input_tokens_seen": 213575800, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 9911, "time_per_iteration": 2.550363779067993 }, { "auxiliary_loss_clip": 0.01126552, "auxiliary_loss_mlp": 0.01035733, "balance_loss_clip": 1.02151704, "balance_loss_mlp": 1.04026175, "epoch": 0.595941680444912, "flos": 16538623568640.0, "grad_norm": 1.8268318826585175, "language_loss": 0.65370595, "learning_rate": 1.4064507983929304e-06, "loss": 0.67532885, "num_input_tokens_seen": 213592740, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 9912, "time_per_iteration": 4.09842586517334 }, { "auxiliary_loss_clip": 0.01137629, "auxiliary_loss_mlp": 0.01039773, "balance_loss_clip": 1.0261538, "balance_loss_mlp": 1.03712475, "epoch": 0.59600180369758, "flos": 27818883423360.0, "grad_norm": 1.9590455387995618, "language_loss": 0.73484266, "learning_rate": 1.4060900084775832e-06, "loss": 0.75661671, "num_input_tokens_seen": 213611970, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 9913, "time_per_iteration": 4.1552135944366455 }, { "auxiliary_loss_clip": 0.0113931, "auxiliary_loss_mlp": 0.01276344, "balance_loss_clip": 1.0162394, "balance_loss_mlp": 1.0352931, "epoch": 0.596061926950248, "flos": 29491566013440.0, "grad_norm": 2.4674348352089317, "language_loss": 0.79921037, "learning_rate": 1.4057292397573553e-06, "loss": 0.82336688, "num_input_tokens_seen": 213632230, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7734375, "step": 9914, "time_per_iteration": 2.620865821838379 }, { "auxiliary_loss_clip": 0.01136274, "auxiliary_loss_mlp": 0.01032588, "balance_loss_clip": 1.02051163, "balance_loss_mlp": 1.03792429, "epoch": 0.596122050202916, "flos": 16836790366080.0, "grad_norm": 2.221194901014585, "language_loss": 0.67629993, "learning_rate": 1.405368492245123e-06, "loss": 0.69798851, "num_input_tokens_seen": 213649645, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71875, "step": 9915, "time_per_iteration": 2.590202569961548 }, { "auxiliary_loss_clip": 0.01127898, "auxiliary_loss_mlp": 0.01033512, "balance_loss_clip": 1.02095342, "balance_loss_mlp": 1.03651249, "epoch": 0.596182173455584, "flos": 20996646393600.0, "grad_norm": 1.7933631003472905, "language_loss": 0.78835052, "learning_rate": 1.4050077659537593e-06, "loss": 0.80996466, "num_input_tokens_seen": 213668850, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 9916, "time_per_iteration": 2.5280301570892334 }, { "auxiliary_loss_clip": 0.01128325, "auxiliary_loss_mlp": 0.01030769, "balance_loss_clip": 1.01716089, "balance_loss_mlp": 1.03653848, "epoch": 0.5962422967082519, "flos": 16065680169600.0, "grad_norm": 1.788000937844596, "language_loss": 0.8281796, "learning_rate": 1.404647060896138e-06, "loss": 0.84977055, "num_input_tokens_seen": 213685695, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 9917, "time_per_iteration": 2.6039061546325684 }, { "auxiliary_loss_clip": 0.01127918, "auxiliary_loss_mlp": 0.01031424, "balance_loss_clip": 1.01903796, "balance_loss_mlp": 1.03795028, "epoch": 0.5963024199609199, "flos": 12166966995840.0, "grad_norm": 1.732511597791904, "language_loss": 0.77016211, "learning_rate": 1.404286377085132e-06, "loss": 0.7917555, "num_input_tokens_seen": 213703515, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 9918, "time_per_iteration": 2.6712753772735596 }, { "auxiliary_loss_clip": 0.01120015, "auxiliary_loss_mlp": 0.01038559, "balance_loss_clip": 1.02480841, "balance_loss_mlp": 1.03827453, "epoch": 0.5963625432135878, "flos": 28074280101120.0, "grad_norm": 1.4700916509772244, "language_loss": 0.78705347, "learning_rate": 1.4039257145336118e-06, "loss": 0.80863917, "num_input_tokens_seen": 213724170, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73046875, "step": 9919, "time_per_iteration": 2.6206448078155518 }, { "auxiliary_loss_clip": 0.01129001, "auxiliary_loss_mlp": 0.01036372, "balance_loss_clip": 1.02214408, "balance_loss_mlp": 1.03789616, "epoch": 0.5964226664662559, "flos": 19860324664320.0, "grad_norm": 1.8557081062001861, "language_loss": 0.77440357, "learning_rate": 1.4035650732544504e-06, "loss": 0.79605734, "num_input_tokens_seen": 213740620, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.73046875, "step": 9920, "time_per_iteration": 2.588348627090454 }, { "auxiliary_loss_clip": 0.01122633, "auxiliary_loss_mlp": 0.01032876, "balance_loss_clip": 1.01998401, "balance_loss_mlp": 1.03990126, "epoch": 0.5964827897189238, "flos": 12932618325120.0, "grad_norm": 2.1224355898274383, "language_loss": 0.82324952, "learning_rate": 1.4032044532605168e-06, "loss": 0.84480464, "num_input_tokens_seen": 213755390, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 9921, "time_per_iteration": 2.6092183589935303 }, { "auxiliary_loss_clip": 0.01119209, "auxiliary_loss_mlp": 0.01030928, "balance_loss_clip": 1.01791024, "balance_loss_mlp": 1.03708911, "epoch": 0.5965429129715918, "flos": 18150797698560.0, "grad_norm": 2.1244081628260645, "language_loss": 0.8082065, "learning_rate": 1.4028438545646817e-06, "loss": 0.82970786, "num_input_tokens_seen": 213773225, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.73046875, "step": 9922, "time_per_iteration": 2.559389591217041 }, { "auxiliary_loss_clip": 0.01117381, "auxiliary_loss_mlp": 0.01280681, "balance_loss_clip": 1.02078652, "balance_loss_mlp": 1.03638721, "epoch": 0.5966030362242597, "flos": 21763231476480.0, "grad_norm": 2.021277242067811, "language_loss": 0.76586902, "learning_rate": 1.4024832771798132e-06, "loss": 0.78984964, "num_input_tokens_seen": 213791860, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 9923, "time_per_iteration": 2.596837043762207 }, { "auxiliary_loss_clip": 0.01133103, "auxiliary_loss_mlp": 0.01036316, "balance_loss_clip": 1.02151656, "balance_loss_mlp": 1.03872204, "epoch": 0.5966631594769277, "flos": 18807208790400.0, "grad_norm": 2.5904835635900265, "language_loss": 0.75868201, "learning_rate": 1.4021227211187793e-06, "loss": 0.7803762, "num_input_tokens_seen": 213809455, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.765625, "step": 9924, "time_per_iteration": 2.583872079849243 }, { "auxiliary_loss_clip": 0.01115989, "auxiliary_loss_mlp": 0.01032959, "balance_loss_clip": 1.01991761, "balance_loss_mlp": 1.03704512, "epoch": 0.5967232827295956, "flos": 14064163545600.0, "grad_norm": 1.8040898602061797, "language_loss": 0.66646075, "learning_rate": 1.4017621863944475e-06, "loss": 0.68795019, "num_input_tokens_seen": 213826615, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 9925, "time_per_iteration": 2.58404803276062 }, { "auxiliary_loss_clip": 0.01128375, "auxiliary_loss_mlp": 0.01028272, "balance_loss_clip": 1.01562405, "balance_loss_mlp": 1.04081857, "epoch": 0.5967834059822636, "flos": 17238235743360.0, "grad_norm": 2.940118133708782, "language_loss": 0.71564925, "learning_rate": 1.4014016730196845e-06, "loss": 0.7372157, "num_input_tokens_seen": 213844495, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 9926, "time_per_iteration": 2.6128461360931396 }, { "auxiliary_loss_clip": 0.0114707, "auxiliary_loss_mlp": 0.01278438, "balance_loss_clip": 1.01778746, "balance_loss_mlp": 1.03711879, "epoch": 0.5968435292349316, "flos": 42520244284800.0, "grad_norm": 1.957266444139623, "language_loss": 0.70467609, "learning_rate": 1.4010411810073563e-06, "loss": 0.72893119, "num_input_tokens_seen": 213869125, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 9927, "time_per_iteration": 2.7608373165130615 }, { "auxiliary_loss_clip": 0.0115016, "auxiliary_loss_mlp": 0.01032683, "balance_loss_clip": 1.01846719, "balance_loss_mlp": 1.0377636, "epoch": 0.5969036524875996, "flos": 37630898945280.0, "grad_norm": 1.7988346877149, "language_loss": 0.63817656, "learning_rate": 1.4006807103703271e-06, "loss": 0.66000497, "num_input_tokens_seen": 213891115, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 9928, "time_per_iteration": 2.71264910697937 }, { "auxiliary_loss_clip": 0.01107846, "auxiliary_loss_mlp": 0.01031819, "balance_loss_clip": 1.01922488, "balance_loss_mlp": 1.03614473, "epoch": 0.5969637757402676, "flos": 23148377694720.0, "grad_norm": 1.441513892457638, "language_loss": 0.69301295, "learning_rate": 1.4003202611214623e-06, "loss": 0.71440959, "num_input_tokens_seen": 213911925, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 9929, "time_per_iteration": 2.5631096363067627 }, { "auxiliary_loss_clip": 0.01143636, "auxiliary_loss_mlp": 0.01037522, "balance_loss_clip": 1.02505302, "balance_loss_mlp": 1.03904688, "epoch": 0.5970238989929355, "flos": 24020934877440.0, "grad_norm": 1.6469052859616342, "language_loss": 0.7649743, "learning_rate": 1.3999598332736247e-06, "loss": 0.78678584, "num_input_tokens_seen": 213930715, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 9930, "time_per_iteration": 2.6367886066436768 }, { "auxiliary_loss_clip": 0.01133078, "auxiliary_loss_mlp": 0.01034122, "balance_loss_clip": 1.01977503, "balance_loss_mlp": 1.03932965, "epoch": 0.5970840222456035, "flos": 19426883247360.0, "grad_norm": 1.721397847242319, "language_loss": 0.68631357, "learning_rate": 1.399599426839677e-06, "loss": 0.70798558, "num_input_tokens_seen": 213950015, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7578125, "step": 9931, "time_per_iteration": 2.5399763584136963 }, { "auxiliary_loss_clip": 0.01041682, "auxiliary_loss_mlp": 0.01001539, "balance_loss_clip": 1.00003076, "balance_loss_mlp": 1.00970662, "epoch": 0.5971441454982714, "flos": 62976615235200.0, "grad_norm": 0.8556622841617156, "language_loss": 0.64263177, "learning_rate": 1.3992390418324815e-06, "loss": 0.663064, "num_input_tokens_seen": 214003330, "router_z_loss_clip": 0.01507568, "router_z_loss_mlp": 0.22851562, "step": 9932, "time_per_iteration": 3.1410515308380127 }, { "auxiliary_loss_clip": 0.01136586, "auxiliary_loss_mlp": 0.01028528, "balance_loss_clip": 1.01486611, "balance_loss_mlp": 1.03698969, "epoch": 0.5972042687509395, "flos": 20266223328000.0, "grad_norm": 1.7359158263854573, "language_loss": 0.73940134, "learning_rate": 1.3988786782648992e-06, "loss": 0.76105249, "num_input_tokens_seen": 214021680, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 9933, "time_per_iteration": 2.532923698425293 }, { "auxiliary_loss_clip": 0.01041523, "auxiliary_loss_mlp": 0.0099966, "balance_loss_clip": 0.99817544, "balance_loss_mlp": 1.00965428, "epoch": 0.5972643920036074, "flos": 71652383832960.0, "grad_norm": 0.6648679439703743, "language_loss": 0.52010393, "learning_rate": 1.3985183361497906e-06, "loss": 0.54051578, "num_input_tokens_seen": 214090265, "router_z_loss_clip": 0.01483154, "router_z_loss_mlp": 0.2265625, "step": 9934, "time_per_iteration": 3.2453341484069824 }, { "auxiliary_loss_clip": 0.01032634, "auxiliary_loss_mlp": 0.00999369, "balance_loss_clip": 0.99778318, "balance_loss_mlp": 1.00996017, "epoch": 0.5973245152562754, "flos": 56892702263040.0, "grad_norm": 0.8390526267871932, "language_loss": 0.54215181, "learning_rate": 1.3981580155000155e-06, "loss": 0.56247175, "num_input_tokens_seen": 214146375, "router_z_loss_clip": 0.01586914, "router_z_loss_mlp": 0.2265625, "step": 9935, "time_per_iteration": 3.0500829219818115 }, { "auxiliary_loss_clip": 0.01128733, "auxiliary_loss_mlp": 0.01034104, "balance_loss_clip": 1.02022767, "balance_loss_mlp": 1.03709531, "epoch": 0.5973846385089433, "flos": 24164361884160.0, "grad_norm": 1.6607947710920334, "language_loss": 0.65659076, "learning_rate": 1.3977977163284323e-06, "loss": 0.67821914, "num_input_tokens_seen": 214165340, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 9936, "time_per_iteration": 2.585589647293091 }, { "auxiliary_loss_clip": 0.0114891, "auxiliary_loss_mlp": 0.01035537, "balance_loss_clip": 1.02310967, "balance_loss_mlp": 1.0386405, "epoch": 0.5974447617616113, "flos": 17670599752320.0, "grad_norm": 2.6992362759354847, "language_loss": 0.67518264, "learning_rate": 1.3974374386478998e-06, "loss": 0.69702709, "num_input_tokens_seen": 214181360, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7421875, "step": 9937, "time_per_iteration": 2.5984323024749756 }, { "auxiliary_loss_clip": 0.01122655, "auxiliary_loss_mlp": 0.01032599, "balance_loss_clip": 1.01849055, "balance_loss_mlp": 1.03791213, "epoch": 0.5975048850142792, "flos": 22892514140160.0, "grad_norm": 2.0503032743019287, "language_loss": 0.7708258, "learning_rate": 1.397077182471275e-06, "loss": 0.79237831, "num_input_tokens_seen": 214198525, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7578125, "step": 9938, "time_per_iteration": 2.5941507816314697 }, { "auxiliary_loss_clip": 0.01120623, "auxiliary_loss_mlp": 0.0103178, "balance_loss_clip": 1.01851809, "balance_loss_mlp": 1.03777528, "epoch": 0.5975650082669473, "flos": 24353108876160.0, "grad_norm": 1.5259757587487321, "language_loss": 0.75954628, "learning_rate": 1.3967169478114149e-06, "loss": 0.78107035, "num_input_tokens_seen": 214218710, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 9939, "time_per_iteration": 2.6078743934631348 }, { "auxiliary_loss_clip": 0.01150489, "auxiliary_loss_mlp": 0.01031312, "balance_loss_clip": 1.01575565, "balance_loss_mlp": 1.03685308, "epoch": 0.5976251315196152, "flos": 20923352691840.0, "grad_norm": 2.036800522039665, "language_loss": 0.68648279, "learning_rate": 1.396356734681175e-06, "loss": 0.70830077, "num_input_tokens_seen": 214237800, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.77734375, "step": 9940, "time_per_iteration": 2.6094298362731934 }, { "auxiliary_loss_clip": 0.01124414, "auxiliary_loss_mlp": 0.01030837, "balance_loss_clip": 1.01867795, "balance_loss_mlp": 1.03621459, "epoch": 0.5976852547722832, "flos": 35844594658560.0, "grad_norm": 1.4271385346353114, "language_loss": 0.70102012, "learning_rate": 1.3959965430934105e-06, "loss": 0.72257262, "num_input_tokens_seen": 214260355, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.703125, "step": 9941, "time_per_iteration": 2.786928415298462 }, { "auxiliary_loss_clip": 0.01124042, "auxiliary_loss_mlp": 0.01035851, "balance_loss_clip": 1.02173686, "balance_loss_mlp": 1.03497982, "epoch": 0.5977453780249512, "flos": 12855948744960.0, "grad_norm": 1.720733045089564, "language_loss": 0.76999092, "learning_rate": 1.3956363730609757e-06, "loss": 0.79158986, "num_input_tokens_seen": 214277120, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71484375, "step": 9942, "time_per_iteration": 2.651132822036743 }, { "auxiliary_loss_clip": 0.01145403, "auxiliary_loss_mlp": 0.01043037, "balance_loss_clip": 1.02951849, "balance_loss_mlp": 1.03579617, "epoch": 0.5978055012776191, "flos": 20959155573120.0, "grad_norm": 1.895716038358429, "language_loss": 0.75200629, "learning_rate": 1.3952762245967239e-06, "loss": 0.77389073, "num_input_tokens_seen": 214295300, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 9943, "time_per_iteration": 2.6161651611328125 }, { "auxiliary_loss_clip": 0.01152805, "auxiliary_loss_mlp": 0.01029848, "balance_loss_clip": 1.01781344, "balance_loss_mlp": 1.0367595, "epoch": 0.5978656245302871, "flos": 34058003063040.0, "grad_norm": 1.6760827549292419, "language_loss": 0.6186108, "learning_rate": 1.3949160977135084e-06, "loss": 0.64043736, "num_input_tokens_seen": 214317050, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.71484375, "step": 9944, "time_per_iteration": 4.1628501415252686 }, { "auxiliary_loss_clip": 0.01127371, "auxiliary_loss_mlp": 0.01034556, "balance_loss_clip": 1.02081096, "balance_loss_mlp": 1.03648067, "epoch": 0.597925747782955, "flos": 37373275624320.0, "grad_norm": 1.616760882451486, "language_loss": 0.72503245, "learning_rate": 1.394555992424181e-06, "loss": 0.74665165, "num_input_tokens_seen": 214337470, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 9945, "time_per_iteration": 2.674388885498047 }, { "auxiliary_loss_clip": 0.01143223, "auxiliary_loss_mlp": 0.01035269, "balance_loss_clip": 1.0218935, "balance_loss_mlp": 1.03543949, "epoch": 0.5979858710356231, "flos": 25374803328000.0, "grad_norm": 2.3539664917054837, "language_loss": 0.67281818, "learning_rate": 1.394195908741593e-06, "loss": 0.69460309, "num_input_tokens_seen": 214357975, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 9946, "time_per_iteration": 2.6268486976623535 }, { "auxiliary_loss_clip": 0.0112122, "auxiliary_loss_mlp": 0.01038662, "balance_loss_clip": 1.02516794, "balance_loss_mlp": 1.03653622, "epoch": 0.598045994288291, "flos": 13698413308800.0, "grad_norm": 2.39606366897309, "language_loss": 0.88510901, "learning_rate": 1.3938358466785944e-06, "loss": 0.90670782, "num_input_tokens_seen": 214374125, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 9947, "time_per_iteration": 2.463761329650879 }, { "auxiliary_loss_clip": 0.01104815, "auxiliary_loss_mlp": 0.01032819, "balance_loss_clip": 1.02121353, "balance_loss_mlp": 1.03533912, "epoch": 0.598106117540959, "flos": 21981352815360.0, "grad_norm": 1.706478323114582, "language_loss": 0.7215454, "learning_rate": 1.3934758062480347e-06, "loss": 0.74292171, "num_input_tokens_seen": 214393395, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 9948, "time_per_iteration": 2.5520308017730713 }, { "auxiliary_loss_clip": 0.01126236, "auxiliary_loss_mlp": 0.01030567, "balance_loss_clip": 1.01672101, "balance_loss_mlp": 1.03584838, "epoch": 0.5981662407936269, "flos": 20559362221440.0, "grad_norm": 1.9789010439245716, "language_loss": 0.89285886, "learning_rate": 1.3931157874627642e-06, "loss": 0.91442692, "num_input_tokens_seen": 214411550, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73046875, "step": 9949, "time_per_iteration": 3.944772481918335 }, { "auxiliary_loss_clip": 0.01117736, "auxiliary_loss_mlp": 0.01031308, "balance_loss_clip": 1.01956582, "balance_loss_mlp": 1.03779721, "epoch": 0.5982263640462949, "flos": 14063840323200.0, "grad_norm": 2.5442861524282034, "language_loss": 0.70194638, "learning_rate": 1.3927557903356294e-06, "loss": 0.72343683, "num_input_tokens_seen": 214429780, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.7109375, "step": 9950, "time_per_iteration": 2.5303947925567627 }, { "auxiliary_loss_clip": 0.01118595, "auxiliary_loss_mlp": 0.01033947, "balance_loss_clip": 1.02046466, "balance_loss_mlp": 1.03595316, "epoch": 0.5982864872989628, "flos": 17707228646400.0, "grad_norm": 1.5757056873263446, "language_loss": 0.7812047, "learning_rate": 1.3923958148794788e-06, "loss": 0.80273008, "num_input_tokens_seen": 214447775, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73828125, "step": 9951, "time_per_iteration": 2.5700550079345703 }, { "auxiliary_loss_clip": 0.01128435, "auxiliary_loss_mlp": 0.01042855, "balance_loss_clip": 1.02855587, "balance_loss_mlp": 1.03718805, "epoch": 0.5983466105516309, "flos": 16764789553920.0, "grad_norm": 1.386676200002854, "language_loss": 0.73793137, "learning_rate": 1.3920358611071587e-06, "loss": 0.75964427, "num_input_tokens_seen": 214467245, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.73828125, "step": 9952, "time_per_iteration": 2.576913595199585 }, { "auxiliary_loss_clip": 0.01128134, "auxiliary_loss_mlp": 0.01276164, "balance_loss_clip": 1.01447725, "balance_loss_mlp": 1.03554058, "epoch": 0.5984067338042988, "flos": 20042714949120.0, "grad_norm": 2.5702904335693613, "language_loss": 0.78616464, "learning_rate": 1.3916759290315145e-06, "loss": 0.81020772, "num_input_tokens_seen": 214484385, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.75, "step": 9953, "time_per_iteration": 2.597579002380371 }, { "auxiliary_loss_clip": 0.01133765, "auxiliary_loss_mlp": 0.01032144, "balance_loss_clip": 1.01972282, "balance_loss_mlp": 1.03553128, "epoch": 0.5984668570569668, "flos": 26319900026880.0, "grad_norm": 1.427618716941393, "language_loss": 0.69362539, "learning_rate": 1.391316018665392e-06, "loss": 0.71528447, "num_input_tokens_seen": 214503465, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 9954, "time_per_iteration": 4.121609687805176 }, { "auxiliary_loss_clip": 0.01124455, "auxiliary_loss_mlp": 0.01031859, "balance_loss_clip": 1.01958609, "balance_loss_mlp": 1.03490996, "epoch": 0.5985269803096348, "flos": 20593728558720.0, "grad_norm": 1.913403171403057, "language_loss": 0.73024142, "learning_rate": 1.3909561300216343e-06, "loss": 0.75180453, "num_input_tokens_seen": 214520725, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.72265625, "step": 9955, "time_per_iteration": 4.785559415817261 }, { "auxiliary_loss_clip": 0.01109997, "auxiliary_loss_mlp": 0.01030471, "balance_loss_clip": 1.01726902, "balance_loss_mlp": 1.03643942, "epoch": 0.5985871035623027, "flos": 26865382942080.0, "grad_norm": 1.6110764609292905, "language_loss": 0.68312716, "learning_rate": 1.3905962631130867e-06, "loss": 0.70453191, "num_input_tokens_seen": 214540675, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 9956, "time_per_iteration": 2.6236412525177 }, { "auxiliary_loss_clip": 0.01111131, "auxiliary_loss_mlp": 0.01029941, "balance_loss_clip": 1.01683962, "balance_loss_mlp": 1.03764808, "epoch": 0.5986472268149707, "flos": 19609704495360.0, "grad_norm": 3.7392018664640014, "language_loss": 0.73603147, "learning_rate": 1.3902364179525905e-06, "loss": 0.75744218, "num_input_tokens_seen": 214559910, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 9957, "time_per_iteration": 2.541224479675293 }, { "auxiliary_loss_clip": 0.01131856, "auxiliary_loss_mlp": 0.01029311, "balance_loss_clip": 1.01672244, "balance_loss_mlp": 1.0356462, "epoch": 0.5987073500676386, "flos": 21794616984960.0, "grad_norm": 1.7219913629271066, "language_loss": 0.84976667, "learning_rate": 1.3898765945529878e-06, "loss": 0.87137836, "num_input_tokens_seen": 214575960, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 9958, "time_per_iteration": 2.6314539909362793 }, { "auxiliary_loss_clip": 0.01137351, "auxiliary_loss_mlp": 0.01283856, "balance_loss_clip": 1.02324951, "balance_loss_mlp": 1.03631282, "epoch": 0.5987674733203067, "flos": 24314361079680.0, "grad_norm": 4.212180161657559, "language_loss": 0.65862006, "learning_rate": 1.3895167929271203e-06, "loss": 0.68283212, "num_input_tokens_seen": 214594230, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 9959, "time_per_iteration": 2.5790419578552246 }, { "auxiliary_loss_clip": 0.0111987, "auxiliary_loss_mlp": 0.01030337, "balance_loss_clip": 1.01759899, "balance_loss_mlp": 1.03743386, "epoch": 0.5988275965729746, "flos": 21320201128320.0, "grad_norm": 2.039497497091578, "language_loss": 0.83449179, "learning_rate": 1.3891570130878276e-06, "loss": 0.85599387, "num_input_tokens_seen": 214613130, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 9960, "time_per_iteration": 2.5731256008148193 }, { "auxiliary_loss_clip": 0.01124725, "auxiliary_loss_mlp": 0.01030308, "balance_loss_clip": 1.01764798, "balance_loss_mlp": 1.03484225, "epoch": 0.5988877198256426, "flos": 25118041933440.0, "grad_norm": 1.4862129188096005, "language_loss": 0.79246652, "learning_rate": 1.388797255047951e-06, "loss": 0.81401682, "num_input_tokens_seen": 214634470, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 9961, "time_per_iteration": 2.591113567352295 }, { "auxiliary_loss_clip": 0.01108833, "auxiliary_loss_mlp": 0.01032824, "balance_loss_clip": 1.01987791, "balance_loss_mlp": 1.03685927, "epoch": 0.5989478430783105, "flos": 26429104350720.0, "grad_norm": 1.7794175880720633, "language_loss": 0.67031527, "learning_rate": 1.3884375188203278e-06, "loss": 0.69173187, "num_input_tokens_seen": 214654030, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 9962, "time_per_iteration": 2.599398374557495 }, { "auxiliary_loss_clip": 0.01118209, "auxiliary_loss_mlp": 0.01032748, "balance_loss_clip": 1.01952815, "balance_loss_mlp": 1.03599286, "epoch": 0.5990079663309785, "flos": 25778439434880.0, "grad_norm": 1.3652199866429688, "language_loss": 0.74131423, "learning_rate": 1.3880778044177955e-06, "loss": 0.76282382, "num_input_tokens_seen": 214676985, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 9963, "time_per_iteration": 2.590893268585205 }, { "auxiliary_loss_clip": 0.01117148, "auxiliary_loss_mlp": 0.01032526, "balance_loss_clip": 1.01933527, "balance_loss_mlp": 1.03642511, "epoch": 0.5990680895836464, "flos": 36831779118720.0, "grad_norm": 1.6063768275739714, "language_loss": 0.67969531, "learning_rate": 1.387718111853193e-06, "loss": 0.70119202, "num_input_tokens_seen": 214700105, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 9964, "time_per_iteration": 2.706940174102783 }, { "auxiliary_loss_clip": 0.01132469, "auxiliary_loss_mlp": 0.01029103, "balance_loss_clip": 1.0152452, "balance_loss_mlp": 1.03671479, "epoch": 0.5991282128363145, "flos": 24133550993280.0, "grad_norm": 2.363996746673504, "language_loss": 0.77000523, "learning_rate": 1.3873584411393557e-06, "loss": 0.79162097, "num_input_tokens_seen": 214717885, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.6953125, "step": 9965, "time_per_iteration": 2.644216537475586 }, { "auxiliary_loss_clip": 0.01119259, "auxiliary_loss_mlp": 0.01027246, "balance_loss_clip": 1.01485419, "balance_loss_mlp": 1.03634548, "epoch": 0.5991883360889824, "flos": 10304064956160.0, "grad_norm": 2.2068037432763132, "language_loss": 0.7736752, "learning_rate": 1.3869987922891202e-06, "loss": 0.79514027, "num_input_tokens_seen": 214733680, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.73828125, "step": 9966, "time_per_iteration": 2.5497822761535645 }, { "auxiliary_loss_clip": 0.01134545, "auxiliary_loss_mlp": 0.01028877, "balance_loss_clip": 1.01662803, "balance_loss_mlp": 1.03773713, "epoch": 0.5992484593416504, "flos": 23951196622080.0, "grad_norm": 1.4794239563286289, "language_loss": 0.7351526, "learning_rate": 1.3866391653153208e-06, "loss": 0.75678682, "num_input_tokens_seen": 214753285, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 9967, "time_per_iteration": 2.5976288318634033 }, { "auxiliary_loss_clip": 0.01124055, "auxiliary_loss_mlp": 0.01034714, "balance_loss_clip": 1.02061105, "balance_loss_mlp": 1.03824377, "epoch": 0.5993085825943184, "flos": 11944105061760.0, "grad_norm": 1.7783001586419611, "language_loss": 0.68747079, "learning_rate": 1.3862795602307914e-06, "loss": 0.7090584, "num_input_tokens_seen": 214767810, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76953125, "step": 9968, "time_per_iteration": 2.538639545440674 }, { "auxiliary_loss_clip": 0.01118967, "auxiliary_loss_mlp": 0.01033788, "balance_loss_clip": 1.02023995, "balance_loss_mlp": 1.03592396, "epoch": 0.5993687058469863, "flos": 19026838500480.0, "grad_norm": 1.523804076582377, "language_loss": 0.79116911, "learning_rate": 1.3859199770483665e-06, "loss": 0.8126967, "num_input_tokens_seen": 214786040, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 9969, "time_per_iteration": 2.576856851577759 }, { "auxiliary_loss_clip": 0.01117563, "auxiliary_loss_mlp": 0.01030841, "balance_loss_clip": 1.01772189, "balance_loss_mlp": 1.03473353, "epoch": 0.5994288290996543, "flos": 14282967242880.0, "grad_norm": 2.133329722247064, "language_loss": 0.81105083, "learning_rate": 1.3855604157808776e-06, "loss": 0.83253485, "num_input_tokens_seen": 214803110, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 9970, "time_per_iteration": 2.546706438064575 }, { "auxiliary_loss_clip": 0.01146993, "auxiliary_loss_mlp": 0.01039493, "balance_loss_clip": 1.02443671, "balance_loss_mlp": 1.03598511, "epoch": 0.5994889523523222, "flos": 19206643006080.0, "grad_norm": 1.89568318982742, "language_loss": 0.61786085, "learning_rate": 1.385200876441157e-06, "loss": 0.63972569, "num_input_tokens_seen": 214819945, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.75, "step": 9971, "time_per_iteration": 2.6124210357666016 }, { "auxiliary_loss_clip": 0.0110587, "auxiliary_loss_mlp": 0.01028393, "balance_loss_clip": 1.01579905, "balance_loss_mlp": 1.03594565, "epoch": 0.5995490756049903, "flos": 28037040675840.0, "grad_norm": 1.6296748104908494, "language_loss": 0.78695911, "learning_rate": 1.3848413590420358e-06, "loss": 0.80830169, "num_input_tokens_seen": 214838810, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 9972, "time_per_iteration": 2.53898024559021 }, { "auxiliary_loss_clip": 0.01150783, "auxiliary_loss_mlp": 0.01038249, "balance_loss_clip": 1.02350891, "balance_loss_mlp": 1.03746271, "epoch": 0.5996091988576582, "flos": 29052953038080.0, "grad_norm": 1.9510543732382375, "language_loss": 0.75763899, "learning_rate": 1.3844818635963442e-06, "loss": 0.77952933, "num_input_tokens_seen": 214857040, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78125, "step": 9973, "time_per_iteration": 2.6599459648132324 }, { "auxiliary_loss_clip": 0.01136284, "auxiliary_loss_mlp": 0.01034385, "balance_loss_clip": 1.02083731, "balance_loss_mlp": 1.03560126, "epoch": 0.5996693221103262, "flos": 20813968800000.0, "grad_norm": 2.004801014109456, "language_loss": 0.64950573, "learning_rate": 1.3841223901169116e-06, "loss": 0.67121243, "num_input_tokens_seen": 214873375, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 9974, "time_per_iteration": 2.5357370376586914 }, { "auxiliary_loss_clip": 0.0111816, "auxiliary_loss_mlp": 0.01035076, "balance_loss_clip": 1.02243996, "balance_loss_mlp": 1.03617561, "epoch": 0.5997294453629941, "flos": 23768914078080.0, "grad_norm": 1.3912933334886983, "language_loss": 0.74267441, "learning_rate": 1.383762938616566e-06, "loss": 0.76420677, "num_input_tokens_seen": 214893900, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73046875, "step": 9975, "time_per_iteration": 2.566690683364868 }, { "auxiliary_loss_clip": 0.01110209, "auxiliary_loss_mlp": 0.01030629, "balance_loss_clip": 1.017313, "balance_loss_mlp": 1.03644538, "epoch": 0.5997895686156621, "flos": 20960017499520.0, "grad_norm": 1.8239023167433608, "language_loss": 0.76999736, "learning_rate": 1.3834035091081374e-06, "loss": 0.79140568, "num_input_tokens_seen": 214912110, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73828125, "step": 9976, "time_per_iteration": 2.5100855827331543 }, { "auxiliary_loss_clip": 0.01128267, "auxiliary_loss_mlp": 0.01035798, "balance_loss_clip": 1.02170753, "balance_loss_mlp": 1.03753567, "epoch": 0.59984969186833, "flos": 28365443746560.0, "grad_norm": 5.03373316455295, "language_loss": 0.75213116, "learning_rate": 1.38304410160445e-06, "loss": 0.77377188, "num_input_tokens_seen": 214930140, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73046875, "step": 9977, "time_per_iteration": 2.6138601303100586 }, { "auxiliary_loss_clip": 0.01137466, "auxiliary_loss_mlp": 0.01030977, "balance_loss_clip": 1.01765549, "balance_loss_mlp": 1.03715706, "epoch": 0.5999098151209981, "flos": 22565906749440.0, "grad_norm": 1.7711750156606458, "language_loss": 0.69170177, "learning_rate": 1.3826847161183324e-06, "loss": 0.71338624, "num_input_tokens_seen": 214949200, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 9978, "time_per_iteration": 2.5533828735351562 }, { "auxiliary_loss_clip": 0.01116242, "auxiliary_loss_mlp": 0.01035163, "balance_loss_clip": 1.02108455, "balance_loss_mlp": 1.04007089, "epoch": 0.599969938373666, "flos": 18768712389120.0, "grad_norm": 2.0095357597411163, "language_loss": 0.80980361, "learning_rate": 1.3823253526626095e-06, "loss": 0.83131766, "num_input_tokens_seen": 214965775, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 9979, "time_per_iteration": 2.607463836669922 }, { "auxiliary_loss_clip": 0.01105569, "auxiliary_loss_mlp": 0.01034322, "balance_loss_clip": 1.02182841, "balance_loss_mlp": 1.03632689, "epoch": 0.600030061626334, "flos": 11327231865600.0, "grad_norm": 1.762126916952276, "language_loss": 0.69797796, "learning_rate": 1.3819660112501052e-06, "loss": 0.71937686, "num_input_tokens_seen": 214982480, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 9980, "time_per_iteration": 2.5825295448303223 }, { "auxiliary_loss_clip": 0.01122853, "auxiliary_loss_mlp": 0.01032024, "balance_loss_clip": 1.01829731, "balance_loss_mlp": 1.03766406, "epoch": 0.600090184879002, "flos": 16578664254720.0, "grad_norm": 2.415248678495445, "language_loss": 0.68324643, "learning_rate": 1.3816066918936446e-06, "loss": 0.70479518, "num_input_tokens_seen": 214998110, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76171875, "step": 9981, "time_per_iteration": 2.6065332889556885 }, { "auxiliary_loss_clip": 0.01118516, "auxiliary_loss_mlp": 0.01033587, "balance_loss_clip": 1.02164817, "balance_loss_mlp": 1.03794026, "epoch": 0.6001503081316699, "flos": 23618627573760.0, "grad_norm": 2.7455863189604948, "language_loss": 0.78514993, "learning_rate": 1.3812473946060504e-06, "loss": 0.8066709, "num_input_tokens_seen": 215017995, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.71875, "step": 9982, "time_per_iteration": 2.535658121109009 }, { "auxiliary_loss_clip": 0.01146165, "auxiliary_loss_mlp": 0.01033976, "balance_loss_clip": 1.02061892, "balance_loss_mlp": 1.03814697, "epoch": 0.6002104313843379, "flos": 20667668705280.0, "grad_norm": 1.4856330932950659, "language_loss": 0.7311579, "learning_rate": 1.3808881194001451e-06, "loss": 0.75295931, "num_input_tokens_seen": 215038285, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 9983, "time_per_iteration": 2.5919482707977295 }, { "auxiliary_loss_clip": 0.01137423, "auxiliary_loss_mlp": 0.01031074, "balance_loss_clip": 1.01774096, "balance_loss_mlp": 1.03583491, "epoch": 0.6002705546370058, "flos": 22455229968000.0, "grad_norm": 1.8930616969258762, "language_loss": 0.78109992, "learning_rate": 1.3805288662887504e-06, "loss": 0.80278492, "num_input_tokens_seen": 215057825, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.74609375, "step": 9984, "time_per_iteration": 2.545297384262085 }, { "auxiliary_loss_clip": 0.01119024, "auxiliary_loss_mlp": 0.01034515, "balance_loss_clip": 1.02091932, "balance_loss_mlp": 1.03759968, "epoch": 0.6003306778896739, "flos": 25191982080000.0, "grad_norm": 1.6378522234539186, "language_loss": 0.83213979, "learning_rate": 1.3801696352846865e-06, "loss": 0.85367519, "num_input_tokens_seen": 215077790, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 9985, "time_per_iteration": 4.038860559463501 }, { "auxiliary_loss_clip": 0.01120423, "auxiliary_loss_mlp": 0.01037008, "balance_loss_clip": 1.02431178, "balance_loss_mlp": 1.03890848, "epoch": 0.6003908011423418, "flos": 26687733252480.0, "grad_norm": 1.883229086975776, "language_loss": 0.71279967, "learning_rate": 1.3798104264007745e-06, "loss": 0.73437393, "num_input_tokens_seen": 215097650, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 9986, "time_per_iteration": 2.569807529449463 }, { "auxiliary_loss_clip": 0.01112932, "auxiliary_loss_mlp": 0.01276969, "balance_loss_clip": 1.01650929, "balance_loss_mlp": 1.03913236, "epoch": 0.6004509243950098, "flos": 22565080736640.0, "grad_norm": 1.4778426760887333, "language_loss": 0.72074085, "learning_rate": 1.3794512396498326e-06, "loss": 0.74463987, "num_input_tokens_seen": 215118235, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73828125, "step": 9987, "time_per_iteration": 2.7374868392944336 }, { "auxiliary_loss_clip": 0.01126986, "auxiliary_loss_mlp": 0.01036793, "balance_loss_clip": 1.02359688, "balance_loss_mlp": 1.03680062, "epoch": 0.6005110476476777, "flos": 19719303868800.0, "grad_norm": 1.6959486342613863, "language_loss": 0.84840453, "learning_rate": 1.3790920750446801e-06, "loss": 0.87004232, "num_input_tokens_seen": 215136755, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 9988, "time_per_iteration": 2.57099986076355 }, { "auxiliary_loss_clip": 0.01118817, "auxiliary_loss_mlp": 0.01036649, "balance_loss_clip": 1.02285051, "balance_loss_mlp": 1.03728151, "epoch": 0.6005711709003457, "flos": 17712543859200.0, "grad_norm": 1.762434444912159, "language_loss": 0.65467626, "learning_rate": 1.3787329325981343e-06, "loss": 0.67623091, "num_input_tokens_seen": 215155225, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 9989, "time_per_iteration": 3.865422248840332 }, { "auxiliary_loss_clip": 0.01118439, "auxiliary_loss_mlp": 0.01031867, "balance_loss_clip": 1.01961839, "balance_loss_mlp": 1.03779113, "epoch": 0.6006312941530136, "flos": 18514464946560.0, "grad_norm": 1.5380649020217156, "language_loss": 0.80400848, "learning_rate": 1.3783738123230114e-06, "loss": 0.82551157, "num_input_tokens_seen": 215174815, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.71484375, "step": 9990, "time_per_iteration": 2.5283446311950684 }, { "auxiliary_loss_clip": 0.01139046, "auxiliary_loss_mlp": 0.01035075, "balance_loss_clip": 1.02164054, "balance_loss_mlp": 1.03654599, "epoch": 0.6006914174056817, "flos": 21390837223680.0, "grad_norm": 1.9439959175410635, "language_loss": 0.82700598, "learning_rate": 1.3780147142321292e-06, "loss": 0.84874725, "num_input_tokens_seen": 215192045, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 9991, "time_per_iteration": 2.5296926498413086 }, { "auxiliary_loss_clip": 0.01142369, "auxiliary_loss_mlp": 0.01034057, "balance_loss_clip": 1.02114666, "balance_loss_mlp": 1.036291, "epoch": 0.6007515406583496, "flos": 12750515349120.0, "grad_norm": 1.7752863087927213, "language_loss": 0.82606697, "learning_rate": 1.3776556383383011e-06, "loss": 0.84783125, "num_input_tokens_seen": 215209885, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 9992, "time_per_iteration": 2.545881509780884 }, { "auxiliary_loss_clip": 0.01128053, "auxiliary_loss_mlp": 0.01279528, "balance_loss_clip": 1.01857007, "balance_loss_mlp": 1.03844094, "epoch": 0.6008116639110176, "flos": 19206894401280.0, "grad_norm": 2.3261375583371535, "language_loss": 0.66359842, "learning_rate": 1.377296584654343e-06, "loss": 0.68767422, "num_input_tokens_seen": 215228150, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 9993, "time_per_iteration": 2.5950844287872314 }, { "auxiliary_loss_clip": 0.01137003, "auxiliary_loss_mlp": 0.01032848, "balance_loss_clip": 1.01932967, "balance_loss_mlp": 1.0369848, "epoch": 0.6008717871636855, "flos": 17055342668160.0, "grad_norm": 4.669393732278781, "language_loss": 0.808043, "learning_rate": 1.3769375531930672e-06, "loss": 0.82974148, "num_input_tokens_seen": 215243755, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 9994, "time_per_iteration": 2.5323781967163086 }, { "auxiliary_loss_clip": 0.01134768, "auxiliary_loss_mlp": 0.01030352, "balance_loss_clip": 1.01815641, "balance_loss_mlp": 1.03977072, "epoch": 0.6009319104163535, "flos": 20298686244480.0, "grad_norm": 1.8334192208161701, "language_loss": 0.72051656, "learning_rate": 1.376578543967288e-06, "loss": 0.74216771, "num_input_tokens_seen": 215262130, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 9995, "time_per_iteration": 4.112945795059204 }, { "auxiliary_loss_clip": 0.01128427, "auxiliary_loss_mlp": 0.01036332, "balance_loss_clip": 1.02305818, "balance_loss_mlp": 1.03681123, "epoch": 0.6009920336690215, "flos": 21836776573440.0, "grad_norm": 1.7032677174178743, "language_loss": 0.80804908, "learning_rate": 1.376219556989817e-06, "loss": 0.82969666, "num_input_tokens_seen": 215281785, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 9996, "time_per_iteration": 4.212918519973755 }, { "auxiliary_loss_clip": 0.01156059, "auxiliary_loss_mlp": 0.01035766, "balance_loss_clip": 1.02232563, "balance_loss_mlp": 1.03942108, "epoch": 0.6010521569216895, "flos": 22596107109120.0, "grad_norm": 1.7113213375904186, "language_loss": 0.78273869, "learning_rate": 1.3758605922734648e-06, "loss": 0.80465692, "num_input_tokens_seen": 215297550, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 9997, "time_per_iteration": 2.625363826751709 }, { "auxiliary_loss_clip": 0.01123064, "auxiliary_loss_mlp": 0.01034523, "balance_loss_clip": 1.02041423, "balance_loss_mlp": 1.03859913, "epoch": 0.6011122801743575, "flos": 19171702051200.0, "grad_norm": 5.699655558073282, "language_loss": 0.73247552, "learning_rate": 1.3755016498310432e-06, "loss": 0.75405139, "num_input_tokens_seen": 215316360, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7578125, "step": 9998, "time_per_iteration": 2.577406167984009 }, { "auxiliary_loss_clip": 0.01127221, "auxiliary_loss_mlp": 0.01035954, "balance_loss_clip": 1.02294874, "balance_loss_mlp": 1.03820598, "epoch": 0.6011724034270254, "flos": 25010022758400.0, "grad_norm": 1.5720438960849141, "language_loss": 0.7216233, "learning_rate": 1.3751427296753608e-06, "loss": 0.74325514, "num_input_tokens_seen": 215336405, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 9999, "time_per_iteration": 2.666131019592285 }, { "auxiliary_loss_clip": 0.01146933, "auxiliary_loss_mlp": 0.01037128, "balance_loss_clip": 1.02369905, "balance_loss_mlp": 1.03581548, "epoch": 0.6012325266796934, "flos": 21797669640960.0, "grad_norm": 1.4838260162897117, "language_loss": 0.78521675, "learning_rate": 1.3747838318192275e-06, "loss": 0.80705738, "num_input_tokens_seen": 215356590, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 10000, "time_per_iteration": 2.5684142112731934 }, { "auxiliary_loss_clip": 0.01116365, "auxiliary_loss_mlp": 0.01039866, "balance_loss_clip": 1.02520883, "balance_loss_mlp": 1.04045224, "epoch": 0.6012926499323613, "flos": 19573003774080.0, "grad_norm": 1.8070346100163812, "language_loss": 0.77842784, "learning_rate": 1.3744249562754511e-06, "loss": 0.79999018, "num_input_tokens_seen": 215374295, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7578125, "step": 10001, "time_per_iteration": 2.590883731842041 }, { "auxiliary_loss_clip": 0.01123191, "auxiliary_loss_mlp": 0.01033288, "balance_loss_clip": 1.01881564, "balance_loss_mlp": 1.03870749, "epoch": 0.6013527731850293, "flos": 34860786076800.0, "grad_norm": 1.8104001902076594, "language_loss": 0.58684015, "learning_rate": 1.3740661030568385e-06, "loss": 0.60840499, "num_input_tokens_seen": 215394535, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 10002, "time_per_iteration": 2.641293525695801 }, { "auxiliary_loss_clip": 0.0113206, "auxiliary_loss_mlp": 0.01034039, "balance_loss_clip": 1.01902497, "balance_loss_mlp": 1.03965843, "epoch": 0.6014128964376972, "flos": 23291948355840.0, "grad_norm": 1.582677522976211, "language_loss": 0.77874982, "learning_rate": 1.3737072721761966e-06, "loss": 0.80041075, "num_input_tokens_seen": 215414355, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.75, "step": 10003, "time_per_iteration": 2.5631842613220215 }, { "auxiliary_loss_clip": 0.01114436, "auxiliary_loss_mlp": 0.01032531, "balance_loss_clip": 1.01885724, "balance_loss_mlp": 1.03976083, "epoch": 0.6014730196903653, "flos": 24820916630400.0, "grad_norm": 2.61826201279309, "language_loss": 0.77856618, "learning_rate": 1.373348463646331e-06, "loss": 0.80003583, "num_input_tokens_seen": 215428280, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 10004, "time_per_iteration": 2.505619764328003 }, { "auxiliary_loss_clip": 0.01118972, "auxiliary_loss_mlp": 0.01033225, "balance_loss_clip": 1.01986146, "balance_loss_mlp": 1.03799117, "epoch": 0.6015331429430332, "flos": 23112359331840.0, "grad_norm": 1.5273807822576437, "language_loss": 0.72411251, "learning_rate": 1.3729896774800474e-06, "loss": 0.7456345, "num_input_tokens_seen": 215448970, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 10005, "time_per_iteration": 2.623567819595337 }, { "auxiliary_loss_clip": 0.01108406, "auxiliary_loss_mlp": 0.01029836, "balance_loss_clip": 1.01665127, "balance_loss_mlp": 1.03904033, "epoch": 0.6015932661957012, "flos": 19201363706880.0, "grad_norm": 2.3813832669000554, "language_loss": 0.74816084, "learning_rate": 1.3726309136901495e-06, "loss": 0.76954323, "num_input_tokens_seen": 215465260, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 10006, "time_per_iteration": 2.4676125049591064 }, { "auxiliary_loss_clip": 0.01127873, "auxiliary_loss_mlp": 0.01035221, "balance_loss_clip": 1.02145863, "balance_loss_mlp": 1.03664088, "epoch": 0.6016533894483691, "flos": 18113630100480.0, "grad_norm": 5.435308483085828, "language_loss": 0.73514247, "learning_rate": 1.3722721722894397e-06, "loss": 0.75677347, "num_input_tokens_seen": 215482725, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 10007, "time_per_iteration": 2.543743371963501 }, { "auxiliary_loss_clip": 0.01126774, "auxiliary_loss_mlp": 0.01029309, "balance_loss_clip": 1.01659489, "balance_loss_mlp": 1.03706598, "epoch": 0.6017135127010371, "flos": 16216900427520.0, "grad_norm": 1.7515189755604958, "language_loss": 0.69796968, "learning_rate": 1.371913453290722e-06, "loss": 0.71953058, "num_input_tokens_seen": 215500420, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 10008, "time_per_iteration": 2.5280206203460693 }, { "auxiliary_loss_clip": 0.01119448, "auxiliary_loss_mlp": 0.01030829, "balance_loss_clip": 1.0181272, "balance_loss_mlp": 1.0381062, "epoch": 0.6017736359537051, "flos": 23444246021760.0, "grad_norm": 1.9678910726904195, "language_loss": 0.76197994, "learning_rate": 1.3715547567067968e-06, "loss": 0.78348267, "num_input_tokens_seen": 215522260, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 10009, "time_per_iteration": 2.661524772644043 }, { "auxiliary_loss_clip": 0.01129441, "auxiliary_loss_mlp": 0.01281262, "balance_loss_clip": 1.0212357, "balance_loss_mlp": 1.03894758, "epoch": 0.601833759206373, "flos": 23514056104320.0, "grad_norm": 1.9290007866066454, "language_loss": 0.74243116, "learning_rate": 1.3711960825504662e-06, "loss": 0.76653814, "num_input_tokens_seen": 215541715, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 10010, "time_per_iteration": 2.57769513130188 }, { "auxiliary_loss_clip": 0.01045038, "auxiliary_loss_mlp": 0.01009001, "balance_loss_clip": 1.0074991, "balance_loss_mlp": 1.01311612, "epoch": 0.6018938824590411, "flos": 63991668648960.0, "grad_norm": 0.7987541417132327, "language_loss": 0.55094558, "learning_rate": 1.37083743083453e-06, "loss": 0.571486, "num_input_tokens_seen": 215603020, "router_z_loss_clip": 0.01501465, "router_z_loss_mlp": 0.22753906, "step": 10011, "time_per_iteration": 3.2409331798553467 }, { "auxiliary_loss_clip": 0.01121232, "auxiliary_loss_mlp": 0.01038369, "balance_loss_clip": 1.02597094, "balance_loss_mlp": 1.0390197, "epoch": 0.601954005711709, "flos": 34640007131520.0, "grad_norm": 1.4943019451885793, "language_loss": 0.62067819, "learning_rate": 1.3704788015717872e-06, "loss": 0.6422742, "num_input_tokens_seen": 215625115, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.734375, "step": 10012, "time_per_iteration": 2.7988169193267822 }, { "auxiliary_loss_clip": 0.01138738, "auxiliary_loss_mlp": 0.01029288, "balance_loss_clip": 1.01699769, "balance_loss_mlp": 1.03963614, "epoch": 0.602014128964377, "flos": 19427062815360.0, "grad_norm": 2.15972054030785, "language_loss": 0.74751729, "learning_rate": 1.3701201947750368e-06, "loss": 0.76919758, "num_input_tokens_seen": 215643730, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71875, "step": 10013, "time_per_iteration": 2.6375155448913574 }, { "auxiliary_loss_clip": 0.01105524, "auxiliary_loss_mlp": 0.01033118, "balance_loss_clip": 1.02082753, "balance_loss_mlp": 1.0376128, "epoch": 0.6020742522170449, "flos": 28329389470080.0, "grad_norm": 2.383038428186375, "language_loss": 0.81397676, "learning_rate": 1.3697616104570764e-06, "loss": 0.83536315, "num_input_tokens_seen": 215664425, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 10014, "time_per_iteration": 2.6370456218719482 }, { "auxiliary_loss_clip": 0.01122985, "auxiliary_loss_mlp": 0.01028833, "balance_loss_clip": 1.01625061, "balance_loss_mlp": 1.03658938, "epoch": 0.6021343754697129, "flos": 22747040058240.0, "grad_norm": 1.456130620843392, "language_loss": 0.72225821, "learning_rate": 1.369403048630703e-06, "loss": 0.74377644, "num_input_tokens_seen": 215684280, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 10015, "time_per_iteration": 2.5976998805999756 }, { "auxiliary_loss_clip": 0.01052928, "auxiliary_loss_mlp": 0.0100744, "balance_loss_clip": 1.00589073, "balance_loss_mlp": 1.01255655, "epoch": 0.6021944987223808, "flos": 65752007402880.0, "grad_norm": 0.8392504607928388, "language_loss": 0.54823041, "learning_rate": 1.3690445093087125e-06, "loss": 0.56883407, "num_input_tokens_seen": 215739780, "router_z_loss_clip": 0.01550293, "router_z_loss_mlp": 0.2265625, "step": 10016, "time_per_iteration": 3.105193614959717 }, { "auxiliary_loss_clip": 0.01128355, "auxiliary_loss_mlp": 0.0103368, "balance_loss_clip": 1.0203104, "balance_loss_mlp": 1.03945231, "epoch": 0.6022546219750489, "flos": 16105182151680.0, "grad_norm": 1.5822007267785325, "language_loss": 0.82589352, "learning_rate": 1.3686859925039009e-06, "loss": 0.84751391, "num_input_tokens_seen": 215757885, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 10017, "time_per_iteration": 2.600766897201538 }, { "auxiliary_loss_clip": 0.01126525, "auxiliary_loss_mlp": 0.01031682, "balance_loss_clip": 1.0188787, "balance_loss_mlp": 1.03799069, "epoch": 0.6023147452277168, "flos": 25512555985920.0, "grad_norm": 2.3388377276288557, "language_loss": 0.83749574, "learning_rate": 1.3683274982290622e-06, "loss": 0.85907781, "num_input_tokens_seen": 215776415, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 10018, "time_per_iteration": 2.620279550552368 }, { "auxiliary_loss_clip": 0.01131672, "auxiliary_loss_mlp": 0.01040121, "balance_loss_clip": 1.02601838, "balance_loss_mlp": 1.03845334, "epoch": 0.6023748684803848, "flos": 22636075968000.0, "grad_norm": 1.6409625543796265, "language_loss": 0.78292561, "learning_rate": 1.36796902649699e-06, "loss": 0.80464351, "num_input_tokens_seen": 215794865, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75390625, "step": 10019, "time_per_iteration": 2.5532891750335693 }, { "auxiliary_loss_clip": 0.0113494, "auxiliary_loss_mlp": 0.01273396, "balance_loss_clip": 1.01448715, "balance_loss_mlp": 1.03619218, "epoch": 0.6024349917330527, "flos": 26210444307840.0, "grad_norm": 1.5397511645113002, "language_loss": 0.7362262, "learning_rate": 1.3676105773204774e-06, "loss": 0.76030964, "num_input_tokens_seen": 215816840, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.72265625, "step": 10020, "time_per_iteration": 2.5986077785491943 }, { "auxiliary_loss_clip": 0.01127406, "auxiliary_loss_mlp": 0.01031177, "balance_loss_clip": 1.0180099, "balance_loss_mlp": 1.03688359, "epoch": 0.6024951149857207, "flos": 21251755762560.0, "grad_norm": 1.674765712631799, "language_loss": 0.6423099, "learning_rate": 1.3672521507123169e-06, "loss": 0.66389573, "num_input_tokens_seen": 215836100, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 10021, "time_per_iteration": 2.5194196701049805 }, { "auxiliary_loss_clip": 0.0111771, "auxiliary_loss_mlp": 0.01031002, "balance_loss_clip": 1.01716793, "balance_loss_mlp": 1.036515, "epoch": 0.6025552382383887, "flos": 26943453152640.0, "grad_norm": 1.516373998024287, "language_loss": 0.80456161, "learning_rate": 1.3668937466852994e-06, "loss": 0.82604873, "num_input_tokens_seen": 215858480, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 10022, "time_per_iteration": 2.5969491004943848 }, { "auxiliary_loss_clip": 0.0112359, "auxiliary_loss_mlp": 0.01032048, "balance_loss_clip": 1.01870835, "balance_loss_mlp": 1.03839874, "epoch": 0.6026153614910567, "flos": 31684379495040.0, "grad_norm": 1.8829536324133966, "language_loss": 0.66680968, "learning_rate": 1.3665353652522157e-06, "loss": 0.688366, "num_input_tokens_seen": 215879950, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 10023, "time_per_iteration": 2.612722158432007 }, { "auxiliary_loss_clip": 0.01120818, "auxiliary_loss_mlp": 0.01031419, "balance_loss_clip": 1.01816916, "balance_loss_mlp": 1.0366075, "epoch": 0.6026754847437247, "flos": 29312731175040.0, "grad_norm": 5.1909561508841096, "language_loss": 0.74424744, "learning_rate": 1.3661770064258549e-06, "loss": 0.76576978, "num_input_tokens_seen": 215899830, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75390625, "step": 10024, "time_per_iteration": 2.601879596710205 }, { "auxiliary_loss_clip": 0.01118435, "auxiliary_loss_mlp": 0.01037589, "balance_loss_clip": 1.02377248, "balance_loss_mlp": 1.03742468, "epoch": 0.6027356079963926, "flos": 23586775188480.0, "grad_norm": 1.7182981796319197, "language_loss": 0.72581601, "learning_rate": 1.3658186702190068e-06, "loss": 0.7473762, "num_input_tokens_seen": 215920440, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 10025, "time_per_iteration": 2.5477089881896973 }, { "auxiliary_loss_clip": 0.01114138, "auxiliary_loss_mlp": 0.01033738, "balance_loss_clip": 1.01967168, "balance_loss_mlp": 1.03713942, "epoch": 0.6027957312490606, "flos": 20813753318400.0, "grad_norm": 2.1458713050084146, "language_loss": 0.67042708, "learning_rate": 1.3654603566444585e-06, "loss": 0.69190586, "num_input_tokens_seen": 215940535, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76953125, "step": 10026, "time_per_iteration": 4.042736053466797 }, { "auxiliary_loss_clip": 0.01103191, "auxiliary_loss_mlp": 0.01035644, "balance_loss_clip": 1.02328849, "balance_loss_mlp": 1.03476715, "epoch": 0.6028558545017285, "flos": 19935773182080.0, "grad_norm": 1.7090128165221334, "language_loss": 0.8033818, "learning_rate": 1.3651020657149986e-06, "loss": 0.82477021, "num_input_tokens_seen": 215958045, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.68359375, "step": 10027, "time_per_iteration": 2.634856939315796 }, { "auxiliary_loss_clip": 0.01140064, "auxiliary_loss_mlp": 0.01035422, "balance_loss_clip": 1.02221382, "balance_loss_mlp": 1.03870535, "epoch": 0.6029159777543965, "flos": 22820836550400.0, "grad_norm": 2.0052833112945234, "language_loss": 0.71262622, "learning_rate": 1.3647437974434124e-06, "loss": 0.73438102, "num_input_tokens_seen": 215977330, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.74609375, "step": 10028, "time_per_iteration": 2.555659770965576 }, { "auxiliary_loss_clip": 0.01147407, "auxiliary_loss_mlp": 0.01038293, "balance_loss_clip": 1.02461934, "balance_loss_mlp": 1.03746378, "epoch": 0.6029761010070644, "flos": 23587242065280.0, "grad_norm": 5.088366868552302, "language_loss": 0.8443734, "learning_rate": 1.3643855518424859e-06, "loss": 0.86623037, "num_input_tokens_seen": 215997865, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 10029, "time_per_iteration": 2.6365606784820557 }, { "auxiliary_loss_clip": 0.01123581, "auxiliary_loss_mlp": 0.0103297, "balance_loss_clip": 1.02011371, "balance_loss_mlp": 1.03520036, "epoch": 0.6030362242597325, "flos": 13662430859520.0, "grad_norm": 1.734638154187279, "language_loss": 0.80278641, "learning_rate": 1.3640273289250043e-06, "loss": 0.82435191, "num_input_tokens_seen": 216016230, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 10030, "time_per_iteration": 2.5661258697509766 }, { "auxiliary_loss_clip": 0.01154691, "auxiliary_loss_mlp": 0.01034072, "balance_loss_clip": 1.02035141, "balance_loss_mlp": 1.03776002, "epoch": 0.6030963475124004, "flos": 24422883045120.0, "grad_norm": 2.000607697623317, "language_loss": 0.71146762, "learning_rate": 1.363669128703751e-06, "loss": 0.73335522, "num_input_tokens_seen": 216035785, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 10031, "time_per_iteration": 4.032185077667236 }, { "auxiliary_loss_clip": 0.01147821, "auxiliary_loss_mlp": 0.01034672, "balance_loss_clip": 1.02046239, "balance_loss_mlp": 1.03639662, "epoch": 0.6031564707650684, "flos": 29644043247360.0, "grad_norm": 1.6256073277111491, "language_loss": 0.73637116, "learning_rate": 1.3633109511915099e-06, "loss": 0.75819606, "num_input_tokens_seen": 216059555, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76171875, "step": 10032, "time_per_iteration": 2.637988567352295 }, { "auxiliary_loss_clip": 0.01131452, "auxiliary_loss_mlp": 0.01034404, "balance_loss_clip": 1.02098751, "balance_loss_mlp": 1.04035175, "epoch": 0.6032165940177363, "flos": 16618776768000.0, "grad_norm": 2.1499436022333596, "language_loss": 0.68581718, "learning_rate": 1.3629527964010635e-06, "loss": 0.70747578, "num_input_tokens_seen": 216077235, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 10033, "time_per_iteration": 2.517359733581543 }, { "auxiliary_loss_clip": 0.01129288, "auxiliary_loss_mlp": 0.01032868, "balance_loss_clip": 1.0195049, "balance_loss_mlp": 1.03944588, "epoch": 0.6032767172704043, "flos": 17488173553920.0, "grad_norm": 1.9617875593339982, "language_loss": 0.75671679, "learning_rate": 1.3625946643451924e-06, "loss": 0.77833831, "num_input_tokens_seen": 216094985, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 10034, "time_per_iteration": 2.536425828933716 }, { "auxiliary_loss_clip": 0.01128996, "auxiliary_loss_mlp": 0.0103214, "balance_loss_clip": 1.01818085, "balance_loss_mlp": 1.03714645, "epoch": 0.6033368405230723, "flos": 26832955939200.0, "grad_norm": 1.871352133513655, "language_loss": 0.74188089, "learning_rate": 1.3622365550366789e-06, "loss": 0.76349223, "num_input_tokens_seen": 216115905, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7421875, "step": 10035, "time_per_iteration": 2.5669217109680176 }, { "auxiliary_loss_clip": 0.01125758, "auxiliary_loss_mlp": 0.01029722, "balance_loss_clip": 1.01715183, "balance_loss_mlp": 1.03731036, "epoch": 0.6033969637757403, "flos": 16909904499840.0, "grad_norm": 2.3963779478104783, "language_loss": 0.8638944, "learning_rate": 1.3618784684883019e-06, "loss": 0.88544929, "num_input_tokens_seen": 216132420, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 10036, "time_per_iteration": 2.5442893505096436 }, { "auxiliary_loss_clip": 0.0113783, "auxiliary_loss_mlp": 0.01035042, "balance_loss_clip": 1.0216136, "balance_loss_mlp": 1.03792214, "epoch": 0.6034570870284083, "flos": 22930076787840.0, "grad_norm": 2.278907125865002, "language_loss": 0.70172948, "learning_rate": 1.361520404712841e-06, "loss": 0.72345829, "num_input_tokens_seen": 216149800, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 10037, "time_per_iteration": 4.054041385650635 }, { "auxiliary_loss_clip": 0.01119356, "auxiliary_loss_mlp": 0.0103493, "balance_loss_clip": 1.02103019, "balance_loss_mlp": 1.0394088, "epoch": 0.6035172102810762, "flos": 23366319465600.0, "grad_norm": 2.6099014489007657, "language_loss": 0.84950703, "learning_rate": 1.3611623637230743e-06, "loss": 0.87104988, "num_input_tokens_seen": 216168200, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7109375, "step": 10038, "time_per_iteration": 4.086095094680786 }, { "auxiliary_loss_clip": 0.01129611, "auxiliary_loss_mlp": 0.01037096, "balance_loss_clip": 1.02307677, "balance_loss_mlp": 1.0368135, "epoch": 0.6035773335337442, "flos": 20887082933760.0, "grad_norm": 2.326152313097224, "language_loss": 0.7581557, "learning_rate": 1.36080434553178e-06, "loss": 0.77982271, "num_input_tokens_seen": 216187105, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 10039, "time_per_iteration": 2.5935542583465576 }, { "auxiliary_loss_clip": 0.01117556, "auxiliary_loss_mlp": 0.01030327, "balance_loss_clip": 1.01702309, "balance_loss_mlp": 1.03649354, "epoch": 0.6036374567864121, "flos": 24936298093440.0, "grad_norm": 1.6972161624209314, "language_loss": 0.70874953, "learning_rate": 1.3604463501517338e-06, "loss": 0.7302283, "num_input_tokens_seen": 216205440, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 10040, "time_per_iteration": 2.7038190364837646 }, { "auxiliary_loss_clip": 0.01124619, "auxiliary_loss_mlp": 0.01034448, "balance_loss_clip": 1.01891506, "balance_loss_mlp": 1.0384655, "epoch": 0.6036975800390801, "flos": 23148269953920.0, "grad_norm": 1.9756339514357413, "language_loss": 0.77913386, "learning_rate": 1.3600883775957123e-06, "loss": 0.80072451, "num_input_tokens_seen": 216223130, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.7734375, "step": 10041, "time_per_iteration": 2.522324562072754 }, { "auxiliary_loss_clip": 0.01127617, "auxiliary_loss_mlp": 0.01032767, "balance_loss_clip": 1.01986289, "balance_loss_mlp": 1.03853774, "epoch": 0.603757703291748, "flos": 18660729127680.0, "grad_norm": 1.835181673559444, "language_loss": 0.75314134, "learning_rate": 1.3597304278764909e-06, "loss": 0.77474523, "num_input_tokens_seen": 216240260, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 10042, "time_per_iteration": 2.66221022605896 }, { "auxiliary_loss_clip": 0.01143089, "auxiliary_loss_mlp": 0.01029066, "balance_loss_clip": 1.01672769, "balance_loss_mlp": 1.03688288, "epoch": 0.6038178265444161, "flos": 19682603147520.0, "grad_norm": 1.8658999097272597, "language_loss": 0.84472638, "learning_rate": 1.3593725010068431e-06, "loss": 0.86644793, "num_input_tokens_seen": 216258510, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 10043, "time_per_iteration": 2.5407702922821045 }, { "auxiliary_loss_clip": 0.01140154, "auxiliary_loss_mlp": 0.01037513, "balance_loss_clip": 1.02298188, "balance_loss_mlp": 1.03729069, "epoch": 0.603877949797084, "flos": 22638230784000.0, "grad_norm": 1.8958236182676516, "language_loss": 0.69668055, "learning_rate": 1.3590145969995434e-06, "loss": 0.71845722, "num_input_tokens_seen": 216277550, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.76171875, "step": 10044, "time_per_iteration": 2.5573246479034424 }, { "auxiliary_loss_clip": 0.01122239, "auxiliary_loss_mlp": 0.01034503, "balance_loss_clip": 1.02035332, "balance_loss_mlp": 1.03889227, "epoch": 0.603938073049752, "flos": 25447881548160.0, "grad_norm": 1.9368614592010185, "language_loss": 0.78190732, "learning_rate": 1.3586567158673639e-06, "loss": 0.80347478, "num_input_tokens_seen": 216296690, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7421875, "step": 10045, "time_per_iteration": 2.5448949337005615 }, { "auxiliary_loss_clip": 0.01153169, "auxiliary_loss_mlp": 0.01032271, "balance_loss_clip": 1.01833534, "balance_loss_mlp": 1.03730488, "epoch": 0.6039981963024199, "flos": 22340135813760.0, "grad_norm": 1.5533878025436072, "language_loss": 0.76735055, "learning_rate": 1.3582988576230761e-06, "loss": 0.78920496, "num_input_tokens_seen": 216316110, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 10046, "time_per_iteration": 2.555628538131714 }, { "auxiliary_loss_clip": 0.01143362, "auxiliary_loss_mlp": 0.01031314, "balance_loss_clip": 1.01918399, "balance_loss_mlp": 1.03539884, "epoch": 0.6040583195550879, "flos": 20703148364160.0, "grad_norm": 2.1531455054815125, "language_loss": 0.86875665, "learning_rate": 1.3579410222794515e-06, "loss": 0.89050347, "num_input_tokens_seen": 216333855, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7265625, "step": 10047, "time_per_iteration": 2.5291476249694824 }, { "auxiliary_loss_clip": 0.01117406, "auxiliary_loss_mlp": 0.01028828, "balance_loss_clip": 1.01646554, "balance_loss_mlp": 1.03807032, "epoch": 0.604118442807756, "flos": 27163118776320.0, "grad_norm": 1.5305404055948506, "language_loss": 0.75529623, "learning_rate": 1.3575832098492601e-06, "loss": 0.77675855, "num_input_tokens_seen": 216354890, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 10048, "time_per_iteration": 2.5678696632385254 }, { "auxiliary_loss_clip": 0.01127945, "auxiliary_loss_mlp": 0.01041873, "balance_loss_clip": 1.02612591, "balance_loss_mlp": 1.03925824, "epoch": 0.6041785660604239, "flos": 30881524654080.0, "grad_norm": 2.235599836055448, "language_loss": 0.66397333, "learning_rate": 1.357225420345272e-06, "loss": 0.68567151, "num_input_tokens_seen": 216376055, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.796875, "step": 10049, "time_per_iteration": 2.653369426727295 }, { "auxiliary_loss_clip": 0.01128015, "auxiliary_loss_mlp": 0.01034835, "balance_loss_clip": 1.02199578, "balance_loss_mlp": 1.03854656, "epoch": 0.6042386893130919, "flos": 19938215306880.0, "grad_norm": 1.563263535243117, "language_loss": 0.66313547, "learning_rate": 1.356867653780255e-06, "loss": 0.68476403, "num_input_tokens_seen": 216396295, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 10050, "time_per_iteration": 2.546856641769409 }, { "auxiliary_loss_clip": 0.0113846, "auxiliary_loss_mlp": 0.01035599, "balance_loss_clip": 1.02225363, "balance_loss_mlp": 1.03884101, "epoch": 0.6042988125657598, "flos": 32415915882240.0, "grad_norm": 1.672874027154584, "language_loss": 0.69331217, "learning_rate": 1.356509910166977e-06, "loss": 0.71505272, "num_input_tokens_seen": 216416605, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 10051, "time_per_iteration": 2.6004951000213623 }, { "auxiliary_loss_clip": 0.01114916, "auxiliary_loss_mlp": 0.010325, "balance_loss_clip": 1.01946449, "balance_loss_mlp": 1.03408575, "epoch": 0.6043589358184278, "flos": 17420805596160.0, "grad_norm": 1.789792051917459, "language_loss": 0.64463055, "learning_rate": 1.3561521895182054e-06, "loss": 0.66610473, "num_input_tokens_seen": 216435130, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 10052, "time_per_iteration": 2.589871883392334 }, { "auxiliary_loss_clip": 0.01127181, "auxiliary_loss_mlp": 0.01035711, "balance_loss_clip": 1.02190697, "balance_loss_mlp": 1.03676414, "epoch": 0.6044190590710957, "flos": 27672834723840.0, "grad_norm": 1.684837930301507, "language_loss": 0.68778551, "learning_rate": 1.3557944918467052e-06, "loss": 0.70941436, "num_input_tokens_seen": 216455640, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 10053, "time_per_iteration": 2.5756020545959473 }, { "auxiliary_loss_clip": 0.01050068, "auxiliary_loss_mlp": 0.0101002, "balance_loss_clip": 1.00854206, "balance_loss_mlp": 1.00946724, "epoch": 0.6044791823237637, "flos": 65316267515520.0, "grad_norm": 0.7273794831255826, "language_loss": 0.60499597, "learning_rate": 1.355436817165243e-06, "loss": 0.62559682, "num_input_tokens_seen": 216518130, "router_z_loss_clip": 0.01477051, "router_z_loss_mlp": 0.22753906, "step": 10054, "time_per_iteration": 3.2555007934570312 }, { "auxiliary_loss_clip": 0.01152965, "auxiliary_loss_mlp": 0.01030751, "balance_loss_clip": 1.01706016, "balance_loss_mlp": 1.03658521, "epoch": 0.6045393055764317, "flos": 24492369905280.0, "grad_norm": 1.6830807680184956, "language_loss": 0.85444152, "learning_rate": 1.355079165486583e-06, "loss": 0.87627864, "num_input_tokens_seen": 216536845, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 10055, "time_per_iteration": 2.6394827365875244 }, { "auxiliary_loss_clip": 0.01134291, "auxiliary_loss_mlp": 0.01044866, "balance_loss_clip": 1.03031611, "balance_loss_mlp": 1.04217267, "epoch": 0.6045994288290997, "flos": 19054345340160.0, "grad_norm": 4.750877646919236, "language_loss": 0.73824549, "learning_rate": 1.3547215368234879e-06, "loss": 0.760037, "num_input_tokens_seen": 216551860, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7421875, "step": 10056, "time_per_iteration": 2.627794027328491 }, { "auxiliary_loss_clip": 0.0113687, "auxiliary_loss_mlp": 0.01035477, "balance_loss_clip": 1.02271533, "balance_loss_mlp": 1.03895044, "epoch": 0.6046595520817676, "flos": 26576697335040.0, "grad_norm": 1.5050742339456236, "language_loss": 0.80379498, "learning_rate": 1.3543639311887221e-06, "loss": 0.82551849, "num_input_tokens_seen": 216574775, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 10057, "time_per_iteration": 2.6776347160339355 }, { "auxiliary_loss_clip": 0.0110739, "auxiliary_loss_mlp": 0.01281668, "balance_loss_clip": 1.02109146, "balance_loss_mlp": 1.03653443, "epoch": 0.6047196753344356, "flos": 13582277660160.0, "grad_norm": 2.244063439053704, "language_loss": 0.75687557, "learning_rate": 1.3540063485950462e-06, "loss": 0.78076613, "num_input_tokens_seen": 216590100, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7109375, "step": 10058, "time_per_iteration": 2.5434067249298096 }, { "auxiliary_loss_clip": 0.01117135, "auxiliary_loss_mlp": 0.01033921, "balance_loss_clip": 1.021029, "balance_loss_mlp": 1.03666949, "epoch": 0.6047797985871035, "flos": 25520456977920.0, "grad_norm": 2.1086966573742947, "language_loss": 0.70734501, "learning_rate": 1.3536487890552224e-06, "loss": 0.72885561, "num_input_tokens_seen": 216610145, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 10059, "time_per_iteration": 2.604252815246582 }, { "auxiliary_loss_clip": 0.01128701, "auxiliary_loss_mlp": 0.01030688, "balance_loss_clip": 1.01726532, "balance_loss_mlp": 1.03790474, "epoch": 0.6048399218397715, "flos": 20520147548160.0, "grad_norm": 1.7222432472395302, "language_loss": 0.75966704, "learning_rate": 1.3532912525820104e-06, "loss": 0.78126085, "num_input_tokens_seen": 216630625, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 10060, "time_per_iteration": 2.5626678466796875 }, { "auxiliary_loss_clip": 0.01117331, "auxiliary_loss_mlp": 0.01033424, "balance_loss_clip": 1.0201503, "balance_loss_mlp": 1.03718376, "epoch": 0.6049000450924396, "flos": 20408788408320.0, "grad_norm": 2.0150646473084803, "language_loss": 0.73616529, "learning_rate": 1.3529337391881704e-06, "loss": 0.75767285, "num_input_tokens_seen": 216649255, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 10061, "time_per_iteration": 2.5731990337371826 }, { "auxiliary_loss_clip": 0.01146057, "auxiliary_loss_mlp": 0.01277587, "balance_loss_clip": 1.0174427, "balance_loss_mlp": 1.03671229, "epoch": 0.6049601683451075, "flos": 20741357456640.0, "grad_norm": 1.7639776632419497, "language_loss": 0.67360878, "learning_rate": 1.3525762488864606e-06, "loss": 0.69784522, "num_input_tokens_seen": 216668100, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 10062, "time_per_iteration": 2.6137356758117676 }, { "auxiliary_loss_clip": 0.01120011, "auxiliary_loss_mlp": 0.01035535, "balance_loss_clip": 1.02196288, "balance_loss_mlp": 1.03808975, "epoch": 0.6050202915977755, "flos": 20083114771200.0, "grad_norm": 1.9561835931117537, "language_loss": 0.71751904, "learning_rate": 1.3522187816896392e-06, "loss": 0.73907453, "num_input_tokens_seen": 216686125, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 10063, "time_per_iteration": 2.577609062194824 }, { "auxiliary_loss_clip": 0.01108045, "auxiliary_loss_mlp": 0.01039224, "balance_loss_clip": 1.02634966, "balance_loss_mlp": 1.03755307, "epoch": 0.6050804148504434, "flos": 15960821391360.0, "grad_norm": 1.7671018834115373, "language_loss": 0.84655482, "learning_rate": 1.351861337610463e-06, "loss": 0.86802745, "num_input_tokens_seen": 216704265, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 10064, "time_per_iteration": 2.5114707946777344 }, { "auxiliary_loss_clip": 0.01119095, "auxiliary_loss_mlp": 0.01033627, "balance_loss_clip": 1.02035928, "balance_loss_mlp": 1.03750658, "epoch": 0.6051405381031114, "flos": 17456644391040.0, "grad_norm": 2.4709477966242352, "language_loss": 0.79447144, "learning_rate": 1.3515039166616885e-06, "loss": 0.81599867, "num_input_tokens_seen": 216721765, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 10065, "time_per_iteration": 2.600574493408203 }, { "auxiliary_loss_clip": 0.01150936, "auxiliary_loss_mlp": 0.01032687, "balance_loss_clip": 1.01805425, "balance_loss_mlp": 1.03931797, "epoch": 0.6052006613557793, "flos": 11400130517760.0, "grad_norm": 1.8937967400718483, "language_loss": 0.78770578, "learning_rate": 1.3511465188560717e-06, "loss": 0.80954206, "num_input_tokens_seen": 216738295, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7578125, "step": 10066, "time_per_iteration": 2.520291805267334 }, { "auxiliary_loss_clip": 0.01153351, "auxiliary_loss_mlp": 0.01034068, "balance_loss_clip": 1.02034736, "balance_loss_mlp": 1.03639865, "epoch": 0.6052607846084473, "flos": 24750998807040.0, "grad_norm": 1.8146086672930857, "language_loss": 0.73054278, "learning_rate": 1.350789144206366e-06, "loss": 0.75241697, "num_input_tokens_seen": 216759875, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 10067, "time_per_iteration": 2.618521213531494 }, { "auxiliary_loss_clip": 0.01129648, "auxiliary_loss_mlp": 0.01275915, "balance_loss_clip": 1.01621568, "balance_loss_mlp": 1.03926134, "epoch": 0.6053209078611153, "flos": 20741141975040.0, "grad_norm": 1.337139327142562, "language_loss": 0.68934631, "learning_rate": 1.350431792725326e-06, "loss": 0.71340191, "num_input_tokens_seen": 216780705, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 10068, "time_per_iteration": 4.04133677482605 }, { "auxiliary_loss_clip": 0.01117567, "auxiliary_loss_mlp": 0.01031896, "balance_loss_clip": 1.01856232, "balance_loss_mlp": 1.03441072, "epoch": 0.6053810311137833, "flos": 18953149749120.0, "grad_norm": 2.0966838670766763, "language_loss": 0.87582135, "learning_rate": 1.3500744644257043e-06, "loss": 0.89731598, "num_input_tokens_seen": 216797625, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 10069, "time_per_iteration": 2.5559213161468506 }, { "auxiliary_loss_clip": 0.01134244, "auxiliary_loss_mlp": 0.01027915, "balance_loss_clip": 1.01501703, "balance_loss_mlp": 1.03567147, "epoch": 0.6054411543664512, "flos": 23951124794880.0, "grad_norm": 1.8617318105756546, "language_loss": 0.82887447, "learning_rate": 1.349717159320253e-06, "loss": 0.85049605, "num_input_tokens_seen": 216817610, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 10070, "time_per_iteration": 2.639116048812866 }, { "auxiliary_loss_clip": 0.01129153, "auxiliary_loss_mlp": 0.01036565, "balance_loss_clip": 1.02298689, "balance_loss_mlp": 1.03736269, "epoch": 0.6055012776191192, "flos": 20593979953920.0, "grad_norm": 1.8135372877074574, "language_loss": 0.8597576, "learning_rate": 1.349359877421724e-06, "loss": 0.88141477, "num_input_tokens_seen": 216836835, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 10071, "time_per_iteration": 2.6065890789031982 }, { "auxiliary_loss_clip": 0.01123237, "auxiliary_loss_mlp": 0.01034207, "balance_loss_clip": 1.01934707, "balance_loss_mlp": 1.03941393, "epoch": 0.6055614008717871, "flos": 30298191782400.0, "grad_norm": 1.7442983547772213, "language_loss": 0.76930833, "learning_rate": 1.3490026187428668e-06, "loss": 0.79088283, "num_input_tokens_seen": 216856760, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.75, "step": 10072, "time_per_iteration": 2.6236841678619385 }, { "auxiliary_loss_clip": 0.01126277, "auxiliary_loss_mlp": 0.0103136, "balance_loss_clip": 1.01713884, "balance_loss_mlp": 1.0351938, "epoch": 0.6056215241244551, "flos": 27125017424640.0, "grad_norm": 1.7178985897073789, "language_loss": 0.74127823, "learning_rate": 1.3486453832964318e-06, "loss": 0.7628547, "num_input_tokens_seen": 216878795, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.734375, "step": 10073, "time_per_iteration": 4.024399757385254 }, { "auxiliary_loss_clip": 0.0106859, "auxiliary_loss_mlp": 0.01008438, "balance_loss_clip": 1.00690007, "balance_loss_mlp": 1.00969696, "epoch": 0.6056816473771232, "flos": 56007323925120.0, "grad_norm": 0.7675237187200838, "language_loss": 0.55193496, "learning_rate": 1.3482881710951674e-06, "loss": 0.57270527, "num_input_tokens_seen": 216937800, "router_z_loss_clip": 0.01538086, "router_z_loss_mlp": 0.22753906, "step": 10074, "time_per_iteration": 3.2495837211608887 }, { "auxiliary_loss_clip": 0.01136586, "auxiliary_loss_mlp": 0.01034182, "balance_loss_clip": 1.02065802, "balance_loss_mlp": 1.03834486, "epoch": 0.6057417706297911, "flos": 23549499849600.0, "grad_norm": 1.6821321603038273, "language_loss": 0.81774461, "learning_rate": 1.347930982151822e-06, "loss": 0.83945227, "num_input_tokens_seen": 216955280, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 10075, "time_per_iteration": 2.572880744934082 }, { "auxiliary_loss_clip": 0.01118303, "auxiliary_loss_mlp": 0.01027295, "balance_loss_clip": 1.01338291, "balance_loss_mlp": 1.0362606, "epoch": 0.6058018938824591, "flos": 27744296832000.0, "grad_norm": 2.0145119560942795, "language_loss": 0.78114736, "learning_rate": 1.3475738164791425e-06, "loss": 0.80260336, "num_input_tokens_seen": 216976950, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 10076, "time_per_iteration": 2.5584499835968018 }, { "auxiliary_loss_clip": 0.01120585, "auxiliary_loss_mlp": 0.01036216, "balance_loss_clip": 1.02230418, "balance_loss_mlp": 1.03682923, "epoch": 0.605862017135127, "flos": 22783381643520.0, "grad_norm": 1.8694524642716321, "language_loss": 0.72460949, "learning_rate": 1.3472166740898754e-06, "loss": 0.74617755, "num_input_tokens_seen": 216996945, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.74609375, "step": 10077, "time_per_iteration": 2.5467734336853027 }, { "auxiliary_loss_clip": 0.01111847, "auxiliary_loss_mlp": 0.01035599, "balance_loss_clip": 1.02149069, "balance_loss_mlp": 1.03862059, "epoch": 0.605922140387795, "flos": 21215019127680.0, "grad_norm": 1.6648057958050768, "language_loss": 0.55579793, "learning_rate": 1.3468595549967657e-06, "loss": 0.57727236, "num_input_tokens_seen": 217016580, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.73046875, "step": 10078, "time_per_iteration": 2.543501138687134 }, { "auxiliary_loss_clip": 0.01146858, "auxiliary_loss_mlp": 0.01029836, "balance_loss_clip": 1.01631737, "balance_loss_mlp": 1.03729475, "epoch": 0.6059822636404629, "flos": 27268372604160.0, "grad_norm": 1.8877537251283834, "language_loss": 0.70339811, "learning_rate": 1.3465024592125588e-06, "loss": 0.72516501, "num_input_tokens_seen": 217037300, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 10079, "time_per_iteration": 5.653556823730469 }, { "auxiliary_loss_clip": 0.01127282, "auxiliary_loss_mlp": 0.01039745, "balance_loss_clip": 1.02629769, "balance_loss_mlp": 1.03689432, "epoch": 0.606042386893131, "flos": 20631327120000.0, "grad_norm": 1.9782787240864212, "language_loss": 0.62042499, "learning_rate": 1.3461453867499975e-06, "loss": 0.64209521, "num_input_tokens_seen": 217055805, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 10080, "time_per_iteration": 2.6224470138549805 }, { "auxiliary_loss_clip": 0.01129692, "auxiliary_loss_mlp": 0.01032165, "balance_loss_clip": 1.01800346, "balance_loss_mlp": 1.03801894, "epoch": 0.6061025101457989, "flos": 23002293081600.0, "grad_norm": 2.1432952876194578, "language_loss": 0.71221364, "learning_rate": 1.3457883376218262e-06, "loss": 0.73383224, "num_input_tokens_seen": 217074175, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7421875, "step": 10081, "time_per_iteration": 2.5596649646759033 }, { "auxiliary_loss_clip": 0.01127422, "auxiliary_loss_mlp": 0.01030335, "balance_loss_clip": 1.01704955, "balance_loss_mlp": 1.03832102, "epoch": 0.6061626333984669, "flos": 29898937134720.0, "grad_norm": 1.6697048964263044, "language_loss": 0.69228423, "learning_rate": 1.345431311840786e-06, "loss": 0.71386182, "num_input_tokens_seen": 217095695, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 10082, "time_per_iteration": 2.722649335861206 }, { "auxiliary_loss_clip": 0.0111, "auxiliary_loss_mlp": 0.01028531, "balance_loss_clip": 1.01560855, "balance_loss_mlp": 1.03880703, "epoch": 0.6062227566511348, "flos": 25009196745600.0, "grad_norm": 3.4302409616407328, "language_loss": 0.65769523, "learning_rate": 1.3450743094196183e-06, "loss": 0.67908055, "num_input_tokens_seen": 217116260, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 10083, "time_per_iteration": 2.592022657394409 }, { "auxiliary_loss_clip": 0.01117821, "auxiliary_loss_mlp": 0.01034688, "balance_loss_clip": 1.0206275, "balance_loss_mlp": 1.03653359, "epoch": 0.6062828799038028, "flos": 19463943104640.0, "grad_norm": 2.2846615477198515, "language_loss": 0.74393892, "learning_rate": 1.3447173303710644e-06, "loss": 0.76546401, "num_input_tokens_seen": 217134465, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 10084, "time_per_iteration": 2.6356465816497803 }, { "auxiliary_loss_clip": 0.01128205, "auxiliary_loss_mlp": 0.01035857, "balance_loss_clip": 1.02208829, "balance_loss_mlp": 1.03655982, "epoch": 0.6063430031564707, "flos": 13589568120960.0, "grad_norm": 3.0502196532479013, "language_loss": 0.72242504, "learning_rate": 1.3443603747078625e-06, "loss": 0.7440657, "num_input_tokens_seen": 217149920, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73828125, "step": 10085, "time_per_iteration": 2.5983150005340576 }, { "auxiliary_loss_clip": 0.01109682, "auxiliary_loss_mlp": 0.01033162, "balance_loss_clip": 1.02024531, "balance_loss_mlp": 1.0376519, "epoch": 0.6064031264091387, "flos": 23255499029760.0, "grad_norm": 1.9946067042925082, "language_loss": 0.76550972, "learning_rate": 1.344003442442753e-06, "loss": 0.78693819, "num_input_tokens_seen": 217168165, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 10086, "time_per_iteration": 2.631844997406006 }, { "auxiliary_loss_clip": 0.01126466, "auxiliary_loss_mlp": 0.01035037, "balance_loss_clip": 1.02166152, "balance_loss_mlp": 1.03787947, "epoch": 0.6064632496618068, "flos": 22458462192000.0, "grad_norm": 1.768848031147198, "language_loss": 0.72478426, "learning_rate": 1.3436465335884728e-06, "loss": 0.74639922, "num_input_tokens_seen": 217190070, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 10087, "time_per_iteration": 2.6305036544799805 }, { "auxiliary_loss_clip": 0.0111389, "auxiliary_loss_mlp": 0.01029046, "balance_loss_clip": 1.01739967, "balance_loss_mlp": 1.03549147, "epoch": 0.6065233729144747, "flos": 25118652464640.0, "grad_norm": 1.6408787071120547, "language_loss": 0.83824456, "learning_rate": 1.3432896481577597e-06, "loss": 0.85967398, "num_input_tokens_seen": 217209370, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 10088, "time_per_iteration": 2.6723666191101074 }, { "auxiliary_loss_clip": 0.01143203, "auxiliary_loss_mlp": 0.01275112, "balance_loss_clip": 1.01421225, "balance_loss_mlp": 1.03633237, "epoch": 0.6065834961671427, "flos": 23477355383040.0, "grad_norm": 1.9883255881054875, "language_loss": 0.70892012, "learning_rate": 1.3429327861633501e-06, "loss": 0.73310328, "num_input_tokens_seen": 217226990, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7109375, "step": 10089, "time_per_iteration": 2.6702170372009277 }, { "auxiliary_loss_clip": 0.01119105, "auxiliary_loss_mlp": 0.01034852, "balance_loss_clip": 1.02203655, "balance_loss_mlp": 1.03796768, "epoch": 0.6066436194198106, "flos": 17019396132480.0, "grad_norm": 3.0676170782750427, "language_loss": 0.82888985, "learning_rate": 1.3425759476179785e-06, "loss": 0.85042942, "num_input_tokens_seen": 217244585, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 10090, "time_per_iteration": 2.6126694679260254 }, { "auxiliary_loss_clip": 0.01142875, "auxiliary_loss_mlp": 0.01035941, "balance_loss_clip": 1.02138519, "balance_loss_mlp": 1.03770602, "epoch": 0.6067037426724786, "flos": 18514752255360.0, "grad_norm": 2.058927585110635, "language_loss": 0.75528097, "learning_rate": 1.3422191325343808e-06, "loss": 0.77706915, "num_input_tokens_seen": 217263435, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78515625, "step": 10091, "time_per_iteration": 2.6422505378723145 }, { "auxiliary_loss_clip": 0.01105805, "auxiliary_loss_mlp": 0.01034644, "balance_loss_clip": 1.0214293, "balance_loss_mlp": 1.03574395, "epoch": 0.6067638659251465, "flos": 22345989730560.0, "grad_norm": 1.5841400812954793, "language_loss": 0.79251307, "learning_rate": 1.3418623409252899e-06, "loss": 0.81391758, "num_input_tokens_seen": 217283725, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 10092, "time_per_iteration": 2.686310291290283 }, { "auxiliary_loss_clip": 0.01129379, "auxiliary_loss_mlp": 0.01038617, "balance_loss_clip": 1.0243175, "balance_loss_mlp": 1.03696239, "epoch": 0.6068239891778145, "flos": 12451019748480.0, "grad_norm": 1.7906387525891925, "language_loss": 0.76170862, "learning_rate": 1.3415055728034394e-06, "loss": 0.78338861, "num_input_tokens_seen": 217301120, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75, "step": 10093, "time_per_iteration": 2.6221330165863037 }, { "auxiliary_loss_clip": 0.01125167, "auxiliary_loss_mlp": 0.01035402, "balance_loss_clip": 1.02278924, "balance_loss_mlp": 1.03783464, "epoch": 0.6068841124304825, "flos": 23185868515200.0, "grad_norm": 1.7517307737901242, "language_loss": 0.87273049, "learning_rate": 1.3411488281815611e-06, "loss": 0.89433622, "num_input_tokens_seen": 217319585, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 10094, "time_per_iteration": 2.6185574531555176 }, { "auxiliary_loss_clip": 0.011174, "auxiliary_loss_mlp": 0.01033417, "balance_loss_clip": 1.01985121, "balance_loss_mlp": 1.03679848, "epoch": 0.6069442356831505, "flos": 18587902302720.0, "grad_norm": 1.9242645585036133, "language_loss": 0.72331274, "learning_rate": 1.340792107072386e-06, "loss": 0.74482083, "num_input_tokens_seen": 217338880, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 10095, "time_per_iteration": 2.5897912979125977 }, { "auxiliary_loss_clip": 0.01125077, "auxiliary_loss_mlp": 0.01030801, "balance_loss_clip": 1.01809382, "balance_loss_mlp": 1.03586376, "epoch": 0.6070043589358184, "flos": 20960340721920.0, "grad_norm": 1.633529620121712, "language_loss": 0.76512951, "learning_rate": 1.3404354094886454e-06, "loss": 0.78668827, "num_input_tokens_seen": 217357480, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 10096, "time_per_iteration": 2.630539655685425 }, { "auxiliary_loss_clip": 0.01111118, "auxiliary_loss_mlp": 0.01041088, "balance_loss_clip": 1.02811158, "balance_loss_mlp": 1.03795457, "epoch": 0.6070644821884864, "flos": 11692443398400.0, "grad_norm": 1.8015651047080512, "language_loss": 0.79367435, "learning_rate": 1.3400787354430683e-06, "loss": 0.81519639, "num_input_tokens_seen": 217374575, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.73046875, "step": 10097, "time_per_iteration": 2.6211585998535156 }, { "auxiliary_loss_clip": 0.01123982, "auxiliary_loss_mlp": 0.01029854, "balance_loss_clip": 1.01703846, "balance_loss_mlp": 1.03591716, "epoch": 0.6071246054411543, "flos": 19310568030720.0, "grad_norm": 1.3839379710448803, "language_loss": 0.67367649, "learning_rate": 1.3397220849483837e-06, "loss": 0.69521487, "num_input_tokens_seen": 217392950, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 10098, "time_per_iteration": 2.5521395206451416 }, { "auxiliary_loss_clip": 0.01107226, "auxiliary_loss_mlp": 0.0103023, "balance_loss_clip": 1.01777864, "balance_loss_mlp": 1.0363996, "epoch": 0.6071847286938223, "flos": 17749029098880.0, "grad_norm": 2.263725430318227, "language_loss": 0.80815005, "learning_rate": 1.3393654580173194e-06, "loss": 0.82952464, "num_input_tokens_seen": 217412145, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.70703125, "step": 10099, "time_per_iteration": 2.56844425201416 }, { "auxiliary_loss_clip": 0.01138698, "auxiliary_loss_mlp": 0.01038676, "balance_loss_clip": 1.02486002, "balance_loss_mlp": 1.03619468, "epoch": 0.6072448519464904, "flos": 22637512512000.0, "grad_norm": 2.8627284332970406, "language_loss": 0.70962155, "learning_rate": 1.3390088546626023e-06, "loss": 0.73139524, "num_input_tokens_seen": 217432080, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 10100, "time_per_iteration": 2.6451971530914307 }, { "auxiliary_loss_clip": 0.01139502, "auxiliary_loss_mlp": 0.01036271, "balance_loss_clip": 1.02235961, "balance_loss_mlp": 1.03940058, "epoch": 0.6073049751991583, "flos": 19537308633600.0, "grad_norm": 2.3176662812418107, "language_loss": 0.70980084, "learning_rate": 1.338652274896959e-06, "loss": 0.73155856, "num_input_tokens_seen": 217450945, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73046875, "step": 10101, "time_per_iteration": 2.6540298461914062 }, { "auxiliary_loss_clip": 0.01124463, "auxiliary_loss_mlp": 0.01032992, "balance_loss_clip": 1.02014685, "balance_loss_mlp": 1.03620291, "epoch": 0.6073650984518263, "flos": 28294233033600.0, "grad_norm": 2.5414044570063097, "language_loss": 0.69373083, "learning_rate": 1.3382957187331147e-06, "loss": 0.71530545, "num_input_tokens_seen": 217473105, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 10102, "time_per_iteration": 2.659430503845215 }, { "auxiliary_loss_clip": 0.01110293, "auxiliary_loss_mlp": 0.01036573, "balance_loss_clip": 1.02393103, "balance_loss_mlp": 1.03742659, "epoch": 0.6074252217044942, "flos": 25664422688640.0, "grad_norm": 1.9174087904817019, "language_loss": 0.7328763, "learning_rate": 1.3379391861837945e-06, "loss": 0.75434494, "num_input_tokens_seen": 217491780, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73046875, "step": 10103, "time_per_iteration": 2.6494178771972656 }, { "auxiliary_loss_clip": 0.01142721, "auxiliary_loss_mlp": 0.01035377, "balance_loss_clip": 1.02138174, "balance_loss_mlp": 1.04002285, "epoch": 0.6074853449571622, "flos": 22857106308480.0, "grad_norm": 1.6833899767970577, "language_loss": 0.76733792, "learning_rate": 1.3375826772617212e-06, "loss": 0.78911889, "num_input_tokens_seen": 217510605, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75390625, "step": 10104, "time_per_iteration": 2.62385892868042 }, { "auxiliary_loss_clip": 0.0105822, "auxiliary_loss_mlp": 0.01010708, "balance_loss_clip": 1.00928354, "balance_loss_mlp": 1.00927043, "epoch": 0.6075454682098301, "flos": 67111406547840.0, "grad_norm": 0.7091448965414868, "language_loss": 0.55951476, "learning_rate": 1.3372261919796187e-06, "loss": 0.58020401, "num_input_tokens_seen": 217574815, "router_z_loss_clip": 0.01422119, "router_z_loss_mlp": 0.22753906, "step": 10105, "time_per_iteration": 3.213465452194214 }, { "auxiliary_loss_clip": 0.01041111, "auxiliary_loss_mlp": 0.01009569, "balance_loss_clip": 1.00814426, "balance_loss_mlp": 1.0096215, "epoch": 0.6076055914624982, "flos": 70712024751360.0, "grad_norm": 0.7715399960081046, "language_loss": 0.56824636, "learning_rate": 1.3368697303502083e-06, "loss": 0.58875316, "num_input_tokens_seen": 217632375, "router_z_loss_clip": 0.01422119, "router_z_loss_mlp": 0.22753906, "step": 10106, "time_per_iteration": 3.161024808883667 }, { "auxiliary_loss_clip": 0.01143391, "auxiliary_loss_mlp": 0.01031695, "balance_loss_clip": 1.01920748, "balance_loss_mlp": 1.03708887, "epoch": 0.6076657147151661, "flos": 28364545906560.0, "grad_norm": 1.4149141082092098, "language_loss": 0.68325001, "learning_rate": 1.3365132923862112e-06, "loss": 0.70500082, "num_input_tokens_seen": 217653055, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 10107, "time_per_iteration": 2.6222660541534424 }, { "auxiliary_loss_clip": 0.01122382, "auxiliary_loss_mlp": 0.01030967, "balance_loss_clip": 1.01750207, "balance_loss_mlp": 1.03894114, "epoch": 0.6077258379678341, "flos": 15049767807360.0, "grad_norm": 2.187278707547959, "language_loss": 0.80901951, "learning_rate": 1.3361568781003485e-06, "loss": 0.83055305, "num_input_tokens_seen": 217671520, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 10108, "time_per_iteration": 2.618875503540039 }, { "auxiliary_loss_clip": 0.01134135, "auxiliary_loss_mlp": 0.01036513, "balance_loss_clip": 1.02225566, "balance_loss_mlp": 1.03968787, "epoch": 0.607785961220502, "flos": 36167251553280.0, "grad_norm": 1.5897027966422363, "language_loss": 0.71074116, "learning_rate": 1.3358004875053387e-06, "loss": 0.73244762, "num_input_tokens_seen": 217691880, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 10109, "time_per_iteration": 2.6754238605499268 }, { "auxiliary_loss_clip": 0.01122345, "auxiliary_loss_mlp": 0.01031465, "balance_loss_clip": 1.01956248, "balance_loss_mlp": 1.03593731, "epoch": 0.60784608447317, "flos": 22524249951360.0, "grad_norm": 1.6693869807196677, "language_loss": 0.80232382, "learning_rate": 1.3354441206139012e-06, "loss": 0.8238619, "num_input_tokens_seen": 217710530, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 10110, "time_per_iteration": 4.01737380027771 }, { "auxiliary_loss_clip": 0.01128913, "auxiliary_loss_mlp": 0.01029141, "balance_loss_clip": 1.01584315, "balance_loss_mlp": 1.03736997, "epoch": 0.6079062077258379, "flos": 23841166285440.0, "grad_norm": 2.375495904899441, "language_loss": 0.70775378, "learning_rate": 1.3350877774387541e-06, "loss": 0.72933435, "num_input_tokens_seen": 217728650, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 10111, "time_per_iteration": 2.78519344329834 }, { "auxiliary_loss_clip": 0.01137782, "auxiliary_loss_mlp": 0.01033284, "balance_loss_clip": 1.0196166, "balance_loss_mlp": 1.03818977, "epoch": 0.6079663309785059, "flos": 23367037737600.0, "grad_norm": 1.7926151252907963, "language_loss": 0.65054369, "learning_rate": 1.3347314579926137e-06, "loss": 0.67225432, "num_input_tokens_seen": 217747135, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 10112, "time_per_iteration": 2.590352773666382 }, { "auxiliary_loss_clip": 0.01041358, "auxiliary_loss_mlp": 0.01005344, "balance_loss_clip": 1.00402629, "balance_loss_mlp": 1.00955796, "epoch": 0.6080264542311739, "flos": 71382873110400.0, "grad_norm": 0.6453510380738343, "language_loss": 0.49584338, "learning_rate": 1.334375162288196e-06, "loss": 0.51631045, "num_input_tokens_seen": 217811860, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.2265625, "step": 10113, "time_per_iteration": 3.220329761505127 }, { "auxiliary_loss_clip": 0.01041352, "auxiliary_loss_mlp": 0.01003561, "balance_loss_clip": 1.00211263, "balance_loss_mlp": 1.00938678, "epoch": 0.6080865774838419, "flos": 66529833442560.0, "grad_norm": 0.8366678350774386, "language_loss": 0.56946397, "learning_rate": 1.3340188903382164e-06, "loss": 0.58991313, "num_input_tokens_seen": 217866510, "router_z_loss_clip": 0.01446533, "router_z_loss_mlp": 0.22851562, "step": 10114, "time_per_iteration": 4.45536470413208 }, { "auxiliary_loss_clip": 0.01113461, "auxiliary_loss_mlp": 0.01027772, "balance_loss_clip": 1.01357448, "balance_loss_mlp": 1.03887367, "epoch": 0.6081467007365099, "flos": 19207935895680.0, "grad_norm": 1.8638014965036653, "language_loss": 0.70317054, "learning_rate": 1.3336626421553897e-06, "loss": 0.72458279, "num_input_tokens_seen": 217885650, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.74609375, "step": 10115, "time_per_iteration": 2.5271849632263184 }, { "auxiliary_loss_clip": 0.01137985, "auxiliary_loss_mlp": 0.01029358, "balance_loss_clip": 1.01639354, "balance_loss_mlp": 1.03843236, "epoch": 0.6082068239891778, "flos": 24167737762560.0, "grad_norm": 10.500534825840942, "language_loss": 0.72532064, "learning_rate": 1.3333064177524296e-06, "loss": 0.74699402, "num_input_tokens_seen": 217905300, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 10116, "time_per_iteration": 2.651853084564209 }, { "auxiliary_loss_clip": 0.01130996, "auxiliary_loss_mlp": 0.01037073, "balance_loss_clip": 1.02263677, "balance_loss_mlp": 1.03798056, "epoch": 0.6082669472418458, "flos": 37413316310400.0, "grad_norm": 1.4016633211241378, "language_loss": 0.53017896, "learning_rate": 1.3329502171420478e-06, "loss": 0.55185962, "num_input_tokens_seen": 217927845, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75, "step": 10117, "time_per_iteration": 2.783392906188965 }, { "auxiliary_loss_clip": 0.01129881, "auxiliary_loss_mlp": 0.01028995, "balance_loss_clip": 1.01598382, "balance_loss_mlp": 1.03956187, "epoch": 0.6083270704945137, "flos": 15085534775040.0, "grad_norm": 1.7811312399174537, "language_loss": 0.70276976, "learning_rate": 1.3325940403369575e-06, "loss": 0.72435856, "num_input_tokens_seen": 217946145, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 10118, "time_per_iteration": 2.724271059036255 }, { "auxiliary_loss_clip": 0.01154088, "auxiliary_loss_mlp": 0.01032354, "balance_loss_clip": 1.01913953, "balance_loss_mlp": 1.03841531, "epoch": 0.6083871937471818, "flos": 20668458804480.0, "grad_norm": 1.5571892141212083, "language_loss": 0.74462652, "learning_rate": 1.3322378873498685e-06, "loss": 0.766491, "num_input_tokens_seen": 217965190, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 10119, "time_per_iteration": 2.6600356101989746 }, { "auxiliary_loss_clip": 0.01115871, "auxiliary_loss_mlp": 0.01032142, "balance_loss_clip": 1.01965439, "balance_loss_mlp": 1.03602171, "epoch": 0.6084473169998497, "flos": 21506901045120.0, "grad_norm": 11.302556934379055, "language_loss": 0.67187428, "learning_rate": 1.3318817581934922e-06, "loss": 0.69335437, "num_input_tokens_seen": 217983625, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 10120, "time_per_iteration": 5.78080940246582 }, { "auxiliary_loss_clip": 0.01121434, "auxiliary_loss_mlp": 0.01032423, "balance_loss_clip": 1.01855278, "balance_loss_mlp": 1.03841305, "epoch": 0.6085074402525177, "flos": 26870051710080.0, "grad_norm": 2.2881622834346844, "language_loss": 0.74354893, "learning_rate": 1.3315256528805373e-06, "loss": 0.76508754, "num_input_tokens_seen": 218006005, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 10121, "time_per_iteration": 2.610222578048706 }, { "auxiliary_loss_clip": 0.0111681, "auxiliary_loss_mlp": 0.01028271, "balance_loss_clip": 1.01592731, "balance_loss_mlp": 1.03620648, "epoch": 0.6085675635051856, "flos": 10889839952640.0, "grad_norm": 2.099386534456875, "language_loss": 0.80719203, "learning_rate": 1.3311695714237118e-06, "loss": 0.82864285, "num_input_tokens_seen": 218024195, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71875, "step": 10122, "time_per_iteration": 2.5551772117614746 }, { "auxiliary_loss_clip": 0.01119353, "auxiliary_loss_mlp": 0.01030395, "balance_loss_clip": 1.01741886, "balance_loss_mlp": 1.03835106, "epoch": 0.6086276867578536, "flos": 34862186707200.0, "grad_norm": 1.927450383839187, "language_loss": 0.55659997, "learning_rate": 1.3308135138357247e-06, "loss": 0.57809746, "num_input_tokens_seen": 218047190, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 10123, "time_per_iteration": 2.685718297958374 }, { "auxiliary_loss_clip": 0.01119335, "auxiliary_loss_mlp": 0.01035067, "balance_loss_clip": 1.02191257, "balance_loss_mlp": 1.03985977, "epoch": 0.6086878100105215, "flos": 20047706939520.0, "grad_norm": 2.1034606200656416, "language_loss": 0.74178278, "learning_rate": 1.330457480129281e-06, "loss": 0.76332682, "num_input_tokens_seen": 218065945, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 10124, "time_per_iteration": 2.5958199501037598 }, { "auxiliary_loss_clip": 0.0112856, "auxiliary_loss_mlp": 0.01033181, "balance_loss_clip": 1.02037168, "balance_loss_mlp": 1.03795373, "epoch": 0.6087479332631895, "flos": 18332469711360.0, "grad_norm": 5.492103538937525, "language_loss": 0.65327132, "learning_rate": 1.3301014703170883e-06, "loss": 0.67488867, "num_input_tokens_seen": 218085285, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 10125, "time_per_iteration": 2.517470359802246 }, { "auxiliary_loss_clip": 0.0110806, "auxiliary_loss_mlp": 0.0103102, "balance_loss_clip": 1.0178715, "balance_loss_mlp": 1.03738594, "epoch": 0.6088080565158575, "flos": 24493411399680.0, "grad_norm": 1.8131436807602603, "language_loss": 0.76161528, "learning_rate": 1.3297454844118503e-06, "loss": 0.78300607, "num_input_tokens_seen": 218104735, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 10126, "time_per_iteration": 2.607501983642578 }, { "auxiliary_loss_clip": 0.01139728, "auxiliary_loss_mlp": 0.01029862, "balance_loss_clip": 1.01599836, "balance_loss_mlp": 1.0377363, "epoch": 0.6088681797685255, "flos": 10269016260480.0, "grad_norm": 4.442583758949131, "language_loss": 0.70507091, "learning_rate": 1.3293895224262728e-06, "loss": 0.72676682, "num_input_tokens_seen": 218121855, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75390625, "step": 10127, "time_per_iteration": 2.525604009628296 }, { "auxiliary_loss_clip": 0.01115442, "auxiliary_loss_mlp": 0.01028868, "balance_loss_clip": 1.01441431, "balance_loss_mlp": 1.0396347, "epoch": 0.6089283030211935, "flos": 21973703218560.0, "grad_norm": 2.207696169264436, "language_loss": 0.72795624, "learning_rate": 1.3290335843730578e-06, "loss": 0.74939936, "num_input_tokens_seen": 218137325, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 10128, "time_per_iteration": 2.5291495323181152 }, { "auxiliary_loss_clip": 0.01144826, "auxiliary_loss_mlp": 0.01033905, "balance_loss_clip": 1.02092266, "balance_loss_mlp": 1.03739524, "epoch": 0.6089884262738614, "flos": 17785191116160.0, "grad_norm": 2.3807494544738232, "language_loss": 0.73378789, "learning_rate": 1.3286776702649078e-06, "loss": 0.75557524, "num_input_tokens_seen": 218155530, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 10129, "time_per_iteration": 2.6309077739715576 }, { "auxiliary_loss_clip": 0.01109294, "auxiliary_loss_mlp": 0.01034301, "balance_loss_clip": 1.02129579, "balance_loss_mlp": 1.03778648, "epoch": 0.6090485495265294, "flos": 36910423946880.0, "grad_norm": 1.9332564563441352, "language_loss": 0.65657485, "learning_rate": 1.3283217801145255e-06, "loss": 0.67801082, "num_input_tokens_seen": 218182535, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 10130, "time_per_iteration": 2.7091543674468994 }, { "auxiliary_loss_clip": 0.01122368, "auxiliary_loss_mlp": 0.01023808, "balance_loss_clip": 1.01207745, "balance_loss_mlp": 1.03578067, "epoch": 0.6091086727791973, "flos": 19899036547200.0, "grad_norm": 4.314603341188605, "language_loss": 0.76845247, "learning_rate": 1.3279659139346104e-06, "loss": 0.78991425, "num_input_tokens_seen": 218201740, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 10131, "time_per_iteration": 2.555598735809326 }, { "auxiliary_loss_clip": 0.01136159, "auxiliary_loss_mlp": 0.01032332, "balance_loss_clip": 1.02036893, "balance_loss_mlp": 1.03846002, "epoch": 0.6091687960318654, "flos": 22163635359360.0, "grad_norm": 2.331112954049474, "language_loss": 0.7701416, "learning_rate": 1.327610071737864e-06, "loss": 0.79182649, "num_input_tokens_seen": 218219800, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 10132, "time_per_iteration": 2.631167411804199 }, { "auxiliary_loss_clip": 0.0110554, "auxiliary_loss_mlp": 0.01031968, "balance_loss_clip": 1.02003503, "balance_loss_mlp": 1.03718805, "epoch": 0.6092289192845333, "flos": 21465280160640.0, "grad_norm": 2.2903173868450777, "language_loss": 0.75509411, "learning_rate": 1.3272542535369841e-06, "loss": 0.77646911, "num_input_tokens_seen": 218237585, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 10133, "time_per_iteration": 2.5137734413146973 }, { "auxiliary_loss_clip": 0.01116155, "auxiliary_loss_mlp": 0.01029185, "balance_loss_clip": 1.01572609, "balance_loss_mlp": 1.03606701, "epoch": 0.6092890425372013, "flos": 28694924225280.0, "grad_norm": 1.908228938998391, "language_loss": 0.63979858, "learning_rate": 1.3268984593446693e-06, "loss": 0.66125196, "num_input_tokens_seen": 218258700, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 10134, "time_per_iteration": 2.759021759033203 }, { "auxiliary_loss_clip": 0.01122971, "auxiliary_loss_mlp": 0.01031399, "balance_loss_clip": 1.01885247, "balance_loss_mlp": 1.03587365, "epoch": 0.6093491657898692, "flos": 20813178700800.0, "grad_norm": 1.7894362512913644, "language_loss": 0.78676498, "learning_rate": 1.326542689173617e-06, "loss": 0.80830866, "num_input_tokens_seen": 218275655, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 10135, "time_per_iteration": 2.605259418487549 }, { "auxiliary_loss_clip": 0.01125558, "auxiliary_loss_mlp": 0.01027867, "balance_loss_clip": 1.01611936, "balance_loss_mlp": 1.03624272, "epoch": 0.6094092890425372, "flos": 25446983708160.0, "grad_norm": 1.8339830358891467, "language_loss": 0.72045958, "learning_rate": 1.3261869430365237e-06, "loss": 0.74199378, "num_input_tokens_seen": 218295720, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.71875, "step": 10136, "time_per_iteration": 2.6577670574188232 }, { "auxiliary_loss_clip": 0.01110216, "auxiliary_loss_mlp": 0.01030374, "balance_loss_clip": 1.01776183, "balance_loss_mlp": 1.03813934, "epoch": 0.6094694122952051, "flos": 35621265847680.0, "grad_norm": 1.7304646617165729, "language_loss": 0.74463415, "learning_rate": 1.3258312209460859e-06, "loss": 0.76604009, "num_input_tokens_seen": 218316745, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 10137, "time_per_iteration": 2.659227132797241 }, { "auxiliary_loss_clip": 0.01119467, "auxiliary_loss_mlp": 0.01037279, "balance_loss_clip": 1.024369, "balance_loss_mlp": 1.03671622, "epoch": 0.6095295355478731, "flos": 24456962073600.0, "grad_norm": 1.8527204674245195, "language_loss": 0.80146694, "learning_rate": 1.325475522914997e-06, "loss": 0.82303441, "num_input_tokens_seen": 218335385, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 10138, "time_per_iteration": 2.5872433185577393 }, { "auxiliary_loss_clip": 0.01121476, "auxiliary_loss_mlp": 0.01033174, "balance_loss_clip": 1.02131844, "balance_loss_mlp": 1.03551626, "epoch": 0.6095896588005411, "flos": 15633208419840.0, "grad_norm": 1.5225841911802653, "language_loss": 0.80738378, "learning_rate": 1.3251198489559517e-06, "loss": 0.82893032, "num_input_tokens_seen": 218353320, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.68359375, "step": 10139, "time_per_iteration": 2.5809617042541504 }, { "auxiliary_loss_clip": 0.01139662, "auxiliary_loss_mlp": 0.01036268, "balance_loss_clip": 1.0231725, "balance_loss_mlp": 1.03904998, "epoch": 0.6096497820532091, "flos": 15550577182080.0, "grad_norm": 2.1365928574041115, "language_loss": 0.83773506, "learning_rate": 1.3247641990816432e-06, "loss": 0.85949433, "num_input_tokens_seen": 218365620, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 10140, "time_per_iteration": 2.638988733291626 }, { "auxiliary_loss_clip": 0.01118053, "auxiliary_loss_mlp": 0.01033915, "balance_loss_clip": 1.02101648, "balance_loss_mlp": 1.03609836, "epoch": 0.6097099053058771, "flos": 24204474397440.0, "grad_norm": 2.057259177656049, "language_loss": 0.75845402, "learning_rate": 1.324408573304763e-06, "loss": 0.77997375, "num_input_tokens_seen": 218383785, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 10141, "time_per_iteration": 2.5736923217773438 }, { "auxiliary_loss_clip": 0.01126073, "auxiliary_loss_mlp": 0.01029533, "balance_loss_clip": 1.01740336, "balance_loss_mlp": 1.03650987, "epoch": 0.609770028558545, "flos": 19570238426880.0, "grad_norm": 1.9523745643704211, "language_loss": 0.76521087, "learning_rate": 1.3240529716380038e-06, "loss": 0.78676689, "num_input_tokens_seen": 218399055, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71875, "step": 10142, "time_per_iteration": 2.568344831466675 }, { "auxiliary_loss_clip": 0.01109329, "auxiliary_loss_mlp": 0.01032897, "balance_loss_clip": 1.0198617, "balance_loss_mlp": 1.03727961, "epoch": 0.609830151811213, "flos": 23949185460480.0, "grad_norm": 1.7235277091293586, "language_loss": 0.76832056, "learning_rate": 1.3236973940940552e-06, "loss": 0.78974283, "num_input_tokens_seen": 218419120, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 10143, "time_per_iteration": 2.5868496894836426 }, { "auxiliary_loss_clip": 0.01118314, "auxiliary_loss_mlp": 0.01034654, "balance_loss_clip": 1.02165461, "balance_loss_mlp": 1.03664994, "epoch": 0.6098902750638809, "flos": 16179732829440.0, "grad_norm": 1.7002216741895526, "language_loss": 0.74647796, "learning_rate": 1.323341840685606e-06, "loss": 0.76800764, "num_input_tokens_seen": 218435290, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 10144, "time_per_iteration": 2.5884082317352295 }, { "auxiliary_loss_clip": 0.01128815, "auxiliary_loss_mlp": 0.01031193, "balance_loss_clip": 1.01823533, "balance_loss_mlp": 1.03685641, "epoch": 0.609950398316549, "flos": 44526393763200.0, "grad_norm": 1.8585973657105557, "language_loss": 0.7317903, "learning_rate": 1.322986311425347e-06, "loss": 0.75339037, "num_input_tokens_seen": 218457880, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 10145, "time_per_iteration": 2.801084280014038 }, { "auxiliary_loss_clip": 0.01125526, "auxiliary_loss_mlp": 0.01032587, "balance_loss_clip": 1.01990914, "balance_loss_mlp": 1.03642833, "epoch": 0.6100105215692169, "flos": 23221743223680.0, "grad_norm": 1.8938882270214341, "language_loss": 0.69295418, "learning_rate": 1.3226308063259643e-06, "loss": 0.7145353, "num_input_tokens_seen": 218475930, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 10146, "time_per_iteration": 2.593918800354004 }, { "auxiliary_loss_clip": 0.01142878, "auxiliary_loss_mlp": 0.01275728, "balance_loss_clip": 1.01773238, "balance_loss_mlp": 1.03756714, "epoch": 0.6100706448218849, "flos": 15012564295680.0, "grad_norm": 1.6561424553972766, "language_loss": 0.76999968, "learning_rate": 1.3222753254001462e-06, "loss": 0.79418576, "num_input_tokens_seen": 218493675, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6953125, "step": 10147, "time_per_iteration": 2.613722562789917 }, { "auxiliary_loss_clip": 0.01124378, "auxiliary_loss_mlp": 0.01029433, "balance_loss_clip": 1.01686215, "balance_loss_mlp": 1.03503454, "epoch": 0.6101307680745528, "flos": 19639976682240.0, "grad_norm": 1.9383574018469787, "language_loss": 0.78546715, "learning_rate": 1.321919868660578e-06, "loss": 0.80700529, "num_input_tokens_seen": 218511780, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 10148, "time_per_iteration": 2.631591558456421 }, { "auxiliary_loss_clip": 0.0114038, "auxiliary_loss_mlp": 0.01034448, "balance_loss_clip": 1.0205065, "balance_loss_mlp": 1.03725147, "epoch": 0.6101908913272208, "flos": 29935566028800.0, "grad_norm": 2.1347182834086698, "language_loss": 0.54245734, "learning_rate": 1.321564436119946e-06, "loss": 0.56420565, "num_input_tokens_seen": 218531850, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.76171875, "step": 10149, "time_per_iteration": 2.757241725921631 }, { "auxiliary_loss_clip": 0.011231, "auxiliary_loss_mlp": 0.01034989, "balance_loss_clip": 1.02283621, "balance_loss_mlp": 1.0371269, "epoch": 0.6102510145798887, "flos": 21798639308160.0, "grad_norm": 1.7979623912914857, "language_loss": 0.80246079, "learning_rate": 1.3212090277909335e-06, "loss": 0.82404172, "num_input_tokens_seen": 218551245, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 10150, "time_per_iteration": 2.582641124725342 }, { "auxiliary_loss_clip": 0.01129001, "auxiliary_loss_mlp": 0.0103732, "balance_loss_clip": 1.0240221, "balance_loss_mlp": 1.03821659, "epoch": 0.6103111378325567, "flos": 20706129192960.0, "grad_norm": 1.6063066360455784, "language_loss": 0.68774855, "learning_rate": 1.320853643686225e-06, "loss": 0.7094118, "num_input_tokens_seen": 218571365, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 10151, "time_per_iteration": 4.062318325042725 }, { "auxiliary_loss_clip": 0.01106637, "auxiliary_loss_mlp": 0.01031684, "balance_loss_clip": 1.01982832, "balance_loss_mlp": 1.03550172, "epoch": 0.6103712610852247, "flos": 29381643417600.0, "grad_norm": 1.6952607960446748, "language_loss": 0.70775485, "learning_rate": 1.320498283818503e-06, "loss": 0.72913802, "num_input_tokens_seen": 218588315, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.7109375, "step": 10152, "time_per_iteration": 2.6553127765655518 }, { "auxiliary_loss_clip": 0.01138102, "auxiliary_loss_mlp": 0.01033652, "balance_loss_clip": 1.02069402, "balance_loss_mlp": 1.03822231, "epoch": 0.6104313843378927, "flos": 20813035046400.0, "grad_norm": 1.8044084802614644, "language_loss": 0.77976143, "learning_rate": 1.3201429482004493e-06, "loss": 0.80147904, "num_input_tokens_seen": 218605940, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.73046875, "step": 10153, "time_per_iteration": 2.6106505393981934 }, { "auxiliary_loss_clip": 0.01128628, "auxiliary_loss_mlp": 0.01031685, "balance_loss_clip": 1.01943588, "balance_loss_mlp": 1.03808069, "epoch": 0.6104915075905607, "flos": 26578457101440.0, "grad_norm": 2.16907471085926, "language_loss": 0.7917515, "learning_rate": 1.3197876368447452e-06, "loss": 0.81335461, "num_input_tokens_seen": 218626100, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.73046875, "step": 10154, "time_per_iteration": 2.6642417907714844 }, { "auxiliary_loss_clip": 0.01117393, "auxiliary_loss_mlp": 0.01031826, "balance_loss_clip": 1.01954103, "balance_loss_mlp": 1.03791571, "epoch": 0.6105516308432286, "flos": 23915788790400.0, "grad_norm": 1.5746418401640632, "language_loss": 0.69810975, "learning_rate": 1.3194323497640707e-06, "loss": 0.71960193, "num_input_tokens_seen": 218645060, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 10155, "time_per_iteration": 4.005609512329102 }, { "auxiliary_loss_clip": 0.01119094, "auxiliary_loss_mlp": 0.01033955, "balance_loss_clip": 1.02051461, "balance_loss_mlp": 1.03824282, "epoch": 0.6106117540958966, "flos": 31577365900800.0, "grad_norm": 2.156097008158041, "language_loss": 0.71590137, "learning_rate": 1.3190770869711045e-06, "loss": 0.73743182, "num_input_tokens_seen": 218667690, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 10156, "time_per_iteration": 2.65407657623291 }, { "auxiliary_loss_clip": 0.01124584, "auxiliary_loss_mlp": 0.0103251, "balance_loss_clip": 1.01979685, "balance_loss_mlp": 1.03781581, "epoch": 0.6106718773485645, "flos": 19608160210560.0, "grad_norm": 1.623581035124561, "language_loss": 0.67358965, "learning_rate": 1.3187218484785264e-06, "loss": 0.69516063, "num_input_tokens_seen": 218687505, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 10157, "time_per_iteration": 2.588074207305908 }, { "auxiliary_loss_clip": 0.01134206, "auxiliary_loss_mlp": 0.01027087, "balance_loss_clip": 1.01444483, "balance_loss_mlp": 1.03487647, "epoch": 0.6107320006012326, "flos": 17123895774720.0, "grad_norm": 1.7265970903868562, "language_loss": 0.72148204, "learning_rate": 1.3183666342990122e-06, "loss": 0.74309498, "num_input_tokens_seen": 218705315, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 10158, "time_per_iteration": 2.6186275482177734 }, { "auxiliary_loss_clip": 0.01119652, "auxiliary_loss_mlp": 0.01036043, "balance_loss_clip": 1.02368724, "balance_loss_mlp": 1.03700459, "epoch": 0.6107921238539005, "flos": 30148228500480.0, "grad_norm": 1.4317063331652307, "language_loss": 0.69037712, "learning_rate": 1.3180114444452398e-06, "loss": 0.71193403, "num_input_tokens_seen": 218725735, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.734375, "step": 10159, "time_per_iteration": 2.5962514877319336 }, { "auxiliary_loss_clip": 0.01111896, "auxiliary_loss_mlp": 0.01032568, "balance_loss_clip": 1.0189786, "balance_loss_mlp": 1.03822005, "epoch": 0.6108522471065685, "flos": 18440273404800.0, "grad_norm": 1.7855896273254548, "language_loss": 0.7880072, "learning_rate": 1.3176562789298852e-06, "loss": 0.80945182, "num_input_tokens_seen": 218743215, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 10160, "time_per_iteration": 2.575167655944824 }, { "auxiliary_loss_clip": 0.01040907, "auxiliary_loss_mlp": 0.01001607, "balance_loss_clip": 1.00003386, "balance_loss_mlp": 1.0095408, "epoch": 0.6109123703592364, "flos": 64135454791680.0, "grad_norm": 0.8182384627269286, "language_loss": 0.61454409, "learning_rate": 1.3173011377656217e-06, "loss": 0.63496923, "num_input_tokens_seen": 218806440, "router_z_loss_clip": 0.01574707, "router_z_loss_mlp": 0.2265625, "step": 10161, "time_per_iteration": 4.887314796447754 }, { "auxiliary_loss_clip": 0.01131295, "auxiliary_loss_mlp": 0.01281824, "balance_loss_clip": 1.02152157, "balance_loss_mlp": 1.03919196, "epoch": 0.6109724936119044, "flos": 20667848273280.0, "grad_norm": 2.2514252473461287, "language_loss": 0.76109445, "learning_rate": 1.3169460209651253e-06, "loss": 0.78522563, "num_input_tokens_seen": 218825720, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 10162, "time_per_iteration": 4.192142009735107 }, { "auxiliary_loss_clip": 0.01108932, "auxiliary_loss_mlp": 0.01032824, "balance_loss_clip": 1.01962805, "balance_loss_mlp": 1.03660083, "epoch": 0.6110326168645723, "flos": 31351882273920.0, "grad_norm": 1.6193972322836454, "language_loss": 0.71614933, "learning_rate": 1.3165909285410676e-06, "loss": 0.73756689, "num_input_tokens_seen": 218847735, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 10163, "time_per_iteration": 2.606717824935913 }, { "auxiliary_loss_clip": 0.01106102, "auxiliary_loss_mlp": 0.01031662, "balance_loss_clip": 1.01943111, "balance_loss_mlp": 1.03600669, "epoch": 0.6110927401172404, "flos": 25003378742400.0, "grad_norm": 2.4918815478792316, "language_loss": 0.59740496, "learning_rate": 1.3162358605061226e-06, "loss": 0.61878258, "num_input_tokens_seen": 218866585, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69921875, "step": 10164, "time_per_iteration": 2.6435437202453613 }, { "auxiliary_loss_clip": 0.01117267, "auxiliary_loss_mlp": 0.01031256, "balance_loss_clip": 1.0185008, "balance_loss_mlp": 1.0363493, "epoch": 0.6111528633699083, "flos": 26248078782720.0, "grad_norm": 2.2564543391847907, "language_loss": 0.75586033, "learning_rate": 1.3158808168729607e-06, "loss": 0.7773456, "num_input_tokens_seen": 218885560, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 10165, "time_per_iteration": 2.680814266204834 }, { "auxiliary_loss_clip": 0.01127709, "auxiliary_loss_mlp": 0.01029956, "balance_loss_clip": 1.0168314, "balance_loss_mlp": 1.04062676, "epoch": 0.6112129866225763, "flos": 22382474970240.0, "grad_norm": 1.4757240264667166, "language_loss": 0.79181069, "learning_rate": 1.3155257976542523e-06, "loss": 0.81338733, "num_input_tokens_seen": 218905055, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 10166, "time_per_iteration": 2.616959810256958 }, { "auxiliary_loss_clip": 0.01129227, "auxiliary_loss_mlp": 0.01029759, "balance_loss_clip": 1.01670575, "balance_loss_mlp": 1.03856707, "epoch": 0.6112731098752443, "flos": 25227892702080.0, "grad_norm": 1.8013266190508639, "language_loss": 0.67655683, "learning_rate": 1.3151708028626676e-06, "loss": 0.6981467, "num_input_tokens_seen": 218924030, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 10167, "time_per_iteration": 2.6167731285095215 }, { "auxiliary_loss_clip": 0.01109632, "auxiliary_loss_mlp": 0.01033163, "balance_loss_clip": 1.02115846, "balance_loss_mlp": 1.038908, "epoch": 0.6113332331279122, "flos": 22893160584960.0, "grad_norm": 1.6735266655860732, "language_loss": 0.79416698, "learning_rate": 1.3148158325108754e-06, "loss": 0.81559491, "num_input_tokens_seen": 218943750, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 10168, "time_per_iteration": 2.5846407413482666 }, { "auxiliary_loss_clip": 0.01116826, "auxiliary_loss_mlp": 0.01039252, "balance_loss_clip": 1.02457166, "balance_loss_mlp": 1.03849483, "epoch": 0.6113933563805802, "flos": 18620329305600.0, "grad_norm": 2.7539375014568552, "language_loss": 0.85730815, "learning_rate": 1.3144608866115437e-06, "loss": 0.87886894, "num_input_tokens_seen": 218957585, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 10169, "time_per_iteration": 2.531520128250122 }, { "auxiliary_loss_clip": 0.01106525, "auxiliary_loss_mlp": 0.01027954, "balance_loss_clip": 1.01639664, "balance_loss_mlp": 1.03532386, "epoch": 0.6114534796332481, "flos": 41866275317760.0, "grad_norm": 1.8780693188638307, "language_loss": 0.78919661, "learning_rate": 1.3141059651773395e-06, "loss": 0.81054151, "num_input_tokens_seen": 218980025, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.7109375, "step": 10170, "time_per_iteration": 2.732550859451294 }, { "auxiliary_loss_clip": 0.01131581, "auxiliary_loss_mlp": 0.01041813, "balance_loss_clip": 1.02790117, "balance_loss_mlp": 1.03661776, "epoch": 0.6115136028859162, "flos": 21908454163200.0, "grad_norm": 2.06595937650239, "language_loss": 0.68686295, "learning_rate": 1.3137510682209293e-06, "loss": 0.70859689, "num_input_tokens_seen": 218998200, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.77734375, "step": 10171, "time_per_iteration": 2.529972553253174 }, { "auxiliary_loss_clip": 0.01138364, "auxiliary_loss_mlp": 0.01036214, "balance_loss_clip": 1.0219028, "balance_loss_mlp": 1.03726339, "epoch": 0.6115737261385841, "flos": 28804846821120.0, "grad_norm": 1.6461416812030598, "language_loss": 0.79276353, "learning_rate": 1.3133961957549783e-06, "loss": 0.81450927, "num_input_tokens_seen": 219017910, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.74609375, "step": 10172, "time_per_iteration": 2.610470771789551 }, { "auxiliary_loss_clip": 0.01142214, "auxiliary_loss_mlp": 0.01035843, "balance_loss_clip": 1.02215171, "balance_loss_mlp": 1.03828871, "epoch": 0.6116338493912521, "flos": 21251468453760.0, "grad_norm": 2.12062799648437, "language_loss": 0.67038393, "learning_rate": 1.3130413477921504e-06, "loss": 0.69216454, "num_input_tokens_seen": 219037730, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76953125, "step": 10173, "time_per_iteration": 2.615952968597412 }, { "auxiliary_loss_clip": 0.01132568, "auxiliary_loss_mlp": 0.01038315, "balance_loss_clip": 1.02487421, "balance_loss_mlp": 1.04048526, "epoch": 0.61169397264392, "flos": 17530189488000.0, "grad_norm": 2.6934630177032397, "language_loss": 0.55929089, "learning_rate": 1.3126865243451102e-06, "loss": 0.58099961, "num_input_tokens_seen": 219056755, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 10174, "time_per_iteration": 2.621861219406128 }, { "auxiliary_loss_clip": 0.01133695, "auxiliary_loss_mlp": 0.01035721, "balance_loss_clip": 1.02178502, "balance_loss_mlp": 1.04123259, "epoch": 0.611754095896588, "flos": 23951555758080.0, "grad_norm": 1.884602679029761, "language_loss": 0.66501033, "learning_rate": 1.3123317254265195e-06, "loss": 0.68670452, "num_input_tokens_seen": 219076985, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 10175, "time_per_iteration": 2.6078364849090576 }, { "auxiliary_loss_clip": 0.01115235, "auxiliary_loss_mlp": 0.01270862, "balance_loss_clip": 1.01246333, "balance_loss_mlp": 1.03428912, "epoch": 0.6118142191492559, "flos": 25994872834560.0, "grad_norm": 1.9469244300590158, "language_loss": 0.82633388, "learning_rate": 1.311976951049041e-06, "loss": 0.85019487, "num_input_tokens_seen": 219096050, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.72265625, "step": 10176, "time_per_iteration": 2.537240505218506 }, { "auxiliary_loss_clip": 0.01130218, "auxiliary_loss_mlp": 0.01036815, "balance_loss_clip": 1.02231359, "balance_loss_mlp": 1.03899801, "epoch": 0.611874342401924, "flos": 24603190341120.0, "grad_norm": 1.7324071508414423, "language_loss": 0.77046049, "learning_rate": 1.3116222012253354e-06, "loss": 0.79213083, "num_input_tokens_seen": 219112665, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.734375, "step": 10177, "time_per_iteration": 2.605081796646118 }, { "auxiliary_loss_clip": 0.01119767, "auxiliary_loss_mlp": 0.01280314, "balance_loss_clip": 1.02081466, "balance_loss_mlp": 1.03696799, "epoch": 0.6119344656545919, "flos": 15887132640000.0, "grad_norm": 1.9497124982616196, "language_loss": 0.75257337, "learning_rate": 1.3112674759680622e-06, "loss": 0.77657419, "num_input_tokens_seen": 219129120, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73828125, "step": 10178, "time_per_iteration": 2.4975345134735107 }, { "auxiliary_loss_clip": 0.0113036, "auxiliary_loss_mlp": 0.01032251, "balance_loss_clip": 1.01868463, "balance_loss_mlp": 1.03936386, "epoch": 0.6119945889072599, "flos": 21652877917440.0, "grad_norm": 1.8432116755764887, "language_loss": 0.66897178, "learning_rate": 1.3109127752898817e-06, "loss": 0.69059789, "num_input_tokens_seen": 219148950, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 10179, "time_per_iteration": 2.6161186695098877 }, { "auxiliary_loss_clip": 0.01130822, "auxiliary_loss_mlp": 0.01035023, "balance_loss_clip": 1.02227938, "balance_loss_mlp": 1.04012871, "epoch": 0.6120547121599279, "flos": 13772533023360.0, "grad_norm": 1.789480388728631, "language_loss": 0.8350144, "learning_rate": 1.3105580992034511e-06, "loss": 0.85667288, "num_input_tokens_seen": 219165585, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73046875, "step": 10180, "time_per_iteration": 2.505401849746704 }, { "auxiliary_loss_clip": 0.01110485, "auxiliary_loss_mlp": 0.01030235, "balance_loss_clip": 1.0164603, "balance_loss_mlp": 1.03861403, "epoch": 0.6121148354125958, "flos": 20079164275200.0, "grad_norm": 1.7165877591332188, "language_loss": 0.77958488, "learning_rate": 1.310203447721429e-06, "loss": 0.80099207, "num_input_tokens_seen": 219183280, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 10181, "time_per_iteration": 2.567387104034424 }, { "auxiliary_loss_clip": 0.01112939, "auxiliary_loss_mlp": 0.0103535, "balance_loss_clip": 1.02201068, "balance_loss_mlp": 1.03944826, "epoch": 0.6121749586652638, "flos": 13471313569920.0, "grad_norm": 2.076986350810761, "language_loss": 0.80486059, "learning_rate": 1.3098488208564712e-06, "loss": 0.82634354, "num_input_tokens_seen": 219197200, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 10182, "time_per_iteration": 2.5519423484802246 }, { "auxiliary_loss_clip": 0.0113249, "auxiliary_loss_mlp": 0.01028679, "balance_loss_clip": 1.01843882, "balance_loss_mlp": 1.03722548, "epoch": 0.6122350819179317, "flos": 20120533764480.0, "grad_norm": 1.6332183848558595, "language_loss": 0.83056098, "learning_rate": 1.309494218621234e-06, "loss": 0.85217267, "num_input_tokens_seen": 219216825, "router_z_loss_clip": 0.10253906, "router_z_loss_mlp": 0.68359375, "step": 10183, "time_per_iteration": 2.5602965354919434 }, { "auxiliary_loss_clip": 0.01041043, "auxiliary_loss_mlp": 0.00999154, "balance_loss_clip": 0.99769372, "balance_loss_mlp": 1.00978351, "epoch": 0.6122952051705998, "flos": 65429242767360.0, "grad_norm": 0.7811741811955616, "language_loss": 0.62875563, "learning_rate": 1.3091396410283718e-06, "loss": 0.64915758, "num_input_tokens_seen": 219283795, "router_z_loss_clip": 0.0145874, "router_z_loss_mlp": 0.22460938, "step": 10184, "time_per_iteration": 3.2515006065368652 }, { "auxiliary_loss_clip": 0.01122781, "auxiliary_loss_mlp": 0.01031973, "balance_loss_clip": 1.01962888, "balance_loss_mlp": 1.04168534, "epoch": 0.6123553284232677, "flos": 20376253664640.0, "grad_norm": 1.8195105871879185, "language_loss": 0.81949568, "learning_rate": 1.3087850880905383e-06, "loss": 0.84104323, "num_input_tokens_seen": 219302385, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.72265625, "step": 10185, "time_per_iteration": 2.586927652359009 }, { "auxiliary_loss_clip": 0.01135387, "auxiliary_loss_mlp": 0.01032055, "balance_loss_clip": 1.01873946, "balance_loss_mlp": 1.04123533, "epoch": 0.6124154516759357, "flos": 23987645948160.0, "grad_norm": 1.9931002580823605, "language_loss": 0.74625039, "learning_rate": 1.3084305598203874e-06, "loss": 0.76792485, "num_input_tokens_seen": 219319765, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.765625, "step": 10186, "time_per_iteration": 2.5893092155456543 }, { "auxiliary_loss_clip": 0.01117174, "auxiliary_loss_mlp": 0.01035488, "balance_loss_clip": 1.02373981, "balance_loss_mlp": 1.0374378, "epoch": 0.6124755749286036, "flos": 21468799693440.0, "grad_norm": 1.549633750889513, "language_loss": 0.7842291, "learning_rate": 1.3080760562305715e-06, "loss": 0.80575573, "num_input_tokens_seen": 219337440, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.70703125, "step": 10187, "time_per_iteration": 2.6027534008026123 }, { "auxiliary_loss_clip": 0.01106786, "auxiliary_loss_mlp": 0.01031967, "balance_loss_clip": 1.01887739, "balance_loss_mlp": 1.0351851, "epoch": 0.6125356981812716, "flos": 23879195809920.0, "grad_norm": 1.6655817734129237, "language_loss": 0.83234787, "learning_rate": 1.3077215773337405e-06, "loss": 0.85373533, "num_input_tokens_seen": 219357525, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 10188, "time_per_iteration": 2.5989553928375244 }, { "auxiliary_loss_clip": 0.01137478, "auxiliary_loss_mlp": 0.0103278, "balance_loss_clip": 1.02014351, "balance_loss_mlp": 1.0385437, "epoch": 0.6125958214339395, "flos": 14425604150400.0, "grad_norm": 1.830347866887843, "language_loss": 0.76013982, "learning_rate": 1.3073671231425461e-06, "loss": 0.78184235, "num_input_tokens_seen": 219374855, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 10189, "time_per_iteration": 2.5900728702545166 }, { "auxiliary_loss_clip": 0.01121859, "auxiliary_loss_mlp": 0.01035796, "balance_loss_clip": 1.02216995, "balance_loss_mlp": 1.03931713, "epoch": 0.6126559446866076, "flos": 23259090389760.0, "grad_norm": 1.5229109596729855, "language_loss": 0.74247444, "learning_rate": 1.3070126936696366e-06, "loss": 0.76405096, "num_input_tokens_seen": 219394740, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 10190, "time_per_iteration": 2.5196127891540527 }, { "auxiliary_loss_clip": 0.01121065, "auxiliary_loss_mlp": 0.0103518, "balance_loss_clip": 1.02182865, "balance_loss_mlp": 1.0395124, "epoch": 0.6127160679392755, "flos": 26864808324480.0, "grad_norm": 1.6871488782567918, "language_loss": 0.68185848, "learning_rate": 1.3066582889276622e-06, "loss": 0.70342094, "num_input_tokens_seen": 219413755, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 10191, "time_per_iteration": 2.6065564155578613 }, { "auxiliary_loss_clip": 0.01118867, "auxiliary_loss_mlp": 0.0103688, "balance_loss_clip": 1.02362418, "balance_loss_mlp": 1.036075, "epoch": 0.6127761911919435, "flos": 26396425952640.0, "grad_norm": 2.3486693922785253, "language_loss": 0.74060285, "learning_rate": 1.3063039089292696e-06, "loss": 0.7621603, "num_input_tokens_seen": 219433560, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 10192, "time_per_iteration": 3.970163345336914 }, { "auxiliary_loss_clip": 0.011358, "auxiliary_loss_mlp": 0.01034294, "balance_loss_clip": 1.02202153, "balance_loss_mlp": 1.03772449, "epoch": 0.6128363144446115, "flos": 22634747164800.0, "grad_norm": 1.8840555059704784, "language_loss": 0.83430725, "learning_rate": 1.3059495536871063e-06, "loss": 0.85600823, "num_input_tokens_seen": 219452640, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.7109375, "step": 10193, "time_per_iteration": 2.5662841796875 }, { "auxiliary_loss_clip": 0.0113719, "auxiliary_loss_mlp": 0.01034806, "balance_loss_clip": 1.02126431, "balance_loss_mlp": 1.03794599, "epoch": 0.6128964376972794, "flos": 26759051706240.0, "grad_norm": 2.2708017186329035, "language_loss": 0.70266467, "learning_rate": 1.3055952232138184e-06, "loss": 0.72438467, "num_input_tokens_seen": 219468585, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 10194, "time_per_iteration": 2.6567931175231934 }, { "auxiliary_loss_clip": 0.01057946, "auxiliary_loss_mlp": 0.01000562, "balance_loss_clip": 0.99908942, "balance_loss_mlp": 1.00936389, "epoch": 0.6129565609499474, "flos": 65567929178880.0, "grad_norm": 0.8160300185004993, "language_loss": 0.58647704, "learning_rate": 1.3052409175220502e-06, "loss": 0.6070621, "num_input_tokens_seen": 219523015, "router_z_loss_clip": 0.01470947, "router_z_loss_mlp": 0.22753906, "step": 10195, "time_per_iteration": 2.9618563652038574 }, { "auxiliary_loss_clip": 0.01129426, "auxiliary_loss_mlp": 0.01034888, "balance_loss_clip": 1.02162576, "balance_loss_mlp": 1.03942776, "epoch": 0.6130166842026153, "flos": 16362087200640.0, "grad_norm": 2.2306720051819453, "language_loss": 0.69363427, "learning_rate": 1.304886636624447e-06, "loss": 0.71527743, "num_input_tokens_seen": 219539980, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 10196, "time_per_iteration": 2.638367176055908 }, { "auxiliary_loss_clip": 0.01127218, "auxiliary_loss_mlp": 0.0103785, "balance_loss_clip": 1.0251956, "balance_loss_mlp": 1.03801, "epoch": 0.6130768074552834, "flos": 23652455207040.0, "grad_norm": 1.8124568750224412, "language_loss": 0.71326089, "learning_rate": 1.3045323805336512e-06, "loss": 0.73491162, "num_input_tokens_seen": 219556980, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 10197, "time_per_iteration": 3.9551234245300293 }, { "auxiliary_loss_clip": 0.0112318, "auxiliary_loss_mlp": 0.01042481, "balance_loss_clip": 1.02914715, "balance_loss_mlp": 1.03936279, "epoch": 0.6131369307079513, "flos": 20047455544320.0, "grad_norm": 1.8619764106519865, "language_loss": 0.78932738, "learning_rate": 1.3041781492623064e-06, "loss": 0.81098402, "num_input_tokens_seen": 219576410, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 10198, "time_per_iteration": 2.5286340713500977 }, { "auxiliary_loss_clip": 0.01139705, "auxiliary_loss_mlp": 0.0103431, "balance_loss_clip": 1.0211792, "balance_loss_mlp": 1.03882587, "epoch": 0.6131970539606193, "flos": 22672166158080.0, "grad_norm": 1.7613353828591358, "language_loss": 0.7431913, "learning_rate": 1.3038239428230534e-06, "loss": 0.7649315, "num_input_tokens_seen": 219597180, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 10199, "time_per_iteration": 2.6266286373138428 }, { "auxiliary_loss_clip": 0.0113077, "auxiliary_loss_mlp": 0.01038246, "balance_loss_clip": 1.02432203, "balance_loss_mlp": 1.03753853, "epoch": 0.6132571772132872, "flos": 26870913636480.0, "grad_norm": 2.8315035895400933, "language_loss": 0.6092037, "learning_rate": 1.3034697612285324e-06, "loss": 0.63089383, "num_input_tokens_seen": 219617630, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75390625, "step": 10200, "time_per_iteration": 2.5679409503936768 }, { "auxiliary_loss_clip": 0.01127434, "auxiliary_loss_mlp": 0.01037625, "balance_loss_clip": 1.0250957, "balance_loss_mlp": 1.03776693, "epoch": 0.6133173004659552, "flos": 22892657794560.0, "grad_norm": 1.873830590101261, "language_loss": 0.68875575, "learning_rate": 1.3031156044913847e-06, "loss": 0.7104063, "num_input_tokens_seen": 219637025, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 10201, "time_per_iteration": 2.6121997833251953 }, { "auxiliary_loss_clip": 0.01122654, "auxiliary_loss_mlp": 0.01031312, "balance_loss_clip": 1.01853895, "balance_loss_mlp": 1.03455329, "epoch": 0.6133774237186231, "flos": 20485098852480.0, "grad_norm": 1.8355787004084199, "language_loss": 0.83579361, "learning_rate": 1.3027614726242485e-06, "loss": 0.8573333, "num_input_tokens_seen": 219656625, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 10202, "time_per_iteration": 2.596691370010376 }, { "auxiliary_loss_clip": 0.01030467, "auxiliary_loss_mlp": 0.010006, "balance_loss_clip": 0.9992528, "balance_loss_mlp": 1.00792325, "epoch": 0.6134375469712912, "flos": 69413065217280.0, "grad_norm": 1.2414778283608816, "language_loss": 0.67143452, "learning_rate": 1.3024073656397616e-06, "loss": 0.69174528, "num_input_tokens_seen": 219718090, "router_z_loss_clip": 0.01348877, "router_z_loss_mlp": 0.22558594, "step": 10203, "time_per_iteration": 6.178293943405151 }, { "auxiliary_loss_clip": 0.01118747, "auxiliary_loss_mlp": 0.0103279, "balance_loss_clip": 1.02010584, "balance_loss_mlp": 1.03751683, "epoch": 0.6134976702239591, "flos": 41281541815680.0, "grad_norm": 1.5214957327726544, "language_loss": 0.61203259, "learning_rate": 1.3020532835505615e-06, "loss": 0.63354796, "num_input_tokens_seen": 219740100, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 10204, "time_per_iteration": 2.676657199859619 }, { "auxiliary_loss_clip": 0.01132673, "auxiliary_loss_mlp": 0.01033104, "balance_loss_clip": 1.02042651, "balance_loss_mlp": 1.0366112, "epoch": 0.6135577934766271, "flos": 22346600261760.0, "grad_norm": 1.7482200748656023, "language_loss": 0.7247085, "learning_rate": 1.301699226369284e-06, "loss": 0.74636626, "num_input_tokens_seen": 219761225, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 10205, "time_per_iteration": 2.580888271331787 }, { "auxiliary_loss_clip": 0.01115637, "auxiliary_loss_mlp": 0.01025201, "balance_loss_clip": 1.01221895, "balance_loss_mlp": 1.03697014, "epoch": 0.6136179167292951, "flos": 23728155120000.0, "grad_norm": 1.9215614915852781, "language_loss": 0.76029313, "learning_rate": 1.3013451941085655e-06, "loss": 0.78170145, "num_input_tokens_seen": 219780085, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 10206, "time_per_iteration": 2.5662949085235596 }, { "auxiliary_loss_clip": 0.01146364, "auxiliary_loss_mlp": 0.01030758, "balance_loss_clip": 1.01717424, "balance_loss_mlp": 1.03719807, "epoch": 0.613678039981963, "flos": 26024678144640.0, "grad_norm": 1.8792966648808531, "language_loss": 0.75681102, "learning_rate": 1.3009911867810393e-06, "loss": 0.77858222, "num_input_tokens_seen": 219797895, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 10207, "time_per_iteration": 2.620081663131714 }, { "auxiliary_loss_clip": 0.01118176, "auxiliary_loss_mlp": 0.01033866, "balance_loss_clip": 1.02139091, "balance_loss_mlp": 1.0372715, "epoch": 0.613738163234631, "flos": 9859957200000.0, "grad_norm": 2.211948966154755, "language_loss": 0.82374012, "learning_rate": 1.3006372043993396e-06, "loss": 0.84526056, "num_input_tokens_seen": 219811295, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 10208, "time_per_iteration": 2.5369350910186768 }, { "auxiliary_loss_clip": 0.01131236, "auxiliary_loss_mlp": 0.0103433, "balance_loss_clip": 1.02048969, "balance_loss_mlp": 1.03926706, "epoch": 0.613798286487299, "flos": 33182070001920.0, "grad_norm": 2.5363039693414717, "language_loss": 0.7243197, "learning_rate": 1.3002832469760997e-06, "loss": 0.74597538, "num_input_tokens_seen": 219832735, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 10209, "time_per_iteration": 2.719733715057373 }, { "auxiliary_loss_clip": 0.01108241, "auxiliary_loss_mlp": 0.01040328, "balance_loss_clip": 1.02776372, "balance_loss_mlp": 1.03769326, "epoch": 0.613858409739967, "flos": 25627901535360.0, "grad_norm": 1.868125071391009, "language_loss": 0.74500501, "learning_rate": 1.29992931452395e-06, "loss": 0.7664907, "num_input_tokens_seen": 219852755, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 10210, "time_per_iteration": 2.583214282989502 }, { "auxiliary_loss_clip": 0.01057379, "auxiliary_loss_mlp": 0.0100301, "balance_loss_clip": 1.0015614, "balance_loss_mlp": 1.00766873, "epoch": 0.6139185329926349, "flos": 67192313932800.0, "grad_norm": 0.8766098652343478, "language_loss": 0.64912355, "learning_rate": 1.2995754070555229e-06, "loss": 0.66972744, "num_input_tokens_seen": 219922785, "router_z_loss_clip": 0.01446533, "router_z_loss_mlp": 0.2265625, "step": 10211, "time_per_iteration": 3.3076999187469482 }, { "auxiliary_loss_clip": 0.01126373, "auxiliary_loss_mlp": 0.01283979, "balance_loss_clip": 1.02399993, "balance_loss_mlp": 1.03692555, "epoch": 0.6139786562453029, "flos": 21543637680000.0, "grad_norm": 2.066596079628158, "language_loss": 0.75753105, "learning_rate": 1.2992215245834472e-06, "loss": 0.78163457, "num_input_tokens_seen": 219942215, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 10212, "time_per_iteration": 2.5314416885375977 }, { "auxiliary_loss_clip": 0.0112691, "auxiliary_loss_mlp": 0.01036303, "balance_loss_clip": 1.02352417, "balance_loss_mlp": 1.03766143, "epoch": 0.6140387794979708, "flos": 26068489758720.0, "grad_norm": 1.4725904956392126, "language_loss": 0.73694551, "learning_rate": 1.298867667120353e-06, "loss": 0.7585777, "num_input_tokens_seen": 219963830, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 10213, "time_per_iteration": 2.5941522121429443 }, { "auxiliary_loss_clip": 0.01132292, "auxiliary_loss_mlp": 0.01038196, "balance_loss_clip": 1.02381968, "balance_loss_mlp": 1.03895617, "epoch": 0.6140989027506388, "flos": 23694614795520.0, "grad_norm": 1.50651881450487, "language_loss": 0.72917295, "learning_rate": 1.2985138346788685e-06, "loss": 0.75087786, "num_input_tokens_seen": 219983815, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7578125, "step": 10214, "time_per_iteration": 2.5573947429656982 }, { "auxiliary_loss_clip": 0.01110844, "auxiliary_loss_mlp": 0.01033456, "balance_loss_clip": 1.0198307, "balance_loss_mlp": 1.03730547, "epoch": 0.6141590260033067, "flos": 22231721589120.0, "grad_norm": 1.7773923661278723, "language_loss": 0.74351108, "learning_rate": 1.2981600272716207e-06, "loss": 0.76495409, "num_input_tokens_seen": 220003165, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 10215, "time_per_iteration": 2.5594756603240967 }, { "auxiliary_loss_clip": 0.01116351, "auxiliary_loss_mlp": 0.01033318, "balance_loss_clip": 1.02020502, "balance_loss_mlp": 1.03645778, "epoch": 0.6142191492559748, "flos": 23871653953920.0, "grad_norm": 1.6119200098340667, "language_loss": 0.78150004, "learning_rate": 1.2978062449112362e-06, "loss": 0.80299675, "num_input_tokens_seen": 220021015, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 10216, "time_per_iteration": 2.538994073867798 }, { "auxiliary_loss_clip": 0.01109933, "auxiliary_loss_mlp": 0.01034651, "balance_loss_clip": 1.0202744, "balance_loss_mlp": 1.03794217, "epoch": 0.6142792725086427, "flos": 15042513260160.0, "grad_norm": 2.1188367159062165, "language_loss": 0.79742163, "learning_rate": 1.2974524876103404e-06, "loss": 0.81886744, "num_input_tokens_seen": 220035780, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.71875, "step": 10217, "time_per_iteration": 2.5115082263946533 }, { "auxiliary_loss_clip": 0.01126384, "auxiliary_loss_mlp": 0.01028567, "balance_loss_clip": 1.01518536, "balance_loss_mlp": 1.03558421, "epoch": 0.6143393957613107, "flos": 23330947547520.0, "grad_norm": 2.408711286802283, "language_loss": 0.77929461, "learning_rate": 1.2970987553815584e-06, "loss": 0.80084413, "num_input_tokens_seen": 220054280, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 10218, "time_per_iteration": 2.615006685256958 }, { "auxiliary_loss_clip": 0.01117346, "auxiliary_loss_mlp": 0.01037722, "balance_loss_clip": 1.02507937, "balance_loss_mlp": 1.03776193, "epoch": 0.6143995190139786, "flos": 20117086058880.0, "grad_norm": 1.582212477362434, "language_loss": 0.81872058, "learning_rate": 1.2967450482375133e-06, "loss": 0.84027123, "num_input_tokens_seen": 220074120, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 10219, "time_per_iteration": 2.6016061305999756 }, { "auxiliary_loss_clip": 0.01119373, "auxiliary_loss_mlp": 0.01031334, "balance_loss_clip": 1.01793516, "balance_loss_mlp": 1.03604758, "epoch": 0.6144596422666466, "flos": 42303559489920.0, "grad_norm": 1.6638579405348723, "language_loss": 0.66717035, "learning_rate": 1.2963913661908287e-06, "loss": 0.68867749, "num_input_tokens_seen": 220096320, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 10220, "time_per_iteration": 2.735673427581787 }, { "auxiliary_loss_clip": 0.01115262, "auxiliary_loss_mlp": 0.01031549, "balance_loss_clip": 1.01866293, "balance_loss_mlp": 1.03526735, "epoch": 0.6145197655193146, "flos": 21573622558080.0, "grad_norm": 1.880215348400413, "language_loss": 0.71492147, "learning_rate": 1.2960377092541267e-06, "loss": 0.73638964, "num_input_tokens_seen": 220114850, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 10221, "time_per_iteration": 2.6603496074676514 }, { "auxiliary_loss_clip": 0.01142124, "auxiliary_loss_mlp": 0.01032579, "balance_loss_clip": 1.01994848, "balance_loss_mlp": 1.03543663, "epoch": 0.6145798887719826, "flos": 21471098163840.0, "grad_norm": 1.6096854773428086, "language_loss": 0.79414451, "learning_rate": 1.2956840774400274e-06, "loss": 0.8158915, "num_input_tokens_seen": 220133395, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 10222, "time_per_iteration": 2.6022353172302246 }, { "auxiliary_loss_clip": 0.01129947, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 1.02309346, "balance_loss_mlp": 1.03892195, "epoch": 0.6146400120246506, "flos": 20777016683520.0, "grad_norm": 2.0826673233390363, "language_loss": 0.76248717, "learning_rate": 1.295330470761152e-06, "loss": 0.78414416, "num_input_tokens_seen": 220152790, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 10223, "time_per_iteration": 2.5972917079925537 }, { "auxiliary_loss_clip": 0.01124735, "auxiliary_loss_mlp": 0.01033141, "balance_loss_clip": 1.0201174, "balance_loss_mlp": 1.0374763, "epoch": 0.6147001352773185, "flos": 13881306384000.0, "grad_norm": 2.1989753043414755, "language_loss": 0.78344178, "learning_rate": 1.294976889230119e-06, "loss": 0.80502057, "num_input_tokens_seen": 220169535, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 10224, "time_per_iteration": 2.5379679203033447 }, { "auxiliary_loss_clip": 0.01130791, "auxiliary_loss_mlp": 0.0103204, "balance_loss_clip": 1.02007174, "balance_loss_mlp": 1.03498697, "epoch": 0.6147602585299865, "flos": 56641791807360.0, "grad_norm": 1.3028936565112823, "language_loss": 0.66545236, "learning_rate": 1.2946233328595479e-06, "loss": 0.68708062, "num_input_tokens_seen": 220195305, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 10225, "time_per_iteration": 2.9598968029022217 }, { "auxiliary_loss_clip": 0.01121795, "auxiliary_loss_mlp": 0.01278959, "balance_loss_clip": 1.01850414, "balance_loss_mlp": 1.03805542, "epoch": 0.6148203817826544, "flos": 32817217605120.0, "grad_norm": 2.285664109359096, "language_loss": 0.63483965, "learning_rate": 1.2942698016620554e-06, "loss": 0.65884715, "num_input_tokens_seen": 220215040, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 10226, "time_per_iteration": 2.6941616535186768 }, { "auxiliary_loss_clip": 0.01129304, "auxiliary_loss_mlp": 0.01037654, "balance_loss_clip": 1.02342677, "balance_loss_mlp": 1.0382359, "epoch": 0.6148805050353224, "flos": 18332038748160.0, "grad_norm": 1.650507193710451, "language_loss": 0.75545371, "learning_rate": 1.2939162956502582e-06, "loss": 0.77712327, "num_input_tokens_seen": 220234205, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.734375, "step": 10227, "time_per_iteration": 2.5521774291992188 }, { "auxiliary_loss_clip": 0.01156931, "auxiliary_loss_mlp": 0.01042846, "balance_loss_clip": 1.02636492, "balance_loss_mlp": 1.03710353, "epoch": 0.6149406282879903, "flos": 14063983977600.0, "grad_norm": 3.3808221978553763, "language_loss": 0.61643046, "learning_rate": 1.2935628148367724e-06, "loss": 0.63842821, "num_input_tokens_seen": 220252730, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.75, "step": 10228, "time_per_iteration": 2.6448521614074707 }, { "auxiliary_loss_clip": 0.01127015, "auxiliary_loss_mlp": 0.01036501, "balance_loss_clip": 1.02370405, "balance_loss_mlp": 1.03719139, "epoch": 0.6150007515406584, "flos": 25190186400000.0, "grad_norm": 1.3610160556929998, "language_loss": 0.74565709, "learning_rate": 1.2932093592342122e-06, "loss": 0.76729226, "num_input_tokens_seen": 220273345, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 10229, "time_per_iteration": 2.7239696979522705 }, { "auxiliary_loss_clip": 0.01111744, "auxiliary_loss_mlp": 0.01039006, "balance_loss_clip": 1.02511799, "balance_loss_mlp": 1.03894591, "epoch": 0.6150608747933263, "flos": 21945262625280.0, "grad_norm": 1.7734451540171696, "language_loss": 0.77847993, "learning_rate": 1.2928559288551921e-06, "loss": 0.79998744, "num_input_tokens_seen": 220293845, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 10230, "time_per_iteration": 2.731448173522949 }, { "auxiliary_loss_clip": 0.01126646, "auxiliary_loss_mlp": 0.01034552, "balance_loss_clip": 1.02155781, "balance_loss_mlp": 1.03605175, "epoch": 0.6151209980459943, "flos": 30117453523200.0, "grad_norm": 1.5407096616241187, "language_loss": 0.73087263, "learning_rate": 1.2925025237123253e-06, "loss": 0.75248462, "num_input_tokens_seen": 220316070, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 10231, "time_per_iteration": 2.7346136569976807 }, { "auxiliary_loss_clip": 0.01127899, "auxiliary_loss_mlp": 0.01034824, "balance_loss_clip": 1.022349, "balance_loss_mlp": 1.03770638, "epoch": 0.6151811212986622, "flos": 30008356940160.0, "grad_norm": 1.825829326536989, "language_loss": 0.69625235, "learning_rate": 1.2921491438182232e-06, "loss": 0.71787953, "num_input_tokens_seen": 220335695, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7265625, "step": 10232, "time_per_iteration": 2.6246120929718018 }, { "auxiliary_loss_clip": 0.0111408, "auxiliary_loss_mlp": 0.01279685, "balance_loss_clip": 1.02023554, "balance_loss_mlp": 1.03599691, "epoch": 0.6152412445513302, "flos": 18872888808960.0, "grad_norm": 2.0263985416846957, "language_loss": 0.91726255, "learning_rate": 1.2917957891854974e-06, "loss": 0.94120026, "num_input_tokens_seen": 220353720, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 10233, "time_per_iteration": 2.5163443088531494 }, { "auxiliary_loss_clip": 0.01124559, "auxiliary_loss_mlp": 0.01036686, "balance_loss_clip": 1.02471113, "balance_loss_mlp": 1.03610933, "epoch": 0.6153013678039982, "flos": 25703601448320.0, "grad_norm": 2.013890818435847, "language_loss": 0.71518427, "learning_rate": 1.2914424598267577e-06, "loss": 0.73679674, "num_input_tokens_seen": 220372515, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69921875, "step": 10234, "time_per_iteration": 3.984869956970215 }, { "auxiliary_loss_clip": 0.01119199, "auxiliary_loss_mlp": 0.01284084, "balance_loss_clip": 1.02359676, "balance_loss_mlp": 1.03646016, "epoch": 0.6153614910566662, "flos": 28510271383680.0, "grad_norm": 2.0816774818547636, "language_loss": 0.66285729, "learning_rate": 1.2910891557546144e-06, "loss": 0.68689013, "num_input_tokens_seen": 220393490, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 10235, "time_per_iteration": 2.5930325984954834 }, { "auxiliary_loss_clip": 0.01123503, "auxiliary_loss_mlp": 0.01030314, "balance_loss_clip": 1.01713586, "balance_loss_mlp": 1.0387733, "epoch": 0.6154216143093342, "flos": 23549787158400.0, "grad_norm": 1.7494882232406403, "language_loss": 0.81317532, "learning_rate": 1.2907358769816755e-06, "loss": 0.83471352, "num_input_tokens_seen": 220412855, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75390625, "step": 10236, "time_per_iteration": 2.5253183841705322 }, { "auxiliary_loss_clip": 0.01111817, "auxiliary_loss_mlp": 0.01031389, "balance_loss_clip": 1.01784682, "balance_loss_mlp": 1.03647399, "epoch": 0.6154817375620021, "flos": 22748081552640.0, "grad_norm": 1.4829921620640285, "language_loss": 0.80641484, "learning_rate": 1.2903826235205487e-06, "loss": 0.82784688, "num_input_tokens_seen": 220433440, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 10237, "time_per_iteration": 2.554964303970337 }, { "auxiliary_loss_clip": 0.01119236, "auxiliary_loss_mlp": 0.01041835, "balance_loss_clip": 1.02872252, "balance_loss_mlp": 1.03795171, "epoch": 0.6155418608146701, "flos": 27162975121920.0, "grad_norm": 1.7386187415002727, "language_loss": 0.75989532, "learning_rate": 1.2900293953838408e-06, "loss": 0.78150606, "num_input_tokens_seen": 220453445, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 10238, "time_per_iteration": 2.572333574295044 }, { "auxiliary_loss_clip": 0.01075115, "auxiliary_loss_mlp": 0.01000811, "balance_loss_clip": 0.99926108, "balance_loss_mlp": 1.00897241, "epoch": 0.615601984067338, "flos": 68811165014400.0, "grad_norm": 0.7634622435048914, "language_loss": 0.57642865, "learning_rate": 1.2896761925841575e-06, "loss": 0.59718788, "num_input_tokens_seen": 220509730, "router_z_loss_clip": 0.01544189, "router_z_loss_mlp": 0.22460938, "step": 10239, "time_per_iteration": 4.52502703666687 }, { "auxiliary_loss_clip": 0.01128027, "auxiliary_loss_mlp": 0.01037048, "balance_loss_clip": 1.02365494, "balance_loss_mlp": 1.03707385, "epoch": 0.615662107320006, "flos": 15517144598400.0, "grad_norm": 1.9033028445781075, "language_loss": 0.77933782, "learning_rate": 1.2893230151341038e-06, "loss": 0.80098855, "num_input_tokens_seen": 220527295, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 10240, "time_per_iteration": 2.578397274017334 }, { "auxiliary_loss_clip": 0.01114364, "auxiliary_loss_mlp": 0.01034115, "balance_loss_clip": 1.02053118, "balance_loss_mlp": 1.04035234, "epoch": 0.615722230572674, "flos": 21063691128960.0, "grad_norm": 2.786979708015757, "language_loss": 0.72656763, "learning_rate": 1.288969863046283e-06, "loss": 0.74805242, "num_input_tokens_seen": 220542730, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 10241, "time_per_iteration": 2.4887118339538574 }, { "auxiliary_loss_clip": 0.01108169, "auxiliary_loss_mlp": 0.01030847, "balance_loss_clip": 1.01832426, "balance_loss_mlp": 1.03776085, "epoch": 0.615782353825342, "flos": 23256791919360.0, "grad_norm": 1.6943003731367365, "language_loss": 0.71484613, "learning_rate": 1.2886167363332996e-06, "loss": 0.73623627, "num_input_tokens_seen": 220562995, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 10242, "time_per_iteration": 2.604936122894287 }, { "auxiliary_loss_clip": 0.01120158, "auxiliary_loss_mlp": 0.01030765, "balance_loss_clip": 1.0179981, "balance_loss_mlp": 1.03918719, "epoch": 0.6158424770780099, "flos": 21103911383040.0, "grad_norm": 1.949703434655595, "language_loss": 0.72195292, "learning_rate": 1.288263635007755e-06, "loss": 0.74346215, "num_input_tokens_seen": 220581775, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 10243, "time_per_iteration": 2.530874490737915 }, { "auxiliary_loss_clip": 0.01039473, "auxiliary_loss_mlp": 0.01000129, "balance_loss_clip": 0.99849588, "balance_loss_mlp": 1.00832868, "epoch": 0.6159026003306779, "flos": 70333276769280.0, "grad_norm": 0.7516551065374343, "language_loss": 0.56841242, "learning_rate": 1.2879105590822497e-06, "loss": 0.58880848, "num_input_tokens_seen": 220646395, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.22460938, "step": 10244, "time_per_iteration": 4.6721062660217285 }, { "auxiliary_loss_clip": 0.01111528, "auxiliary_loss_mlp": 0.01035926, "balance_loss_clip": 1.0220983, "balance_loss_mlp": 1.03846979, "epoch": 0.6159627235833458, "flos": 33874355802240.0, "grad_norm": 2.037875251781256, "language_loss": 0.63876152, "learning_rate": 1.2875575085693853e-06, "loss": 0.66023606, "num_input_tokens_seen": 220668335, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73046875, "step": 10245, "time_per_iteration": 4.1692283153533936 }, { "auxiliary_loss_clip": 0.01142855, "auxiliary_loss_mlp": 0.01034976, "balance_loss_clip": 1.02222097, "balance_loss_mlp": 1.03677344, "epoch": 0.6160228468360138, "flos": 26575440359040.0, "grad_norm": 1.7459663645697165, "language_loss": 0.78782439, "learning_rate": 1.2872044834817606e-06, "loss": 0.80960274, "num_input_tokens_seen": 220688915, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 10246, "time_per_iteration": 2.5871458053588867 }, { "auxiliary_loss_clip": 0.01143034, "auxiliary_loss_mlp": 0.01294466, "balance_loss_clip": 1.03255773, "balance_loss_mlp": 1.03940022, "epoch": 0.6160829700886818, "flos": 17193274894080.0, "grad_norm": 2.5471394768643774, "language_loss": 0.87721229, "learning_rate": 1.286851483831975e-06, "loss": 0.90158737, "num_input_tokens_seen": 220703465, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.765625, "step": 10247, "time_per_iteration": 2.5141561031341553 }, { "auxiliary_loss_clip": 0.01159008, "auxiliary_loss_mlp": 0.01034311, "balance_loss_clip": 1.02155578, "balance_loss_mlp": 1.04003859, "epoch": 0.6161430933413498, "flos": 23623547736960.0, "grad_norm": 1.665758294115307, "language_loss": 0.79808021, "learning_rate": 1.2864985096326253e-06, "loss": 0.82001334, "num_input_tokens_seen": 220722090, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 10248, "time_per_iteration": 2.609238386154175 }, { "auxiliary_loss_clip": 0.01141644, "auxiliary_loss_mlp": 0.01034709, "balance_loss_clip": 1.02191174, "balance_loss_mlp": 1.03587079, "epoch": 0.6162032165940178, "flos": 23002436736000.0, "grad_norm": 1.9263042347801853, "language_loss": 0.86642724, "learning_rate": 1.286145560896308e-06, "loss": 0.88819075, "num_input_tokens_seen": 220741075, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 10249, "time_per_iteration": 2.610931396484375 }, { "auxiliary_loss_clip": 0.01109716, "auxiliary_loss_mlp": 0.01029314, "balance_loss_clip": 1.01575351, "balance_loss_mlp": 1.03742957, "epoch": 0.6162633398466857, "flos": 39421979740800.0, "grad_norm": 2.550524516123802, "language_loss": 0.69032037, "learning_rate": 1.2857926376356196e-06, "loss": 0.71171069, "num_input_tokens_seen": 220763395, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.72265625, "step": 10250, "time_per_iteration": 2.718137264251709 }, { "auxiliary_loss_clip": 0.01121239, "auxiliary_loss_mlp": 0.0102928, "balance_loss_clip": 1.01717472, "balance_loss_mlp": 1.0355016, "epoch": 0.6163234630993537, "flos": 19244672530560.0, "grad_norm": 2.2572585577428774, "language_loss": 0.73860186, "learning_rate": 1.2854397398631544e-06, "loss": 0.76010704, "num_input_tokens_seen": 220780640, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.67578125, "step": 10251, "time_per_iteration": 2.5998239517211914 }, { "auxiliary_loss_clip": 0.01118378, "auxiliary_loss_mlp": 0.01026022, "balance_loss_clip": 1.01314187, "balance_loss_mlp": 1.03685665, "epoch": 0.6163835863520216, "flos": 15961791058560.0, "grad_norm": 2.1761802715530782, "language_loss": 0.68010145, "learning_rate": 1.2850868675915071e-06, "loss": 0.70154542, "num_input_tokens_seen": 220797960, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 10252, "time_per_iteration": 2.5957815647125244 }, { "auxiliary_loss_clip": 0.01083812, "auxiliary_loss_mlp": 0.00999573, "balance_loss_clip": 0.99803478, "balance_loss_mlp": 1.00881588, "epoch": 0.6164437096046896, "flos": 68103834393600.0, "grad_norm": 0.9728398440428278, "language_loss": 0.57835555, "learning_rate": 1.2847340208332705e-06, "loss": 0.59918928, "num_input_tokens_seen": 220856930, "router_z_loss_clip": 0.01531982, "router_z_loss_mlp": 0.22558594, "step": 10253, "time_per_iteration": 3.2635202407836914 }, { "auxiliary_loss_clip": 0.011276, "auxiliary_loss_mlp": 0.01034159, "balance_loss_clip": 1.02116537, "balance_loss_mlp": 1.03751469, "epoch": 0.6165038328573575, "flos": 21361211481600.0, "grad_norm": 1.4938954558507225, "language_loss": 0.79560345, "learning_rate": 1.2843811996010372e-06, "loss": 0.81722105, "num_input_tokens_seen": 220877595, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 10254, "time_per_iteration": 2.5685813426971436 }, { "auxiliary_loss_clip": 0.01132168, "auxiliary_loss_mlp": 0.01031961, "balance_loss_clip": 1.01905632, "balance_loss_mlp": 1.03848338, "epoch": 0.6165639561100256, "flos": 21101972048640.0, "grad_norm": 1.9343812828139806, "language_loss": 0.80017292, "learning_rate": 1.284028403907398e-06, "loss": 0.82181424, "num_input_tokens_seen": 220896880, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7578125, "step": 10255, "time_per_iteration": 2.5774691104888916 }, { "auxiliary_loss_clip": 0.01130341, "auxiliary_loss_mlp": 0.01035806, "balance_loss_clip": 1.02191806, "balance_loss_mlp": 1.03864801, "epoch": 0.6166240793626935, "flos": 25338533569920.0, "grad_norm": 2.4210532044423005, "language_loss": 0.65102094, "learning_rate": 1.2836756337649429e-06, "loss": 0.6726824, "num_input_tokens_seen": 220916425, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 10256, "time_per_iteration": 2.620500087738037 }, { "auxiliary_loss_clip": 0.01117516, "auxiliary_loss_mlp": 0.01031486, "balance_loss_clip": 1.0190345, "balance_loss_mlp": 1.03869271, "epoch": 0.6166842026153615, "flos": 19682639061120.0, "grad_norm": 2.6366080530015337, "language_loss": 0.71798241, "learning_rate": 1.2833228891862619e-06, "loss": 0.73947245, "num_input_tokens_seen": 220935050, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 10257, "time_per_iteration": 2.579118013381958 }, { "auxiliary_loss_clip": 0.01130614, "auxiliary_loss_mlp": 0.01031424, "balance_loss_clip": 1.01779246, "balance_loss_mlp": 1.03902102, "epoch": 0.6167443258680294, "flos": 19318361281920.0, "grad_norm": 1.6311588712941814, "language_loss": 0.7172606, "learning_rate": 1.2829701701839434e-06, "loss": 0.73888099, "num_input_tokens_seen": 220953085, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 10258, "time_per_iteration": 2.5987322330474854 }, { "auxiliary_loss_clip": 0.01128318, "auxiliary_loss_mlp": 0.01033291, "balance_loss_clip": 1.02018976, "balance_loss_mlp": 1.03744507, "epoch": 0.6168044491206974, "flos": 25265239868160.0, "grad_norm": 4.3017135969521485, "language_loss": 0.63572633, "learning_rate": 1.2826174767705758e-06, "loss": 0.65734237, "num_input_tokens_seen": 220969050, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 10259, "time_per_iteration": 2.6075799465179443 }, { "auxiliary_loss_clip": 0.01134132, "auxiliary_loss_mlp": 0.0103275, "balance_loss_clip": 1.01874304, "balance_loss_mlp": 1.03796434, "epoch": 0.6168645723733654, "flos": 13219903301760.0, "grad_norm": 1.9215979783268722, "language_loss": 0.71259308, "learning_rate": 1.282264808958745e-06, "loss": 0.73426187, "num_input_tokens_seen": 220985825, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.69921875, "step": 10260, "time_per_iteration": 2.596870183944702 }, { "auxiliary_loss_clip": 0.0112587, "auxiliary_loss_mlp": 0.01035714, "balance_loss_clip": 1.02137327, "balance_loss_mlp": 1.04185247, "epoch": 0.6169246956260334, "flos": 26652038112000.0, "grad_norm": 1.9123627502855156, "language_loss": 0.68156832, "learning_rate": 1.2819121667610363e-06, "loss": 0.70318413, "num_input_tokens_seen": 221004465, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75, "step": 10261, "time_per_iteration": 2.5823707580566406 }, { "auxiliary_loss_clip": 0.01133883, "auxiliary_loss_mlp": 0.01040474, "balance_loss_clip": 1.02821279, "balance_loss_mlp": 1.0360775, "epoch": 0.6169848188787014, "flos": 23148413608320.0, "grad_norm": 1.7658048311823433, "language_loss": 0.71254492, "learning_rate": 1.2815595501900358e-06, "loss": 0.73428845, "num_input_tokens_seen": 221023260, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.71875, "step": 10262, "time_per_iteration": 2.6259593963623047 }, { "auxiliary_loss_clip": 0.01127493, "auxiliary_loss_mlp": 0.01034355, "balance_loss_clip": 1.02066934, "balance_loss_mlp": 1.03763366, "epoch": 0.6170449421313693, "flos": 23331917214720.0, "grad_norm": 1.9261550808008865, "language_loss": 0.6982708, "learning_rate": 1.2812069592583265e-06, "loss": 0.71988922, "num_input_tokens_seen": 221043090, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 10263, "time_per_iteration": 2.6557464599609375 }, { "auxiliary_loss_clip": 0.01156422, "auxiliary_loss_mlp": 0.01031294, "balance_loss_clip": 1.01864028, "balance_loss_mlp": 1.03844643, "epoch": 0.6171050653840373, "flos": 15851617067520.0, "grad_norm": 2.4087688917027084, "language_loss": 0.76099712, "learning_rate": 1.2808543939784922e-06, "loss": 0.78287429, "num_input_tokens_seen": 221061435, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 10264, "time_per_iteration": 2.586919069290161 }, { "auxiliary_loss_clip": 0.01111148, "auxiliary_loss_mlp": 0.01032996, "balance_loss_clip": 1.01900733, "balance_loss_mlp": 1.0390048, "epoch": 0.6171651886367052, "flos": 20045516209920.0, "grad_norm": 2.6062925829729116, "language_loss": 0.85310757, "learning_rate": 1.2805018543631148e-06, "loss": 0.87454903, "num_input_tokens_seen": 221078705, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.72265625, "step": 10265, "time_per_iteration": 2.573648691177368 }, { "auxiliary_loss_clip": 0.01152089, "auxiliary_loss_mlp": 0.01037339, "balance_loss_clip": 1.0243156, "balance_loss_mlp": 1.03732407, "epoch": 0.6172253118893732, "flos": 26432695710720.0, "grad_norm": 1.7761884697618093, "language_loss": 0.64386296, "learning_rate": 1.2801493404247748e-06, "loss": 0.6657573, "num_input_tokens_seen": 221099245, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 10266, "time_per_iteration": 2.7007341384887695 }, { "auxiliary_loss_clip": 0.01134355, "auxiliary_loss_mlp": 0.01031148, "balance_loss_clip": 1.01839268, "balance_loss_mlp": 1.03514099, "epoch": 0.6172854351420412, "flos": 22632879657600.0, "grad_norm": 1.5499647289304723, "language_loss": 0.75495684, "learning_rate": 1.279796852176054e-06, "loss": 0.77661186, "num_input_tokens_seen": 221116930, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 10267, "time_per_iteration": 2.6200084686279297 }, { "auxiliary_loss_clip": 0.01138029, "auxiliary_loss_mlp": 0.01029743, "balance_loss_clip": 1.01606345, "balance_loss_mlp": 1.03668797, "epoch": 0.6173455583947092, "flos": 21212936138880.0, "grad_norm": 1.7924070378891215, "language_loss": 0.74908513, "learning_rate": 1.2794443896295299e-06, "loss": 0.7707628, "num_input_tokens_seen": 221137660, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 10268, "time_per_iteration": 2.5734379291534424 }, { "auxiliary_loss_clip": 0.01129969, "auxiliary_loss_mlp": 0.01031455, "balance_loss_clip": 1.01794887, "balance_loss_mlp": 1.03711069, "epoch": 0.6174056816473771, "flos": 19500284689920.0, "grad_norm": 3.1423389932449424, "language_loss": 0.75714934, "learning_rate": 1.279091952797783e-06, "loss": 0.77876353, "num_input_tokens_seen": 221156225, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 10269, "time_per_iteration": 2.624455451965332 }, { "auxiliary_loss_clip": 0.01107342, "auxiliary_loss_mlp": 0.01031107, "balance_loss_clip": 1.01832795, "balance_loss_mlp": 1.03506672, "epoch": 0.6174658049000451, "flos": 15997342544640.0, "grad_norm": 2.1476996680784834, "language_loss": 0.76412213, "learning_rate": 1.2787395416933895e-06, "loss": 0.78550661, "num_input_tokens_seen": 221173820, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 10270, "time_per_iteration": 2.5091328620910645 }, { "auxiliary_loss_clip": 0.01127723, "auxiliary_loss_mlp": 0.01027851, "balance_loss_clip": 1.01437438, "balance_loss_mlp": 1.03709221, "epoch": 0.617525928152713, "flos": 21903893136000.0, "grad_norm": 1.656826613435691, "language_loss": 0.825634, "learning_rate": 1.2783871563289263e-06, "loss": 0.84718966, "num_input_tokens_seen": 221191815, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 10271, "time_per_iteration": 2.6022608280181885 }, { "auxiliary_loss_clip": 0.01111244, "auxiliary_loss_mlp": 0.01280002, "balance_loss_clip": 1.01959753, "balance_loss_mlp": 1.03786039, "epoch": 0.617586051405381, "flos": 21105958458240.0, "grad_norm": 1.5525065652285757, "language_loss": 0.77057791, "learning_rate": 1.2780347967169697e-06, "loss": 0.79449034, "num_input_tokens_seen": 221211205, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 10272, "time_per_iteration": 2.548882484436035 }, { "auxiliary_loss_clip": 0.01058637, "auxiliary_loss_mlp": 0.01001431, "balance_loss_clip": 1.00000024, "balance_loss_mlp": 1.00940871, "epoch": 0.617646174658049, "flos": 58610776665600.0, "grad_norm": 0.8069460816862805, "language_loss": 0.59142089, "learning_rate": 1.2776824628700938e-06, "loss": 0.61202157, "num_input_tokens_seen": 221268430, "router_z_loss_clip": 0.01428223, "router_z_loss_mlp": 0.22363281, "step": 10273, "time_per_iteration": 3.0886662006378174 }, { "auxiliary_loss_clip": 0.01127036, "auxiliary_loss_mlp": 0.01030721, "balance_loss_clip": 1.01722646, "balance_loss_mlp": 1.0379889, "epoch": 0.617706297910717, "flos": 13878684691200.0, "grad_norm": 5.382628251946681, "language_loss": 0.73229945, "learning_rate": 1.2773301548008728e-06, "loss": 0.75387698, "num_input_tokens_seen": 221281930, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 10274, "time_per_iteration": 2.592132091522217 }, { "auxiliary_loss_clip": 0.01117, "auxiliary_loss_mlp": 0.01275377, "balance_loss_clip": 1.01597285, "balance_loss_mlp": 1.03677285, "epoch": 0.617766421163385, "flos": 19208438686080.0, "grad_norm": 1.881472482148657, "language_loss": 0.77252424, "learning_rate": 1.2769778725218797e-06, "loss": 0.79644799, "num_input_tokens_seen": 221301605, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 10275, "time_per_iteration": 4.03045129776001 }, { "auxiliary_loss_clip": 0.01135565, "auxiliary_loss_mlp": 0.01030473, "balance_loss_clip": 1.01747894, "balance_loss_mlp": 1.03708696, "epoch": 0.6178265444160529, "flos": 22565978576640.0, "grad_norm": 1.6575339102871183, "language_loss": 0.7945717, "learning_rate": 1.2766256160456866e-06, "loss": 0.81623209, "num_input_tokens_seen": 221320105, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 10276, "time_per_iteration": 2.6565916538238525 }, { "auxiliary_loss_clip": 0.01158032, "auxiliary_loss_mlp": 0.01037748, "balance_loss_clip": 1.02406263, "balance_loss_mlp": 1.0381, "epoch": 0.6178866676687209, "flos": 11984289402240.0, "grad_norm": 2.3134530412433856, "language_loss": 0.80851567, "learning_rate": 1.2762733853848647e-06, "loss": 0.83047342, "num_input_tokens_seen": 221335915, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 10277, "time_per_iteration": 2.613889694213867 }, { "auxiliary_loss_clip": 0.01139338, "auxiliary_loss_mlp": 0.01031915, "balance_loss_clip": 1.01808107, "balance_loss_mlp": 1.03789902, "epoch": 0.6179467909213888, "flos": 20991510748800.0, "grad_norm": 1.6017156413095333, "language_loss": 0.81475592, "learning_rate": 1.2759211805519835e-06, "loss": 0.83646846, "num_input_tokens_seen": 221353965, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.74609375, "step": 10278, "time_per_iteration": 2.608706474304199 }, { "auxiliary_loss_clip": 0.01145177, "auxiliary_loss_mlp": 0.01279101, "balance_loss_clip": 1.01979399, "balance_loss_mlp": 1.03746343, "epoch": 0.6180069141740568, "flos": 25338102606720.0, "grad_norm": 1.6151348130404763, "language_loss": 0.7399736, "learning_rate": 1.2755690015596133e-06, "loss": 0.7642163, "num_input_tokens_seen": 221374080, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73046875, "step": 10279, "time_per_iteration": 2.658519744873047 }, { "auxiliary_loss_clip": 0.01115466, "auxiliary_loss_mlp": 0.01032365, "balance_loss_clip": 1.01978803, "balance_loss_mlp": 1.03484881, "epoch": 0.6180670374267248, "flos": 19645722858240.0, "grad_norm": 1.8471694121800646, "language_loss": 0.70707047, "learning_rate": 1.2752168484203215e-06, "loss": 0.72854877, "num_input_tokens_seen": 221392910, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 10280, "time_per_iteration": 4.0563600063323975 }, { "auxiliary_loss_clip": 0.01109193, "auxiliary_loss_mlp": 0.01032967, "balance_loss_clip": 1.02002704, "balance_loss_mlp": 1.03730154, "epoch": 0.6181271606793928, "flos": 19464876858240.0, "grad_norm": 1.4340522907384485, "language_loss": 0.72736841, "learning_rate": 1.2748647211466766e-06, "loss": 0.74879003, "num_input_tokens_seen": 221410990, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 10281, "time_per_iteration": 2.5640366077423096 }, { "auxiliary_loss_clip": 0.0112617, "auxiliary_loss_mlp": 0.01035727, "balance_loss_clip": 1.02375841, "balance_loss_mlp": 1.03968024, "epoch": 0.6181872839320607, "flos": 25594289383680.0, "grad_norm": 1.614382368805412, "language_loss": 0.76477706, "learning_rate": 1.274512619751244e-06, "loss": 0.78639603, "num_input_tokens_seen": 221431020, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 10282, "time_per_iteration": 2.6232478618621826 }, { "auxiliary_loss_clip": 0.01127549, "auxiliary_loss_mlp": 0.0103541, "balance_loss_clip": 1.01903033, "balance_loss_mlp": 1.0360539, "epoch": 0.6182474071847287, "flos": 25551806572800.0, "grad_norm": 1.7604986560925995, "language_loss": 0.68971026, "learning_rate": 1.27416054424659e-06, "loss": 0.71133989, "num_input_tokens_seen": 221453235, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.734375, "step": 10283, "time_per_iteration": 2.618886947631836 }, { "auxiliary_loss_clip": 0.01123637, "auxiliary_loss_mlp": 0.01031567, "balance_loss_clip": 1.0176549, "balance_loss_mlp": 1.03753877, "epoch": 0.6183075304373966, "flos": 22123738327680.0, "grad_norm": 1.5563625418880707, "language_loss": 0.7504741, "learning_rate": 1.2738084946452791e-06, "loss": 0.77202606, "num_input_tokens_seen": 221472560, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76953125, "step": 10284, "time_per_iteration": 2.5968968868255615 }, { "auxiliary_loss_clip": 0.01114275, "auxiliary_loss_mlp": 0.01034172, "balance_loss_clip": 1.02175689, "balance_loss_mlp": 1.0373342, "epoch": 0.6183676536900646, "flos": 22455589104000.0, "grad_norm": 1.7364593955838419, "language_loss": 0.75814486, "learning_rate": 1.273456470959875e-06, "loss": 0.77962935, "num_input_tokens_seen": 221492835, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 10285, "time_per_iteration": 2.6978766918182373 }, { "auxiliary_loss_clip": 0.01135737, "auxiliary_loss_mlp": 0.01031393, "balance_loss_clip": 1.0186193, "balance_loss_mlp": 1.03701603, "epoch": 0.6184277769427327, "flos": 23364128736000.0, "grad_norm": 2.0984953021134913, "language_loss": 0.72838545, "learning_rate": 1.2731044732029406e-06, "loss": 0.75005674, "num_input_tokens_seen": 221511870, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 10286, "time_per_iteration": 6.4569761753082275 }, { "auxiliary_loss_clip": 0.01106466, "auxiliary_loss_mlp": 0.01032827, "balance_loss_clip": 1.02020311, "balance_loss_mlp": 1.03602409, "epoch": 0.6184879001954006, "flos": 22711057608960.0, "grad_norm": 2.6661639255513077, "language_loss": 0.75869811, "learning_rate": 1.272752501387038e-06, "loss": 0.78009105, "num_input_tokens_seen": 221529915, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 10287, "time_per_iteration": 2.565199136734009 }, { "auxiliary_loss_clip": 0.0113407, "auxiliary_loss_mlp": 0.01031055, "balance_loss_clip": 1.0187583, "balance_loss_mlp": 1.03649771, "epoch": 0.6185480234480686, "flos": 23841920471040.0, "grad_norm": 1.654704220159536, "language_loss": 0.72943491, "learning_rate": 1.2724005555247273e-06, "loss": 0.75108612, "num_input_tokens_seen": 221549745, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 10288, "time_per_iteration": 2.59431529045105 }, { "auxiliary_loss_clip": 0.01112665, "auxiliary_loss_mlp": 0.01033014, "balance_loss_clip": 1.0206157, "balance_loss_mlp": 1.03555846, "epoch": 0.6186081467007365, "flos": 45477595774080.0, "grad_norm": 1.501933340416251, "language_loss": 0.72383285, "learning_rate": 1.2720486356285698e-06, "loss": 0.74528968, "num_input_tokens_seen": 221572455, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 10289, "time_per_iteration": 2.745887517929077 }, { "auxiliary_loss_clip": 0.01129813, "auxiliary_loss_mlp": 0.01034014, "balance_loss_clip": 1.02045429, "balance_loss_mlp": 1.03736949, "epoch": 0.6186682699534045, "flos": 23550864566400.0, "grad_norm": 1.795055087582383, "language_loss": 0.79428726, "learning_rate": 1.2716967417111235e-06, "loss": 0.81592548, "num_input_tokens_seen": 221591325, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 10290, "time_per_iteration": 2.5445752143859863 }, { "auxiliary_loss_clip": 0.01119817, "auxiliary_loss_mlp": 0.01034309, "balance_loss_clip": 1.02121985, "balance_loss_mlp": 1.03745234, "epoch": 0.6187283932060724, "flos": 25774201630080.0, "grad_norm": 1.5583003129261046, "language_loss": 0.81255519, "learning_rate": 1.2713448737849474e-06, "loss": 0.83409649, "num_input_tokens_seen": 221611640, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 10291, "time_per_iteration": 2.585493326187134 }, { "auxiliary_loss_clip": 0.01109638, "auxiliary_loss_mlp": 0.0103166, "balance_loss_clip": 1.01901162, "balance_loss_mlp": 1.03735864, "epoch": 0.6187885164587404, "flos": 25265203954560.0, "grad_norm": 2.0201584076172607, "language_loss": 0.77700877, "learning_rate": 1.2709930318625989e-06, "loss": 0.79842174, "num_input_tokens_seen": 221631225, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 10292, "time_per_iteration": 2.5602245330810547 }, { "auxiliary_loss_clip": 0.01134925, "auxiliary_loss_mlp": 0.0104375, "balance_loss_clip": 1.02874184, "balance_loss_mlp": 1.03947306, "epoch": 0.6188486397114084, "flos": 26250772302720.0, "grad_norm": 4.062822972431012, "language_loss": 0.73269999, "learning_rate": 1.270641215956633e-06, "loss": 0.7544868, "num_input_tokens_seen": 221651035, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7734375, "step": 10293, "time_per_iteration": 2.589629888534546 }, { "auxiliary_loss_clip": 0.01136793, "auxiliary_loss_mlp": 0.01282924, "balance_loss_clip": 1.02319098, "balance_loss_mlp": 1.03809786, "epoch": 0.6189087629640764, "flos": 20923388605440.0, "grad_norm": 1.7710468497149972, "language_loss": 0.8274014, "learning_rate": 1.2702894260796062e-06, "loss": 0.8515985, "num_input_tokens_seen": 221671300, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 10294, "time_per_iteration": 2.686062812805176 }, { "auxiliary_loss_clip": 0.01129642, "auxiliary_loss_mlp": 0.01030659, "balance_loss_clip": 1.01792181, "balance_loss_mlp": 1.03899825, "epoch": 0.6189688862167443, "flos": 14829814874880.0, "grad_norm": 2.1149241374815024, "language_loss": 0.70290589, "learning_rate": 1.2699376622440727e-06, "loss": 0.72450894, "num_input_tokens_seen": 221687320, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 10295, "time_per_iteration": 2.560574531555176 }, { "auxiliary_loss_clip": 0.01110403, "auxiliary_loss_mlp": 0.01037338, "balance_loss_clip": 1.02430296, "balance_loss_mlp": 1.03902829, "epoch": 0.6190290094694123, "flos": 24285058560000.0, "grad_norm": 1.444837835757856, "language_loss": 0.70056736, "learning_rate": 1.2695859244625864e-06, "loss": 0.72204483, "num_input_tokens_seen": 221710175, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 10296, "time_per_iteration": 2.5983171463012695 }, { "auxiliary_loss_clip": 0.01140501, "auxiliary_loss_mlp": 0.01038097, "balance_loss_clip": 1.02353537, "balance_loss_mlp": 1.03976393, "epoch": 0.6190891327220802, "flos": 22529457423360.0, "grad_norm": 32.264437266941954, "language_loss": 0.71323246, "learning_rate": 1.269234212747699e-06, "loss": 0.73501843, "num_input_tokens_seen": 221728145, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7421875, "step": 10297, "time_per_iteration": 2.594268798828125 }, { "auxiliary_loss_clip": 0.01031934, "auxiliary_loss_mlp": 0.01002158, "balance_loss_clip": 1.00073338, "balance_loss_mlp": 1.0094583, "epoch": 0.6191492559747482, "flos": 67729357152000.0, "grad_norm": 0.8790490827005201, "language_loss": 0.64153844, "learning_rate": 1.2688825271119634e-06, "loss": 0.66187936, "num_input_tokens_seen": 221786100, "router_z_loss_clip": 0.01422119, "router_z_loss_mlp": 0.22460938, "step": 10298, "time_per_iteration": 3.01350998878479 }, { "auxiliary_loss_clip": 0.0111161, "auxiliary_loss_mlp": 0.01035101, "balance_loss_clip": 1.02192831, "balance_loss_mlp": 1.03799224, "epoch": 0.6192093792274163, "flos": 22346672088960.0, "grad_norm": 1.9763341605582143, "language_loss": 0.74050242, "learning_rate": 1.2685308675679295e-06, "loss": 0.76196957, "num_input_tokens_seen": 221806450, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 10299, "time_per_iteration": 2.610121965408325 }, { "auxiliary_loss_clip": 0.01124854, "auxiliary_loss_mlp": 0.01030553, "balance_loss_clip": 1.01722527, "balance_loss_mlp": 1.03952074, "epoch": 0.6192695024800842, "flos": 13553944807680.0, "grad_norm": 1.854641977392891, "language_loss": 0.68105745, "learning_rate": 1.2681792341281474e-06, "loss": 0.70261157, "num_input_tokens_seen": 221823330, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.765625, "step": 10300, "time_per_iteration": 2.5215322971343994 }, { "auxiliary_loss_clip": 0.01122267, "auxiliary_loss_mlp": 0.01283694, "balance_loss_clip": 1.02432919, "balance_loss_mlp": 1.03936815, "epoch": 0.6193296257327522, "flos": 17415310815360.0, "grad_norm": 1.9427098192960592, "language_loss": 0.66380048, "learning_rate": 1.267827626805166e-06, "loss": 0.68786007, "num_input_tokens_seen": 221839360, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73828125, "step": 10301, "time_per_iteration": 2.5717990398406982 }, { "auxiliary_loss_clip": 0.01134638, "auxiliary_loss_mlp": 0.01034113, "balance_loss_clip": 1.0216378, "balance_loss_mlp": 1.03506207, "epoch": 0.6193897489854201, "flos": 31101118450560.0, "grad_norm": 1.899044046259515, "language_loss": 0.73153472, "learning_rate": 1.267476045611533e-06, "loss": 0.75322223, "num_input_tokens_seen": 221859465, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 10302, "time_per_iteration": 2.654287576675415 }, { "auxiliary_loss_clip": 0.01127225, "auxiliary_loss_mlp": 0.01030731, "balance_loss_clip": 1.01712906, "balance_loss_mlp": 1.03737879, "epoch": 0.6194498722380881, "flos": 19134031662720.0, "grad_norm": 1.7109575658967775, "language_loss": 0.80086136, "learning_rate": 1.267124490559796e-06, "loss": 0.82244092, "num_input_tokens_seen": 221878555, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 10303, "time_per_iteration": 2.574704170227051 }, { "auxiliary_loss_clip": 0.01113649, "auxiliary_loss_mlp": 0.01033604, "balance_loss_clip": 1.01915622, "balance_loss_mlp": 1.03832626, "epoch": 0.619509995490756, "flos": 21835088634240.0, "grad_norm": 1.6457156887207478, "language_loss": 0.76696908, "learning_rate": 1.2667729616625006e-06, "loss": 0.7884416, "num_input_tokens_seen": 221898790, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75390625, "step": 10304, "time_per_iteration": 2.5487732887268066 }, { "auxiliary_loss_clip": 0.01142255, "auxiliary_loss_mlp": 0.01033818, "balance_loss_clip": 1.02016854, "balance_loss_mlp": 1.03859544, "epoch": 0.619570118743424, "flos": 23806548552960.0, "grad_norm": 3.1810651921527207, "language_loss": 0.76836056, "learning_rate": 1.266421458932192e-06, "loss": 0.79012132, "num_input_tokens_seen": 221918875, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76953125, "step": 10305, "time_per_iteration": 2.6179497241973877 }, { "auxiliary_loss_clip": 0.01127055, "auxiliary_loss_mlp": 0.01035666, "balance_loss_clip": 1.02115846, "balance_loss_mlp": 1.04143202, "epoch": 0.619630241996092, "flos": 21101612912640.0, "grad_norm": 1.6078974718823997, "language_loss": 0.78446615, "learning_rate": 1.2660699823814147e-06, "loss": 0.80609334, "num_input_tokens_seen": 221937895, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.765625, "step": 10306, "time_per_iteration": 2.5917999744415283 }, { "auxiliary_loss_clip": 0.01140799, "auxiliary_loss_mlp": 0.01029272, "balance_loss_clip": 1.01762486, "balance_loss_mlp": 1.03570235, "epoch": 0.61969036524876, "flos": 27308269635840.0, "grad_norm": 1.6105667105594794, "language_loss": 0.80092883, "learning_rate": 1.2657185320227122e-06, "loss": 0.82262957, "num_input_tokens_seen": 221955920, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.69140625, "step": 10307, "time_per_iteration": 2.702091932296753 }, { "auxiliary_loss_clip": 0.01058202, "auxiliary_loss_mlp": 0.01002241, "balance_loss_clip": 1.00079906, "balance_loss_mlp": 1.00885344, "epoch": 0.6197504885014279, "flos": 51648955384320.0, "grad_norm": 0.8114398422124715, "language_loss": 0.59380096, "learning_rate": 1.2653671078686261e-06, "loss": 0.61440539, "num_input_tokens_seen": 222011405, "router_z_loss_clip": 0.0144043, "router_z_loss_mlp": 0.22265625, "step": 10308, "time_per_iteration": 3.2298619747161865 }, { "auxiliary_loss_clip": 0.01125004, "auxiliary_loss_mlp": 0.01032329, "balance_loss_clip": 1.02084923, "balance_loss_mlp": 1.03708482, "epoch": 0.6198106117540959, "flos": 30557107992960.0, "grad_norm": 1.732897001482808, "language_loss": 0.67662382, "learning_rate": 1.2650157099316982e-06, "loss": 0.69819713, "num_input_tokens_seen": 222034545, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.69921875, "step": 10309, "time_per_iteration": 2.655867099761963 }, { "auxiliary_loss_clip": 0.01125713, "auxiliary_loss_mlp": 0.0103186, "balance_loss_clip": 1.01967096, "balance_loss_mlp": 1.03715074, "epoch": 0.6198707350067638, "flos": 18909733184640.0, "grad_norm": 1.5264453149183126, "language_loss": 0.72070295, "learning_rate": 1.264664338224469e-06, "loss": 0.7422787, "num_input_tokens_seen": 222052690, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 10310, "time_per_iteration": 2.557612657546997 }, { "auxiliary_loss_clip": 0.01133118, "auxiliary_loss_mlp": 0.01035707, "balance_loss_clip": 1.02108634, "balance_loss_mlp": 1.0412519, "epoch": 0.6199308582594318, "flos": 21433858738560.0, "grad_norm": 1.9350877492815834, "language_loss": 0.78941512, "learning_rate": 1.2643129927594781e-06, "loss": 0.81110346, "num_input_tokens_seen": 222069095, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.74609375, "step": 10311, "time_per_iteration": 2.5491676330566406 }, { "auxiliary_loss_clip": 0.01134217, "auxiliary_loss_mlp": 0.01036024, "balance_loss_clip": 1.02280354, "balance_loss_mlp": 1.03453088, "epoch": 0.6199909815120999, "flos": 18407379525120.0, "grad_norm": 1.6505464524933025, "language_loss": 0.72414982, "learning_rate": 1.2639616735492639e-06, "loss": 0.74585223, "num_input_tokens_seen": 222087360, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 10312, "time_per_iteration": 2.5670485496520996 }, { "auxiliary_loss_clip": 0.01149003, "auxiliary_loss_mlp": 0.01032319, "balance_loss_clip": 1.01905143, "balance_loss_mlp": 1.03905702, "epoch": 0.6200511047647678, "flos": 21466860359040.0, "grad_norm": 2.074876427901862, "language_loss": 0.71684647, "learning_rate": 1.2636103806063644e-06, "loss": 0.73865962, "num_input_tokens_seen": 222106130, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 10313, "time_per_iteration": 2.543835401535034 }, { "auxiliary_loss_clip": 0.01121585, "auxiliary_loss_mlp": 0.01033878, "balance_loss_clip": 1.02018678, "balance_loss_mlp": 1.03778768, "epoch": 0.6201112280174358, "flos": 18215903099520.0, "grad_norm": 1.7609119276676037, "language_loss": 0.78401971, "learning_rate": 1.2632591139433167e-06, "loss": 0.80557436, "num_input_tokens_seen": 222123125, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 10314, "time_per_iteration": 2.56086802482605 }, { "auxiliary_loss_clip": 0.01117015, "auxiliary_loss_mlp": 0.01033413, "balance_loss_clip": 1.02049661, "balance_loss_mlp": 1.03446627, "epoch": 0.6201713512701037, "flos": 20011185786240.0, "grad_norm": 1.8813941026828913, "language_loss": 0.78020221, "learning_rate": 1.2629078735726553e-06, "loss": 0.80170643, "num_input_tokens_seen": 222140655, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 10315, "time_per_iteration": 2.548096179962158 }, { "auxiliary_loss_clip": 0.01141653, "auxiliary_loss_mlp": 0.01037393, "balance_loss_clip": 1.02489984, "balance_loss_mlp": 1.04019094, "epoch": 0.6202314745227717, "flos": 22487692884480.0, "grad_norm": 1.7551681121906006, "language_loss": 0.75992566, "learning_rate": 1.2625566595069162e-06, "loss": 0.78171611, "num_input_tokens_seen": 222160450, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.74609375, "step": 10316, "time_per_iteration": 2.6532363891601562 }, { "auxiliary_loss_clip": 0.01109469, "auxiliary_loss_mlp": 0.01032884, "balance_loss_clip": 1.0195564, "balance_loss_mlp": 1.03394735, "epoch": 0.6202915977754396, "flos": 26828682220800.0, "grad_norm": 1.9415474403573965, "language_loss": 0.77829778, "learning_rate": 1.2622054717586328e-06, "loss": 0.7997213, "num_input_tokens_seen": 222179170, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75390625, "step": 10317, "time_per_iteration": 4.139019727706909 }, { "auxiliary_loss_clip": 0.01048378, "auxiliary_loss_mlp": 0.0100444, "balance_loss_clip": 1.00303292, "balance_loss_mlp": 1.00812423, "epoch": 0.6203517210281076, "flos": 62742694890240.0, "grad_norm": 0.7044642998016788, "language_loss": 0.59082067, "learning_rate": 1.2618543103403385e-06, "loss": 0.61134887, "num_input_tokens_seen": 222242660, "router_z_loss_clip": 0.01403809, "router_z_loss_mlp": 0.22265625, "step": 10318, "time_per_iteration": 3.2311978340148926 }, { "auxiliary_loss_clip": 0.01132044, "auxiliary_loss_mlp": 0.01033145, "balance_loss_clip": 1.01948965, "balance_loss_mlp": 1.04032421, "epoch": 0.6204118442807756, "flos": 23404277162880.0, "grad_norm": 1.6885297382271778, "language_loss": 0.77616119, "learning_rate": 1.261503175264565e-06, "loss": 0.79781306, "num_input_tokens_seen": 222262170, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 10319, "time_per_iteration": 2.628753662109375 }, { "auxiliary_loss_clip": 0.01126605, "auxiliary_loss_mlp": 0.0103682, "balance_loss_clip": 1.02457774, "balance_loss_mlp": 1.03714776, "epoch": 0.6204719675334436, "flos": 20193647898240.0, "grad_norm": 2.0298840806623293, "language_loss": 0.6592952, "learning_rate": 1.2611520665438435e-06, "loss": 0.68092942, "num_input_tokens_seen": 222280375, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.71875, "step": 10320, "time_per_iteration": 2.572185754776001 }, { "auxiliary_loss_clip": 0.01132375, "auxiliary_loss_mlp": 0.01030501, "balance_loss_clip": 1.01825857, "balance_loss_mlp": 1.03582633, "epoch": 0.6205320907861115, "flos": 13188050916480.0, "grad_norm": 1.6366403337140043, "language_loss": 0.76326191, "learning_rate": 1.2608009841907046e-06, "loss": 0.78489065, "num_input_tokens_seen": 222297325, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 10321, "time_per_iteration": 2.5835483074188232 }, { "auxiliary_loss_clip": 0.01123431, "auxiliary_loss_mlp": 0.01026539, "balance_loss_clip": 1.01424885, "balance_loss_mlp": 1.03483319, "epoch": 0.6205922140387795, "flos": 20668386977280.0, "grad_norm": 1.7574941602831546, "language_loss": 0.7367866, "learning_rate": 1.2604499282176768e-06, "loss": 0.7582863, "num_input_tokens_seen": 222317095, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 10322, "time_per_iteration": 3.9265806674957275 }, { "auxiliary_loss_clip": 0.01134459, "auxiliary_loss_mlp": 0.0102882, "balance_loss_clip": 1.01682758, "balance_loss_mlp": 1.03677869, "epoch": 0.6206523372914474, "flos": 23877831093120.0, "grad_norm": 2.0318338446959694, "language_loss": 0.72972739, "learning_rate": 1.260098898637289e-06, "loss": 0.75136024, "num_input_tokens_seen": 222337055, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 10323, "time_per_iteration": 2.641709327697754 }, { "auxiliary_loss_clip": 0.01124964, "auxiliary_loss_mlp": 0.0102957, "balance_loss_clip": 1.01592064, "balance_loss_mlp": 1.03937423, "epoch": 0.6207124605441154, "flos": 13406603218560.0, "grad_norm": 2.212373327823365, "language_loss": 0.58419478, "learning_rate": 1.2597478954620677e-06, "loss": 0.60574013, "num_input_tokens_seen": 222354515, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76953125, "step": 10324, "time_per_iteration": 2.56364107131958 }, { "auxiliary_loss_clip": 0.01138942, "auxiliary_loss_mlp": 0.01032992, "balance_loss_clip": 1.01915765, "balance_loss_mlp": 1.03842187, "epoch": 0.6207725837967835, "flos": 18916341287040.0, "grad_norm": 1.7502425221765718, "language_loss": 0.76078397, "learning_rate": 1.2593969187045402e-06, "loss": 0.78250337, "num_input_tokens_seen": 222372755, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73828125, "step": 10325, "time_per_iteration": 2.541351079940796 }, { "auxiliary_loss_clip": 0.01121908, "auxiliary_loss_mlp": 0.01031253, "balance_loss_clip": 1.0176158, "balance_loss_mlp": 1.03757787, "epoch": 0.6208327070494514, "flos": 23980211832960.0, "grad_norm": 1.6532785012112134, "language_loss": 0.72343314, "learning_rate": 1.2590459683772317e-06, "loss": 0.74496472, "num_input_tokens_seen": 222391380, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7578125, "step": 10326, "time_per_iteration": 2.6305227279663086 }, { "auxiliary_loss_clip": 0.01122859, "auxiliary_loss_mlp": 0.01039418, "balance_loss_clip": 1.02594113, "balance_loss_mlp": 1.0381918, "epoch": 0.6208928303021194, "flos": 22820405587200.0, "grad_norm": 2.459521880992658, "language_loss": 0.73854399, "learning_rate": 1.2586950444926663e-06, "loss": 0.76016676, "num_input_tokens_seen": 222411165, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 10327, "time_per_iteration": 2.5398552417755127 }, { "auxiliary_loss_clip": 0.01132533, "auxiliary_loss_mlp": 0.01034763, "balance_loss_clip": 1.02041578, "balance_loss_mlp": 1.03918195, "epoch": 0.6209529535547873, "flos": 17564519911680.0, "grad_norm": 2.5986087566794125, "language_loss": 0.79715073, "learning_rate": 1.2583441470633683e-06, "loss": 0.8188237, "num_input_tokens_seen": 222428110, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75390625, "step": 10328, "time_per_iteration": 5.987366199493408 }, { "auxiliary_loss_clip": 0.01137875, "auxiliary_loss_mlp": 0.01039155, "balance_loss_clip": 1.02548218, "balance_loss_mlp": 1.03716445, "epoch": 0.6210130768074553, "flos": 22011912311040.0, "grad_norm": 1.9196671120601632, "language_loss": 0.77704895, "learning_rate": 1.2579932761018596e-06, "loss": 0.79881924, "num_input_tokens_seen": 222446385, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 10329, "time_per_iteration": 2.6201248168945312 }, { "auxiliary_loss_clip": 0.01040283, "auxiliary_loss_mlp": 0.01002491, "balance_loss_clip": 1.00104809, "balance_loss_mlp": 1.00929117, "epoch": 0.6210732000601232, "flos": 63676873854720.0, "grad_norm": 1.044657381648618, "language_loss": 0.62176144, "learning_rate": 1.2576424316206624e-06, "loss": 0.64218915, "num_input_tokens_seen": 222502150, "router_z_loss_clip": 0.0144043, "router_z_loss_mlp": 0.22460938, "step": 10330, "time_per_iteration": 3.164309501647949 }, { "auxiliary_loss_clip": 0.01122862, "auxiliary_loss_mlp": 0.01039666, "balance_loss_clip": 1.02507472, "balance_loss_mlp": 1.03708732, "epoch": 0.6211333233127913, "flos": 24243365848320.0, "grad_norm": 2.2073441009699795, "language_loss": 0.77577394, "learning_rate": 1.2572916136322974e-06, "loss": 0.79739922, "num_input_tokens_seen": 222519880, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.76953125, "step": 10331, "time_per_iteration": 2.6283886432647705 }, { "auxiliary_loss_clip": 0.01113553, "auxiliary_loss_mlp": 0.01039552, "balance_loss_clip": 1.02555656, "balance_loss_mlp": 1.03807902, "epoch": 0.6211934465654592, "flos": 16943803960320.0, "grad_norm": 1.9018526252160446, "language_loss": 0.67649281, "learning_rate": 1.2569408221492835e-06, "loss": 0.69802386, "num_input_tokens_seen": 222538545, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75390625, "step": 10332, "time_per_iteration": 2.5729198455810547 }, { "auxiliary_loss_clip": 0.01137328, "auxiliary_loss_mlp": 0.01034628, "balance_loss_clip": 1.02125823, "balance_loss_mlp": 1.03741264, "epoch": 0.6212535698181272, "flos": 15267386355840.0, "grad_norm": 1.5911110299758058, "language_loss": 0.76279902, "learning_rate": 1.256590057184141e-06, "loss": 0.78451854, "num_input_tokens_seen": 222556935, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 10333, "time_per_iteration": 2.5649211406707764 }, { "auxiliary_loss_clip": 0.01126531, "auxiliary_loss_mlp": 0.01033124, "balance_loss_clip": 1.02057171, "balance_loss_mlp": 1.03774238, "epoch": 0.6213136930707951, "flos": 13443950384640.0, "grad_norm": 2.0896167098888774, "language_loss": 0.69639134, "learning_rate": 1.2562393187493866e-06, "loss": 0.7179879, "num_input_tokens_seen": 222574035, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 10334, "time_per_iteration": 2.6301581859588623 }, { "auxiliary_loss_clip": 0.01128596, "auxiliary_loss_mlp": 0.01280512, "balance_loss_clip": 1.02138138, "balance_loss_mlp": 1.03937459, "epoch": 0.6213738163234631, "flos": 18111223889280.0, "grad_norm": 1.8450295770623286, "language_loss": 0.70164424, "learning_rate": 1.2558886068575381e-06, "loss": 0.72573531, "num_input_tokens_seen": 222592290, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71484375, "step": 10335, "time_per_iteration": 2.59139084815979 }, { "auxiliary_loss_clip": 0.01141111, "auxiliary_loss_mlp": 0.01033345, "balance_loss_clip": 1.02092302, "balance_loss_mlp": 1.03502512, "epoch": 0.621433939576131, "flos": 25337348421120.0, "grad_norm": 1.4988747666926996, "language_loss": 0.80309129, "learning_rate": 1.2555379215211113e-06, "loss": 0.82483584, "num_input_tokens_seen": 222612805, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 10336, "time_per_iteration": 2.625718355178833 }, { "auxiliary_loss_clip": 0.01107471, "auxiliary_loss_mlp": 0.01032101, "balance_loss_clip": 1.01934612, "balance_loss_mlp": 1.03727305, "epoch": 0.621494062828799, "flos": 22565619440640.0, "grad_norm": 1.7054211318000787, "language_loss": 0.73113728, "learning_rate": 1.2551872627526208e-06, "loss": 0.75253296, "num_input_tokens_seen": 222632260, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 10337, "time_per_iteration": 2.5672202110290527 }, { "auxiliary_loss_clip": 0.01114208, "auxiliary_loss_mlp": 0.01041201, "balance_loss_clip": 1.0274086, "balance_loss_mlp": 1.03845131, "epoch": 0.621554186081467, "flos": 27417976750080.0, "grad_norm": 2.0156689047012417, "language_loss": 0.62849587, "learning_rate": 1.2548366305645815e-06, "loss": 0.65005004, "num_input_tokens_seen": 222653570, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 10338, "time_per_iteration": 2.5719990730285645 }, { "auxiliary_loss_clip": 0.01119552, "auxiliary_loss_mlp": 0.01033833, "balance_loss_clip": 1.0203383, "balance_loss_mlp": 1.03881872, "epoch": 0.621614309334135, "flos": 22346815743360.0, "grad_norm": 1.624387688791576, "language_loss": 0.71728694, "learning_rate": 1.2544860249695052e-06, "loss": 0.73882073, "num_input_tokens_seen": 222672480, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 10339, "time_per_iteration": 2.5382778644561768 }, { "auxiliary_loss_clip": 0.01143725, "auxiliary_loss_mlp": 0.01033016, "balance_loss_clip": 1.02000427, "balance_loss_mlp": 1.03799009, "epoch": 0.621674432586803, "flos": 19281229597440.0, "grad_norm": 1.48645655534741, "language_loss": 0.69533253, "learning_rate": 1.2541354459799057e-06, "loss": 0.71709991, "num_input_tokens_seen": 222691200, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 10340, "time_per_iteration": 2.609528064727783 }, { "auxiliary_loss_clip": 0.01133444, "auxiliary_loss_mlp": 0.01029348, "balance_loss_clip": 1.01665246, "balance_loss_mlp": 1.03564095, "epoch": 0.6217345558394709, "flos": 21609533180160.0, "grad_norm": 2.0674878452781513, "language_loss": 0.68730253, "learning_rate": 1.2537848936082926e-06, "loss": 0.70893049, "num_input_tokens_seen": 222709975, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 10341, "time_per_iteration": 2.5564424991607666 }, { "auxiliary_loss_clip": 0.01121883, "auxiliary_loss_mlp": 0.01032503, "balance_loss_clip": 1.01809072, "balance_loss_mlp": 1.03851104, "epoch": 0.6217946790921389, "flos": 18004102554240.0, "grad_norm": 1.7727643368516752, "language_loss": 0.80502188, "learning_rate": 1.253434367867178e-06, "loss": 0.8265658, "num_input_tokens_seen": 222729005, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75, "step": 10342, "time_per_iteration": 2.520808696746826 }, { "auxiliary_loss_clip": 0.01049666, "auxiliary_loss_mlp": 0.0100025, "balance_loss_clip": 0.99878377, "balance_loss_mlp": 1.00948465, "epoch": 0.6218548023448068, "flos": 61973631768960.0, "grad_norm": 0.7800337899957208, "language_loss": 0.57311106, "learning_rate": 1.2530838687690704e-06, "loss": 0.59361023, "num_input_tokens_seen": 222786090, "router_z_loss_clip": 0.01464844, "router_z_loss_mlp": 0.21972656, "step": 10343, "time_per_iteration": 3.061079740524292 }, { "auxiliary_loss_clip": 0.0113295, "auxiliary_loss_mlp": 0.01028988, "balance_loss_clip": 1.01779437, "balance_loss_mlp": 1.03723621, "epoch": 0.6219149255974749, "flos": 25739152934400.0, "grad_norm": 1.8649468109761969, "language_loss": 0.72898924, "learning_rate": 1.2527333963264777e-06, "loss": 0.75060862, "num_input_tokens_seen": 222806100, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6875, "step": 10344, "time_per_iteration": 2.6632657051086426 }, { "auxiliary_loss_clip": 0.01121828, "auxiliary_loss_mlp": 0.0104046, "balance_loss_clip": 1.02836061, "balance_loss_mlp": 1.03989816, "epoch": 0.6219750488501428, "flos": 25411073086080.0, "grad_norm": 2.071848161588267, "language_loss": 0.59794378, "learning_rate": 1.2523829505519083e-06, "loss": 0.61956674, "num_input_tokens_seen": 222826575, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.73046875, "step": 10345, "time_per_iteration": 2.558793544769287 }, { "auxiliary_loss_clip": 0.01154572, "auxiliary_loss_mlp": 0.01035341, "balance_loss_clip": 1.02192414, "balance_loss_mlp": 1.03657603, "epoch": 0.6220351721028108, "flos": 20047383717120.0, "grad_norm": 3.095251215041045, "language_loss": 0.7801168, "learning_rate": 1.252032531457868e-06, "loss": 0.80201602, "num_input_tokens_seen": 222845285, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 10346, "time_per_iteration": 2.636655569076538 }, { "auxiliary_loss_clip": 0.0114066, "auxiliary_loss_mlp": 0.0103907, "balance_loss_clip": 1.02449059, "balance_loss_mlp": 1.03810656, "epoch": 0.6220952953554787, "flos": 27488397363840.0, "grad_norm": 1.87805619733528, "language_loss": 0.70925587, "learning_rate": 1.251682139056863e-06, "loss": 0.73105317, "num_input_tokens_seen": 222864575, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7578125, "step": 10347, "time_per_iteration": 2.7087666988372803 }, { "auxiliary_loss_clip": 0.01114821, "auxiliary_loss_mlp": 0.01030624, "balance_loss_clip": 1.01765382, "balance_loss_mlp": 1.03474033, "epoch": 0.6221554186081467, "flos": 19207612673280.0, "grad_norm": 1.8561179819287865, "language_loss": 0.71493387, "learning_rate": 1.2513317733613976e-06, "loss": 0.73638833, "num_input_tokens_seen": 222884420, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 10348, "time_per_iteration": 2.6462695598602295 }, { "auxiliary_loss_clip": 0.01058359, "auxiliary_loss_mlp": 0.01001274, "balance_loss_clip": 0.99980181, "balance_loss_mlp": 1.00945568, "epoch": 0.6222155418608146, "flos": 62950939989120.0, "grad_norm": 0.8135084580048545, "language_loss": 0.54123038, "learning_rate": 1.2509814343839748e-06, "loss": 0.56182671, "num_input_tokens_seen": 222944690, "router_z_loss_clip": 0.01470947, "router_z_loss_mlp": 0.22167969, "step": 10349, "time_per_iteration": 3.250488042831421 }, { "auxiliary_loss_clip": 0.01136596, "auxiliary_loss_mlp": 0.01030603, "balance_loss_clip": 1.01761544, "balance_loss_mlp": 1.03785706, "epoch": 0.6222756651134826, "flos": 22601099099520.0, "grad_norm": 2.1187032330127193, "language_loss": 0.69668084, "learning_rate": 1.2506311221370984e-06, "loss": 0.71835285, "num_input_tokens_seen": 222962990, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 10350, "time_per_iteration": 2.6982765197753906 }, { "auxiliary_loss_clip": 0.01111821, "auxiliary_loss_mlp": 0.01032574, "balance_loss_clip": 1.01975918, "balance_loss_mlp": 1.03896689, "epoch": 0.6223357883661506, "flos": 21142228216320.0, "grad_norm": 2.1904455888069103, "language_loss": 0.57196879, "learning_rate": 1.2502808366332694e-06, "loss": 0.59341276, "num_input_tokens_seen": 222980715, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 10351, "time_per_iteration": 2.7061328887939453 }, { "auxiliary_loss_clip": 0.01040585, "auxiliary_loss_mlp": 0.01000223, "balance_loss_clip": 0.99863797, "balance_loss_mlp": 1.00959837, "epoch": 0.6223959116188186, "flos": 63765071700480.0, "grad_norm": 0.8367203152188539, "language_loss": 0.61154443, "learning_rate": 1.2499305778849895e-06, "loss": 0.63195252, "num_input_tokens_seen": 223040685, "router_z_loss_clip": 0.01586914, "router_z_loss_mlp": 0.22265625, "step": 10352, "time_per_iteration": 3.1733970642089844 }, { "auxiliary_loss_clip": 0.01106593, "auxiliary_loss_mlp": 0.01033748, "balance_loss_clip": 1.02087963, "balance_loss_mlp": 1.0361892, "epoch": 0.6224560348714866, "flos": 22565727181440.0, "grad_norm": 2.370963138269045, "language_loss": 0.82031357, "learning_rate": 1.2495803459047576e-06, "loss": 0.84171695, "num_input_tokens_seen": 223059000, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 10353, "time_per_iteration": 2.5895490646362305 }, { "auxiliary_loss_clip": 0.01112895, "auxiliary_loss_mlp": 0.01035032, "balance_loss_clip": 1.02334952, "balance_loss_mlp": 1.03589046, "epoch": 0.6225161581241545, "flos": 24097748112000.0, "grad_norm": 1.7443675651229738, "language_loss": 0.75649869, "learning_rate": 1.2492301407050722e-06, "loss": 0.777978, "num_input_tokens_seen": 223079345, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 10354, "time_per_iteration": 2.6250081062316895 }, { "auxiliary_loss_clip": 0.01135001, "auxiliary_loss_mlp": 0.01033624, "balance_loss_clip": 1.0211246, "balance_loss_mlp": 1.03704834, "epoch": 0.6225762813768225, "flos": 20443513881600.0, "grad_norm": 1.426482562578763, "language_loss": 0.78786439, "learning_rate": 1.2488799622984325e-06, "loss": 0.80955064, "num_input_tokens_seen": 223097880, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 10355, "time_per_iteration": 2.5968422889709473 }, { "auxiliary_loss_clip": 0.01118059, "auxiliary_loss_mlp": 0.01032382, "balance_loss_clip": 1.02021074, "balance_loss_mlp": 1.03809333, "epoch": 0.6226364046294904, "flos": 27198131558400.0, "grad_norm": 1.9483023335862915, "language_loss": 0.78375018, "learning_rate": 1.2485298106973344e-06, "loss": 0.80525464, "num_input_tokens_seen": 223118185, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 10356, "time_per_iteration": 2.569976568222046 }, { "auxiliary_loss_clip": 0.01132421, "auxiliary_loss_mlp": 0.0128083, "balance_loss_clip": 1.01983094, "balance_loss_mlp": 1.03874159, "epoch": 0.6226965278821585, "flos": 20445776438400.0, "grad_norm": 2.5900801223652654, "language_loss": 0.67190492, "learning_rate": 1.2481796859142745e-06, "loss": 0.69603741, "num_input_tokens_seen": 223137600, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 10357, "time_per_iteration": 2.5542259216308594 }, { "auxiliary_loss_clip": 0.01130827, "auxiliary_loss_mlp": 0.01034304, "balance_loss_clip": 1.01936102, "balance_loss_mlp": 1.03777122, "epoch": 0.6227566511348264, "flos": 22162737519360.0, "grad_norm": 2.9445337805445453, "language_loss": 0.76011962, "learning_rate": 1.247829587961748e-06, "loss": 0.78177094, "num_input_tokens_seen": 223154360, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.75390625, "step": 10358, "time_per_iteration": 3.9966049194335938 }, { "auxiliary_loss_clip": 0.01126493, "auxiliary_loss_mlp": 0.01033835, "balance_loss_clip": 1.02094233, "balance_loss_mlp": 1.0357542, "epoch": 0.6228167743874944, "flos": 18040875102720.0, "grad_norm": 2.034130272813085, "language_loss": 0.82506037, "learning_rate": 1.2474795168522483e-06, "loss": 0.84666371, "num_input_tokens_seen": 223172255, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 10359, "time_per_iteration": 2.5221595764160156 }, { "auxiliary_loss_clip": 0.01122642, "auxiliary_loss_mlp": 0.01040008, "balance_loss_clip": 1.02722323, "balance_loss_mlp": 1.03475904, "epoch": 0.6228768976401623, "flos": 17742851959680.0, "grad_norm": 2.0714355547987147, "language_loss": 0.73564398, "learning_rate": 1.247129472598269e-06, "loss": 0.75727051, "num_input_tokens_seen": 223186965, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 10360, "time_per_iteration": 2.5406880378723145 }, { "auxiliary_loss_clip": 0.01116723, "auxiliary_loss_mlp": 0.01039663, "balance_loss_clip": 1.02647853, "balance_loss_mlp": 1.0365591, "epoch": 0.6229370208928303, "flos": 17894934144000.0, "grad_norm": 2.231129422151538, "language_loss": 0.774382, "learning_rate": 1.246779455212302e-06, "loss": 0.79594588, "num_input_tokens_seen": 223206045, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 10361, "time_per_iteration": 2.506761312484741 }, { "auxiliary_loss_clip": 0.01135177, "auxiliary_loss_mlp": 0.01030694, "balance_loss_clip": 1.0185883, "balance_loss_mlp": 1.03763008, "epoch": 0.6229971441454982, "flos": 17347763289600.0, "grad_norm": 1.6200794268668075, "language_loss": 0.67458993, "learning_rate": 1.2464294647068392e-06, "loss": 0.69624865, "num_input_tokens_seen": 223224820, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7109375, "step": 10362, "time_per_iteration": 2.5968122482299805 }, { "auxiliary_loss_clip": 0.01126418, "auxiliary_loss_mlp": 0.01030353, "balance_loss_clip": 1.017115, "balance_loss_mlp": 1.03616285, "epoch": 0.6230572673981662, "flos": 29241376807680.0, "grad_norm": 2.1511263089811834, "language_loss": 0.67385101, "learning_rate": 1.24607950109437e-06, "loss": 0.69541872, "num_input_tokens_seen": 223243205, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 10363, "time_per_iteration": 4.037724733352661 }, { "auxiliary_loss_clip": 0.01147355, "auxiliary_loss_mlp": 0.01036334, "balance_loss_clip": 1.02255297, "balance_loss_mlp": 1.03742623, "epoch": 0.6231173906508342, "flos": 16325961096960.0, "grad_norm": 2.078353206319745, "language_loss": 0.86336923, "learning_rate": 1.2457295643873845e-06, "loss": 0.8852061, "num_input_tokens_seen": 223261370, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 10364, "time_per_iteration": 2.5996897220611572 }, { "auxiliary_loss_clip": 0.01049774, "auxiliary_loss_mlp": 0.01003643, "balance_loss_clip": 1.00208187, "balance_loss_mlp": 1.01006436, "epoch": 0.6231775139035022, "flos": 68702032517760.0, "grad_norm": 0.888905948911251, "language_loss": 0.60781586, "learning_rate": 1.2453796545983704e-06, "loss": 0.62835002, "num_input_tokens_seen": 223315050, "router_z_loss_clip": 0.015625, "router_z_loss_mlp": 0.22167969, "step": 10365, "time_per_iteration": 3.148259162902832 }, { "auxiliary_loss_clip": 0.0114114, "auxiliary_loss_mlp": 0.01286756, "balance_loss_clip": 1.02514541, "balance_loss_mlp": 1.03777933, "epoch": 0.6232376371561702, "flos": 19821038163840.0, "grad_norm": 4.16553126654954, "language_loss": 0.75264966, "learning_rate": 1.2450297717398151e-06, "loss": 0.77692866, "num_input_tokens_seen": 223332130, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 10366, "time_per_iteration": 2.5212900638580322 }, { "auxiliary_loss_clip": 0.01121373, "auxiliary_loss_mlp": 0.01038541, "balance_loss_clip": 1.02489161, "balance_loss_mlp": 1.03831601, "epoch": 0.6232977604088381, "flos": 23258264376960.0, "grad_norm": 1.9568156628999427, "language_loss": 0.75910389, "learning_rate": 1.2446799158242056e-06, "loss": 0.78070307, "num_input_tokens_seen": 223351605, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 10367, "time_per_iteration": 2.557091474533081 }, { "auxiliary_loss_clip": 0.01117972, "auxiliary_loss_mlp": 0.01032834, "balance_loss_clip": 1.02043653, "balance_loss_mlp": 1.03944409, "epoch": 0.6233578836615061, "flos": 21106425335040.0, "grad_norm": 1.5240174036654783, "language_loss": 0.78418618, "learning_rate": 1.244330086864027e-06, "loss": 0.80569422, "num_input_tokens_seen": 223372090, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 10368, "time_per_iteration": 2.5199129581451416 }, { "auxiliary_loss_clip": 0.01158573, "auxiliary_loss_mlp": 0.01034517, "balance_loss_clip": 1.02128434, "balance_loss_mlp": 1.04013145, "epoch": 0.623418006914174, "flos": 23769416868480.0, "grad_norm": 1.9921375952902949, "language_loss": 0.68243539, "learning_rate": 1.2439802848717637e-06, "loss": 0.70436633, "num_input_tokens_seen": 223390110, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 10369, "time_per_iteration": 4.985487461090088 }, { "auxiliary_loss_clip": 0.01112381, "auxiliary_loss_mlp": 0.01037779, "balance_loss_clip": 1.02395082, "balance_loss_mlp": 1.03889966, "epoch": 0.6234781301668421, "flos": 17890480857600.0, "grad_norm": 4.042342124231563, "language_loss": 0.87744987, "learning_rate": 1.2436305098598997e-06, "loss": 0.89895153, "num_input_tokens_seen": 223404205, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 10370, "time_per_iteration": 2.5828628540039062 }, { "auxiliary_loss_clip": 0.01107525, "auxiliary_loss_mlp": 0.01030267, "balance_loss_clip": 1.01771462, "balance_loss_mlp": 1.03682959, "epoch": 0.62353825341951, "flos": 26175503352960.0, "grad_norm": 1.7718939016679494, "language_loss": 0.66363734, "learning_rate": 1.2432807618409163e-06, "loss": 0.68501532, "num_input_tokens_seen": 223424855, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 10371, "time_per_iteration": 2.678119421005249 }, { "auxiliary_loss_clip": 0.01112216, "auxiliary_loss_mlp": 0.0103138, "balance_loss_clip": 1.01985228, "balance_loss_mlp": 1.03549862, "epoch": 0.623598376672178, "flos": 31139902160640.0, "grad_norm": 1.3152033838583674, "language_loss": 0.77709109, "learning_rate": 1.2429310408272966e-06, "loss": 0.798527, "num_input_tokens_seen": 223447225, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 10372, "time_per_iteration": 2.677333116531372 }, { "auxiliary_loss_clip": 0.01116772, "auxiliary_loss_mlp": 0.01032494, "balance_loss_clip": 1.01969695, "balance_loss_mlp": 1.03474069, "epoch": 0.6236584999248459, "flos": 23730202195200.0, "grad_norm": 1.7808312816033014, "language_loss": 0.77684623, "learning_rate": 1.24258134683152e-06, "loss": 0.79833883, "num_input_tokens_seen": 223467520, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 10373, "time_per_iteration": 2.660371780395508 }, { "auxiliary_loss_clip": 0.01113558, "auxiliary_loss_mlp": 0.01027777, "balance_loss_clip": 1.0154624, "balance_loss_mlp": 1.03615999, "epoch": 0.6237186231775139, "flos": 21762764599680.0, "grad_norm": 1.4386649259692263, "language_loss": 0.69715881, "learning_rate": 1.2422316798660677e-06, "loss": 0.71857214, "num_input_tokens_seen": 223488130, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 10374, "time_per_iteration": 2.556802988052368 }, { "auxiliary_loss_clip": 0.0112768, "auxiliary_loss_mlp": 0.01029508, "balance_loss_clip": 1.01728916, "balance_loss_mlp": 1.03685308, "epoch": 0.6237787464301818, "flos": 14939486075520.0, "grad_norm": 2.188547355289931, "language_loss": 0.77083671, "learning_rate": 1.2418820399434171e-06, "loss": 0.79240865, "num_input_tokens_seen": 223505105, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7265625, "step": 10375, "time_per_iteration": 2.549649477005005 }, { "auxiliary_loss_clip": 0.01125311, "auxiliary_loss_mlp": 0.0102886, "balance_loss_clip": 1.01660538, "balance_loss_mlp": 1.03747773, "epoch": 0.6238388696828499, "flos": 35590311302400.0, "grad_norm": 1.4424357543805624, "language_loss": 0.70069814, "learning_rate": 1.241532427076046e-06, "loss": 0.72223985, "num_input_tokens_seen": 223528065, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 10376, "time_per_iteration": 2.683537244796753 }, { "auxiliary_loss_clip": 0.01139563, "auxiliary_loss_mlp": 0.01035562, "balance_loss_clip": 1.02153051, "balance_loss_mlp": 1.03928173, "epoch": 0.6238989929355178, "flos": 23623511823360.0, "grad_norm": 1.558601575130079, "language_loss": 0.7634747, "learning_rate": 1.2411828412764322e-06, "loss": 0.78522587, "num_input_tokens_seen": 223547305, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73828125, "step": 10377, "time_per_iteration": 2.6059913635253906 }, { "auxiliary_loss_clip": 0.01127915, "auxiliary_loss_mlp": 0.01033856, "balance_loss_clip": 1.02230442, "balance_loss_mlp": 1.03762245, "epoch": 0.6239591161881858, "flos": 22087468569600.0, "grad_norm": 1.7025282137145588, "language_loss": 0.67766869, "learning_rate": 1.2408332825570504e-06, "loss": 0.69928634, "num_input_tokens_seen": 223567205, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.7265625, "step": 10378, "time_per_iteration": 2.628828287124634 }, { "auxiliary_loss_clip": 0.01126978, "auxiliary_loss_mlp": 0.01027946, "balance_loss_clip": 1.01499975, "balance_loss_mlp": 1.03693354, "epoch": 0.6240192394408538, "flos": 24535930124160.0, "grad_norm": 1.8981792830547057, "language_loss": 0.76563513, "learning_rate": 1.2404837509303763e-06, "loss": 0.78718442, "num_input_tokens_seen": 223586560, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 10379, "time_per_iteration": 2.5793633460998535 }, { "auxiliary_loss_clip": 0.01122486, "auxiliary_loss_mlp": 0.0102777, "balance_loss_clip": 1.01535428, "balance_loss_mlp": 1.03424716, "epoch": 0.6240793626935217, "flos": 27931930502400.0, "grad_norm": 1.3784186559333853, "language_loss": 0.7967416, "learning_rate": 1.2401342464088835e-06, "loss": 0.81824416, "num_input_tokens_seen": 223610595, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 10380, "time_per_iteration": 2.6307358741760254 }, { "auxiliary_loss_clip": 0.01116663, "auxiliary_loss_mlp": 0.01033458, "balance_loss_clip": 1.02171552, "balance_loss_mlp": 1.0377667, "epoch": 0.6241394859461897, "flos": 22892514140160.0, "grad_norm": 1.578227412817486, "language_loss": 0.79959965, "learning_rate": 1.2397847690050442e-06, "loss": 0.82110083, "num_input_tokens_seen": 223630230, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69921875, "step": 10381, "time_per_iteration": 2.586215019226074 }, { "auxiliary_loss_clip": 0.01127231, "auxiliary_loss_mlp": 0.0103076, "balance_loss_clip": 1.01792753, "balance_loss_mlp": 1.03660047, "epoch": 0.6241996091988576, "flos": 12750766744320.0, "grad_norm": 2.279231339525147, "language_loss": 0.74932206, "learning_rate": 1.2394353187313318e-06, "loss": 0.77090192, "num_input_tokens_seen": 223648360, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 10382, "time_per_iteration": 2.5113205909729004 }, { "auxiliary_loss_clip": 0.0111395, "auxiliary_loss_mlp": 0.01026259, "balance_loss_clip": 1.01453483, "balance_loss_mlp": 1.03539419, "epoch": 0.6242597324515257, "flos": 25851302173440.0, "grad_norm": 1.4574770355261046, "language_loss": 0.78449887, "learning_rate": 1.2390858956002163e-06, "loss": 0.80590093, "num_input_tokens_seen": 223671255, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 10383, "time_per_iteration": 2.6358728408813477 }, { "auxiliary_loss_clip": 0.01133594, "auxiliary_loss_mlp": 0.01028753, "balance_loss_clip": 1.01640248, "balance_loss_mlp": 1.03565025, "epoch": 0.6243198557041936, "flos": 19937712516480.0, "grad_norm": 1.6966072365792444, "language_loss": 0.74751151, "learning_rate": 1.2387364996241678e-06, "loss": 0.76913494, "num_input_tokens_seen": 223689860, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7109375, "step": 10384, "time_per_iteration": 2.557796001434326 }, { "auxiliary_loss_clip": 0.01125757, "auxiliary_loss_mlp": 0.01030562, "balance_loss_clip": 1.01757979, "balance_loss_mlp": 1.0354507, "epoch": 0.6243799789568616, "flos": 18406194376320.0, "grad_norm": 1.8177190463996937, "language_loss": 0.6672734, "learning_rate": 1.238387130815655e-06, "loss": 0.68883657, "num_input_tokens_seen": 223707835, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 10385, "time_per_iteration": 2.705782890319824 }, { "auxiliary_loss_clip": 0.01048311, "auxiliary_loss_mlp": 0.01001729, "balance_loss_clip": 1.00016737, "balance_loss_mlp": 1.00824165, "epoch": 0.6244401022095295, "flos": 66027587523840.0, "grad_norm": 0.7519198865751977, "language_loss": 0.61998117, "learning_rate": 1.2380377891871469e-06, "loss": 0.64048153, "num_input_tokens_seen": 223771875, "router_z_loss_clip": 0.015625, "router_z_loss_mlp": 0.22070312, "step": 10386, "time_per_iteration": 3.1788837909698486 }, { "auxiliary_loss_clip": 0.01116044, "auxiliary_loss_mlp": 0.01032592, "balance_loss_clip": 1.01927066, "balance_loss_mlp": 1.03449917, "epoch": 0.6245002254621975, "flos": 24571266128640.0, "grad_norm": 4.0134207141056715, "language_loss": 0.71789688, "learning_rate": 1.23768847475111e-06, "loss": 0.73938322, "num_input_tokens_seen": 223788895, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 10387, "time_per_iteration": 2.604660987854004 }, { "auxiliary_loss_clip": 0.01122491, "auxiliary_loss_mlp": 0.01039377, "balance_loss_clip": 1.02550077, "balance_loss_mlp": 1.03690863, "epoch": 0.6245603487148654, "flos": 29168837291520.0, "grad_norm": 2.406724865365744, "language_loss": 0.65599722, "learning_rate": 1.23733918752001e-06, "loss": 0.67761594, "num_input_tokens_seen": 223810385, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 10388, "time_per_iteration": 2.553117275238037 }, { "auxiliary_loss_clip": 0.01133888, "auxiliary_loss_mlp": 0.01028102, "balance_loss_clip": 1.01646721, "balance_loss_mlp": 1.03601146, "epoch": 0.6246204719675335, "flos": 14790097411200.0, "grad_norm": 1.5559283576123006, "language_loss": 0.78957349, "learning_rate": 1.2369899275063133e-06, "loss": 0.81119335, "num_input_tokens_seen": 223826040, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.71484375, "step": 10389, "time_per_iteration": 2.5794663429260254 }, { "auxiliary_loss_clip": 0.01124398, "auxiliary_loss_mlp": 0.01034232, "balance_loss_clip": 1.02102351, "balance_loss_mlp": 1.03582191, "epoch": 0.6246805952202014, "flos": 12493538472960.0, "grad_norm": 1.8483108606372942, "language_loss": 0.60349023, "learning_rate": 1.236640694722483e-06, "loss": 0.62507653, "num_input_tokens_seen": 223842300, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 10390, "time_per_iteration": 2.5168609619140625 }, { "auxiliary_loss_clip": 0.01128109, "auxiliary_loss_mlp": 0.01035947, "balance_loss_clip": 1.02326298, "balance_loss_mlp": 1.03664327, "epoch": 0.6247407184728694, "flos": 12786677366400.0, "grad_norm": 6.106279954272594, "language_loss": 0.76899445, "learning_rate": 1.2362914891809828e-06, "loss": 0.79063499, "num_input_tokens_seen": 223858320, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 10391, "time_per_iteration": 2.5780582427978516 }, { "auxiliary_loss_clip": 0.01128507, "auxiliary_loss_mlp": 0.01032608, "balance_loss_clip": 1.01932192, "balance_loss_mlp": 1.03838146, "epoch": 0.6248008417255374, "flos": 40629188960640.0, "grad_norm": 1.5039379937852093, "language_loss": 0.64765882, "learning_rate": 1.2359423108942752e-06, "loss": 0.66926998, "num_input_tokens_seen": 223883545, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 10392, "time_per_iteration": 2.7271087169647217 }, { "auxiliary_loss_clip": 0.01128544, "auxiliary_loss_mlp": 0.01033392, "balance_loss_clip": 1.02042162, "balance_loss_mlp": 1.03780544, "epoch": 0.6248609649782053, "flos": 19902017376000.0, "grad_norm": 1.7018225053527922, "language_loss": 0.76892996, "learning_rate": 1.2355931598748206e-06, "loss": 0.79054928, "num_input_tokens_seen": 223901445, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 10393, "time_per_iteration": 2.5712924003601074 }, { "auxiliary_loss_clip": 0.01139912, "auxiliary_loss_mlp": 0.01284728, "balance_loss_clip": 1.02409363, "balance_loss_mlp": 1.03886294, "epoch": 0.6249210882308733, "flos": 19682746801920.0, "grad_norm": 1.86989955199941, "language_loss": 0.82615614, "learning_rate": 1.2352440361350803e-06, "loss": 0.85040259, "num_input_tokens_seen": 223920170, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 10394, "time_per_iteration": 2.5659685134887695 }, { "auxiliary_loss_clip": 0.01128749, "auxiliary_loss_mlp": 0.01036545, "balance_loss_clip": 1.02303255, "balance_loss_mlp": 1.03755248, "epoch": 0.6249812114835412, "flos": 13990726189440.0, "grad_norm": 1.7552962576723228, "language_loss": 0.74744034, "learning_rate": 1.2348949396875125e-06, "loss": 0.76909328, "num_input_tokens_seen": 223936495, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 10395, "time_per_iteration": 2.5696027278900146 }, { "auxiliary_loss_clip": 0.01129532, "auxiliary_loss_mlp": 0.01033403, "balance_loss_clip": 1.02002156, "balance_loss_mlp": 1.03712916, "epoch": 0.6250413347362093, "flos": 14530031965440.0, "grad_norm": 2.3760821487328494, "language_loss": 0.72755444, "learning_rate": 1.2345458705445771e-06, "loss": 0.74918377, "num_input_tokens_seen": 223950070, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 10396, "time_per_iteration": 2.527587890625 }, { "auxiliary_loss_clip": 0.01136143, "auxiliary_loss_mlp": 0.01036362, "balance_loss_clip": 1.02382731, "balance_loss_mlp": 1.03745866, "epoch": 0.6251014579888772, "flos": 22963006581120.0, "grad_norm": 1.621668829288656, "language_loss": 0.76061893, "learning_rate": 1.23419682871873e-06, "loss": 0.78234398, "num_input_tokens_seen": 223970065, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 10397, "time_per_iteration": 2.6077651977539062 }, { "auxiliary_loss_clip": 0.01125816, "auxiliary_loss_mlp": 0.01038107, "balance_loss_clip": 1.02527452, "balance_loss_mlp": 1.0362972, "epoch": 0.6251615812415452, "flos": 28111232217600.0, "grad_norm": 1.8749154915470216, "language_loss": 0.74977577, "learning_rate": 1.2338478142224285e-06, "loss": 0.771415, "num_input_tokens_seen": 223990315, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 10398, "time_per_iteration": 2.648998975753784 }, { "auxiliary_loss_clip": 0.01111251, "auxiliary_loss_mlp": 0.01033401, "balance_loss_clip": 1.01905441, "balance_loss_mlp": 1.03598583, "epoch": 0.6252217044942131, "flos": 26724469887360.0, "grad_norm": 1.6958902055385756, "language_loss": 0.74128103, "learning_rate": 1.2334988270681277e-06, "loss": 0.76272756, "num_input_tokens_seen": 224009960, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75390625, "step": 10399, "time_per_iteration": 2.5846712589263916 }, { "auxiliary_loss_clip": 0.01137367, "auxiliary_loss_mlp": 0.01037484, "balance_loss_clip": 1.02441847, "balance_loss_mlp": 1.03761411, "epoch": 0.6252818277468811, "flos": 20006768413440.0, "grad_norm": 7.18609496476077, "language_loss": 0.74209374, "learning_rate": 1.2331498672682819e-06, "loss": 0.76384223, "num_input_tokens_seen": 224028870, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 10400, "time_per_iteration": 3.997321844100952 }, { "auxiliary_loss_clip": 0.0105758, "auxiliary_loss_mlp": 0.01003617, "balance_loss_clip": 1.00215042, "balance_loss_mlp": 1.00841093, "epoch": 0.625341950999549, "flos": 59278285059840.0, "grad_norm": 0.8374543588268224, "language_loss": 0.56466901, "learning_rate": 1.232800934835345e-06, "loss": 0.58528101, "num_input_tokens_seen": 224094140, "router_z_loss_clip": 0.01464844, "router_z_loss_mlp": 0.22167969, "step": 10401, "time_per_iteration": 3.2908740043640137 }, { "auxiliary_loss_clip": 0.01120626, "auxiliary_loss_mlp": 0.01036485, "balance_loss_clip": 1.02297235, "balance_loss_mlp": 1.0379734, "epoch": 0.625402074252217, "flos": 20157090831360.0, "grad_norm": 1.9047028612426788, "language_loss": 0.82782674, "learning_rate": 1.2324520297817693e-06, "loss": 0.8493979, "num_input_tokens_seen": 224113235, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73828125, "step": 10402, "time_per_iteration": 2.594947099685669 }, { "auxiliary_loss_clip": 0.01160717, "auxiliary_loss_mlp": 0.01037044, "balance_loss_clip": 1.02419305, "balance_loss_mlp": 1.03610027, "epoch": 0.625462197504885, "flos": 29132531619840.0, "grad_norm": 1.9243495007746574, "language_loss": 0.69485307, "learning_rate": 1.2321031521200057e-06, "loss": 0.71683073, "num_input_tokens_seen": 224134530, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 10403, "time_per_iteration": 2.8161094188690186 }, { "auxiliary_loss_clip": 0.01138376, "auxiliary_loss_mlp": 0.0102986, "balance_loss_clip": 1.01594806, "balance_loss_mlp": 1.03690672, "epoch": 0.625522320757553, "flos": 26104436294400.0, "grad_norm": 1.5340785167273543, "language_loss": 0.7203303, "learning_rate": 1.2317543018625058e-06, "loss": 0.74201262, "num_input_tokens_seen": 224154170, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 10404, "time_per_iteration": 2.6864936351776123 }, { "auxiliary_loss_clip": 0.01145663, "auxiliary_loss_mlp": 0.01037689, "balance_loss_clip": 1.02483797, "balance_loss_mlp": 1.0383172, "epoch": 0.625582444010221, "flos": 20630967984000.0, "grad_norm": 1.8492875079222693, "language_loss": 0.69807053, "learning_rate": 1.2314054790217184e-06, "loss": 0.71990407, "num_input_tokens_seen": 224172730, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 10405, "time_per_iteration": 4.00269627571106 }, { "auxiliary_loss_clip": 0.01127508, "auxiliary_loss_mlp": 0.01034996, "balance_loss_clip": 1.02190089, "balance_loss_mlp": 1.03730702, "epoch": 0.6256425672628889, "flos": 20521512264960.0, "grad_norm": 1.622069301760686, "language_loss": 0.79145634, "learning_rate": 1.2310566836100927e-06, "loss": 0.81308138, "num_input_tokens_seen": 224192620, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 10406, "time_per_iteration": 2.6458091735839844 }, { "auxiliary_loss_clip": 0.01137673, "auxiliary_loss_mlp": 0.01037903, "balance_loss_clip": 1.02529109, "balance_loss_mlp": 1.03835583, "epoch": 0.6257026905155569, "flos": 29529200488320.0, "grad_norm": 1.9739393020255895, "language_loss": 0.68642545, "learning_rate": 1.2307079156400756e-06, "loss": 0.7081812, "num_input_tokens_seen": 224214660, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 10407, "time_per_iteration": 2.7767415046691895 }, { "auxiliary_loss_clip": 0.01114959, "auxiliary_loss_mlp": 0.01279517, "balance_loss_clip": 1.02077079, "balance_loss_mlp": 1.03723347, "epoch": 0.6257628137682248, "flos": 24024885373440.0, "grad_norm": 1.7557280350466302, "language_loss": 0.85432774, "learning_rate": 1.2303591751241146e-06, "loss": 0.87827247, "num_input_tokens_seen": 224234170, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 10408, "time_per_iteration": 2.6496877670288086 }, { "auxiliary_loss_clip": 0.01124563, "auxiliary_loss_mlp": 0.01284496, "balance_loss_clip": 1.02537525, "balance_loss_mlp": 1.03601456, "epoch": 0.6258229370208929, "flos": 20850956830080.0, "grad_norm": 1.6920359971492607, "language_loss": 0.79475182, "learning_rate": 1.230010462074655e-06, "loss": 0.81884241, "num_input_tokens_seen": 224253115, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 10409, "time_per_iteration": 2.6087563037872314 }, { "auxiliary_loss_clip": 0.01126081, "auxiliary_loss_mlp": 0.01030492, "balance_loss_clip": 1.01783836, "balance_loss_mlp": 1.03692794, "epoch": 0.6258830602735608, "flos": 22231542021120.0, "grad_norm": 4.500265523630965, "language_loss": 0.70035559, "learning_rate": 1.2296617765041408e-06, "loss": 0.72192132, "num_input_tokens_seen": 224271375, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 10410, "time_per_iteration": 2.5720651149749756 }, { "auxiliary_loss_clip": 0.01141624, "auxiliary_loss_mlp": 0.01029435, "balance_loss_clip": 1.01737702, "balance_loss_mlp": 1.03689873, "epoch": 0.6259431835262288, "flos": 25076887925760.0, "grad_norm": 1.7367150643737483, "language_loss": 0.67514813, "learning_rate": 1.2293131184250167e-06, "loss": 0.69685876, "num_input_tokens_seen": 224290315, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69140625, "step": 10411, "time_per_iteration": 4.293856382369995 }, { "auxiliary_loss_clip": 0.01145752, "auxiliary_loss_mlp": 0.01034581, "balance_loss_clip": 1.02214718, "balance_loss_mlp": 1.03759539, "epoch": 0.6260033067788967, "flos": 28252288926720.0, "grad_norm": 1.9621288541412973, "language_loss": 0.69762313, "learning_rate": 1.2289644878497244e-06, "loss": 0.71942651, "num_input_tokens_seen": 224310545, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7265625, "step": 10412, "time_per_iteration": 2.7006969451904297 }, { "auxiliary_loss_clip": 0.01124493, "auxiliary_loss_mlp": 0.01033335, "balance_loss_clip": 1.02134848, "balance_loss_mlp": 1.03581095, "epoch": 0.6260634300315647, "flos": 23367432787200.0, "grad_norm": 1.4555370942174284, "language_loss": 0.69404483, "learning_rate": 1.2286158847907074e-06, "loss": 0.71562314, "num_input_tokens_seen": 224331115, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 10413, "time_per_iteration": 2.576486349105835 }, { "auxiliary_loss_clip": 0.01137323, "auxiliary_loss_mlp": 0.01032364, "balance_loss_clip": 1.01792812, "balance_loss_mlp": 1.03535891, "epoch": 0.6261235532842326, "flos": 18035308494720.0, "grad_norm": 2.4599523094154696, "language_loss": 0.81143272, "learning_rate": 1.2282673092604045e-06, "loss": 0.83312964, "num_input_tokens_seen": 224347525, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 10414, "time_per_iteration": 2.579819440841675 }, { "auxiliary_loss_clip": 0.01119428, "auxiliary_loss_mlp": 0.01038456, "balance_loss_clip": 1.02558172, "balance_loss_mlp": 1.0390811, "epoch": 0.6261836765369007, "flos": 22011265866240.0, "grad_norm": 1.6767489494766026, "language_loss": 0.7459327, "learning_rate": 1.227918761271256e-06, "loss": 0.76751155, "num_input_tokens_seen": 224367045, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 10415, "time_per_iteration": 2.5926125049591064 }, { "auxiliary_loss_clip": 0.01142923, "auxiliary_loss_mlp": 0.01032005, "balance_loss_clip": 1.01950026, "balance_loss_mlp": 1.03624034, "epoch": 0.6262437997895686, "flos": 24936010784640.0, "grad_norm": 1.4890367799002826, "language_loss": 0.74291861, "learning_rate": 1.227570240835701e-06, "loss": 0.76466787, "num_input_tokens_seen": 224388860, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 10416, "time_per_iteration": 2.628690004348755 }, { "auxiliary_loss_clip": 0.01124173, "auxiliary_loss_mlp": 0.01033575, "balance_loss_clip": 1.02094507, "balance_loss_mlp": 1.03705072, "epoch": 0.6263039230422366, "flos": 31608428186880.0, "grad_norm": 1.5858233705035805, "language_loss": 0.84133643, "learning_rate": 1.2272217479661771e-06, "loss": 0.86291391, "num_input_tokens_seen": 224409645, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 10417, "time_per_iteration": 2.7348110675811768 }, { "auxiliary_loss_clip": 0.01117535, "auxiliary_loss_mlp": 0.01036666, "balance_loss_clip": 1.02332616, "balance_loss_mlp": 1.03639627, "epoch": 0.6263640462949046, "flos": 17639465639040.0, "grad_norm": 2.439314022565577, "language_loss": 0.56981486, "learning_rate": 1.2268732826751214e-06, "loss": 0.59135687, "num_input_tokens_seen": 224428530, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 10418, "time_per_iteration": 2.5554282665252686 }, { "auxiliary_loss_clip": 0.01122887, "auxiliary_loss_mlp": 0.01037467, "balance_loss_clip": 1.02359688, "balance_loss_mlp": 1.03785753, "epoch": 0.6264241695475725, "flos": 19974951941760.0, "grad_norm": 1.858845169070303, "language_loss": 0.84420478, "learning_rate": 1.2265248449749694e-06, "loss": 0.86580831, "num_input_tokens_seen": 224447175, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 10419, "time_per_iteration": 2.534184217453003 }, { "auxiliary_loss_clip": 0.01112172, "auxiliary_loss_mlp": 0.01032659, "balance_loss_clip": 1.02011263, "balance_loss_mlp": 1.03945184, "epoch": 0.6264842928002405, "flos": 27344323912320.0, "grad_norm": 1.52543071621215, "language_loss": 0.645235, "learning_rate": 1.2261764348781558e-06, "loss": 0.66668332, "num_input_tokens_seen": 224469445, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 10420, "time_per_iteration": 2.600407123565674 }, { "auxiliary_loss_clip": 0.01137373, "auxiliary_loss_mlp": 0.01031631, "balance_loss_clip": 1.01835704, "balance_loss_mlp": 1.03616476, "epoch": 0.6265444160529084, "flos": 22997265177600.0, "grad_norm": 2.1262291789081833, "language_loss": 0.86213404, "learning_rate": 1.2258280523971154e-06, "loss": 0.88382405, "num_input_tokens_seen": 224486590, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 10421, "time_per_iteration": 2.582214832305908 }, { "auxiliary_loss_clip": 0.01138294, "auxiliary_loss_mlp": 0.01035906, "balance_loss_clip": 1.02344894, "balance_loss_mlp": 1.038854, "epoch": 0.6266045393055765, "flos": 19938323047680.0, "grad_norm": 3.4530176018281367, "language_loss": 0.7944355, "learning_rate": 1.2254796975442795e-06, "loss": 0.81617749, "num_input_tokens_seen": 224502795, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7265625, "step": 10422, "time_per_iteration": 2.5233938694000244 }, { "auxiliary_loss_clip": 0.01114468, "auxiliary_loss_mlp": 0.01028346, "balance_loss_clip": 1.01523924, "balance_loss_mlp": 1.03476751, "epoch": 0.6266646625582444, "flos": 24389091325440.0, "grad_norm": 1.7333344081719615, "language_loss": 0.74107426, "learning_rate": 1.2251313703320816e-06, "loss": 0.76250243, "num_input_tokens_seen": 224522300, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 10423, "time_per_iteration": 2.6074490547180176 }, { "auxiliary_loss_clip": 0.01106815, "auxiliary_loss_mlp": 0.01028747, "balance_loss_clip": 1.01631343, "balance_loss_mlp": 1.03585589, "epoch": 0.6267247858109124, "flos": 14683802088960.0, "grad_norm": 1.8861984131566831, "language_loss": 0.77541113, "learning_rate": 1.2247830707729518e-06, "loss": 0.79676676, "num_input_tokens_seen": 224538260, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 10424, "time_per_iteration": 2.5843350887298584 }, { "auxiliary_loss_clip": 0.01109007, "auxiliary_loss_mlp": 0.01033118, "balance_loss_clip": 1.01906288, "balance_loss_mlp": 1.03798819, "epoch": 0.6267849090635803, "flos": 24929977299840.0, "grad_norm": 2.021259232513663, "language_loss": 0.69264829, "learning_rate": 1.2244347988793198e-06, "loss": 0.71406949, "num_input_tokens_seen": 224559155, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7109375, "step": 10425, "time_per_iteration": 2.666942834854126 }, { "auxiliary_loss_clip": 0.01121051, "auxiliary_loss_mlp": 0.01033887, "balance_loss_clip": 1.02240121, "balance_loss_mlp": 1.0356164, "epoch": 0.6268450323162483, "flos": 25337851211520.0, "grad_norm": 1.9978882582204625, "language_loss": 0.74425328, "learning_rate": 1.2240865546636152e-06, "loss": 0.76580268, "num_input_tokens_seen": 224578660, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 10426, "time_per_iteration": 2.6413745880126953 }, { "auxiliary_loss_clip": 0.01118694, "auxiliary_loss_mlp": 0.01284198, "balance_loss_clip": 1.02424669, "balance_loss_mlp": 1.03793025, "epoch": 0.6269051555689162, "flos": 26177299032960.0, "grad_norm": 1.4351676131538451, "language_loss": 0.8047992, "learning_rate": 1.2237383381382652e-06, "loss": 0.8288281, "num_input_tokens_seen": 224599080, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 10427, "time_per_iteration": 2.6255624294281006 }, { "auxiliary_loss_clip": 0.01136997, "auxiliary_loss_mlp": 0.01035351, "balance_loss_clip": 1.02239299, "balance_loss_mlp": 1.0373311, "epoch": 0.6269652788215843, "flos": 18256877539200.0, "grad_norm": 1.8420880704690812, "language_loss": 0.68040192, "learning_rate": 1.2233901493156978e-06, "loss": 0.70212543, "num_input_tokens_seen": 224614225, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.73046875, "step": 10428, "time_per_iteration": 2.4730286598205566 }, { "auxiliary_loss_clip": 0.01136886, "auxiliary_loss_mlp": 0.01037942, "balance_loss_clip": 1.02556169, "balance_loss_mlp": 1.03870964, "epoch": 0.6270254020742522, "flos": 11765413877760.0, "grad_norm": 2.289960904702118, "language_loss": 0.71794593, "learning_rate": 1.2230419882083375e-06, "loss": 0.73969424, "num_input_tokens_seen": 224632365, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7109375, "step": 10429, "time_per_iteration": 2.601503610610962 }, { "auxiliary_loss_clip": 0.0112218, "auxiliary_loss_mlp": 0.01035255, "balance_loss_clip": 1.02166569, "balance_loss_mlp": 1.03950465, "epoch": 0.6270855253269202, "flos": 23475631530240.0, "grad_norm": 1.372617954316908, "language_loss": 0.80033338, "learning_rate": 1.2226938548286105e-06, "loss": 0.82190776, "num_input_tokens_seen": 224651125, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 10430, "time_per_iteration": 2.575310230255127 }, { "auxiliary_loss_clip": 0.01031849, "auxiliary_loss_mlp": 0.01005478, "balance_loss_clip": 1.00385714, "balance_loss_mlp": 1.00955594, "epoch": 0.6271456485795882, "flos": 70064520232320.0, "grad_norm": 0.7631709546661145, "language_loss": 0.59144491, "learning_rate": 1.2223457491889404e-06, "loss": 0.61181813, "num_input_tokens_seen": 224716115, "router_z_loss_clip": 0.01623535, "router_z_loss_mlp": 0.22265625, "step": 10431, "time_per_iteration": 3.2753002643585205 }, { "auxiliary_loss_clip": 0.010487, "auxiliary_loss_mlp": 0.01001121, "balance_loss_clip": 0.99954712, "balance_loss_mlp": 1.00931644, "epoch": 0.6272057718322561, "flos": 65156718280320.0, "grad_norm": 0.9107955200375415, "language_loss": 0.6378684, "learning_rate": 1.22199767130175e-06, "loss": 0.65836668, "num_input_tokens_seen": 224782930, "router_z_loss_clip": 0.01574707, "router_z_loss_mlp": 0.22265625, "step": 10432, "time_per_iteration": 3.3159046173095703 }, { "auxiliary_loss_clip": 0.01123836, "auxiliary_loss_mlp": 0.01031737, "balance_loss_clip": 1.01977444, "balance_loss_mlp": 1.03519511, "epoch": 0.6272658950849241, "flos": 24389342720640.0, "grad_norm": 1.82596320980402, "language_loss": 0.64918846, "learning_rate": 1.2216496211794609e-06, "loss": 0.67074418, "num_input_tokens_seen": 224802010, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.7109375, "step": 10433, "time_per_iteration": 2.6265084743499756 }, { "auxiliary_loss_clip": 0.01123105, "auxiliary_loss_mlp": 0.01035331, "balance_loss_clip": 1.02164567, "balance_loss_mlp": 1.03911495, "epoch": 0.627326018337592, "flos": 17966001202560.0, "grad_norm": 5.043370288834652, "language_loss": 0.62097126, "learning_rate": 1.221301598834496e-06, "loss": 0.64255559, "num_input_tokens_seen": 224818875, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75390625, "step": 10434, "time_per_iteration": 2.5667567253112793 }, { "auxiliary_loss_clip": 0.01133617, "auxiliary_loss_mlp": 0.0102889, "balance_loss_clip": 1.01580107, "balance_loss_mlp": 1.03443229, "epoch": 0.6273861415902601, "flos": 20230097224320.0, "grad_norm": 1.7388611797918727, "language_loss": 0.84582835, "learning_rate": 1.220953604279273e-06, "loss": 0.86745346, "num_input_tokens_seen": 224837790, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 10435, "time_per_iteration": 2.564401865005493 }, { "auxiliary_loss_clip": 0.01031394, "auxiliary_loss_mlp": 0.01001859, "balance_loss_clip": 1.00028515, "balance_loss_mlp": 1.00916576, "epoch": 0.627446264842928, "flos": 64953210798720.0, "grad_norm": 0.7366037900431672, "language_loss": 0.61532438, "learning_rate": 1.2206056375262116e-06, "loss": 0.63565695, "num_input_tokens_seen": 224899685, "router_z_loss_clip": 0.01574707, "router_z_loss_mlp": 0.22265625, "step": 10436, "time_per_iteration": 3.1936566829681396 }, { "auxiliary_loss_clip": 0.0112948, "auxiliary_loss_mlp": 0.01036481, "balance_loss_clip": 1.02273047, "balance_loss_mlp": 1.03954959, "epoch": 0.627506388095596, "flos": 23584261236480.0, "grad_norm": 1.5539845922441238, "language_loss": 0.77410805, "learning_rate": 1.2202576985877312e-06, "loss": 0.79576755, "num_input_tokens_seen": 224918650, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.72265625, "step": 10437, "time_per_iteration": 2.6611647605895996 }, { "auxiliary_loss_clip": 0.0103977, "auxiliary_loss_mlp": 0.01003927, "balance_loss_clip": 1.00237775, "balance_loss_mlp": 1.00887465, "epoch": 0.6275665113482639, "flos": 67583631674880.0, "grad_norm": 1.0211360409566559, "language_loss": 0.54351777, "learning_rate": 1.2199097874762472e-06, "loss": 0.56395477, "num_input_tokens_seen": 224981575, "router_z_loss_clip": 0.01544189, "router_z_loss_mlp": 0.22070312, "step": 10438, "time_per_iteration": 3.124326467514038 }, { "auxiliary_loss_clip": 0.01143895, "auxiliary_loss_mlp": 0.01043811, "balance_loss_clip": 1.030424, "balance_loss_mlp": 1.03620315, "epoch": 0.6276266346009319, "flos": 27636924101760.0, "grad_norm": 1.6857996616894555, "language_loss": 0.84295493, "learning_rate": 1.219561904204176e-06, "loss": 0.86483198, "num_input_tokens_seen": 225000820, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 10439, "time_per_iteration": 2.6839687824249268 }, { "auxiliary_loss_clip": 0.01138484, "auxiliary_loss_mlp": 0.01042033, "balance_loss_clip": 1.02838361, "balance_loss_mlp": 1.03701842, "epoch": 0.6276867578535998, "flos": 22746142218240.0, "grad_norm": 2.0584205161664224, "language_loss": 0.80221796, "learning_rate": 1.2192140487839328e-06, "loss": 0.82402313, "num_input_tokens_seen": 225017585, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 10440, "time_per_iteration": 2.6410748958587646 }, { "auxiliary_loss_clip": 0.01138584, "auxiliary_loss_mlp": 0.01034191, "balance_loss_clip": 1.02244329, "balance_loss_mlp": 1.03476465, "epoch": 0.6277468811062679, "flos": 24644200694400.0, "grad_norm": 2.9295976531181096, "language_loss": 0.74451804, "learning_rate": 1.218866221227933e-06, "loss": 0.76624584, "num_input_tokens_seen": 225039085, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 10441, "time_per_iteration": 4.062891244888306 }, { "auxiliary_loss_clip": 0.01116557, "auxiliary_loss_mlp": 0.01032366, "balance_loss_clip": 1.01877606, "balance_loss_mlp": 1.03610253, "epoch": 0.6278070043589358, "flos": 19678975873920.0, "grad_norm": 1.9180867635387053, "language_loss": 0.72350812, "learning_rate": 1.2185184215485873e-06, "loss": 0.74499738, "num_input_tokens_seen": 225058105, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71484375, "step": 10442, "time_per_iteration": 2.5424728393554688 }, { "auxiliary_loss_clip": 0.01136102, "auxiliary_loss_mlp": 0.01036142, "balance_loss_clip": 1.02332711, "balance_loss_mlp": 1.03732669, "epoch": 0.6278671276116038, "flos": 22121834906880.0, "grad_norm": 1.4616038331297716, "language_loss": 0.71282852, "learning_rate": 1.2181706497583096e-06, "loss": 0.73455095, "num_input_tokens_seen": 225077605, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 10443, "time_per_iteration": 2.5851924419403076 }, { "auxiliary_loss_clip": 0.01109144, "auxiliary_loss_mlp": 0.01028224, "balance_loss_clip": 1.01560569, "balance_loss_mlp": 1.03717709, "epoch": 0.6279272508642717, "flos": 23038562839680.0, "grad_norm": 2.013483308471448, "language_loss": 0.73710692, "learning_rate": 1.2178229058695104e-06, "loss": 0.75848055, "num_input_tokens_seen": 225097775, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 10444, "time_per_iteration": 2.565725088119507 }, { "auxiliary_loss_clip": 0.01135032, "auxiliary_loss_mlp": 0.01033031, "balance_loss_clip": 1.02019787, "balance_loss_mlp": 1.03761578, "epoch": 0.6279873741169397, "flos": 19824090819840.0, "grad_norm": 2.07540505048441, "language_loss": 0.72181994, "learning_rate": 1.217475189894599e-06, "loss": 0.74350059, "num_input_tokens_seen": 225115585, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 10445, "time_per_iteration": 2.627387046813965 }, { "auxiliary_loss_clip": 0.01117329, "auxiliary_loss_mlp": 0.01029965, "balance_loss_clip": 1.01732278, "balance_loss_mlp": 1.03611839, "epoch": 0.6280474973696077, "flos": 23915393740800.0, "grad_norm": 1.77453356743156, "language_loss": 0.68978918, "learning_rate": 1.2171275018459853e-06, "loss": 0.71126211, "num_input_tokens_seen": 225135575, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 10446, "time_per_iteration": 2.5815958976745605 }, { "auxiliary_loss_clip": 0.01150397, "auxiliary_loss_mlp": 0.01034284, "balance_loss_clip": 1.02036023, "balance_loss_mlp": 1.0382067, "epoch": 0.6281076206222757, "flos": 17967976450560.0, "grad_norm": 1.8166571129075637, "language_loss": 0.73676479, "learning_rate": 1.216779841736078e-06, "loss": 0.75861156, "num_input_tokens_seen": 225154230, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.76171875, "step": 10447, "time_per_iteration": 3.904294729232788 }, { "auxiliary_loss_clip": 0.01039202, "auxiliary_loss_mlp": 0.01004434, "balance_loss_clip": 1.00287235, "balance_loss_mlp": 1.00842845, "epoch": 0.6281677438749437, "flos": 66778370622720.0, "grad_norm": 0.683245971151366, "language_loss": 0.52318376, "learning_rate": 1.2164322095772826e-06, "loss": 0.54362011, "num_input_tokens_seen": 225213650, "router_z_loss_clip": 0.015625, "router_z_loss_mlp": 0.22070312, "step": 10448, "time_per_iteration": 3.054943084716797 }, { "auxiliary_loss_clip": 0.01129898, "auxiliary_loss_mlp": 0.01043714, "balance_loss_clip": 1.0300467, "balance_loss_mlp": 1.03840661, "epoch": 0.6282278671276116, "flos": 11656173640320.0, "grad_norm": 3.568246282812034, "language_loss": 0.91084743, "learning_rate": 1.216084605382006e-06, "loss": 0.93258357, "num_input_tokens_seen": 225230135, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 10449, "time_per_iteration": 2.5028321743011475 }, { "auxiliary_loss_clip": 0.01107949, "auxiliary_loss_mlp": 0.01032588, "balance_loss_clip": 1.02011323, "balance_loss_mlp": 1.03641021, "epoch": 0.6282879903802796, "flos": 42741597847680.0, "grad_norm": 1.6402977419427918, "language_loss": 0.60352385, "learning_rate": 1.2157370291626534e-06, "loss": 0.62492925, "num_input_tokens_seen": 225253520, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.71484375, "step": 10450, "time_per_iteration": 2.7406392097473145 }, { "auxiliary_loss_clip": 0.0112454, "auxiliary_loss_mlp": 0.01031189, "balance_loss_clip": 1.01877952, "balance_loss_mlp": 1.03681064, "epoch": 0.6283481136329475, "flos": 20009210538240.0, "grad_norm": 1.5423049660519106, "language_loss": 0.76958632, "learning_rate": 1.2153894809316297e-06, "loss": 0.7911436, "num_input_tokens_seen": 225272460, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 10451, "time_per_iteration": 2.569436550140381 }, { "auxiliary_loss_clip": 0.01128599, "auxiliary_loss_mlp": 0.01030871, "balance_loss_clip": 1.01786494, "balance_loss_mlp": 1.03626895, "epoch": 0.6284082368856155, "flos": 21904431840000.0, "grad_norm": 2.3567461009141097, "language_loss": 0.77610868, "learning_rate": 1.2150419607013365e-06, "loss": 0.79770339, "num_input_tokens_seen": 225291700, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 10452, "time_per_iteration": 2.561436653137207 }, { "auxiliary_loss_clip": 0.01135329, "auxiliary_loss_mlp": 0.01035765, "balance_loss_clip": 1.02229476, "balance_loss_mlp": 1.03762937, "epoch": 0.6284683601382834, "flos": 25484187219840.0, "grad_norm": 1.5965686051913655, "language_loss": 0.72478878, "learning_rate": 1.2146944684841764e-06, "loss": 0.74649978, "num_input_tokens_seen": 225311470, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 10453, "time_per_iteration": 5.597359895706177 }, { "auxiliary_loss_clip": 0.0114329, "auxiliary_loss_mlp": 0.01031552, "balance_loss_clip": 1.01805162, "balance_loss_mlp": 1.03440583, "epoch": 0.6285284833909515, "flos": 16538695395840.0, "grad_norm": 1.693152639943506, "language_loss": 0.80277777, "learning_rate": 1.2143470042925516e-06, "loss": 0.82452625, "num_input_tokens_seen": 225328385, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 10454, "time_per_iteration": 2.6397979259490967 }, { "auxiliary_loss_clip": 0.0113388, "auxiliary_loss_mlp": 0.01034341, "balance_loss_clip": 1.02198482, "balance_loss_mlp": 1.03691578, "epoch": 0.6285886066436194, "flos": 22820692896000.0, "grad_norm": 1.7350102701862902, "language_loss": 0.81602168, "learning_rate": 1.2139995681388603e-06, "loss": 0.83770388, "num_input_tokens_seen": 225348415, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 10455, "time_per_iteration": 2.6529555320739746 }, { "auxiliary_loss_clip": 0.01143896, "auxiliary_loss_mlp": 0.01278701, "balance_loss_clip": 1.01926887, "balance_loss_mlp": 1.03640127, "epoch": 0.6286487298962874, "flos": 24715734629760.0, "grad_norm": 1.56516111970911, "language_loss": 0.81700182, "learning_rate": 1.2136521600355028e-06, "loss": 0.84122777, "num_input_tokens_seen": 225367740, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 10456, "time_per_iteration": 2.680241107940674 }, { "auxiliary_loss_clip": 0.01131842, "auxiliary_loss_mlp": 0.01032091, "balance_loss_clip": 1.0184412, "balance_loss_mlp": 1.03919387, "epoch": 0.6287088531489553, "flos": 20740818752640.0, "grad_norm": 1.7352123712193046, "language_loss": 0.72102034, "learning_rate": 1.2133047799948776e-06, "loss": 0.74265963, "num_input_tokens_seen": 225388405, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 10457, "time_per_iteration": 2.608950138092041 }, { "auxiliary_loss_clip": 0.01121744, "auxiliary_loss_mlp": 0.01037098, "balance_loss_clip": 1.02358532, "balance_loss_mlp": 1.03600454, "epoch": 0.6287689764016233, "flos": 23070630706560.0, "grad_norm": 1.867224518749435, "language_loss": 0.79581243, "learning_rate": 1.2129574280293808e-06, "loss": 0.81740087, "num_input_tokens_seen": 225408360, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76953125, "step": 10458, "time_per_iteration": 2.6182119846343994 }, { "auxiliary_loss_clip": 0.01121103, "auxiliary_loss_mlp": 0.0103334, "balance_loss_clip": 1.02026296, "balance_loss_mlp": 1.03739023, "epoch": 0.6288290996542913, "flos": 32233669251840.0, "grad_norm": 1.6904971148728987, "language_loss": 0.61186624, "learning_rate": 1.2126101041514085e-06, "loss": 0.63341057, "num_input_tokens_seen": 225431310, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 10459, "time_per_iteration": 2.684458017349243 }, { "auxiliary_loss_clip": 0.01127415, "auxiliary_loss_mlp": 0.010321, "balance_loss_clip": 1.01957154, "balance_loss_mlp": 1.03702629, "epoch": 0.6288892229069593, "flos": 24641327606400.0, "grad_norm": 1.667357471697319, "language_loss": 0.78534746, "learning_rate": 1.2122628083733562e-06, "loss": 0.80694264, "num_input_tokens_seen": 225450385, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.73046875, "step": 10460, "time_per_iteration": 2.633915901184082 }, { "auxiliary_loss_clip": 0.01137223, "auxiliary_loss_mlp": 0.01031769, "balance_loss_clip": 1.01913285, "balance_loss_mlp": 1.03862143, "epoch": 0.6289493461596273, "flos": 17858341163520.0, "grad_norm": 1.7811486880599332, "language_loss": 0.74295831, "learning_rate": 1.211915540707619e-06, "loss": 0.7646482, "num_input_tokens_seen": 225467325, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 10461, "time_per_iteration": 2.6551811695098877 }, { "auxiliary_loss_clip": 0.01124149, "auxiliary_loss_mlp": 0.01037375, "balance_loss_clip": 1.02450061, "balance_loss_mlp": 1.03670359, "epoch": 0.6290094694122952, "flos": 22345379199360.0, "grad_norm": 1.6278960018293496, "language_loss": 0.70115405, "learning_rate": 1.2115683011665877e-06, "loss": 0.72276926, "num_input_tokens_seen": 225487370, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 10462, "time_per_iteration": 2.6381285190582275 }, { "auxiliary_loss_clip": 0.01102747, "auxiliary_loss_mlp": 0.01030508, "balance_loss_clip": 1.01929617, "balance_loss_mlp": 1.03637087, "epoch": 0.6290695926649632, "flos": 28402431776640.0, "grad_norm": 6.867859493804436, "language_loss": 0.72210276, "learning_rate": 1.211221089762656e-06, "loss": 0.74343526, "num_input_tokens_seen": 225506915, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 10463, "time_per_iteration": 2.647329568862915 }, { "auxiliary_loss_clip": 0.01126451, "auxiliary_loss_mlp": 0.01034031, "balance_loss_clip": 1.02116823, "balance_loss_mlp": 1.03978288, "epoch": 0.6291297159176311, "flos": 21505464501120.0, "grad_norm": 2.384584620551202, "language_loss": 0.7279833, "learning_rate": 1.2108739065082155e-06, "loss": 0.74958813, "num_input_tokens_seen": 225525670, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 10464, "time_per_iteration": 2.6288976669311523 }, { "auxiliary_loss_clip": 0.01125338, "auxiliary_loss_mlp": 0.0103231, "balance_loss_clip": 1.02007914, "balance_loss_mlp": 1.03793836, "epoch": 0.6291898391702991, "flos": 12203308581120.0, "grad_norm": 1.8799597891916398, "language_loss": 0.69395161, "learning_rate": 1.2105267514156544e-06, "loss": 0.71552813, "num_input_tokens_seen": 225542235, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 10465, "time_per_iteration": 2.55804181098938 }, { "auxiliary_loss_clip": 0.01038053, "auxiliary_loss_mlp": 0.01003451, "balance_loss_clip": 1.00181758, "balance_loss_mlp": 1.00717521, "epoch": 0.629249962422967, "flos": 69299479434240.0, "grad_norm": 0.6832023877474886, "language_loss": 0.59768254, "learning_rate": 1.2101796244973626e-06, "loss": 0.61809754, "num_input_tokens_seen": 225607185, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.22167969, "step": 10466, "time_per_iteration": 3.264246940612793 }, { "auxiliary_loss_clip": 0.01129778, "auxiliary_loss_mlp": 0.01032077, "balance_loss_clip": 1.02016211, "balance_loss_mlp": 1.03232777, "epoch": 0.6293100856756351, "flos": 40077888042240.0, "grad_norm": 2.167788585339715, "language_loss": 0.64748478, "learning_rate": 1.2098325257657286e-06, "loss": 0.66910338, "num_input_tokens_seen": 225628785, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 10467, "time_per_iteration": 2.756331205368042 }, { "auxiliary_loss_clip": 0.01046971, "auxiliary_loss_mlp": 0.01002535, "balance_loss_clip": 1.0009253, "balance_loss_mlp": 1.0074904, "epoch": 0.629370208928303, "flos": 67501108177920.0, "grad_norm": 0.806318664257155, "language_loss": 0.56952101, "learning_rate": 1.2094854552331398e-06, "loss": 0.59001607, "num_input_tokens_seen": 225678980, "router_z_loss_clip": 0.01611328, "router_z_loss_mlp": 0.22070312, "step": 10468, "time_per_iteration": 2.9907517433166504 }, { "auxiliary_loss_clip": 0.01064714, "auxiliary_loss_mlp": 0.01002893, "balance_loss_clip": 1.00143266, "balance_loss_mlp": 1.00713658, "epoch": 0.629430332180971, "flos": 60660450449280.0, "grad_norm": 0.7289843467214335, "language_loss": 0.57976925, "learning_rate": 1.2091384129119809e-06, "loss": 0.60044527, "num_input_tokens_seen": 225740295, "router_z_loss_clip": 0.0145874, "router_z_loss_mlp": 0.22070312, "step": 10469, "time_per_iteration": 3.213733434677124 }, { "auxiliary_loss_clip": 0.01038362, "auxiliary_loss_mlp": 0.01004189, "balance_loss_clip": 1.0026629, "balance_loss_mlp": 1.00690174, "epoch": 0.6294904554336389, "flos": 66869764778880.0, "grad_norm": 0.723750707420777, "language_loss": 0.52123368, "learning_rate": 1.2087913988146379e-06, "loss": 0.54165918, "num_input_tokens_seen": 225805615, "router_z_loss_clip": 0.01525879, "router_z_loss_mlp": 0.22265625, "step": 10470, "time_per_iteration": 3.192275285720825 }, { "auxiliary_loss_clip": 0.01115661, "auxiliary_loss_mlp": 0.01032038, "balance_loss_clip": 1.01924729, "balance_loss_mlp": 1.03598809, "epoch": 0.6295505786863069, "flos": 42522794150400.0, "grad_norm": 1.6874813990369397, "language_loss": 0.74374032, "learning_rate": 1.2084444129534951e-06, "loss": 0.7652173, "num_input_tokens_seen": 225826585, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 10471, "time_per_iteration": 2.856802463531494 }, { "auxiliary_loss_clip": 0.0112774, "auxiliary_loss_mlp": 0.01027926, "balance_loss_clip": 1.01574254, "balance_loss_mlp": 1.03699827, "epoch": 0.629610701938975, "flos": 17384140788480.0, "grad_norm": 1.822622941530614, "language_loss": 0.62910974, "learning_rate": 1.2080974553409347e-06, "loss": 0.65066648, "num_input_tokens_seen": 225844095, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7265625, "step": 10472, "time_per_iteration": 2.669682025909424 }, { "auxiliary_loss_clip": 0.01113741, "auxiliary_loss_mlp": 0.01031623, "balance_loss_clip": 1.01848626, "balance_loss_mlp": 1.03997564, "epoch": 0.6296708251916429, "flos": 24242934885120.0, "grad_norm": 1.6552633645764254, "language_loss": 0.69320905, "learning_rate": 1.2077505259893392e-06, "loss": 0.71466267, "num_input_tokens_seen": 225864310, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 10473, "time_per_iteration": 2.678981304168701 }, { "auxiliary_loss_clip": 0.01124753, "auxiliary_loss_mlp": 0.01033174, "balance_loss_clip": 1.0209434, "balance_loss_mlp": 1.03562546, "epoch": 0.6297309484443109, "flos": 19278536077440.0, "grad_norm": 1.5724903503073242, "language_loss": 0.74492782, "learning_rate": 1.2074036249110901e-06, "loss": 0.76650715, "num_input_tokens_seen": 225883830, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 10474, "time_per_iteration": 2.650020122528076 }, { "auxiliary_loss_clip": 0.01126427, "auxiliary_loss_mlp": 0.01275857, "balance_loss_clip": 1.01641083, "balance_loss_mlp": 1.03658712, "epoch": 0.6297910716969788, "flos": 30662685043200.0, "grad_norm": 1.4659521790722636, "language_loss": 0.66305852, "learning_rate": 1.2070567521185656e-06, "loss": 0.68708134, "num_input_tokens_seen": 225905755, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 10475, "time_per_iteration": 2.6270639896392822 }, { "auxiliary_loss_clip": 0.01139422, "auxiliary_loss_mlp": 0.01032924, "balance_loss_clip": 1.02044904, "balance_loss_mlp": 1.03552914, "epoch": 0.6298511949496468, "flos": 14423018371200.0, "grad_norm": 2.0608471668265635, "language_loss": 0.90058243, "learning_rate": 1.2067099076241465e-06, "loss": 0.92230594, "num_input_tokens_seen": 225922155, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 10476, "time_per_iteration": 2.550826072692871 }, { "auxiliary_loss_clip": 0.01105843, "auxiliary_loss_mlp": 0.01035571, "balance_loss_clip": 1.02340531, "balance_loss_mlp": 1.03582585, "epoch": 0.6299113182023147, "flos": 23514163845120.0, "grad_norm": 2.076201449325894, "language_loss": 0.75796914, "learning_rate": 1.20636309144021e-06, "loss": 0.77938318, "num_input_tokens_seen": 225941060, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 10477, "time_per_iteration": 2.527005434036255 }, { "auxiliary_loss_clip": 0.01117603, "auxiliary_loss_mlp": 0.01031025, "balance_loss_clip": 1.01778066, "balance_loss_mlp": 1.03597438, "epoch": 0.6299714414549827, "flos": 22674500542080.0, "grad_norm": 1.814091712695974, "language_loss": 0.70264518, "learning_rate": 1.2060163035791341e-06, "loss": 0.72413141, "num_input_tokens_seen": 225960870, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 10478, "time_per_iteration": 2.5808823108673096 }, { "auxiliary_loss_clip": 0.01121783, "auxiliary_loss_mlp": 0.01028151, "balance_loss_clip": 1.01487684, "balance_loss_mlp": 1.03803217, "epoch": 0.6300315647076506, "flos": 14501735026560.0, "grad_norm": 2.5498303958408877, "language_loss": 0.67124355, "learning_rate": 1.2056695440532932e-06, "loss": 0.69274288, "num_input_tokens_seen": 225977895, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 10479, "time_per_iteration": 2.5304131507873535 }, { "auxiliary_loss_clip": 0.01125227, "auxiliary_loss_mlp": 0.01279134, "balance_loss_clip": 1.0195775, "balance_loss_mlp": 1.03701091, "epoch": 0.6300916879603187, "flos": 21871681614720.0, "grad_norm": 1.6517091892842046, "language_loss": 0.73570788, "learning_rate": 1.205322812875063e-06, "loss": 0.7597515, "num_input_tokens_seen": 225997835, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 10480, "time_per_iteration": 2.631890296936035 }, { "auxiliary_loss_clip": 0.0111639, "auxiliary_loss_mlp": 0.01031238, "balance_loss_clip": 1.01762486, "balance_loss_mlp": 1.03501391, "epoch": 0.6301518112129866, "flos": 21834047139840.0, "grad_norm": 2.157381167945589, "language_loss": 0.78970671, "learning_rate": 1.2049761100568182e-06, "loss": 0.81118298, "num_input_tokens_seen": 226017620, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 10481, "time_per_iteration": 2.5324723720550537 }, { "auxiliary_loss_clip": 0.01048524, "auxiliary_loss_mlp": 0.01004135, "balance_loss_clip": 1.00266325, "balance_loss_mlp": 1.00865269, "epoch": 0.6302119344656546, "flos": 44334237957120.0, "grad_norm": 0.8983991160565077, "language_loss": 0.61813796, "learning_rate": 1.2046294356109302e-06, "loss": 0.6386646, "num_input_tokens_seen": 226068755, "router_z_loss_clip": 0.01470947, "router_z_loss_mlp": 0.22070312, "step": 10482, "time_per_iteration": 2.949894905090332 }, { "auxiliary_loss_clip": 0.01123012, "auxiliary_loss_mlp": 0.01035105, "balance_loss_clip": 1.0210923, "balance_loss_mlp": 1.03750539, "epoch": 0.6302720577183225, "flos": 11217919800960.0, "grad_norm": 2.0390874962571566, "language_loss": 0.8294704, "learning_rate": 1.2042827895497714e-06, "loss": 0.85105151, "num_input_tokens_seen": 226084395, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 10483, "time_per_iteration": 3.945035219192505 }, { "auxiliary_loss_clip": 0.01110736, "auxiliary_loss_mlp": 0.01032096, "balance_loss_clip": 1.02004957, "balance_loss_mlp": 1.03470111, "epoch": 0.6303321809709905, "flos": 27964932122880.0, "grad_norm": 1.5935558276867479, "language_loss": 0.72444046, "learning_rate": 1.2039361718857132e-06, "loss": 0.74586868, "num_input_tokens_seen": 226105890, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66796875, "step": 10484, "time_per_iteration": 2.7839956283569336 }, { "auxiliary_loss_clip": 0.01133492, "auxiliary_loss_mlp": 0.01029749, "balance_loss_clip": 1.01663589, "balance_loss_mlp": 1.03446674, "epoch": 0.6303923042236586, "flos": 28220759763840.0, "grad_norm": 1.7846376597092797, "language_loss": 0.74092543, "learning_rate": 1.2035895826311265e-06, "loss": 0.7625578, "num_input_tokens_seen": 226126760, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 10485, "time_per_iteration": 2.708819627761841 }, { "auxiliary_loss_clip": 0.01131688, "auxiliary_loss_mlp": 0.01282303, "balance_loss_clip": 1.02193487, "balance_loss_mlp": 1.03848922, "epoch": 0.6304524274763265, "flos": 27631034271360.0, "grad_norm": 2.0301224404081077, "language_loss": 0.81268221, "learning_rate": 1.2032430217983778e-06, "loss": 0.83682215, "num_input_tokens_seen": 226147315, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75390625, "step": 10486, "time_per_iteration": 2.6771411895751953 }, { "auxiliary_loss_clip": 0.01125146, "auxiliary_loss_mlp": 0.01036784, "balance_loss_clip": 1.02446377, "balance_loss_mlp": 1.03685474, "epoch": 0.6305125507289945, "flos": 17311313963520.0, "grad_norm": 1.890450733016892, "language_loss": 0.63315558, "learning_rate": 1.2028964893998362e-06, "loss": 0.65477484, "num_input_tokens_seen": 226165935, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 10487, "time_per_iteration": 2.607302665710449 }, { "auxiliary_loss_clip": 0.01108103, "auxiliary_loss_mlp": 0.01040113, "balance_loss_clip": 1.02743483, "balance_loss_mlp": 1.03655803, "epoch": 0.6305726739816624, "flos": 25808280658560.0, "grad_norm": 1.410071480139472, "language_loss": 0.66946572, "learning_rate": 1.2025499854478698e-06, "loss": 0.69094789, "num_input_tokens_seen": 226186890, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 10488, "time_per_iteration": 4.085178852081299 }, { "auxiliary_loss_clip": 0.01126706, "auxiliary_loss_mlp": 0.0103042, "balance_loss_clip": 1.01800489, "balance_loss_mlp": 1.03598094, "epoch": 0.6306327972343304, "flos": 21797454159360.0, "grad_norm": 2.932938573410996, "language_loss": 0.67372358, "learning_rate": 1.2022035099548418e-06, "loss": 0.69529486, "num_input_tokens_seen": 226206710, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.73046875, "step": 10489, "time_per_iteration": 2.6297378540039062 }, { "auxiliary_loss_clip": 0.01142467, "auxiliary_loss_mlp": 0.01035415, "balance_loss_clip": 1.02147365, "balance_loss_mlp": 1.03802943, "epoch": 0.6306929204869983, "flos": 20777375819520.0, "grad_norm": 2.2170820702319953, "language_loss": 0.69370866, "learning_rate": 1.2018570629331184e-06, "loss": 0.71548748, "num_input_tokens_seen": 226225565, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 10490, "time_per_iteration": 2.5578410625457764 }, { "auxiliary_loss_clip": 0.01128603, "auxiliary_loss_mlp": 0.01040796, "balance_loss_clip": 1.02788568, "balance_loss_mlp": 1.03702474, "epoch": 0.6307530437396663, "flos": 23654214973440.0, "grad_norm": 2.1636484147953006, "language_loss": 0.7848534, "learning_rate": 1.2015106443950641e-06, "loss": 0.8065474, "num_input_tokens_seen": 226243680, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 10491, "time_per_iteration": 2.571316719055176 }, { "auxiliary_loss_clip": 0.0112696, "auxiliary_loss_mlp": 0.01036558, "balance_loss_clip": 1.02391005, "balance_loss_mlp": 1.03779387, "epoch": 0.6308131669923343, "flos": 24719002767360.0, "grad_norm": 1.7573078278858498, "language_loss": 0.55228519, "learning_rate": 1.2011642543530403e-06, "loss": 0.57392037, "num_input_tokens_seen": 226264345, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 10492, "time_per_iteration": 2.6327362060546875 }, { "auxiliary_loss_clip": 0.01110598, "auxiliary_loss_mlp": 0.01038555, "balance_loss_clip": 1.02509022, "balance_loss_mlp": 1.03744507, "epoch": 0.6308732902450023, "flos": 22565403959040.0, "grad_norm": 1.8922793996462748, "language_loss": 0.63860929, "learning_rate": 1.2008178928194092e-06, "loss": 0.66010076, "num_input_tokens_seen": 226283165, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 10493, "time_per_iteration": 2.5553557872772217 }, { "auxiliary_loss_clip": 0.01136149, "auxiliary_loss_mlp": 0.01031282, "balance_loss_clip": 1.01820433, "balance_loss_mlp": 1.03743935, "epoch": 0.6309334134976702, "flos": 24644200694400.0, "grad_norm": 1.472334526382574, "language_loss": 0.82696313, "learning_rate": 1.2004715598065321e-06, "loss": 0.84863746, "num_input_tokens_seen": 226304080, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 10494, "time_per_iteration": 4.091301918029785 }, { "auxiliary_loss_clip": 0.0112898, "auxiliary_loss_mlp": 0.01034633, "balance_loss_clip": 1.02128184, "balance_loss_mlp": 1.03776383, "epoch": 0.6309935367503382, "flos": 41427949651200.0, "grad_norm": 2.099852294399451, "language_loss": 0.79121828, "learning_rate": 1.200125255326769e-06, "loss": 0.81285441, "num_input_tokens_seen": 226325925, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 10495, "time_per_iteration": 4.461355924606323 }, { "auxiliary_loss_clip": 0.01127738, "auxiliary_loss_mlp": 0.01034026, "balance_loss_clip": 1.02066326, "balance_loss_mlp": 1.0369699, "epoch": 0.6310536600030061, "flos": 15118931445120.0, "grad_norm": 2.2574413366997645, "language_loss": 0.70357901, "learning_rate": 1.1997789793924772e-06, "loss": 0.72519672, "num_input_tokens_seen": 226344190, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 10496, "time_per_iteration": 2.5480668544769287 }, { "auxiliary_loss_clip": 0.01129506, "auxiliary_loss_mlp": 0.0103963, "balance_loss_clip": 1.02627265, "balance_loss_mlp": 1.03786898, "epoch": 0.6311137832556741, "flos": 15231619388160.0, "grad_norm": 2.3650934552555847, "language_loss": 0.79799652, "learning_rate": 1.1994327320160151e-06, "loss": 0.81968784, "num_input_tokens_seen": 226361520, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73828125, "step": 10497, "time_per_iteration": 2.5814106464385986 }, { "auxiliary_loss_clip": 0.01124702, "auxiliary_loss_mlp": 0.01030115, "balance_loss_clip": 1.01850438, "balance_loss_mlp": 1.036309, "epoch": 0.6311739065083422, "flos": 22018664067840.0, "grad_norm": 2.00448251328589, "language_loss": 0.74186993, "learning_rate": 1.1990865132097404e-06, "loss": 0.7634182, "num_input_tokens_seen": 226381920, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.703125, "step": 10498, "time_per_iteration": 2.6050641536712646 }, { "auxiliary_loss_clip": 0.01118304, "auxiliary_loss_mlp": 0.01031641, "balance_loss_clip": 1.01861739, "balance_loss_mlp": 1.03690732, "epoch": 0.6312340297610101, "flos": 22710770300160.0, "grad_norm": 2.0632532720153525, "language_loss": 0.69656813, "learning_rate": 1.1987403229860071e-06, "loss": 0.71806765, "num_input_tokens_seen": 226400035, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 10499, "time_per_iteration": 2.644153356552124 }, { "auxiliary_loss_clip": 0.01117105, "auxiliary_loss_mlp": 0.01041484, "balance_loss_clip": 1.02903819, "balance_loss_mlp": 1.0394206, "epoch": 0.6312941530136781, "flos": 24280102483200.0, "grad_norm": 1.7297948319144436, "language_loss": 0.69448197, "learning_rate": 1.1983941613571704e-06, "loss": 0.71606791, "num_input_tokens_seen": 226418280, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 10500, "time_per_iteration": 2.6914896965026855 }, { "auxiliary_loss_clip": 0.01110759, "auxiliary_loss_mlp": 0.01033144, "balance_loss_clip": 1.02095449, "balance_loss_mlp": 1.03838992, "epoch": 0.631354276266346, "flos": 21725956137600.0, "grad_norm": 2.0275310076201376, "language_loss": 0.7447257, "learning_rate": 1.1980480283355849e-06, "loss": 0.76616472, "num_input_tokens_seen": 226436650, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.72265625, "step": 10501, "time_per_iteration": 2.6482062339782715 }, { "auxiliary_loss_clip": 0.01119007, "auxiliary_loss_mlp": 0.01281011, "balance_loss_clip": 1.02084708, "balance_loss_mlp": 1.03869379, "epoch": 0.631414399519014, "flos": 24025100855040.0, "grad_norm": 1.781276037854815, "language_loss": 0.75473964, "learning_rate": 1.197701923933602e-06, "loss": 0.77873987, "num_input_tokens_seen": 226456275, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 10502, "time_per_iteration": 2.7140800952911377 }, { "auxiliary_loss_clip": 0.01123467, "auxiliary_loss_mlp": 0.01052497, "balance_loss_clip": 1.03887749, "balance_loss_mlp": 1.03984368, "epoch": 0.6314745227716819, "flos": 24315797623680.0, "grad_norm": 2.0611779213290626, "language_loss": 0.85282904, "learning_rate": 1.1973558481635738e-06, "loss": 0.87458873, "num_input_tokens_seen": 226473610, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 10503, "time_per_iteration": 2.6504173278808594 }, { "auxiliary_loss_clip": 0.0112849, "auxiliary_loss_mlp": 0.01033131, "balance_loss_clip": 1.02060199, "balance_loss_mlp": 1.0367893, "epoch": 0.6315346460243499, "flos": 23366391292800.0, "grad_norm": 1.7298498716220347, "language_loss": 0.86494464, "learning_rate": 1.1970098010378501e-06, "loss": 0.8865608, "num_input_tokens_seen": 226493665, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.734375, "step": 10504, "time_per_iteration": 2.7196462154388428 }, { "auxiliary_loss_clip": 0.0114031, "auxiliary_loss_mlp": 0.01036951, "balance_loss_clip": 1.02313423, "balance_loss_mlp": 1.03830218, "epoch": 0.6315947692770179, "flos": 20260333497600.0, "grad_norm": 2.0025108436691115, "language_loss": 0.76566792, "learning_rate": 1.1966637825687822e-06, "loss": 0.78744054, "num_input_tokens_seen": 226511625, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 10505, "time_per_iteration": 2.61251163482666 }, { "auxiliary_loss_clip": 0.01147421, "auxiliary_loss_mlp": 0.01041592, "balance_loss_clip": 1.02835393, "balance_loss_mlp": 1.0381496, "epoch": 0.6316548925296859, "flos": 25265850399360.0, "grad_norm": 2.017248865737353, "language_loss": 0.81600839, "learning_rate": 1.1963177927687167e-06, "loss": 0.83789849, "num_input_tokens_seen": 226530085, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 10506, "time_per_iteration": 2.735949754714966 }, { "auxiliary_loss_clip": 0.01111255, "auxiliary_loss_mlp": 0.01033645, "balance_loss_clip": 1.02097261, "balance_loss_mlp": 1.03859079, "epoch": 0.6317150157823538, "flos": 22930579578240.0, "grad_norm": 1.7535744070519284, "language_loss": 0.74675941, "learning_rate": 1.195971831650002e-06, "loss": 0.76820844, "num_input_tokens_seen": 226548115, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 10507, "time_per_iteration": 2.59093976020813 }, { "auxiliary_loss_clip": 0.01134561, "auxiliary_loss_mlp": 0.01039184, "balance_loss_clip": 1.02571964, "balance_loss_mlp": 1.04112387, "epoch": 0.6317751390350218, "flos": 22527051212160.0, "grad_norm": 1.674195618266253, "language_loss": 0.67870545, "learning_rate": 1.1956258992249847e-06, "loss": 0.70044291, "num_input_tokens_seen": 226567955, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 10508, "time_per_iteration": 2.6828463077545166 }, { "auxiliary_loss_clip": 0.01136493, "auxiliary_loss_mlp": 0.01034243, "balance_loss_clip": 1.02209616, "balance_loss_mlp": 1.03784263, "epoch": 0.6318352622876897, "flos": 23294749616640.0, "grad_norm": 1.8414128067927853, "language_loss": 0.7065798, "learning_rate": 1.1952799955060094e-06, "loss": 0.72828722, "num_input_tokens_seen": 226588205, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.71875, "step": 10509, "time_per_iteration": 2.6691439151763916 }, { "auxiliary_loss_clip": 0.01117962, "auxiliary_loss_mlp": 0.0102933, "balance_loss_clip": 1.01680696, "balance_loss_mlp": 1.03670835, "epoch": 0.6318953855403577, "flos": 20704082117760.0, "grad_norm": 1.5667353518867166, "language_loss": 0.79315245, "learning_rate": 1.194934120505421e-06, "loss": 0.81462532, "num_input_tokens_seen": 226606965, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 10510, "time_per_iteration": 2.693974494934082 }, { "auxiliary_loss_clip": 0.01147053, "auxiliary_loss_mlp": 0.01283641, "balance_loss_clip": 1.02338958, "balance_loss_mlp": 1.03832173, "epoch": 0.6319555087930258, "flos": 22820046451200.0, "grad_norm": 1.556070132948354, "language_loss": 0.70849609, "learning_rate": 1.194588274235563e-06, "loss": 0.73280311, "num_input_tokens_seen": 226627845, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 10511, "time_per_iteration": 2.685434579849243 }, { "auxiliary_loss_clip": 0.01107648, "auxiliary_loss_mlp": 0.01035824, "balance_loss_clip": 1.02345586, "balance_loss_mlp": 1.0375005, "epoch": 0.6320156320456937, "flos": 19970929618560.0, "grad_norm": 1.6122493894738885, "language_loss": 0.79901117, "learning_rate": 1.1942424567087787e-06, "loss": 0.8204459, "num_input_tokens_seen": 226645855, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 10512, "time_per_iteration": 2.621347665786743 }, { "auxiliary_loss_clip": 0.01120258, "auxiliary_loss_mlp": 0.01032532, "balance_loss_clip": 1.01929939, "balance_loss_mlp": 1.03872705, "epoch": 0.6320757552983617, "flos": 27013406889600.0, "grad_norm": 2.7252117696748623, "language_loss": 0.70809138, "learning_rate": 1.1938966679374075e-06, "loss": 0.72961938, "num_input_tokens_seen": 226665375, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 10513, "time_per_iteration": 2.6409664154052734 }, { "auxiliary_loss_clip": 0.01123829, "auxiliary_loss_mlp": 0.01032414, "balance_loss_clip": 1.01890731, "balance_loss_mlp": 1.03899288, "epoch": 0.6321358785510296, "flos": 23695943598720.0, "grad_norm": 1.7436244006847774, "language_loss": 0.66407728, "learning_rate": 1.193550907933791e-06, "loss": 0.68563974, "num_input_tokens_seen": 226685270, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 10514, "time_per_iteration": 2.641993522644043 }, { "auxiliary_loss_clip": 0.01132629, "auxiliary_loss_mlp": 0.01031757, "balance_loss_clip": 1.01966286, "balance_loss_mlp": 1.03515673, "epoch": 0.6321960018036976, "flos": 25995231970560.0, "grad_norm": 2.070188599128747, "language_loss": 0.74372685, "learning_rate": 1.1932051767102685e-06, "loss": 0.76537073, "num_input_tokens_seen": 226705325, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 10515, "time_per_iteration": 2.762406349182129 }, { "auxiliary_loss_clip": 0.01123821, "auxiliary_loss_mlp": 0.01031814, "balance_loss_clip": 1.01983917, "balance_loss_mlp": 1.03687847, "epoch": 0.6322561250563655, "flos": 22821016118400.0, "grad_norm": 1.8667740331513127, "language_loss": 0.90014023, "learning_rate": 1.1928594742791774e-06, "loss": 0.92169654, "num_input_tokens_seen": 226723815, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69140625, "step": 10516, "time_per_iteration": 2.644334554672241 }, { "auxiliary_loss_clip": 0.01116204, "auxiliary_loss_mlp": 0.01033208, "balance_loss_clip": 1.01839042, "balance_loss_mlp": 1.03941488, "epoch": 0.6323162483090335, "flos": 18988413926400.0, "grad_norm": 1.7710512365947897, "language_loss": 0.81697476, "learning_rate": 1.1925138006528552e-06, "loss": 0.83846891, "num_input_tokens_seen": 226741550, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76953125, "step": 10517, "time_per_iteration": 2.6395442485809326 }, { "auxiliary_loss_clip": 0.01156378, "auxiliary_loss_mlp": 0.01040407, "balance_loss_clip": 1.02752066, "balance_loss_mlp": 1.0402565, "epoch": 0.6323763715617015, "flos": 19865173000320.0, "grad_norm": 1.7231104703594071, "language_loss": 0.77746236, "learning_rate": 1.19216815584364e-06, "loss": 0.79943013, "num_input_tokens_seen": 226761115, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 10518, "time_per_iteration": 2.665856122970581 }, { "auxiliary_loss_clip": 0.0111587, "auxiliary_loss_mlp": 0.01035228, "balance_loss_clip": 1.02274704, "balance_loss_mlp": 1.03640711, "epoch": 0.6324364948143695, "flos": 22782699285120.0, "grad_norm": 1.4646366115135776, "language_loss": 0.85086399, "learning_rate": 1.1918225398638636e-06, "loss": 0.87237495, "num_input_tokens_seen": 226782225, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 10519, "time_per_iteration": 2.6418344974517822 }, { "auxiliary_loss_clip": 0.01124496, "auxiliary_loss_mlp": 0.01033431, "balance_loss_clip": 1.02128363, "balance_loss_mlp": 1.03763342, "epoch": 0.6324966180670374, "flos": 22235923480320.0, "grad_norm": 1.5075508906014752, "language_loss": 0.72110504, "learning_rate": 1.1914769527258621e-06, "loss": 0.74268436, "num_input_tokens_seen": 226802375, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 10520, "time_per_iteration": 2.5783908367156982 }, { "auxiliary_loss_clip": 0.01116172, "auxiliary_loss_mlp": 0.0103045, "balance_loss_clip": 1.01805854, "balance_loss_mlp": 1.03631902, "epoch": 0.6325567413197054, "flos": 21689183589120.0, "grad_norm": 2.2052150658934613, "language_loss": 0.71147299, "learning_rate": 1.1911313944419683e-06, "loss": 0.73293924, "num_input_tokens_seen": 226822165, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 10521, "time_per_iteration": 2.546482801437378 }, { "auxiliary_loss_clip": 0.01138482, "auxiliary_loss_mlp": 0.01029299, "balance_loss_clip": 1.01575732, "balance_loss_mlp": 1.03806663, "epoch": 0.6326168645723733, "flos": 19937137898880.0, "grad_norm": 2.781737535364515, "language_loss": 0.71849525, "learning_rate": 1.1907858650245154e-06, "loss": 0.7401731, "num_input_tokens_seen": 226841645, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 10522, "time_per_iteration": 2.5793612003326416 }, { "auxiliary_loss_clip": 0.01145632, "auxiliary_loss_mlp": 0.01036348, "balance_loss_clip": 1.02356267, "balance_loss_mlp": 1.03765678, "epoch": 0.6326769878250413, "flos": 20230348619520.0, "grad_norm": 1.9747862470909596, "language_loss": 0.81594646, "learning_rate": 1.1904403644858324e-06, "loss": 0.83776617, "num_input_tokens_seen": 226860355, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 10523, "time_per_iteration": 2.595170021057129 }, { "auxiliary_loss_clip": 0.01117309, "auxiliary_loss_mlp": 0.01029132, "balance_loss_clip": 1.01626372, "balance_loss_mlp": 1.03701019, "epoch": 0.6327371110777094, "flos": 20775759707520.0, "grad_norm": 2.1413058394897795, "language_loss": 0.73819315, "learning_rate": 1.1900948928382506e-06, "loss": 0.75965762, "num_input_tokens_seen": 226878390, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 10524, "time_per_iteration": 2.5045034885406494 }, { "auxiliary_loss_clip": 0.01135421, "auxiliary_loss_mlp": 0.01038648, "balance_loss_clip": 1.02424765, "balance_loss_mlp": 1.03921843, "epoch": 0.6327972343303773, "flos": 30336544529280.0, "grad_norm": 2.6098914765717107, "language_loss": 0.84518313, "learning_rate": 1.1897494500940993e-06, "loss": 0.86692381, "num_input_tokens_seen": 226898420, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78125, "step": 10525, "time_per_iteration": 3.992161273956299 }, { "auxiliary_loss_clip": 0.01124752, "auxiliary_loss_mlp": 0.01026569, "balance_loss_clip": 1.0146836, "balance_loss_mlp": 1.03603673, "epoch": 0.6328573575830453, "flos": 17092258871040.0, "grad_norm": 1.796901033588254, "language_loss": 0.67051238, "learning_rate": 1.1894040362657052e-06, "loss": 0.69202566, "num_input_tokens_seen": 226916305, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.70703125, "step": 10526, "time_per_iteration": 2.488192319869995 }, { "auxiliary_loss_clip": 0.01124267, "auxiliary_loss_mlp": 0.01036066, "balance_loss_clip": 1.02213061, "balance_loss_mlp": 1.04058576, "epoch": 0.6329174808357132, "flos": 25047154442880.0, "grad_norm": 1.4197199297425693, "language_loss": 0.73584414, "learning_rate": 1.189058651365396e-06, "loss": 0.75744748, "num_input_tokens_seen": 226937705, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 10527, "time_per_iteration": 2.5601234436035156 }, { "auxiliary_loss_clip": 0.01117731, "auxiliary_loss_mlp": 0.01033403, "balance_loss_clip": 1.02108264, "balance_loss_mlp": 1.03899264, "epoch": 0.6329776040883812, "flos": 16836826279680.0, "grad_norm": 2.0657635194475445, "language_loss": 0.71585786, "learning_rate": 1.1887132954054975e-06, "loss": 0.73736918, "num_input_tokens_seen": 226954880, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 10528, "time_per_iteration": 2.566849946975708 }, { "auxiliary_loss_clip": 0.0111695, "auxiliary_loss_mlp": 0.01028555, "balance_loss_clip": 1.01502454, "balance_loss_mlp": 1.03677237, "epoch": 0.6330377273410491, "flos": 13516705382400.0, "grad_norm": 2.148082968941102, "language_loss": 0.66186231, "learning_rate": 1.1883679683983354e-06, "loss": 0.68331736, "num_input_tokens_seen": 226972595, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71484375, "step": 10529, "time_per_iteration": 2.5749473571777344 }, { "auxiliary_loss_clip": 0.01122705, "auxiliary_loss_mlp": 0.01033301, "balance_loss_clip": 1.0197233, "balance_loss_mlp": 1.03914404, "epoch": 0.6330978505937171, "flos": 21538825257600.0, "grad_norm": 2.0755299623197314, "language_loss": 0.749412, "learning_rate": 1.188022670356232e-06, "loss": 0.77097207, "num_input_tokens_seen": 226991910, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.74609375, "step": 10530, "time_per_iteration": 3.96939754486084 }, { "auxiliary_loss_clip": 0.01122156, "auxiliary_loss_mlp": 0.01279875, "balance_loss_clip": 1.02036524, "balance_loss_mlp": 1.04025459, "epoch": 0.6331579738463851, "flos": 25009484054400.0, "grad_norm": 1.5321849318927474, "language_loss": 0.74173111, "learning_rate": 1.1876774012915108e-06, "loss": 0.76575142, "num_input_tokens_seen": 227010175, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 10531, "time_per_iteration": 2.6500120162963867 }, { "auxiliary_loss_clip": 0.01132612, "auxiliary_loss_mlp": 0.0103079, "balance_loss_clip": 1.01818931, "balance_loss_mlp": 1.03690374, "epoch": 0.6332180970990531, "flos": 14976007228800.0, "grad_norm": 2.04505190761644, "language_loss": 0.79848003, "learning_rate": 1.1873321612164944e-06, "loss": 0.82011402, "num_input_tokens_seen": 227025540, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 10532, "time_per_iteration": 2.634786367416382 }, { "auxiliary_loss_clip": 0.01114169, "auxiliary_loss_mlp": 0.01029352, "balance_loss_clip": 1.01801538, "balance_loss_mlp": 1.03544545, "epoch": 0.633278220351721, "flos": 22706963458560.0, "grad_norm": 1.7612187408790778, "language_loss": 0.74571824, "learning_rate": 1.1869869501435023e-06, "loss": 0.7671535, "num_input_tokens_seen": 227045520, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.69921875, "step": 10533, "time_per_iteration": 2.536588430404663 }, { "auxiliary_loss_clip": 0.01133654, "auxiliary_loss_mlp": 0.01035627, "balance_loss_clip": 1.02195406, "balance_loss_mlp": 1.03866518, "epoch": 0.633338343604389, "flos": 12602922364800.0, "grad_norm": 2.327809999852644, "language_loss": 0.77426308, "learning_rate": 1.1866417680848542e-06, "loss": 0.7959559, "num_input_tokens_seen": 227059420, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7734375, "step": 10534, "time_per_iteration": 2.5384411811828613 }, { "auxiliary_loss_clip": 0.01137323, "auxiliary_loss_mlp": 0.0103591, "balance_loss_clip": 1.02340519, "balance_loss_mlp": 1.04004478, "epoch": 0.6333984668570569, "flos": 25960111447680.0, "grad_norm": 1.501216962712146, "language_loss": 0.85585785, "learning_rate": 1.1862966150528702e-06, "loss": 0.87759018, "num_input_tokens_seen": 227081310, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 10535, "time_per_iteration": 4.1922287940979 }, { "auxiliary_loss_clip": 0.0112096, "auxiliary_loss_mlp": 0.01030717, "balance_loss_clip": 1.01750851, "balance_loss_mlp": 1.03872645, "epoch": 0.6334585901097249, "flos": 23659242877440.0, "grad_norm": 1.9101156564310415, "language_loss": 0.76490575, "learning_rate": 1.1859514910598658e-06, "loss": 0.78642249, "num_input_tokens_seen": 227100365, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 10536, "time_per_iteration": 4.054410934448242 }, { "auxiliary_loss_clip": 0.01120246, "auxiliary_loss_mlp": 0.01027354, "balance_loss_clip": 1.01481283, "balance_loss_mlp": 1.04092777, "epoch": 0.633518713362393, "flos": 28760496503040.0, "grad_norm": 1.8053921557276384, "language_loss": 0.60184497, "learning_rate": 1.185606396118159e-06, "loss": 0.62332094, "num_input_tokens_seen": 227119680, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 10537, "time_per_iteration": 2.611795425415039 }, { "auxiliary_loss_clip": 0.0105543, "auxiliary_loss_mlp": 0.01001614, "balance_loss_clip": 1.00005233, "balance_loss_mlp": 1.00678492, "epoch": 0.6335788366150609, "flos": 70420322401920.0, "grad_norm": 0.778512560742722, "language_loss": 0.52414978, "learning_rate": 1.1852613302400648e-06, "loss": 0.54472017, "num_input_tokens_seen": 227184465, "router_z_loss_clip": 0.015625, "router_z_loss_mlp": 0.22460938, "step": 10538, "time_per_iteration": 3.2317302227020264 }, { "auxiliary_loss_clip": 0.01132077, "auxiliary_loss_mlp": 0.01031182, "balance_loss_clip": 1.01765144, "balance_loss_mlp": 1.03859043, "epoch": 0.6336389598677289, "flos": 23732069702400.0, "grad_norm": 1.8795476843115557, "language_loss": 0.83710867, "learning_rate": 1.184916293437899e-06, "loss": 0.85874122, "num_input_tokens_seen": 227202185, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 10539, "time_per_iteration": 2.658069610595703 }, { "auxiliary_loss_clip": 0.0112931, "auxiliary_loss_mlp": 0.01029556, "balance_loss_clip": 1.01619279, "balance_loss_mlp": 1.03822684, "epoch": 0.6336990831203968, "flos": 29276676898560.0, "grad_norm": 1.5508615730294337, "language_loss": 0.86720556, "learning_rate": 1.1845712857239732e-06, "loss": 0.88879424, "num_input_tokens_seen": 227222020, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 10540, "time_per_iteration": 2.6372663974761963 }, { "auxiliary_loss_clip": 0.0112429, "auxiliary_loss_mlp": 0.01031371, "balance_loss_clip": 1.02040386, "balance_loss_mlp": 1.0377152, "epoch": 0.6337592063730648, "flos": 29096836479360.0, "grad_norm": 1.7254893591313836, "language_loss": 0.72538787, "learning_rate": 1.1842263071106005e-06, "loss": 0.74694449, "num_input_tokens_seen": 227240885, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.69140625, "step": 10541, "time_per_iteration": 2.561957836151123 }, { "auxiliary_loss_clip": 0.01122637, "auxiliary_loss_mlp": 0.01034964, "balance_loss_clip": 1.02067661, "balance_loss_mlp": 1.03780043, "epoch": 0.6338193296257327, "flos": 34706477249280.0, "grad_norm": 2.8904032514016857, "language_loss": 0.84900182, "learning_rate": 1.1838813576100935e-06, "loss": 0.87057787, "num_input_tokens_seen": 227257880, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7578125, "step": 10542, "time_per_iteration": 2.6498656272888184 }, { "auxiliary_loss_clip": 0.01118612, "auxiliary_loss_mlp": 0.01030564, "balance_loss_clip": 1.01652694, "balance_loss_mlp": 1.03749406, "epoch": 0.6338794528784008, "flos": 16687581269760.0, "grad_norm": 1.6466233909778507, "language_loss": 0.77757883, "learning_rate": 1.1835364372347604e-06, "loss": 0.79907066, "num_input_tokens_seen": 227274840, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 10543, "time_per_iteration": 2.5123307704925537 }, { "auxiliary_loss_clip": 0.01138799, "auxiliary_loss_mlp": 0.01036669, "balance_loss_clip": 1.0248729, "balance_loss_mlp": 1.03585196, "epoch": 0.6339395761310687, "flos": 22346600261760.0, "grad_norm": 2.127610943561708, "language_loss": 0.73197746, "learning_rate": 1.183191545996912e-06, "loss": 0.75373209, "num_input_tokens_seen": 227294835, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 10544, "time_per_iteration": 2.6207432746887207 }, { "auxiliary_loss_clip": 0.01141313, "auxiliary_loss_mlp": 0.0103655, "balance_loss_clip": 1.02380073, "balance_loss_mlp": 1.03877568, "epoch": 0.6339996993837367, "flos": 18551812112640.0, "grad_norm": 2.038720580340454, "language_loss": 0.6844697, "learning_rate": 1.1828466839088568e-06, "loss": 0.70624834, "num_input_tokens_seen": 227314935, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7578125, "step": 10545, "time_per_iteration": 2.5851409435272217 }, { "auxiliary_loss_clip": 0.01123694, "auxiliary_loss_mlp": 0.01038727, "balance_loss_clip": 1.02557802, "balance_loss_mlp": 1.03597283, "epoch": 0.6340598226364046, "flos": 12969498614400.0, "grad_norm": 1.9981910674218915, "language_loss": 0.71090579, "learning_rate": 1.1825018509829007e-06, "loss": 0.73253, "num_input_tokens_seen": 227332905, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 10546, "time_per_iteration": 2.57230281829834 }, { "auxiliary_loss_clip": 0.01113367, "auxiliary_loss_mlp": 0.01030607, "balance_loss_clip": 1.01890647, "balance_loss_mlp": 1.03644061, "epoch": 0.6341199458890726, "flos": 26687984647680.0, "grad_norm": 1.4307555569073507, "language_loss": 0.78057837, "learning_rate": 1.182157047231351e-06, "loss": 0.80201817, "num_input_tokens_seen": 227354915, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 10547, "time_per_iteration": 2.578124523162842 }, { "auxiliary_loss_clip": 0.01146412, "auxiliary_loss_mlp": 0.01036101, "balance_loss_clip": 1.02369761, "balance_loss_mlp": 1.03715467, "epoch": 0.6341800691417405, "flos": 18734274224640.0, "grad_norm": 1.7562838525016453, "language_loss": 0.63142538, "learning_rate": 1.1818122726665128e-06, "loss": 0.65325052, "num_input_tokens_seen": 227372990, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.73828125, "step": 10548, "time_per_iteration": 2.5788557529449463 }, { "auxiliary_loss_clip": 0.01128296, "auxiliary_loss_mlp": 0.01031457, "balance_loss_clip": 1.0192976, "balance_loss_mlp": 1.03889918, "epoch": 0.6342401923944085, "flos": 26249443499520.0, "grad_norm": 1.5719682389682543, "language_loss": 0.61629981, "learning_rate": 1.1814675273006902e-06, "loss": 0.63789731, "num_input_tokens_seen": 227393270, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7109375, "step": 10549, "time_per_iteration": 2.572758913040161 }, { "auxiliary_loss_clip": 0.01142435, "auxiliary_loss_mlp": 0.01032076, "balance_loss_clip": 1.02026844, "balance_loss_mlp": 1.03635573, "epoch": 0.6343003156470765, "flos": 24680937329280.0, "grad_norm": 2.0787919476403256, "language_loss": 0.73722291, "learning_rate": 1.1811228111461855e-06, "loss": 0.758968, "num_input_tokens_seen": 227413630, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.70703125, "step": 10550, "time_per_iteration": 2.5891566276550293 }, { "auxiliary_loss_clip": 0.01056887, "auxiliary_loss_mlp": 0.01001833, "balance_loss_clip": 1.00036049, "balance_loss_mlp": 1.00801599, "epoch": 0.6343604388997445, "flos": 69805352626560.0, "grad_norm": 0.698088671202064, "language_loss": 0.57749563, "learning_rate": 1.180778124215301e-06, "loss": 0.59808284, "num_input_tokens_seen": 227476630, "router_z_loss_clip": 0.01470947, "router_z_loss_mlp": 0.22363281, "step": 10551, "time_per_iteration": 3.2490477561950684 }, { "auxiliary_loss_clip": 0.0113325, "auxiliary_loss_mlp": 0.01031861, "balance_loss_clip": 1.0197612, "balance_loss_mlp": 1.03665614, "epoch": 0.6344205621524125, "flos": 21982430223360.0, "grad_norm": 1.856456466072477, "language_loss": 0.66995102, "learning_rate": 1.180433466520339e-06, "loss": 0.69160211, "num_input_tokens_seen": 227496060, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 10552, "time_per_iteration": 2.5713868141174316 }, { "auxiliary_loss_clip": 0.01138778, "auxiliary_loss_mlp": 0.01029276, "balance_loss_clip": 1.01636553, "balance_loss_mlp": 1.037292, "epoch": 0.6344806854050804, "flos": 20448865008000.0, "grad_norm": 2.0122285838257072, "language_loss": 0.82111025, "learning_rate": 1.180088838073597e-06, "loss": 0.84279078, "num_input_tokens_seen": 227513440, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.74609375, "step": 10553, "time_per_iteration": 2.6192359924316406 }, { "auxiliary_loss_clip": 0.01119051, "auxiliary_loss_mlp": 0.01033652, "balance_loss_clip": 1.02111077, "balance_loss_mlp": 1.03701735, "epoch": 0.6345408086577484, "flos": 40510611187200.0, "grad_norm": 3.157482297156619, "language_loss": 0.55275041, "learning_rate": 1.179744238887376e-06, "loss": 0.5742774, "num_input_tokens_seen": 227535395, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73046875, "step": 10554, "time_per_iteration": 2.7106776237487793 }, { "auxiliary_loss_clip": 0.01127977, "auxiliary_loss_mlp": 0.01033416, "balance_loss_clip": 1.02046371, "balance_loss_mlp": 1.0380491, "epoch": 0.6346009319104163, "flos": 21361319222400.0, "grad_norm": 1.8575644801233113, "language_loss": 0.70701611, "learning_rate": 1.1793996689739729e-06, "loss": 0.72863007, "num_input_tokens_seen": 227554545, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 10555, "time_per_iteration": 2.658785343170166 }, { "auxiliary_loss_clip": 0.01047636, "auxiliary_loss_mlp": 0.01000783, "balance_loss_clip": 0.99936467, "balance_loss_mlp": 1.00772452, "epoch": 0.6346610551630844, "flos": 71365419100800.0, "grad_norm": 0.7806267613553033, "language_loss": 0.55346203, "learning_rate": 1.1790551283456855e-06, "loss": 0.57394624, "num_input_tokens_seen": 227608575, "router_z_loss_clip": 0.01416016, "router_z_loss_mlp": 0.22460938, "step": 10556, "time_per_iteration": 3.1298696994781494 }, { "auxiliary_loss_clip": 0.01128696, "auxiliary_loss_mlp": 0.01031854, "balance_loss_clip": 1.01787031, "balance_loss_mlp": 1.0371418, "epoch": 0.6347211784157523, "flos": 25411504049280.0, "grad_norm": 1.8391595056350534, "language_loss": 0.68163776, "learning_rate": 1.1787106170148082e-06, "loss": 0.70324326, "num_input_tokens_seen": 227628175, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73828125, "step": 10557, "time_per_iteration": 2.6551191806793213 }, { "auxiliary_loss_clip": 0.0112681, "auxiliary_loss_mlp": 0.0103433, "balance_loss_clip": 1.02228987, "balance_loss_mlp": 1.03877211, "epoch": 0.6347813016684203, "flos": 15742735966080.0, "grad_norm": 1.8581759564154063, "language_loss": 0.70041418, "learning_rate": 1.1783661349936363e-06, "loss": 0.72202551, "num_input_tokens_seen": 227645330, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.703125, "step": 10558, "time_per_iteration": 2.63118839263916 }, { "auxiliary_loss_clip": 0.01127629, "auxiliary_loss_mlp": 0.01031828, "balance_loss_clip": 1.01881659, "balance_loss_mlp": 1.0371058, "epoch": 0.6348414249210882, "flos": 21464777370240.0, "grad_norm": 1.5288734094375172, "language_loss": 0.78178036, "learning_rate": 1.1780216822944647e-06, "loss": 0.80337501, "num_input_tokens_seen": 227665250, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 10559, "time_per_iteration": 2.6041197776794434 }, { "auxiliary_loss_clip": 0.0113168, "auxiliary_loss_mlp": 0.01032639, "balance_loss_clip": 1.01941311, "balance_loss_mlp": 1.04008305, "epoch": 0.6349015481737562, "flos": 21653057485440.0, "grad_norm": 1.7729528924918854, "language_loss": 0.68381768, "learning_rate": 1.1776772589295836e-06, "loss": 0.70546085, "num_input_tokens_seen": 227685070, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 10560, "time_per_iteration": 2.5850467681884766 }, { "auxiliary_loss_clip": 0.0112719, "auxiliary_loss_mlp": 0.01033291, "balance_loss_clip": 1.02059555, "balance_loss_mlp": 1.03726721, "epoch": 0.6349616714264241, "flos": 22194984954240.0, "grad_norm": 1.7274279178970797, "language_loss": 0.77035439, "learning_rate": 1.1773328649112858e-06, "loss": 0.79195923, "num_input_tokens_seen": 227704430, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 10561, "time_per_iteration": 2.605086326599121 }, { "auxiliary_loss_clip": 0.0112837, "auxiliary_loss_mlp": 0.01031127, "balance_loss_clip": 1.0178647, "balance_loss_mlp": 1.03804302, "epoch": 0.6350217946790921, "flos": 25410354814080.0, "grad_norm": 1.967910040667252, "language_loss": 0.71887445, "learning_rate": 1.176988500251863e-06, "loss": 0.74046946, "num_input_tokens_seen": 227724920, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 10562, "time_per_iteration": 2.6191132068634033 }, { "auxiliary_loss_clip": 0.01121538, "auxiliary_loss_mlp": 0.01028665, "balance_loss_clip": 1.01548648, "balance_loss_mlp": 1.03847146, "epoch": 0.63508191793176, "flos": 19718944732800.0, "grad_norm": 1.873730425655801, "language_loss": 0.80534929, "learning_rate": 1.176644164963603e-06, "loss": 0.82685131, "num_input_tokens_seen": 227743400, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 10563, "time_per_iteration": 2.545025587081909 }, { "auxiliary_loss_clip": 0.01140634, "auxiliary_loss_mlp": 0.01034602, "balance_loss_clip": 1.0216198, "balance_loss_mlp": 1.0398519, "epoch": 0.6351420411844281, "flos": 18186923802240.0, "grad_norm": 2.286308965184864, "language_loss": 0.80796981, "learning_rate": 1.1762998590587946e-06, "loss": 0.82972217, "num_input_tokens_seen": 227759990, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7421875, "step": 10564, "time_per_iteration": 2.603262186050415 }, { "auxiliary_loss_clip": 0.01130289, "auxiliary_loss_mlp": 0.01032675, "balance_loss_clip": 1.01998532, "balance_loss_mlp": 1.03961825, "epoch": 0.6352021644370961, "flos": 33726511422720.0, "grad_norm": 1.608318588403948, "language_loss": 0.72571218, "learning_rate": 1.1759555825497253e-06, "loss": 0.74734181, "num_input_tokens_seen": 227780835, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 10565, "time_per_iteration": 2.7298319339752197 }, { "auxiliary_loss_clip": 0.01120326, "auxiliary_loss_mlp": 0.01034438, "balance_loss_clip": 1.02092588, "balance_loss_mlp": 1.03765154, "epoch": 0.635262287689764, "flos": 20374781207040.0, "grad_norm": 1.9855723050176577, "language_loss": 0.69127643, "learning_rate": 1.1756113354486826e-06, "loss": 0.71282399, "num_input_tokens_seen": 227798580, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 10566, "time_per_iteration": 3.9843578338623047 }, { "auxiliary_loss_clip": 0.01116969, "auxiliary_loss_mlp": 0.0103251, "balance_loss_clip": 1.02030325, "balance_loss_mlp": 1.03868926, "epoch": 0.635322410942432, "flos": 27525421307520.0, "grad_norm": 1.61515063136177, "language_loss": 0.69609046, "learning_rate": 1.1752671177679495e-06, "loss": 0.71758521, "num_input_tokens_seen": 227819210, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 10567, "time_per_iteration": 2.564391613006592 }, { "auxiliary_loss_clip": 0.01116904, "auxiliary_loss_mlp": 0.01034168, "balance_loss_clip": 1.02199674, "balance_loss_mlp": 1.03939152, "epoch": 0.6353825341950999, "flos": 21543601766400.0, "grad_norm": 1.8193366842578753, "language_loss": 0.84580475, "learning_rate": 1.1749229295198117e-06, "loss": 0.86731547, "num_input_tokens_seen": 227838340, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 10568, "time_per_iteration": 2.5695555210113525 }, { "auxiliary_loss_clip": 0.01148511, "auxiliary_loss_mlp": 0.01037434, "balance_loss_clip": 1.02395761, "balance_loss_mlp": 1.03888464, "epoch": 0.635442657447768, "flos": 31759756185600.0, "grad_norm": 1.7308702025739118, "language_loss": 0.83938682, "learning_rate": 1.174578770716553e-06, "loss": 0.86124623, "num_input_tokens_seen": 227859170, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 10569, "time_per_iteration": 2.622507333755493 }, { "auxiliary_loss_clip": 0.01116314, "auxiliary_loss_mlp": 0.01029827, "balance_loss_clip": 1.01721454, "balance_loss_mlp": 1.03757405, "epoch": 0.6355027807004359, "flos": 19828831415040.0, "grad_norm": 1.8777010688155888, "language_loss": 0.69411504, "learning_rate": 1.1742346413704542e-06, "loss": 0.71557653, "num_input_tokens_seen": 227878545, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 10570, "time_per_iteration": 2.5011091232299805 }, { "auxiliary_loss_clip": 0.01119817, "auxiliary_loss_mlp": 0.01032997, "balance_loss_clip": 1.01940751, "balance_loss_mlp": 1.03650045, "epoch": 0.6355629039531039, "flos": 30372383324160.0, "grad_norm": 3.7281893362048177, "language_loss": 0.65645236, "learning_rate": 1.1738905414937967e-06, "loss": 0.67798054, "num_input_tokens_seen": 227898875, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.74609375, "step": 10571, "time_per_iteration": 4.008267164230347 }, { "auxiliary_loss_clip": 0.011241, "auxiliary_loss_mlp": 0.01027606, "balance_loss_clip": 1.0145824, "balance_loss_mlp": 1.04129446, "epoch": 0.6356230272057718, "flos": 17932065828480.0, "grad_norm": 1.6637712601083525, "language_loss": 0.71270704, "learning_rate": 1.1735464710988608e-06, "loss": 0.73422408, "num_input_tokens_seen": 227917130, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 10572, "time_per_iteration": 2.5274550914764404 }, { "auxiliary_loss_clip": 0.0113457, "auxiliary_loss_mlp": 0.01030969, "balance_loss_clip": 1.01883376, "balance_loss_mlp": 1.03777075, "epoch": 0.6356831504584398, "flos": 25375844822400.0, "grad_norm": 1.554498760339987, "language_loss": 0.80989671, "learning_rate": 1.1732024301979264e-06, "loss": 0.83155209, "num_input_tokens_seen": 227939550, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 10573, "time_per_iteration": 2.6609609127044678 }, { "auxiliary_loss_clip": 0.01129513, "auxiliary_loss_mlp": 0.01029248, "balance_loss_clip": 1.01671934, "balance_loss_mlp": 1.03904033, "epoch": 0.6357432737111077, "flos": 46500331720320.0, "grad_norm": 1.6991167661624968, "language_loss": 0.6898517, "learning_rate": 1.1728584188032695e-06, "loss": 0.71143937, "num_input_tokens_seen": 227962200, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73046875, "step": 10574, "time_per_iteration": 2.816164493560791 }, { "auxiliary_loss_clip": 0.01109804, "auxiliary_loss_mlp": 0.01028783, "balance_loss_clip": 1.01615262, "balance_loss_mlp": 1.0390135, "epoch": 0.6358033969637757, "flos": 17274361847040.0, "grad_norm": 2.4448004847230815, "language_loss": 0.8676514, "learning_rate": 1.1725144369271678e-06, "loss": 0.88903725, "num_input_tokens_seen": 227979270, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 10575, "time_per_iteration": 2.5568766593933105 }, { "auxiliary_loss_clip": 0.01109075, "auxiliary_loss_mlp": 0.01031351, "balance_loss_clip": 1.01901865, "balance_loss_mlp": 1.0381248, "epoch": 0.6358635202164437, "flos": 27125520215040.0, "grad_norm": 1.6104310990580368, "language_loss": 0.72254288, "learning_rate": 1.1721704845818986e-06, "loss": 0.74394709, "num_input_tokens_seen": 228000550, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 10576, "time_per_iteration": 2.6002466678619385 }, { "auxiliary_loss_clip": 0.01129068, "auxiliary_loss_mlp": 0.01029795, "balance_loss_clip": 1.01641393, "balance_loss_mlp": 1.03821301, "epoch": 0.6359236434691117, "flos": 27525205825920.0, "grad_norm": 1.699963213555634, "language_loss": 0.69320071, "learning_rate": 1.1718265617797341e-06, "loss": 0.71478933, "num_input_tokens_seen": 228022005, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 10577, "time_per_iteration": 6.5463175773620605 }, { "auxiliary_loss_clip": 0.01144263, "auxiliary_loss_mlp": 0.0103033, "balance_loss_clip": 1.01806951, "balance_loss_mlp": 1.03733182, "epoch": 0.6359837667217797, "flos": 39348290989440.0, "grad_norm": 2.5627723347650893, "language_loss": 0.71982282, "learning_rate": 1.17148266853295e-06, "loss": 0.74156874, "num_input_tokens_seen": 228043770, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.71875, "step": 10578, "time_per_iteration": 2.745817184448242 }, { "auxiliary_loss_clip": 0.01056387, "auxiliary_loss_mlp": 0.01250387, "balance_loss_clip": 1.00302207, "balance_loss_mlp": 1.0073061, "epoch": 0.6360438899744476, "flos": 56413797206400.0, "grad_norm": 0.7139968854084883, "language_loss": 0.54547995, "learning_rate": 1.1711388048538182e-06, "loss": 0.56854767, "num_input_tokens_seen": 228104985, "router_z_loss_clip": 0.01470947, "router_z_loss_mlp": 0.22363281, "step": 10579, "time_per_iteration": 3.2688846588134766 }, { "auxiliary_loss_clip": 0.01128295, "auxiliary_loss_mlp": 0.0103224, "balance_loss_clip": 1.01960945, "balance_loss_mlp": 1.03855133, "epoch": 0.6361040132271156, "flos": 24973106555520.0, "grad_norm": 1.6828304654522115, "language_loss": 0.77666998, "learning_rate": 1.17079497075461e-06, "loss": 0.79827535, "num_input_tokens_seen": 228125620, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 10580, "time_per_iteration": 2.5854146480560303 }, { "auxiliary_loss_clip": 0.0113427, "auxiliary_loss_mlp": 0.01275137, "balance_loss_clip": 1.01617408, "balance_loss_mlp": 1.03648829, "epoch": 0.6361641364797835, "flos": 23259198130560.0, "grad_norm": 2.1548571388229476, "language_loss": 0.66596115, "learning_rate": 1.1704511662475964e-06, "loss": 0.69005525, "num_input_tokens_seen": 228143495, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 10581, "time_per_iteration": 2.677398681640625 }, { "auxiliary_loss_clip": 0.011166, "auxiliary_loss_mlp": 0.01031554, "balance_loss_clip": 1.01956189, "balance_loss_mlp": 1.03659785, "epoch": 0.6362242597324516, "flos": 25994513698560.0, "grad_norm": 1.4204424545940968, "language_loss": 0.68414783, "learning_rate": 1.1701073913450465e-06, "loss": 0.70562935, "num_input_tokens_seen": 228166500, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 10582, "time_per_iteration": 2.6590960025787354 }, { "auxiliary_loss_clip": 0.01128368, "auxiliary_loss_mlp": 0.01038116, "balance_loss_clip": 1.02573001, "balance_loss_mlp": 1.03712988, "epoch": 0.6362843829851195, "flos": 25703242312320.0, "grad_norm": 1.7624588747932846, "language_loss": 0.8453055, "learning_rate": 1.1697636460592301e-06, "loss": 0.8669703, "num_input_tokens_seen": 228185325, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.734375, "step": 10583, "time_per_iteration": 2.639812469482422 }, { "auxiliary_loss_clip": 0.01126218, "auxiliary_loss_mlp": 0.01030607, "balance_loss_clip": 1.01813793, "balance_loss_mlp": 1.0394758, "epoch": 0.6363445062377875, "flos": 20522912895360.0, "grad_norm": 1.577748697047147, "language_loss": 0.75551569, "learning_rate": 1.1694199304024125e-06, "loss": 0.77708387, "num_input_tokens_seen": 228204050, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 10584, "time_per_iteration": 2.6029746532440186 }, { "auxiliary_loss_clip": 0.01128145, "auxiliary_loss_mlp": 0.01035507, "balance_loss_clip": 1.02160096, "balance_loss_mlp": 1.0373013, "epoch": 0.6364046294904554, "flos": 19463799450240.0, "grad_norm": 1.7923396463706143, "language_loss": 0.72340441, "learning_rate": 1.1690762443868613e-06, "loss": 0.74504089, "num_input_tokens_seen": 228222430, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 10585, "time_per_iteration": 2.6335694789886475 }, { "auxiliary_loss_clip": 0.01135259, "auxiliary_loss_mlp": 0.0102923, "balance_loss_clip": 1.01809549, "balance_loss_mlp": 1.03798056, "epoch": 0.6364647527431234, "flos": 20995892208000.0, "grad_norm": 1.8593031354248901, "language_loss": 0.82869697, "learning_rate": 1.1687325880248424e-06, "loss": 0.8503418, "num_input_tokens_seen": 228241925, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.703125, "step": 10586, "time_per_iteration": 2.6036880016326904 }, { "auxiliary_loss_clip": 0.01116882, "auxiliary_loss_mlp": 0.01026505, "balance_loss_clip": 1.01479828, "balance_loss_mlp": 1.03823113, "epoch": 0.6365248759957913, "flos": 25770789838080.0, "grad_norm": 1.5459560396575989, "language_loss": 0.72269428, "learning_rate": 1.1683889613286183e-06, "loss": 0.74412811, "num_input_tokens_seen": 228262535, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69921875, "step": 10587, "time_per_iteration": 2.63004732131958 }, { "auxiliary_loss_clip": 0.01126933, "auxiliary_loss_mlp": 0.01028984, "balance_loss_clip": 1.01671147, "balance_loss_mlp": 1.03816152, "epoch": 0.6365849992484593, "flos": 22455589104000.0, "grad_norm": 2.17153853610866, "language_loss": 0.76702124, "learning_rate": 1.1680453643104527e-06, "loss": 0.78858042, "num_input_tokens_seen": 228281340, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 10588, "time_per_iteration": 2.604146957397461 }, { "auxiliary_loss_clip": 0.01140942, "auxiliary_loss_mlp": 0.01030212, "balance_loss_clip": 1.01720667, "balance_loss_mlp": 1.03841305, "epoch": 0.6366451225011273, "flos": 19025689265280.0, "grad_norm": 1.3800163845168956, "language_loss": 0.80013025, "learning_rate": 1.1677017969826093e-06, "loss": 0.82184178, "num_input_tokens_seen": 228300865, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6796875, "step": 10589, "time_per_iteration": 2.5695242881774902 }, { "auxiliary_loss_clip": 0.01124131, "auxiliary_loss_mlp": 0.01028657, "balance_loss_clip": 1.0160507, "balance_loss_mlp": 1.03769433, "epoch": 0.6367052457537953, "flos": 25228395492480.0, "grad_norm": 2.362621210776988, "language_loss": 0.67279172, "learning_rate": 1.167358259357347e-06, "loss": 0.69431961, "num_input_tokens_seen": 228320815, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 10590, "time_per_iteration": 2.611973285675049 }, { "auxiliary_loss_clip": 0.01123345, "auxiliary_loss_mlp": 0.01032884, "balance_loss_clip": 1.02020621, "balance_loss_mlp": 1.03811121, "epoch": 0.6367653690064633, "flos": 19208438686080.0, "grad_norm": 1.6790135339515355, "language_loss": 0.78940558, "learning_rate": 1.167014751446926e-06, "loss": 0.81096786, "num_input_tokens_seen": 228339065, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.765625, "step": 10591, "time_per_iteration": 2.5503363609313965 }, { "auxiliary_loss_clip": 0.01123494, "auxiliary_loss_mlp": 0.0102872, "balance_loss_clip": 1.01750278, "balance_loss_mlp": 1.03711104, "epoch": 0.6368254922591312, "flos": 23546806329600.0, "grad_norm": 1.4554019492414156, "language_loss": 0.89220953, "learning_rate": 1.1666712732636069e-06, "loss": 0.91373169, "num_input_tokens_seen": 228359210, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6875, "step": 10592, "time_per_iteration": 2.6356165409088135 }, { "auxiliary_loss_clip": 0.01112897, "auxiliary_loss_mlp": 0.01028801, "balance_loss_clip": 1.01767898, "balance_loss_mlp": 1.03512955, "epoch": 0.6368856155117992, "flos": 26467313443200.0, "grad_norm": 1.423976539641805, "language_loss": 0.68466878, "learning_rate": 1.166327824819646e-06, "loss": 0.70608574, "num_input_tokens_seen": 228379630, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6875, "step": 10593, "time_per_iteration": 2.63653302192688 }, { "auxiliary_loss_clip": 0.01140027, "auxiliary_loss_mlp": 0.01031631, "balance_loss_clip": 1.02006185, "balance_loss_mlp": 1.03478312, "epoch": 0.6369457387644671, "flos": 33692432394240.0, "grad_norm": 1.6387619333115573, "language_loss": 0.63843274, "learning_rate": 1.1659844061273007e-06, "loss": 0.66014928, "num_input_tokens_seen": 228401410, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6953125, "step": 10594, "time_per_iteration": 2.7178003787994385 }, { "auxiliary_loss_clip": 0.01037264, "auxiliary_loss_mlp": 0.01001673, "balance_loss_clip": 1.00021303, "balance_loss_mlp": 1.00614357, "epoch": 0.6370058620171352, "flos": 70908600908160.0, "grad_norm": 0.7691656796330725, "language_loss": 0.54712451, "learning_rate": 1.1656410171988259e-06, "loss": 0.56751382, "num_input_tokens_seen": 228470335, "router_z_loss_clip": 0.0145874, "router_z_loss_mlp": 0.22363281, "step": 10595, "time_per_iteration": 3.3263320922851562 }, { "auxiliary_loss_clip": 0.01110363, "auxiliary_loss_mlp": 0.01037765, "balance_loss_clip": 1.02437818, "balance_loss_mlp": 1.03766823, "epoch": 0.6370659852698031, "flos": 21141940907520.0, "grad_norm": 1.6885551074336256, "language_loss": 0.67056268, "learning_rate": 1.1652976580464787e-06, "loss": 0.69204396, "num_input_tokens_seen": 228490765, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 10596, "time_per_iteration": 2.6821036338806152 }, { "auxiliary_loss_clip": 0.01116659, "auxiliary_loss_mlp": 0.01031919, "balance_loss_clip": 1.01996839, "balance_loss_mlp": 1.03627551, "epoch": 0.6371261085224711, "flos": 20193288762240.0, "grad_norm": 2.0810222348672998, "language_loss": 0.792799, "learning_rate": 1.164954328682509e-06, "loss": 0.81428468, "num_input_tokens_seen": 228509700, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.7109375, "step": 10597, "time_per_iteration": 2.5109217166900635 }, { "auxiliary_loss_clip": 0.01118391, "auxiliary_loss_mlp": 0.0103166, "balance_loss_clip": 1.01998937, "balance_loss_mlp": 1.03870225, "epoch": 0.637186231775139, "flos": 19683536901120.0, "grad_norm": 1.7763830472449815, "language_loss": 0.74767387, "learning_rate": 1.1646110291191724e-06, "loss": 0.76917434, "num_input_tokens_seen": 228529050, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.7109375, "step": 10598, "time_per_iteration": 2.5584871768951416 }, { "auxiliary_loss_clip": 0.01133337, "auxiliary_loss_mlp": 0.01285001, "balance_loss_clip": 1.02609622, "balance_loss_mlp": 1.03541052, "epoch": 0.637246355027807, "flos": 13071196995840.0, "grad_norm": 4.33618094805948, "language_loss": 0.67829913, "learning_rate": 1.1642677593687184e-06, "loss": 0.70248252, "num_input_tokens_seen": 228544665, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71875, "step": 10599, "time_per_iteration": 2.5047194957733154 }, { "auxiliary_loss_clip": 0.01136774, "auxiliary_loss_mlp": 0.01035057, "balance_loss_clip": 1.0221169, "balance_loss_mlp": 1.03614306, "epoch": 0.6373064782804749, "flos": 18222654856320.0, "grad_norm": 2.31643432796841, "language_loss": 0.80401623, "learning_rate": 1.1639245194434e-06, "loss": 0.8257345, "num_input_tokens_seen": 228562060, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 10600, "time_per_iteration": 2.5529696941375732 }, { "auxiliary_loss_clip": 0.01114358, "auxiliary_loss_mlp": 0.01032384, "balance_loss_clip": 1.02065444, "balance_loss_mlp": 1.03500879, "epoch": 0.637366601533143, "flos": 24498475217280.0, "grad_norm": 1.4239424834288037, "language_loss": 0.79562294, "learning_rate": 1.163581309355464e-06, "loss": 0.81709027, "num_input_tokens_seen": 228582550, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.703125, "step": 10601, "time_per_iteration": 2.528438091278076 }, { "auxiliary_loss_clip": 0.01133502, "auxiliary_loss_mlp": 0.01027841, "balance_loss_clip": 1.01576567, "balance_loss_mlp": 1.03542006, "epoch": 0.6374267247858109, "flos": 26359042872960.0, "grad_norm": 2.173079512646842, "language_loss": 0.6701299, "learning_rate": 1.163238129117159e-06, "loss": 0.69174325, "num_input_tokens_seen": 228604960, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.71875, "step": 10602, "time_per_iteration": 2.613125801086426 }, { "auxiliary_loss_clip": 0.01157778, "auxiliary_loss_mlp": 0.01028009, "balance_loss_clip": 1.01621294, "balance_loss_mlp": 1.03453362, "epoch": 0.6374868480384789, "flos": 20371728551040.0, "grad_norm": 1.7391207220252585, "language_loss": 0.79199618, "learning_rate": 1.1628949787407338e-06, "loss": 0.8138541, "num_input_tokens_seen": 228622195, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 10603, "time_per_iteration": 2.56563401222229 }, { "auxiliary_loss_clip": 0.01140635, "auxiliary_loss_mlp": 0.01276685, "balance_loss_clip": 1.0170145, "balance_loss_mlp": 1.03497279, "epoch": 0.6375469712911469, "flos": 20996251344000.0, "grad_norm": 2.0203185024125374, "language_loss": 0.76433802, "learning_rate": 1.1625518582384323e-06, "loss": 0.78851128, "num_input_tokens_seen": 228639735, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 10604, "time_per_iteration": 2.591034173965454 }, { "auxiliary_loss_clip": 0.0112667, "auxiliary_loss_mlp": 0.0103123, "balance_loss_clip": 1.01887953, "balance_loss_mlp": 1.03559315, "epoch": 0.6376070945438148, "flos": 19715748422400.0, "grad_norm": 2.0274575634217, "language_loss": 0.77039826, "learning_rate": 1.1622087676225017e-06, "loss": 0.79197729, "num_input_tokens_seen": 228658195, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.734375, "step": 10605, "time_per_iteration": 2.660571336746216 }, { "auxiliary_loss_clip": 0.01153405, "auxiliary_loss_mlp": 0.01031753, "balance_loss_clip": 1.01776421, "balance_loss_mlp": 1.03653026, "epoch": 0.6376672177964828, "flos": 21506757390720.0, "grad_norm": 1.748160763499794, "language_loss": 0.65547538, "learning_rate": 1.1618657069051847e-06, "loss": 0.67732698, "num_input_tokens_seen": 228677415, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7265625, "step": 10606, "time_per_iteration": 2.7012782096862793 }, { "auxiliary_loss_clip": 0.01120291, "auxiliary_loss_mlp": 0.01036064, "balance_loss_clip": 1.02451897, "balance_loss_mlp": 1.03550935, "epoch": 0.6377273410491507, "flos": 18843873598080.0, "grad_norm": 1.8015103616628543, "language_loss": 0.75461459, "learning_rate": 1.1615226760987252e-06, "loss": 0.77617818, "num_input_tokens_seen": 228696450, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 10607, "time_per_iteration": 4.075138568878174 }, { "auxiliary_loss_clip": 0.01124097, "auxiliary_loss_mlp": 0.01040333, "balance_loss_clip": 1.02798843, "balance_loss_mlp": 1.03579807, "epoch": 0.6377874643018188, "flos": 53062970181120.0, "grad_norm": 1.9565478016601414, "language_loss": 0.65934837, "learning_rate": 1.1611796752153633e-06, "loss": 0.68099272, "num_input_tokens_seen": 228721600, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7109375, "step": 10608, "time_per_iteration": 2.877941846847534 }, { "auxiliary_loss_clip": 0.01080945, "auxiliary_loss_mlp": 0.01004068, "balance_loss_clip": 1.00272083, "balance_loss_mlp": 1.00554383, "epoch": 0.6378475875544867, "flos": 65135026465920.0, "grad_norm": 0.7204960288145921, "language_loss": 0.51901925, "learning_rate": 1.1608367042673421e-06, "loss": 0.53986937, "num_input_tokens_seen": 228784535, "router_z_loss_clip": 0.01348877, "router_z_loss_mlp": 0.22460938, "step": 10609, "time_per_iteration": 3.205127000808716 }, { "auxiliary_loss_clip": 0.0111038, "auxiliary_loss_mlp": 0.01022862, "balance_loss_clip": 1.01231194, "balance_loss_mlp": 1.0339601, "epoch": 0.6379077108071547, "flos": 23002759958400.0, "grad_norm": 1.5023332284536877, "language_loss": 0.74691004, "learning_rate": 1.1604937632669006e-06, "loss": 0.76824242, "num_input_tokens_seen": 228804110, "router_z_loss_clip": 0.10546875, "router_z_loss_mlp": 0.67578125, "step": 10610, "time_per_iteration": 2.5986201763153076 }, { "auxiliary_loss_clip": 0.01044404, "auxiliary_loss_mlp": 0.01005422, "balance_loss_clip": 1.00406861, "balance_loss_mlp": 1.0051465, "epoch": 0.6379678340598226, "flos": 67601947610880.0, "grad_norm": 0.8311728167550401, "language_loss": 0.63145775, "learning_rate": 1.1601508522262767e-06, "loss": 0.65195602, "num_input_tokens_seen": 228867705, "router_z_loss_clip": 0.0135498, "router_z_loss_mlp": 0.22265625, "step": 10611, "time_per_iteration": 3.2360947132110596 }, { "auxiliary_loss_clip": 0.0111522, "auxiliary_loss_mlp": 0.01029545, "balance_loss_clip": 1.01709938, "balance_loss_mlp": 1.0353986, "epoch": 0.6380279573124906, "flos": 29680061610240.0, "grad_norm": 2.608026151425905, "language_loss": 0.72551227, "learning_rate": 1.1598079711577083e-06, "loss": 0.74695992, "num_input_tokens_seen": 228889215, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 10612, "time_per_iteration": 2.6873559951782227 }, { "auxiliary_loss_clip": 0.01045358, "auxiliary_loss_mlp": 0.01002285, "balance_loss_clip": 1.00093806, "balance_loss_mlp": 1.00529838, "epoch": 0.6380880805651585, "flos": 66484046580480.0, "grad_norm": 0.7107765633262715, "language_loss": 0.57859105, "learning_rate": 1.1594651200734333e-06, "loss": 0.59906751, "num_input_tokens_seen": 228948465, "router_z_loss_clip": 0.01348877, "router_z_loss_mlp": 0.22460938, "step": 10613, "time_per_iteration": 4.5248517990112305 }, { "auxiliary_loss_clip": 0.01142376, "auxiliary_loss_mlp": 0.0103519, "balance_loss_clip": 1.02232695, "balance_loss_mlp": 1.0355885, "epoch": 0.6381482038178266, "flos": 23914998691200.0, "grad_norm": 1.8381853865985767, "language_loss": 0.75530398, "learning_rate": 1.1591222989856847e-06, "loss": 0.77707958, "num_input_tokens_seen": 228967955, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 10614, "time_per_iteration": 2.6336801052093506 }, { "auxiliary_loss_clip": 0.01125318, "auxiliary_loss_mlp": 0.0128131, "balance_loss_clip": 1.02266622, "balance_loss_mlp": 1.03673148, "epoch": 0.6382083270704945, "flos": 24243042625920.0, "grad_norm": 1.8118639161815198, "language_loss": 0.79595816, "learning_rate": 1.158779507906699e-06, "loss": 0.82002437, "num_input_tokens_seen": 228985495, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.70703125, "step": 10615, "time_per_iteration": 2.6185152530670166 }, { "auxiliary_loss_clip": 0.0104538, "auxiliary_loss_mlp": 0.01002283, "balance_loss_clip": 1.00087059, "balance_loss_mlp": 1.00539947, "epoch": 0.6382684503231625, "flos": 70775552931840.0, "grad_norm": 0.654285717730853, "language_loss": 0.55585575, "learning_rate": 1.1584367468487087e-06, "loss": 0.57633239, "num_input_tokens_seen": 229052995, "router_z_loss_clip": 0.01409912, "router_z_loss_mlp": 0.22460938, "step": 10616, "time_per_iteration": 3.2948338985443115 }, { "auxiliary_loss_clip": 0.01123641, "auxiliary_loss_mlp": 0.01031177, "balance_loss_clip": 1.01851058, "balance_loss_mlp": 1.03509855, "epoch": 0.6383285735758305, "flos": 16544836621440.0, "grad_norm": 1.7950278261009005, "language_loss": 0.83779597, "learning_rate": 1.158094015823946e-06, "loss": 0.85934412, "num_input_tokens_seen": 229071030, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 10617, "time_per_iteration": 2.616448163986206 }, { "auxiliary_loss_clip": 0.0111554, "auxiliary_loss_mlp": 0.01035346, "balance_loss_clip": 1.02235174, "balance_loss_mlp": 1.03495979, "epoch": 0.6383886968284984, "flos": 14427651225600.0, "grad_norm": 1.800459000143044, "language_loss": 0.87098539, "learning_rate": 1.1577513148446426e-06, "loss": 0.89249426, "num_input_tokens_seen": 229088275, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 10618, "time_per_iteration": 4.048682928085327 }, { "auxiliary_loss_clip": 0.01154139, "auxiliary_loss_mlp": 0.01034381, "balance_loss_clip": 1.02312756, "balance_loss_mlp": 1.03782392, "epoch": 0.6384488200811664, "flos": 17929659617280.0, "grad_norm": 2.6378668188585443, "language_loss": 0.73660713, "learning_rate": 1.1574086439230273e-06, "loss": 0.75849235, "num_input_tokens_seen": 229105190, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.71875, "step": 10619, "time_per_iteration": 4.445390224456787 }, { "auxiliary_loss_clip": 0.01129793, "auxiliary_loss_mlp": 0.01038315, "balance_loss_clip": 1.02445698, "balance_loss_mlp": 1.03688717, "epoch": 0.6385089433338343, "flos": 18515578268160.0, "grad_norm": 2.070844858508105, "language_loss": 0.76612771, "learning_rate": 1.1570660030713315e-06, "loss": 0.78780878, "num_input_tokens_seen": 229122290, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 10620, "time_per_iteration": 2.5636038780212402 }, { "auxiliary_loss_clip": 0.01123009, "auxiliary_loss_mlp": 0.01031127, "balance_loss_clip": 1.0189085, "balance_loss_mlp": 1.03600597, "epoch": 0.6385690665865024, "flos": 24753620499840.0, "grad_norm": 2.688921175687261, "language_loss": 0.70451516, "learning_rate": 1.1567233923017805e-06, "loss": 0.72605658, "num_input_tokens_seen": 229141620, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 10621, "time_per_iteration": 2.5936191082000732 }, { "auxiliary_loss_clip": 0.011167, "auxiliary_loss_mlp": 0.01023176, "balance_loss_clip": 1.0125252, "balance_loss_mlp": 1.03323376, "epoch": 0.6386291898391703, "flos": 20120569678080.0, "grad_norm": 1.6404763968913052, "language_loss": 0.77536714, "learning_rate": 1.1563808116266032e-06, "loss": 0.79676592, "num_input_tokens_seen": 229161570, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.65625, "step": 10622, "time_per_iteration": 2.6024563312530518 }, { "auxiliary_loss_clip": 0.01129814, "auxiliary_loss_mlp": 0.01026819, "balance_loss_clip": 1.0150528, "balance_loss_mlp": 1.03435159, "epoch": 0.6386893130918383, "flos": 16867278034560.0, "grad_norm": 1.6770975498088387, "language_loss": 0.75136018, "learning_rate": 1.1560382610580245e-06, "loss": 0.77292645, "num_input_tokens_seen": 229178465, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 10623, "time_per_iteration": 2.59014630317688 }, { "auxiliary_loss_clip": 0.01120114, "auxiliary_loss_mlp": 0.01027673, "balance_loss_clip": 1.01636016, "balance_loss_mlp": 1.03435302, "epoch": 0.6387494363445062, "flos": 22966274718720.0, "grad_norm": 1.3729383600350455, "language_loss": 0.76647818, "learning_rate": 1.1556957406082694e-06, "loss": 0.78795606, "num_input_tokens_seen": 229198975, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 10624, "time_per_iteration": 2.6045219898223877 }, { "auxiliary_loss_clip": 0.01148373, "auxiliary_loss_mlp": 0.0102961, "balance_loss_clip": 1.01820183, "balance_loss_mlp": 1.03488541, "epoch": 0.6388095595971742, "flos": 22857716839680.0, "grad_norm": 1.6768751760434095, "language_loss": 0.80374253, "learning_rate": 1.155353250289561e-06, "loss": 0.8255223, "num_input_tokens_seen": 229218825, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.69140625, "step": 10625, "time_per_iteration": 2.6404836177825928 }, { "auxiliary_loss_clip": 0.01125258, "auxiliary_loss_mlp": 0.01032774, "balance_loss_clip": 1.02018571, "balance_loss_mlp": 1.03683972, "epoch": 0.6388696828498421, "flos": 17311529445120.0, "grad_norm": 1.7068354620384083, "language_loss": 0.72468507, "learning_rate": 1.1550107901141228e-06, "loss": 0.74626541, "num_input_tokens_seen": 229236060, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 10626, "time_per_iteration": 2.7013940811157227 }, { "auxiliary_loss_clip": 0.01124426, "auxiliary_loss_mlp": 0.01029156, "balance_loss_clip": 1.01644242, "balance_loss_mlp": 1.03569984, "epoch": 0.6389298061025102, "flos": 17128636369920.0, "grad_norm": 1.8318020717255141, "language_loss": 0.72493017, "learning_rate": 1.154668360094176e-06, "loss": 0.74646598, "num_input_tokens_seen": 229255160, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 10627, "time_per_iteration": 2.6899940967559814 }, { "auxiliary_loss_clip": 0.01138383, "auxiliary_loss_mlp": 0.0102999, "balance_loss_clip": 1.01622128, "balance_loss_mlp": 1.03736734, "epoch": 0.6389899293551781, "flos": 27710971989120.0, "grad_norm": 2.23649120106946, "language_loss": 0.6669935, "learning_rate": 1.154325960241941e-06, "loss": 0.68867719, "num_input_tokens_seen": 229278705, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 10628, "time_per_iteration": 2.6504459381103516 }, { "auxiliary_loss_clip": 0.01101295, "auxiliary_loss_mlp": 0.01027878, "balance_loss_clip": 1.0167321, "balance_loss_mlp": 1.034374, "epoch": 0.6390500526078461, "flos": 21215701486080.0, "grad_norm": 1.9134570809296056, "language_loss": 0.67666519, "learning_rate": 1.1539835905696365e-06, "loss": 0.69795692, "num_input_tokens_seen": 229299990, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.671875, "step": 10629, "time_per_iteration": 2.615525960922241 }, { "auxiliary_loss_clip": 0.01139447, "auxiliary_loss_mlp": 0.01037403, "balance_loss_clip": 1.02333641, "balance_loss_mlp": 1.03696501, "epoch": 0.6391101758605141, "flos": 21581056673280.0, "grad_norm": 2.842982905811784, "language_loss": 0.75386447, "learning_rate": 1.1536412510894828e-06, "loss": 0.77563298, "num_input_tokens_seen": 229319230, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 10630, "time_per_iteration": 2.663637399673462 }, { "auxiliary_loss_clip": 0.01120949, "auxiliary_loss_mlp": 0.01029359, "balance_loss_clip": 1.0187614, "balance_loss_mlp": 1.03481126, "epoch": 0.639170299113182, "flos": 19900473091200.0, "grad_norm": 1.6959968879436507, "language_loss": 0.70426619, "learning_rate": 1.1532989418136951e-06, "loss": 0.72576928, "num_input_tokens_seen": 229338600, "router_z_loss_clip": 0.10595703, "router_z_loss_mlp": 0.68359375, "step": 10631, "time_per_iteration": 2.5176234245300293 }, { "auxiliary_loss_clip": 0.01047383, "auxiliary_loss_mlp": 0.0100255, "balance_loss_clip": 1.00122726, "balance_loss_mlp": 1.00670481, "epoch": 0.63923042236585, "flos": 69877604833920.0, "grad_norm": 0.7674577164875156, "language_loss": 0.62942231, "learning_rate": 1.1529566627544894e-06, "loss": 0.64992166, "num_input_tokens_seen": 229402420, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.22460938, "step": 10632, "time_per_iteration": 3.227869987487793 }, { "auxiliary_loss_clip": 0.01121, "auxiliary_loss_mlp": 0.01030238, "balance_loss_clip": 1.01788783, "balance_loss_mlp": 1.03485417, "epoch": 0.639290545618518, "flos": 22674823764480.0, "grad_norm": 1.8841864321780941, "language_loss": 0.66992414, "learning_rate": 1.1526144139240832e-06, "loss": 0.69143653, "num_input_tokens_seen": 229419185, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.68359375, "step": 10633, "time_per_iteration": 2.516944169998169 }, { "auxiliary_loss_clip": 0.0115825, "auxiliary_loss_mlp": 0.01026652, "balance_loss_clip": 1.01502919, "balance_loss_mlp": 1.0363338, "epoch": 0.639350668871186, "flos": 19829190551040.0, "grad_norm": 2.031823493428768, "language_loss": 0.81779087, "learning_rate": 1.152272195334687e-06, "loss": 0.83963984, "num_input_tokens_seen": 229436735, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 10634, "time_per_iteration": 2.647333860397339 }, { "auxiliary_loss_clip": 0.0112435, "auxiliary_loss_mlp": 0.01035905, "balance_loss_clip": 1.02269673, "balance_loss_mlp": 1.03672528, "epoch": 0.6394107921238539, "flos": 20553328736640.0, "grad_norm": 2.182848083359375, "language_loss": 0.7514599, "learning_rate": 1.1519300069985165e-06, "loss": 0.77306247, "num_input_tokens_seen": 229455595, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 10635, "time_per_iteration": 2.5246434211730957 }, { "auxiliary_loss_clip": 0.01115176, "auxiliary_loss_mlp": 0.01032098, "balance_loss_clip": 1.01971865, "balance_loss_mlp": 1.0349406, "epoch": 0.6394709153765219, "flos": 25774991729280.0, "grad_norm": 1.7666916132490857, "language_loss": 0.71322465, "learning_rate": 1.151587848927782e-06, "loss": 0.73469734, "num_input_tokens_seen": 229476230, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 10636, "time_per_iteration": 2.616863489151001 }, { "auxiliary_loss_clip": 0.01126248, "auxiliary_loss_mlp": 0.0103372, "balance_loss_clip": 1.02078557, "balance_loss_mlp": 1.03650117, "epoch": 0.6395310386291898, "flos": 17530153574400.0, "grad_norm": 1.813830428645392, "language_loss": 0.73811018, "learning_rate": 1.1512457211346963e-06, "loss": 0.75970984, "num_input_tokens_seen": 229494300, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 10637, "time_per_iteration": 2.550997734069824 }, { "auxiliary_loss_clip": 0.01114669, "auxiliary_loss_mlp": 0.01030559, "balance_loss_clip": 1.01809597, "balance_loss_mlp": 1.03356814, "epoch": 0.6395911618818578, "flos": 18588225525120.0, "grad_norm": 1.7428761794635457, "language_loss": 0.77627575, "learning_rate": 1.1509036236314656e-06, "loss": 0.79772806, "num_input_tokens_seen": 229512985, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 10638, "time_per_iteration": 2.7059614658355713 }, { "auxiliary_loss_clip": 0.01121951, "auxiliary_loss_mlp": 0.01032144, "balance_loss_clip": 1.01950169, "balance_loss_mlp": 1.03420353, "epoch": 0.6396512851345257, "flos": 28366557068160.0, "grad_norm": 1.7732764008106592, "language_loss": 0.81688195, "learning_rate": 1.1505615564303016e-06, "loss": 0.83842289, "num_input_tokens_seen": 229534270, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 10639, "time_per_iteration": 2.6654369831085205 }, { "auxiliary_loss_clip": 0.01046014, "auxiliary_loss_mlp": 0.00999075, "balance_loss_clip": 0.99778765, "balance_loss_mlp": 1.00603747, "epoch": 0.6397114083871938, "flos": 70724307202560.0, "grad_norm": 0.8165532715911373, "language_loss": 0.59002769, "learning_rate": 1.1502195195434104e-06, "loss": 0.61047864, "num_input_tokens_seen": 229596455, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.22265625, "step": 10640, "time_per_iteration": 3.2501533031463623 }, { "auxiliary_loss_clip": 0.01119381, "auxiliary_loss_mlp": 0.01027484, "balance_loss_clip": 1.0162549, "balance_loss_mlp": 1.03385234, "epoch": 0.6397715316398617, "flos": 18142537570560.0, "grad_norm": 2.3971686602183193, "language_loss": 0.78429735, "learning_rate": 1.1498775129829988e-06, "loss": 0.80576605, "num_input_tokens_seen": 229612860, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.67578125, "step": 10641, "time_per_iteration": 2.6250786781311035 }, { "auxiliary_loss_clip": 0.01145601, "auxiliary_loss_mlp": 0.01033313, "balance_loss_clip": 1.02097523, "balance_loss_mlp": 1.03528798, "epoch": 0.6398316548925297, "flos": 25739512070400.0, "grad_norm": 1.6462543174721609, "language_loss": 0.63363373, "learning_rate": 1.149535536761271e-06, "loss": 0.65542287, "num_input_tokens_seen": 229633960, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7421875, "step": 10642, "time_per_iteration": 2.651380777359009 }, { "auxiliary_loss_clip": 0.01047395, "auxiliary_loss_mlp": 0.01001725, "balance_loss_clip": 1.00036609, "balance_loss_mlp": 1.00679183, "epoch": 0.6398917781451977, "flos": 71214234756480.0, "grad_norm": 0.9131837357251368, "language_loss": 0.55949271, "learning_rate": 1.1491935908904328e-06, "loss": 0.57998395, "num_input_tokens_seen": 229686730, "router_z_loss_clip": 0.01361084, "router_z_loss_mlp": 0.22265625, "step": 10643, "time_per_iteration": 3.153076171875 }, { "auxiliary_loss_clip": 0.01135616, "auxiliary_loss_mlp": 0.01026469, "balance_loss_clip": 1.01414323, "balance_loss_mlp": 1.03583801, "epoch": 0.6399519013978656, "flos": 20521835487360.0, "grad_norm": 2.1783772043641063, "language_loss": 0.7649225, "learning_rate": 1.1488516753826874e-06, "loss": 0.78654331, "num_input_tokens_seen": 229704800, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.73046875, "step": 10644, "time_per_iteration": 2.604316234588623 }, { "auxiliary_loss_clip": 0.0113431, "auxiliary_loss_mlp": 0.0103202, "balance_loss_clip": 1.0193541, "balance_loss_mlp": 1.03823912, "epoch": 0.6400120246505336, "flos": 24460840742400.0, "grad_norm": 1.8323221388107456, "language_loss": 0.82709301, "learning_rate": 1.148509790250236e-06, "loss": 0.84875631, "num_input_tokens_seen": 229725265, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 10645, "time_per_iteration": 2.618191719055176 }, { "auxiliary_loss_clip": 0.01119502, "auxiliary_loss_mlp": 0.01040412, "balance_loss_clip": 1.02668464, "balance_loss_mlp": 1.03626323, "epoch": 0.6400721479032015, "flos": 28366090191360.0, "grad_norm": 2.143870204325821, "language_loss": 0.73103058, "learning_rate": 1.14816793550528e-06, "loss": 0.75262964, "num_input_tokens_seen": 229744840, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 10646, "time_per_iteration": 2.6424994468688965 }, { "auxiliary_loss_clip": 0.011339, "auxiliary_loss_mlp": 0.01031058, "balance_loss_clip": 1.01815391, "balance_loss_mlp": 1.03507161, "epoch": 0.6401322711558696, "flos": 17816540711040.0, "grad_norm": 2.0133014655540187, "language_loss": 0.79667616, "learning_rate": 1.1478261111600191e-06, "loss": 0.8183257, "num_input_tokens_seen": 229759095, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 10647, "time_per_iteration": 2.5660290718078613 }, { "auxiliary_loss_clip": 0.01141426, "auxiliary_loss_mlp": 0.01030391, "balance_loss_clip": 1.01863122, "balance_loss_mlp": 1.03751922, "epoch": 0.6401923944085375, "flos": 26030855283840.0, "grad_norm": 1.590799001812361, "language_loss": 0.75831884, "learning_rate": 1.1474843172266525e-06, "loss": 0.78003705, "num_input_tokens_seen": 229777750, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.68359375, "step": 10648, "time_per_iteration": 2.650456428527832 }, { "auxiliary_loss_clip": 0.01121804, "auxiliary_loss_mlp": 0.01028669, "balance_loss_clip": 1.01702189, "balance_loss_mlp": 1.03390419, "epoch": 0.6402525176612055, "flos": 23586451966080.0, "grad_norm": 1.8459831048310027, "language_loss": 0.78842503, "learning_rate": 1.1471425537173764e-06, "loss": 0.80992973, "num_input_tokens_seen": 229796785, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.69921875, "step": 10649, "time_per_iteration": 4.0382983684539795 }, { "auxiliary_loss_clip": 0.01150623, "auxiliary_loss_mlp": 0.01035736, "balance_loss_clip": 1.02262294, "balance_loss_mlp": 1.03617382, "epoch": 0.6403126409138734, "flos": 18041413806720.0, "grad_norm": 1.6468897782652325, "language_loss": 0.75865293, "learning_rate": 1.1468008206443907e-06, "loss": 0.78051656, "num_input_tokens_seen": 229815425, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 10650, "time_per_iteration": 2.6291158199310303 }, { "auxiliary_loss_clip": 0.011057, "auxiliary_loss_mlp": 0.01034918, "balance_loss_clip": 1.02184665, "balance_loss_mlp": 1.03562856, "epoch": 0.6403727641665414, "flos": 21979485308160.0, "grad_norm": 2.469359971044209, "language_loss": 0.70968616, "learning_rate": 1.1464591180198872e-06, "loss": 0.73109227, "num_input_tokens_seen": 229834545, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 10651, "time_per_iteration": 2.554288148880005 }, { "auxiliary_loss_clip": 0.0113213, "auxiliary_loss_mlp": 0.01033211, "balance_loss_clip": 1.02042007, "balance_loss_mlp": 1.03494918, "epoch": 0.6404328874192093, "flos": 24895539135360.0, "grad_norm": 2.4103084924138822, "language_loss": 0.63975108, "learning_rate": 1.1461174458560634e-06, "loss": 0.66140449, "num_input_tokens_seen": 229849175, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 10652, "time_per_iteration": 2.6545867919921875 }, { "auxiliary_loss_clip": 0.01125554, "auxiliary_loss_mlp": 0.01028391, "balance_loss_clip": 1.01673198, "balance_loss_mlp": 1.03777361, "epoch": 0.6404930106718774, "flos": 17597198309760.0, "grad_norm": 2.0650679575808297, "language_loss": 0.78967154, "learning_rate": 1.1457758041651104e-06, "loss": 0.81121099, "num_input_tokens_seen": 229865400, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6953125, "step": 10653, "time_per_iteration": 2.5872719287872314 }, { "auxiliary_loss_clip": 0.01132212, "auxiliary_loss_mlp": 0.01043724, "balance_loss_clip": 1.03006244, "balance_loss_mlp": 1.0362792, "epoch": 0.6405531339245453, "flos": 20157880930560.0, "grad_norm": 2.700777367288188, "language_loss": 0.70983016, "learning_rate": 1.1454341929592231e-06, "loss": 0.73158956, "num_input_tokens_seen": 229882945, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 10654, "time_per_iteration": 2.5818819999694824 }, { "auxiliary_loss_clip": 0.01118174, "auxiliary_loss_mlp": 0.01036905, "balance_loss_clip": 1.02403092, "balance_loss_mlp": 1.03588963, "epoch": 0.6406132571772133, "flos": 21942281796480.0, "grad_norm": 1.7124971580700703, "language_loss": 0.72340846, "learning_rate": 1.14509261225059e-06, "loss": 0.74495924, "num_input_tokens_seen": 229901590, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 10655, "time_per_iteration": 3.959707736968994 }, { "auxiliary_loss_clip": 0.01136135, "auxiliary_loss_mlp": 0.01034984, "balance_loss_clip": 1.02252674, "balance_loss_mlp": 1.03673851, "epoch": 0.6406733804298813, "flos": 28768002445440.0, "grad_norm": 5.293836158150321, "language_loss": 0.8273198, "learning_rate": 1.144751062051403e-06, "loss": 0.84903103, "num_input_tokens_seen": 229922535, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 10656, "time_per_iteration": 2.637442111968994 }, { "auxiliary_loss_clip": 0.01141835, "auxiliary_loss_mlp": 0.01036229, "balance_loss_clip": 1.02463007, "balance_loss_mlp": 1.03476739, "epoch": 0.6407335036825492, "flos": 17457183095040.0, "grad_norm": 2.508755676712655, "language_loss": 0.72751266, "learning_rate": 1.1444095423738506e-06, "loss": 0.74929333, "num_input_tokens_seen": 229939575, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.7109375, "step": 10657, "time_per_iteration": 2.525161027908325 }, { "auxiliary_loss_clip": 0.01132681, "auxiliary_loss_mlp": 0.0103494, "balance_loss_clip": 1.02321601, "balance_loss_mlp": 1.03555155, "epoch": 0.6407936269352172, "flos": 22125282612480.0, "grad_norm": 1.8499838545335314, "language_loss": 0.77274013, "learning_rate": 1.144068053230121e-06, "loss": 0.79441631, "num_input_tokens_seen": 229958840, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.7109375, "step": 10658, "time_per_iteration": 2.620671510696411 }, { "auxiliary_loss_clip": 0.01110207, "auxiliary_loss_mlp": 0.01040558, "balance_loss_clip": 1.02783275, "balance_loss_mlp": 1.03742445, "epoch": 0.6408537501878852, "flos": 23110635479040.0, "grad_norm": 1.6917773339541013, "language_loss": 0.75957215, "learning_rate": 1.1437265946324002e-06, "loss": 0.78107983, "num_input_tokens_seen": 229979680, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 10659, "time_per_iteration": 2.5476596355438232 }, { "auxiliary_loss_clip": 0.0111578, "auxiliary_loss_mlp": 0.01034587, "balance_loss_clip": 1.0218612, "balance_loss_mlp": 1.03621817, "epoch": 0.6409138734405532, "flos": 16472440759680.0, "grad_norm": 1.723479068796963, "language_loss": 0.78102404, "learning_rate": 1.1433851665928751e-06, "loss": 0.80252773, "num_input_tokens_seen": 229996830, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 10660, "time_per_iteration": 5.589581251144409 }, { "auxiliary_loss_clip": 0.01130363, "auxiliary_loss_mlp": 0.01035698, "balance_loss_clip": 1.02248335, "balance_loss_mlp": 1.03757071, "epoch": 0.6409739966932211, "flos": 22777922776320.0, "grad_norm": 1.933139283701291, "language_loss": 0.68606114, "learning_rate": 1.143043769123731e-06, "loss": 0.70772171, "num_input_tokens_seen": 230015115, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 10661, "time_per_iteration": 2.510796546936035 }, { "auxiliary_loss_clip": 0.01105894, "auxiliary_loss_mlp": 0.01037389, "balance_loss_clip": 1.02604043, "balance_loss_mlp": 1.03763413, "epoch": 0.6410341199458891, "flos": 25152049134720.0, "grad_norm": 1.5245409853259875, "language_loss": 0.7574445, "learning_rate": 1.1427024022371486e-06, "loss": 0.77887726, "num_input_tokens_seen": 230035515, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.68359375, "step": 10662, "time_per_iteration": 2.5466904640197754 }, { "auxiliary_loss_clip": 0.01114094, "auxiliary_loss_mlp": 0.01030985, "balance_loss_clip": 1.01901031, "balance_loss_mlp": 1.0345459, "epoch": 0.641094243198557, "flos": 27046193028480.0, "grad_norm": 1.7940149211511507, "language_loss": 0.6934365, "learning_rate": 1.142361065945313e-06, "loss": 0.71488726, "num_input_tokens_seen": 230054355, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 10663, "time_per_iteration": 2.51090145111084 }, { "auxiliary_loss_clip": 0.01136327, "auxiliary_loss_mlp": 0.01042045, "balance_loss_clip": 1.02811503, "balance_loss_mlp": 1.03800821, "epoch": 0.641154366451225, "flos": 25374551932800.0, "grad_norm": 2.6078497103633542, "language_loss": 0.68445283, "learning_rate": 1.1420197602604052e-06, "loss": 0.7062366, "num_input_tokens_seen": 230074605, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 10664, "time_per_iteration": 2.6257619857788086 }, { "auxiliary_loss_clip": 0.01120281, "auxiliary_loss_mlp": 0.01030564, "balance_loss_clip": 1.01923966, "balance_loss_mlp": 1.03457773, "epoch": 0.6412144897038929, "flos": 25153342024320.0, "grad_norm": 1.7034748575236947, "language_loss": 0.66234452, "learning_rate": 1.1416784851946045e-06, "loss": 0.68385303, "num_input_tokens_seen": 230093820, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 10665, "time_per_iteration": 2.5617918968200684 }, { "auxiliary_loss_clip": 0.01114921, "auxiliary_loss_mlp": 0.01031601, "balance_loss_clip": 1.01922679, "balance_loss_mlp": 1.03706145, "epoch": 0.641274612956561, "flos": 23440762402560.0, "grad_norm": 3.8970497651398777, "language_loss": 0.64427906, "learning_rate": 1.1413372407600907e-06, "loss": 0.6657443, "num_input_tokens_seen": 230114285, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 10666, "time_per_iteration": 2.7037887573242188 }, { "auxiliary_loss_clip": 0.0111707, "auxiliary_loss_mlp": 0.01034185, "balance_loss_clip": 1.02185893, "balance_loss_mlp": 1.03678989, "epoch": 0.6413347362092289, "flos": 19427493778560.0, "grad_norm": 1.6414911736408584, "language_loss": 0.71164906, "learning_rate": 1.1409960269690433e-06, "loss": 0.73316169, "num_input_tokens_seen": 230132760, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71875, "step": 10667, "time_per_iteration": 2.5850162506103516 }, { "auxiliary_loss_clip": 0.01161133, "auxiliary_loss_mlp": 0.0128796, "balance_loss_clip": 1.02724648, "balance_loss_mlp": 1.03909922, "epoch": 0.6413948594618969, "flos": 17196578945280.0, "grad_norm": 2.2520512951522136, "language_loss": 0.77747244, "learning_rate": 1.1406548438336368e-06, "loss": 0.80196339, "num_input_tokens_seen": 230149690, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 10668, "time_per_iteration": 2.605537176132202 }, { "auxiliary_loss_clip": 0.01119191, "auxiliary_loss_mlp": 0.01035621, "balance_loss_clip": 1.02316928, "balance_loss_mlp": 1.04003978, "epoch": 0.6414549827145648, "flos": 22269787027200.0, "grad_norm": 1.7008840524432784, "language_loss": 0.67972612, "learning_rate": 1.1403136913660488e-06, "loss": 0.70127428, "num_input_tokens_seen": 230166950, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 10669, "time_per_iteration": 2.574969530105591 }, { "auxiliary_loss_clip": 0.01125533, "auxiliary_loss_mlp": 0.01038265, "balance_loss_clip": 1.0252533, "balance_loss_mlp": 1.04084873, "epoch": 0.6415151059672328, "flos": 19640192163840.0, "grad_norm": 2.4707495624340794, "language_loss": 0.78625417, "learning_rate": 1.139972569578453e-06, "loss": 0.80789214, "num_input_tokens_seen": 230184785, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7578125, "step": 10670, "time_per_iteration": 2.6678476333618164 }, { "auxiliary_loss_clip": 0.01137473, "auxiliary_loss_mlp": 0.01035014, "balance_loss_clip": 1.02185321, "balance_loss_mlp": 1.03629947, "epoch": 0.6415752292199008, "flos": 14865833237760.0, "grad_norm": 5.110440315289705, "language_loss": 0.88175428, "learning_rate": 1.1396314784830257e-06, "loss": 0.9034791, "num_input_tokens_seen": 230201385, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.74609375, "step": 10671, "time_per_iteration": 2.6953704357147217 }, { "auxiliary_loss_clip": 0.01115173, "auxiliary_loss_mlp": 0.01031464, "balance_loss_clip": 1.02006125, "balance_loss_mlp": 1.03813767, "epoch": 0.6416353524725688, "flos": 13735580906880.0, "grad_norm": 1.5073045382270767, "language_loss": 0.69244528, "learning_rate": 1.1392904180919363e-06, "loss": 0.71391165, "num_input_tokens_seen": 230220380, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 10672, "time_per_iteration": 2.618359327316284 }, { "auxiliary_loss_clip": 0.0112048, "auxiliary_loss_mlp": 0.01032411, "balance_loss_clip": 1.01792729, "balance_loss_mlp": 1.03674424, "epoch": 0.6416954757252368, "flos": 24534924543360.0, "grad_norm": 1.983669440680287, "language_loss": 0.7411809, "learning_rate": 1.1389493884173584e-06, "loss": 0.76270974, "num_input_tokens_seen": 230239845, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75, "step": 10673, "time_per_iteration": 2.7114756107330322 }, { "auxiliary_loss_clip": 0.01133781, "auxiliary_loss_mlp": 0.01037109, "balance_loss_clip": 1.02375805, "balance_loss_mlp": 1.03598475, "epoch": 0.6417555989779047, "flos": 27710002321920.0, "grad_norm": 2.1293587170446964, "language_loss": 0.69426119, "learning_rate": 1.1386083894714622e-06, "loss": 0.71597004, "num_input_tokens_seen": 230262420, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 10674, "time_per_iteration": 2.5967893600463867 }, { "auxiliary_loss_clip": 0.01112934, "auxiliary_loss_mlp": 0.01030793, "balance_loss_clip": 1.01762056, "balance_loss_mlp": 1.03723288, "epoch": 0.6418157222305727, "flos": 20556632787840.0, "grad_norm": 2.128694734930178, "language_loss": 0.66543669, "learning_rate": 1.1382674212664167e-06, "loss": 0.68687397, "num_input_tokens_seen": 230279950, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7578125, "step": 10675, "time_per_iteration": 2.492365598678589 }, { "auxiliary_loss_clip": 0.01109708, "auxiliary_loss_mlp": 0.01035393, "balance_loss_clip": 1.02312064, "balance_loss_mlp": 1.03890264, "epoch": 0.6418758454832406, "flos": 22601530062720.0, "grad_norm": 1.914669377195742, "language_loss": 0.661259, "learning_rate": 1.1379264838143902e-06, "loss": 0.68271005, "num_input_tokens_seen": 230299705, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.70703125, "step": 10676, "time_per_iteration": 2.533143997192383 }, { "auxiliary_loss_clip": 0.01111316, "auxiliary_loss_mlp": 0.01032431, "balance_loss_clip": 1.0182215, "balance_loss_mlp": 1.03855348, "epoch": 0.6419359687359086, "flos": 27375098889600.0, "grad_norm": 1.7991200103870615, "language_loss": 0.75529766, "learning_rate": 1.1375855771275503e-06, "loss": 0.77673507, "num_input_tokens_seen": 230320030, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7265625, "step": 10677, "time_per_iteration": 2.5442557334899902 }, { "auxiliary_loss_clip": 0.01047354, "auxiliary_loss_mlp": 0.0101031, "balance_loss_clip": 1.00899267, "balance_loss_mlp": 1.00762105, "epoch": 0.6419960919885765, "flos": 67251924552960.0, "grad_norm": 0.7600185842531484, "language_loss": 0.60781705, "learning_rate": 1.1372447012180624e-06, "loss": 0.62839365, "num_input_tokens_seen": 230381495, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.2265625, "step": 10678, "time_per_iteration": 3.2348458766937256 }, { "auxiliary_loss_clip": 0.01116619, "auxiliary_loss_mlp": 0.01034312, "balance_loss_clip": 1.02221227, "balance_loss_mlp": 1.03853738, "epoch": 0.6420562152412446, "flos": 19901873721600.0, "grad_norm": 1.728898914708432, "language_loss": 0.6723243, "learning_rate": 1.1369038560980912e-06, "loss": 0.69383359, "num_input_tokens_seen": 230401385, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 10679, "time_per_iteration": 2.5430147647857666 }, { "auxiliary_loss_clip": 0.01119623, "auxiliary_loss_mlp": 0.01030882, "balance_loss_clip": 1.01720285, "balance_loss_mlp": 1.03828168, "epoch": 0.6421163384939125, "flos": 24790177566720.0, "grad_norm": 1.8472136360188174, "language_loss": 0.73376274, "learning_rate": 1.136563041779802e-06, "loss": 0.75526786, "num_input_tokens_seen": 230421340, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 10680, "time_per_iteration": 2.6143600940704346 }, { "auxiliary_loss_clip": 0.01133059, "auxiliary_loss_mlp": 0.01027719, "balance_loss_clip": 1.01619196, "balance_loss_mlp": 1.03543878, "epoch": 0.6421764617465805, "flos": 25592816926080.0, "grad_norm": 1.8674210188290288, "language_loss": 0.67909926, "learning_rate": 1.1362222582753567e-06, "loss": 0.70070702, "num_input_tokens_seen": 230441270, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.7109375, "step": 10681, "time_per_iteration": 2.7186427116394043 }, { "auxiliary_loss_clip": 0.0110656, "auxiliary_loss_mlp": 0.01030776, "balance_loss_clip": 1.01846123, "balance_loss_mlp": 1.03650522, "epoch": 0.6422365849992484, "flos": 14134727813760.0, "grad_norm": 1.814702784053424, "language_loss": 0.74609137, "learning_rate": 1.1358815055969174e-06, "loss": 0.7674647, "num_input_tokens_seen": 230457455, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 10682, "time_per_iteration": 2.5553836822509766 }, { "auxiliary_loss_clip": 0.01147604, "auxiliary_loss_mlp": 0.01036384, "balance_loss_clip": 1.02494621, "balance_loss_mlp": 1.03546822, "epoch": 0.6422967082519164, "flos": 22383911514240.0, "grad_norm": 1.4425396255668743, "language_loss": 0.79108208, "learning_rate": 1.1355407837566433e-06, "loss": 0.812922, "num_input_tokens_seen": 230478955, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 10683, "time_per_iteration": 2.7616398334503174 }, { "auxiliary_loss_clip": 0.01109623, "auxiliary_loss_mlp": 0.01034177, "balance_loss_clip": 1.02083111, "balance_loss_mlp": 1.03766072, "epoch": 0.6423568315045844, "flos": 14647927380480.0, "grad_norm": 1.7338297178571183, "language_loss": 0.67364538, "learning_rate": 1.1352000927666966e-06, "loss": 0.69508338, "num_input_tokens_seen": 230496425, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 10684, "time_per_iteration": 2.6294498443603516 }, { "auxiliary_loss_clip": 0.01115856, "auxiliary_loss_mlp": 0.01029209, "balance_loss_clip": 1.01591134, "balance_loss_mlp": 1.03514767, "epoch": 0.6424169547572524, "flos": 26833925606400.0, "grad_norm": 1.9788806235553467, "language_loss": 0.71460062, "learning_rate": 1.1348594326392324e-06, "loss": 0.73605126, "num_input_tokens_seen": 230516245, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 10685, "time_per_iteration": 2.6099236011505127 }, { "auxiliary_loss_clip": 0.01131391, "auxiliary_loss_mlp": 0.01032057, "balance_loss_clip": 1.02028465, "balance_loss_mlp": 1.03574657, "epoch": 0.6424770780099204, "flos": 22707430335360.0, "grad_norm": 1.5579605419396978, "language_loss": 0.75889796, "learning_rate": 1.1345188033864107e-06, "loss": 0.78053248, "num_input_tokens_seen": 230534745, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 10686, "time_per_iteration": 2.62174391746521 }, { "auxiliary_loss_clip": 0.01137215, "auxiliary_loss_mlp": 0.01031523, "balance_loss_clip": 1.01773024, "balance_loss_mlp": 1.0376848, "epoch": 0.6425372012625883, "flos": 28469512425600.0, "grad_norm": 6.299042117202646, "language_loss": 0.6890313, "learning_rate": 1.1341782050203859e-06, "loss": 0.71071875, "num_input_tokens_seen": 230555895, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73046875, "step": 10687, "time_per_iteration": 2.715517520904541 }, { "auxiliary_loss_clip": 0.01128877, "auxiliary_loss_mlp": 0.01031119, "balance_loss_clip": 1.01836967, "balance_loss_mlp": 1.03828073, "epoch": 0.6425973245152563, "flos": 29351694453120.0, "grad_norm": 1.7841294428331675, "language_loss": 0.66458994, "learning_rate": 1.1338376375533153e-06, "loss": 0.68618989, "num_input_tokens_seen": 230577460, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 10688, "time_per_iteration": 2.630492925643921 }, { "auxiliary_loss_clip": 0.01107008, "auxiliary_loss_mlp": 0.0102945, "balance_loss_clip": 1.0164144, "balance_loss_mlp": 1.03579998, "epoch": 0.6426574477679242, "flos": 16430388912000.0, "grad_norm": 3.2892375006281385, "language_loss": 0.73459113, "learning_rate": 1.1334971009973492e-06, "loss": 0.7559557, "num_input_tokens_seen": 230595030, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 10689, "time_per_iteration": 2.5704703330993652 }, { "auxiliary_loss_clip": 0.01114142, "auxiliary_loss_mlp": 0.01029426, "balance_loss_clip": 1.01749325, "balance_loss_mlp": 1.03579295, "epoch": 0.6427175710205922, "flos": 21835914647040.0, "grad_norm": 8.312118678599212, "language_loss": 0.7210927, "learning_rate": 1.1331565953646443e-06, "loss": 0.74252832, "num_input_tokens_seen": 230615135, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 10690, "time_per_iteration": 4.0099945068359375 }, { "auxiliary_loss_clip": 0.01122422, "auxiliary_loss_mlp": 0.01028448, "balance_loss_clip": 1.01580572, "balance_loss_mlp": 1.0359143, "epoch": 0.6427776942732601, "flos": 17786627660160.0, "grad_norm": 1.8279313255157288, "language_loss": 0.7770955, "learning_rate": 1.1328161206673512e-06, "loss": 0.79860419, "num_input_tokens_seen": 230631965, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 10691, "time_per_iteration": 2.608461856842041 }, { "auxiliary_loss_clip": 0.01128135, "auxiliary_loss_mlp": 0.01035059, "balance_loss_clip": 1.02275681, "balance_loss_mlp": 1.03843725, "epoch": 0.6428378175259282, "flos": 15085893911040.0, "grad_norm": 2.225938229925694, "language_loss": 0.74395502, "learning_rate": 1.1324756769176183e-06, "loss": 0.76558697, "num_input_tokens_seen": 230649565, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7265625, "step": 10692, "time_per_iteration": 2.672086477279663 }, { "auxiliary_loss_clip": 0.01116895, "auxiliary_loss_mlp": 0.01033653, "balance_loss_clip": 1.02128482, "balance_loss_mlp": 1.03730702, "epoch": 0.6428979407785961, "flos": 23841776816640.0, "grad_norm": 1.8503139314777026, "language_loss": 0.61362082, "learning_rate": 1.1321352641275978e-06, "loss": 0.63512629, "num_input_tokens_seen": 230669265, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.70703125, "step": 10693, "time_per_iteration": 2.582624912261963 }, { "auxiliary_loss_clip": 0.01136822, "auxiliary_loss_mlp": 0.01026343, "balance_loss_clip": 1.01359415, "balance_loss_mlp": 1.03568709, "epoch": 0.6429580640312641, "flos": 32926852892160.0, "grad_norm": 1.5824386275991615, "language_loss": 0.59558797, "learning_rate": 1.1317948823094376e-06, "loss": 0.61721969, "num_input_tokens_seen": 230690575, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.74609375, "step": 10694, "time_per_iteration": 2.7662997245788574 }, { "auxiliary_loss_clip": 0.01127849, "auxiliary_loss_mlp": 0.01032059, "balance_loss_clip": 1.01991105, "balance_loss_mlp": 1.03773236, "epoch": 0.643018187283932, "flos": 21068359896960.0, "grad_norm": 1.4505937446060813, "language_loss": 0.80003786, "learning_rate": 1.1314545314752844e-06, "loss": 0.82163697, "num_input_tokens_seen": 230709420, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.71875, "step": 10695, "time_per_iteration": 2.628962993621826 }, { "auxiliary_loss_clip": 0.01109899, "auxiliary_loss_mlp": 0.01036892, "balance_loss_clip": 1.02376664, "balance_loss_mlp": 1.03690553, "epoch": 0.6430783105366, "flos": 26724649455360.0, "grad_norm": 1.6972801450285724, "language_loss": 0.73767769, "learning_rate": 1.1311142116372843e-06, "loss": 0.75914562, "num_input_tokens_seen": 230729350, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 10696, "time_per_iteration": 4.070636034011841 }, { "auxiliary_loss_clip": 0.0113335, "auxiliary_loss_mlp": 0.01026504, "balance_loss_clip": 1.0146966, "balance_loss_mlp": 1.03838718, "epoch": 0.643138433789268, "flos": 23696841438720.0, "grad_norm": 1.6021103626917812, "language_loss": 0.75375217, "learning_rate": 1.1307739228075838e-06, "loss": 0.77535069, "num_input_tokens_seen": 230749220, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 10697, "time_per_iteration": 2.866555690765381 }, { "auxiliary_loss_clip": 0.01122783, "auxiliary_loss_mlp": 0.01032086, "balance_loss_clip": 1.01996863, "balance_loss_mlp": 1.03587317, "epoch": 0.643198557041936, "flos": 34202184255360.0, "grad_norm": 1.5922801884418536, "language_loss": 0.6657185, "learning_rate": 1.1304336649983257e-06, "loss": 0.68726718, "num_input_tokens_seen": 230770245, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 10698, "time_per_iteration": 2.83129620552063 }, { "auxiliary_loss_clip": 0.01028747, "auxiliary_loss_mlp": 0.0100357, "balance_loss_clip": 1.00215781, "balance_loss_mlp": 1.00644469, "epoch": 0.643258680294604, "flos": 67626473621760.0, "grad_norm": 0.8491605278436571, "language_loss": 0.63399482, "learning_rate": 1.1300934382216536e-06, "loss": 0.65431798, "num_input_tokens_seen": 230837030, "router_z_loss_clip": 0.01409912, "router_z_loss_mlp": 0.22265625, "step": 10699, "time_per_iteration": 3.20121169090271 }, { "auxiliary_loss_clip": 0.01113522, "auxiliary_loss_mlp": 0.01027435, "balance_loss_clip": 1.01593125, "balance_loss_mlp": 1.03658152, "epoch": 0.6433188035472719, "flos": 25185984508800.0, "grad_norm": 1.6895466254380498, "language_loss": 0.69117701, "learning_rate": 1.129753242489708e-06, "loss": 0.71258664, "num_input_tokens_seen": 230856845, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.68359375, "step": 10700, "time_per_iteration": 2.5895214080810547 }, { "auxiliary_loss_clip": 0.01108167, "auxiliary_loss_mlp": 0.01026334, "balance_loss_clip": 1.0137161, "balance_loss_mlp": 1.03693533, "epoch": 0.6433789267999399, "flos": 24973573432320.0, "grad_norm": 1.9548218573952898, "language_loss": 0.73543775, "learning_rate": 1.1294130778146325e-06, "loss": 0.75678277, "num_input_tokens_seen": 230878785, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 10701, "time_per_iteration": 4.154095888137817 }, { "auxiliary_loss_clip": 0.01117294, "auxiliary_loss_mlp": 0.01032667, "balance_loss_clip": 1.01829052, "balance_loss_mlp": 1.03417611, "epoch": 0.6434390500526078, "flos": 17566028282880.0, "grad_norm": 1.7716245639203254, "language_loss": 0.81732446, "learning_rate": 1.129072944208563e-06, "loss": 0.83882403, "num_input_tokens_seen": 230895445, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.74609375, "step": 10702, "time_per_iteration": 4.8889319896698 }, { "auxiliary_loss_clip": 0.01123313, "auxiliary_loss_mlp": 0.01035035, "balance_loss_clip": 1.02270913, "balance_loss_mlp": 1.03562403, "epoch": 0.6434991733052758, "flos": 20843594542080.0, "grad_norm": 1.9130357225296084, "language_loss": 0.74641418, "learning_rate": 1.1287328416836408e-06, "loss": 0.76799768, "num_input_tokens_seen": 230911375, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 10703, "time_per_iteration": 2.6021251678466797 }, { "auxiliary_loss_clip": 0.01073307, "auxiliary_loss_mlp": 0.01002594, "balance_loss_clip": 1.00121713, "balance_loss_mlp": 1.00631809, "epoch": 0.6435592965579437, "flos": 66094596345600.0, "grad_norm": 0.6530191518996996, "language_loss": 0.54642165, "learning_rate": 1.1283927702520013e-06, "loss": 0.56718063, "num_input_tokens_seen": 230975990, "router_z_loss_clip": 0.01379395, "router_z_loss_mlp": 0.22265625, "step": 10704, "time_per_iteration": 3.2472522258758545 }, { "auxiliary_loss_clip": 0.01133581, "auxiliary_loss_mlp": 0.01025563, "balance_loss_clip": 1.01454258, "balance_loss_mlp": 1.03524089, "epoch": 0.6436194198106118, "flos": 23768842250880.0, "grad_norm": 3.1687615916040186, "language_loss": 0.77056944, "learning_rate": 1.1280527299257835e-06, "loss": 0.79216087, "num_input_tokens_seen": 230997110, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.71484375, "step": 10705, "time_per_iteration": 2.598747968673706 }, { "auxiliary_loss_clip": 0.01135221, "auxiliary_loss_mlp": 0.0103957, "balance_loss_clip": 1.02726769, "balance_loss_mlp": 1.0367285, "epoch": 0.6436795430632797, "flos": 20230312705920.0, "grad_norm": 3.6039540792879663, "language_loss": 0.792337, "learning_rate": 1.1277127207171201e-06, "loss": 0.81408489, "num_input_tokens_seen": 231015590, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71875, "step": 10706, "time_per_iteration": 2.5560855865478516 }, { "auxiliary_loss_clip": 0.01133564, "auxiliary_loss_mlp": 0.01033311, "balance_loss_clip": 1.02015638, "balance_loss_mlp": 1.03527057, "epoch": 0.6437396663159477, "flos": 20301846641280.0, "grad_norm": 2.7160849739249495, "language_loss": 0.80023545, "learning_rate": 1.127372742638145e-06, "loss": 0.82190418, "num_input_tokens_seen": 231033800, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 10707, "time_per_iteration": 2.5992987155914307 }, { "auxiliary_loss_clip": 0.01114899, "auxiliary_loss_mlp": 0.01031261, "balance_loss_clip": 1.01864886, "balance_loss_mlp": 1.03427351, "epoch": 0.6437997895686156, "flos": 23878585278720.0, "grad_norm": 1.8401823438323892, "language_loss": 0.8577472, "learning_rate": 1.1270327957009937e-06, "loss": 0.8792088, "num_input_tokens_seen": 231053160, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 10708, "time_per_iteration": 2.593090534210205 }, { "auxiliary_loss_clip": 0.01154114, "auxiliary_loss_mlp": 0.01041623, "balance_loss_clip": 1.02716935, "balance_loss_mlp": 1.04041815, "epoch": 0.6438599128212836, "flos": 18989275852800.0, "grad_norm": 2.14094502957057, "language_loss": 0.6526106, "learning_rate": 1.126692879917795e-06, "loss": 0.67456794, "num_input_tokens_seen": 231069470, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 10709, "time_per_iteration": 2.6196601390838623 }, { "auxiliary_loss_clip": 0.01105671, "auxiliary_loss_mlp": 0.01031227, "balance_loss_clip": 1.01900244, "balance_loss_mlp": 1.03521395, "epoch": 0.6439200360739517, "flos": 24096347481600.0, "grad_norm": 2.2175567083660135, "language_loss": 0.80712527, "learning_rate": 1.1263529953006816e-06, "loss": 0.82849431, "num_input_tokens_seen": 231088205, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 10710, "time_per_iteration": 2.575336217880249 }, { "auxiliary_loss_clip": 0.01113261, "auxiliary_loss_mlp": 0.01029422, "balance_loss_clip": 1.01749492, "balance_loss_mlp": 1.03398395, "epoch": 0.6439801593266196, "flos": 31902141697920.0, "grad_norm": 1.9462300774429437, "language_loss": 0.66144317, "learning_rate": 1.1260131418617826e-06, "loss": 0.68286997, "num_input_tokens_seen": 231107850, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 10711, "time_per_iteration": 2.6560611724853516 }, { "auxiliary_loss_clip": 0.01138482, "auxiliary_loss_mlp": 0.01029542, "balance_loss_clip": 1.01681626, "balance_loss_mlp": 1.03917241, "epoch": 0.6440402825792876, "flos": 27125879351040.0, "grad_norm": 2.820799485777526, "language_loss": 0.78644395, "learning_rate": 1.1256733196132264e-06, "loss": 0.80812413, "num_input_tokens_seen": 231127200, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 10712, "time_per_iteration": 2.647282361984253 }, { "auxiliary_loss_clip": 0.01119698, "auxiliary_loss_mlp": 0.01030625, "balance_loss_clip": 1.01798892, "balance_loss_mlp": 1.03665936, "epoch": 0.6441004058319555, "flos": 20667704618880.0, "grad_norm": 1.8677749152964025, "language_loss": 0.82717967, "learning_rate": 1.1253335285671393e-06, "loss": 0.84868288, "num_input_tokens_seen": 231146360, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7421875, "step": 10713, "time_per_iteration": 2.5469954013824463 }, { "auxiliary_loss_clip": 0.01105687, "auxiliary_loss_mlp": 0.01035563, "balance_loss_clip": 1.02354097, "balance_loss_mlp": 1.03731155, "epoch": 0.6441605290846235, "flos": 26026006947840.0, "grad_norm": 1.4678788887567669, "language_loss": 0.78345853, "learning_rate": 1.1249937687356497e-06, "loss": 0.80487102, "num_input_tokens_seen": 231168350, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 10714, "time_per_iteration": 2.607616662979126 }, { "auxiliary_loss_clip": 0.01132674, "auxiliary_loss_mlp": 0.01028176, "balance_loss_clip": 1.01638603, "balance_loss_mlp": 1.03630424, "epoch": 0.6442206523372914, "flos": 24899489631360.0, "grad_norm": 1.4305073295007518, "language_loss": 0.81604016, "learning_rate": 1.1246540401308818e-06, "loss": 0.83764869, "num_input_tokens_seen": 231188385, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 10715, "time_per_iteration": 2.653336763381958 }, { "auxiliary_loss_clip": 0.01123964, "auxiliary_loss_mlp": 0.01028359, "balance_loss_clip": 1.01583624, "balance_loss_mlp": 1.03344166, "epoch": 0.6442807755899594, "flos": 25156322853120.0, "grad_norm": 3.6313458510028167, "language_loss": 0.81525654, "learning_rate": 1.1243143427649596e-06, "loss": 0.83677971, "num_input_tokens_seen": 231209880, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 10716, "time_per_iteration": 2.6435282230377197 }, { "auxiliary_loss_clip": 0.01135433, "auxiliary_loss_mlp": 0.01038614, "balance_loss_clip": 1.02423167, "balance_loss_mlp": 1.03619897, "epoch": 0.6443408988426274, "flos": 27344503480320.0, "grad_norm": 1.9556660151944023, "language_loss": 0.78231341, "learning_rate": 1.1239746766500048e-06, "loss": 0.8040539, "num_input_tokens_seen": 231230765, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.734375, "step": 10717, "time_per_iteration": 2.6319148540496826 }, { "auxiliary_loss_clip": 0.01105957, "auxiliary_loss_mlp": 0.01030415, "balance_loss_clip": 1.01856565, "balance_loss_mlp": 1.03720021, "epoch": 0.6444010220952954, "flos": 27928339142400.0, "grad_norm": 1.709955391797683, "language_loss": 0.68141699, "learning_rate": 1.123635041798142e-06, "loss": 0.70278072, "num_input_tokens_seen": 231252350, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 10718, "time_per_iteration": 2.604454755783081 }, { "auxiliary_loss_clip": 0.01037533, "auxiliary_loss_mlp": 0.0100418, "balance_loss_clip": 1.00266612, "balance_loss_mlp": 1.00631154, "epoch": 0.6444611453479633, "flos": 71215024855680.0, "grad_norm": 0.7750765378067781, "language_loss": 0.4958382, "learning_rate": 1.123295438221489e-06, "loss": 0.51625532, "num_input_tokens_seen": 231313865, "router_z_loss_clip": 0.01513672, "router_z_loss_mlp": 0.22460938, "step": 10719, "time_per_iteration": 3.3419110774993896 }, { "auxiliary_loss_clip": 0.0111056, "auxiliary_loss_mlp": 0.01035726, "balance_loss_clip": 1.02255893, "balance_loss_mlp": 1.03757143, "epoch": 0.6445212686006313, "flos": 22705131864960.0, "grad_norm": 1.9238205866097389, "language_loss": 0.77920151, "learning_rate": 1.1229558659321674e-06, "loss": 0.80066437, "num_input_tokens_seen": 231331710, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 10720, "time_per_iteration": 2.5480093955993652 }, { "auxiliary_loss_clip": 0.01124283, "auxiliary_loss_mlp": 0.01033182, "balance_loss_clip": 1.02017045, "balance_loss_mlp": 1.03599119, "epoch": 0.6445813918532992, "flos": 21178821196800.0, "grad_norm": 2.240184215365856, "language_loss": 0.77450085, "learning_rate": 1.1226163249422955e-06, "loss": 0.79607552, "num_input_tokens_seen": 231350705, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 10721, "time_per_iteration": 2.6290087699890137 }, { "auxiliary_loss_clip": 0.01134975, "auxiliary_loss_mlp": 0.01031702, "balance_loss_clip": 1.01854098, "balance_loss_mlp": 1.03548932, "epoch": 0.6446415151059672, "flos": 25191910252800.0, "grad_norm": 2.0675541704869187, "language_loss": 0.73093355, "learning_rate": 1.1222768152639887e-06, "loss": 0.75260031, "num_input_tokens_seen": 231369550, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 10722, "time_per_iteration": 2.6129565238952637 }, { "auxiliary_loss_clip": 0.01105363, "auxiliary_loss_mlp": 0.01028015, "balance_loss_clip": 1.01654112, "balance_loss_mlp": 1.0362587, "epoch": 0.6447016383586353, "flos": 25302227898240.0, "grad_norm": 1.455088003426506, "language_loss": 0.77874637, "learning_rate": 1.1219373369093652e-06, "loss": 0.80008018, "num_input_tokens_seen": 231389285, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6875, "step": 10723, "time_per_iteration": 2.6586854457855225 }, { "auxiliary_loss_clip": 0.01120298, "auxiliary_loss_mlp": 0.01033979, "balance_loss_clip": 1.02062213, "balance_loss_mlp": 1.03732288, "epoch": 0.6447617616113032, "flos": 27703142824320.0, "grad_norm": 1.5963133259890834, "language_loss": 0.58457625, "learning_rate": 1.121597889890539e-06, "loss": 0.60611904, "num_input_tokens_seen": 231408820, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 10724, "time_per_iteration": 2.6117401123046875 }, { "auxiliary_loss_clip": 0.01114472, "auxiliary_loss_mlp": 0.01032887, "balance_loss_clip": 1.02127016, "balance_loss_mlp": 1.03743005, "epoch": 0.6448218848639712, "flos": 23039101543680.0, "grad_norm": 1.8271468520982068, "language_loss": 0.83350074, "learning_rate": 1.1212584742196258e-06, "loss": 0.85497433, "num_input_tokens_seen": 231428100, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 10725, "time_per_iteration": 2.626392126083374 }, { "auxiliary_loss_clip": 0.01122835, "auxiliary_loss_mlp": 0.01033942, "balance_loss_clip": 1.0213418, "balance_loss_mlp": 1.03391755, "epoch": 0.6448820081166391, "flos": 24496104919680.0, "grad_norm": 2.0069144829225882, "language_loss": 0.8215009, "learning_rate": 1.120919089908736e-06, "loss": 0.84306866, "num_input_tokens_seen": 231445810, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 10726, "time_per_iteration": 2.5997414588928223 }, { "auxiliary_loss_clip": 0.01119596, "auxiliary_loss_mlp": 0.01034274, "balance_loss_clip": 1.02207339, "balance_loss_mlp": 1.03692234, "epoch": 0.6449421313693071, "flos": 22419283432320.0, "grad_norm": 1.9165729801204419, "language_loss": 0.81128877, "learning_rate": 1.1205797369699835e-06, "loss": 0.83282745, "num_input_tokens_seen": 231463570, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.73828125, "step": 10727, "time_per_iteration": 2.671642541885376 }, { "auxiliary_loss_clip": 0.01122141, "auxiliary_loss_mlp": 0.01032632, "balance_loss_clip": 1.01903665, "balance_loss_mlp": 1.03672326, "epoch": 0.645002254621975, "flos": 20225715765120.0, "grad_norm": 2.258701129310186, "language_loss": 0.7907024, "learning_rate": 1.1202404154154773e-06, "loss": 0.81225014, "num_input_tokens_seen": 231482155, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.765625, "step": 10728, "time_per_iteration": 2.56480073928833 }, { "auxiliary_loss_clip": 0.01172758, "auxiliary_loss_mlp": 0.0103442, "balance_loss_clip": 1.02028751, "balance_loss_mlp": 1.0360657, "epoch": 0.645062377874643, "flos": 27855440490240.0, "grad_norm": 1.6938757715546027, "language_loss": 0.74368095, "learning_rate": 1.1199011252573284e-06, "loss": 0.76575267, "num_input_tokens_seen": 231502465, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73828125, "step": 10729, "time_per_iteration": 2.742587089538574 }, { "auxiliary_loss_clip": 0.01136179, "auxiliary_loss_mlp": 0.01036097, "balance_loss_clip": 1.02249479, "balance_loss_mlp": 1.03566813, "epoch": 0.645122501127311, "flos": 25301509626240.0, "grad_norm": 1.6107502540688412, "language_loss": 0.66341102, "learning_rate": 1.1195618665076434e-06, "loss": 0.68513376, "num_input_tokens_seen": 231522740, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 10730, "time_per_iteration": 2.6439220905303955 }, { "auxiliary_loss_clip": 0.01152104, "auxiliary_loss_mlp": 0.01029054, "balance_loss_clip": 1.01552999, "balance_loss_mlp": 1.03531396, "epoch": 0.645182624379979, "flos": 18807352444800.0, "grad_norm": 1.4427940672431125, "language_loss": 0.6355266, "learning_rate": 1.1192226391785315e-06, "loss": 0.65733814, "num_input_tokens_seen": 231542050, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 10731, "time_per_iteration": 2.633280038833618 }, { "auxiliary_loss_clip": 0.01104508, "auxiliary_loss_mlp": 0.01032846, "balance_loss_clip": 1.02014422, "balance_loss_mlp": 1.03513646, "epoch": 0.6452427476326469, "flos": 18332182402560.0, "grad_norm": 1.8544076368023052, "language_loss": 0.68200743, "learning_rate": 1.118883443282098e-06, "loss": 0.70338094, "num_input_tokens_seen": 231560380, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 10732, "time_per_iteration": 3.9228787422180176 }, { "auxiliary_loss_clip": 0.01111554, "auxiliary_loss_mlp": 0.01033053, "balance_loss_clip": 1.02085757, "balance_loss_mlp": 1.03467536, "epoch": 0.6453028708853149, "flos": 22784746360320.0, "grad_norm": 1.8163286868435842, "language_loss": 0.75802445, "learning_rate": 1.1185442788304477e-06, "loss": 0.77947056, "num_input_tokens_seen": 231580810, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 10733, "time_per_iteration": 2.586531162261963 }, { "auxiliary_loss_clip": 0.01121541, "auxiliary_loss_mlp": 0.01037756, "balance_loss_clip": 1.02448797, "balance_loss_mlp": 1.03827262, "epoch": 0.6453629941379828, "flos": 23945989150080.0, "grad_norm": 1.857769961675361, "language_loss": 0.66585255, "learning_rate": 1.118205145835684e-06, "loss": 0.68744552, "num_input_tokens_seen": 231600585, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 10734, "time_per_iteration": 2.595771312713623 }, { "auxiliary_loss_clip": 0.01104558, "auxiliary_loss_mlp": 0.01042196, "balance_loss_clip": 1.03059661, "balance_loss_mlp": 1.03796399, "epoch": 0.6454231173906508, "flos": 17676381841920.0, "grad_norm": 1.8968074826723134, "language_loss": 0.73507178, "learning_rate": 1.1178660443099124e-06, "loss": 0.75653934, "num_input_tokens_seen": 231618765, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 10735, "time_per_iteration": 2.637988567352295 }, { "auxiliary_loss_clip": 0.01151126, "auxiliary_loss_mlp": 0.01282948, "balance_loss_clip": 1.02349877, "balance_loss_mlp": 1.03562212, "epoch": 0.6454832406433189, "flos": 23292774368640.0, "grad_norm": 1.630758658868428, "language_loss": 0.74717128, "learning_rate": 1.1175269742652313e-06, "loss": 0.77151203, "num_input_tokens_seen": 231638525, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 10736, "time_per_iteration": 2.6789391040802 }, { "auxiliary_loss_clip": 0.01119562, "auxiliary_loss_mlp": 0.01030426, "balance_loss_clip": 1.01746798, "balance_loss_mlp": 1.03748894, "epoch": 0.6455433638959868, "flos": 20157198572160.0, "grad_norm": 2.198336143097407, "language_loss": 0.70417607, "learning_rate": 1.117187935713742e-06, "loss": 0.72567594, "num_input_tokens_seen": 231656785, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 10737, "time_per_iteration": 2.55466365814209 }, { "auxiliary_loss_clip": 0.01038457, "auxiliary_loss_mlp": 0.01002223, "balance_loss_clip": 1.00074482, "balance_loss_mlp": 1.00692153, "epoch": 0.6456034871486548, "flos": 66532922012160.0, "grad_norm": 0.7731103295208996, "language_loss": 0.58419013, "learning_rate": 1.1168489286675455e-06, "loss": 0.60459691, "num_input_tokens_seen": 231719075, "router_z_loss_clip": 0.01477051, "router_z_loss_mlp": 0.22460938, "step": 10738, "time_per_iteration": 4.656949043273926 }, { "auxiliary_loss_clip": 0.01132993, "auxiliary_loss_mlp": 0.01029812, "balance_loss_clip": 1.01725936, "balance_loss_mlp": 1.03637671, "epoch": 0.6456636104013227, "flos": 24206090509440.0, "grad_norm": 1.7129484572486982, "language_loss": 0.74665308, "learning_rate": 1.1165099531387379e-06, "loss": 0.7682811, "num_input_tokens_seen": 231737810, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 10739, "time_per_iteration": 2.654907703399658 }, { "auxiliary_loss_clip": 0.01127409, "auxiliary_loss_mlp": 0.0127685, "balance_loss_clip": 1.01713121, "balance_loss_mlp": 1.03658032, "epoch": 0.6457237336539907, "flos": 23624086440960.0, "grad_norm": 1.7985016702202001, "language_loss": 0.71416831, "learning_rate": 1.116171009139418e-06, "loss": 0.73821092, "num_input_tokens_seen": 231756140, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 10740, "time_per_iteration": 2.612758159637451 }, { "auxiliary_loss_clip": 0.01046796, "auxiliary_loss_mlp": 0.01000511, "balance_loss_clip": 0.99918777, "balance_loss_mlp": 1.00657988, "epoch": 0.6457838569066586, "flos": 65846023251840.0, "grad_norm": 0.663340339592675, "language_loss": 0.55327964, "learning_rate": 1.1158320966816806e-06, "loss": 0.57375276, "num_input_tokens_seen": 231823665, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.22265625, "step": 10741, "time_per_iteration": 3.234105348587036 }, { "auxiliary_loss_clip": 0.01103925, "auxiliary_loss_mlp": 0.01030964, "balance_loss_clip": 1.01933551, "balance_loss_mlp": 1.03539956, "epoch": 0.6458439801593266, "flos": 22381972179840.0, "grad_norm": 1.566488950905933, "language_loss": 0.80640793, "learning_rate": 1.1154932157776228e-06, "loss": 0.82775688, "num_input_tokens_seen": 231844500, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 10742, "time_per_iteration": 2.590503454208374 }, { "auxiliary_loss_clip": 0.01119669, "auxiliary_loss_mlp": 0.01030002, "balance_loss_clip": 1.01848078, "balance_loss_mlp": 1.03462017, "epoch": 0.6459041034119946, "flos": 24789243813120.0, "grad_norm": 1.4956424597249578, "language_loss": 0.81700516, "learning_rate": 1.1151543664393354e-06, "loss": 0.83850193, "num_input_tokens_seen": 231864510, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.67578125, "step": 10743, "time_per_iteration": 5.842194557189941 }, { "auxiliary_loss_clip": 0.01123374, "auxiliary_loss_mlp": 0.01030284, "balance_loss_clip": 1.01777899, "balance_loss_mlp": 1.03610754, "epoch": 0.6459642266646626, "flos": 18325358818560.0, "grad_norm": 2.02417623424876, "language_loss": 0.71856904, "learning_rate": 1.1148155486789134e-06, "loss": 0.74010563, "num_input_tokens_seen": 231881555, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 10744, "time_per_iteration": 2.567509174346924 }, { "auxiliary_loss_clip": 0.01105596, "auxiliary_loss_mlp": 0.01025322, "balance_loss_clip": 1.01310253, "balance_loss_mlp": 1.03493464, "epoch": 0.6460243499173305, "flos": 43581368891520.0, "grad_norm": 1.7054465006788202, "language_loss": 0.66527998, "learning_rate": 1.1144767625084477e-06, "loss": 0.68658912, "num_input_tokens_seen": 231905945, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 10745, "time_per_iteration": 2.750060558319092 }, { "auxiliary_loss_clip": 0.01116787, "auxiliary_loss_mlp": 0.01034517, "balance_loss_clip": 1.0217917, "balance_loss_mlp": 1.03733838, "epoch": 0.6460844731699985, "flos": 19244026085760.0, "grad_norm": 2.176340619560887, "language_loss": 0.73487604, "learning_rate": 1.1141380079400282e-06, "loss": 0.75638902, "num_input_tokens_seen": 231922535, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 10746, "time_per_iteration": 2.5438616275787354 }, { "auxiliary_loss_clip": 0.01117717, "auxiliary_loss_mlp": 0.0103522, "balance_loss_clip": 1.02359676, "balance_loss_mlp": 1.03700519, "epoch": 0.6461445964226664, "flos": 27453348668160.0, "grad_norm": 1.3753354227652472, "language_loss": 0.66266114, "learning_rate": 1.1137992849857437e-06, "loss": 0.68419051, "num_input_tokens_seen": 231944800, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.71875, "step": 10747, "time_per_iteration": 2.6037862300872803 }, { "auxiliary_loss_clip": 0.0112609, "auxiliary_loss_mlp": 0.01037404, "balance_loss_clip": 1.0256139, "balance_loss_mlp": 1.03692853, "epoch": 0.6462047196753344, "flos": 20295489934080.0, "grad_norm": 1.625698556719924, "language_loss": 0.67180073, "learning_rate": 1.1134605936576841e-06, "loss": 0.69343567, "num_input_tokens_seen": 231962970, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.70703125, "step": 10748, "time_per_iteration": 2.6346442699432373 }, { "auxiliary_loss_clip": 0.01111056, "auxiliary_loss_mlp": 0.01042242, "balance_loss_clip": 1.02875388, "balance_loss_mlp": 1.03663063, "epoch": 0.6462648429280025, "flos": 22018340845440.0, "grad_norm": 1.86857353052748, "language_loss": 0.75541264, "learning_rate": 1.1131219339679355e-06, "loss": 0.77694559, "num_input_tokens_seen": 231981195, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 10749, "time_per_iteration": 2.5296661853790283 }, { "auxiliary_loss_clip": 0.01134264, "auxiliary_loss_mlp": 0.01032295, "balance_loss_clip": 1.02026677, "balance_loss_mlp": 1.03498745, "epoch": 0.6463249661806704, "flos": 27781141207680.0, "grad_norm": 1.530358084256228, "language_loss": 0.77149951, "learning_rate": 1.1127833059285837e-06, "loss": 0.79316509, "num_input_tokens_seen": 232001735, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.72265625, "step": 10750, "time_per_iteration": 2.590557098388672 }, { "auxiliary_loss_clip": 0.01128525, "auxiliary_loss_mlp": 0.01030522, "balance_loss_clip": 1.01684833, "balance_loss_mlp": 1.03665745, "epoch": 0.6463850894333384, "flos": 22050588280320.0, "grad_norm": 2.2374557962133026, "language_loss": 0.68741858, "learning_rate": 1.1124447095517132e-06, "loss": 0.70900899, "num_input_tokens_seen": 232019830, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 10751, "time_per_iteration": 2.585279941558838 }, { "auxiliary_loss_clip": 0.01125218, "auxiliary_loss_mlp": 0.01032161, "balance_loss_clip": 1.01911902, "balance_loss_mlp": 1.03481126, "epoch": 0.6464452126860063, "flos": 21106245767040.0, "grad_norm": 1.7298302729823594, "language_loss": 0.71223718, "learning_rate": 1.1121061448494082e-06, "loss": 0.73381096, "num_input_tokens_seen": 232039625, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 10752, "time_per_iteration": 2.5683443546295166 }, { "auxiliary_loss_clip": 0.01138223, "auxiliary_loss_mlp": 0.01036154, "balance_loss_clip": 1.02171791, "balance_loss_mlp": 1.03768814, "epoch": 0.6465053359386743, "flos": 16028045694720.0, "grad_norm": 2.4722319302926574, "language_loss": 0.78166056, "learning_rate": 1.111767611833751e-06, "loss": 0.80340433, "num_input_tokens_seen": 232055855, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.73828125, "step": 10753, "time_per_iteration": 2.538985252380371 }, { "auxiliary_loss_clip": 0.0110766, "auxiliary_loss_mlp": 0.01040955, "balance_loss_clip": 1.02858663, "balance_loss_mlp": 1.03711748, "epoch": 0.6465654591913422, "flos": 23398674641280.0, "grad_norm": 1.6781620825224537, "language_loss": 0.84946692, "learning_rate": 1.111429110516822e-06, "loss": 0.87095308, "num_input_tokens_seen": 232073475, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 10754, "time_per_iteration": 2.499770402908325 }, { "auxiliary_loss_clip": 0.01123747, "auxiliary_loss_mlp": 0.01033675, "balance_loss_clip": 1.0208894, "balance_loss_mlp": 1.03599358, "epoch": 0.6466255824440102, "flos": 15377273038080.0, "grad_norm": 2.2801111753122325, "language_loss": 0.59883279, "learning_rate": 1.1110906409107042e-06, "loss": 0.62040699, "num_input_tokens_seen": 232091090, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 10755, "time_per_iteration": 2.5380070209503174 }, { "auxiliary_loss_clip": 0.01134721, "auxiliary_loss_mlp": 0.01034096, "balance_loss_clip": 1.02142382, "balance_loss_mlp": 1.03623331, "epoch": 0.6466857056966782, "flos": 16252846963200.0, "grad_norm": 2.077387029032673, "language_loss": 0.68090308, "learning_rate": 1.1107522030274733e-06, "loss": 0.70259124, "num_input_tokens_seen": 232107320, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 10756, "time_per_iteration": 2.563870429992676 }, { "auxiliary_loss_clip": 0.01150285, "auxiliary_loss_mlp": 0.01036381, "balance_loss_clip": 1.02373266, "balance_loss_mlp": 1.03881407, "epoch": 0.6467458289493462, "flos": 21178246579200.0, "grad_norm": 2.3213156230761807, "language_loss": 0.74122238, "learning_rate": 1.110413796879209e-06, "loss": 0.76308906, "num_input_tokens_seen": 232123930, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7578125, "step": 10757, "time_per_iteration": 2.566361904144287 }, { "auxiliary_loss_clip": 0.01143963, "auxiliary_loss_mlp": 0.01033579, "balance_loss_clip": 1.0210557, "balance_loss_mlp": 1.03813004, "epoch": 0.6468059522020141, "flos": 17968299672960.0, "grad_norm": 1.5034124379280505, "language_loss": 0.74508572, "learning_rate": 1.1100754224779879e-06, "loss": 0.76686114, "num_input_tokens_seen": 232142905, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 10758, "time_per_iteration": 2.5708258152008057 }, { "auxiliary_loss_clip": 0.01134801, "auxiliary_loss_mlp": 0.01034824, "balance_loss_clip": 1.02088249, "balance_loss_mlp": 1.03674042, "epoch": 0.6468660754546821, "flos": 17890157635200.0, "grad_norm": 1.7512514591704993, "language_loss": 0.6760155, "learning_rate": 1.1097370798358871e-06, "loss": 0.69771171, "num_input_tokens_seen": 232162230, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 10759, "time_per_iteration": 2.592674493789673 }, { "auxiliary_loss_clip": 0.01138853, "auxiliary_loss_mlp": 0.01033583, "balance_loss_clip": 1.01991594, "balance_loss_mlp": 1.03724408, "epoch": 0.64692619870735, "flos": 22600991358720.0, "grad_norm": 1.6339675030578087, "language_loss": 0.75614923, "learning_rate": 1.1093987689649784e-06, "loss": 0.77787364, "num_input_tokens_seen": 232182700, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 10760, "time_per_iteration": 2.625875473022461 }, { "auxiliary_loss_clip": 0.01123239, "auxiliary_loss_mlp": 0.01028711, "balance_loss_clip": 1.01620626, "balance_loss_mlp": 1.03476775, "epoch": 0.646986321960018, "flos": 49600786993920.0, "grad_norm": 1.75392088155918, "language_loss": 0.65656328, "learning_rate": 1.1090604898773377e-06, "loss": 0.67808276, "num_input_tokens_seen": 232208235, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 10761, "time_per_iteration": 2.8131113052368164 }, { "auxiliary_loss_clip": 0.0114484, "auxiliary_loss_mlp": 0.01034626, "balance_loss_clip": 1.02135205, "balance_loss_mlp": 1.03651428, "epoch": 0.6470464452126861, "flos": 21908454163200.0, "grad_norm": 2.424823337218999, "language_loss": 0.69377995, "learning_rate": 1.1087222425850362e-06, "loss": 0.71557462, "num_input_tokens_seen": 232228720, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 10762, "time_per_iteration": 2.5932040214538574 }, { "auxiliary_loss_clip": 0.01111444, "auxiliary_loss_mlp": 0.0103382, "balance_loss_clip": 1.02078485, "balance_loss_mlp": 1.03704286, "epoch": 0.647106568465354, "flos": 18106124158080.0, "grad_norm": 3.8708583947362545, "language_loss": 0.82453489, "learning_rate": 1.1083840271001452e-06, "loss": 0.84598756, "num_input_tokens_seen": 232244655, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 10763, "time_per_iteration": 2.556924343109131 }, { "auxiliary_loss_clip": 0.01028903, "auxiliary_loss_mlp": 0.0100089, "balance_loss_clip": 0.99947184, "balance_loss_mlp": 1.00667357, "epoch": 0.647166691718022, "flos": 69480038125440.0, "grad_norm": 0.7143055465654241, "language_loss": 0.57716525, "learning_rate": 1.1080458434347337e-06, "loss": 0.59746319, "num_input_tokens_seen": 232308685, "router_z_loss_clip": 0.01416016, "router_z_loss_mlp": 0.22265625, "step": 10764, "time_per_iteration": 3.319498062133789 }, { "auxiliary_loss_clip": 0.01134238, "auxiliary_loss_mlp": 0.01032928, "balance_loss_clip": 1.01966619, "balance_loss_mlp": 1.0366478, "epoch": 0.6472268149706899, "flos": 34095170661120.0, "grad_norm": 1.9663570208607029, "language_loss": 0.60744631, "learning_rate": 1.107707691600873e-06, "loss": 0.62911803, "num_input_tokens_seen": 232327520, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 10765, "time_per_iteration": 2.727107286453247 }, { "auxiliary_loss_clip": 0.01107271, "auxiliary_loss_mlp": 0.0102864, "balance_loss_clip": 1.01649845, "balance_loss_mlp": 1.03653407, "epoch": 0.6472869382233579, "flos": 28111232217600.0, "grad_norm": 7.794648347206227, "language_loss": 0.63388598, "learning_rate": 1.1073695716106293e-06, "loss": 0.65524513, "num_input_tokens_seen": 232349025, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.70703125, "step": 10766, "time_per_iteration": 2.6311116218566895 }, { "auxiliary_loss_clip": 0.01125388, "auxiliary_loss_mlp": 0.01035784, "balance_loss_clip": 1.02300453, "balance_loss_mlp": 1.03682899, "epoch": 0.6473470614760258, "flos": 22492146170880.0, "grad_norm": 1.7801157058740749, "language_loss": 0.75680304, "learning_rate": 1.1070314834760693e-06, "loss": 0.77841479, "num_input_tokens_seen": 232367835, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 10767, "time_per_iteration": 2.5841033458709717 }, { "auxiliary_loss_clip": 0.01116863, "auxiliary_loss_mlp": 0.01035535, "balance_loss_clip": 1.02270794, "balance_loss_mlp": 1.0367322, "epoch": 0.6474071847286939, "flos": 14538938538240.0, "grad_norm": 3.124541136529268, "language_loss": 0.77675086, "learning_rate": 1.1066934272092588e-06, "loss": 0.79827487, "num_input_tokens_seen": 232385840, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 10768, "time_per_iteration": 2.5453367233276367 }, { "auxiliary_loss_clip": 0.0114439, "auxiliary_loss_mlp": 0.01031061, "balance_loss_clip": 1.0183171, "balance_loss_mlp": 1.0365057, "epoch": 0.6474673079813618, "flos": 24098214988800.0, "grad_norm": 1.6515641681287283, "language_loss": 0.7166177, "learning_rate": 1.106355402822262e-06, "loss": 0.73837227, "num_input_tokens_seen": 232406205, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 10769, "time_per_iteration": 2.625546455383301 }, { "auxiliary_loss_clip": 0.01105866, "auxiliary_loss_mlp": 0.01037088, "balance_loss_clip": 1.02499402, "balance_loss_mlp": 1.03753316, "epoch": 0.6475274312340298, "flos": 14976186796800.0, "grad_norm": 2.0772254105296275, "language_loss": 0.72320449, "learning_rate": 1.106017410327142e-06, "loss": 0.74463403, "num_input_tokens_seen": 232424995, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.68359375, "step": 10770, "time_per_iteration": 2.539085626602173 }, { "auxiliary_loss_clip": 0.0114887, "auxiliary_loss_mlp": 0.01031343, "balance_loss_clip": 1.01745462, "balance_loss_mlp": 1.03759146, "epoch": 0.6475875544866977, "flos": 25045322849280.0, "grad_norm": 1.579782691628728, "language_loss": 0.73284656, "learning_rate": 1.1056794497359604e-06, "loss": 0.75464869, "num_input_tokens_seen": 232445870, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 10771, "time_per_iteration": 2.6519534587860107 }, { "auxiliary_loss_clip": 0.01135558, "auxiliary_loss_mlp": 0.01037449, "balance_loss_clip": 1.02417493, "balance_loss_mlp": 1.03795624, "epoch": 0.6476476777393657, "flos": 16472153450880.0, "grad_norm": 1.9697640029234977, "language_loss": 0.73633033, "learning_rate": 1.1053415210607803e-06, "loss": 0.7580604, "num_input_tokens_seen": 232464285, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 10772, "time_per_iteration": 2.587029457092285 }, { "auxiliary_loss_clip": 0.01123146, "auxiliary_loss_mlp": 0.01029805, "balance_loss_clip": 1.01822937, "balance_loss_mlp": 1.03570354, "epoch": 0.6477078009920336, "flos": 25812267068160.0, "grad_norm": 1.5053016254433544, "language_loss": 0.83060575, "learning_rate": 1.1050036243136587e-06, "loss": 0.85213518, "num_input_tokens_seen": 232485815, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6953125, "step": 10773, "time_per_iteration": 2.622023105621338 }, { "auxiliary_loss_clip": 0.01113327, "auxiliary_loss_mlp": 0.01274949, "balance_loss_clip": 1.0159955, "balance_loss_mlp": 1.03590155, "epoch": 0.6477679242447016, "flos": 17676130446720.0, "grad_norm": 1.6462648226038867, "language_loss": 0.78335065, "learning_rate": 1.104665759506656e-06, "loss": 0.80723345, "num_input_tokens_seen": 232504875, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 10774, "time_per_iteration": 3.9566595554351807 }, { "auxiliary_loss_clip": 0.01119411, "auxiliary_loss_mlp": 0.01039702, "balance_loss_clip": 1.02602315, "balance_loss_mlp": 1.03754187, "epoch": 0.6478280474973696, "flos": 21032305620480.0, "grad_norm": 2.0026255208455477, "language_loss": 0.68804175, "learning_rate": 1.1043279266518285e-06, "loss": 0.70963287, "num_input_tokens_seen": 232521945, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 10775, "time_per_iteration": 2.5560250282287598 }, { "auxiliary_loss_clip": 0.01139001, "auxiliary_loss_mlp": 0.01035895, "balance_loss_clip": 1.02311611, "balance_loss_mlp": 1.03768349, "epoch": 0.6478881707500376, "flos": 21616931381760.0, "grad_norm": 1.79496161591095, "language_loss": 0.65838718, "learning_rate": 1.103990125761235e-06, "loss": 0.6801362, "num_input_tokens_seen": 232541500, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7421875, "step": 10776, "time_per_iteration": 2.592097043991089 }, { "auxiliary_loss_clip": 0.01126923, "auxiliary_loss_mlp": 0.01037608, "balance_loss_clip": 1.02431011, "balance_loss_mlp": 1.03562021, "epoch": 0.6479482940027056, "flos": 18442571875200.0, "grad_norm": 4.264521219398013, "language_loss": 0.79074061, "learning_rate": 1.1036523568469276e-06, "loss": 0.81238592, "num_input_tokens_seen": 232559720, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 10777, "time_per_iteration": 2.563328504562378 }, { "auxiliary_loss_clip": 0.01118345, "auxiliary_loss_mlp": 0.010313, "balance_loss_clip": 1.01846147, "balance_loss_mlp": 1.03813648, "epoch": 0.6480084172553735, "flos": 22164066322560.0, "grad_norm": 2.072394068472354, "language_loss": 0.73186743, "learning_rate": 1.1033146199209627e-06, "loss": 0.75336385, "num_input_tokens_seen": 232579370, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 10778, "time_per_iteration": 2.5845181941986084 }, { "auxiliary_loss_clip": 0.01113852, "auxiliary_loss_mlp": 0.01032622, "balance_loss_clip": 1.02033186, "balance_loss_mlp": 1.03469789, "epoch": 0.6480685405080415, "flos": 24316228586880.0, "grad_norm": 1.379465688338646, "language_loss": 0.78043032, "learning_rate": 1.1029769149953922e-06, "loss": 0.80189508, "num_input_tokens_seen": 232600495, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 10779, "time_per_iteration": 4.1042726039886475 }, { "auxiliary_loss_clip": 0.01038407, "auxiliary_loss_mlp": 0.01000707, "balance_loss_clip": 0.99940157, "balance_loss_mlp": 1.00755143, "epoch": 0.6481286637607094, "flos": 59891207760000.0, "grad_norm": 0.7214353437515583, "language_loss": 0.59430909, "learning_rate": 1.1026392420822684e-06, "loss": 0.6147002, "num_input_tokens_seen": 232663165, "router_z_loss_clip": 0.01306152, "router_z_loss_mlp": 0.22070312, "step": 10780, "time_per_iteration": 3.119917392730713 }, { "auxiliary_loss_clip": 0.01177651, "auxiliary_loss_mlp": 0.01032703, "balance_loss_clip": 1.02042425, "balance_loss_mlp": 1.036057, "epoch": 0.6481887870133775, "flos": 25484187219840.0, "grad_norm": 1.8602212891410483, "language_loss": 0.78914791, "learning_rate": 1.1023016011936417e-06, "loss": 0.81125152, "num_input_tokens_seen": 232683385, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 10781, "time_per_iteration": 2.6626293659210205 }, { "auxiliary_loss_clip": 0.01112066, "auxiliary_loss_mlp": 0.0103654, "balance_loss_clip": 1.02224064, "balance_loss_mlp": 1.03851473, "epoch": 0.6482489102660454, "flos": 19930206574080.0, "grad_norm": 2.1975370135938874, "language_loss": 0.78822011, "learning_rate": 1.1019639923415618e-06, "loss": 0.80970615, "num_input_tokens_seen": 232699095, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.734375, "step": 10782, "time_per_iteration": 2.5697691440582275 }, { "auxiliary_loss_clip": 0.01120975, "auxiliary_loss_mlp": 0.01284635, "balance_loss_clip": 1.02435231, "balance_loss_mlp": 1.03856349, "epoch": 0.6483090335187134, "flos": 26979471515520.0, "grad_norm": 2.7786474021952916, "language_loss": 0.64058918, "learning_rate": 1.1016264155380768e-06, "loss": 0.66464525, "num_input_tokens_seen": 232717920, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 10783, "time_per_iteration": 2.5807695388793945 }, { "auxiliary_loss_clip": 0.01113223, "auxiliary_loss_mlp": 0.01034419, "balance_loss_clip": 1.01997638, "balance_loss_mlp": 1.03884172, "epoch": 0.6483691567713813, "flos": 25077965333760.0, "grad_norm": 1.9544382058679124, "language_loss": 0.88430166, "learning_rate": 1.1012888707952335e-06, "loss": 0.90577805, "num_input_tokens_seen": 232737605, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7421875, "step": 10784, "time_per_iteration": 2.6112184524536133 }, { "auxiliary_loss_clip": 0.01124199, "auxiliary_loss_mlp": 0.01028687, "balance_loss_clip": 1.0154314, "balance_loss_mlp": 1.03586257, "epoch": 0.6484292800240493, "flos": 16105972250880.0, "grad_norm": 2.6169945490058066, "language_loss": 0.72699356, "learning_rate": 1.1009513581250795e-06, "loss": 0.7485224, "num_input_tokens_seen": 232755110, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 10785, "time_per_iteration": 5.493525743484497 }, { "auxiliary_loss_clip": 0.01132747, "auxiliary_loss_mlp": 0.01031852, "balance_loss_clip": 1.01935244, "balance_loss_mlp": 1.03645349, "epoch": 0.6484894032767172, "flos": 28840398307200.0, "grad_norm": 1.4729800154246304, "language_loss": 0.69419742, "learning_rate": 1.1006138775396588e-06, "loss": 0.71584344, "num_input_tokens_seen": 232779040, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 10786, "time_per_iteration": 2.6660845279693604 }, { "auxiliary_loss_clip": 0.0113424, "auxiliary_loss_mlp": 0.01034259, "balance_loss_clip": 1.02145565, "balance_loss_mlp": 1.0357058, "epoch": 0.6485495265293852, "flos": 30227052896640.0, "grad_norm": 1.8116998218323916, "language_loss": 0.71281457, "learning_rate": 1.1002764290510151e-06, "loss": 0.73449963, "num_input_tokens_seen": 232800515, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 10787, "time_per_iteration": 2.6814844608306885 }, { "auxiliary_loss_clip": 0.01114082, "auxiliary_loss_mlp": 0.01033639, "balance_loss_clip": 1.01985192, "balance_loss_mlp": 1.03880239, "epoch": 0.6486096497820532, "flos": 20082181017600.0, "grad_norm": 1.9479050464249967, "language_loss": 0.84212279, "learning_rate": 1.0999390126711907e-06, "loss": 0.86360002, "num_input_tokens_seen": 232818450, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75390625, "step": 10788, "time_per_iteration": 2.5810139179229736 }, { "auxiliary_loss_clip": 0.01137222, "auxiliary_loss_mlp": 0.01033225, "balance_loss_clip": 1.02008235, "balance_loss_mlp": 1.03885794, "epoch": 0.6486697730347212, "flos": 17129067333120.0, "grad_norm": 1.7666054112274854, "language_loss": 0.76936531, "learning_rate": 1.0996016284122293e-06, "loss": 0.79106975, "num_input_tokens_seen": 232834785, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 10789, "time_per_iteration": 2.6467947959899902 }, { "auxiliary_loss_clip": 0.01115561, "auxiliary_loss_mlp": 0.01030421, "balance_loss_clip": 1.01812506, "balance_loss_mlp": 1.03705049, "epoch": 0.6487298962873892, "flos": 38911940570880.0, "grad_norm": 1.6422948456450461, "language_loss": 0.75626367, "learning_rate": 1.0992642762861682e-06, "loss": 0.77772349, "num_input_tokens_seen": 232856050, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 10790, "time_per_iteration": 2.860879898071289 }, { "auxiliary_loss_clip": 0.01109246, "auxiliary_loss_mlp": 0.0103019, "balance_loss_clip": 1.01803613, "balance_loss_mlp": 1.03930378, "epoch": 0.6487900195400571, "flos": 11947840076160.0, "grad_norm": 2.3533503597512686, "language_loss": 0.59970164, "learning_rate": 1.0989269563050487e-06, "loss": 0.62109596, "num_input_tokens_seen": 232873945, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69921875, "step": 10791, "time_per_iteration": 2.5969738960266113 }, { "auxiliary_loss_clip": 0.01134089, "auxiliary_loss_mlp": 0.01029811, "balance_loss_clip": 1.01758039, "balance_loss_mlp": 1.03586113, "epoch": 0.6488501427927251, "flos": 22344445445760.0, "grad_norm": 1.6447945767591132, "language_loss": 0.85954082, "learning_rate": 1.0985896684809076e-06, "loss": 0.88117987, "num_input_tokens_seen": 232892160, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71875, "step": 10792, "time_per_iteration": 2.6454763412475586 }, { "auxiliary_loss_clip": 0.01130649, "auxiliary_loss_mlp": 0.01040616, "balance_loss_clip": 1.0269959, "balance_loss_mlp": 1.03787339, "epoch": 0.648910266045393, "flos": 22236282616320.0, "grad_norm": 2.3860933554681774, "language_loss": 0.77992237, "learning_rate": 1.0982524128257842e-06, "loss": 0.80163497, "num_input_tokens_seen": 232911725, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 10793, "time_per_iteration": 2.6197845935821533 }, { "auxiliary_loss_clip": 0.01139585, "auxiliary_loss_mlp": 0.01029555, "balance_loss_clip": 1.01696086, "balance_loss_mlp": 1.04002857, "epoch": 0.6489703892980611, "flos": 25301258231040.0, "grad_norm": 1.8276508118644352, "language_loss": 0.75270122, "learning_rate": 1.0979151893517108e-06, "loss": 0.7743926, "num_input_tokens_seen": 232929085, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 10794, "time_per_iteration": 2.6070244312286377 }, { "auxiliary_loss_clip": 0.0111524, "auxiliary_loss_mlp": 0.0127812, "balance_loss_clip": 1.01855755, "balance_loss_mlp": 1.03463423, "epoch": 0.649030512550729, "flos": 24571912573440.0, "grad_norm": 1.7914772320244154, "language_loss": 0.69819385, "learning_rate": 1.097577998070725e-06, "loss": 0.7221275, "num_input_tokens_seen": 232949455, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 10795, "time_per_iteration": 2.6232714653015137 }, { "auxiliary_loss_clip": 0.01130345, "auxiliary_loss_mlp": 0.01034584, "balance_loss_clip": 1.02083302, "balance_loss_mlp": 1.03909636, "epoch": 0.649090635803397, "flos": 26244702904320.0, "grad_norm": 1.6661989345506591, "language_loss": 0.53502673, "learning_rate": 1.0972408389948586e-06, "loss": 0.55667603, "num_input_tokens_seen": 232969445, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 10796, "time_per_iteration": 2.641995429992676 }, { "auxiliary_loss_clip": 0.01117703, "auxiliary_loss_mlp": 0.01031096, "balance_loss_clip": 1.01906157, "balance_loss_mlp": 1.0368818, "epoch": 0.6491507590560649, "flos": 24937375501440.0, "grad_norm": 2.054370668283823, "language_loss": 0.77873957, "learning_rate": 1.0969037121361448e-06, "loss": 0.80022752, "num_input_tokens_seen": 232988900, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.71875, "step": 10797, "time_per_iteration": 2.621635913848877 }, { "auxiliary_loss_clip": 0.01141614, "auxiliary_loss_mlp": 0.01029581, "balance_loss_clip": 1.01780272, "balance_loss_mlp": 1.03785586, "epoch": 0.6492108823087329, "flos": 19499781899520.0, "grad_norm": 1.824692721725387, "language_loss": 0.70527357, "learning_rate": 1.0965666175066144e-06, "loss": 0.72698545, "num_input_tokens_seen": 233005060, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 10798, "time_per_iteration": 2.5999369621276855 }, { "auxiliary_loss_clip": 0.0114215, "auxiliary_loss_mlp": 0.0102926, "balance_loss_clip": 1.01735139, "balance_loss_mlp": 1.03552902, "epoch": 0.6492710055614008, "flos": 19719303868800.0, "grad_norm": 1.7143836738662146, "language_loss": 0.76934886, "learning_rate": 1.0962295551182976e-06, "loss": 0.79106295, "num_input_tokens_seen": 233023375, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 10799, "time_per_iteration": 2.6317124366760254 }, { "auxiliary_loss_clip": 0.01119932, "auxiliary_loss_mlp": 0.01037283, "balance_loss_clip": 1.02324581, "balance_loss_mlp": 1.03594661, "epoch": 0.6493311288140688, "flos": 24317018686080.0, "grad_norm": 1.7788954825378194, "language_loss": 0.719796, "learning_rate": 1.095892524983223e-06, "loss": 0.74136811, "num_input_tokens_seen": 233043130, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 10800, "time_per_iteration": 2.5886714458465576 }, { "auxiliary_loss_clip": 0.01039897, "auxiliary_loss_mlp": 0.0100223, "balance_loss_clip": 1.00085294, "balance_loss_mlp": 1.00864911, "epoch": 0.6493912520667368, "flos": 70934635290240.0, "grad_norm": 0.7732036454387685, "language_loss": 0.60207397, "learning_rate": 1.0955555271134182e-06, "loss": 0.62249523, "num_input_tokens_seen": 233110560, "router_z_loss_clip": 0.01379395, "router_z_loss_mlp": 0.22167969, "step": 10801, "time_per_iteration": 3.2516605854034424 }, { "auxiliary_loss_clip": 0.01123879, "auxiliary_loss_mlp": 0.010378, "balance_loss_clip": 1.02406096, "balance_loss_mlp": 1.04019558, "epoch": 0.6494513753194048, "flos": 25337779384320.0, "grad_norm": 1.776046713594641, "language_loss": 0.78467733, "learning_rate": 1.0952185615209107e-06, "loss": 0.80629408, "num_input_tokens_seen": 233130080, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.74609375, "step": 10802, "time_per_iteration": 2.5939061641693115 }, { "auxiliary_loss_clip": 0.01109502, "auxiliary_loss_mlp": 0.01039158, "balance_loss_clip": 1.025962, "balance_loss_mlp": 1.03668392, "epoch": 0.6495114985720728, "flos": 24681978823680.0, "grad_norm": 1.8183730297595506, "language_loss": 0.74735188, "learning_rate": 1.0948816282177253e-06, "loss": 0.76883847, "num_input_tokens_seen": 233150235, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 10803, "time_per_iteration": 2.6732289791107178 }, { "auxiliary_loss_clip": 0.0115321, "auxiliary_loss_mlp": 0.01032627, "balance_loss_clip": 1.02044439, "balance_loss_mlp": 1.03692317, "epoch": 0.6495716218247407, "flos": 23651162317440.0, "grad_norm": 2.0659713831192534, "language_loss": 0.6988731, "learning_rate": 1.0945447272158863e-06, "loss": 0.7207315, "num_input_tokens_seen": 233166710, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71484375, "step": 10804, "time_per_iteration": 2.713939666748047 }, { "auxiliary_loss_clip": 0.01118108, "auxiliary_loss_mlp": 0.01033537, "balance_loss_clip": 1.01964283, "balance_loss_mlp": 1.03710651, "epoch": 0.6496317450774087, "flos": 22346169298560.0, "grad_norm": 1.783281848044352, "language_loss": 0.73196256, "learning_rate": 1.0942078585274162e-06, "loss": 0.753479, "num_input_tokens_seen": 233185445, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 10805, "time_per_iteration": 2.54962420463562 }, { "auxiliary_loss_clip": 0.01126155, "auxiliary_loss_mlp": 0.01033652, "balance_loss_clip": 1.02142143, "balance_loss_mlp": 1.03577685, "epoch": 0.6496918683300766, "flos": 30518647505280.0, "grad_norm": 3.1521288577155153, "language_loss": 0.66229063, "learning_rate": 1.0938710221643392e-06, "loss": 0.68388873, "num_input_tokens_seen": 233205805, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.72265625, "step": 10806, "time_per_iteration": 2.635390043258667 }, { "auxiliary_loss_clip": 0.01128906, "auxiliary_loss_mlp": 0.01282058, "balance_loss_clip": 1.02124333, "balance_loss_mlp": 1.03656161, "epoch": 0.6497519915827447, "flos": 12458992567680.0, "grad_norm": 2.09346437166596, "language_loss": 0.7881189, "learning_rate": 1.0935342181386729e-06, "loss": 0.81222856, "num_input_tokens_seen": 233224215, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.74609375, "step": 10807, "time_per_iteration": 2.5201404094696045 }, { "auxiliary_loss_clip": 0.01048938, "auxiliary_loss_mlp": 0.01004806, "balance_loss_clip": 1.00347638, "balance_loss_mlp": 1.00865483, "epoch": 0.6498121148354126, "flos": 69093748287360.0, "grad_norm": 0.7965700511541888, "language_loss": 0.58951163, "learning_rate": 1.0931974464624394e-06, "loss": 0.61004913, "num_input_tokens_seen": 233294440, "router_z_loss_clip": 0.01330566, "router_z_loss_mlp": 0.22363281, "step": 10808, "time_per_iteration": 3.318557024002075 }, { "auxiliary_loss_clip": 0.01126638, "auxiliary_loss_mlp": 0.01030444, "balance_loss_clip": 1.0177958, "balance_loss_mlp": 1.03940213, "epoch": 0.6498722380880806, "flos": 36897135914880.0, "grad_norm": 1.7173131499005412, "language_loss": 0.63165087, "learning_rate": 1.0928607071476559e-06, "loss": 0.65322167, "num_input_tokens_seen": 233316125, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 10809, "time_per_iteration": 2.7204301357269287 }, { "auxiliary_loss_clip": 0.01117615, "auxiliary_loss_mlp": 0.01283519, "balance_loss_clip": 1.02400136, "balance_loss_mlp": 1.03917527, "epoch": 0.6499323613407485, "flos": 29017760688000.0, "grad_norm": 2.090624266344244, "language_loss": 0.81834996, "learning_rate": 1.0925240002063418e-06, "loss": 0.84236133, "num_input_tokens_seen": 233336140, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 10810, "time_per_iteration": 2.583190679550171 }, { "auxiliary_loss_clip": 0.0113456, "auxiliary_loss_mlp": 0.01034425, "balance_loss_clip": 1.0226351, "balance_loss_mlp": 1.03798318, "epoch": 0.6499924845934165, "flos": 20119240874880.0, "grad_norm": 1.5660980732376146, "language_loss": 0.71876729, "learning_rate": 1.09218732565051e-06, "loss": 0.74045712, "num_input_tokens_seen": 233356095, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 10811, "time_per_iteration": 2.587507963180542 }, { "auxiliary_loss_clip": 0.01152539, "auxiliary_loss_mlp": 0.01033862, "balance_loss_clip": 1.02175057, "balance_loss_mlp": 1.04005373, "epoch": 0.6500526078460844, "flos": 24421338760320.0, "grad_norm": 1.5132913492135143, "language_loss": 0.77875388, "learning_rate": 1.0918506834921787e-06, "loss": 0.80061793, "num_input_tokens_seen": 233376830, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 10812, "time_per_iteration": 2.594545364379883 }, { "auxiliary_loss_clip": 0.01119146, "auxiliary_loss_mlp": 0.01031309, "balance_loss_clip": 1.01822615, "balance_loss_mlp": 1.03675556, "epoch": 0.6501127310987524, "flos": 23331019374720.0, "grad_norm": 1.7472974015087286, "language_loss": 0.85319602, "learning_rate": 1.0915140737433607e-06, "loss": 0.87470055, "num_input_tokens_seen": 233395275, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 10813, "time_per_iteration": 2.584223508834839 }, { "auxiliary_loss_clip": 0.01137571, "auxiliary_loss_mlp": 0.01036052, "balance_loss_clip": 1.02279651, "balance_loss_mlp": 1.03832304, "epoch": 0.6501728543514204, "flos": 18697824898560.0, "grad_norm": 1.633947666130369, "language_loss": 0.79292536, "learning_rate": 1.0911774964160674e-06, "loss": 0.81466156, "num_input_tokens_seen": 233413345, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 10814, "time_per_iteration": 2.5455849170684814 }, { "auxiliary_loss_clip": 0.0114853, "auxiliary_loss_mlp": 0.01283895, "balance_loss_clip": 1.0236609, "balance_loss_mlp": 1.03860402, "epoch": 0.6502329776040884, "flos": 44199858199680.0, "grad_norm": 1.6454806686965178, "language_loss": 0.65305549, "learning_rate": 1.090840951522312e-06, "loss": 0.67737973, "num_input_tokens_seen": 233436105, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73828125, "step": 10815, "time_per_iteration": 4.152738332748413 }, { "auxiliary_loss_clip": 0.01114833, "auxiliary_loss_mlp": 0.01035519, "balance_loss_clip": 1.02002168, "balance_loss_mlp": 1.03908074, "epoch": 0.6502931008567564, "flos": 14574741419520.0, "grad_norm": 1.9264769437215903, "language_loss": 0.7517882, "learning_rate": 1.0905044390741043e-06, "loss": 0.77329177, "num_input_tokens_seen": 233452320, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.7578125, "step": 10816, "time_per_iteration": 2.528301954269409 }, { "auxiliary_loss_clip": 0.01128359, "auxiliary_loss_mlp": 0.01032508, "balance_loss_clip": 1.01956749, "balance_loss_mlp": 1.03958964, "epoch": 0.6503532241094243, "flos": 21395003201280.0, "grad_norm": 1.895391898248547, "language_loss": 0.73314345, "learning_rate": 1.090167959083454e-06, "loss": 0.75475216, "num_input_tokens_seen": 233469920, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 10817, "time_per_iteration": 2.5996615886688232 }, { "auxiliary_loss_clip": 0.01137045, "auxiliary_loss_mlp": 0.01036516, "balance_loss_clip": 1.02306294, "balance_loss_mlp": 1.03725696, "epoch": 0.6504133473620923, "flos": 74740840986240.0, "grad_norm": 1.4932343801373222, "language_loss": 0.72126013, "learning_rate": 1.0898315115623678e-06, "loss": 0.74299574, "num_input_tokens_seen": 233499780, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 10818, "time_per_iteration": 3.019779920578003 }, { "auxiliary_loss_clip": 0.01140536, "auxiliary_loss_mlp": 0.01029973, "balance_loss_clip": 1.01710427, "balance_loss_mlp": 1.03939366, "epoch": 0.6504734706147602, "flos": 19713270384000.0, "grad_norm": 2.122449341616554, "language_loss": 0.6505751, "learning_rate": 1.0894950965228547e-06, "loss": 0.67228019, "num_input_tokens_seen": 233518235, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.74609375, "step": 10819, "time_per_iteration": 2.569633722305298 }, { "auxiliary_loss_clip": 0.0113914, "auxiliary_loss_mlp": 0.01031663, "balance_loss_clip": 1.01871061, "balance_loss_mlp": 1.03914857, "epoch": 0.6505335938674283, "flos": 25556870390400.0, "grad_norm": 1.9042145502973151, "language_loss": 0.84057719, "learning_rate": 1.0891587139769195e-06, "loss": 0.86228526, "num_input_tokens_seen": 233535215, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.73046875, "step": 10820, "time_per_iteration": 2.650315999984741 }, { "auxiliary_loss_clip": 0.01122894, "auxiliary_loss_mlp": 0.01033551, "balance_loss_clip": 1.01999736, "balance_loss_mlp": 1.03872418, "epoch": 0.6505937171200962, "flos": 17821424960640.0, "grad_norm": 2.2924577497041883, "language_loss": 0.77822095, "learning_rate": 1.0888223639365666e-06, "loss": 0.79978538, "num_input_tokens_seen": 233552775, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75, "step": 10821, "time_per_iteration": 3.9800100326538086 }, { "auxiliary_loss_clip": 0.01133621, "auxiliary_loss_mlp": 0.01033918, "balance_loss_clip": 1.02250957, "balance_loss_mlp": 1.03836632, "epoch": 0.6506538403727642, "flos": 20668135582080.0, "grad_norm": 1.9216837271666447, "language_loss": 0.79992259, "learning_rate": 1.0884860464137991e-06, "loss": 0.82159805, "num_input_tokens_seen": 233572080, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6953125, "step": 10822, "time_per_iteration": 2.616558313369751 }, { "auxiliary_loss_clip": 0.01135857, "auxiliary_loss_mlp": 0.01033716, "balance_loss_clip": 1.02087164, "balance_loss_mlp": 1.03721607, "epoch": 0.6507139636254321, "flos": 11721422695680.0, "grad_norm": 1.792557703345516, "language_loss": 0.87338895, "learning_rate": 1.0881497614206215e-06, "loss": 0.89508468, "num_input_tokens_seen": 233589155, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 10823, "time_per_iteration": 2.583773612976074 }, { "auxiliary_loss_clip": 0.0110951, "auxiliary_loss_mlp": 0.01031251, "balance_loss_clip": 1.0179112, "balance_loss_mlp": 1.03670144, "epoch": 0.6507740868781001, "flos": 26761745226240.0, "grad_norm": 1.5336707010772375, "language_loss": 0.664074, "learning_rate": 1.0878135089690316e-06, "loss": 0.68548167, "num_input_tokens_seen": 233608180, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 10824, "time_per_iteration": 2.5703980922698975 }, { "auxiliary_loss_clip": 0.01129317, "auxiliary_loss_mlp": 0.01031764, "balance_loss_clip": 1.01844203, "balance_loss_mlp": 1.03731251, "epoch": 0.650834210130768, "flos": 16471722487680.0, "grad_norm": 2.484389823676859, "language_loss": 0.87249553, "learning_rate": 1.0874772890710322e-06, "loss": 0.89410639, "num_input_tokens_seen": 233625750, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 10825, "time_per_iteration": 2.5285537242889404 }, { "auxiliary_loss_clip": 0.01129029, "auxiliary_loss_mlp": 0.01030174, "balance_loss_clip": 1.01518965, "balance_loss_mlp": 1.03657091, "epoch": 0.650894333383436, "flos": 17128672283520.0, "grad_norm": 2.158219346792471, "language_loss": 0.72935092, "learning_rate": 1.087141101738621e-06, "loss": 0.75094295, "num_input_tokens_seen": 233644235, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.75, "step": 10826, "time_per_iteration": 4.063177585601807 }, { "auxiliary_loss_clip": 0.01119559, "auxiliary_loss_mlp": 0.01030975, "balance_loss_clip": 1.01882136, "balance_loss_mlp": 1.03894353, "epoch": 0.650954456636104, "flos": 18734238311040.0, "grad_norm": 1.9326466998547567, "language_loss": 0.68566918, "learning_rate": 1.0868049469837956e-06, "loss": 0.70717454, "num_input_tokens_seen": 233662845, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.71875, "step": 10827, "time_per_iteration": 4.0349225997924805 }, { "auxiliary_loss_clip": 0.0112777, "auxiliary_loss_mlp": 0.01028576, "balance_loss_clip": 1.01617885, "balance_loss_mlp": 1.03733516, "epoch": 0.651014579888772, "flos": 24528244613760.0, "grad_norm": 2.7565022367630068, "language_loss": 0.77045131, "learning_rate": 1.0864688248185526e-06, "loss": 0.79201478, "num_input_tokens_seen": 233681990, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.72265625, "step": 10828, "time_per_iteration": 2.5795350074768066 }, { "auxiliary_loss_clip": 0.01116945, "auxiliary_loss_mlp": 0.01029012, "balance_loss_clip": 1.01673889, "balance_loss_mlp": 1.03762794, "epoch": 0.65107470314144, "flos": 24061083304320.0, "grad_norm": 2.2380105964884613, "language_loss": 0.89346021, "learning_rate": 1.0861327352548865e-06, "loss": 0.91491979, "num_input_tokens_seen": 233698930, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 10829, "time_per_iteration": 2.6672937870025635 }, { "auxiliary_loss_clip": 0.0103033, "auxiliary_loss_mlp": 0.01000427, "balance_loss_clip": 0.99904388, "balance_loss_mlp": 1.00820255, "epoch": 0.6511348263941079, "flos": 72480734352000.0, "grad_norm": 0.6454211418676816, "language_loss": 0.55392575, "learning_rate": 1.0857966783047943e-06, "loss": 0.57423335, "num_input_tokens_seen": 233769825, "router_z_loss_clip": 0.01385498, "router_z_loss_mlp": 0.22167969, "step": 10830, "time_per_iteration": 3.341310739517212 }, { "auxiliary_loss_clip": 0.01140172, "auxiliary_loss_mlp": 0.01032895, "balance_loss_clip": 1.01858997, "balance_loss_mlp": 1.03839839, "epoch": 0.6511949496467759, "flos": 23367684182400.0, "grad_norm": 1.9526451150199273, "language_loss": 0.74929255, "learning_rate": 1.085460653980265e-06, "loss": 0.77102327, "num_input_tokens_seen": 233787095, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75, "step": 10831, "time_per_iteration": 2.6326918601989746 }, { "auxiliary_loss_clip": 0.01048184, "auxiliary_loss_mlp": 0.00999789, "balance_loss_clip": 0.9985016, "balance_loss_mlp": 1.00801885, "epoch": 0.6512550728994438, "flos": 67333191073920.0, "grad_norm": 0.6483946055899978, "language_loss": 0.51060945, "learning_rate": 1.0851246622932935e-06, "loss": 0.53108919, "num_input_tokens_seen": 233853050, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.22363281, "step": 10832, "time_per_iteration": 3.2603230476379395 }, { "auxiliary_loss_clip": 0.01128522, "auxiliary_loss_mlp": 0.01035485, "balance_loss_clip": 1.02154934, "balance_loss_mlp": 1.03541803, "epoch": 0.6513151961521119, "flos": 21141689512320.0, "grad_norm": 2.172268359493832, "language_loss": 0.8346625, "learning_rate": 1.0847887032558696e-06, "loss": 0.85630256, "num_input_tokens_seen": 233871385, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 10833, "time_per_iteration": 2.5974717140197754 }, { "auxiliary_loss_clip": 0.01130111, "auxiliary_loss_mlp": 0.01035768, "balance_loss_clip": 1.02254772, "balance_loss_mlp": 1.03975964, "epoch": 0.6513753194047798, "flos": 15158828476800.0, "grad_norm": 1.983945207372091, "language_loss": 0.83581525, "learning_rate": 1.0844527768799825e-06, "loss": 0.85747409, "num_input_tokens_seen": 233888175, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 10834, "time_per_iteration": 2.5496790409088135 }, { "auxiliary_loss_clip": 0.01121149, "auxiliary_loss_mlp": 0.01034856, "balance_loss_clip": 1.02200484, "balance_loss_mlp": 1.03879333, "epoch": 0.6514354426574478, "flos": 30226621933440.0, "grad_norm": 5.147422340875153, "language_loss": 0.76877606, "learning_rate": 1.08411688317762e-06, "loss": 0.79033607, "num_input_tokens_seen": 233911470, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 10835, "time_per_iteration": 2.6683287620544434 }, { "auxiliary_loss_clip": 0.011306, "auxiliary_loss_mlp": 0.01032328, "balance_loss_clip": 1.01956046, "balance_loss_mlp": 1.04039264, "epoch": 0.6514955659101157, "flos": 24205587719040.0, "grad_norm": 1.5184004752730915, "language_loss": 0.77231419, "learning_rate": 1.0837810221607705e-06, "loss": 0.79394341, "num_input_tokens_seen": 233932135, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 10836, "time_per_iteration": 2.6290392875671387 }, { "auxiliary_loss_clip": 0.01136908, "auxiliary_loss_mlp": 0.01036663, "balance_loss_clip": 1.02342439, "balance_loss_mlp": 1.03831184, "epoch": 0.6515556891627837, "flos": 12377761960320.0, "grad_norm": 2.3848208786133878, "language_loss": 0.82077539, "learning_rate": 1.0834451938414199e-06, "loss": 0.84251106, "num_input_tokens_seen": 233947880, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 10837, "time_per_iteration": 2.6712100505828857 }, { "auxiliary_loss_clip": 0.01135664, "auxiliary_loss_mlp": 0.01035734, "balance_loss_clip": 1.02108884, "balance_loss_mlp": 1.03894019, "epoch": 0.6516158124154516, "flos": 49601217957120.0, "grad_norm": 1.9309492744719168, "language_loss": 0.58913493, "learning_rate": 1.0831093982315526e-06, "loss": 0.6108489, "num_input_tokens_seen": 233971475, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.703125, "step": 10838, "time_per_iteration": 2.826974630355835 }, { "auxiliary_loss_clip": 0.01030467, "auxiliary_loss_mlp": 0.01002561, "balance_loss_clip": 1.00121975, "balance_loss_mlp": 1.00810647, "epoch": 0.6516759356681197, "flos": 59702748076800.0, "grad_norm": 0.7237383435535341, "language_loss": 0.60871673, "learning_rate": 1.0827736353431517e-06, "loss": 0.62904704, "num_input_tokens_seen": 234030690, "router_z_loss_clip": 0.01342773, "router_z_loss_mlp": 0.22363281, "step": 10839, "time_per_iteration": 3.2101261615753174 }, { "auxiliary_loss_clip": 0.01126751, "auxiliary_loss_mlp": 0.01030355, "balance_loss_clip": 1.01808834, "balance_loss_mlp": 1.03833807, "epoch": 0.6517360589207876, "flos": 37450807130880.0, "grad_norm": 1.6747194596166415, "language_loss": 0.67796618, "learning_rate": 1.0824379051882016e-06, "loss": 0.69953728, "num_input_tokens_seen": 234052470, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 10840, "time_per_iteration": 2.6993680000305176 }, { "auxiliary_loss_clip": 0.01142753, "auxiliary_loss_mlp": 0.01032575, "balance_loss_clip": 1.02005196, "balance_loss_mlp": 1.03562653, "epoch": 0.6517961821734556, "flos": 25374911068800.0, "grad_norm": 2.3071744687291695, "language_loss": 0.73785549, "learning_rate": 1.082102207778681e-06, "loss": 0.75960875, "num_input_tokens_seen": 234071495, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 10841, "time_per_iteration": 2.650386095046997 }, { "auxiliary_loss_clip": 0.01111011, "auxiliary_loss_mlp": 0.01037684, "balance_loss_clip": 1.02393878, "balance_loss_mlp": 1.03658509, "epoch": 0.6518563054261236, "flos": 28766996864640.0, "grad_norm": 1.4277807245036158, "language_loss": 0.62667996, "learning_rate": 1.0817665431265722e-06, "loss": 0.64816684, "num_input_tokens_seen": 234092325, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 10842, "time_per_iteration": 2.6380624771118164 }, { "auxiliary_loss_clip": 0.01038459, "auxiliary_loss_mlp": 0.01001786, "balance_loss_clip": 1.00030828, "balance_loss_mlp": 1.00742364, "epoch": 0.6519164286787915, "flos": 68924750797440.0, "grad_norm": 0.8177224051256702, "language_loss": 0.56167936, "learning_rate": 1.0814309112438544e-06, "loss": 0.58208179, "num_input_tokens_seen": 234148005, "router_z_loss_clip": 0.01477051, "router_z_loss_mlp": 0.22265625, "step": 10843, "time_per_iteration": 3.004301071166992 }, { "auxiliary_loss_clip": 0.01122707, "auxiliary_loss_mlp": 0.01038058, "balance_loss_clip": 1.02406871, "balance_loss_mlp": 1.03769565, "epoch": 0.6519765519314595, "flos": 20441933683200.0, "grad_norm": 1.860269876347619, "language_loss": 0.82773554, "learning_rate": 1.0810953121425028e-06, "loss": 0.84934318, "num_input_tokens_seen": 234164280, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.76171875, "step": 10844, "time_per_iteration": 2.5894525051116943 }, { "auxiliary_loss_clip": 0.01117942, "auxiliary_loss_mlp": 0.01028818, "balance_loss_clip": 1.0145303, "balance_loss_mlp": 1.03620064, "epoch": 0.6520366751841274, "flos": 28402970480640.0, "grad_norm": 1.6913958433690068, "language_loss": 0.604074, "learning_rate": 1.0807597458344967e-06, "loss": 0.62554169, "num_input_tokens_seen": 234185090, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.73046875, "step": 10845, "time_per_iteration": 2.698767900466919 }, { "auxiliary_loss_clip": 0.01119628, "auxiliary_loss_mlp": 0.01031369, "balance_loss_clip": 1.0178504, "balance_loss_mlp": 1.03837693, "epoch": 0.6520967984367955, "flos": 22273414300800.0, "grad_norm": 1.965301411458299, "language_loss": 0.7953673, "learning_rate": 1.0804242123318101e-06, "loss": 0.81687725, "num_input_tokens_seen": 234204050, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 10846, "time_per_iteration": 2.5743327140808105 }, { "auxiliary_loss_clip": 0.01038779, "auxiliary_loss_mlp": 0.00999905, "balance_loss_clip": 0.9985283, "balance_loss_mlp": 1.00779319, "epoch": 0.6521569216894634, "flos": 68917140092160.0, "grad_norm": 0.7093581595792372, "language_loss": 0.6015811, "learning_rate": 1.0800887116464194e-06, "loss": 0.62196791, "num_input_tokens_seen": 234269790, "router_z_loss_clip": 0.01379395, "router_z_loss_mlp": 0.22070312, "step": 10847, "time_per_iteration": 3.3262994289398193 }, { "auxiliary_loss_clip": 0.01113946, "auxiliary_loss_mlp": 0.01038971, "balance_loss_clip": 1.02601922, "balance_loss_mlp": 1.03928697, "epoch": 0.6522170449421314, "flos": 29130520458240.0, "grad_norm": 1.632905557193924, "language_loss": 0.80523348, "learning_rate": 1.0797532437902946e-06, "loss": 0.82676268, "num_input_tokens_seen": 234290135, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.74609375, "step": 10848, "time_per_iteration": 2.68758225440979 }, { "auxiliary_loss_clip": 0.01125954, "auxiliary_loss_mlp": 0.01037047, "balance_loss_clip": 1.02363026, "balance_loss_mlp": 1.03652644, "epoch": 0.6522771681947993, "flos": 26651930371200.0, "grad_norm": 2.30239338855358, "language_loss": 0.74229163, "learning_rate": 1.0794178087754102e-06, "loss": 0.76392162, "num_input_tokens_seen": 234309535, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 10849, "time_per_iteration": 2.641770124435425 }, { "auxiliary_loss_clip": 0.01142686, "auxiliary_loss_mlp": 0.01031209, "balance_loss_clip": 1.01841211, "balance_loss_mlp": 1.03694487, "epoch": 0.6523372914474673, "flos": 25739763465600.0, "grad_norm": 1.4288443027948985, "language_loss": 0.67805791, "learning_rate": 1.079082406613736e-06, "loss": 0.6997968, "num_input_tokens_seen": 234328755, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 10850, "time_per_iteration": 2.663038730621338 }, { "auxiliary_loss_clip": 0.01124044, "auxiliary_loss_mlp": 0.01273625, "balance_loss_clip": 1.0140909, "balance_loss_mlp": 1.03692198, "epoch": 0.6523974147001352, "flos": 24827345164800.0, "grad_norm": 1.6381662332076383, "language_loss": 0.66683215, "learning_rate": 1.078747037317242e-06, "loss": 0.69080883, "num_input_tokens_seen": 234348655, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 10851, "time_per_iteration": 2.6475000381469727 }, { "auxiliary_loss_clip": 0.01131435, "auxiliary_loss_mlp": 0.0103153, "balance_loss_clip": 1.01876235, "balance_loss_mlp": 1.03895879, "epoch": 0.6524575379528033, "flos": 26317637470080.0, "grad_norm": 2.2353791951381927, "language_loss": 0.73720908, "learning_rate": 1.0784117008978958e-06, "loss": 0.75883871, "num_input_tokens_seen": 234367445, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.74609375, "step": 10852, "time_per_iteration": 2.5998339653015137 }, { "auxiliary_loss_clip": 0.01132631, "auxiliary_loss_mlp": 0.01030754, "balance_loss_clip": 1.01679444, "balance_loss_mlp": 1.03907323, "epoch": 0.6525176612054712, "flos": 19494143464320.0, "grad_norm": 1.9513409326617146, "language_loss": 0.66843379, "learning_rate": 1.078076397367666e-06, "loss": 0.69006765, "num_input_tokens_seen": 234384825, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 10853, "time_per_iteration": 2.56746506690979 }, { "auxiliary_loss_clip": 0.01058026, "auxiliary_loss_mlp": 0.01002062, "balance_loss_clip": 1.00070941, "balance_loss_mlp": 1.00830102, "epoch": 0.6525777844581392, "flos": 71706894721920.0, "grad_norm": 0.7234814968043561, "language_loss": 0.63072193, "learning_rate": 1.0777411267385183e-06, "loss": 0.65132278, "num_input_tokens_seen": 234450630, "router_z_loss_clip": 0.0135498, "router_z_loss_mlp": 0.22265625, "step": 10854, "time_per_iteration": 3.2884857654571533 }, { "auxiliary_loss_clip": 0.01120221, "auxiliary_loss_mlp": 0.01038191, "balance_loss_clip": 1.02361155, "balance_loss_mlp": 1.03767514, "epoch": 0.6526379077108072, "flos": 26653115520000.0, "grad_norm": 1.6382767758475916, "language_loss": 0.77441305, "learning_rate": 1.0774058890224175e-06, "loss": 0.7959972, "num_input_tokens_seen": 234473505, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.734375, "step": 10855, "time_per_iteration": 2.633344888687134 }, { "auxiliary_loss_clip": 0.01132632, "auxiliary_loss_mlp": 0.01026602, "balance_loss_clip": 1.01330459, "balance_loss_mlp": 1.03585958, "epoch": 0.6526980309634751, "flos": 22820369673600.0, "grad_norm": 1.9407019867922435, "language_loss": 0.79011685, "learning_rate": 1.0770706842313262e-06, "loss": 0.81170917, "num_input_tokens_seen": 234492485, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 10856, "time_per_iteration": 2.5951905250549316 }, { "auxiliary_loss_clip": 0.01109518, "auxiliary_loss_mlp": 0.01034252, "balance_loss_clip": 1.01948214, "balance_loss_mlp": 1.03679001, "epoch": 0.6527581542161431, "flos": 28365048696960.0, "grad_norm": 1.5758040205385275, "language_loss": 0.73292869, "learning_rate": 1.07673551237721e-06, "loss": 0.7543664, "num_input_tokens_seen": 234512645, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7265625, "step": 10857, "time_per_iteration": 3.991342782974243 }, { "auxiliary_loss_clip": 0.01129596, "auxiliary_loss_mlp": 0.01031665, "balance_loss_clip": 1.01911807, "balance_loss_mlp": 1.03849268, "epoch": 0.652818277468811, "flos": 18369206346240.0, "grad_norm": 2.363545290204121, "language_loss": 0.62954903, "learning_rate": 1.0764003734720275e-06, "loss": 0.65116161, "num_input_tokens_seen": 234529310, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73046875, "step": 10858, "time_per_iteration": 2.5670323371887207 }, { "auxiliary_loss_clip": 0.01106535, "auxiliary_loss_mlp": 0.01033924, "balance_loss_clip": 1.02110314, "balance_loss_mlp": 1.03605008, "epoch": 0.6528784007214791, "flos": 18036170421120.0, "grad_norm": 1.5385413821930625, "language_loss": 0.78002799, "learning_rate": 1.0760652675277393e-06, "loss": 0.80143249, "num_input_tokens_seen": 234546685, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 10859, "time_per_iteration": 2.571031332015991 }, { "auxiliary_loss_clip": 0.01122001, "auxiliary_loss_mlp": 0.01030356, "balance_loss_clip": 1.01656377, "balance_loss_mlp": 1.0380801, "epoch": 0.652938523974147, "flos": 22382008093440.0, "grad_norm": 1.6289825312986876, "language_loss": 0.67778277, "learning_rate": 1.0757301945563064e-06, "loss": 0.69930637, "num_input_tokens_seen": 234566255, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 10860, "time_per_iteration": 2.5541858673095703 }, { "auxiliary_loss_clip": 0.01124464, "auxiliary_loss_mlp": 0.01030408, "balance_loss_clip": 1.01724172, "balance_loss_mlp": 1.04003644, "epoch": 0.652998647226815, "flos": 16764035368320.0, "grad_norm": 1.7479314365783318, "language_loss": 0.66372931, "learning_rate": 1.075395154569684e-06, "loss": 0.685278, "num_input_tokens_seen": 234585405, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75390625, "step": 10861, "time_per_iteration": 2.566150188446045 }, { "auxiliary_loss_clip": 0.01129599, "auxiliary_loss_mlp": 0.0103411, "balance_loss_clip": 1.01925683, "balance_loss_mlp": 1.03826714, "epoch": 0.6530587704794829, "flos": 35772522019200.0, "grad_norm": 1.7452501787709076, "language_loss": 0.65101421, "learning_rate": 1.0750601475798307e-06, "loss": 0.67265135, "num_input_tokens_seen": 234608095, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.734375, "step": 10862, "time_per_iteration": 2.72481369972229 }, { "auxiliary_loss_clip": 0.01127117, "auxiliary_loss_mlp": 0.01032998, "balance_loss_clip": 1.01989698, "balance_loss_mlp": 1.03769469, "epoch": 0.6531188937321509, "flos": 19316134638720.0, "grad_norm": 1.6885750242775843, "language_loss": 0.77028835, "learning_rate": 1.0747251735987009e-06, "loss": 0.79188949, "num_input_tokens_seen": 234627335, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 10863, "time_per_iteration": 3.944995880126953 }, { "auxiliary_loss_clip": 0.01127518, "auxiliary_loss_mlp": 0.01029694, "balance_loss_clip": 1.01767182, "balance_loss_mlp": 1.03748262, "epoch": 0.6531790169848188, "flos": 22893771116160.0, "grad_norm": 1.5887835317666608, "language_loss": 0.74517655, "learning_rate": 1.074390232638251e-06, "loss": 0.76674873, "num_input_tokens_seen": 234646540, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.71875, "step": 10864, "time_per_iteration": 2.657677173614502 }, { "auxiliary_loss_clip": 0.01117709, "auxiliary_loss_mlp": 0.01034174, "balance_loss_clip": 1.02151406, "balance_loss_mlp": 1.03787518, "epoch": 0.6532391402374869, "flos": 29563530912000.0, "grad_norm": 1.7342921639980295, "language_loss": 0.85811782, "learning_rate": 1.0740553247104315e-06, "loss": 0.87963665, "num_input_tokens_seen": 234665470, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 10865, "time_per_iteration": 2.6299262046813965 }, { "auxiliary_loss_clip": 0.01141962, "auxiliary_loss_mlp": 0.01282997, "balance_loss_clip": 1.02300894, "balance_loss_mlp": 1.03999555, "epoch": 0.6532992634901548, "flos": 23105463920640.0, "grad_norm": 1.564929187328307, "language_loss": 0.81195474, "learning_rate": 1.0737204498271958e-06, "loss": 0.83620435, "num_input_tokens_seen": 234683955, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 10866, "time_per_iteration": 2.617370128631592 }, { "auxiliary_loss_clip": 0.01126605, "auxiliary_loss_mlp": 0.01029177, "balance_loss_clip": 1.01646924, "balance_loss_mlp": 1.03828859, "epoch": 0.6533593867428228, "flos": 26067340523520.0, "grad_norm": 1.6135275014141226, "language_loss": 0.81967485, "learning_rate": 1.0733856080004952e-06, "loss": 0.84123266, "num_input_tokens_seen": 234704595, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 10867, "time_per_iteration": 2.6210057735443115 }, { "auxiliary_loss_clip": 0.01128714, "auxiliary_loss_mlp": 0.01027785, "balance_loss_clip": 1.01503563, "balance_loss_mlp": 1.03755164, "epoch": 0.6534195099954908, "flos": 21212469262080.0, "grad_norm": 1.8604559478126395, "language_loss": 0.80739051, "learning_rate": 1.0730507992422784e-06, "loss": 0.82895553, "num_input_tokens_seen": 234724090, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 10868, "time_per_iteration": 4.750847101211548 }, { "auxiliary_loss_clip": 0.01139443, "auxiliary_loss_mlp": 0.01027974, "balance_loss_clip": 1.0135026, "balance_loss_mlp": 1.03843355, "epoch": 0.6534796332481587, "flos": 19646584784640.0, "grad_norm": 1.8337438225110252, "language_loss": 0.79517752, "learning_rate": 1.0727160235644932e-06, "loss": 0.81685174, "num_input_tokens_seen": 234742560, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7421875, "step": 10869, "time_per_iteration": 2.619218587875366 }, { "auxiliary_loss_clip": 0.01120924, "auxiliary_loss_mlp": 0.01036798, "balance_loss_clip": 1.0232023, "balance_loss_mlp": 1.03941965, "epoch": 0.6535397565008267, "flos": 24022479162240.0, "grad_norm": 1.906654174758853, "language_loss": 0.72031271, "learning_rate": 1.0723812809790898e-06, "loss": 0.74188989, "num_input_tokens_seen": 234762315, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 10870, "time_per_iteration": 2.634036064147949 }, { "auxiliary_loss_clip": 0.01138199, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.02386975, "balance_loss_mlp": 1.0400207, "epoch": 0.6535998797534947, "flos": 24602759377920.0, "grad_norm": 2.681368056692238, "language_loss": 0.74760824, "learning_rate": 1.0720465714980106e-06, "loss": 0.76936001, "num_input_tokens_seen": 234781300, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 10871, "time_per_iteration": 2.663516044616699 }, { "auxiliary_loss_clip": 0.01115512, "auxiliary_loss_mlp": 0.01280654, "balance_loss_clip": 1.0212816, "balance_loss_mlp": 1.03713012, "epoch": 0.6536600030061627, "flos": 23364164649600.0, "grad_norm": 1.5411145589253414, "language_loss": 0.55871195, "learning_rate": 1.0717118951332032e-06, "loss": 0.58267355, "num_input_tokens_seen": 234801040, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 10872, "time_per_iteration": 2.6346020698547363 }, { "auxiliary_loss_clip": 0.01124967, "auxiliary_loss_mlp": 0.01034437, "balance_loss_clip": 1.02167571, "balance_loss_mlp": 1.03756094, "epoch": 0.6537201262588306, "flos": 23878477537920.0, "grad_norm": 1.5917557897991412, "language_loss": 0.74893087, "learning_rate": 1.0713772518966102e-06, "loss": 0.77052486, "num_input_tokens_seen": 234821415, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 10873, "time_per_iteration": 2.620284080505371 }, { "auxiliary_loss_clip": 0.01120153, "auxiliary_loss_mlp": 0.01032767, "balance_loss_clip": 1.0195291, "balance_loss_mlp": 1.03902841, "epoch": 0.6537802495114986, "flos": 24354760901760.0, "grad_norm": 1.5502189033701752, "language_loss": 0.7572825, "learning_rate": 1.0710426418001746e-06, "loss": 0.77881169, "num_input_tokens_seen": 234843795, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 10874, "time_per_iteration": 2.647736072540283 }, { "auxiliary_loss_clip": 0.01136621, "auxiliary_loss_mlp": 0.01036114, "balance_loss_clip": 1.02282763, "balance_loss_mlp": 1.03838754, "epoch": 0.6538403727641665, "flos": 27996892248960.0, "grad_norm": 1.5919511786520029, "language_loss": 0.81508517, "learning_rate": 1.0707080648558374e-06, "loss": 0.8368125, "num_input_tokens_seen": 234862350, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 10875, "time_per_iteration": 2.6257669925689697 }, { "auxiliary_loss_clip": 0.01108832, "auxiliary_loss_mlp": 0.01036128, "balance_loss_clip": 1.02373612, "balance_loss_mlp": 1.03667355, "epoch": 0.6539004960168345, "flos": 27563594486400.0, "grad_norm": 1.8885834066750988, "language_loss": 0.70037568, "learning_rate": 1.0703735210755383e-06, "loss": 0.7218253, "num_input_tokens_seen": 234881790, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.72265625, "step": 10876, "time_per_iteration": 2.5836477279663086 }, { "auxiliary_loss_clip": 0.01172574, "auxiliary_loss_mlp": 0.01038611, "balance_loss_clip": 1.0253675, "balance_loss_mlp": 1.03821468, "epoch": 0.6539606192695024, "flos": 14530067879040.0, "grad_norm": 2.236094230543315, "language_loss": 0.79436415, "learning_rate": 1.0700390104712184e-06, "loss": 0.81647605, "num_input_tokens_seen": 234897775, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 10877, "time_per_iteration": 2.6109375953674316 }, { "auxiliary_loss_clip": 0.01128865, "auxiliary_loss_mlp": 0.0127473, "balance_loss_clip": 1.01545715, "balance_loss_mlp": 1.03912735, "epoch": 0.6540207425221705, "flos": 21616356764160.0, "grad_norm": 2.4100040393016178, "language_loss": 0.79857737, "learning_rate": 1.0697045330548127e-06, "loss": 0.82261336, "num_input_tokens_seen": 234918395, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.72265625, "step": 10878, "time_per_iteration": 2.5715885162353516 }, { "auxiliary_loss_clip": 0.01127721, "auxiliary_loss_mlp": 0.0103236, "balance_loss_clip": 1.01862717, "balance_loss_mlp": 1.03765833, "epoch": 0.6540808657748384, "flos": 17668983640320.0, "grad_norm": 4.533434916782956, "language_loss": 0.84261036, "learning_rate": 1.06937008883826e-06, "loss": 0.8642112, "num_input_tokens_seen": 234936260, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 10879, "time_per_iteration": 2.5705206394195557 }, { "auxiliary_loss_clip": 0.01127718, "auxiliary_loss_mlp": 0.01028775, "balance_loss_clip": 1.01562607, "balance_loss_mlp": 1.03656864, "epoch": 0.6541409890275064, "flos": 14538292093440.0, "grad_norm": 2.287582850115739, "language_loss": 0.71652645, "learning_rate": 1.069035677833494e-06, "loss": 0.73809141, "num_input_tokens_seen": 234952110, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 10880, "time_per_iteration": 2.5744264125823975 }, { "auxiliary_loss_clip": 0.01134227, "auxiliary_loss_mlp": 0.01029445, "balance_loss_clip": 1.01668406, "balance_loss_mlp": 1.03652322, "epoch": 0.6542011122801744, "flos": 17165301177600.0, "grad_norm": 1.8451676886881483, "language_loss": 0.84275746, "learning_rate": 1.0687013000524513e-06, "loss": 0.86439425, "num_input_tokens_seen": 234970810, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 10881, "time_per_iteration": 2.6567165851593018 }, { "auxiliary_loss_clip": 0.01138175, "auxiliary_loss_mlp": 0.01034841, "balance_loss_clip": 1.02072084, "balance_loss_mlp": 1.03761399, "epoch": 0.6542612355328423, "flos": 18186600579840.0, "grad_norm": 1.7723649490386342, "language_loss": 0.77849877, "learning_rate": 1.0683669555070624e-06, "loss": 0.80022895, "num_input_tokens_seen": 234989565, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73828125, "step": 10882, "time_per_iteration": 2.5676302909851074 }, { "auxiliary_loss_clip": 0.01113538, "auxiliary_loss_mlp": 0.01031686, "balance_loss_clip": 1.01819742, "balance_loss_mlp": 1.03982198, "epoch": 0.6543213587855103, "flos": 19792453916160.0, "grad_norm": 2.239688380808332, "language_loss": 0.81944394, "learning_rate": 1.068032644209261e-06, "loss": 0.84089619, "num_input_tokens_seen": 235007955, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73828125, "step": 10883, "time_per_iteration": 2.5567915439605713 }, { "auxiliary_loss_clip": 0.0115042, "auxiliary_loss_mlp": 0.01039959, "balance_loss_clip": 1.02639902, "balance_loss_mlp": 1.04048157, "epoch": 0.6543814820381783, "flos": 21105096531840.0, "grad_norm": 2.2727381412390883, "language_loss": 0.85417068, "learning_rate": 1.0676983661709774e-06, "loss": 0.87607449, "num_input_tokens_seen": 235024860, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 10884, "time_per_iteration": 2.720346212387085 }, { "auxiliary_loss_clip": 0.01129613, "auxiliary_loss_mlp": 0.01036323, "balance_loss_clip": 1.02293599, "balance_loss_mlp": 1.03880012, "epoch": 0.6544416052908463, "flos": 20194042947840.0, "grad_norm": 2.3061999996309015, "language_loss": 0.79631877, "learning_rate": 1.067364121404141e-06, "loss": 0.81797814, "num_input_tokens_seen": 235043815, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 10885, "time_per_iteration": 2.6426875591278076 }, { "auxiliary_loss_clip": 0.01135152, "auxiliary_loss_mlp": 0.01030469, "balance_loss_clip": 1.01732612, "balance_loss_mlp": 1.03838992, "epoch": 0.6545017285435142, "flos": 23368258800000.0, "grad_norm": 1.9948272730406567, "language_loss": 0.71930259, "learning_rate": 1.067029909920679e-06, "loss": 0.74095881, "num_input_tokens_seen": 235062985, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 10886, "time_per_iteration": 2.6753361225128174 }, { "auxiliary_loss_clip": 0.01116904, "auxiliary_loss_mlp": 0.01029009, "balance_loss_clip": 1.01681995, "balance_loss_mlp": 1.037081, "epoch": 0.6545618517961822, "flos": 19134714021120.0, "grad_norm": 1.8632751716537475, "language_loss": 0.78146446, "learning_rate": 1.0666957317325215e-06, "loss": 0.80292356, "num_input_tokens_seen": 235081670, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 10887, "time_per_iteration": 2.6018917560577393 }, { "auxiliary_loss_clip": 0.01126986, "auxiliary_loss_mlp": 0.01033068, "balance_loss_clip": 1.02040839, "balance_loss_mlp": 1.0381546, "epoch": 0.6546219750488501, "flos": 14938624149120.0, "grad_norm": 2.095878659487443, "language_loss": 0.78976971, "learning_rate": 1.0663615868515913e-06, "loss": 0.81137025, "num_input_tokens_seen": 235098510, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 10888, "time_per_iteration": 2.5642471313476562 }, { "auxiliary_loss_clip": 0.01114396, "auxiliary_loss_mlp": 0.01029555, "balance_loss_clip": 1.01780093, "balance_loss_mlp": 1.03692949, "epoch": 0.6546820983015181, "flos": 36320518886400.0, "grad_norm": 1.5267425802222954, "language_loss": 0.66552508, "learning_rate": 1.066027475289814e-06, "loss": 0.68696457, "num_input_tokens_seen": 235119990, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 10889, "time_per_iteration": 2.7049062252044678 }, { "auxiliary_loss_clip": 0.01125467, "auxiliary_loss_mlp": 0.01033525, "balance_loss_clip": 1.01945245, "balance_loss_mlp": 1.03538144, "epoch": 0.654742221554186, "flos": 20411446014720.0, "grad_norm": 1.443696427141094, "language_loss": 0.79584849, "learning_rate": 1.0656933970591145e-06, "loss": 0.81743836, "num_input_tokens_seen": 235139255, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 10890, "time_per_iteration": 2.567704677581787 }, { "auxiliary_loss_clip": 0.01118037, "auxiliary_loss_mlp": 0.01278449, "balance_loss_clip": 1.01827848, "balance_loss_mlp": 1.03580594, "epoch": 0.6548023448068541, "flos": 24863650836480.0, "grad_norm": 1.993326298111749, "language_loss": 0.65209198, "learning_rate": 1.0653593521714144e-06, "loss": 0.67605686, "num_input_tokens_seen": 235158455, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 10891, "time_per_iteration": 2.594783067703247 }, { "auxiliary_loss_clip": 0.01118942, "auxiliary_loss_mlp": 0.01034258, "balance_loss_clip": 1.02159762, "balance_loss_mlp": 1.03824425, "epoch": 0.654862468059522, "flos": 21427573858560.0, "grad_norm": 1.7887935988854333, "language_loss": 0.79419696, "learning_rate": 1.0650253406386347e-06, "loss": 0.81572896, "num_input_tokens_seen": 235177350, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 10892, "time_per_iteration": 2.6159286499023438 }, { "auxiliary_loss_clip": 0.01123895, "auxiliary_loss_mlp": 0.01033285, "balance_loss_clip": 1.02084553, "balance_loss_mlp": 1.04008913, "epoch": 0.65492259131219, "flos": 26577846570240.0, "grad_norm": 1.85240869590577, "language_loss": 0.77951789, "learning_rate": 1.0646913624726947e-06, "loss": 0.80108964, "num_input_tokens_seen": 235196435, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.75, "step": 10893, "time_per_iteration": 2.646223783493042 }, { "auxiliary_loss_clip": 0.01113912, "auxiliary_loss_mlp": 0.01033578, "balance_loss_clip": 1.01948714, "balance_loss_mlp": 1.03884113, "epoch": 0.6549827145648579, "flos": 21501334437120.0, "grad_norm": 1.5609333871142073, "language_loss": 0.70355517, "learning_rate": 1.0643574176855158e-06, "loss": 0.72503006, "num_input_tokens_seen": 235215430, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 10894, "time_per_iteration": 2.547428607940674 }, { "auxiliary_loss_clip": 0.01122757, "auxiliary_loss_mlp": 0.01031907, "balance_loss_clip": 1.01835907, "balance_loss_mlp": 1.0387094, "epoch": 0.6550428378175259, "flos": 22594275515520.0, "grad_norm": 4.2630084689739896, "language_loss": 0.62607884, "learning_rate": 1.0640235062890121e-06, "loss": 0.64762545, "num_input_tokens_seen": 235232015, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75, "step": 10895, "time_per_iteration": 2.618182420730591 }, { "auxiliary_loss_clip": 0.01116026, "auxiliary_loss_mlp": 0.01034346, "balance_loss_clip": 1.02193606, "balance_loss_mlp": 1.03550291, "epoch": 0.655102961070194, "flos": 12823809050880.0, "grad_norm": 1.885877092934813, "language_loss": 0.79101181, "learning_rate": 1.0636896282951028e-06, "loss": 0.8125155, "num_input_tokens_seen": 235248115, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 10896, "time_per_iteration": 2.541246175765991 }, { "auxiliary_loss_clip": 0.01125497, "auxiliary_loss_mlp": 0.01031705, "balance_loss_clip": 1.01947379, "balance_loss_mlp": 1.03772581, "epoch": 0.6551630843228619, "flos": 24791075406720.0, "grad_norm": 1.402289969738073, "language_loss": 0.70645338, "learning_rate": 1.0633557837157016e-06, "loss": 0.72802538, "num_input_tokens_seen": 235270785, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 10897, "time_per_iteration": 2.793079376220703 }, { "auxiliary_loss_clip": 0.01110688, "auxiliary_loss_mlp": 0.0103291, "balance_loss_clip": 1.02013052, "balance_loss_mlp": 1.03695059, "epoch": 0.6552232075755299, "flos": 16724461559040.0, "grad_norm": 1.7475098181464372, "language_loss": 0.75732255, "learning_rate": 1.0630219725627245e-06, "loss": 0.77875853, "num_input_tokens_seen": 235287905, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73828125, "step": 10898, "time_per_iteration": 3.9269397258758545 }, { "auxiliary_loss_clip": 0.01128732, "auxiliary_loss_mlp": 0.01035872, "balance_loss_clip": 1.02250838, "balance_loss_mlp": 1.03965318, "epoch": 0.6552833308281978, "flos": 22016473338240.0, "grad_norm": 2.9883691651759334, "language_loss": 0.73094523, "learning_rate": 1.0626881948480813e-06, "loss": 0.75259125, "num_input_tokens_seen": 235305525, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 10899, "time_per_iteration": 2.615772008895874 }, { "auxiliary_loss_clip": 0.01126194, "auxiliary_loss_mlp": 0.01030579, "balance_loss_clip": 1.01775241, "balance_loss_mlp": 1.03691339, "epoch": 0.6553434540808658, "flos": 24863399441280.0, "grad_norm": 2.041593112862531, "language_loss": 0.55778587, "learning_rate": 1.0623544505836863e-06, "loss": 0.57935369, "num_input_tokens_seen": 235324415, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 10900, "time_per_iteration": 2.6094062328338623 }, { "auxiliary_loss_clip": 0.01120881, "auxiliary_loss_mlp": 0.01035498, "balance_loss_clip": 1.02111506, "balance_loss_mlp": 1.03954399, "epoch": 0.6554035773335337, "flos": 23221060865280.0, "grad_norm": 2.728576094844802, "language_loss": 0.7674287, "learning_rate": 1.0620207397814492e-06, "loss": 0.78899246, "num_input_tokens_seen": 235341595, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7265625, "step": 10901, "time_per_iteration": 2.5581233501434326 }, { "auxiliary_loss_clip": 0.01108844, "auxiliary_loss_mlp": 0.0102949, "balance_loss_clip": 1.01628733, "balance_loss_mlp": 1.03791022, "epoch": 0.6554637005862017, "flos": 22783597125120.0, "grad_norm": 1.7764301940834637, "language_loss": 0.73170614, "learning_rate": 1.0616870624532789e-06, "loss": 0.75308943, "num_input_tokens_seen": 235361700, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 10902, "time_per_iteration": 2.530000925064087 }, { "auxiliary_loss_clip": 0.01110015, "auxiliary_loss_mlp": 0.01028741, "balance_loss_clip": 1.01603365, "balance_loss_mlp": 1.03922164, "epoch": 0.6555238238388696, "flos": 21507224267520.0, "grad_norm": 1.5917203181723947, "language_loss": 0.67743474, "learning_rate": 1.0613534186110838e-06, "loss": 0.69882232, "num_input_tokens_seen": 235382065, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 10903, "time_per_iteration": 2.6075124740600586 }, { "auxiliary_loss_clip": 0.0113073, "auxiliary_loss_mlp": 0.01284139, "balance_loss_clip": 1.02396965, "balance_loss_mlp": 1.03685367, "epoch": 0.6555839470915377, "flos": 30519473518080.0, "grad_norm": 1.847050744216691, "language_loss": 0.67123246, "learning_rate": 1.0610198082667706e-06, "loss": 0.69538116, "num_input_tokens_seen": 235402130, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.76171875, "step": 10904, "time_per_iteration": 4.032412528991699 }, { "auxiliary_loss_clip": 0.01119845, "auxiliary_loss_mlp": 0.01038714, "balance_loss_clip": 1.02421784, "balance_loss_mlp": 1.0363555, "epoch": 0.6556440703442056, "flos": 24642943718400.0, "grad_norm": 2.1069182008401945, "language_loss": 0.90142477, "learning_rate": 1.0606862314322454e-06, "loss": 0.92301035, "num_input_tokens_seen": 235420435, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7421875, "step": 10905, "time_per_iteration": 2.582808494567871 }, { "auxiliary_loss_clip": 0.01114745, "auxiliary_loss_mlp": 0.01034345, "balance_loss_clip": 1.02195942, "balance_loss_mlp": 1.03611636, "epoch": 0.6557041935968736, "flos": 23732464752000.0, "grad_norm": 1.754831561867117, "language_loss": 0.75597411, "learning_rate": 1.060352688119411e-06, "loss": 0.77746505, "num_input_tokens_seen": 235439960, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 10906, "time_per_iteration": 2.6101372241973877 }, { "auxiliary_loss_clip": 0.01113559, "auxiliary_loss_mlp": 0.01041154, "balance_loss_clip": 1.02745128, "balance_loss_mlp": 1.03902233, "epoch": 0.6557643168495415, "flos": 11102753819520.0, "grad_norm": 2.6602719696901223, "language_loss": 0.74239403, "learning_rate": 1.0600191783401732e-06, "loss": 0.76394117, "num_input_tokens_seen": 235457495, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.74609375, "step": 10907, "time_per_iteration": 2.588308572769165 }, { "auxiliary_loss_clip": 0.01135618, "auxiliary_loss_mlp": 0.01031288, "balance_loss_clip": 1.01677442, "balance_loss_mlp": 1.03616762, "epoch": 0.6558244401022095, "flos": 30191034533760.0, "grad_norm": 1.441500374312004, "language_loss": 0.72423887, "learning_rate": 1.0596857021064333e-06, "loss": 0.74590796, "num_input_tokens_seen": 235479525, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7265625, "step": 10908, "time_per_iteration": 2.680715799331665 }, { "auxiliary_loss_clip": 0.01110851, "auxiliary_loss_mlp": 0.01033844, "balance_loss_clip": 1.01961064, "balance_loss_mlp": 1.03827548, "epoch": 0.6558845633548775, "flos": 17931060247680.0, "grad_norm": 2.0975296470402998, "language_loss": 0.81025136, "learning_rate": 1.0593522594300917e-06, "loss": 0.8316983, "num_input_tokens_seen": 235496305, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7265625, "step": 10909, "time_per_iteration": 2.5340447425842285 }, { "auxiliary_loss_clip": 0.01110998, "auxiliary_loss_mlp": 0.01035252, "balance_loss_clip": 1.02138245, "balance_loss_mlp": 1.03753138, "epoch": 0.6559446866075455, "flos": 21904144531200.0, "grad_norm": 2.1374559441116165, "language_loss": 0.63631463, "learning_rate": 1.0590188503230475e-06, "loss": 0.65777713, "num_input_tokens_seen": 235512545, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 10910, "time_per_iteration": 4.772505283355713 }, { "auxiliary_loss_clip": 0.01135344, "auxiliary_loss_mlp": 0.01037095, "balance_loss_clip": 1.02248549, "balance_loss_mlp": 1.03958762, "epoch": 0.6560048098602135, "flos": 14127976056960.0, "grad_norm": 2.5123868870596175, "language_loss": 0.75451672, "learning_rate": 1.0586854747972015e-06, "loss": 0.77624106, "num_input_tokens_seen": 235526045, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.77734375, "step": 10911, "time_per_iteration": 2.5786595344543457 }, { "auxiliary_loss_clip": 0.01104499, "auxiliary_loss_mlp": 0.01029811, "balance_loss_clip": 1.01805663, "balance_loss_mlp": 1.03548503, "epoch": 0.6560649331128814, "flos": 18807567926400.0, "grad_norm": 1.6509580386685374, "language_loss": 0.75197768, "learning_rate": 1.0583521328644485e-06, "loss": 0.77332079, "num_input_tokens_seen": 235545285, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 10912, "time_per_iteration": 2.5576133728027344 }, { "auxiliary_loss_clip": 0.01145416, "auxiliary_loss_mlp": 0.01282565, "balance_loss_clip": 1.02151024, "balance_loss_mlp": 1.04021239, "epoch": 0.6561250563655494, "flos": 17053618815360.0, "grad_norm": 1.5363827992242907, "language_loss": 0.77658224, "learning_rate": 1.058018824536686e-06, "loss": 0.80086207, "num_input_tokens_seen": 235563150, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 10913, "time_per_iteration": 2.6264896392822266 }, { "auxiliary_loss_clip": 0.01123113, "auxiliary_loss_mlp": 0.01028169, "balance_loss_clip": 1.01626575, "balance_loss_mlp": 1.03644276, "epoch": 0.6561851796182173, "flos": 22637656166400.0, "grad_norm": 1.7772554460475192, "language_loss": 0.70771134, "learning_rate": 1.0576855498258087e-06, "loss": 0.72922415, "num_input_tokens_seen": 235582535, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 10914, "time_per_iteration": 2.640862464904785 }, { "auxiliary_loss_clip": 0.01134419, "auxiliary_loss_mlp": 0.01033044, "balance_loss_clip": 1.01995468, "balance_loss_mlp": 1.03642285, "epoch": 0.6562453028708853, "flos": 19239213663360.0, "grad_norm": 1.6555786776173025, "language_loss": 0.73630798, "learning_rate": 1.05735230874371e-06, "loss": 0.75798255, "num_input_tokens_seen": 235601490, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 10915, "time_per_iteration": 2.6101996898651123 }, { "auxiliary_loss_clip": 0.01133658, "auxiliary_loss_mlp": 0.01030399, "balance_loss_clip": 1.01668453, "balance_loss_mlp": 1.03604841, "epoch": 0.6563054261235532, "flos": 23801305167360.0, "grad_norm": 1.536518590284915, "language_loss": 0.79549915, "learning_rate": 1.057019101302282e-06, "loss": 0.81713974, "num_input_tokens_seen": 235619165, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 10916, "time_per_iteration": 2.6309306621551514 }, { "auxiliary_loss_clip": 0.01127153, "auxiliary_loss_mlp": 0.01032798, "balance_loss_clip": 1.02039409, "balance_loss_mlp": 1.03663373, "epoch": 0.6563655493762213, "flos": 19240039676160.0, "grad_norm": 1.8218003278664539, "language_loss": 0.76316762, "learning_rate": 1.0566859275134174e-06, "loss": 0.78476715, "num_input_tokens_seen": 235637115, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 10917, "time_per_iteration": 2.5898587703704834 }, { "auxiliary_loss_clip": 0.01107159, "auxiliary_loss_mlp": 0.01279897, "balance_loss_clip": 1.02000999, "balance_loss_mlp": 1.03598261, "epoch": 0.6564256726288892, "flos": 25556439427200.0, "grad_norm": 2.035555723186929, "language_loss": 0.70292008, "learning_rate": 1.0563527873890063e-06, "loss": 0.72679073, "num_input_tokens_seen": 235656330, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 10918, "time_per_iteration": 2.588191270828247 }, { "auxiliary_loss_clip": 0.0113271, "auxiliary_loss_mlp": 0.01038203, "balance_loss_clip": 1.02601945, "balance_loss_mlp": 1.03835225, "epoch": 0.6564857958815572, "flos": 22200623389440.0, "grad_norm": 1.5776232994567656, "language_loss": 0.76534277, "learning_rate": 1.0560196809409356e-06, "loss": 0.78705186, "num_input_tokens_seen": 235674510, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 10919, "time_per_iteration": 2.6376876831054688 }, { "auxiliary_loss_clip": 0.01039152, "auxiliary_loss_mlp": 0.01252092, "balance_loss_clip": 1.00485122, "balance_loss_mlp": 1.00817204, "epoch": 0.6565459191342251, "flos": 58123144604160.0, "grad_norm": 0.7183827070158149, "language_loss": 0.53045452, "learning_rate": 1.0556866081810948e-06, "loss": 0.55336702, "num_input_tokens_seen": 235735050, "router_z_loss_clip": 0.01281738, "router_z_loss_mlp": 0.22167969, "step": 10920, "time_per_iteration": 3.1714110374450684 }, { "auxiliary_loss_clip": 0.01117582, "auxiliary_loss_mlp": 0.01032555, "balance_loss_clip": 1.01906681, "balance_loss_mlp": 1.0386765, "epoch": 0.6566060423868931, "flos": 30809631582720.0, "grad_norm": 1.7872776587017414, "language_loss": 0.65704143, "learning_rate": 1.05535356912137e-06, "loss": 0.67854279, "num_input_tokens_seen": 235757545, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 10921, "time_per_iteration": 2.6537396907806396 }, { "auxiliary_loss_clip": 0.01124972, "auxiliary_loss_mlp": 0.0103381, "balance_loss_clip": 1.01948738, "balance_loss_mlp": 1.03606868, "epoch": 0.6566661656395612, "flos": 23367432787200.0, "grad_norm": 1.7950417666812342, "language_loss": 0.81361687, "learning_rate": 1.0550205637736462e-06, "loss": 0.83520472, "num_input_tokens_seen": 235777265, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7109375, "step": 10922, "time_per_iteration": 2.604128837585449 }, { "auxiliary_loss_clip": 0.0114477, "auxiliary_loss_mlp": 0.012819, "balance_loss_clip": 1.02157259, "balance_loss_mlp": 1.03676212, "epoch": 0.6567262888922291, "flos": 25735597488000.0, "grad_norm": 3.047153606213389, "language_loss": 0.71466857, "learning_rate": 1.054687592149807e-06, "loss": 0.73893529, "num_input_tokens_seen": 235796565, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 10923, "time_per_iteration": 2.6123273372650146 }, { "auxiliary_loss_clip": 0.01138384, "auxiliary_loss_mlp": 0.01035524, "balance_loss_clip": 1.02251792, "balance_loss_mlp": 1.0384841, "epoch": 0.6567864121448971, "flos": 17123716206720.0, "grad_norm": 1.9826809408136448, "language_loss": 0.80786121, "learning_rate": 1.054354654261737e-06, "loss": 0.82960033, "num_input_tokens_seen": 235814805, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.73046875, "step": 10924, "time_per_iteration": 2.5750277042388916 }, { "auxiliary_loss_clip": 0.0111799, "auxiliary_loss_mlp": 0.01028172, "balance_loss_clip": 1.01650155, "balance_loss_mlp": 1.03780973, "epoch": 0.656846535397565, "flos": 22419319345920.0, "grad_norm": 1.6723892963041458, "language_loss": 0.7242564, "learning_rate": 1.0540217501213166e-06, "loss": 0.74571806, "num_input_tokens_seen": 235833405, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.7109375, "step": 10925, "time_per_iteration": 2.604625940322876 }, { "auxiliary_loss_clip": 0.01119151, "auxiliary_loss_mlp": 0.01029324, "balance_loss_clip": 1.0165509, "balance_loss_mlp": 1.03770459, "epoch": 0.656906658650233, "flos": 17704535126400.0, "grad_norm": 4.301126884264629, "language_loss": 0.72691798, "learning_rate": 1.0536888797404268e-06, "loss": 0.74840271, "num_input_tokens_seen": 235848530, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 10926, "time_per_iteration": 2.5508925914764404 }, { "auxiliary_loss_clip": 0.0114743, "auxiliary_loss_mlp": 0.01034089, "balance_loss_clip": 1.02039158, "balance_loss_mlp": 1.0370239, "epoch": 0.6569667819029009, "flos": 21175158009600.0, "grad_norm": 1.9097033148325138, "language_loss": 0.72463858, "learning_rate": 1.0533560431309458e-06, "loss": 0.74645382, "num_input_tokens_seen": 235867225, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 10927, "time_per_iteration": 2.580146312713623 }, { "auxiliary_loss_clip": 0.0105513, "auxiliary_loss_mlp": 0.01006307, "balance_loss_clip": 1.00488281, "balance_loss_mlp": 1.00739753, "epoch": 0.6570269051555689, "flos": 68761897511040.0, "grad_norm": 0.739052266291955, "language_loss": 0.64493638, "learning_rate": 1.0530232403047541e-06, "loss": 0.66555071, "num_input_tokens_seen": 235932925, "router_z_loss_clip": 0.01422119, "router_z_loss_mlp": 0.21972656, "step": 10928, "time_per_iteration": 3.2010576725006104 }, { "auxiliary_loss_clip": 0.01116668, "auxiliary_loss_mlp": 0.01033312, "balance_loss_clip": 1.02006149, "balance_loss_mlp": 1.03607905, "epoch": 0.6570870284082369, "flos": 26319289495680.0, "grad_norm": 1.5476747692853643, "language_loss": 0.77788663, "learning_rate": 1.0526904712737254e-06, "loss": 0.79938644, "num_input_tokens_seen": 235952680, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 10929, "time_per_iteration": 2.5895698070526123 }, { "auxiliary_loss_clip": 0.01135614, "auxiliary_loss_mlp": 0.01031777, "balance_loss_clip": 1.0186702, "balance_loss_mlp": 1.03756309, "epoch": 0.6571471516609049, "flos": 26174749167360.0, "grad_norm": 1.5997842923655794, "language_loss": 0.65358841, "learning_rate": 1.0523577360497383e-06, "loss": 0.67526233, "num_input_tokens_seen": 235972075, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 10930, "time_per_iteration": 2.6056628227233887 }, { "auxiliary_loss_clip": 0.01143549, "auxiliary_loss_mlp": 0.01032828, "balance_loss_clip": 1.01923192, "balance_loss_mlp": 1.0364275, "epoch": 0.6572072749135728, "flos": 20376253664640.0, "grad_norm": 1.5592086069531153, "language_loss": 0.70306289, "learning_rate": 1.0520250346446654e-06, "loss": 0.72482657, "num_input_tokens_seen": 235990340, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 10931, "time_per_iteration": 2.5900397300720215 }, { "auxiliary_loss_clip": 0.01147788, "auxiliary_loss_mlp": 0.01037961, "balance_loss_clip": 1.0247823, "balance_loss_mlp": 1.03817821, "epoch": 0.6572673981662408, "flos": 17128744110720.0, "grad_norm": 1.7695206712041363, "language_loss": 0.68911296, "learning_rate": 1.0516923670703808e-06, "loss": 0.71097046, "num_input_tokens_seen": 236007470, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 10932, "time_per_iteration": 2.6492624282836914 }, { "auxiliary_loss_clip": 0.01125149, "auxiliary_loss_mlp": 0.01279509, "balance_loss_clip": 1.01967704, "balance_loss_mlp": 1.03627694, "epoch": 0.6573275214189087, "flos": 41275113281280.0, "grad_norm": 1.8307237495669075, "language_loss": 0.80225778, "learning_rate": 1.051359733338756e-06, "loss": 0.82630432, "num_input_tokens_seen": 236029030, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 10933, "time_per_iteration": 2.72979998588562 }, { "auxiliary_loss_clip": 0.01148746, "auxiliary_loss_mlp": 0.0103567, "balance_loss_clip": 1.02170479, "balance_loss_mlp": 1.0395906, "epoch": 0.6573876446715767, "flos": 22890143842560.0, "grad_norm": 1.7545224440010287, "language_loss": 0.73743731, "learning_rate": 1.0510271334616616e-06, "loss": 0.75928152, "num_input_tokens_seen": 236047160, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.734375, "step": 10934, "time_per_iteration": 2.557074785232544 }, { "auxiliary_loss_clip": 0.01117235, "auxiliary_loss_mlp": 0.01033741, "balance_loss_clip": 1.02081823, "balance_loss_mlp": 1.03745449, "epoch": 0.6574477679242448, "flos": 44018150273280.0, "grad_norm": 1.7559459289651729, "language_loss": 0.76279461, "learning_rate": 1.0506945674509693e-06, "loss": 0.78430438, "num_input_tokens_seen": 236069215, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 10935, "time_per_iteration": 2.8481502532958984 }, { "auxiliary_loss_clip": 0.01137412, "auxiliary_loss_mlp": 0.01040714, "balance_loss_clip": 1.02626014, "balance_loss_mlp": 1.03751612, "epoch": 0.6575078911769127, "flos": 24571517523840.0, "grad_norm": 1.632939499302196, "language_loss": 0.78377664, "learning_rate": 1.0503620353185443e-06, "loss": 0.80555797, "num_input_tokens_seen": 236088335, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.73046875, "step": 10936, "time_per_iteration": 2.5767593383789062 }, { "auxiliary_loss_clip": 0.01132429, "auxiliary_loss_mlp": 0.01032692, "balance_loss_clip": 1.01988888, "balance_loss_mlp": 1.03592873, "epoch": 0.6575680144295807, "flos": 20924035050240.0, "grad_norm": 1.8463678672178485, "language_loss": 0.69198531, "learning_rate": 1.0500295370762565e-06, "loss": 0.71363652, "num_input_tokens_seen": 236108540, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 10937, "time_per_iteration": 2.620258092880249 }, { "auxiliary_loss_clip": 0.01124369, "auxiliary_loss_mlp": 0.01030553, "balance_loss_clip": 1.01734507, "balance_loss_mlp": 1.03595507, "epoch": 0.6576281376822486, "flos": 10925642833920.0, "grad_norm": 2.3497766941054423, "language_loss": 0.68409431, "learning_rate": 1.0496970727359707e-06, "loss": 0.70564353, "num_input_tokens_seen": 236124495, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 10938, "time_per_iteration": 2.5890614986419678 }, { "auxiliary_loss_clip": 0.01129794, "auxiliary_loss_mlp": 0.01033289, "balance_loss_clip": 1.01927018, "balance_loss_mlp": 1.03834522, "epoch": 0.6576882609349166, "flos": 19281552819840.0, "grad_norm": 2.33991080481986, "language_loss": 0.71496814, "learning_rate": 1.049364642309552e-06, "loss": 0.73659897, "num_input_tokens_seen": 236142550, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73046875, "step": 10939, "time_per_iteration": 3.9810333251953125 }, { "auxiliary_loss_clip": 0.01131414, "auxiliary_loss_mlp": 0.01278872, "balance_loss_clip": 1.0178678, "balance_loss_mlp": 1.03857863, "epoch": 0.6577483841875845, "flos": 20220544206720.0, "grad_norm": 1.9688305625516185, "language_loss": 0.77904034, "learning_rate": 1.049032245808863e-06, "loss": 0.80314326, "num_input_tokens_seen": 236156620, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75, "step": 10940, "time_per_iteration": 2.589589834213257 }, { "auxiliary_loss_clip": 0.01149347, "auxiliary_loss_mlp": 0.01036782, "balance_loss_clip": 1.02310848, "balance_loss_mlp": 1.03842378, "epoch": 0.6578085074402525, "flos": 34751078962560.0, "grad_norm": 2.0834017561847817, "language_loss": 0.68504614, "learning_rate": 1.0486998832457676e-06, "loss": 0.70690739, "num_input_tokens_seen": 236177095, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 10941, "time_per_iteration": 2.6770057678222656 }, { "auxiliary_loss_clip": 0.01115263, "auxiliary_loss_mlp": 0.01276705, "balance_loss_clip": 1.01703238, "balance_loss_mlp": 1.03542125, "epoch": 0.6578686306929205, "flos": 23470998675840.0, "grad_norm": 5.79586502375571, "language_loss": 0.67719352, "learning_rate": 1.0483675546321267e-06, "loss": 0.70111322, "num_input_tokens_seen": 236194695, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 10942, "time_per_iteration": 2.622509479522705 }, { "auxiliary_loss_clip": 0.011418, "auxiliary_loss_mlp": 0.01038137, "balance_loss_clip": 1.02342713, "balance_loss_mlp": 1.03781092, "epoch": 0.6579287539455885, "flos": 18077073033600.0, "grad_norm": 1.9782927622881372, "language_loss": 0.72184867, "learning_rate": 1.0480352599798e-06, "loss": 0.74364811, "num_input_tokens_seen": 236213885, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 10943, "time_per_iteration": 2.5399322509765625 }, { "auxiliary_loss_clip": 0.0113491, "auxiliary_loss_mlp": 0.01029615, "balance_loss_clip": 1.01618004, "balance_loss_mlp": 1.03708458, "epoch": 0.6579888771982564, "flos": 28661383900800.0, "grad_norm": 1.5388785749806593, "language_loss": 0.59200978, "learning_rate": 1.047702999300645e-06, "loss": 0.61365503, "num_input_tokens_seen": 236237315, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 10944, "time_per_iteration": 2.680946111679077 }, { "auxiliary_loss_clip": 0.01122807, "auxiliary_loss_mlp": 0.01035538, "balance_loss_clip": 1.02147138, "balance_loss_mlp": 1.03796411, "epoch": 0.6580490004509244, "flos": 25046543911680.0, "grad_norm": 1.537121218319573, "language_loss": 0.72446132, "learning_rate": 1.0473707726065217e-06, "loss": 0.74604475, "num_input_tokens_seen": 236256345, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 10945, "time_per_iteration": 3.9291515350341797 }, { "auxiliary_loss_clip": 0.0112648, "auxiliary_loss_mlp": 0.01029015, "balance_loss_clip": 1.01696324, "balance_loss_mlp": 1.03809321, "epoch": 0.6581091237035923, "flos": 43508793461760.0, "grad_norm": 1.8574280608765452, "language_loss": 0.70407921, "learning_rate": 1.0470385799092841e-06, "loss": 0.7256341, "num_input_tokens_seen": 236281890, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.70703125, "step": 10946, "time_per_iteration": 2.713365077972412 }, { "auxiliary_loss_clip": 0.0111922, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.01913464, "balance_loss_mlp": 1.03832459, "epoch": 0.6581692469562603, "flos": 22415404763520.0, "grad_norm": 2.1099322966692076, "language_loss": 0.82707095, "learning_rate": 1.0467064212207888e-06, "loss": 0.84858984, "num_input_tokens_seen": 236298370, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 10947, "time_per_iteration": 2.5360701084136963 }, { "auxiliary_loss_clip": 0.01146985, "auxiliary_loss_mlp": 0.01276005, "balance_loss_clip": 1.01633847, "balance_loss_mlp": 1.0370512, "epoch": 0.6582293702089284, "flos": 24859772167680.0, "grad_norm": 1.5751005475749522, "language_loss": 0.76728427, "learning_rate": 1.04637429655289e-06, "loss": 0.79151416, "num_input_tokens_seen": 236317380, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 10948, "time_per_iteration": 2.5965781211853027 }, { "auxiliary_loss_clip": 0.01117543, "auxiliary_loss_mlp": 0.01030916, "balance_loss_clip": 1.01818466, "balance_loss_mlp": 1.03753424, "epoch": 0.6582894934615963, "flos": 23039676161280.0, "grad_norm": 1.6341284723587526, "language_loss": 0.79273009, "learning_rate": 1.0460422059174376e-06, "loss": 0.81421471, "num_input_tokens_seen": 236336210, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 10949, "time_per_iteration": 2.5508196353912354 }, { "auxiliary_loss_clip": 0.01038136, "auxiliary_loss_mlp": 0.01001524, "balance_loss_clip": 1.00010586, "balance_loss_mlp": 1.00741696, "epoch": 0.6583496167142643, "flos": 72551980978560.0, "grad_norm": 0.7408806550495053, "language_loss": 0.61853176, "learning_rate": 1.045710149326286e-06, "loss": 0.63892835, "num_input_tokens_seen": 236403090, "router_z_loss_clip": 0.01416016, "router_z_loss_mlp": 0.21875, "step": 10950, "time_per_iteration": 3.2231292724609375 }, { "auxiliary_loss_clip": 0.01114542, "auxiliary_loss_mlp": 0.01034928, "balance_loss_clip": 1.02294123, "balance_loss_mlp": 1.03606391, "epoch": 0.6584097399669322, "flos": 13078846592640.0, "grad_norm": 2.030673850902435, "language_loss": 0.67170167, "learning_rate": 1.0453781267912838e-06, "loss": 0.69319642, "num_input_tokens_seen": 236420475, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 10951, "time_per_iteration": 4.091194152832031 }, { "auxiliary_loss_clip": 0.01118533, "auxiliary_loss_mlp": 0.01034179, "balance_loss_clip": 1.02158487, "balance_loss_mlp": 1.03855228, "epoch": 0.6584698632196002, "flos": 28693164458880.0, "grad_norm": 1.4158252704842131, "language_loss": 0.76467466, "learning_rate": 1.0450461383242821e-06, "loss": 0.78620183, "num_input_tokens_seen": 236441915, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 10952, "time_per_iteration": 2.611037254333496 }, { "auxiliary_loss_clip": 0.01109141, "auxiliary_loss_mlp": 0.01032996, "balance_loss_clip": 1.02007961, "balance_loss_mlp": 1.0390625, "epoch": 0.6585299864722681, "flos": 14319272914560.0, "grad_norm": 1.714953426316767, "language_loss": 0.73328596, "learning_rate": 1.044714183937126e-06, "loss": 0.75470734, "num_input_tokens_seen": 236460340, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 10953, "time_per_iteration": 2.5029733180999756 }, { "auxiliary_loss_clip": 0.01135492, "auxiliary_loss_mlp": 0.01037623, "balance_loss_clip": 1.02421784, "balance_loss_mlp": 1.03718865, "epoch": 0.6585901097249361, "flos": 26797907243520.0, "grad_norm": 2.195311689759322, "language_loss": 0.7894423, "learning_rate": 1.0443822636416637e-06, "loss": 0.81117344, "num_input_tokens_seen": 236478280, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 10954, "time_per_iteration": 2.6273610591888428 }, { "auxiliary_loss_clip": 0.01130426, "auxiliary_loss_mlp": 0.01033962, "balance_loss_clip": 1.02026451, "balance_loss_mlp": 1.03933609, "epoch": 0.658650232977604, "flos": 18733124989440.0, "grad_norm": 2.0963494214771905, "language_loss": 0.69455791, "learning_rate": 1.0440503774497406e-06, "loss": 0.71620178, "num_input_tokens_seen": 236493225, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 10955, "time_per_iteration": 2.6052005290985107 }, { "auxiliary_loss_clip": 0.01123457, "auxiliary_loss_mlp": 0.01034512, "balance_loss_clip": 1.02268672, "balance_loss_mlp": 1.03688431, "epoch": 0.6587103562302721, "flos": 24753440931840.0, "grad_norm": 2.1626563722255603, "language_loss": 0.8019675, "learning_rate": 1.0437185253732006e-06, "loss": 0.82354712, "num_input_tokens_seen": 236514420, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 10956, "time_per_iteration": 2.7142481803894043 }, { "auxiliary_loss_clip": 0.01118709, "auxiliary_loss_mlp": 0.01040538, "balance_loss_clip": 1.02669227, "balance_loss_mlp": 1.03927124, "epoch": 0.65877047948294, "flos": 22346133384960.0, "grad_norm": 2.0034414318952303, "language_loss": 0.81359172, "learning_rate": 1.0433867074238856e-06, "loss": 0.83518416, "num_input_tokens_seen": 236532785, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.70703125, "step": 10957, "time_per_iteration": 2.570643424987793 }, { "auxiliary_loss_clip": 0.01126921, "auxiliary_loss_mlp": 0.01029755, "balance_loss_clip": 1.01813793, "balance_loss_mlp": 1.0397799, "epoch": 0.658830602735608, "flos": 45180542298240.0, "grad_norm": 1.58247801713919, "language_loss": 0.76083523, "learning_rate": 1.0430549236136399e-06, "loss": 0.78240204, "num_input_tokens_seen": 236553330, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 10958, "time_per_iteration": 2.775575876235962 }, { "auxiliary_loss_clip": 0.01135223, "auxiliary_loss_mlp": 0.01031267, "balance_loss_clip": 1.01889253, "balance_loss_mlp": 1.03870833, "epoch": 0.6588907259882759, "flos": 19901622326400.0, "grad_norm": 1.6363959127996983, "language_loss": 0.75126934, "learning_rate": 1.0427231739543009e-06, "loss": 0.7729342, "num_input_tokens_seen": 236572960, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 10959, "time_per_iteration": 2.63962984085083 }, { "auxiliary_loss_clip": 0.01117614, "auxiliary_loss_mlp": 0.01031341, "balance_loss_clip": 1.01862144, "balance_loss_mlp": 1.03790987, "epoch": 0.6589508492409439, "flos": 24133766474880.0, "grad_norm": 1.4989466638158937, "language_loss": 0.64811707, "learning_rate": 1.0423914584577102e-06, "loss": 0.66960663, "num_input_tokens_seen": 236594090, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 10960, "time_per_iteration": 2.5666110515594482 }, { "auxiliary_loss_clip": 0.01115794, "auxiliary_loss_mlp": 0.0103292, "balance_loss_clip": 1.01876926, "balance_loss_mlp": 1.03997731, "epoch": 0.659010972493612, "flos": 18222906251520.0, "grad_norm": 2.063445611482453, "language_loss": 0.82401788, "learning_rate": 1.0420597771357042e-06, "loss": 0.845505, "num_input_tokens_seen": 236610190, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7578125, "step": 10961, "time_per_iteration": 2.515186309814453 }, { "auxiliary_loss_clip": 0.01137708, "auxiliary_loss_mlp": 0.01029745, "balance_loss_clip": 1.01709068, "balance_loss_mlp": 1.03959036, "epoch": 0.6590710957462799, "flos": 27600007898880.0, "grad_norm": 2.2356946366546895, "language_loss": 0.73698437, "learning_rate": 1.041728130000122e-06, "loss": 0.75865889, "num_input_tokens_seen": 236631575, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 10962, "time_per_iteration": 2.646449327468872 }, { "auxiliary_loss_clip": 0.0111949, "auxiliary_loss_mlp": 0.01032243, "balance_loss_clip": 1.01925468, "balance_loss_mlp": 1.03710628, "epoch": 0.6591312189989479, "flos": 20302959962880.0, "grad_norm": 1.7214928006648105, "language_loss": 0.79799128, "learning_rate": 1.0413965170627976e-06, "loss": 0.81950861, "num_input_tokens_seen": 236649815, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.73046875, "step": 10963, "time_per_iteration": 2.5349626541137695 }, { "auxiliary_loss_clip": 0.01142352, "auxiliary_loss_mlp": 0.01275324, "balance_loss_clip": 1.01586509, "balance_loss_mlp": 1.03686011, "epoch": 0.6591913422516158, "flos": 12312943868160.0, "grad_norm": 1.5266186990953299, "language_loss": 0.78284264, "learning_rate": 1.0410649383355648e-06, "loss": 0.80701935, "num_input_tokens_seen": 236668335, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 10964, "time_per_iteration": 2.615671157836914 }, { "auxiliary_loss_clip": 0.01135769, "auxiliary_loss_mlp": 0.01037463, "balance_loss_clip": 1.02437401, "balance_loss_mlp": 1.03593779, "epoch": 0.6592514655042838, "flos": 25884591102720.0, "grad_norm": 1.674636437265508, "language_loss": 0.73958212, "learning_rate": 1.0407333938302589e-06, "loss": 0.76131445, "num_input_tokens_seen": 236688945, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 10965, "time_per_iteration": 2.5873334407806396 }, { "auxiliary_loss_clip": 0.01141598, "auxiliary_loss_mlp": 0.01034544, "balance_loss_clip": 1.02157378, "balance_loss_mlp": 1.03949237, "epoch": 0.6593115887569517, "flos": 14063624841600.0, "grad_norm": 1.8480716919187161, "language_loss": 0.7360661, "learning_rate": 1.0404018835587095e-06, "loss": 0.75782752, "num_input_tokens_seen": 236707055, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7578125, "step": 10966, "time_per_iteration": 2.5697598457336426 }, { "auxiliary_loss_clip": 0.01118406, "auxiliary_loss_mlp": 0.01033608, "balance_loss_clip": 1.02029872, "balance_loss_mlp": 1.03705812, "epoch": 0.6593717120096197, "flos": 24717925359360.0, "grad_norm": 1.7027563388508076, "language_loss": 0.77027673, "learning_rate": 1.040070407532749e-06, "loss": 0.79179686, "num_input_tokens_seen": 236725900, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 10967, "time_per_iteration": 2.5934884548187256 }, { "auxiliary_loss_clip": 0.01039897, "auxiliary_loss_mlp": 0.01004041, "balance_loss_clip": 1.00271797, "balance_loss_mlp": 1.00909567, "epoch": 0.6594318352622877, "flos": 55558083502080.0, "grad_norm": 0.6947899310193774, "language_loss": 0.4845714, "learning_rate": 1.0397389657642058e-06, "loss": 0.50501078, "num_input_tokens_seen": 236788415, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.21972656, "step": 10968, "time_per_iteration": 3.168135166168213 }, { "auxiliary_loss_clip": 0.01145357, "auxiliary_loss_mlp": 0.01033201, "balance_loss_clip": 1.0204041, "balance_loss_mlp": 1.03677654, "epoch": 0.6594919585149557, "flos": 17456931699840.0, "grad_norm": 1.9902149419611965, "language_loss": 0.79168397, "learning_rate": 1.0394075582649102e-06, "loss": 0.81346953, "num_input_tokens_seen": 236805155, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 10969, "time_per_iteration": 2.568363666534424 }, { "auxiliary_loss_clip": 0.0111668, "auxiliary_loss_mlp": 0.01031575, "balance_loss_clip": 1.0188731, "balance_loss_mlp": 1.03686011, "epoch": 0.6595520817676236, "flos": 18223229473920.0, "grad_norm": 2.147583724037875, "language_loss": 0.65213621, "learning_rate": 1.0390761850466864e-06, "loss": 0.67361879, "num_input_tokens_seen": 236824360, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 10970, "time_per_iteration": 2.5545690059661865 }, { "auxiliary_loss_clip": 0.01117261, "auxiliary_loss_mlp": 0.01029876, "balance_loss_clip": 1.01694775, "balance_loss_mlp": 1.036569, "epoch": 0.6596122050202916, "flos": 22199761463040.0, "grad_norm": 1.798777560411224, "language_loss": 0.7607739, "learning_rate": 1.0387448461213626e-06, "loss": 0.78224522, "num_input_tokens_seen": 236844640, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 10971, "time_per_iteration": 2.6585168838500977 }, { "auxiliary_loss_clip": 0.0115785, "auxiliary_loss_mlp": 0.01034021, "balance_loss_clip": 1.02134895, "balance_loss_mlp": 1.03920126, "epoch": 0.6596723282729595, "flos": 14173834746240.0, "grad_norm": 4.051975014997996, "language_loss": 0.70094657, "learning_rate": 1.0384135415007627e-06, "loss": 0.72286534, "num_input_tokens_seen": 236861160, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73828125, "step": 10972, "time_per_iteration": 2.630850076675415 }, { "auxiliary_loss_clip": 0.01118427, "auxiliary_loss_mlp": 0.01026482, "balance_loss_clip": 1.01349401, "balance_loss_mlp": 1.03870296, "epoch": 0.6597324515256275, "flos": 30553193410560.0, "grad_norm": 2.264359450496678, "language_loss": 0.55803502, "learning_rate": 1.0380822711967097e-06, "loss": 0.57948411, "num_input_tokens_seen": 236880465, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 10973, "time_per_iteration": 2.6515579223632812 }, { "auxiliary_loss_clip": 0.01132477, "auxiliary_loss_mlp": 0.01038749, "balance_loss_clip": 1.0252192, "balance_loss_mlp": 1.03843319, "epoch": 0.6597925747782956, "flos": 17639860688640.0, "grad_norm": 1.8881897809270713, "language_loss": 0.78557736, "learning_rate": 1.0377510352210256e-06, "loss": 0.8072896, "num_input_tokens_seen": 236897730, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.765625, "step": 10974, "time_per_iteration": 2.5990777015686035 }, { "auxiliary_loss_clip": 0.01119579, "auxiliary_loss_mlp": 0.01036453, "balance_loss_clip": 1.02336383, "balance_loss_mlp": 1.03768194, "epoch": 0.6598526980309635, "flos": 22819112697600.0, "grad_norm": 2.372402735349407, "language_loss": 0.68792844, "learning_rate": 1.0374198335855334e-06, "loss": 0.70948875, "num_input_tokens_seen": 236917300, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 10975, "time_per_iteration": 2.566807746887207 }, { "auxiliary_loss_clip": 0.01118336, "auxiliary_loss_mlp": 0.0102765, "balance_loss_clip": 1.01503205, "balance_loss_mlp": 1.03679395, "epoch": 0.6599128212836315, "flos": 21068036674560.0, "grad_norm": 1.710481593691629, "language_loss": 0.70600224, "learning_rate": 1.0370886663020498e-06, "loss": 0.72746211, "num_input_tokens_seen": 236935590, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 10976, "time_per_iteration": 2.628253698348999 }, { "auxiliary_loss_clip": 0.01123624, "auxiliary_loss_mlp": 0.01033223, "balance_loss_clip": 1.01980019, "balance_loss_mlp": 1.03615451, "epoch": 0.6599729445362994, "flos": 22163527618560.0, "grad_norm": 1.6056225230264227, "language_loss": 0.67715317, "learning_rate": 1.0367575333823953e-06, "loss": 0.69872159, "num_input_tokens_seen": 236952830, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 10977, "time_per_iteration": 2.5529234409332275 }, { "auxiliary_loss_clip": 0.01142069, "auxiliary_loss_mlp": 0.01031933, "balance_loss_clip": 1.01803863, "balance_loss_mlp": 1.03940678, "epoch": 0.6600330677889674, "flos": 18150079426560.0, "grad_norm": 1.9620956444136555, "language_loss": 0.80939847, "learning_rate": 1.0364264348383868e-06, "loss": 0.83113849, "num_input_tokens_seen": 236971930, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 10978, "time_per_iteration": 2.5058019161224365 }, { "auxiliary_loss_clip": 0.0113924, "auxiliary_loss_mlp": 0.01037086, "balance_loss_clip": 1.02321005, "balance_loss_mlp": 1.03843713, "epoch": 0.6600931910416353, "flos": 18150115340160.0, "grad_norm": 2.8947797851927906, "language_loss": 0.67631692, "learning_rate": 1.0360953706818402e-06, "loss": 0.69808018, "num_input_tokens_seen": 236989920, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.74609375, "step": 10979, "time_per_iteration": 2.5564959049224854 }, { "auxiliary_loss_clip": 0.01128573, "auxiliary_loss_mlp": 0.01030931, "balance_loss_clip": 1.01729906, "balance_loss_mlp": 1.03684711, "epoch": 0.6601533142943034, "flos": 17420733768960.0, "grad_norm": 2.1056718233104594, "language_loss": 0.7294237, "learning_rate": 1.0357643409245703e-06, "loss": 0.7510187, "num_input_tokens_seen": 237006570, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 10980, "time_per_iteration": 2.546334981918335 }, { "auxiliary_loss_clip": 0.01131107, "auxiliary_loss_mlp": 0.01032418, "balance_loss_clip": 1.02066994, "balance_loss_mlp": 1.03633153, "epoch": 0.6602134375469713, "flos": 28219574615040.0, "grad_norm": 1.6423834664038677, "language_loss": 0.72937524, "learning_rate": 1.0354333455783901e-06, "loss": 0.75101054, "num_input_tokens_seen": 237028415, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.68359375, "step": 10981, "time_per_iteration": 4.0535218715667725 }, { "auxiliary_loss_clip": 0.01148547, "auxiliary_loss_mlp": 0.01033511, "balance_loss_clip": 1.02074933, "balance_loss_mlp": 1.03792346, "epoch": 0.6602735607996393, "flos": 29418056830080.0, "grad_norm": 2.0051056289929465, "language_loss": 0.68489528, "learning_rate": 1.0351023846551141e-06, "loss": 0.70671588, "num_input_tokens_seen": 237046595, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7421875, "step": 10982, "time_per_iteration": 2.6688523292541504 }, { "auxiliary_loss_clip": 0.01117093, "auxiliary_loss_mlp": 0.01030636, "balance_loss_clip": 1.01806498, "balance_loss_mlp": 1.03841341, "epoch": 0.6603336840523072, "flos": 18588045957120.0, "grad_norm": 1.466180187482211, "language_loss": 0.69574207, "learning_rate": 1.0347714581665504e-06, "loss": 0.71721935, "num_input_tokens_seen": 237066150, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 10983, "time_per_iteration": 2.5511598587036133 }, { "auxiliary_loss_clip": 0.01118159, "auxiliary_loss_mlp": 0.01032604, "balance_loss_clip": 1.01968217, "balance_loss_mlp": 1.03737247, "epoch": 0.6603938073049752, "flos": 33254860913280.0, "grad_norm": 1.8408369356869474, "language_loss": 0.70596027, "learning_rate": 1.0344405661245117e-06, "loss": 0.72746789, "num_input_tokens_seen": 237087060, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 10984, "time_per_iteration": 2.6235971450805664 }, { "auxiliary_loss_clip": 0.01132606, "auxiliary_loss_mlp": 0.01034354, "balance_loss_clip": 1.02190876, "balance_loss_mlp": 1.03626287, "epoch": 0.6604539305576431, "flos": 17384284442880.0, "grad_norm": 4.701607773713169, "language_loss": 0.83856547, "learning_rate": 1.0341097085408041e-06, "loss": 0.8602351, "num_input_tokens_seen": 237103825, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 10985, "time_per_iteration": 2.492548704147339 }, { "auxiliary_loss_clip": 0.01120389, "auxiliary_loss_mlp": 0.01032851, "balance_loss_clip": 1.01938057, "balance_loss_mlp": 1.03833461, "epoch": 0.6605140538103111, "flos": 21251145231360.0, "grad_norm": 2.1512909691614106, "language_loss": 0.73860961, "learning_rate": 1.0337788854272385e-06, "loss": 0.76014203, "num_input_tokens_seen": 237121740, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 10986, "time_per_iteration": 2.541964530944824 }, { "auxiliary_loss_clip": 0.01116246, "auxiliary_loss_mlp": 0.01030205, "balance_loss_clip": 1.01756299, "balance_loss_mlp": 1.03792608, "epoch": 0.6605741770629792, "flos": 13881701433600.0, "grad_norm": 1.6308810710510124, "language_loss": 0.78958577, "learning_rate": 1.033448096795617e-06, "loss": 0.8110503, "num_input_tokens_seen": 237139565, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 10987, "time_per_iteration": 3.9422922134399414 }, { "auxiliary_loss_clip": 0.01050148, "auxiliary_loss_mlp": 0.00999697, "balance_loss_clip": 0.99826092, "balance_loss_mlp": 1.00997233, "epoch": 0.6606343003156471, "flos": 69316215171840.0, "grad_norm": 0.8322395776655201, "language_loss": 0.54109204, "learning_rate": 1.0331173426577477e-06, "loss": 0.56159055, "num_input_tokens_seen": 237201055, "router_z_loss_clip": 0.01434326, "router_z_loss_mlp": 0.21875, "step": 10988, "time_per_iteration": 3.2365615367889404 }, { "auxiliary_loss_clip": 0.01152492, "auxiliary_loss_mlp": 0.01034508, "balance_loss_clip": 1.02298617, "balance_loss_mlp": 1.03841186, "epoch": 0.6606944235683151, "flos": 27272394927360.0, "grad_norm": 1.6521781404452307, "language_loss": 0.77283752, "learning_rate": 1.0327866230254336e-06, "loss": 0.7947076, "num_input_tokens_seen": 237221805, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.69921875, "step": 10989, "time_per_iteration": 2.612786054611206 }, { "auxiliary_loss_clip": 0.01129607, "auxiliary_loss_mlp": 0.01037312, "balance_loss_clip": 1.02459192, "balance_loss_mlp": 1.041291, "epoch": 0.660754546820983, "flos": 13772820332160.0, "grad_norm": 2.1805030870235473, "language_loss": 0.77034831, "learning_rate": 1.0324559379104766e-06, "loss": 0.79201746, "num_input_tokens_seen": 237238270, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 10990, "time_per_iteration": 2.5269062519073486 }, { "auxiliary_loss_clip": 0.01110871, "auxiliary_loss_mlp": 0.01032327, "balance_loss_clip": 1.01894009, "balance_loss_mlp": 1.03786898, "epoch": 0.660814670073651, "flos": 15705209232000.0, "grad_norm": 2.740481495997137, "language_loss": 0.60493565, "learning_rate": 1.0321252873246774e-06, "loss": 0.62636769, "num_input_tokens_seen": 237255400, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 10991, "time_per_iteration": 2.4488656520843506 }, { "auxiliary_loss_clip": 0.0113012, "auxiliary_loss_mlp": 0.0103886, "balance_loss_clip": 1.02547288, "balance_loss_mlp": 1.03854418, "epoch": 0.6608747933263189, "flos": 20850023076480.0, "grad_norm": 2.141823101266354, "language_loss": 0.68307602, "learning_rate": 1.0317946712798388e-06, "loss": 0.7047658, "num_input_tokens_seen": 237273105, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73828125, "step": 10992, "time_per_iteration": 2.5797629356384277 }, { "auxiliary_loss_clip": 0.01120429, "auxiliary_loss_mlp": 0.010294, "balance_loss_clip": 1.01608443, "balance_loss_mlp": 1.03713763, "epoch": 0.660934916578987, "flos": 20632117219200.0, "grad_norm": 1.6736973273920082, "language_loss": 0.87545103, "learning_rate": 1.0314640897877574e-06, "loss": 0.89694929, "num_input_tokens_seen": 237292650, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 10993, "time_per_iteration": 5.583447217941284 }, { "auxiliary_loss_clip": 0.01125699, "auxiliary_loss_mlp": 0.01036226, "balance_loss_clip": 1.021438, "balance_loss_mlp": 1.03956199, "epoch": 0.6609950398316549, "flos": 25113588647040.0, "grad_norm": 1.6375790424276089, "language_loss": 0.66778928, "learning_rate": 1.0311335428602302e-06, "loss": 0.68940854, "num_input_tokens_seen": 237312865, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 10994, "time_per_iteration": 2.594970941543579 }, { "auxiliary_loss_clip": 0.01139427, "auxiliary_loss_mlp": 0.01037704, "balance_loss_clip": 1.02413225, "balance_loss_mlp": 1.03965378, "epoch": 0.6610551630843229, "flos": 18661196004480.0, "grad_norm": 1.6825789164465546, "language_loss": 0.7670207, "learning_rate": 1.0308030305090553e-06, "loss": 0.78879207, "num_input_tokens_seen": 237331210, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 10995, "time_per_iteration": 2.5477261543273926 }, { "auxiliary_loss_clip": 0.01128109, "auxiliary_loss_mlp": 0.0102851, "balance_loss_clip": 1.01667285, "balance_loss_mlp": 1.03846598, "epoch": 0.6611152863369908, "flos": 23258192549760.0, "grad_norm": 1.7741001872351214, "language_loss": 0.7431953, "learning_rate": 1.0304725527460271e-06, "loss": 0.76476151, "num_input_tokens_seen": 237349455, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.72265625, "step": 10996, "time_per_iteration": 2.5451748371124268 }, { "auxiliary_loss_clip": 0.01111335, "auxiliary_loss_mlp": 0.01035821, "balance_loss_clip": 1.02280903, "balance_loss_mlp": 1.03976011, "epoch": 0.6611754095896588, "flos": 22159720776960.0, "grad_norm": 1.7114277924715604, "language_loss": 0.69183248, "learning_rate": 1.0301421095829402e-06, "loss": 0.71330404, "num_input_tokens_seen": 237367100, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 10997, "time_per_iteration": 2.472961902618408 }, { "auxiliary_loss_clip": 0.01131363, "auxiliary_loss_mlp": 0.01032045, "balance_loss_clip": 1.01933193, "balance_loss_mlp": 1.03910255, "epoch": 0.6612355328423267, "flos": 13991228979840.0, "grad_norm": 2.0118520049934405, "language_loss": 0.68625474, "learning_rate": 1.0298117010315853e-06, "loss": 0.70788884, "num_input_tokens_seen": 237384840, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.74609375, "step": 10998, "time_per_iteration": 2.5138533115386963 }, { "auxiliary_loss_clip": 0.01041074, "auxiliary_loss_mlp": 0.01000695, "balance_loss_clip": 0.99938411, "balance_loss_mlp": 1.01007414, "epoch": 0.6612956560949947, "flos": 61453716359040.0, "grad_norm": 0.6497488208126977, "language_loss": 0.51193005, "learning_rate": 1.0294813271037569e-06, "loss": 0.53234774, "num_input_tokens_seen": 237443355, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.22070312, "step": 10999, "time_per_iteration": 3.102837562561035 }, { "auxiliary_loss_clip": 0.01112632, "auxiliary_loss_mlp": 0.01278983, "balance_loss_clip": 1.01949859, "balance_loss_mlp": 1.03776503, "epoch": 0.6613557793476627, "flos": 21616644072960.0, "grad_norm": 2.1526263415455626, "language_loss": 0.70024294, "learning_rate": 1.0291509878112416e-06, "loss": 0.72415912, "num_input_tokens_seen": 237459205, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.75, "step": 11000, "time_per_iteration": 2.553792953491211 }, { "auxiliary_loss_clip": 0.0112389, "auxiliary_loss_mlp": 0.01037393, "balance_loss_clip": 1.02546644, "balance_loss_mlp": 1.03637612, "epoch": 0.6614159026003307, "flos": 34020117192960.0, "grad_norm": 1.6429084180045346, "language_loss": 0.64859831, "learning_rate": 1.0288206831658314e-06, "loss": 0.6702112, "num_input_tokens_seen": 237483580, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 11001, "time_per_iteration": 2.679833173751831 }, { "auxiliary_loss_clip": 0.01108119, "auxiliary_loss_mlp": 0.01032866, "balance_loss_clip": 1.02016985, "balance_loss_mlp": 1.03716683, "epoch": 0.6614760258529987, "flos": 24097281235200.0, "grad_norm": 1.7035030850995005, "language_loss": 0.73113692, "learning_rate": 1.0284904131793127e-06, "loss": 0.75254679, "num_input_tokens_seen": 237502860, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 11002, "time_per_iteration": 2.560235023498535 }, { "auxiliary_loss_clip": 0.01117076, "auxiliary_loss_mlp": 0.01032758, "balance_loss_clip": 1.01973462, "balance_loss_mlp": 1.0372653, "epoch": 0.6615361491056666, "flos": 14903790935040.0, "grad_norm": 2.5212854131938855, "language_loss": 0.79095221, "learning_rate": 1.0281601778634722e-06, "loss": 0.81245053, "num_input_tokens_seen": 237521030, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 11003, "time_per_iteration": 2.5690739154815674 }, { "auxiliary_loss_clip": 0.01120923, "auxiliary_loss_mlp": 0.01039559, "balance_loss_clip": 1.02529597, "balance_loss_mlp": 1.03890014, "epoch": 0.6615962723583346, "flos": 15304877176320.0, "grad_norm": 1.7422219191075732, "language_loss": 0.68653125, "learning_rate": 1.0278299772300943e-06, "loss": 0.70813608, "num_input_tokens_seen": 237539585, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.734375, "step": 11004, "time_per_iteration": 2.5380489826202393 }, { "auxiliary_loss_clip": 0.01121422, "auxiliary_loss_mlp": 0.01036456, "balance_loss_clip": 1.02181697, "balance_loss_mlp": 1.03780484, "epoch": 0.6616563956110025, "flos": 18732586285440.0, "grad_norm": 2.0089497620023042, "language_loss": 0.69733405, "learning_rate": 1.0274998112909642e-06, "loss": 0.71891284, "num_input_tokens_seen": 237557655, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75, "step": 11005, "time_per_iteration": 2.488262176513672 }, { "auxiliary_loss_clip": 0.01125744, "auxiliary_loss_mlp": 0.01030588, "balance_loss_clip": 1.01821399, "balance_loss_mlp": 1.03751421, "epoch": 0.6617165188636706, "flos": 24495063425280.0, "grad_norm": 2.710455134016196, "language_loss": 0.78351545, "learning_rate": 1.0271696800578646e-06, "loss": 0.80507874, "num_input_tokens_seen": 237577000, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.70703125, "step": 11006, "time_per_iteration": 2.5407049655914307 }, { "auxiliary_loss_clip": 0.01118045, "auxiliary_loss_mlp": 0.01035963, "balance_loss_clip": 1.02234316, "balance_loss_mlp": 1.03768957, "epoch": 0.6617766421163385, "flos": 22379673709440.0, "grad_norm": 1.4257982471453012, "language_loss": 0.76421183, "learning_rate": 1.0268395835425767e-06, "loss": 0.78575188, "num_input_tokens_seen": 237597960, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71484375, "step": 11007, "time_per_iteration": 2.5339534282684326 }, { "auxiliary_loss_clip": 0.0111846, "auxiliary_loss_mlp": 0.01032952, "balance_loss_clip": 1.02051258, "balance_loss_mlp": 1.03916979, "epoch": 0.6618367653690065, "flos": 20850418126080.0, "grad_norm": 1.6602562113424215, "language_loss": 0.78280681, "learning_rate": 1.0265095217568806e-06, "loss": 0.80432093, "num_input_tokens_seen": 237616385, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.70703125, "step": 11008, "time_per_iteration": 2.5715324878692627 }, { "auxiliary_loss_clip": 0.01114527, "auxiliary_loss_mlp": 0.01037396, "balance_loss_clip": 1.02332318, "balance_loss_mlp": 1.03814578, "epoch": 0.6618968886216744, "flos": 17712328377600.0, "grad_norm": 1.8964786110844791, "language_loss": 0.81707966, "learning_rate": 1.0261794947125556e-06, "loss": 0.83859897, "num_input_tokens_seen": 237634930, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 11009, "time_per_iteration": 2.4689459800720215 }, { "auxiliary_loss_clip": 0.01049264, "auxiliary_loss_mlp": 0.01005739, "balance_loss_clip": 1.00442779, "balance_loss_mlp": 1.00898767, "epoch": 0.6619570118743424, "flos": 67035347498880.0, "grad_norm": 0.9843370996872105, "language_loss": 0.67395133, "learning_rate": 1.02584950242138e-06, "loss": 0.69450134, "num_input_tokens_seen": 237693175, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.21875, "step": 11010, "time_per_iteration": 3.1107656955718994 }, { "auxiliary_loss_clip": 0.01118355, "auxiliary_loss_mlp": 0.01034105, "balance_loss_clip": 1.02067018, "balance_loss_mlp": 1.03655481, "epoch": 0.6620171351270103, "flos": 18660908695680.0, "grad_norm": 1.8637144447490588, "language_loss": 0.70951527, "learning_rate": 1.0255195448951287e-06, "loss": 0.73103988, "num_input_tokens_seen": 237713160, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 11011, "time_per_iteration": 2.6057348251342773 }, { "auxiliary_loss_clip": 0.01119911, "auxiliary_loss_mlp": 0.01039456, "balance_loss_clip": 1.02729034, "balance_loss_mlp": 1.04000652, "epoch": 0.6620772583796783, "flos": 24170503109760.0, "grad_norm": 2.02718862527206, "language_loss": 0.72255492, "learning_rate": 1.0251896221455787e-06, "loss": 0.74414861, "num_input_tokens_seen": 237733600, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7109375, "step": 11012, "time_per_iteration": 2.5966124534606934 }, { "auxiliary_loss_clip": 0.01113384, "auxiliary_loss_mlp": 0.01030146, "balance_loss_clip": 1.01865423, "balance_loss_mlp": 1.03845477, "epoch": 0.6621373816323463, "flos": 23623547736960.0, "grad_norm": 1.5679371091648129, "language_loss": 0.79095215, "learning_rate": 1.0248597341845039e-06, "loss": 0.81238747, "num_input_tokens_seen": 237752135, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66015625, "step": 11013, "time_per_iteration": 2.5624895095825195 }, { "auxiliary_loss_clip": 0.01128228, "auxiliary_loss_mlp": 0.0102963, "balance_loss_clip": 1.01699412, "balance_loss_mlp": 1.03885162, "epoch": 0.6621975048850143, "flos": 18442212739200.0, "grad_norm": 2.004896673938156, "language_loss": 0.70249271, "learning_rate": 1.0245298810236764e-06, "loss": 0.72407126, "num_input_tokens_seen": 237770735, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 11014, "time_per_iteration": 2.55556321144104 }, { "auxiliary_loss_clip": 0.01125975, "auxiliary_loss_mlp": 0.01279096, "balance_loss_clip": 1.02055216, "balance_loss_mlp": 1.03888106, "epoch": 0.6622576281376823, "flos": 14063876236800.0, "grad_norm": 1.8430588043463196, "language_loss": 0.76814097, "learning_rate": 1.0242000626748679e-06, "loss": 0.79219168, "num_input_tokens_seen": 237789005, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69140625, "step": 11015, "time_per_iteration": 2.6465020179748535 }, { "auxiliary_loss_clip": 0.01116861, "auxiliary_loss_mlp": 0.01277788, "balance_loss_clip": 1.01795125, "balance_loss_mlp": 1.03789163, "epoch": 0.6623177513903502, "flos": 17018965169280.0, "grad_norm": 3.8526616716687356, "language_loss": 0.82703054, "learning_rate": 1.0238702791498506e-06, "loss": 0.85097694, "num_input_tokens_seen": 237807740, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 11016, "time_per_iteration": 2.5675737857818604 }, { "auxiliary_loss_clip": 0.01118626, "auxiliary_loss_mlp": 0.01032461, "balance_loss_clip": 1.01944923, "balance_loss_mlp": 1.03584206, "epoch": 0.6623778746430182, "flos": 17271021882240.0, "grad_norm": 2.09796725369912, "language_loss": 0.69597137, "learning_rate": 1.0235405304603904e-06, "loss": 0.71748227, "num_input_tokens_seen": 237826340, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 11017, "time_per_iteration": 2.5980961322784424 }, { "auxiliary_loss_clip": 0.01125389, "auxiliary_loss_mlp": 0.0103816, "balance_loss_clip": 1.02527368, "balance_loss_mlp": 1.03655171, "epoch": 0.6624379978956861, "flos": 48792688767360.0, "grad_norm": 2.0736413546218553, "language_loss": 0.77161294, "learning_rate": 1.023210816618258e-06, "loss": 0.79324841, "num_input_tokens_seen": 237848305, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11018, "time_per_iteration": 2.8575189113616943 }, { "auxiliary_loss_clip": 0.01125108, "auxiliary_loss_mlp": 0.01039178, "balance_loss_clip": 1.0254513, "balance_loss_mlp": 1.0364821, "epoch": 0.6624981211483542, "flos": 18952431477120.0, "grad_norm": 1.8095035162194864, "language_loss": 0.82864588, "learning_rate": 1.0228811376352187e-06, "loss": 0.85028881, "num_input_tokens_seen": 237867020, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7109375, "step": 11019, "time_per_iteration": 2.6182057857513428 }, { "auxiliary_loss_clip": 0.01116312, "auxiliary_loss_mlp": 0.01279092, "balance_loss_clip": 1.02017784, "balance_loss_mlp": 1.03813696, "epoch": 0.6625582444010221, "flos": 23256576437760.0, "grad_norm": 1.527409748017681, "language_loss": 0.72056556, "learning_rate": 1.022551493523038e-06, "loss": 0.74451965, "num_input_tokens_seen": 237886710, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 11020, "time_per_iteration": 2.592233419418335 }, { "auxiliary_loss_clip": 0.01132375, "auxiliary_loss_mlp": 0.01030568, "balance_loss_clip": 1.01627529, "balance_loss_mlp": 1.03911483, "epoch": 0.6626183676536901, "flos": 21394823633280.0, "grad_norm": 1.7799746363607885, "language_loss": 0.72617155, "learning_rate": 1.0222218842934799e-06, "loss": 0.74780095, "num_input_tokens_seen": 237904795, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75390625, "step": 11021, "time_per_iteration": 2.6126863956451416 }, { "auxiliary_loss_clip": 0.01129578, "auxiliary_loss_mlp": 0.01035233, "balance_loss_clip": 1.02189946, "balance_loss_mlp": 1.03896475, "epoch": 0.662678490906358, "flos": 14571293713920.0, "grad_norm": 2.2809483036125613, "language_loss": 0.83170515, "learning_rate": 1.0218923099583082e-06, "loss": 0.85335326, "num_input_tokens_seen": 237921320, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 11022, "time_per_iteration": 3.936128854751587 }, { "auxiliary_loss_clip": 0.01134697, "auxiliary_loss_mlp": 0.01034779, "balance_loss_clip": 1.02089071, "balance_loss_mlp": 1.03867149, "epoch": 0.662738614159026, "flos": 15992350554240.0, "grad_norm": 2.663018485540003, "language_loss": 0.72674656, "learning_rate": 1.0215627705292844e-06, "loss": 0.74844134, "num_input_tokens_seen": 237933525, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78515625, "step": 11023, "time_per_iteration": 2.6936028003692627 }, { "auxiliary_loss_clip": 0.01134012, "auxiliary_loss_mlp": 0.01032275, "balance_loss_clip": 1.01979983, "balance_loss_mlp": 1.03574157, "epoch": 0.6627987374116939, "flos": 19536338966400.0, "grad_norm": 1.774140166782948, "language_loss": 0.74796379, "learning_rate": 1.021233266018167e-06, "loss": 0.76962668, "num_input_tokens_seen": 237953395, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 11024, "time_per_iteration": 2.6109604835510254 }, { "auxiliary_loss_clip": 0.01137219, "auxiliary_loss_mlp": 0.0103187, "balance_loss_clip": 1.01805353, "balance_loss_mlp": 1.03852224, "epoch": 0.662858860664362, "flos": 15702838934400.0, "grad_norm": 2.3882284130734925, "language_loss": 0.70002121, "learning_rate": 1.0209037964367177e-06, "loss": 0.72171211, "num_input_tokens_seen": 237971445, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 11025, "time_per_iteration": 2.6141340732574463 }, { "auxiliary_loss_clip": 0.01112338, "auxiliary_loss_mlp": 0.01036317, "balance_loss_clip": 1.0233711, "balance_loss_mlp": 1.04071236, "epoch": 0.6629189839170299, "flos": 20154289570560.0, "grad_norm": 1.4545002621450815, "language_loss": 0.78581268, "learning_rate": 1.0205743617966932e-06, "loss": 0.80729926, "num_input_tokens_seen": 237989965, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 11026, "time_per_iteration": 2.537993907928467 }, { "auxiliary_loss_clip": 0.0111867, "auxiliary_loss_mlp": 0.01032971, "balance_loss_clip": 1.02068663, "balance_loss_mlp": 1.03964078, "epoch": 0.6629791071696979, "flos": 20915415786240.0, "grad_norm": 1.757794087897927, "language_loss": 0.75970179, "learning_rate": 1.0202449621098505e-06, "loss": 0.78121817, "num_input_tokens_seen": 238006820, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 11027, "time_per_iteration": 2.6402103900909424 }, { "auxiliary_loss_clip": 0.01128236, "auxiliary_loss_mlp": 0.01036263, "balance_loss_clip": 1.02296555, "balance_loss_mlp": 1.03842926, "epoch": 0.6630392304223659, "flos": 20846898593280.0, "grad_norm": 2.1974630709822747, "language_loss": 0.70341021, "learning_rate": 1.0199155973879442e-06, "loss": 0.72505522, "num_input_tokens_seen": 238022560, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 11028, "time_per_iteration": 2.588852643966675 }, { "auxiliary_loss_clip": 0.01115982, "auxiliary_loss_mlp": 0.01033603, "balance_loss_clip": 1.02213466, "balance_loss_mlp": 1.03591943, "epoch": 0.6630993536750338, "flos": 20995820380800.0, "grad_norm": 1.9125820749745412, "language_loss": 0.79287833, "learning_rate": 1.0195862676427297e-06, "loss": 0.81437415, "num_input_tokens_seen": 238041895, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.7109375, "step": 11029, "time_per_iteration": 3.8679275512695312 }, { "auxiliary_loss_clip": 0.01110751, "auxiliary_loss_mlp": 0.01030834, "balance_loss_clip": 1.01728559, "balance_loss_mlp": 1.0371381, "epoch": 0.6631594769277018, "flos": 18259032355200.0, "grad_norm": 2.1352376516957112, "language_loss": 0.75356591, "learning_rate": 1.0192569728859593e-06, "loss": 0.7749818, "num_input_tokens_seen": 238060445, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 11030, "time_per_iteration": 2.4822378158569336 }, { "auxiliary_loss_clip": 0.01125301, "auxiliary_loss_mlp": 0.0103238, "balance_loss_clip": 1.01881993, "balance_loss_mlp": 1.04001808, "epoch": 0.6632196001803697, "flos": 17820491207040.0, "grad_norm": 2.20612076146433, "language_loss": 0.75012982, "learning_rate": 1.018927713129385e-06, "loss": 0.77170658, "num_input_tokens_seen": 238077080, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.765625, "step": 11031, "time_per_iteration": 2.4982223510742188 }, { "auxiliary_loss_clip": 0.01119768, "auxiliary_loss_mlp": 0.01031573, "balance_loss_clip": 1.01848364, "balance_loss_mlp": 1.03699327, "epoch": 0.6632797234330378, "flos": 12670182581760.0, "grad_norm": 2.609472685558007, "language_loss": 0.74729288, "learning_rate": 1.0185984883847561e-06, "loss": 0.76880628, "num_input_tokens_seen": 238091045, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 11032, "time_per_iteration": 2.451296806335449 }, { "auxiliary_loss_clip": 0.01142532, "auxiliary_loss_mlp": 0.01031495, "balance_loss_clip": 1.01921093, "balance_loss_mlp": 1.03610659, "epoch": 0.6633398466857057, "flos": 23584728113280.0, "grad_norm": 1.7688869806479262, "language_loss": 0.80334157, "learning_rate": 1.018269298663824e-06, "loss": 0.82508183, "num_input_tokens_seen": 238110220, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.7109375, "step": 11033, "time_per_iteration": 2.661247968673706 }, { "auxiliary_loss_clip": 0.01126632, "auxiliary_loss_mlp": 0.01033188, "balance_loss_clip": 1.01987875, "balance_loss_mlp": 1.03650522, "epoch": 0.6633999699383737, "flos": 20631686256000.0, "grad_norm": 1.695062592892495, "language_loss": 0.80387968, "learning_rate": 1.017940143978334e-06, "loss": 0.8254779, "num_input_tokens_seen": 238130400, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 11034, "time_per_iteration": 2.592427968978882 }, { "auxiliary_loss_clip": 0.01115477, "auxiliary_loss_mlp": 0.01028622, "balance_loss_clip": 1.01679027, "balance_loss_mlp": 1.03710508, "epoch": 0.6634600931910416, "flos": 21797095023360.0, "grad_norm": 1.5829425535675243, "language_loss": 0.76150274, "learning_rate": 1.0176110243400348e-06, "loss": 0.78294379, "num_input_tokens_seen": 238148165, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 11035, "time_per_iteration": 5.508101463317871 }, { "auxiliary_loss_clip": 0.01136223, "auxiliary_loss_mlp": 0.01025326, "balance_loss_clip": 1.01263618, "balance_loss_mlp": 1.03588724, "epoch": 0.6635202164437096, "flos": 18732873594240.0, "grad_norm": 1.7894388148899372, "language_loss": 0.82992333, "learning_rate": 1.0172819397606714e-06, "loss": 0.8515389, "num_input_tokens_seen": 238166360, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 11036, "time_per_iteration": 2.5549750328063965 }, { "auxiliary_loss_clip": 0.01029394, "auxiliary_loss_mlp": 0.01000746, "balance_loss_clip": 0.99947625, "balance_loss_mlp": 1.00738108, "epoch": 0.6635803396963775, "flos": 60222771227520.0, "grad_norm": 0.7932445945391976, "language_loss": 0.52385026, "learning_rate": 1.0169528902519874e-06, "loss": 0.54415166, "num_input_tokens_seen": 238227630, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.22070312, "step": 11037, "time_per_iteration": 3.102621555328369 }, { "auxiliary_loss_clip": 0.01146082, "auxiliary_loss_mlp": 0.01036294, "balance_loss_clip": 1.0229429, "balance_loss_mlp": 1.03724551, "epoch": 0.6636404629490456, "flos": 29167041611520.0, "grad_norm": 1.6525510023361567, "language_loss": 0.79224014, "learning_rate": 1.016623875825726e-06, "loss": 0.81406391, "num_input_tokens_seen": 238248435, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 11038, "time_per_iteration": 2.7176661491394043 }, { "auxiliary_loss_clip": 0.01136777, "auxiliary_loss_mlp": 0.01042693, "balance_loss_clip": 1.02842975, "balance_loss_mlp": 1.03878236, "epoch": 0.6637005862017135, "flos": 38907702766080.0, "grad_norm": 4.7788417047678955, "language_loss": 0.63410509, "learning_rate": 1.0162948964936284e-06, "loss": 0.65589976, "num_input_tokens_seen": 238268755, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.71484375, "step": 11039, "time_per_iteration": 2.7080190181732178 }, { "auxiliary_loss_clip": 0.01137846, "auxiliary_loss_mlp": 0.01029188, "balance_loss_clip": 1.0162065, "balance_loss_mlp": 1.03608036, "epoch": 0.6637607094543815, "flos": 22783345729920.0, "grad_norm": 1.8949736690382573, "language_loss": 0.6373198, "learning_rate": 1.0159659522674374e-06, "loss": 0.65899026, "num_input_tokens_seen": 238290120, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.74609375, "step": 11040, "time_per_iteration": 2.5805232524871826 }, { "auxiliary_loss_clip": 0.01132693, "auxiliary_loss_mlp": 0.01033548, "balance_loss_clip": 1.02104855, "balance_loss_mlp": 1.03594542, "epoch": 0.6638208327070495, "flos": 18114096977280.0, "grad_norm": 1.970711710342361, "language_loss": 0.72137904, "learning_rate": 1.0156370431588882e-06, "loss": 0.7430414, "num_input_tokens_seen": 238309290, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 11041, "time_per_iteration": 2.5674924850463867 }, { "auxiliary_loss_clip": 0.01115806, "auxiliary_loss_mlp": 0.01039778, "balance_loss_clip": 1.02621233, "balance_loss_mlp": 1.03600645, "epoch": 0.6638809559597174, "flos": 29424880414080.0, "grad_norm": 2.0306148072150543, "language_loss": 0.61346149, "learning_rate": 1.015308169179722e-06, "loss": 0.6350174, "num_input_tokens_seen": 238327280, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.70703125, "step": 11042, "time_per_iteration": 2.603856086730957 }, { "auxiliary_loss_clip": 0.01119935, "auxiliary_loss_mlp": 0.01027255, "balance_loss_clip": 1.01313496, "balance_loss_mlp": 1.03610432, "epoch": 0.6639410792123854, "flos": 28072699902720.0, "grad_norm": 1.8710139814193276, "language_loss": 0.68368673, "learning_rate": 1.0149793303416738e-06, "loss": 0.70515871, "num_input_tokens_seen": 238346330, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 11043, "time_per_iteration": 2.567427158355713 }, { "auxiliary_loss_clip": 0.01113585, "auxiliary_loss_mlp": 0.01027346, "balance_loss_clip": 1.01631904, "balance_loss_mlp": 1.03735685, "epoch": 0.6640012024650533, "flos": 25556367600000.0, "grad_norm": 1.6349900114812288, "language_loss": 0.83869749, "learning_rate": 1.01465052665648e-06, "loss": 0.86010683, "num_input_tokens_seen": 238364650, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.671875, "step": 11044, "time_per_iteration": 2.565063238143921 }, { "auxiliary_loss_clip": 0.01133978, "auxiliary_loss_mlp": 0.01032615, "balance_loss_clip": 1.01978815, "balance_loss_mlp": 1.03604794, "epoch": 0.6640613257177214, "flos": 14866946559360.0, "grad_norm": 2.552503297911501, "language_loss": 0.69669712, "learning_rate": 1.0143217581358733e-06, "loss": 0.71836305, "num_input_tokens_seen": 238381630, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 11045, "time_per_iteration": 2.546689033508301 }, { "auxiliary_loss_clip": 0.01118552, "auxiliary_loss_mlp": 0.01027735, "balance_loss_clip": 1.01512289, "balance_loss_mlp": 1.03752923, "epoch": 0.6641214489703893, "flos": 23221096778880.0, "grad_norm": 1.8163918375580939, "language_loss": 0.64465022, "learning_rate": 1.0139930247915894e-06, "loss": 0.66611314, "num_input_tokens_seen": 238402595, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 11046, "time_per_iteration": 2.55743670463562 }, { "auxiliary_loss_clip": 0.01134858, "auxiliary_loss_mlp": 0.01028171, "balance_loss_clip": 1.01571918, "balance_loss_mlp": 1.03775239, "epoch": 0.6641815722230573, "flos": 37742617221120.0, "grad_norm": 1.4616498149043575, "language_loss": 0.71287036, "learning_rate": 1.0136643266353564e-06, "loss": 0.73450065, "num_input_tokens_seen": 238426860, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 11047, "time_per_iteration": 2.7465474605560303 }, { "auxiliary_loss_clip": 0.0115495, "auxiliary_loss_mlp": 0.01038076, "balance_loss_clip": 1.02493322, "balance_loss_mlp": 1.03958607, "epoch": 0.6642416954757252, "flos": 17931132074880.0, "grad_norm": 2.3098309247531716, "language_loss": 0.77347624, "learning_rate": 1.013335663678907e-06, "loss": 0.79540646, "num_input_tokens_seen": 238443990, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 11048, "time_per_iteration": 2.5688865184783936 }, { "auxiliary_loss_clip": 0.01056053, "auxiliary_loss_mlp": 0.01004236, "balance_loss_clip": 1.00302017, "balance_loss_mlp": 1.0076952, "epoch": 0.6643018187283932, "flos": 51995384104320.0, "grad_norm": 0.7613820635002926, "language_loss": 0.55056703, "learning_rate": 1.0130070359339693e-06, "loss": 0.57116985, "num_input_tokens_seen": 238503045, "router_z_loss_clip": 0.012146, "router_z_loss_mlp": 0.21875, "step": 11049, "time_per_iteration": 3.192763090133667 }, { "auxiliary_loss_clip": 0.01115269, "auxiliary_loss_mlp": 0.01028938, "balance_loss_clip": 1.0171541, "balance_loss_mlp": 1.03662252, "epoch": 0.6643619419810611, "flos": 30226657847040.0, "grad_norm": 1.7834233895788614, "language_loss": 0.63946658, "learning_rate": 1.012678443412273e-06, "loss": 0.66090858, "num_input_tokens_seen": 238527320, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 11050, "time_per_iteration": 2.6255507469177246 }, { "auxiliary_loss_clip": 0.01104949, "auxiliary_loss_mlp": 0.01029356, "balance_loss_clip": 1.01761377, "balance_loss_mlp": 1.03622746, "epoch": 0.6644220652337292, "flos": 22966131064320.0, "grad_norm": 2.48494302225796, "language_loss": 0.78867924, "learning_rate": 1.0123498861255417e-06, "loss": 0.81002223, "num_input_tokens_seen": 238546030, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 11051, "time_per_iteration": 2.5525424480438232 }, { "auxiliary_loss_clip": 0.01126027, "auxiliary_loss_mlp": 0.01031763, "balance_loss_clip": 1.01909697, "balance_loss_mlp": 1.0380857, "epoch": 0.6644821884863971, "flos": 21142228216320.0, "grad_norm": 1.7705600466047475, "language_loss": 0.85564172, "learning_rate": 1.0120213640855035e-06, "loss": 0.87721956, "num_input_tokens_seen": 238564175, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11052, "time_per_iteration": 2.5400917530059814 }, { "auxiliary_loss_clip": 0.01146043, "auxiliary_loss_mlp": 0.01036349, "balance_loss_clip": 1.0220679, "balance_loss_mlp": 1.03737092, "epoch": 0.6645423117390651, "flos": 20192821885440.0, "grad_norm": 2.515240245678351, "language_loss": 0.75212383, "learning_rate": 1.011692877303882e-06, "loss": 0.77394772, "num_input_tokens_seen": 238581010, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.734375, "step": 11053, "time_per_iteration": 2.589205741882324 }, { "auxiliary_loss_clip": 0.01121702, "auxiliary_loss_mlp": 0.01025711, "balance_loss_clip": 1.01381993, "balance_loss_mlp": 1.03504717, "epoch": 0.6646024349917331, "flos": 24351959640960.0, "grad_norm": 1.5348544253966283, "language_loss": 0.79580116, "learning_rate": 1.011364425792398e-06, "loss": 0.81727529, "num_input_tokens_seen": 238601365, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 11054, "time_per_iteration": 2.5910160541534424 }, { "auxiliary_loss_clip": 0.0113917, "auxiliary_loss_mlp": 0.01031082, "balance_loss_clip": 1.01935768, "balance_loss_mlp": 1.03509259, "epoch": 0.664662558244401, "flos": 18806706000000.0, "grad_norm": 1.5404822619328364, "language_loss": 0.73905843, "learning_rate": 1.0110360095627755e-06, "loss": 0.7607609, "num_input_tokens_seen": 238619850, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 11055, "time_per_iteration": 2.6522836685180664 }, { "auxiliary_loss_clip": 0.01136171, "auxiliary_loss_mlp": 0.01039761, "balance_loss_clip": 1.02606392, "balance_loss_mlp": 1.03723097, "epoch": 0.664722681497069, "flos": 18952790613120.0, "grad_norm": 2.128733330360503, "language_loss": 0.72443557, "learning_rate": 1.0107076286267329e-06, "loss": 0.74619484, "num_input_tokens_seen": 238637635, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 11056, "time_per_iteration": 2.5623908042907715 }, { "auxiliary_loss_clip": 0.01139475, "auxiliary_loss_mlp": 0.01032127, "balance_loss_clip": 1.01993835, "balance_loss_mlp": 1.03451908, "epoch": 0.6647828047497369, "flos": 19571279921280.0, "grad_norm": 2.310937431831877, "language_loss": 0.6952517, "learning_rate": 1.0103792829959919e-06, "loss": 0.71696776, "num_input_tokens_seen": 238656200, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 11057, "time_per_iteration": 2.5793232917785645 }, { "auxiliary_loss_clip": 0.01115847, "auxiliary_loss_mlp": 0.01031172, "balance_loss_clip": 1.01844037, "balance_loss_mlp": 1.03624535, "epoch": 0.664842928002405, "flos": 23149455102720.0, "grad_norm": 1.8887047840166098, "language_loss": 0.80278045, "learning_rate": 1.0100509726822671e-06, "loss": 0.82425064, "num_input_tokens_seen": 238675005, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 11058, "time_per_iteration": 2.5487253665924072 }, { "auxiliary_loss_clip": 0.01055906, "auxiliary_loss_mlp": 0.01004013, "balance_loss_clip": 1.00271928, "balance_loss_mlp": 1.00760651, "epoch": 0.6649030512550729, "flos": 65244913148160.0, "grad_norm": 0.7997617669315227, "language_loss": 0.62650704, "learning_rate": 1.0097226976972776e-06, "loss": 0.64710623, "num_input_tokens_seen": 238731425, "router_z_loss_clip": 0.01293945, "router_z_loss_mlp": 0.21875, "step": 11059, "time_per_iteration": 3.0714402198791504 }, { "auxiliary_loss_clip": 0.01112909, "auxiliary_loss_mlp": 0.01035789, "balance_loss_clip": 1.02393913, "balance_loss_mlp": 1.03531051, "epoch": 0.6649631745077409, "flos": 20194797133440.0, "grad_norm": 1.3573426303663683, "language_loss": 0.78806818, "learning_rate": 1.0093944580527374e-06, "loss": 0.80955511, "num_input_tokens_seen": 238752020, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69140625, "step": 11060, "time_per_iteration": 2.5650861263275146 }, { "auxiliary_loss_clip": 0.01135507, "auxiliary_loss_mlp": 0.01033933, "balance_loss_clip": 1.02099228, "balance_loss_mlp": 1.03767061, "epoch": 0.6650232977604088, "flos": 17238558965760.0, "grad_norm": 1.6704293188602706, "language_loss": 0.78658956, "learning_rate": 1.0090662537603612e-06, "loss": 0.80828404, "num_input_tokens_seen": 238769665, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 11061, "time_per_iteration": 2.6277918815612793 }, { "auxiliary_loss_clip": 0.01124077, "auxiliary_loss_mlp": 0.0102788, "balance_loss_clip": 1.01587546, "balance_loss_mlp": 1.03726745, "epoch": 0.6650834210130768, "flos": 10006867825920.0, "grad_norm": 2.171684078266175, "language_loss": 0.57367247, "learning_rate": 1.0087380848318603e-06, "loss": 0.59519202, "num_input_tokens_seen": 238782180, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 11062, "time_per_iteration": 2.547205686569214 }, { "auxiliary_loss_clip": 0.01119291, "auxiliary_loss_mlp": 0.0103071, "balance_loss_clip": 1.01683426, "balance_loss_mlp": 1.0357492, "epoch": 0.6651435442657447, "flos": 10452088903680.0, "grad_norm": 1.8172093753384069, "language_loss": 0.7627725, "learning_rate": 1.0084099512789493e-06, "loss": 0.78427255, "num_input_tokens_seen": 238800315, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 11063, "time_per_iteration": 2.615631103515625 }, { "auxiliary_loss_clip": 0.01152489, "auxiliary_loss_mlp": 0.0103713, "balance_loss_clip": 1.02510166, "balance_loss_mlp": 1.03789306, "epoch": 0.6652036675184128, "flos": 22344229964160.0, "grad_norm": 1.4205555679791828, "language_loss": 0.70501339, "learning_rate": 1.0080818531133343e-06, "loss": 0.72690958, "num_input_tokens_seen": 238822250, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 11064, "time_per_iteration": 4.0787975788116455 }, { "auxiliary_loss_clip": 0.01028613, "auxiliary_loss_mlp": 0.01001731, "balance_loss_clip": 1.00041962, "balance_loss_mlp": 1.00683856, "epoch": 0.6652637907710807, "flos": 52909633998720.0, "grad_norm": 0.8459366313733927, "language_loss": 0.63052917, "learning_rate": 1.0077537903467276e-06, "loss": 0.65083265, "num_input_tokens_seen": 238877190, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.21777344, "step": 11065, "time_per_iteration": 3.0850610733032227 }, { "auxiliary_loss_clip": 0.01114992, "auxiliary_loss_mlp": 0.01030657, "balance_loss_clip": 1.01840806, "balance_loss_mlp": 1.0356741, "epoch": 0.6653239140237487, "flos": 23104637907840.0, "grad_norm": 1.6574704985939717, "language_loss": 0.62067509, "learning_rate": 1.007425762990835e-06, "loss": 0.64213157, "num_input_tokens_seen": 238896010, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 11066, "time_per_iteration": 2.58512806892395 }, { "auxiliary_loss_clip": 0.0113481, "auxiliary_loss_mlp": 0.01029553, "balance_loss_clip": 1.01601028, "balance_loss_mlp": 1.03570366, "epoch": 0.6653840372764167, "flos": 25959393175680.0, "grad_norm": 1.5360925461358095, "language_loss": 0.699646, "learning_rate": 1.0070977710573654e-06, "loss": 0.72128963, "num_input_tokens_seen": 238918990, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.72265625, "step": 11067, "time_per_iteration": 2.7101950645446777 }, { "auxiliary_loss_clip": 0.01046866, "auxiliary_loss_mlp": 0.01002402, "balance_loss_clip": 1.00113285, "balance_loss_mlp": 1.00741005, "epoch": 0.6654441605290846, "flos": 66041985899520.0, "grad_norm": 0.9165324803465001, "language_loss": 0.72126597, "learning_rate": 1.0067698145580213e-06, "loss": 0.74175864, "num_input_tokens_seen": 238975735, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.21679688, "step": 11068, "time_per_iteration": 3.1654937267303467 }, { "auxiliary_loss_clip": 0.01055124, "auxiliary_loss_mlp": 0.01002745, "balance_loss_clip": 1.00132036, "balance_loss_mlp": 1.0071733, "epoch": 0.6655042837817526, "flos": 65196112521600.0, "grad_norm": 0.7046402049458218, "language_loss": 0.57802653, "learning_rate": 1.0064418935045066e-06, "loss": 0.59860516, "num_input_tokens_seen": 239042360, "router_z_loss_clip": 0.01422119, "router_z_loss_mlp": 0.21679688, "step": 11069, "time_per_iteration": 3.3268074989318848 }, { "auxiliary_loss_clip": 0.01038203, "auxiliary_loss_mlp": 0.01001652, "balance_loss_clip": 1.00038242, "balance_loss_mlp": 1.00737727, "epoch": 0.6655644070344205, "flos": 69008746752000.0, "grad_norm": 0.7566645645708837, "language_loss": 0.63548642, "learning_rate": 1.0061140079085268e-06, "loss": 0.65588498, "num_input_tokens_seen": 239109410, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.21582031, "step": 11070, "time_per_iteration": 4.590510606765747 }, { "auxiliary_loss_clip": 0.01103968, "auxiliary_loss_mlp": 0.01027061, "balance_loss_clip": 1.01500928, "balance_loss_mlp": 1.03362274, "epoch": 0.6656245302870886, "flos": 36315562809600.0, "grad_norm": 1.7678292582739839, "language_loss": 0.58750176, "learning_rate": 1.0057861577817801e-06, "loss": 0.60881209, "num_input_tokens_seen": 239135345, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.703125, "step": 11071, "time_per_iteration": 2.663057565689087 }, { "auxiliary_loss_clip": 0.01116831, "auxiliary_loss_mlp": 0.01029553, "balance_loss_clip": 1.01787055, "balance_loss_mlp": 1.03781545, "epoch": 0.6656846535397565, "flos": 21794832466560.0, "grad_norm": 4.005976617596629, "language_loss": 0.72937918, "learning_rate": 1.0054583431359686e-06, "loss": 0.75084299, "num_input_tokens_seen": 239154340, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.703125, "step": 11072, "time_per_iteration": 2.6277825832366943 }, { "auxiliary_loss_clip": 0.01115069, "auxiliary_loss_mlp": 0.0103204, "balance_loss_clip": 1.01861095, "balance_loss_mlp": 1.03738523, "epoch": 0.6657447767924245, "flos": 37487615592960.0, "grad_norm": 1.651664438188586, "language_loss": 0.705881, "learning_rate": 1.0051305639827898e-06, "loss": 0.72735208, "num_input_tokens_seen": 239177815, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69140625, "step": 11073, "time_per_iteration": 2.7690751552581787 }, { "auxiliary_loss_clip": 0.01135837, "auxiliary_loss_mlp": 0.01034014, "balance_loss_clip": 1.02109146, "balance_loss_mlp": 1.03699327, "epoch": 0.6658049000450924, "flos": 16837688206080.0, "grad_norm": 2.251996423390063, "language_loss": 0.55781996, "learning_rate": 1.0048028203339435e-06, "loss": 0.57951838, "num_input_tokens_seen": 239195735, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 11074, "time_per_iteration": 2.5893592834472656 }, { "auxiliary_loss_clip": 0.01108848, "auxiliary_loss_mlp": 0.01280412, "balance_loss_clip": 1.021281, "balance_loss_mlp": 1.03863335, "epoch": 0.6658650232977604, "flos": 33510975863040.0, "grad_norm": 1.502712056367405, "language_loss": 0.72521025, "learning_rate": 1.0044751122011233e-06, "loss": 0.74910289, "num_input_tokens_seen": 239217535, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 11075, "time_per_iteration": 2.6484274864196777 }, { "auxiliary_loss_clip": 0.01113324, "auxiliary_loss_mlp": 0.01030709, "balance_loss_clip": 1.01922905, "balance_loss_mlp": 1.03609967, "epoch": 0.6659251465504283, "flos": 37706311549440.0, "grad_norm": 1.4869916173390934, "language_loss": 0.65839028, "learning_rate": 1.0041474395960263e-06, "loss": 0.67983061, "num_input_tokens_seen": 239241975, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6875, "step": 11076, "time_per_iteration": 5.73615026473999 }, { "auxiliary_loss_clip": 0.01124328, "auxiliary_loss_mlp": 0.01035354, "balance_loss_clip": 1.02218115, "balance_loss_mlp": 1.03657913, "epoch": 0.6659852698030964, "flos": 24893420232960.0, "grad_norm": 1.6574685698270026, "language_loss": 0.75251234, "learning_rate": 1.0038198025303452e-06, "loss": 0.77410924, "num_input_tokens_seen": 239262025, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 11077, "time_per_iteration": 2.5984044075012207 }, { "auxiliary_loss_clip": 0.01123373, "auxiliary_loss_mlp": 0.01034411, "balance_loss_clip": 1.02166116, "balance_loss_mlp": 1.03484201, "epoch": 0.6660453930557643, "flos": 24352821567360.0, "grad_norm": 1.8217782544482504, "language_loss": 0.66805899, "learning_rate": 1.0034922010157734e-06, "loss": 0.68963677, "num_input_tokens_seen": 239282775, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 11078, "time_per_iteration": 2.674661636352539 }, { "auxiliary_loss_clip": 0.01107839, "auxiliary_loss_mlp": 0.01031684, "balance_loss_clip": 1.01824903, "balance_loss_mlp": 1.03583312, "epoch": 0.6661055163084323, "flos": 10597814380800.0, "grad_norm": 1.9523771354027735, "language_loss": 0.69466126, "learning_rate": 1.0031646350640005e-06, "loss": 0.71605653, "num_input_tokens_seen": 239299775, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 11079, "time_per_iteration": 2.5162439346313477 }, { "auxiliary_loss_clip": 0.01122754, "auxiliary_loss_mlp": 0.01024573, "balance_loss_clip": 1.01267588, "balance_loss_mlp": 1.03603339, "epoch": 0.6661656395611003, "flos": 24057491944320.0, "grad_norm": 1.6576584545161035, "language_loss": 0.80319071, "learning_rate": 1.0028371046867191e-06, "loss": 0.824664, "num_input_tokens_seen": 239319660, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 11080, "time_per_iteration": 2.633237600326538 }, { "auxiliary_loss_clip": 0.0112959, "auxiliary_loss_mlp": 0.01028681, "balance_loss_clip": 1.01746917, "balance_loss_mlp": 1.03605461, "epoch": 0.6662257628137682, "flos": 23036192542080.0, "grad_norm": 1.814291149926583, "language_loss": 0.78191543, "learning_rate": 1.002509609895615e-06, "loss": 0.80349815, "num_input_tokens_seen": 239339215, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.671875, "step": 11081, "time_per_iteration": 2.645974636077881 }, { "auxiliary_loss_clip": 0.01136321, "auxiliary_loss_mlp": 0.01034323, "balance_loss_clip": 1.02144814, "balance_loss_mlp": 1.03801596, "epoch": 0.6662858860664362, "flos": 24754446512640.0, "grad_norm": 1.7938390404158884, "language_loss": 0.79795921, "learning_rate": 1.002182150702378e-06, "loss": 0.81966567, "num_input_tokens_seen": 239358545, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 11082, "time_per_iteration": 2.671234369277954 }, { "auxiliary_loss_clip": 0.01132215, "auxiliary_loss_mlp": 0.01034804, "balance_loss_clip": 1.02312779, "balance_loss_mlp": 1.03577399, "epoch": 0.6663460093191041, "flos": 20009066883840.0, "grad_norm": 1.5599834180884473, "language_loss": 0.83968306, "learning_rate": 1.001854727118693e-06, "loss": 0.86135328, "num_input_tokens_seen": 239376665, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.69921875, "step": 11083, "time_per_iteration": 2.571294069290161 }, { "auxiliary_loss_clip": 0.01140173, "auxiliary_loss_mlp": 0.01031751, "balance_loss_clip": 1.01895404, "balance_loss_mlp": 1.03898287, "epoch": 0.6664061325717722, "flos": 17821389047040.0, "grad_norm": 2.5849235151584598, "language_loss": 0.85153359, "learning_rate": 1.0015273391562456e-06, "loss": 0.87325281, "num_input_tokens_seen": 239394345, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7421875, "step": 11084, "time_per_iteration": 2.560962438583374 }, { "auxiliary_loss_clip": 0.01127581, "auxiliary_loss_mlp": 0.01031454, "balance_loss_clip": 1.01830578, "balance_loss_mlp": 1.03738296, "epoch": 0.6664662558244401, "flos": 18076893465600.0, "grad_norm": 2.4555340940825086, "language_loss": 0.73200297, "learning_rate": 1.0011999868267188e-06, "loss": 0.75359333, "num_input_tokens_seen": 239410605, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 11085, "time_per_iteration": 2.530992269515991 }, { "auxiliary_loss_clip": 0.01123489, "auxiliary_loss_mlp": 0.01032989, "balance_loss_clip": 1.02034128, "balance_loss_mlp": 1.03563142, "epoch": 0.6665263790771081, "flos": 21574197175680.0, "grad_norm": 1.8525694446980794, "language_loss": 0.80706388, "learning_rate": 1.0008726701417946e-06, "loss": 0.82862866, "num_input_tokens_seen": 239427155, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11086, "time_per_iteration": 2.6276233196258545 }, { "auxiliary_loss_clip": 0.01124561, "auxiliary_loss_mlp": 0.01034344, "balance_loss_clip": 1.02145708, "balance_loss_mlp": 1.03709102, "epoch": 0.666586502329776, "flos": 24206629213440.0, "grad_norm": 2.0495738964933103, "language_loss": 0.74408782, "learning_rate": 1.0005453891131562e-06, "loss": 0.76567686, "num_input_tokens_seen": 239445510, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 11087, "time_per_iteration": 2.6182029247283936 }, { "auxiliary_loss_clip": 0.01114701, "auxiliary_loss_mlp": 0.01030011, "balance_loss_clip": 1.01714242, "balance_loss_mlp": 1.03561282, "epoch": 0.666646625582444, "flos": 22200515648640.0, "grad_norm": 1.98863687748167, "language_loss": 0.64768434, "learning_rate": 1.0002181437524804e-06, "loss": 0.66913146, "num_input_tokens_seen": 239464805, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 11088, "time_per_iteration": 2.6215243339538574 }, { "auxiliary_loss_clip": 0.01117137, "auxiliary_loss_mlp": 0.01029512, "balance_loss_clip": 1.01660728, "balance_loss_mlp": 1.03676987, "epoch": 0.6667067488351119, "flos": 18259930195200.0, "grad_norm": 2.9529413882433833, "language_loss": 0.64353406, "learning_rate": 9.998909340714484e-07, "loss": 0.66500056, "num_input_tokens_seen": 239483890, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 11089, "time_per_iteration": 2.5530102252960205 }, { "auxiliary_loss_clip": 0.01106376, "auxiliary_loss_mlp": 0.01033777, "balance_loss_clip": 1.02193928, "balance_loss_mlp": 1.03665805, "epoch": 0.66676687208778, "flos": 17236547804160.0, "grad_norm": 2.3219220795047453, "language_loss": 0.81366587, "learning_rate": 9.995637600817359e-07, "loss": 0.83506739, "num_input_tokens_seen": 239500080, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 11090, "time_per_iteration": 2.5766890048980713 }, { "auxiliary_loss_clip": 0.01127055, "auxiliary_loss_mlp": 0.01033855, "balance_loss_clip": 1.02095044, "balance_loss_mlp": 1.03713489, "epoch": 0.6668269953404479, "flos": 19752197748480.0, "grad_norm": 2.5815115316615045, "language_loss": 0.77640152, "learning_rate": 9.992366217950197e-07, "loss": 0.79801065, "num_input_tokens_seen": 239517335, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 11091, "time_per_iteration": 2.597414255142212 }, { "auxiliary_loss_clip": 0.01106968, "auxiliary_loss_mlp": 0.01033081, "balance_loss_clip": 1.02058244, "balance_loss_mlp": 1.03655553, "epoch": 0.6668871185931159, "flos": 20558428467840.0, "grad_norm": 1.7471119330431075, "language_loss": 0.7935729, "learning_rate": 9.989095192229734e-07, "loss": 0.81497335, "num_input_tokens_seen": 239536240, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 11092, "time_per_iteration": 2.624809741973877 }, { "auxiliary_loss_clip": 0.01056598, "auxiliary_loss_mlp": 0.0100116, "balance_loss_clip": 0.99981862, "balance_loss_mlp": 1.00805962, "epoch": 0.6669472418457839, "flos": 58088167735680.0, "grad_norm": 0.7704401791501634, "language_loss": 0.57700419, "learning_rate": 9.985824523772718e-07, "loss": 0.59758174, "num_input_tokens_seen": 239598000, "router_z_loss_clip": 0.01342773, "router_z_loss_mlp": 0.21679688, "step": 11093, "time_per_iteration": 3.2896134853363037 }, { "auxiliary_loss_clip": 0.01106738, "auxiliary_loss_mlp": 0.01032677, "balance_loss_clip": 1.02036905, "balance_loss_mlp": 1.03759742, "epoch": 0.6670073650984518, "flos": 26065113880320.0, "grad_norm": 1.6341824719912246, "language_loss": 0.76919639, "learning_rate": 9.982554212695869e-07, "loss": 0.79059052, "num_input_tokens_seen": 239617650, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 11094, "time_per_iteration": 2.627232074737549 }, { "auxiliary_loss_clip": 0.0110814, "auxiliary_loss_mlp": 0.01037063, "balance_loss_clip": 1.02382529, "balance_loss_mlp": 1.03706408, "epoch": 0.6670674883511198, "flos": 32416849635840.0, "grad_norm": 1.727931021145771, "language_loss": 0.73051399, "learning_rate": 9.97928425911589e-07, "loss": 0.75196606, "num_input_tokens_seen": 239639825, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 11095, "time_per_iteration": 2.700310230255127 }, { "auxiliary_loss_clip": 0.011163, "auxiliary_loss_mlp": 0.01035763, "balance_loss_clip": 1.02384222, "balance_loss_mlp": 1.0381031, "epoch": 0.6671276116037878, "flos": 18037786533120.0, "grad_norm": 2.018813343272716, "language_loss": 0.73852193, "learning_rate": 9.97601466314947e-07, "loss": 0.76004255, "num_input_tokens_seen": 239656300, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 11096, "time_per_iteration": 2.5441064834594727 }, { "auxiliary_loss_clip": 0.01133074, "auxiliary_loss_mlp": 0.01033549, "balance_loss_clip": 1.02146149, "balance_loss_mlp": 1.03787065, "epoch": 0.6671877348564558, "flos": 23767046570880.0, "grad_norm": 1.8319173278098522, "language_loss": 0.64928746, "learning_rate": 9.97274542491332e-07, "loss": 0.67095369, "num_input_tokens_seen": 239676655, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 11097, "time_per_iteration": 2.601066827774048 }, { "auxiliary_loss_clip": 0.01117843, "auxiliary_loss_mlp": 0.01033028, "balance_loss_clip": 1.02085114, "balance_loss_mlp": 1.03664875, "epoch": 0.6672478581091237, "flos": 20918360701440.0, "grad_norm": 2.1255777839815937, "language_loss": 0.75063562, "learning_rate": 9.969476544524086e-07, "loss": 0.77214432, "num_input_tokens_seen": 239695430, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7265625, "step": 11098, "time_per_iteration": 2.507948160171509 }, { "auxiliary_loss_clip": 0.01108888, "auxiliary_loss_mlp": 0.01035065, "balance_loss_clip": 1.0227505, "balance_loss_mlp": 1.03757787, "epoch": 0.6673079813617917, "flos": 27855799626240.0, "grad_norm": 1.6822487975085643, "language_loss": 0.74393421, "learning_rate": 9.96620802209842e-07, "loss": 0.76537371, "num_input_tokens_seen": 239717070, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 11099, "time_per_iteration": 2.5714175701141357 }, { "auxiliary_loss_clip": 0.01113219, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.02225101, "balance_loss_mlp": 1.03614128, "epoch": 0.6673681046144596, "flos": 21616859554560.0, "grad_norm": 1.8227642276418439, "language_loss": 0.78413415, "learning_rate": 9.96293985775299e-07, "loss": 0.80561095, "num_input_tokens_seen": 239737105, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 11100, "time_per_iteration": 2.5537002086639404 }, { "auxiliary_loss_clip": 0.0110597, "auxiliary_loss_mlp": 0.01036324, "balance_loss_clip": 1.0239737, "balance_loss_mlp": 1.03697753, "epoch": 0.6674282278671276, "flos": 20889884194560.0, "grad_norm": 2.193921458836568, "language_loss": 0.59829873, "learning_rate": 9.95967205160442e-07, "loss": 0.61972165, "num_input_tokens_seen": 239757835, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 11101, "time_per_iteration": 2.5716350078582764 }, { "auxiliary_loss_clip": 0.01131044, "auxiliary_loss_mlp": 0.01038787, "balance_loss_clip": 1.02524507, "balance_loss_mlp": 1.03749585, "epoch": 0.6674883511197955, "flos": 23624194181760.0, "grad_norm": 1.7319993784203365, "language_loss": 0.71739566, "learning_rate": 9.956404603769327e-07, "loss": 0.73909396, "num_input_tokens_seen": 239775425, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7578125, "step": 11102, "time_per_iteration": 2.5587949752807617 }, { "auxiliary_loss_clip": 0.01117013, "auxiliary_loss_mlp": 0.01031853, "balance_loss_clip": 1.019485, "balance_loss_mlp": 1.03657603, "epoch": 0.6675484743724636, "flos": 19609668581760.0, "grad_norm": 1.7432419974917344, "language_loss": 0.84123433, "learning_rate": 9.953137514364308e-07, "loss": 0.86272293, "num_input_tokens_seen": 239794605, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 11103, "time_per_iteration": 2.5618510246276855 }, { "auxiliary_loss_clip": 0.01126529, "auxiliary_loss_mlp": 0.01026188, "balance_loss_clip": 1.01442838, "balance_loss_mlp": 1.03724706, "epoch": 0.6676085976251315, "flos": 14319452482560.0, "grad_norm": 1.794558363630564, "language_loss": 0.78034377, "learning_rate": 9.949870783505985e-07, "loss": 0.80187094, "num_input_tokens_seen": 239812135, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.71875, "step": 11104, "time_per_iteration": 2.533815383911133 }, { "auxiliary_loss_clip": 0.01147304, "auxiliary_loss_mlp": 0.01031104, "balance_loss_clip": 1.01876593, "balance_loss_mlp": 1.03855467, "epoch": 0.6676687208777995, "flos": 38104596529920.0, "grad_norm": 1.7336051084053294, "language_loss": 0.58129066, "learning_rate": 9.946604411310906e-07, "loss": 0.60307479, "num_input_tokens_seen": 239835845, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.73046875, "step": 11105, "time_per_iteration": 2.825089931488037 }, { "auxiliary_loss_clip": 0.01121754, "auxiliary_loss_mlp": 0.01034648, "balance_loss_clip": 1.02046168, "balance_loss_mlp": 1.03752935, "epoch": 0.6677288441304675, "flos": 23981576549760.0, "grad_norm": 1.8560652623084801, "language_loss": 0.7350629, "learning_rate": 9.943338397895662e-07, "loss": 0.7566269, "num_input_tokens_seen": 239853820, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75390625, "step": 11106, "time_per_iteration": 3.9238638877868652 }, { "auxiliary_loss_clip": 0.01107898, "auxiliary_loss_mlp": 0.0102968, "balance_loss_clip": 1.01693022, "balance_loss_mlp": 1.03671765, "epoch": 0.6677889673831354, "flos": 24170682677760.0, "grad_norm": 1.6181552117144915, "language_loss": 0.76922423, "learning_rate": 9.940072743376801e-07, "loss": 0.7906, "num_input_tokens_seen": 239873365, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 11107, "time_per_iteration": 2.5831732749938965 }, { "auxiliary_loss_clip": 0.01122147, "auxiliary_loss_mlp": 0.01029911, "balance_loss_clip": 1.01745927, "balance_loss_mlp": 1.03676939, "epoch": 0.6678490906358034, "flos": 22309648145280.0, "grad_norm": 2.190091516189277, "language_loss": 0.9030149, "learning_rate": 9.936807447870869e-07, "loss": 0.92453551, "num_input_tokens_seen": 239891215, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.67578125, "step": 11108, "time_per_iteration": 2.618025064468384 }, { "auxiliary_loss_clip": 0.01132647, "auxiliary_loss_mlp": 0.01026275, "balance_loss_clip": 1.01495051, "balance_loss_mlp": 1.03647065, "epoch": 0.6679092138884714, "flos": 36898752026880.0, "grad_norm": 1.43789708675623, "language_loss": 0.82898992, "learning_rate": 9.933542511494387e-07, "loss": 0.85057914, "num_input_tokens_seen": 239913490, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6953125, "step": 11109, "time_per_iteration": 2.648831844329834 }, { "auxiliary_loss_clip": 0.01125154, "auxiliary_loss_mlp": 0.01030323, "balance_loss_clip": 1.01719785, "balance_loss_mlp": 1.03650641, "epoch": 0.6679693371411394, "flos": 18150294908160.0, "grad_norm": 1.9476574443911057, "language_loss": 0.69391721, "learning_rate": 9.930277934363884e-07, "loss": 0.71547198, "num_input_tokens_seen": 239931565, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 11110, "time_per_iteration": 2.604473114013672 }, { "auxiliary_loss_clip": 0.01128046, "auxiliary_loss_mlp": 0.01034487, "balance_loss_clip": 1.02042675, "balance_loss_mlp": 1.03656363, "epoch": 0.6680294603938073, "flos": 27198167472000.0, "grad_norm": 1.642058677148687, "language_loss": 0.73419082, "learning_rate": 9.927013716595859e-07, "loss": 0.7558161, "num_input_tokens_seen": 239952395, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 11111, "time_per_iteration": 2.645444869995117 }, { "auxiliary_loss_clip": 0.01105214, "auxiliary_loss_mlp": 0.01032071, "balance_loss_clip": 1.01969123, "balance_loss_mlp": 1.03592801, "epoch": 0.6680895836464753, "flos": 21725309692800.0, "grad_norm": 2.218902380167321, "language_loss": 0.64965433, "learning_rate": 9.923749858306806e-07, "loss": 0.67102718, "num_input_tokens_seen": 239968910, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 11112, "time_per_iteration": 4.034523248672485 }, { "auxiliary_loss_clip": 0.01125989, "auxiliary_loss_mlp": 0.01031389, "balance_loss_clip": 1.01894355, "balance_loss_mlp": 1.03683043, "epoch": 0.6681497068991432, "flos": 19646477043840.0, "grad_norm": 1.8122897580161343, "language_loss": 0.63370657, "learning_rate": 9.920486359613198e-07, "loss": 0.65528035, "num_input_tokens_seen": 239987680, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 11113, "time_per_iteration": 2.576162815093994 }, { "auxiliary_loss_clip": 0.01142791, "auxiliary_loss_mlp": 0.0103121, "balance_loss_clip": 1.01933634, "balance_loss_mlp": 1.03809011, "epoch": 0.6682098301518112, "flos": 17419153570560.0, "grad_norm": 2.203166447082526, "language_loss": 0.65878475, "learning_rate": 9.917223220631506e-07, "loss": 0.68052477, "num_input_tokens_seen": 240005790, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 11114, "time_per_iteration": 2.571342706680298 }, { "auxiliary_loss_clip": 0.01116489, "auxiliary_loss_mlp": 0.01036277, "balance_loss_clip": 1.02339005, "balance_loss_mlp": 1.03633285, "epoch": 0.6682699534044791, "flos": 22599016110720.0, "grad_norm": 1.6915323134062703, "language_loss": 0.78618026, "learning_rate": 9.91396044147818e-07, "loss": 0.80770791, "num_input_tokens_seen": 240025895, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11115, "time_per_iteration": 2.6011836528778076 }, { "auxiliary_loss_clip": 0.0114172, "auxiliary_loss_mlp": 0.01275826, "balance_loss_clip": 1.01639271, "balance_loss_mlp": 1.03451824, "epoch": 0.6683300766571472, "flos": 24863686750080.0, "grad_norm": 1.8049000666499173, "language_loss": 0.79237843, "learning_rate": 9.910698022269655e-07, "loss": 0.81655389, "num_input_tokens_seen": 240044880, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 11116, "time_per_iteration": 2.6448020935058594 }, { "auxiliary_loss_clip": 0.01135649, "auxiliary_loss_mlp": 0.01033909, "balance_loss_clip": 1.02040291, "balance_loss_mlp": 1.03709996, "epoch": 0.6683901999098151, "flos": 27126633536640.0, "grad_norm": 1.6421571927710683, "language_loss": 0.79256064, "learning_rate": 9.907435963122372e-07, "loss": 0.81425625, "num_input_tokens_seen": 240065785, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 11117, "time_per_iteration": 4.215818166732788 }, { "auxiliary_loss_clip": 0.01145439, "auxiliary_loss_mlp": 0.01031529, "balance_loss_clip": 1.01726007, "balance_loss_mlp": 1.0381397, "epoch": 0.6684503231624831, "flos": 20739023072640.0, "grad_norm": 1.634664922489833, "language_loss": 0.65508258, "learning_rate": 9.904174264152738e-07, "loss": 0.67685229, "num_input_tokens_seen": 240085130, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.71875, "step": 11118, "time_per_iteration": 4.121062994003296 }, { "auxiliary_loss_clip": 0.0112321, "auxiliary_loss_mlp": 0.01031048, "balance_loss_clip": 1.01921701, "balance_loss_mlp": 1.03671312, "epoch": 0.668510446415151, "flos": 21762189982080.0, "grad_norm": 1.6755860812930436, "language_loss": 0.68489009, "learning_rate": 9.900912925477157e-07, "loss": 0.7064327, "num_input_tokens_seen": 240105495, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69140625, "step": 11119, "time_per_iteration": 2.6253156661987305 }, { "auxiliary_loss_clip": 0.01125029, "auxiliary_loss_mlp": 0.0103054, "balance_loss_clip": 1.01767695, "balance_loss_mlp": 1.03666019, "epoch": 0.668570569667819, "flos": 30191250015360.0, "grad_norm": 1.719889692547912, "language_loss": 0.67297685, "learning_rate": 9.897651947212007e-07, "loss": 0.69453251, "num_input_tokens_seen": 240125455, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 11120, "time_per_iteration": 2.7843480110168457 }, { "auxiliary_loss_clip": 0.01135435, "auxiliary_loss_mlp": 0.01031028, "balance_loss_clip": 1.0178678, "balance_loss_mlp": 1.03803813, "epoch": 0.668630692920487, "flos": 24170646764160.0, "grad_norm": 1.6182233794154857, "language_loss": 0.71928424, "learning_rate": 9.894391329473685e-07, "loss": 0.7409488, "num_input_tokens_seen": 240143870, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 11121, "time_per_iteration": 2.6448304653167725 }, { "auxiliary_loss_clip": 0.01125955, "auxiliary_loss_mlp": 0.0103851, "balance_loss_clip": 1.02475941, "balance_loss_mlp": 1.03529263, "epoch": 0.668690816173155, "flos": 17457147181440.0, "grad_norm": 3.134436004979092, "language_loss": 0.70100868, "learning_rate": 9.891131072378532e-07, "loss": 0.72265333, "num_input_tokens_seen": 240161020, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 11122, "time_per_iteration": 2.5829520225524902 }, { "auxiliary_loss_clip": 0.01152905, "auxiliary_loss_mlp": 0.01283705, "balance_loss_clip": 1.02437663, "balance_loss_mlp": 1.03772211, "epoch": 0.668750939425823, "flos": 25005102595200.0, "grad_norm": 1.6174280367620053, "language_loss": 0.71409369, "learning_rate": 9.88787117604291e-07, "loss": 0.73845977, "num_input_tokens_seen": 240179820, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 11123, "time_per_iteration": 2.739048719406128 }, { "auxiliary_loss_clip": 0.0113002, "auxiliary_loss_mlp": 0.01032608, "balance_loss_clip": 1.01979303, "balance_loss_mlp": 1.03965926, "epoch": 0.6688110626784909, "flos": 24096778444800.0, "grad_norm": 1.719411104254889, "language_loss": 0.79297727, "learning_rate": 9.884611640583158e-07, "loss": 0.81460357, "num_input_tokens_seen": 240200130, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 11124, "time_per_iteration": 2.7280125617980957 }, { "auxiliary_loss_clip": 0.01126964, "auxiliary_loss_mlp": 0.01036926, "balance_loss_clip": 1.02381921, "balance_loss_mlp": 1.03789759, "epoch": 0.6688711859311589, "flos": 21759532375680.0, "grad_norm": 1.7507481097743596, "language_loss": 0.74080986, "learning_rate": 9.881352466115596e-07, "loss": 0.76244873, "num_input_tokens_seen": 240217945, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 11125, "time_per_iteration": 2.672999143600464 }, { "auxiliary_loss_clip": 0.0114047, "auxiliary_loss_mlp": 0.01037182, "balance_loss_clip": 1.02409911, "balance_loss_mlp": 1.04097271, "epoch": 0.6689313091838268, "flos": 22929645824640.0, "grad_norm": 2.343476333938272, "language_loss": 0.66559929, "learning_rate": 9.878093652756528e-07, "loss": 0.68737578, "num_input_tokens_seen": 240237220, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 11126, "time_per_iteration": 2.6242496967315674 }, { "auxiliary_loss_clip": 0.01116698, "auxiliary_loss_mlp": 0.01027295, "balance_loss_clip": 1.01493263, "balance_loss_mlp": 1.03625154, "epoch": 0.6689914324364948, "flos": 20886149180160.0, "grad_norm": 1.5379173895149176, "language_loss": 0.71185935, "learning_rate": 9.874835200622266e-07, "loss": 0.73329926, "num_input_tokens_seen": 240256000, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7109375, "step": 11127, "time_per_iteration": 2.6464145183563232 }, { "auxiliary_loss_clip": 0.01128845, "auxiliary_loss_mlp": 0.01030736, "balance_loss_clip": 1.01688409, "balance_loss_mlp": 1.03626513, "epoch": 0.6690515556891627, "flos": 22748225207040.0, "grad_norm": 1.984050143408539, "language_loss": 0.80550003, "learning_rate": 9.871577109829101e-07, "loss": 0.82709587, "num_input_tokens_seen": 240275845, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75390625, "step": 11128, "time_per_iteration": 2.5873377323150635 }, { "auxiliary_loss_clip": 0.01109585, "auxiliary_loss_mlp": 0.01029763, "balance_loss_clip": 1.01683486, "balance_loss_mlp": 1.03716683, "epoch": 0.6691116789418308, "flos": 23331450337920.0, "grad_norm": 1.766143559019861, "language_loss": 0.80815768, "learning_rate": 9.868319380493283e-07, "loss": 0.82955122, "num_input_tokens_seen": 240294095, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 11129, "time_per_iteration": 2.608144521713257 }, { "auxiliary_loss_clip": 0.01112723, "auxiliary_loss_mlp": 0.01036036, "balance_loss_clip": 1.02441311, "balance_loss_mlp": 1.03622735, "epoch": 0.6691718021944987, "flos": 32447014081920.0, "grad_norm": 1.483117599299767, "language_loss": 0.70280933, "learning_rate": 9.865062012731088e-07, "loss": 0.72429693, "num_input_tokens_seen": 240313460, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 11130, "time_per_iteration": 2.622711181640625 }, { "auxiliary_loss_clip": 0.01124106, "auxiliary_loss_mlp": 0.01036814, "balance_loss_clip": 1.02497661, "balance_loss_mlp": 1.03829551, "epoch": 0.6692319254471667, "flos": 23731602825600.0, "grad_norm": 1.9254720472543827, "language_loss": 0.70180798, "learning_rate": 9.86180500665876e-07, "loss": 0.72341716, "num_input_tokens_seen": 240333540, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 11131, "time_per_iteration": 2.613844156265259 }, { "auxiliary_loss_clip": 0.01117448, "auxiliary_loss_mlp": 0.01033114, "balance_loss_clip": 1.0192678, "balance_loss_mlp": 1.03655386, "epoch": 0.6692920486998346, "flos": 14427902620800.0, "grad_norm": 2.2031819424984347, "language_loss": 0.65538079, "learning_rate": 9.858548362392534e-07, "loss": 0.67688644, "num_input_tokens_seen": 240350085, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 11132, "time_per_iteration": 2.5099523067474365 }, { "auxiliary_loss_clip": 0.01118357, "auxiliary_loss_mlp": 0.01031673, "balance_loss_clip": 1.01819658, "balance_loss_mlp": 1.03718376, "epoch": 0.6693521719525026, "flos": 21507475662720.0, "grad_norm": 1.605933340745644, "language_loss": 0.74730253, "learning_rate": 9.855292080048622e-07, "loss": 0.76880288, "num_input_tokens_seen": 240370015, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 11133, "time_per_iteration": 2.6095314025878906 }, { "auxiliary_loss_clip": 0.01123852, "auxiliary_loss_mlp": 0.01033266, "balance_loss_clip": 1.0190208, "balance_loss_mlp": 1.04087353, "epoch": 0.6694122952051706, "flos": 25406943022080.0, "grad_norm": 1.8571733274344229, "language_loss": 0.66526997, "learning_rate": 9.852036159743255e-07, "loss": 0.68684113, "num_input_tokens_seen": 240390770, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.74609375, "step": 11134, "time_per_iteration": 2.561732053756714 }, { "auxiliary_loss_clip": 0.01125448, "auxiliary_loss_mlp": 0.01035635, "balance_loss_clip": 1.02143109, "balance_loss_mlp": 1.04067826, "epoch": 0.6694724184578386, "flos": 25661729168640.0, "grad_norm": 2.020879825634783, "language_loss": 0.77328688, "learning_rate": 9.8487806015926e-07, "loss": 0.79489768, "num_input_tokens_seen": 240409590, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7578125, "step": 11135, "time_per_iteration": 2.577582836151123 }, { "auxiliary_loss_clip": 0.01114298, "auxiliary_loss_mlp": 0.01029505, "balance_loss_clip": 1.01679134, "balance_loss_mlp": 1.03653502, "epoch": 0.6695325417105066, "flos": 17709311635200.0, "grad_norm": 1.6334268615466034, "language_loss": 0.73668015, "learning_rate": 9.84552540571286e-07, "loss": 0.75811821, "num_input_tokens_seen": 240428180, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 11136, "time_per_iteration": 2.4939935207366943 }, { "auxiliary_loss_clip": 0.01123212, "auxiliary_loss_mlp": 0.01035558, "balance_loss_clip": 1.02151537, "balance_loss_mlp": 1.04005885, "epoch": 0.6695926649631745, "flos": 24460050643200.0, "grad_norm": 1.5978651449124646, "language_loss": 0.61964869, "learning_rate": 9.84227057222019e-07, "loss": 0.64123642, "num_input_tokens_seen": 240447815, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 11137, "time_per_iteration": 2.5749146938323975 }, { "auxiliary_loss_clip": 0.01107765, "auxiliary_loss_mlp": 0.0103301, "balance_loss_clip": 1.01999807, "balance_loss_mlp": 1.03744102, "epoch": 0.6696527882158425, "flos": 24280138396800.0, "grad_norm": 1.7066970086510542, "language_loss": 0.65426445, "learning_rate": 9.83901610123077e-07, "loss": 0.67567217, "num_input_tokens_seen": 240468635, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 11138, "time_per_iteration": 2.5434460639953613 }, { "auxiliary_loss_clip": 0.0112939, "auxiliary_loss_mlp": 0.0103361, "balance_loss_clip": 1.02140832, "balance_loss_mlp": 1.03468227, "epoch": 0.6697129114685104, "flos": 23002759958400.0, "grad_norm": 4.830556040082551, "language_loss": 0.72893757, "learning_rate": 9.835761992860711e-07, "loss": 0.7505675, "num_input_tokens_seen": 240488550, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 11139, "time_per_iteration": 2.568821907043457 }, { "auxiliary_loss_clip": 0.01130504, "auxiliary_loss_mlp": 0.01031394, "balance_loss_clip": 1.01987886, "balance_loss_mlp": 1.03647208, "epoch": 0.6697730347211784, "flos": 22638123043200.0, "grad_norm": 2.296891162266806, "language_loss": 0.70577341, "learning_rate": 9.832508247226172e-07, "loss": 0.72739238, "num_input_tokens_seen": 240508330, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 11140, "time_per_iteration": 2.558504581451416 }, { "auxiliary_loss_clip": 0.01117455, "auxiliary_loss_mlp": 0.01028608, "balance_loss_clip": 1.0151968, "balance_loss_mlp": 1.03583062, "epoch": 0.6698331579738463, "flos": 28877242682880.0, "grad_norm": 2.569144456613661, "language_loss": 0.75591475, "learning_rate": 9.829254864443258e-07, "loss": 0.77737534, "num_input_tokens_seen": 240528470, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 11141, "time_per_iteration": 2.6413493156433105 }, { "auxiliary_loss_clip": 0.01116623, "auxiliary_loss_mlp": 0.01032535, "balance_loss_clip": 1.01961267, "balance_loss_mlp": 1.0362525, "epoch": 0.6698932812265144, "flos": 24207096090240.0, "grad_norm": 1.8185738802037414, "language_loss": 0.82561809, "learning_rate": 9.826001844628075e-07, "loss": 0.84710968, "num_input_tokens_seen": 240547815, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 11142, "time_per_iteration": 2.5528669357299805 }, { "auxiliary_loss_clip": 0.01140793, "auxiliary_loss_mlp": 0.01028302, "balance_loss_clip": 1.01511109, "balance_loss_mlp": 1.0356741, "epoch": 0.6699534044791823, "flos": 22090269830400.0, "grad_norm": 1.625296601979211, "language_loss": 0.69972634, "learning_rate": 9.822749187896716e-07, "loss": 0.72141719, "num_input_tokens_seen": 240567765, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 11143, "time_per_iteration": 2.665050745010376 }, { "auxiliary_loss_clip": 0.01130476, "auxiliary_loss_mlp": 0.01278054, "balance_loss_clip": 1.01955438, "balance_loss_mlp": 1.03587282, "epoch": 0.6700135277318503, "flos": 25192377129600.0, "grad_norm": 1.3860279601628198, "language_loss": 0.69710833, "learning_rate": 9.819496894365254e-07, "loss": 0.72119361, "num_input_tokens_seen": 240590750, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 11144, "time_per_iteration": 2.6315722465515137 }, { "auxiliary_loss_clip": 0.01125229, "auxiliary_loss_mlp": 0.01031449, "balance_loss_clip": 1.01806808, "balance_loss_mlp": 1.03718376, "epoch": 0.6700736509845182, "flos": 23440187784960.0, "grad_norm": 1.5794746188025077, "language_loss": 0.7428149, "learning_rate": 9.816244964149773e-07, "loss": 0.76438165, "num_input_tokens_seen": 240608875, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.69921875, "step": 11145, "time_per_iteration": 2.5941269397735596 }, { "auxiliary_loss_clip": 0.01128273, "auxiliary_loss_mlp": 0.01030569, "balance_loss_clip": 1.01806402, "balance_loss_mlp": 1.03741384, "epoch": 0.6701337742371862, "flos": 24389953251840.0, "grad_norm": 1.4654393432249577, "language_loss": 0.70607865, "learning_rate": 9.812993397366301e-07, "loss": 0.72766703, "num_input_tokens_seen": 240628565, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.734375, "step": 11146, "time_per_iteration": 2.5626847743988037 }, { "auxiliary_loss_clip": 0.01103137, "auxiliary_loss_mlp": 0.01275144, "balance_loss_clip": 1.01634502, "balance_loss_mlp": 1.03592944, "epoch": 0.6701938974898543, "flos": 14793652857600.0, "grad_norm": 1.951518231994728, "language_loss": 0.78442419, "learning_rate": 9.809742194130895e-07, "loss": 0.80820704, "num_input_tokens_seen": 240646325, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 11147, "time_per_iteration": 3.928403854370117 }, { "auxiliary_loss_clip": 0.01118729, "auxiliary_loss_mlp": 0.01285338, "balance_loss_clip": 1.02590513, "balance_loss_mlp": 1.03942311, "epoch": 0.6702540207425222, "flos": 20154002261760.0, "grad_norm": 3.9310814926149558, "language_loss": 0.70617276, "learning_rate": 9.806491354559579e-07, "loss": 0.73021346, "num_input_tokens_seen": 240666145, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 11148, "time_per_iteration": 2.5692436695098877 }, { "auxiliary_loss_clip": 0.01124694, "auxiliary_loss_mlp": 0.01036851, "balance_loss_clip": 1.02380991, "balance_loss_mlp": 1.03534055, "epoch": 0.6703141439951902, "flos": 21214157201280.0, "grad_norm": 1.912353310705701, "language_loss": 0.70033759, "learning_rate": 9.803240878768366e-07, "loss": 0.72195303, "num_input_tokens_seen": 240685570, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 11149, "time_per_iteration": 2.5471255779266357 }, { "auxiliary_loss_clip": 0.01126183, "auxiliary_loss_mlp": 0.0102955, "balance_loss_clip": 1.01747966, "balance_loss_mlp": 1.03624046, "epoch": 0.6703742672478581, "flos": 23112538899840.0, "grad_norm": 1.7988249476697182, "language_loss": 0.73459721, "learning_rate": 9.799990766873246e-07, "loss": 0.75615448, "num_input_tokens_seen": 240706945, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.71875, "step": 11150, "time_per_iteration": 2.6121795177459717 }, { "auxiliary_loss_clip": 0.0111913, "auxiliary_loss_mlp": 0.01035791, "balance_loss_clip": 1.02310753, "balance_loss_mlp": 1.03865731, "epoch": 0.6704343905005261, "flos": 22528918719360.0, "grad_norm": 1.9835860633973745, "language_loss": 0.78803766, "learning_rate": 9.796741018990237e-07, "loss": 0.80958688, "num_input_tokens_seen": 240727990, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 11151, "time_per_iteration": 2.5373892784118652 }, { "auxiliary_loss_clip": 0.0106577, "auxiliary_loss_mlp": 0.01000927, "balance_loss_clip": 0.99966925, "balance_loss_mlp": 1.00822544, "epoch": 0.670494513753194, "flos": 64793158773120.0, "grad_norm": 0.7882753485439624, "language_loss": 0.55553603, "learning_rate": 9.79349163523528e-07, "loss": 0.57620299, "num_input_tokens_seen": 240790380, "router_z_loss_clip": 0.01257324, "router_z_loss_mlp": 0.21679688, "step": 11152, "time_per_iteration": 3.256586790084839 }, { "auxiliary_loss_clip": 0.01124936, "auxiliary_loss_mlp": 0.01036785, "balance_loss_clip": 1.02362418, "balance_loss_mlp": 1.03585672, "epoch": 0.670554637005862, "flos": 23511506238720.0, "grad_norm": 1.611789486070735, "language_loss": 0.81049025, "learning_rate": 9.790242615724358e-07, "loss": 0.83210742, "num_input_tokens_seen": 240811545, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 11153, "time_per_iteration": 3.9088969230651855 }, { "auxiliary_loss_clip": 0.01133695, "auxiliary_loss_mlp": 0.0128115, "balance_loss_clip": 1.02280676, "balance_loss_mlp": 1.03564799, "epoch": 0.67061476025853, "flos": 19463404400640.0, "grad_norm": 1.442367590177511, "language_loss": 0.76110989, "learning_rate": 9.78699396057341e-07, "loss": 0.78525835, "num_input_tokens_seen": 240831380, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.7109375, "step": 11154, "time_per_iteration": 2.521730422973633 }, { "auxiliary_loss_clip": 0.01131296, "auxiliary_loss_mlp": 0.01034286, "balance_loss_clip": 1.02136958, "balance_loss_mlp": 1.03908134, "epoch": 0.670674883511198, "flos": 20519967980160.0, "grad_norm": 1.5514652852856567, "language_loss": 0.76204902, "learning_rate": 9.783745669898388e-07, "loss": 0.78370488, "num_input_tokens_seen": 240851855, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 11155, "time_per_iteration": 2.558795928955078 }, { "auxiliary_loss_clip": 0.01109147, "auxiliary_loss_mlp": 0.01037768, "balance_loss_clip": 1.02526248, "balance_loss_mlp": 1.03810692, "epoch": 0.6707350067638659, "flos": 25483971738240.0, "grad_norm": 2.030234451301611, "language_loss": 0.82164609, "learning_rate": 9.78049774381519e-07, "loss": 0.84311521, "num_input_tokens_seen": 240869980, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 11156, "time_per_iteration": 2.5504231452941895 }, { "auxiliary_loss_clip": 0.01105592, "auxiliary_loss_mlp": 0.01030895, "balance_loss_clip": 1.01912928, "balance_loss_mlp": 1.03558934, "epoch": 0.6707951300165339, "flos": 22273450214400.0, "grad_norm": 3.007303633981176, "language_loss": 0.74725389, "learning_rate": 9.777250182439746e-07, "loss": 0.76861876, "num_input_tokens_seen": 240888680, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 11157, "time_per_iteration": 2.6313676834106445 }, { "auxiliary_loss_clip": 0.01119296, "auxiliary_loss_mlp": 0.01036591, "balance_loss_clip": 1.02350819, "balance_loss_mlp": 1.03846335, "epoch": 0.6708552532692018, "flos": 23984593292160.0, "grad_norm": 1.5432550062382686, "language_loss": 0.74060661, "learning_rate": 9.774002985887957e-07, "loss": 0.76216549, "num_input_tokens_seen": 240909050, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 11158, "time_per_iteration": 2.5795841217041016 }, { "auxiliary_loss_clip": 0.01111954, "auxiliary_loss_mlp": 0.01035324, "balance_loss_clip": 1.02229428, "balance_loss_mlp": 1.03846717, "epoch": 0.6709153765218698, "flos": 24937519155840.0, "grad_norm": 1.406522613917821, "language_loss": 0.81350911, "learning_rate": 9.770756154275681e-07, "loss": 0.83498192, "num_input_tokens_seen": 240930035, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 11159, "time_per_iteration": 4.023266792297363 }, { "auxiliary_loss_clip": 0.01124576, "auxiliary_loss_mlp": 0.01032788, "balance_loss_clip": 1.02014565, "balance_loss_mlp": 1.03563845, "epoch": 0.6709754997745379, "flos": 17530225401600.0, "grad_norm": 3.191295759561293, "language_loss": 0.7727949, "learning_rate": 9.767509687718811e-07, "loss": 0.79436857, "num_input_tokens_seen": 240948895, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 11160, "time_per_iteration": 4.064082384109497 }, { "auxiliary_loss_clip": 0.01122409, "auxiliary_loss_mlp": 0.01027837, "balance_loss_clip": 1.01598787, "balance_loss_mlp": 1.03576183, "epoch": 0.6710356230272058, "flos": 22090880361600.0, "grad_norm": 1.772664561032425, "language_loss": 0.73173243, "learning_rate": 9.764263586333195e-07, "loss": 0.75323486, "num_input_tokens_seen": 240967770, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69140625, "step": 11161, "time_per_iteration": 2.5434038639068604 }, { "auxiliary_loss_clip": 0.01129718, "auxiliary_loss_mlp": 0.01038221, "balance_loss_clip": 1.02457738, "balance_loss_mlp": 1.03761721, "epoch": 0.6710957462798738, "flos": 24206449645440.0, "grad_norm": 1.8152735991817133, "language_loss": 0.6808691, "learning_rate": 9.761017850234695e-07, "loss": 0.7025485, "num_input_tokens_seen": 240988985, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 11162, "time_per_iteration": 2.6202986240386963 }, { "auxiliary_loss_clip": 0.0112524, "auxiliary_loss_mlp": 0.01036448, "balance_loss_clip": 1.0245626, "balance_loss_mlp": 1.03686404, "epoch": 0.6711558695325417, "flos": 19093955063040.0, "grad_norm": 1.8058990102714787, "language_loss": 0.69968146, "learning_rate": 9.757772479539116e-07, "loss": 0.72129834, "num_input_tokens_seen": 241005455, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.703125, "step": 11163, "time_per_iteration": 2.4939017295837402 }, { "auxiliary_loss_clip": 0.01105911, "auxiliary_loss_mlp": 0.01031497, "balance_loss_clip": 1.01961184, "balance_loss_mlp": 1.03782773, "epoch": 0.6712159927852097, "flos": 25557875971200.0, "grad_norm": 1.5392458272042087, "language_loss": 0.75495172, "learning_rate": 9.754527474362296e-07, "loss": 0.77632576, "num_input_tokens_seen": 241026175, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 11164, "time_per_iteration": 2.560450553894043 }, { "auxiliary_loss_clip": 0.01106036, "auxiliary_loss_mlp": 0.01031115, "balance_loss_clip": 1.01866388, "balance_loss_mlp": 1.03646207, "epoch": 0.6712761160378776, "flos": 22228812587520.0, "grad_norm": 2.805223411312557, "language_loss": 0.65296733, "learning_rate": 9.751282834820039e-07, "loss": 0.67433882, "num_input_tokens_seen": 241044040, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 11165, "time_per_iteration": 2.5476584434509277 }, { "auxiliary_loss_clip": 0.01105516, "auxiliary_loss_mlp": 0.01029605, "balance_loss_clip": 1.01755261, "balance_loss_mlp": 1.03688312, "epoch": 0.6713362392905456, "flos": 22455517276800.0, "grad_norm": 1.7554580157114934, "language_loss": 0.7127862, "learning_rate": 9.74803856102813e-07, "loss": 0.73413742, "num_input_tokens_seen": 241063615, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 11166, "time_per_iteration": 2.523268222808838 }, { "auxiliary_loss_clip": 0.01104965, "auxiliary_loss_mlp": 0.01027589, "balance_loss_clip": 1.01631117, "balance_loss_mlp": 1.0371598, "epoch": 0.6713963625432136, "flos": 25630200005760.0, "grad_norm": 1.9161462066602348, "language_loss": 0.77033001, "learning_rate": 9.74479465310235e-07, "loss": 0.79165554, "num_input_tokens_seen": 241082520, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6796875, "step": 11167, "time_per_iteration": 2.566023349761963 }, { "auxiliary_loss_clip": 0.0111461, "auxiliary_loss_mlp": 0.01030346, "balance_loss_clip": 1.01840687, "balance_loss_mlp": 1.03490663, "epoch": 0.6714564857958816, "flos": 35006475640320.0, "grad_norm": 2.6019682556989197, "language_loss": 0.68739545, "learning_rate": 9.741551111158485e-07, "loss": 0.70884502, "num_input_tokens_seen": 241103505, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.70703125, "step": 11168, "time_per_iteration": 2.702897310256958 }, { "auxiliary_loss_clip": 0.01136981, "auxiliary_loss_mlp": 0.01036912, "balance_loss_clip": 1.02341127, "balance_loss_mlp": 1.03712761, "epoch": 0.6715166090485495, "flos": 26279931168000.0, "grad_norm": 1.7341465865085277, "language_loss": 0.73096317, "learning_rate": 9.738307935312257e-07, "loss": 0.75270212, "num_input_tokens_seen": 241122885, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 11169, "time_per_iteration": 2.587561845779419 }, { "auxiliary_loss_clip": 0.011165, "auxiliary_loss_mlp": 0.01035669, "balance_loss_clip": 1.02236557, "balance_loss_mlp": 1.03598809, "epoch": 0.6715767323012175, "flos": 15924156583680.0, "grad_norm": 2.08104905249275, "language_loss": 0.76200199, "learning_rate": 9.735065125679432e-07, "loss": 0.7835238, "num_input_tokens_seen": 241140865, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 11170, "time_per_iteration": 2.516273021697998 }, { "auxiliary_loss_clip": 0.01126843, "auxiliary_loss_mlp": 0.01028794, "balance_loss_clip": 1.01609218, "balance_loss_mlp": 1.0362339, "epoch": 0.6716368555538854, "flos": 17491441691520.0, "grad_norm": 8.048750765039722, "language_loss": 0.74000192, "learning_rate": 9.731822682375717e-07, "loss": 0.76155829, "num_input_tokens_seen": 241158225, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 11171, "time_per_iteration": 2.5318102836608887 }, { "auxiliary_loss_clip": 0.01138004, "auxiliary_loss_mlp": 0.01040729, "balance_loss_clip": 1.02770579, "balance_loss_mlp": 1.03890121, "epoch": 0.6716969788065534, "flos": 16761521416320.0, "grad_norm": 1.597040219142306, "language_loss": 0.86442298, "learning_rate": 9.728580605516854e-07, "loss": 0.88621032, "num_input_tokens_seen": 241175215, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 11172, "time_per_iteration": 2.550807476043701 }, { "auxiliary_loss_clip": 0.01113804, "auxiliary_loss_mlp": 0.01033801, "balance_loss_clip": 1.02072966, "balance_loss_mlp": 1.03436399, "epoch": 0.6717571020592215, "flos": 22709800632960.0, "grad_norm": 1.6385536289021236, "language_loss": 0.63591504, "learning_rate": 9.72533889521852e-07, "loss": 0.65739113, "num_input_tokens_seen": 241195250, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 11173, "time_per_iteration": 2.520836353302002 }, { "auxiliary_loss_clip": 0.01121782, "auxiliary_loss_mlp": 0.01041077, "balance_loss_clip": 1.02658081, "balance_loss_mlp": 1.03640604, "epoch": 0.6718172253118894, "flos": 18734094656640.0, "grad_norm": 2.5424358884755383, "language_loss": 0.71087813, "learning_rate": 9.722097551596404e-07, "loss": 0.73250663, "num_input_tokens_seen": 241210720, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 11174, "time_per_iteration": 2.4799656867980957 }, { "auxiliary_loss_clip": 0.01130182, "auxiliary_loss_mlp": 0.01029986, "balance_loss_clip": 1.0176661, "balance_loss_mlp": 1.034899, "epoch": 0.6718773485645574, "flos": 15632526061440.0, "grad_norm": 2.333290677542884, "language_loss": 0.68824458, "learning_rate": 9.718856574766205e-07, "loss": 0.70984632, "num_input_tokens_seen": 241227395, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 11175, "time_per_iteration": 2.5212349891662598 }, { "auxiliary_loss_clip": 0.01123814, "auxiliary_loss_mlp": 0.01037345, "balance_loss_clip": 1.0252986, "balance_loss_mlp": 1.0373466, "epoch": 0.6719374718172253, "flos": 19354774694400.0, "grad_norm": 1.7909106401514336, "language_loss": 0.73596638, "learning_rate": 9.71561596484355e-07, "loss": 0.75757796, "num_input_tokens_seen": 241246355, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 11176, "time_per_iteration": 2.532752275466919 }, { "auxiliary_loss_clip": 0.01124599, "auxiliary_loss_mlp": 0.01033158, "balance_loss_clip": 1.01989603, "balance_loss_mlp": 1.03562188, "epoch": 0.6719975950698933, "flos": 21981316901760.0, "grad_norm": 1.814832602565748, "language_loss": 0.72754157, "learning_rate": 9.712375721944117e-07, "loss": 0.74911916, "num_input_tokens_seen": 241264180, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 11177, "time_per_iteration": 2.5785980224609375 }, { "auxiliary_loss_clip": 0.01108447, "auxiliary_loss_mlp": 0.01032523, "balance_loss_clip": 1.01949382, "balance_loss_mlp": 1.03732479, "epoch": 0.6720577183225612, "flos": 25228072270080.0, "grad_norm": 1.9098669329680795, "language_loss": 0.76385152, "learning_rate": 9.709135846183531e-07, "loss": 0.78526127, "num_input_tokens_seen": 241282245, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 11178, "time_per_iteration": 2.5394673347473145 }, { "auxiliary_loss_clip": 0.01119183, "auxiliary_loss_mlp": 0.01028642, "balance_loss_clip": 1.01686406, "balance_loss_mlp": 1.0384407, "epoch": 0.6721178415752292, "flos": 16945886949120.0, "grad_norm": 1.8007915868113753, "language_loss": 0.70052004, "learning_rate": 9.705896337677418e-07, "loss": 0.72199827, "num_input_tokens_seen": 241300745, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.71875, "step": 11179, "time_per_iteration": 2.552093982696533 }, { "auxiliary_loss_clip": 0.01141373, "auxiliary_loss_mlp": 0.01029454, "balance_loss_clip": 1.01746154, "balance_loss_mlp": 1.03711534, "epoch": 0.6721779648278972, "flos": 21541375123200.0, "grad_norm": 1.5108055373185294, "language_loss": 0.7368412, "learning_rate": 9.702657196541372e-07, "loss": 0.75854945, "num_input_tokens_seen": 241319320, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 11180, "time_per_iteration": 2.5364291667938232 }, { "auxiliary_loss_clip": 0.01123642, "auxiliary_loss_mlp": 0.0103101, "balance_loss_clip": 1.01877952, "balance_loss_mlp": 1.03643417, "epoch": 0.6722380880805652, "flos": 22605444645120.0, "grad_norm": 1.5424830100318974, "language_loss": 0.75071967, "learning_rate": 9.699418422891014e-07, "loss": 0.77226621, "num_input_tokens_seen": 241342225, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 11181, "time_per_iteration": 2.582280397415161 }, { "auxiliary_loss_clip": 0.01124082, "auxiliary_loss_mlp": 0.01029411, "balance_loss_clip": 1.0175854, "balance_loss_mlp": 1.03479409, "epoch": 0.6722982113332331, "flos": 15925269905280.0, "grad_norm": 2.2602799970570033, "language_loss": 0.74377894, "learning_rate": 9.696180016841917e-07, "loss": 0.76531386, "num_input_tokens_seen": 241358240, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.71875, "step": 11182, "time_per_iteration": 2.495279550552368 }, { "auxiliary_loss_clip": 0.01112999, "auxiliary_loss_mlp": 0.01028445, "balance_loss_clip": 1.0170784, "balance_loss_mlp": 1.03586388, "epoch": 0.6723583345859011, "flos": 20596170683520.0, "grad_norm": 1.7092715327544314, "language_loss": 0.69880182, "learning_rate": 9.692941978509649e-07, "loss": 0.72021627, "num_input_tokens_seen": 241378420, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.68359375, "step": 11183, "time_per_iteration": 2.532309055328369 }, { "auxiliary_loss_clip": 0.01142478, "auxiliary_loss_mlp": 0.01032358, "balance_loss_clip": 1.01954329, "balance_loss_mlp": 1.03556514, "epoch": 0.672418457838569, "flos": 21725848396800.0, "grad_norm": 1.7822534859963512, "language_loss": 0.77545965, "learning_rate": 9.68970430800976e-07, "loss": 0.79720795, "num_input_tokens_seen": 241397185, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11184, "time_per_iteration": 2.545194387435913 }, { "auxiliary_loss_clip": 0.01135048, "auxiliary_loss_mlp": 0.01280755, "balance_loss_clip": 1.01988256, "balance_loss_mlp": 1.04037941, "epoch": 0.672478581091237, "flos": 21470379891840.0, "grad_norm": 1.9323386993692513, "language_loss": 0.65783489, "learning_rate": 9.68646700545782e-07, "loss": 0.68199289, "num_input_tokens_seen": 241415785, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76953125, "step": 11185, "time_per_iteration": 2.574862003326416 }, { "auxiliary_loss_clip": 0.01133443, "auxiliary_loss_mlp": 0.01031668, "balance_loss_clip": 1.01881146, "balance_loss_mlp": 1.03594899, "epoch": 0.6725387043439051, "flos": 30846763267200.0, "grad_norm": 1.5762939283616115, "language_loss": 0.80486417, "learning_rate": 9.683230070969328e-07, "loss": 0.82651532, "num_input_tokens_seen": 241437390, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11186, "time_per_iteration": 2.593979835510254 }, { "auxiliary_loss_clip": 0.01114669, "auxiliary_loss_mlp": 0.01032391, "balance_loss_clip": 1.0212214, "balance_loss_mlp": 1.03855085, "epoch": 0.672598827596573, "flos": 24055947659520.0, "grad_norm": 1.4153772982127322, "language_loss": 0.80526, "learning_rate": 9.679993504659823e-07, "loss": 0.82673061, "num_input_tokens_seen": 241458085, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.67578125, "step": 11187, "time_per_iteration": 2.5665225982666016 }, { "auxiliary_loss_clip": 0.01148208, "auxiliary_loss_mlp": 0.01037144, "balance_loss_clip": 1.02337575, "balance_loss_mlp": 1.03842664, "epoch": 0.672658950849241, "flos": 21871861182720.0, "grad_norm": 2.4082848754770363, "language_loss": 0.71101213, "learning_rate": 9.676757306644805e-07, "loss": 0.73286569, "num_input_tokens_seen": 241476880, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73828125, "step": 11188, "time_per_iteration": 2.5425398349761963 }, { "auxiliary_loss_clip": 0.01157661, "auxiliary_loss_mlp": 0.01031765, "balance_loss_clip": 1.01973629, "balance_loss_mlp": 1.0365932, "epoch": 0.6727190741019089, "flos": 23222102359680.0, "grad_norm": 1.9007222730900424, "language_loss": 0.7597971, "learning_rate": 9.673521477039763e-07, "loss": 0.78169131, "num_input_tokens_seen": 241496535, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 11189, "time_per_iteration": 4.054349422454834 }, { "auxiliary_loss_clip": 0.01109281, "auxiliary_loss_mlp": 0.01030825, "balance_loss_clip": 1.01656771, "balance_loss_mlp": 1.0356282, "epoch": 0.6727791973545769, "flos": 15778610674560.0, "grad_norm": 2.4222889765237445, "language_loss": 0.74767238, "learning_rate": 9.670286015960178e-07, "loss": 0.76907343, "num_input_tokens_seen": 241513465, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.734375, "step": 11190, "time_per_iteration": 2.512699842453003 }, { "auxiliary_loss_clip": 0.01150189, "auxiliary_loss_mlp": 0.01032001, "balance_loss_clip": 1.01928174, "balance_loss_mlp": 1.0351119, "epoch": 0.6728393206072448, "flos": 21249852341760.0, "grad_norm": 1.4685670938883726, "language_loss": 0.76981461, "learning_rate": 9.667050923521504e-07, "loss": 0.79163659, "num_input_tokens_seen": 241534125, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11191, "time_per_iteration": 2.6040468215942383 }, { "auxiliary_loss_clip": 0.01104413, "auxiliary_loss_mlp": 0.01028556, "balance_loss_clip": 1.01624775, "balance_loss_mlp": 1.03562522, "epoch": 0.6728994438599128, "flos": 32123279779200.0, "grad_norm": 1.9794798034416983, "language_loss": 0.86268651, "learning_rate": 9.66381619983922e-07, "loss": 0.88401622, "num_input_tokens_seen": 241556340, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 11192, "time_per_iteration": 2.5863099098205566 }, { "auxiliary_loss_clip": 0.01118517, "auxiliary_loss_mlp": 0.0103572, "balance_loss_clip": 1.02248752, "balance_loss_mlp": 1.0381341, "epoch": 0.6729595671125808, "flos": 23112359331840.0, "grad_norm": 2.109319573902116, "language_loss": 0.76436424, "learning_rate": 9.660581845028732e-07, "loss": 0.78590667, "num_input_tokens_seen": 241575185, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 11193, "time_per_iteration": 2.5613014698028564 }, { "auxiliary_loss_clip": 0.01135337, "auxiliary_loss_mlp": 0.01032593, "balance_loss_clip": 1.01984334, "balance_loss_mlp": 1.03821898, "epoch": 0.6730196903652488, "flos": 14611406227200.0, "grad_norm": 2.0974523235247147, "language_loss": 0.78736055, "learning_rate": 9.65734785920549e-07, "loss": 0.80903983, "num_input_tokens_seen": 241592970, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11194, "time_per_iteration": 2.500615119934082 }, { "auxiliary_loss_clip": 0.01118015, "auxiliary_loss_mlp": 0.01027807, "balance_loss_clip": 1.01596916, "balance_loss_mlp": 1.03691602, "epoch": 0.6730798136179167, "flos": 21105922544640.0, "grad_norm": 1.7482278510700495, "language_loss": 0.89818096, "learning_rate": 9.654114242484899e-07, "loss": 0.91963923, "num_input_tokens_seen": 241610245, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.72265625, "step": 11195, "time_per_iteration": 3.928445339202881 }, { "auxiliary_loss_clip": 0.01114001, "auxiliary_loss_mlp": 0.01033591, "balance_loss_clip": 1.02126431, "balance_loss_mlp": 1.03471696, "epoch": 0.6731399368705847, "flos": 28986267438720.0, "grad_norm": 1.821150380264299, "language_loss": 0.72200477, "learning_rate": 9.650880994982358e-07, "loss": 0.74348068, "num_input_tokens_seen": 241630350, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 11196, "time_per_iteration": 2.5452070236206055 }, { "auxiliary_loss_clip": 0.01073645, "auxiliary_loss_mlp": 0.010002, "balance_loss_clip": 0.9989242, "balance_loss_mlp": 1.00728226, "epoch": 0.6732000601232526, "flos": 64743708723840.0, "grad_norm": 0.7731011212043365, "language_loss": 0.56611919, "learning_rate": 9.647648116813245e-07, "loss": 0.58685768, "num_input_tokens_seen": 241692380, "router_z_loss_clip": 0.01275635, "router_z_loss_mlp": 0.21875, "step": 11197, "time_per_iteration": 3.090268850326538 }, { "auxiliary_loss_clip": 0.0112304, "auxiliary_loss_mlp": 0.01028903, "balance_loss_clip": 1.01708889, "balance_loss_mlp": 1.038203, "epoch": 0.6732601833759206, "flos": 17201642762880.0, "grad_norm": 1.8863999304829608, "language_loss": 0.75041914, "learning_rate": 9.64441560809295e-07, "loss": 0.77193856, "num_input_tokens_seen": 241710430, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 11198, "time_per_iteration": 2.508254289627075 }, { "auxiliary_loss_clip": 0.01117917, "auxiliary_loss_mlp": 0.01030808, "balance_loss_clip": 1.01745653, "balance_loss_mlp": 1.03729486, "epoch": 0.6733203066285887, "flos": 18658861620480.0, "grad_norm": 2.2339580875575447, "language_loss": 0.81249636, "learning_rate": 9.64118346893682e-07, "loss": 0.83398354, "num_input_tokens_seen": 241724775, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 11199, "time_per_iteration": 2.511929988861084 }, { "auxiliary_loss_clip": 0.01144412, "auxiliary_loss_mlp": 0.01033543, "balance_loss_clip": 1.02075219, "balance_loss_mlp": 1.03586638, "epoch": 0.6733804298812566, "flos": 35809330481280.0, "grad_norm": 1.7572459112898033, "language_loss": 0.71484208, "learning_rate": 9.63795169946021e-07, "loss": 0.73662162, "num_input_tokens_seen": 241744440, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 11200, "time_per_iteration": 2.6464452743530273 }, { "auxiliary_loss_clip": 0.01108545, "auxiliary_loss_mlp": 0.0103361, "balance_loss_clip": 1.02118206, "balance_loss_mlp": 1.03796792, "epoch": 0.6734405531339246, "flos": 61638833099520.0, "grad_norm": 1.5839218037262328, "language_loss": 0.64713109, "learning_rate": 9.63472029977844e-07, "loss": 0.66855264, "num_input_tokens_seen": 241771705, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 11201, "time_per_iteration": 4.403092622756958 }, { "auxiliary_loss_clip": 0.01126076, "auxiliary_loss_mlp": 0.01036085, "balance_loss_clip": 1.02273369, "balance_loss_mlp": 1.03550982, "epoch": 0.6735006763865925, "flos": 20522338277760.0, "grad_norm": 1.5556124367344302, "language_loss": 0.62974048, "learning_rate": 9.631489270006855e-07, "loss": 0.65136218, "num_input_tokens_seen": 241790830, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 11202, "time_per_iteration": 2.5458357334136963 }, { "auxiliary_loss_clip": 0.01106896, "auxiliary_loss_mlp": 0.01029884, "balance_loss_clip": 1.01806998, "balance_loss_mlp": 1.03621149, "epoch": 0.6735607996392605, "flos": 13918869031680.0, "grad_norm": 1.7321791944375453, "language_loss": 0.74801993, "learning_rate": 9.628258610260742e-07, "loss": 0.76938772, "num_input_tokens_seen": 241808165, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.70703125, "step": 11203, "time_per_iteration": 2.544010877609253 }, { "auxiliary_loss_clip": 0.01129874, "auxiliary_loss_mlp": 0.01031552, "balance_loss_clip": 1.0171392, "balance_loss_mlp": 1.03754401, "epoch": 0.6736209228919284, "flos": 18807244704000.0, "grad_norm": 1.8657190051801704, "language_loss": 0.6756283, "learning_rate": 9.625028320655387e-07, "loss": 0.69724256, "num_input_tokens_seen": 241826925, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.74609375, "step": 11204, "time_per_iteration": 2.563692569732666 }, { "auxiliary_loss_clip": 0.01123314, "auxiliary_loss_mlp": 0.0103069, "balance_loss_clip": 1.01769662, "balance_loss_mlp": 1.04015446, "epoch": 0.6736810461445965, "flos": 20373129181440.0, "grad_norm": 1.5465333496951634, "language_loss": 0.73766237, "learning_rate": 9.621798401306095e-07, "loss": 0.75920248, "num_input_tokens_seen": 241845525, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7421875, "step": 11205, "time_per_iteration": 2.5400145053863525 }, { "auxiliary_loss_clip": 0.01104267, "auxiliary_loss_mlp": 0.01034242, "balance_loss_clip": 1.02199984, "balance_loss_mlp": 1.03494906, "epoch": 0.6737411693972644, "flos": 30007530927360.0, "grad_norm": 1.5450373020488415, "language_loss": 0.71494615, "learning_rate": 9.618568852328123e-07, "loss": 0.73633122, "num_input_tokens_seen": 241866815, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 11206, "time_per_iteration": 2.577634811401367 }, { "auxiliary_loss_clip": 0.01140787, "auxiliary_loss_mlp": 0.01282721, "balance_loss_clip": 1.02190113, "balance_loss_mlp": 1.03853786, "epoch": 0.6738012926499324, "flos": 25447342844160.0, "grad_norm": 3.5941693882859584, "language_loss": 0.67694914, "learning_rate": 9.615339673836724e-07, "loss": 0.70118427, "num_input_tokens_seen": 241887050, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 11207, "time_per_iteration": 2.6239333152770996 }, { "auxiliary_loss_clip": 0.01139853, "auxiliary_loss_mlp": 0.01032879, "balance_loss_clip": 1.02079737, "balance_loss_mlp": 1.03635347, "epoch": 0.6738614159026003, "flos": 20776873029120.0, "grad_norm": 1.973964527679312, "language_loss": 0.74046469, "learning_rate": 9.612110865947133e-07, "loss": 0.76219201, "num_input_tokens_seen": 241904280, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 11208, "time_per_iteration": 2.534024953842163 }, { "auxiliary_loss_clip": 0.01116805, "auxiliary_loss_mlp": 0.01031897, "balance_loss_clip": 1.01777077, "balance_loss_mlp": 1.03566337, "epoch": 0.6739215391552683, "flos": 19566898462080.0, "grad_norm": 1.8742843477150455, "language_loss": 0.7580747, "learning_rate": 9.608882428774595e-07, "loss": 0.77956164, "num_input_tokens_seen": 241919190, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.72265625, "step": 11209, "time_per_iteration": 2.488008499145508 }, { "auxiliary_loss_clip": 0.01129162, "auxiliary_loss_mlp": 0.0102812, "balance_loss_clip": 1.01665187, "balance_loss_mlp": 1.03570497, "epoch": 0.6739816624079362, "flos": 24388193485440.0, "grad_norm": 2.1495048331547273, "language_loss": 0.66623688, "learning_rate": 9.605654362434302e-07, "loss": 0.68780971, "num_input_tokens_seen": 241940525, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 11210, "time_per_iteration": 2.6199254989624023 }, { "auxiliary_loss_clip": 0.01102376, "auxiliary_loss_mlp": 0.01028307, "balance_loss_clip": 1.01680899, "balance_loss_mlp": 1.03440011, "epoch": 0.6740417856606042, "flos": 22528164533760.0, "grad_norm": 2.0822428391493397, "language_loss": 0.79945427, "learning_rate": 9.602426667041475e-07, "loss": 0.82076108, "num_input_tokens_seen": 241959290, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 11211, "time_per_iteration": 2.5227622985839844 }, { "auxiliary_loss_clip": 0.01139393, "auxiliary_loss_mlp": 0.01031058, "balance_loss_clip": 1.01824868, "balance_loss_mlp": 1.03583479, "epoch": 0.6741019089132723, "flos": 25775458606080.0, "grad_norm": 1.3790174217628723, "language_loss": 0.76542455, "learning_rate": 9.599199342711293e-07, "loss": 0.78712904, "num_input_tokens_seen": 241980715, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 11212, "time_per_iteration": 2.602060079574585 }, { "auxiliary_loss_clip": 0.01133849, "auxiliary_loss_mlp": 0.01276082, "balance_loss_clip": 1.01598251, "balance_loss_mlp": 1.03541327, "epoch": 0.6741620321659402, "flos": 21105671149440.0, "grad_norm": 1.5369762576919277, "language_loss": 0.779782, "learning_rate": 9.595972389558932e-07, "loss": 0.80388129, "num_input_tokens_seen": 241999985, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 11213, "time_per_iteration": 2.539247989654541 }, { "auxiliary_loss_clip": 0.01145662, "auxiliary_loss_mlp": 0.01035499, "balance_loss_clip": 1.02189171, "balance_loss_mlp": 1.03692865, "epoch": 0.6742221554186082, "flos": 20740423703040.0, "grad_norm": 1.712359925225985, "language_loss": 0.67654639, "learning_rate": 9.592745807699548e-07, "loss": 0.69835806, "num_input_tokens_seen": 242018990, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 11214, "time_per_iteration": 2.556471347808838 }, { "auxiliary_loss_clip": 0.01107649, "auxiliary_loss_mlp": 0.01283068, "balance_loss_clip": 1.02368128, "balance_loss_mlp": 1.03607297, "epoch": 0.6742822786712761, "flos": 37774146384000.0, "grad_norm": 1.6421371787287127, "language_loss": 0.72969437, "learning_rate": 9.589519597248304e-07, "loss": 0.75360149, "num_input_tokens_seen": 242039340, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 11215, "time_per_iteration": 2.647658586502075 }, { "auxiliary_loss_clip": 0.01117175, "auxiliary_loss_mlp": 0.01278995, "balance_loss_clip": 1.01996589, "balance_loss_mlp": 1.036134, "epoch": 0.6743424019239441, "flos": 37263891732480.0, "grad_norm": 1.7520396384728385, "language_loss": 0.67163175, "learning_rate": 9.586293758320326e-07, "loss": 0.69559348, "num_input_tokens_seen": 242062215, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71875, "step": 11216, "time_per_iteration": 2.6849160194396973 }, { "auxiliary_loss_clip": 0.01029256, "auxiliary_loss_mlp": 0.01000726, "balance_loss_clip": 0.99937862, "balance_loss_mlp": 1.00756192, "epoch": 0.674402525176612, "flos": 65997746300160.0, "grad_norm": 0.6707639865468006, "language_loss": 0.56241494, "learning_rate": 9.583068291030736e-07, "loss": 0.5827148, "num_input_tokens_seen": 242131130, "router_z_loss_clip": 0.01348877, "router_z_loss_mlp": 0.21679688, "step": 11217, "time_per_iteration": 3.2530336380004883 }, { "auxiliary_loss_clip": 0.0111383, "auxiliary_loss_mlp": 0.01031582, "balance_loss_clip": 1.01863587, "balance_loss_mlp": 1.03542197, "epoch": 0.67446264842928, "flos": 26461208131200.0, "grad_norm": 4.132072167696597, "language_loss": 0.74509537, "learning_rate": 9.57984319549464e-07, "loss": 0.76654947, "num_input_tokens_seen": 242149720, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 11218, "time_per_iteration": 2.6629252433776855 }, { "auxiliary_loss_clip": 0.01133897, "auxiliary_loss_mlp": 0.01048689, "balance_loss_clip": 1.03428888, "balance_loss_mlp": 1.0363977, "epoch": 0.674522771681948, "flos": 23732392924800.0, "grad_norm": 1.6378845852632227, "language_loss": 0.66254878, "learning_rate": 9.576618471827143e-07, "loss": 0.68437469, "num_input_tokens_seen": 242168875, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.71875, "step": 11219, "time_per_iteration": 2.5680267810821533 }, { "auxiliary_loss_clip": 0.01135959, "auxiliary_loss_mlp": 0.01033042, "balance_loss_clip": 1.01973212, "balance_loss_mlp": 1.0353924, "epoch": 0.674582894934616, "flos": 24754338771840.0, "grad_norm": 1.6356591774326594, "language_loss": 0.7471931, "learning_rate": 9.573394120143318e-07, "loss": 0.76888311, "num_input_tokens_seen": 242188465, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 11220, "time_per_iteration": 2.6303210258483887 }, { "auxiliary_loss_clip": 0.01117035, "auxiliary_loss_mlp": 0.01032308, "balance_loss_clip": 1.01899266, "balance_loss_mlp": 1.03627062, "epoch": 0.6746430181872839, "flos": 24826626892800.0, "grad_norm": 1.5991886334439203, "language_loss": 0.70218873, "learning_rate": 9.570170140558226e-07, "loss": 0.72368217, "num_input_tokens_seen": 242208675, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 11221, "time_per_iteration": 2.647961139678955 }, { "auxiliary_loss_clip": 0.01133621, "auxiliary_loss_mlp": 0.01025658, "balance_loss_clip": 1.01383209, "balance_loss_mlp": 1.0369513, "epoch": 0.6747031414399519, "flos": 16873491087360.0, "grad_norm": 1.979465055177878, "language_loss": 0.5831756, "learning_rate": 9.56694653318695e-07, "loss": 0.6047684, "num_input_tokens_seen": 242227440, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 11222, "time_per_iteration": 2.5813510417938232 }, { "auxiliary_loss_clip": 0.01117021, "auxiliary_loss_mlp": 0.01033406, "balance_loss_clip": 1.02060294, "balance_loss_mlp": 1.03674531, "epoch": 0.6747632646926198, "flos": 22784925928320.0, "grad_norm": 1.7284201679614035, "language_loss": 0.76737392, "learning_rate": 9.563723298144499e-07, "loss": 0.78887814, "num_input_tokens_seen": 242245240, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 11223, "time_per_iteration": 2.504746437072754 }, { "auxiliary_loss_clip": 0.01119566, "auxiliary_loss_mlp": 0.01032234, "balance_loss_clip": 1.01984227, "balance_loss_mlp": 1.03838038, "epoch": 0.6748233879452878, "flos": 20046090827520.0, "grad_norm": 1.6789260321058062, "language_loss": 0.75401574, "learning_rate": 9.56050043554593e-07, "loss": 0.77553368, "num_input_tokens_seen": 242263435, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.72265625, "step": 11224, "time_per_iteration": 2.5466530323028564 }, { "auxiliary_loss_clip": 0.01138667, "auxiliary_loss_mlp": 0.01028981, "balance_loss_clip": 1.01786447, "balance_loss_mlp": 1.03583157, "epoch": 0.6748835111979558, "flos": 23002831785600.0, "grad_norm": 1.6373868216063092, "language_loss": 0.63220227, "learning_rate": 9.557277945506235e-07, "loss": 0.65387869, "num_input_tokens_seen": 242282765, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.671875, "step": 11225, "time_per_iteration": 2.5792317390441895 }, { "auxiliary_loss_clip": 0.01103135, "auxiliary_loss_mlp": 0.01278082, "balance_loss_clip": 1.0203104, "balance_loss_mlp": 1.03624988, "epoch": 0.6749436344506238, "flos": 12197311009920.0, "grad_norm": 2.5986640819375832, "language_loss": 0.63991874, "learning_rate": 9.554055828140443e-07, "loss": 0.66373092, "num_input_tokens_seen": 242298980, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.66796875, "step": 11226, "time_per_iteration": 2.5134079456329346 }, { "auxiliary_loss_clip": 0.01117154, "auxiliary_loss_mlp": 0.01032922, "balance_loss_clip": 1.02027392, "balance_loss_mlp": 1.03700709, "epoch": 0.6750037577032918, "flos": 11873720361600.0, "grad_norm": 2.4473966789021717, "language_loss": 0.71641076, "learning_rate": 9.550834083563516e-07, "loss": 0.73791152, "num_input_tokens_seen": 242315420, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 11227, "time_per_iteration": 2.4744338989257812 }, { "auxiliary_loss_clip": 0.01121654, "auxiliary_loss_mlp": 0.01029128, "balance_loss_clip": 1.01658726, "balance_loss_mlp": 1.03519726, "epoch": 0.6750638809559597, "flos": 17019611614080.0, "grad_norm": 2.066910226555307, "language_loss": 0.71707296, "learning_rate": 9.54761271189045e-07, "loss": 0.73858076, "num_input_tokens_seen": 242332805, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 11228, "time_per_iteration": 2.5305497646331787 }, { "auxiliary_loss_clip": 0.01047585, "auxiliary_loss_mlp": 0.01000682, "balance_loss_clip": 0.99933511, "balance_loss_mlp": 1.00781131, "epoch": 0.6751240042086277, "flos": 70951011891840.0, "grad_norm": 0.7592170713470626, "language_loss": 0.53326201, "learning_rate": 9.544391713236198e-07, "loss": 0.55374467, "num_input_tokens_seen": 242396160, "router_z_loss_clip": 0.01348877, "router_z_loss_mlp": 0.21679688, "step": 11229, "time_per_iteration": 3.171673536300659 }, { "auxiliary_loss_clip": 0.01114188, "auxiliary_loss_mlp": 0.01029506, "balance_loss_clip": 1.01732278, "balance_loss_mlp": 1.03560019, "epoch": 0.6751841274612956, "flos": 22675146986880.0, "grad_norm": 1.6935743665860392, "language_loss": 0.80498606, "learning_rate": 9.54117108771571e-07, "loss": 0.82642299, "num_input_tokens_seen": 242414660, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 11230, "time_per_iteration": 4.071353197097778 }, { "auxiliary_loss_clip": 0.01122267, "auxiliary_loss_mlp": 0.01031161, "balance_loss_clip": 1.01991391, "balance_loss_mlp": 1.03626847, "epoch": 0.6752442507139637, "flos": 21288636051840.0, "grad_norm": 1.539406137060372, "language_loss": 0.65757775, "learning_rate": 9.537950835443916e-07, "loss": 0.67911196, "num_input_tokens_seen": 242434225, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6796875, "step": 11231, "time_per_iteration": 2.5591633319854736 }, { "auxiliary_loss_clip": 0.01147787, "auxiliary_loss_mlp": 0.01039037, "balance_loss_clip": 1.02470827, "balance_loss_mlp": 1.03808415, "epoch": 0.6753043739666316, "flos": 28256921781120.0, "grad_norm": 2.015892630368062, "language_loss": 0.66298074, "learning_rate": 9.53473095653575e-07, "loss": 0.68484896, "num_input_tokens_seen": 242454355, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7421875, "step": 11232, "time_per_iteration": 2.5952565670013428 }, { "auxiliary_loss_clip": 0.01131354, "auxiliary_loss_mlp": 0.01026504, "balance_loss_clip": 1.01478028, "balance_loss_mlp": 1.03521097, "epoch": 0.6753644972192996, "flos": 21360349555200.0, "grad_norm": 1.7377169946840538, "language_loss": 0.72137809, "learning_rate": 9.531511451106127e-07, "loss": 0.74295664, "num_input_tokens_seen": 242474935, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 11233, "time_per_iteration": 2.5800564289093018 }, { "auxiliary_loss_clip": 0.01115925, "auxiliary_loss_mlp": 0.01034112, "balance_loss_clip": 1.02134466, "balance_loss_mlp": 1.03612864, "epoch": 0.6754246204719675, "flos": 26541971861760.0, "grad_norm": 1.5409354710636642, "language_loss": 0.7686612, "learning_rate": 9.528292319269918e-07, "loss": 0.79016161, "num_input_tokens_seen": 242495530, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 11234, "time_per_iteration": 2.5989301204681396 }, { "auxiliary_loss_clip": 0.0110649, "auxiliary_loss_mlp": 0.01033751, "balance_loss_clip": 1.02088249, "balance_loss_mlp": 1.03696406, "epoch": 0.6754847437246355, "flos": 25556690822400.0, "grad_norm": 1.5871816519099224, "language_loss": 0.75346506, "learning_rate": 9.525073561142023e-07, "loss": 0.77486748, "num_input_tokens_seen": 242514550, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 11235, "time_per_iteration": 2.5884876251220703 }, { "auxiliary_loss_clip": 0.01038283, "auxiliary_loss_mlp": 0.01003556, "balance_loss_clip": 1.00226259, "balance_loss_mlp": 1.00771546, "epoch": 0.6755448669773034, "flos": 59513318726400.0, "grad_norm": 0.7796990495258717, "language_loss": 0.51357698, "learning_rate": 9.521855176837312e-07, "loss": 0.53399539, "num_input_tokens_seen": 242569200, "router_z_loss_clip": 0.01293945, "router_z_loss_mlp": 0.21679688, "step": 11236, "time_per_iteration": 3.1302499771118164 }, { "auxiliary_loss_clip": 0.01133147, "auxiliary_loss_mlp": 0.01030272, "balance_loss_clip": 1.01597834, "balance_loss_mlp": 1.03643608, "epoch": 0.6756049902299714, "flos": 23294534135040.0, "grad_norm": 1.8844681595785242, "language_loss": 0.75420105, "learning_rate": 9.518637166470635e-07, "loss": 0.77583528, "num_input_tokens_seen": 242586950, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.703125, "step": 11237, "time_per_iteration": 3.9834296703338623 }, { "auxiliary_loss_clip": 0.01126177, "auxiliary_loss_mlp": 0.01034289, "balance_loss_clip": 1.02151537, "balance_loss_mlp": 1.03838813, "epoch": 0.6756651134826394, "flos": 31575426566400.0, "grad_norm": 1.9223325494106138, "language_loss": 0.77614719, "learning_rate": 9.515419530156828e-07, "loss": 0.79775184, "num_input_tokens_seen": 242607380, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 11238, "time_per_iteration": 2.629879951477051 }, { "auxiliary_loss_clip": 0.01119348, "auxiliary_loss_mlp": 0.01030512, "balance_loss_clip": 1.01719594, "balance_loss_mlp": 1.03591442, "epoch": 0.6757252367353074, "flos": 27272287186560.0, "grad_norm": 1.7739127834883577, "language_loss": 0.66343129, "learning_rate": 9.512202268010745e-07, "loss": 0.68492997, "num_input_tokens_seen": 242628025, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 11239, "time_per_iteration": 2.569578170776367 }, { "auxiliary_loss_clip": 0.01124259, "auxiliary_loss_mlp": 0.01031403, "balance_loss_clip": 1.01885033, "balance_loss_mlp": 1.03804135, "epoch": 0.6757853599879754, "flos": 16830900535680.0, "grad_norm": 2.816553003566328, "language_loss": 0.82825351, "learning_rate": 9.50898538014717e-07, "loss": 0.84981012, "num_input_tokens_seen": 242643825, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 11240, "time_per_iteration": 2.522867441177368 }, { "auxiliary_loss_clip": 0.01141564, "auxiliary_loss_mlp": 0.01034536, "balance_loss_clip": 1.01899123, "balance_loss_mlp": 1.0386169, "epoch": 0.6758454832406433, "flos": 23220055284480.0, "grad_norm": 1.9559358553541517, "language_loss": 0.74322999, "learning_rate": 9.505768866680925e-07, "loss": 0.76499093, "num_input_tokens_seen": 242661820, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.76171875, "step": 11241, "time_per_iteration": 2.598798990249634 }, { "auxiliary_loss_clip": 0.01145557, "auxiliary_loss_mlp": 0.01036818, "balance_loss_clip": 1.02404499, "balance_loss_mlp": 1.03821075, "epoch": 0.6759056064933113, "flos": 16289547684480.0, "grad_norm": 2.110231913454763, "language_loss": 0.80345488, "learning_rate": 9.502552727726791e-07, "loss": 0.82527864, "num_input_tokens_seen": 242679890, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 11242, "time_per_iteration": 4.216466426849365 }, { "auxiliary_loss_clip": 0.01114443, "auxiliary_loss_mlp": 0.01030841, "balance_loss_clip": 1.01908708, "balance_loss_mlp": 1.03494453, "epoch": 0.6759657297459792, "flos": 25922297404800.0, "grad_norm": 2.3705432656553023, "language_loss": 0.72728986, "learning_rate": 9.499336963399562e-07, "loss": 0.74874276, "num_input_tokens_seen": 242699495, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.703125, "step": 11243, "time_per_iteration": 4.036808490753174 }, { "auxiliary_loss_clip": 0.01116843, "auxiliary_loss_mlp": 0.01036603, "balance_loss_clip": 1.02448559, "balance_loss_mlp": 1.03772783, "epoch": 0.6760258529986473, "flos": 23000820624000.0, "grad_norm": 2.120142811253909, "language_loss": 0.73029447, "learning_rate": 9.49612157381397e-07, "loss": 0.75182891, "num_input_tokens_seen": 242719500, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 11244, "time_per_iteration": 2.603817939758301 }, { "auxiliary_loss_clip": 0.01109766, "auxiliary_loss_mlp": 0.01041853, "balance_loss_clip": 1.02898431, "balance_loss_mlp": 1.0375526, "epoch": 0.6760859762513152, "flos": 20959335141120.0, "grad_norm": 1.9950304036034296, "language_loss": 0.85755461, "learning_rate": 9.492906559084788e-07, "loss": 0.87907076, "num_input_tokens_seen": 242738325, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 11245, "time_per_iteration": 2.5001862049102783 }, { "auxiliary_loss_clip": 0.01111186, "auxiliary_loss_mlp": 0.01281916, "balance_loss_clip": 1.02170467, "balance_loss_mlp": 1.03600216, "epoch": 0.6761460995039832, "flos": 23622937205760.0, "grad_norm": 1.7298406108938125, "language_loss": 0.73902476, "learning_rate": 9.489691919326743e-07, "loss": 0.76295578, "num_input_tokens_seen": 242756620, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75, "step": 11246, "time_per_iteration": 2.5896599292755127 }, { "auxiliary_loss_clip": 0.01116133, "auxiliary_loss_mlp": 0.01028047, "balance_loss_clip": 1.01559579, "balance_loss_mlp": 1.03621399, "epoch": 0.6762062227566511, "flos": 20770875457920.0, "grad_norm": 1.8204910699442638, "language_loss": 0.87921858, "learning_rate": 9.486477654654557e-07, "loss": 0.9006604, "num_input_tokens_seen": 242774505, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 11247, "time_per_iteration": 2.5026767253875732 }, { "auxiliary_loss_clip": 0.01047578, "auxiliary_loss_mlp": 0.01002714, "balance_loss_clip": 1.00134933, "balance_loss_mlp": 1.00778639, "epoch": 0.6762663460093191, "flos": 52818099166080.0, "grad_norm": 0.8225444354590814, "language_loss": 0.54025573, "learning_rate": 9.48326376518294e-07, "loss": 0.56075865, "num_input_tokens_seen": 242828645, "router_z_loss_clip": 0.01367188, "router_z_loss_mlp": 0.21679688, "step": 11248, "time_per_iteration": 3.1706480979919434 }, { "auxiliary_loss_clip": 0.01148952, "auxiliary_loss_mlp": 0.0103014, "balance_loss_clip": 1.01802289, "balance_loss_mlp": 1.03736436, "epoch": 0.676326469261987, "flos": 23696302734720.0, "grad_norm": 1.6111430324761795, "language_loss": 0.73492444, "learning_rate": 9.480050251026579e-07, "loss": 0.75671542, "num_input_tokens_seen": 242850100, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 11249, "time_per_iteration": 2.6144378185272217 }, { "auxiliary_loss_clip": 0.01117512, "auxiliary_loss_mlp": 0.01038644, "balance_loss_clip": 1.02511382, "balance_loss_mlp": 1.03520155, "epoch": 0.676386592514655, "flos": 14063732582400.0, "grad_norm": 1.8072069394242616, "language_loss": 0.73523939, "learning_rate": 9.47683711230018e-07, "loss": 0.75680089, "num_input_tokens_seen": 242867775, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 11250, "time_per_iteration": 2.526907444000244 }, { "auxiliary_loss_clip": 0.01109818, "auxiliary_loss_mlp": 0.01030236, "balance_loss_clip": 1.01782644, "balance_loss_mlp": 1.03778601, "epoch": 0.676446715767323, "flos": 20412236113920.0, "grad_norm": 1.8168011714367225, "language_loss": 0.75203192, "learning_rate": 9.473624349118381e-07, "loss": 0.77343243, "num_input_tokens_seen": 242886865, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 11251, "time_per_iteration": 2.5533013343811035 }, { "auxiliary_loss_clip": 0.01135151, "auxiliary_loss_mlp": 0.01033424, "balance_loss_clip": 1.02063262, "balance_loss_mlp": 1.03782558, "epoch": 0.676506839019991, "flos": 21288241002240.0, "grad_norm": 1.572187530505146, "language_loss": 0.70413709, "learning_rate": 9.470411961595859e-07, "loss": 0.72582281, "num_input_tokens_seen": 242906705, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 11252, "time_per_iteration": 2.5641238689422607 }, { "auxiliary_loss_clip": 0.01143613, "auxiliary_loss_mlp": 0.01031293, "balance_loss_clip": 1.01881766, "balance_loss_mlp": 1.03720808, "epoch": 0.676566962272659, "flos": 29932477459200.0, "grad_norm": 2.1650139510245436, "language_loss": 0.66835034, "learning_rate": 9.467199949847249e-07, "loss": 0.69009936, "num_input_tokens_seen": 242925215, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 11253, "time_per_iteration": 2.6489734649658203 }, { "auxiliary_loss_clip": 0.01136683, "auxiliary_loss_mlp": 0.0128198, "balance_loss_clip": 1.02155828, "balance_loss_mlp": 1.03727984, "epoch": 0.6766270855253269, "flos": 17931203902080.0, "grad_norm": 1.782105615330668, "language_loss": 0.76983887, "learning_rate": 9.463988313987177e-07, "loss": 0.79402554, "num_input_tokens_seen": 242944750, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 11254, "time_per_iteration": 2.6047439575195312 }, { "auxiliary_loss_clip": 0.01116785, "auxiliary_loss_mlp": 0.01031638, "balance_loss_clip": 1.01831603, "balance_loss_mlp": 1.03637898, "epoch": 0.6766872087779949, "flos": 23104853389440.0, "grad_norm": 2.472522772173484, "language_loss": 0.72256017, "learning_rate": 9.460777054130256e-07, "loss": 0.7440443, "num_input_tokens_seen": 242963860, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 11255, "time_per_iteration": 2.523081064224243 }, { "auxiliary_loss_clip": 0.01141648, "auxiliary_loss_mlp": 0.01285648, "balance_loss_clip": 1.02611852, "balance_loss_mlp": 1.03421378, "epoch": 0.6767473320306628, "flos": 26213137827840.0, "grad_norm": 2.0744330090646073, "language_loss": 0.74926186, "learning_rate": 9.457566170391105e-07, "loss": 0.77353483, "num_input_tokens_seen": 242983050, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 11256, "time_per_iteration": 2.623680591583252 }, { "auxiliary_loss_clip": 0.01119971, "auxiliary_loss_mlp": 0.01034337, "balance_loss_clip": 1.02163529, "balance_loss_mlp": 1.03920984, "epoch": 0.6768074552833309, "flos": 18368739469440.0, "grad_norm": 2.3453758022346305, "language_loss": 0.65270197, "learning_rate": 9.454355662884283e-07, "loss": 0.67424512, "num_input_tokens_seen": 243001125, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 11257, "time_per_iteration": 2.4960763454437256 }, { "auxiliary_loss_clip": 0.01140693, "auxiliary_loss_mlp": 0.01032485, "balance_loss_clip": 1.02040899, "balance_loss_mlp": 1.03618479, "epoch": 0.6768675785359988, "flos": 23039927556480.0, "grad_norm": 1.447134809157057, "language_loss": 0.75552452, "learning_rate": 9.451145531724389e-07, "loss": 0.77725631, "num_input_tokens_seen": 243021865, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 11258, "time_per_iteration": 2.5981318950653076 }, { "auxiliary_loss_clip": 0.0112281, "auxiliary_loss_mlp": 0.01034357, "balance_loss_clip": 1.02237046, "balance_loss_mlp": 1.03704643, "epoch": 0.6769277017886668, "flos": 33036524092800.0, "grad_norm": 1.5993197306234743, "language_loss": 0.66815102, "learning_rate": 9.447935777025968e-07, "loss": 0.68972278, "num_input_tokens_seen": 243042970, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 11259, "time_per_iteration": 2.6196374893188477 }, { "auxiliary_loss_clip": 0.01125328, "auxiliary_loss_mlp": 0.01032017, "balance_loss_clip": 1.01966071, "balance_loss_mlp": 1.03649116, "epoch": 0.6769878250413347, "flos": 20848406964480.0, "grad_norm": 2.2853486656138777, "language_loss": 0.85621458, "learning_rate": 9.444726398903593e-07, "loss": 0.87778807, "num_input_tokens_seen": 243058470, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71484375, "step": 11260, "time_per_iteration": 2.651064872741699 }, { "auxiliary_loss_clip": 0.01127525, "auxiliary_loss_mlp": 0.01034541, "balance_loss_clip": 1.02151716, "balance_loss_mlp": 1.03577912, "epoch": 0.6770479482940027, "flos": 15595968994560.0, "grad_norm": 2.239221734066814, "language_loss": 0.77637494, "learning_rate": 9.441517397471765e-07, "loss": 0.79799557, "num_input_tokens_seen": 243076630, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 11261, "time_per_iteration": 2.507078170776367 }, { "auxiliary_loss_clip": 0.01125183, "auxiliary_loss_mlp": 0.01038531, "balance_loss_clip": 1.02545977, "balance_loss_mlp": 1.03506541, "epoch": 0.6771080715466706, "flos": 18621011664000.0, "grad_norm": 1.96786323177869, "language_loss": 0.87860036, "learning_rate": 9.43830877284503e-07, "loss": 0.90023756, "num_input_tokens_seen": 243092260, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 11262, "time_per_iteration": 2.507011651992798 }, { "auxiliary_loss_clip": 0.01129802, "auxiliary_loss_mlp": 0.01034405, "balance_loss_clip": 1.02125025, "balance_loss_mlp": 1.03839099, "epoch": 0.6771681947993387, "flos": 12495441893760.0, "grad_norm": 1.8269897584316805, "language_loss": 0.74145406, "learning_rate": 9.435100525137893e-07, "loss": 0.76309609, "num_input_tokens_seen": 243109405, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 11263, "time_per_iteration": 2.5601770877838135 }, { "auxiliary_loss_clip": 0.01120055, "auxiliary_loss_mlp": 0.01036302, "balance_loss_clip": 1.02224672, "balance_loss_mlp": 1.03734887, "epoch": 0.6772283180520066, "flos": 22236964974720.0, "grad_norm": 2.2725102311146603, "language_loss": 0.6775831, "learning_rate": 9.431892654464828e-07, "loss": 0.69914663, "num_input_tokens_seen": 243128135, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73828125, "step": 11264, "time_per_iteration": 2.5235893726348877 }, { "auxiliary_loss_clip": 0.01127667, "auxiliary_loss_mlp": 0.01033206, "balance_loss_clip": 1.01975965, "balance_loss_mlp": 1.03886986, "epoch": 0.6772884413046746, "flos": 16143139848960.0, "grad_norm": 2.0074628226876987, "language_loss": 0.73017031, "learning_rate": 9.428685160940337e-07, "loss": 0.75177908, "num_input_tokens_seen": 243146785, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 11265, "time_per_iteration": 2.682957172393799 }, { "auxiliary_loss_clip": 0.01134124, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.02038062, "balance_loss_mlp": 1.03744292, "epoch": 0.6773485645573426, "flos": 19135755515520.0, "grad_norm": 1.6680625102700315, "language_loss": 0.61619532, "learning_rate": 9.42547804467888e-07, "loss": 0.63786316, "num_input_tokens_seen": 243165275, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 11266, "time_per_iteration": 2.5803587436676025 }, { "auxiliary_loss_clip": 0.01136311, "auxiliary_loss_mlp": 0.01036299, "balance_loss_clip": 1.0228225, "balance_loss_mlp": 1.03748751, "epoch": 0.6774086878100105, "flos": 14136918543360.0, "grad_norm": 1.6998233399429756, "language_loss": 0.70195168, "learning_rate": 9.422271305794911e-07, "loss": 0.72367775, "num_input_tokens_seen": 243182845, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 11267, "time_per_iteration": 2.5514745712280273 }, { "auxiliary_loss_clip": 0.01107929, "auxiliary_loss_mlp": 0.0103266, "balance_loss_clip": 1.02017272, "balance_loss_mlp": 1.03623319, "epoch": 0.6774688110626785, "flos": 22197067943040.0, "grad_norm": 1.9050339115278554, "language_loss": 0.71096992, "learning_rate": 9.419064944402863e-07, "loss": 0.73237586, "num_input_tokens_seen": 243201475, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 11268, "time_per_iteration": 2.5097739696502686 }, { "auxiliary_loss_clip": 0.0112901, "auxiliary_loss_mlp": 0.01032306, "balance_loss_clip": 1.01910996, "balance_loss_mlp": 1.03961086, "epoch": 0.6775289343153464, "flos": 23039963470080.0, "grad_norm": 1.450031971214294, "language_loss": 0.76778412, "learning_rate": 9.415858960617176e-07, "loss": 0.7893973, "num_input_tokens_seen": 243221850, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 11269, "time_per_iteration": 2.551361322402954 }, { "auxiliary_loss_clip": 0.0113699, "auxiliary_loss_mlp": 0.01039322, "balance_loss_clip": 1.02558303, "balance_loss_mlp": 1.03770876, "epoch": 0.6775890575680145, "flos": 18293506433280.0, "grad_norm": 2.1090589088514986, "language_loss": 0.74000633, "learning_rate": 9.412653354552258e-07, "loss": 0.76176947, "num_input_tokens_seen": 243239855, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 11270, "time_per_iteration": 2.5135631561279297 }, { "auxiliary_loss_clip": 0.01147819, "auxiliary_loss_mlp": 0.01036598, "balance_loss_clip": 1.02312148, "balance_loss_mlp": 1.04046047, "epoch": 0.6776491808206824, "flos": 25336450581120.0, "grad_norm": 1.7161811603620951, "language_loss": 0.72889853, "learning_rate": 9.409448126322506e-07, "loss": 0.75074273, "num_input_tokens_seen": 243260085, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 11271, "time_per_iteration": 2.6091527938842773 }, { "auxiliary_loss_clip": 0.01109314, "auxiliary_loss_mlp": 0.01034166, "balance_loss_clip": 1.02164888, "balance_loss_mlp": 1.03819799, "epoch": 0.6777093040733504, "flos": 26028233591040.0, "grad_norm": 1.5292517528316183, "language_loss": 0.67849183, "learning_rate": 9.406243276042303e-07, "loss": 0.69992661, "num_input_tokens_seen": 243280065, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 11272, "time_per_iteration": 4.039214611053467 }, { "auxiliary_loss_clip": 0.01122188, "auxiliary_loss_mlp": 0.01035863, "balance_loss_clip": 1.02158141, "balance_loss_mlp": 1.03864479, "epoch": 0.6777694273260183, "flos": 18003599763840.0, "grad_norm": 1.7363198792046102, "language_loss": 0.74022257, "learning_rate": 9.40303880382604e-07, "loss": 0.76180309, "num_input_tokens_seen": 243297775, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.74609375, "step": 11273, "time_per_iteration": 2.637053966522217 }, { "auxiliary_loss_clip": 0.01116222, "auxiliary_loss_mlp": 0.01041753, "balance_loss_clip": 1.0290215, "balance_loss_mlp": 1.03742528, "epoch": 0.6778295505786863, "flos": 23441085624960.0, "grad_norm": 1.5859194996540378, "language_loss": 0.70210725, "learning_rate": 9.399834709788051e-07, "loss": 0.72368693, "num_input_tokens_seen": 243315760, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 11274, "time_per_iteration": 2.5763514041900635 }, { "auxiliary_loss_clip": 0.0111093, "auxiliary_loss_mlp": 0.01029947, "balance_loss_clip": 1.0168339, "balance_loss_mlp": 1.03844011, "epoch": 0.6778896738313542, "flos": 19098408349440.0, "grad_norm": 1.621834710901086, "language_loss": 0.65407163, "learning_rate": 9.3966309940427e-07, "loss": 0.67548037, "num_input_tokens_seen": 243335715, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 11275, "time_per_iteration": 2.519070625305176 }, { "auxiliary_loss_clip": 0.01119315, "auxiliary_loss_mlp": 0.01033141, "balance_loss_clip": 1.02108943, "balance_loss_mlp": 1.03955889, "epoch": 0.6779497970840223, "flos": 26103933504000.0, "grad_norm": 1.5227021765567943, "language_loss": 0.7272945, "learning_rate": 9.393427656704307e-07, "loss": 0.74881911, "num_input_tokens_seen": 243356935, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.70703125, "step": 11276, "time_per_iteration": 2.596759557723999 }, { "auxiliary_loss_clip": 0.01126588, "auxiliary_loss_mlp": 0.01279121, "balance_loss_clip": 1.01870334, "balance_loss_mlp": 1.03829265, "epoch": 0.6780099203366902, "flos": 19719232041600.0, "grad_norm": 1.7862085593418353, "language_loss": 0.76599652, "learning_rate": 9.39022469788721e-07, "loss": 0.79005361, "num_input_tokens_seen": 243375625, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 11277, "time_per_iteration": 2.550264596939087 }, { "auxiliary_loss_clip": 0.01128282, "auxiliary_loss_mlp": 0.01028849, "balance_loss_clip": 1.01585507, "balance_loss_mlp": 1.03790617, "epoch": 0.6780700435893582, "flos": 18214538382720.0, "grad_norm": 2.42270555598052, "language_loss": 0.83842719, "learning_rate": 9.387022117705699e-07, "loss": 0.85999846, "num_input_tokens_seen": 243390195, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 11278, "time_per_iteration": 2.4865074157714844 }, { "auxiliary_loss_clip": 0.01124928, "auxiliary_loss_mlp": 0.01277279, "balance_loss_clip": 1.01883054, "balance_loss_mlp": 1.0384326, "epoch": 0.6781301668420262, "flos": 25376239872000.0, "grad_norm": 1.7340040774777292, "language_loss": 0.70250893, "learning_rate": 9.383819916274059e-07, "loss": 0.72653103, "num_input_tokens_seen": 243411690, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 11279, "time_per_iteration": 4.0405073165893555 }, { "auxiliary_loss_clip": 0.01118043, "auxiliary_loss_mlp": 0.01032316, "balance_loss_clip": 1.01912558, "balance_loss_mlp": 1.03770506, "epoch": 0.6781902900946941, "flos": 24020432087040.0, "grad_norm": 2.2884521685495187, "language_loss": 0.73978394, "learning_rate": 9.380618093706592e-07, "loss": 0.76128757, "num_input_tokens_seen": 243430280, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 11280, "time_per_iteration": 2.614150047302246 }, { "auxiliary_loss_clip": 0.011203, "auxiliary_loss_mlp": 0.01029827, "balance_loss_clip": 1.01685655, "balance_loss_mlp": 1.03738976, "epoch": 0.6782504133473621, "flos": 19646764352640.0, "grad_norm": 1.7419385087249812, "language_loss": 0.70373446, "learning_rate": 9.377416650117533e-07, "loss": 0.7252357, "num_input_tokens_seen": 243448690, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7421875, "step": 11281, "time_per_iteration": 2.537328004837036 }, { "auxiliary_loss_clip": 0.01104965, "auxiliary_loss_mlp": 0.01027561, "balance_loss_clip": 1.01582456, "balance_loss_mlp": 1.03658903, "epoch": 0.67831053660003, "flos": 24932742647040.0, "grad_norm": 1.604975411130124, "language_loss": 0.64090669, "learning_rate": 9.374215585621159e-07, "loss": 0.66223198, "num_input_tokens_seen": 243470695, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 11282, "time_per_iteration": 2.5949110984802246 }, { "auxiliary_loss_clip": 0.01140195, "auxiliary_loss_mlp": 0.01038343, "balance_loss_clip": 1.02435434, "balance_loss_mlp": 1.03893685, "epoch": 0.6783706598526981, "flos": 31208383440000.0, "grad_norm": 1.5963659866023236, "language_loss": 0.74368572, "learning_rate": 9.371014900331699e-07, "loss": 0.7654711, "num_input_tokens_seen": 243493345, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 11283, "time_per_iteration": 2.7781295776367188 }, { "auxiliary_loss_clip": 0.01116492, "auxiliary_loss_mlp": 0.01028665, "balance_loss_clip": 1.01666057, "balance_loss_mlp": 1.03741682, "epoch": 0.678430783105366, "flos": 35441317687680.0, "grad_norm": 1.5608707518927505, "language_loss": 0.56734562, "learning_rate": 9.367814594363374e-07, "loss": 0.58879721, "num_input_tokens_seen": 243515670, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 11284, "time_per_iteration": 6.452959299087524 }, { "auxiliary_loss_clip": 0.01125077, "auxiliary_loss_mlp": 0.01030783, "balance_loss_clip": 1.01801527, "balance_loss_mlp": 1.03645301, "epoch": 0.678490906358034, "flos": 14428800460800.0, "grad_norm": 2.2397666263620395, "language_loss": 0.75589794, "learning_rate": 9.36461466783039e-07, "loss": 0.77745652, "num_input_tokens_seen": 243533625, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 11285, "time_per_iteration": 2.5977299213409424 }, { "auxiliary_loss_clip": 0.01108547, "auxiliary_loss_mlp": 0.01030466, "balance_loss_clip": 1.01812172, "balance_loss_mlp": 1.03948593, "epoch": 0.6785510296107019, "flos": 24311236596480.0, "grad_norm": 1.5156946770482518, "language_loss": 0.66249692, "learning_rate": 9.361415120846958e-07, "loss": 0.683887, "num_input_tokens_seen": 243553040, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 11286, "time_per_iteration": 2.5841758251190186 }, { "auxiliary_loss_clip": 0.01126829, "auxiliary_loss_mlp": 0.01029164, "balance_loss_clip": 1.01640844, "balance_loss_mlp": 1.03663766, "epoch": 0.6786111528633699, "flos": 26977244872320.0, "grad_norm": 2.8329098736148723, "language_loss": 0.52785623, "learning_rate": 9.358215953527256e-07, "loss": 0.54941618, "num_input_tokens_seen": 243572590, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 11287, "time_per_iteration": 2.6067957878112793 }, { "auxiliary_loss_clip": 0.01135496, "auxiliary_loss_mlp": 0.01029889, "balance_loss_clip": 1.01692498, "balance_loss_mlp": 1.03793621, "epoch": 0.6786712761160378, "flos": 24317557390080.0, "grad_norm": 1.5894855233108993, "language_loss": 0.77414519, "learning_rate": 9.355017165985453e-07, "loss": 0.79579902, "num_input_tokens_seen": 243594140, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 11288, "time_per_iteration": 2.6133174896240234 }, { "auxiliary_loss_clip": 0.01122817, "auxiliary_loss_mlp": 0.01034943, "balance_loss_clip": 1.02278376, "balance_loss_mlp": 1.04135346, "epoch": 0.6787313993687059, "flos": 22930435923840.0, "grad_norm": 1.8863710674671457, "language_loss": 0.73580718, "learning_rate": 9.351818758335696e-07, "loss": 0.75738478, "num_input_tokens_seen": 243615170, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7265625, "step": 11289, "time_per_iteration": 2.693911552429199 }, { "auxiliary_loss_clip": 0.01109453, "auxiliary_loss_mlp": 0.01034565, "balance_loss_clip": 1.0216608, "balance_loss_mlp": 1.03798461, "epoch": 0.6787915226213738, "flos": 26868435598080.0, "grad_norm": 1.4408970693812975, "language_loss": 0.80026132, "learning_rate": 9.348620730692154e-07, "loss": 0.82170153, "num_input_tokens_seen": 243635675, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 11290, "time_per_iteration": 2.5699172019958496 }, { "auxiliary_loss_clip": 0.01115095, "auxiliary_loss_mlp": 0.01033149, "balance_loss_clip": 1.02118015, "balance_loss_mlp": 1.03865814, "epoch": 0.6788516458740418, "flos": 20008851402240.0, "grad_norm": 1.4397444445589227, "language_loss": 0.74708033, "learning_rate": 9.345423083168921e-07, "loss": 0.76856273, "num_input_tokens_seen": 243654950, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 11291, "time_per_iteration": 2.5059244632720947 }, { "auxiliary_loss_clip": 0.01126799, "auxiliary_loss_mlp": 0.01282426, "balance_loss_clip": 1.02255177, "balance_loss_mlp": 1.03662944, "epoch": 0.6789117691267098, "flos": 28727099832960.0, "grad_norm": 2.546098368501295, "language_loss": 0.75466025, "learning_rate": 9.342225815880142e-07, "loss": 0.77875251, "num_input_tokens_seen": 243674970, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 11292, "time_per_iteration": 2.5981249809265137 }, { "auxiliary_loss_clip": 0.01134661, "auxiliary_loss_mlp": 0.01029232, "balance_loss_clip": 1.01600599, "balance_loss_mlp": 1.03778934, "epoch": 0.6789718923793777, "flos": 23403451150080.0, "grad_norm": 1.8695345083249857, "language_loss": 0.8409487, "learning_rate": 9.339028928939907e-07, "loss": 0.86258757, "num_input_tokens_seen": 243693440, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69921875, "step": 11293, "time_per_iteration": 2.546787977218628 }, { "auxiliary_loss_clip": 0.01114523, "auxiliary_loss_mlp": 0.01037767, "balance_loss_clip": 1.02342653, "balance_loss_mlp": 1.03916407, "epoch": 0.6790320156320457, "flos": 20448865008000.0, "grad_norm": 1.994952909595602, "language_loss": 0.79222834, "learning_rate": 9.335832422462308e-07, "loss": 0.81375122, "num_input_tokens_seen": 243710055, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75390625, "step": 11294, "time_per_iteration": 2.5094974040985107 }, { "auxiliary_loss_clip": 0.01124455, "auxiliary_loss_mlp": 0.01026508, "balance_loss_clip": 1.01438403, "balance_loss_mlp": 1.03673029, "epoch": 0.6790921388847136, "flos": 24167199058560.0, "grad_norm": 1.857286932744124, "language_loss": 0.78953832, "learning_rate": 9.332636296561418e-07, "loss": 0.81104791, "num_input_tokens_seen": 243728635, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 11295, "time_per_iteration": 2.6161434650421143 }, { "auxiliary_loss_clip": 0.01122905, "auxiliary_loss_mlp": 0.01029193, "balance_loss_clip": 1.01845777, "balance_loss_mlp": 1.03737521, "epoch": 0.6791522621373817, "flos": 21908095027200.0, "grad_norm": 1.8735209754762654, "language_loss": 0.71686208, "learning_rate": 9.329440551351289e-07, "loss": 0.73838305, "num_input_tokens_seen": 243748330, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.67578125, "step": 11296, "time_per_iteration": 2.6137921810150146 }, { "auxiliary_loss_clip": 0.01114873, "auxiliary_loss_mlp": 0.01028816, "balance_loss_clip": 1.01671028, "balance_loss_mlp": 1.03606641, "epoch": 0.6792123853900496, "flos": 24826519152000.0, "grad_norm": 1.5802972742819459, "language_loss": 0.70375955, "learning_rate": 9.326245186945996e-07, "loss": 0.72519648, "num_input_tokens_seen": 243769380, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 11297, "time_per_iteration": 2.5996055603027344 }, { "auxiliary_loss_clip": 0.01139346, "auxiliary_loss_mlp": 0.0103323, "balance_loss_clip": 1.02002192, "balance_loss_mlp": 1.03897023, "epoch": 0.6792725086427176, "flos": 17566279678080.0, "grad_norm": 1.8781165933042006, "language_loss": 0.66072249, "learning_rate": 9.323050203459539e-07, "loss": 0.68244827, "num_input_tokens_seen": 243785510, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 11298, "time_per_iteration": 2.5900819301605225 }, { "auxiliary_loss_clip": 0.01119032, "auxiliary_loss_mlp": 0.01026085, "balance_loss_clip": 1.01481986, "balance_loss_mlp": 1.03853083, "epoch": 0.6793326318953855, "flos": 26941837040640.0, "grad_norm": 1.8371167270636295, "language_loss": 0.71492863, "learning_rate": 9.319855601005966e-07, "loss": 0.73637974, "num_input_tokens_seen": 243805545, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.71484375, "step": 11299, "time_per_iteration": 2.588899612426758 }, { "auxiliary_loss_clip": 0.01109664, "auxiliary_loss_mlp": 0.01029661, "balance_loss_clip": 1.01659584, "balance_loss_mlp": 1.03803062, "epoch": 0.6793927551480535, "flos": 24318275662080.0, "grad_norm": 1.2893890462167934, "language_loss": 0.77298594, "learning_rate": 9.316661379699274e-07, "loss": 0.79437923, "num_input_tokens_seen": 243825185, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 11300, "time_per_iteration": 2.5507166385650635 }, { "auxiliary_loss_clip": 0.01115465, "auxiliary_loss_mlp": 0.01031066, "balance_loss_clip": 1.01823282, "balance_loss_mlp": 1.03570032, "epoch": 0.6794528784007214, "flos": 11436615757440.0, "grad_norm": 2.1102955187629044, "language_loss": 0.62487894, "learning_rate": 9.313467539653454e-07, "loss": 0.6463443, "num_input_tokens_seen": 243841600, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 11301, "time_per_iteration": 2.5659546852111816 }, { "auxiliary_loss_clip": 0.01115798, "auxiliary_loss_mlp": 0.0103046, "balance_loss_clip": 1.0184021, "balance_loss_mlp": 1.03705096, "epoch": 0.6795130016533895, "flos": 25229688382080.0, "grad_norm": 1.7463136303423108, "language_loss": 0.83131886, "learning_rate": 9.310274080982483e-07, "loss": 0.85278147, "num_input_tokens_seen": 243862250, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.703125, "step": 11302, "time_per_iteration": 2.5501725673675537 }, { "auxiliary_loss_clip": 0.01118674, "auxiliary_loss_mlp": 0.01031195, "balance_loss_clip": 1.01837397, "balance_loss_mlp": 1.03730309, "epoch": 0.6795731249060574, "flos": 18296415434880.0, "grad_norm": 2.6703476756367457, "language_loss": 0.69734436, "learning_rate": 9.307081003800339e-07, "loss": 0.7188431, "num_input_tokens_seen": 243880560, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 11303, "time_per_iteration": 2.561631202697754 }, { "auxiliary_loss_clip": 0.01137705, "auxiliary_loss_mlp": 0.01032702, "balance_loss_clip": 1.02034605, "balance_loss_mlp": 1.03790188, "epoch": 0.6796332481587254, "flos": 20300374183680.0, "grad_norm": 1.8790220939534577, "language_loss": 0.69820863, "learning_rate": 9.303888308220969e-07, "loss": 0.71991265, "num_input_tokens_seen": 243900635, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.73046875, "step": 11304, "time_per_iteration": 2.5374908447265625 }, { "auxiliary_loss_clip": 0.01121673, "auxiliary_loss_mlp": 0.01032686, "balance_loss_clip": 1.0194838, "balance_loss_mlp": 1.04058468, "epoch": 0.6796933714113934, "flos": 23586847015680.0, "grad_norm": 2.794574754185962, "language_loss": 0.72802019, "learning_rate": 9.300695994358312e-07, "loss": 0.74956381, "num_input_tokens_seen": 243920160, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 11305, "time_per_iteration": 2.574084997177124 }, { "auxiliary_loss_clip": 0.0112582, "auxiliary_loss_mlp": 0.0103082, "balance_loss_clip": 1.01793909, "balance_loss_mlp": 1.03680515, "epoch": 0.6797534946640613, "flos": 27119917693440.0, "grad_norm": 1.8349056208157204, "language_loss": 0.65803218, "learning_rate": 9.297504062326285e-07, "loss": 0.67959857, "num_input_tokens_seen": 243939015, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 11306, "time_per_iteration": 2.559810161590576 }, { "auxiliary_loss_clip": 0.01124392, "auxiliary_loss_mlp": 0.01028693, "balance_loss_clip": 1.01517427, "balance_loss_mlp": 1.03696895, "epoch": 0.6798136179167293, "flos": 22127437428480.0, "grad_norm": 1.6382951365785283, "language_loss": 0.80131769, "learning_rate": 9.294312512238823e-07, "loss": 0.82284856, "num_input_tokens_seen": 243958470, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 11307, "time_per_iteration": 2.544670343399048 }, { "auxiliary_loss_clip": 0.01125352, "auxiliary_loss_mlp": 0.0103202, "balance_loss_clip": 1.01982498, "balance_loss_mlp": 1.0378139, "epoch": 0.6798737411693972, "flos": 17488640430720.0, "grad_norm": 1.5677531879359063, "language_loss": 0.88903332, "learning_rate": 9.291121344209802e-07, "loss": 0.91060704, "num_input_tokens_seen": 243975450, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 11308, "time_per_iteration": 2.5106301307678223 }, { "auxiliary_loss_clip": 0.01135507, "auxiliary_loss_mlp": 0.01036138, "balance_loss_clip": 1.02270901, "balance_loss_mlp": 1.03627884, "epoch": 0.6799338644220653, "flos": 22892262744960.0, "grad_norm": 1.904910497156317, "language_loss": 0.71471614, "learning_rate": 9.287930558353106e-07, "loss": 0.73643255, "num_input_tokens_seen": 243994355, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 11309, "time_per_iteration": 2.5878524780273438 }, { "auxiliary_loss_clip": 0.01134802, "auxiliary_loss_mlp": 0.01038326, "balance_loss_clip": 1.02515316, "balance_loss_mlp": 1.03696156, "epoch": 0.6799939876747332, "flos": 23180409648000.0, "grad_norm": 2.301906268701038, "language_loss": 0.84575433, "learning_rate": 9.284740154782622e-07, "loss": 0.86748564, "num_input_tokens_seen": 244011620, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 11310, "time_per_iteration": 2.5442564487457275 }, { "auxiliary_loss_clip": 0.01124102, "auxiliary_loss_mlp": 0.01029694, "balance_loss_clip": 1.01797581, "balance_loss_mlp": 1.03774607, "epoch": 0.6800541109274012, "flos": 19499925553920.0, "grad_norm": 2.2175876315972345, "language_loss": 0.82571065, "learning_rate": 9.281550133612197e-07, "loss": 0.84724855, "num_input_tokens_seen": 244029925, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 11311, "time_per_iteration": 2.5346884727478027 }, { "auxiliary_loss_clip": 0.01147786, "auxiliary_loss_mlp": 0.01030347, "balance_loss_clip": 1.01624417, "balance_loss_mlp": 1.03598118, "epoch": 0.6801142341800691, "flos": 22277652105600.0, "grad_norm": 2.1824854004944076, "language_loss": 0.76097614, "learning_rate": 9.278360494955677e-07, "loss": 0.7827574, "num_input_tokens_seen": 244051225, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 11312, "time_per_iteration": 2.6010122299194336 }, { "auxiliary_loss_clip": 0.01031209, "auxiliary_loss_mlp": 0.01000624, "balance_loss_clip": 0.99917519, "balance_loss_mlp": 1.00911546, "epoch": 0.6801743574327371, "flos": 68714817759360.0, "grad_norm": 0.6952621757236729, "language_loss": 0.57279795, "learning_rate": 9.275171238926884e-07, "loss": 0.59311628, "num_input_tokens_seen": 244115930, "router_z_loss_clip": 0.01446533, "router_z_loss_mlp": 0.22070312, "step": 11313, "time_per_iteration": 3.248304605484009 }, { "auxiliary_loss_clip": 0.0111706, "auxiliary_loss_mlp": 0.0102797, "balance_loss_clip": 1.01510131, "balance_loss_mlp": 1.03562379, "epoch": 0.680234480685405, "flos": 29460467813760.0, "grad_norm": 2.0713319706876474, "language_loss": 0.68609184, "learning_rate": 9.271982365639659e-07, "loss": 0.70754218, "num_input_tokens_seen": 244137320, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 11314, "time_per_iteration": 3.9652411937713623 }, { "auxiliary_loss_clip": 0.01113086, "auxiliary_loss_mlp": 0.01029184, "balance_loss_clip": 1.01724494, "balance_loss_mlp": 1.03418136, "epoch": 0.6802946039380731, "flos": 15916866122880.0, "grad_norm": 1.7998960297750575, "language_loss": 0.81505573, "learning_rate": 9.268793875207772e-07, "loss": 0.83647835, "num_input_tokens_seen": 244152755, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69921875, "step": 11315, "time_per_iteration": 2.4941797256469727 }, { "auxiliary_loss_clip": 0.01141802, "auxiliary_loss_mlp": 0.01027682, "balance_loss_clip": 1.01498616, "balance_loss_mlp": 1.03584778, "epoch": 0.680354727190741, "flos": 22018664067840.0, "grad_norm": 3.330855724923747, "language_loss": 0.69933414, "learning_rate": 9.265605767745033e-07, "loss": 0.72102904, "num_input_tokens_seen": 244171480, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 11316, "time_per_iteration": 2.6445906162261963 }, { "auxiliary_loss_clip": 0.01115719, "auxiliary_loss_mlp": 0.0103062, "balance_loss_clip": 1.01838291, "balance_loss_mlp": 1.03702891, "epoch": 0.680414850443409, "flos": 18441494467200.0, "grad_norm": 2.641595299109739, "language_loss": 0.66315395, "learning_rate": 9.262418043365215e-07, "loss": 0.6846174, "num_input_tokens_seen": 244187920, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 11317, "time_per_iteration": 2.4979307651519775 }, { "auxiliary_loss_clip": 0.01120507, "auxiliary_loss_mlp": 0.01040711, "balance_loss_clip": 1.02862859, "balance_loss_mlp": 1.04002225, "epoch": 0.680474973696077, "flos": 26358611909760.0, "grad_norm": 1.5118061342546762, "language_loss": 0.74537969, "learning_rate": 9.259230702182075e-07, "loss": 0.76699185, "num_input_tokens_seen": 244209565, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.71875, "step": 11318, "time_per_iteration": 2.5658931732177734 }, { "auxiliary_loss_clip": 0.01120636, "auxiliary_loss_mlp": 0.01027355, "balance_loss_clip": 1.01591694, "balance_loss_mlp": 1.03478134, "epoch": 0.6805350969487449, "flos": 18333116156160.0, "grad_norm": 1.7264179196718292, "language_loss": 0.67976546, "learning_rate": 9.256043744309354e-07, "loss": 0.70124543, "num_input_tokens_seen": 244228015, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 11319, "time_per_iteration": 2.5009591579437256 }, { "auxiliary_loss_clip": 0.01139931, "auxiliary_loss_mlp": 0.01284827, "balance_loss_clip": 1.02437854, "balance_loss_mlp": 1.03495169, "epoch": 0.6805952202014129, "flos": 19937497034880.0, "grad_norm": 1.6052282546182546, "language_loss": 0.76532626, "learning_rate": 9.252857169860804e-07, "loss": 0.78957379, "num_input_tokens_seen": 244245615, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 11320, "time_per_iteration": 4.013296365737915 }, { "auxiliary_loss_clip": 0.01124458, "auxiliary_loss_mlp": 0.01031602, "balance_loss_clip": 1.01887655, "balance_loss_mlp": 1.03617191, "epoch": 0.6806553434540809, "flos": 25224301342080.0, "grad_norm": 2.576223925686968, "language_loss": 0.7454989, "learning_rate": 9.249670978950137e-07, "loss": 0.76705945, "num_input_tokens_seen": 244263625, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 11321, "time_per_iteration": 2.5576822757720947 }, { "auxiliary_loss_clip": 0.01126032, "auxiliary_loss_mlp": 0.01037176, "balance_loss_clip": 1.024647, "balance_loss_mlp": 1.03756511, "epoch": 0.6807154667067489, "flos": 17785586165760.0, "grad_norm": 1.6651809181546486, "language_loss": 0.72584498, "learning_rate": 9.246485171691058e-07, "loss": 0.74747699, "num_input_tokens_seen": 244282745, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 11322, "time_per_iteration": 2.54258131980896 }, { "auxiliary_loss_clip": 0.01143431, "auxiliary_loss_mlp": 0.01276315, "balance_loss_clip": 1.01616192, "balance_loss_mlp": 1.03642142, "epoch": 0.6807755899594168, "flos": 22199905117440.0, "grad_norm": 1.7705835903219533, "language_loss": 0.78533089, "learning_rate": 9.243299748197264e-07, "loss": 0.80952835, "num_input_tokens_seen": 244303770, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 11323, "time_per_iteration": 2.580430030822754 }, { "auxiliary_loss_clip": 0.01056746, "auxiliary_loss_mlp": 0.01001256, "balance_loss_clip": 0.9999094, "balance_loss_mlp": 1.0082922, "epoch": 0.6808357132120848, "flos": 68631073200000.0, "grad_norm": 0.7493364481649791, "language_loss": 0.57120311, "learning_rate": 9.240114708582432e-07, "loss": 0.59178311, "num_input_tokens_seen": 244355910, "router_z_loss_clip": 0.01348877, "router_z_loss_mlp": 0.21875, "step": 11324, "time_per_iteration": 2.9583046436309814 }, { "auxiliary_loss_clip": 0.01127625, "auxiliary_loss_mlp": 0.01036922, "balance_loss_clip": 1.02379751, "balance_loss_mlp": 1.03793454, "epoch": 0.6808958364647527, "flos": 23843357015040.0, "grad_norm": 1.8698920812110387, "language_loss": 0.68338358, "learning_rate": 9.236930052960225e-07, "loss": 0.70502907, "num_input_tokens_seen": 244376610, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 11325, "time_per_iteration": 4.3126060962677 }, { "auxiliary_loss_clip": 0.01119616, "auxiliary_loss_mlp": 0.01032198, "balance_loss_clip": 1.01888847, "balance_loss_mlp": 1.03615797, "epoch": 0.6809559597174207, "flos": 17711717846400.0, "grad_norm": 2.002597475893294, "language_loss": 0.70070648, "learning_rate": 9.233745781444295e-07, "loss": 0.72222459, "num_input_tokens_seen": 244393000, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.74609375, "step": 11326, "time_per_iteration": 2.5363948345184326 }, { "auxiliary_loss_clip": 0.01106132, "auxiliary_loss_mlp": 0.0103158, "balance_loss_clip": 1.0189383, "balance_loss_mlp": 1.03489256, "epoch": 0.6810160829700886, "flos": 22491894775680.0, "grad_norm": 1.879601870369577, "language_loss": 0.72742701, "learning_rate": 9.230561894148298e-07, "loss": 0.74880415, "num_input_tokens_seen": 244409515, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 11327, "time_per_iteration": 2.607151985168457 }, { "auxiliary_loss_clip": 0.01113078, "auxiliary_loss_mlp": 0.01028207, "balance_loss_clip": 1.01542163, "balance_loss_mlp": 1.03501654, "epoch": 0.6810762062227567, "flos": 16832875783680.0, "grad_norm": 1.6966099356982822, "language_loss": 0.77519989, "learning_rate": 9.227378391185829e-07, "loss": 0.79661274, "num_input_tokens_seen": 244427165, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 11328, "time_per_iteration": 2.495609998703003 }, { "auxiliary_loss_clip": 0.011089, "auxiliary_loss_mlp": 0.01031371, "balance_loss_clip": 1.01885355, "balance_loss_mlp": 1.03738487, "epoch": 0.6811363294754246, "flos": 12714676554240.0, "grad_norm": 1.7588529935722277, "language_loss": 0.6413461, "learning_rate": 9.224195272670523e-07, "loss": 0.66274887, "num_input_tokens_seen": 244445705, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 11329, "time_per_iteration": 2.5033087730407715 }, { "auxiliary_loss_clip": 0.01152678, "auxiliary_loss_mlp": 0.01276524, "balance_loss_clip": 1.0165019, "balance_loss_mlp": 1.03652143, "epoch": 0.6811964527280926, "flos": 17711969241600.0, "grad_norm": 2.2776208587129014, "language_loss": 0.79656035, "learning_rate": 9.22101253871596e-07, "loss": 0.82085234, "num_input_tokens_seen": 244460415, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 11330, "time_per_iteration": 2.5711774826049805 }, { "auxiliary_loss_clip": 0.01117098, "auxiliary_loss_mlp": 0.01029795, "balance_loss_clip": 1.01720643, "balance_loss_mlp": 1.03740644, "epoch": 0.6812565759807605, "flos": 24863471268480.0, "grad_norm": 1.7704057819862655, "language_loss": 0.63882685, "learning_rate": 9.217830189435749e-07, "loss": 0.66029572, "num_input_tokens_seen": 244480555, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 11331, "time_per_iteration": 2.5452628135681152 }, { "auxiliary_loss_clip": 0.01144248, "auxiliary_loss_mlp": 0.01034499, "balance_loss_clip": 1.02163649, "balance_loss_mlp": 1.03786933, "epoch": 0.6813166992334285, "flos": 17166019449600.0, "grad_norm": 1.4616196464559927, "language_loss": 0.72409767, "learning_rate": 9.214648224943429e-07, "loss": 0.74588513, "num_input_tokens_seen": 244498540, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11332, "time_per_iteration": 2.5390372276306152 }, { "auxiliary_loss_clip": 0.01118668, "auxiliary_loss_mlp": 0.0103359, "balance_loss_clip": 1.02045298, "balance_loss_mlp": 1.03696573, "epoch": 0.6813768224860965, "flos": 18843550375680.0, "grad_norm": 2.0637778854606172, "language_loss": 0.74262667, "learning_rate": 9.211466645352577e-07, "loss": 0.76414919, "num_input_tokens_seen": 244517015, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 11333, "time_per_iteration": 2.4858834743499756 }, { "auxiliary_loss_clip": 0.01118733, "auxiliary_loss_mlp": 0.01032457, "balance_loss_clip": 1.01895642, "balance_loss_mlp": 1.03661847, "epoch": 0.6814369457387645, "flos": 24532733813760.0, "grad_norm": 1.6052732030537893, "language_loss": 0.72240782, "learning_rate": 9.20828545077673e-07, "loss": 0.74391967, "num_input_tokens_seen": 244537450, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 11334, "time_per_iteration": 2.5491180419921875 }, { "auxiliary_loss_clip": 0.01115949, "auxiliary_loss_mlp": 0.01031435, "balance_loss_clip": 1.01952052, "balance_loss_mlp": 1.03696942, "epoch": 0.6814970689914325, "flos": 18222978078720.0, "grad_norm": 1.9389829945442805, "language_loss": 0.85990143, "learning_rate": 9.205104641329416e-07, "loss": 0.88137531, "num_input_tokens_seen": 244555640, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 11335, "time_per_iteration": 2.5270533561706543 }, { "auxiliary_loss_clip": 0.01109477, "auxiliary_loss_mlp": 0.01027235, "balance_loss_clip": 1.0160408, "balance_loss_mlp": 1.03447092, "epoch": 0.6815571922441004, "flos": 25228790542080.0, "grad_norm": 1.6580832830222367, "language_loss": 0.82088935, "learning_rate": 9.201924217124139e-07, "loss": 0.84225643, "num_input_tokens_seen": 244574005, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.66015625, "step": 11336, "time_per_iteration": 2.545595169067383 }, { "auxiliary_loss_clip": 0.01131538, "auxiliary_loss_mlp": 0.01028082, "balance_loss_clip": 1.01632833, "balance_loss_mlp": 1.03577125, "epoch": 0.6816173154967684, "flos": 19456078026240.0, "grad_norm": 1.6987812788026724, "language_loss": 0.81507802, "learning_rate": 9.198744178274421e-07, "loss": 0.83667421, "num_input_tokens_seen": 244591395, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 11337, "time_per_iteration": 2.5730903148651123 }, { "auxiliary_loss_clip": 0.01139849, "auxiliary_loss_mlp": 0.01028329, "balance_loss_clip": 1.01665807, "balance_loss_mlp": 1.03565359, "epoch": 0.6816774387494363, "flos": 17931455297280.0, "grad_norm": 4.129526371711024, "language_loss": 0.72377825, "learning_rate": 9.195564524893738e-07, "loss": 0.74546003, "num_input_tokens_seen": 244610400, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 11338, "time_per_iteration": 2.548509359359741 }, { "auxiliary_loss_clip": 0.01143333, "auxiliary_loss_mlp": 0.01034525, "balance_loss_clip": 1.0220201, "balance_loss_mlp": 1.03720689, "epoch": 0.6817375620021043, "flos": 22233014478720.0, "grad_norm": 1.6932786510746023, "language_loss": 0.776124, "learning_rate": 9.192385257095565e-07, "loss": 0.79790264, "num_input_tokens_seen": 244630400, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 11339, "time_per_iteration": 2.553786277770996 }, { "auxiliary_loss_clip": 0.01134844, "auxiliary_loss_mlp": 0.01036356, "balance_loss_clip": 1.02364802, "balance_loss_mlp": 1.03746784, "epoch": 0.6817976852547722, "flos": 25374408278400.0, "grad_norm": 2.2835421556873197, "language_loss": 0.70797044, "learning_rate": 9.189206374993361e-07, "loss": 0.72968245, "num_input_tokens_seen": 244649155, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 11340, "time_per_iteration": 2.646721363067627 }, { "auxiliary_loss_clip": 0.01129509, "auxiliary_loss_mlp": 0.01035071, "balance_loss_clip": 1.02218485, "balance_loss_mlp": 1.03782225, "epoch": 0.6818578085074403, "flos": 22265764704000.0, "grad_norm": 2.1593226107252184, "language_loss": 0.83475006, "learning_rate": 9.186027878700576e-07, "loss": 0.85639584, "num_input_tokens_seen": 244665470, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 11341, "time_per_iteration": 2.573875665664673 }, { "auxiliary_loss_clip": 0.01136553, "auxiliary_loss_mlp": 0.01032677, "balance_loss_clip": 1.01860976, "balance_loss_mlp": 1.0373956, "epoch": 0.6819179317601082, "flos": 19318145800320.0, "grad_norm": 1.7858336214214827, "language_loss": 0.6805411, "learning_rate": 9.182849768330636e-07, "loss": 0.70223343, "num_input_tokens_seen": 244684390, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 11342, "time_per_iteration": 2.6492788791656494 }, { "auxiliary_loss_clip": 0.01141641, "auxiliary_loss_mlp": 0.01029487, "balance_loss_clip": 1.01670194, "balance_loss_mlp": 1.03579617, "epoch": 0.6819780550127762, "flos": 21104126864640.0, "grad_norm": 1.5980933071583379, "language_loss": 0.7462014, "learning_rate": 9.179672043996956e-07, "loss": 0.76791269, "num_input_tokens_seen": 244703370, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 11343, "time_per_iteration": 2.6066689491271973 }, { "auxiliary_loss_clip": 0.01136537, "auxiliary_loss_mlp": 0.01033818, "balance_loss_clip": 1.02118778, "balance_loss_mlp": 1.03765011, "epoch": 0.6820381782654441, "flos": 29716403195520.0, "grad_norm": 1.8223512033318092, "language_loss": 0.79892778, "learning_rate": 9.176494705812963e-07, "loss": 0.82063127, "num_input_tokens_seen": 244723325, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 11344, "time_per_iteration": 2.6767783164978027 }, { "auxiliary_loss_clip": 0.01133816, "auxiliary_loss_mlp": 0.01033808, "balance_loss_clip": 1.02092195, "balance_loss_mlp": 1.03623807, "epoch": 0.6820983015181121, "flos": 29242130993280.0, "grad_norm": 1.9532901848978224, "language_loss": 0.66125965, "learning_rate": 9.173317753892016e-07, "loss": 0.68293595, "num_input_tokens_seen": 244745650, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11345, "time_per_iteration": 2.6050503253936768 }, { "auxiliary_loss_clip": 0.0112933, "auxiliary_loss_mlp": 0.01035818, "balance_loss_clip": 1.02294385, "balance_loss_mlp": 1.03831816, "epoch": 0.6821584247707801, "flos": 18871775487360.0, "grad_norm": 2.5719665777717022, "language_loss": 0.651034, "learning_rate": 9.170141188347517e-07, "loss": 0.67268544, "num_input_tokens_seen": 244760270, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 11346, "time_per_iteration": 2.4607112407684326 }, { "auxiliary_loss_clip": 0.01153102, "auxiliary_loss_mlp": 0.01276509, "balance_loss_clip": 1.01660371, "balance_loss_mlp": 1.03645742, "epoch": 0.6822185480234481, "flos": 21324582587520.0, "grad_norm": 1.643507883238721, "language_loss": 0.78394264, "learning_rate": 9.166965009292815e-07, "loss": 0.80823874, "num_input_tokens_seen": 244779565, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 11347, "time_per_iteration": 2.5526156425476074 }, { "auxiliary_loss_clip": 0.01037446, "auxiliary_loss_mlp": 0.01008525, "balance_loss_clip": 1.00718939, "balance_loss_mlp": 1.00667036, "epoch": 0.6822786712761161, "flos": 63488306430720.0, "grad_norm": 0.7227657930369862, "language_loss": 0.52569073, "learning_rate": 9.16378921684128e-07, "loss": 0.54615045, "num_input_tokens_seen": 244838480, "router_z_loss_clip": 0.0133667, "router_z_loss_mlp": 0.21875, "step": 11348, "time_per_iteration": 3.089508533477783 }, { "auxiliary_loss_clip": 0.01104413, "auxiliary_loss_mlp": 0.01033601, "balance_loss_clip": 1.0213697, "balance_loss_mlp": 1.03528917, "epoch": 0.682338794528784, "flos": 21068934514560.0, "grad_norm": 1.9606220726027788, "language_loss": 0.79985303, "learning_rate": 9.16061381110622e-07, "loss": 0.82123315, "num_input_tokens_seen": 244855265, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 11349, "time_per_iteration": 2.463968276977539 }, { "auxiliary_loss_clip": 0.01123225, "auxiliary_loss_mlp": 0.01030602, "balance_loss_clip": 1.01509333, "balance_loss_mlp": 1.03717399, "epoch": 0.682398917781452, "flos": 36283243547520.0, "grad_norm": 1.5896647566131958, "language_loss": 0.73756063, "learning_rate": 9.157438792200975e-07, "loss": 0.75909889, "num_input_tokens_seen": 244875555, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.7734375, "step": 11350, "time_per_iteration": 2.6526339054107666 }, { "auxiliary_loss_clip": 0.01108125, "auxiliary_loss_mlp": 0.0103051, "balance_loss_clip": 1.01730788, "balance_loss_mlp": 1.03772664, "epoch": 0.6824590410341199, "flos": 24859197550080.0, "grad_norm": 1.7622381752145073, "language_loss": 0.79564261, "learning_rate": 9.154264160238853e-07, "loss": 0.81702894, "num_input_tokens_seen": 244895270, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 11351, "time_per_iteration": 2.5218987464904785 }, { "auxiliary_loss_clip": 0.01115118, "auxiliary_loss_mlp": 0.01038703, "balance_loss_clip": 1.02609622, "balance_loss_mlp": 1.03537154, "epoch": 0.6825191642867879, "flos": 22452392793600.0, "grad_norm": 1.5638112916659666, "language_loss": 0.73331487, "learning_rate": 9.151089915333143e-07, "loss": 0.75485301, "num_input_tokens_seen": 244914535, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 11352, "time_per_iteration": 2.5251400470733643 }, { "auxiliary_loss_clip": 0.01144648, "auxiliary_loss_mlp": 0.01039045, "balance_loss_clip": 1.02537167, "balance_loss_mlp": 1.03604579, "epoch": 0.6825792875394558, "flos": 29424377623680.0, "grad_norm": 1.5568224044365535, "language_loss": 0.80101079, "learning_rate": 9.147916057597127e-07, "loss": 0.82284772, "num_input_tokens_seen": 244936095, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 11353, "time_per_iteration": 2.629981279373169 }, { "auxiliary_loss_clip": 0.01116873, "auxiliary_loss_mlp": 0.01027687, "balance_loss_clip": 1.01472342, "balance_loss_mlp": 1.03679347, "epoch": 0.6826394107921239, "flos": 18770974945920.0, "grad_norm": 1.6505413865629683, "language_loss": 0.78088152, "learning_rate": 9.144742587144065e-07, "loss": 0.8023271, "num_input_tokens_seen": 244955290, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 11354, "time_per_iteration": 2.579052448272705 }, { "auxiliary_loss_clip": 0.01149961, "auxiliary_loss_mlp": 0.01028212, "balance_loss_clip": 1.01602864, "balance_loss_mlp": 1.03619194, "epoch": 0.6826995340447918, "flos": 16617591619200.0, "grad_norm": 2.0576561746447974, "language_loss": 0.62050605, "learning_rate": 9.141569504087232e-07, "loss": 0.64228779, "num_input_tokens_seen": 244972935, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 11355, "time_per_iteration": 3.9588897228240967 }, { "auxiliary_loss_clip": 0.01142903, "auxiliary_loss_mlp": 0.0102845, "balance_loss_clip": 1.01598632, "balance_loss_mlp": 1.035321, "epoch": 0.6827596572974598, "flos": 20848299223680.0, "grad_norm": 3.5950155916963467, "language_loss": 0.82324064, "learning_rate": 9.138396808539837e-07, "loss": 0.84495419, "num_input_tokens_seen": 244989440, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 11356, "time_per_iteration": 2.5505616664886475 }, { "auxiliary_loss_clip": 0.01129161, "auxiliary_loss_mlp": 0.01031787, "balance_loss_clip": 1.01888275, "balance_loss_mlp": 1.0391891, "epoch": 0.6828197805501277, "flos": 22748081552640.0, "grad_norm": 2.446687712512625, "language_loss": 0.78513825, "learning_rate": 9.135224500615126e-07, "loss": 0.80674767, "num_input_tokens_seen": 245007830, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 11357, "time_per_iteration": 2.612826108932495 }, { "auxiliary_loss_clip": 0.01047358, "auxiliary_loss_mlp": 0.0100497, "balance_loss_clip": 1.00358105, "balance_loss_mlp": 1.00720131, "epoch": 0.6828799038027957, "flos": 71646565829760.0, "grad_norm": 0.8225272916674389, "language_loss": 0.59679031, "learning_rate": 9.132052580426309e-07, "loss": 0.6173135, "num_input_tokens_seen": 245070720, "router_z_loss_clip": 0.01391602, "router_z_loss_mlp": 0.21875, "step": 11358, "time_per_iteration": 3.2182278633117676 }, { "auxiliary_loss_clip": 0.01141791, "auxiliary_loss_mlp": 0.01035144, "balance_loss_clip": 1.02142334, "balance_loss_mlp": 1.03900576, "epoch": 0.6829400270554637, "flos": 19829154637440.0, "grad_norm": 1.6163750092182207, "language_loss": 0.78335345, "learning_rate": 9.128881048086576e-07, "loss": 0.80512273, "num_input_tokens_seen": 245089070, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 11359, "time_per_iteration": 2.609118700027466 }, { "auxiliary_loss_clip": 0.0111785, "auxiliary_loss_mlp": 0.01036043, "balance_loss_clip": 1.02253056, "balance_loss_mlp": 1.0373255, "epoch": 0.6830001503081317, "flos": 21980634543360.0, "grad_norm": 2.1511218871048454, "language_loss": 0.81785893, "learning_rate": 9.125709903709109e-07, "loss": 0.83939791, "num_input_tokens_seen": 245106500, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 11360, "time_per_iteration": 2.517321825027466 }, { "auxiliary_loss_clip": 0.01122557, "auxiliary_loss_mlp": 0.01037071, "balance_loss_clip": 1.02390397, "balance_loss_mlp": 1.04078746, "epoch": 0.6830602735607997, "flos": 24316767290880.0, "grad_norm": 2.758124849546689, "language_loss": 0.75645429, "learning_rate": 9.122539147407098e-07, "loss": 0.7780506, "num_input_tokens_seen": 245125260, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 11361, "time_per_iteration": 2.557323932647705 }, { "auxiliary_loss_clip": 0.01047399, "auxiliary_loss_mlp": 0.01001226, "balance_loss_clip": 0.99993259, "balance_loss_mlp": 1.00795984, "epoch": 0.6831203968134676, "flos": 57690062323200.0, "grad_norm": 0.8779037185055879, "language_loss": 0.59733057, "learning_rate": 9.11936877929367e-07, "loss": 0.61781681, "num_input_tokens_seen": 245188730, "router_z_loss_clip": 0.01293945, "router_z_loss_mlp": 0.21875, "step": 11362, "time_per_iteration": 4.577887058258057 }, { "auxiliary_loss_clip": 0.0112738, "auxiliary_loss_mlp": 0.01034493, "balance_loss_clip": 1.02086115, "balance_loss_mlp": 1.03602481, "epoch": 0.6831805200661356, "flos": 14388436552320.0, "grad_norm": 2.4764359709160257, "language_loss": 0.75614202, "learning_rate": 9.116198799481988e-07, "loss": 0.77776068, "num_input_tokens_seen": 245205065, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 11363, "time_per_iteration": 2.550938844680786 }, { "auxiliary_loss_clip": 0.01125423, "auxiliary_loss_mlp": 0.01037569, "balance_loss_clip": 1.02390742, "balance_loss_mlp": 1.03650069, "epoch": 0.6832406433188035, "flos": 22820297846400.0, "grad_norm": 2.239565599335331, "language_loss": 0.90286267, "learning_rate": 9.113029208085171e-07, "loss": 0.92449248, "num_input_tokens_seen": 245224265, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 11364, "time_per_iteration": 2.6380653381347656 }, { "auxiliary_loss_clip": 0.01122894, "auxiliary_loss_mlp": 0.01033012, "balance_loss_clip": 1.02120399, "balance_loss_mlp": 1.03613997, "epoch": 0.6833007665714715, "flos": 17561718650880.0, "grad_norm": 1.7636980227461068, "language_loss": 0.87936723, "learning_rate": 9.109860005216347e-07, "loss": 0.90092635, "num_input_tokens_seen": 245243360, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69140625, "step": 11365, "time_per_iteration": 2.57183837890625 }, { "auxiliary_loss_clip": 0.01142308, "auxiliary_loss_mlp": 0.01040059, "balance_loss_clip": 1.0268451, "balance_loss_mlp": 1.03936291, "epoch": 0.6833608898241395, "flos": 22445928345600.0, "grad_norm": 2.0845492022910856, "language_loss": 0.81485832, "learning_rate": 9.106691190988596e-07, "loss": 0.83668202, "num_input_tokens_seen": 245256350, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 11366, "time_per_iteration": 2.57829213142395 }, { "auxiliary_loss_clip": 0.0112602, "auxiliary_loss_mlp": 0.01028307, "balance_loss_clip": 1.01567078, "balance_loss_mlp": 1.03610027, "epoch": 0.6834210130768075, "flos": 24534637234560.0, "grad_norm": 1.6744917197630818, "language_loss": 0.76707101, "learning_rate": 9.10352276551502e-07, "loss": 0.78861427, "num_input_tokens_seen": 245277575, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 11367, "time_per_iteration": 5.540399074554443 }, { "auxiliary_loss_clip": 0.01126039, "auxiliary_loss_mlp": 0.01037192, "balance_loss_clip": 1.02354848, "balance_loss_mlp": 1.03655457, "epoch": 0.6834811363294754, "flos": 20047132321920.0, "grad_norm": 1.511730953586383, "language_loss": 0.69044781, "learning_rate": 9.100354728908688e-07, "loss": 0.71208012, "num_input_tokens_seen": 245296615, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 11368, "time_per_iteration": 2.5718932151794434 }, { "auxiliary_loss_clip": 0.01135722, "auxiliary_loss_mlp": 0.01035983, "balance_loss_clip": 1.02353191, "balance_loss_mlp": 1.03695369, "epoch": 0.6835412595821434, "flos": 24790752184320.0, "grad_norm": 1.8505516797518347, "language_loss": 0.7343027, "learning_rate": 9.097187081282658e-07, "loss": 0.75601971, "num_input_tokens_seen": 245316275, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 11369, "time_per_iteration": 2.589413642883301 }, { "auxiliary_loss_clip": 0.01117885, "auxiliary_loss_mlp": 0.01033076, "balance_loss_clip": 1.0198741, "balance_loss_mlp": 1.03633332, "epoch": 0.6836013828348113, "flos": 19500356517120.0, "grad_norm": 1.837928181405775, "language_loss": 0.791444, "learning_rate": 9.094019822749976e-07, "loss": 0.81295371, "num_input_tokens_seen": 245334595, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 11370, "time_per_iteration": 2.5197503566741943 }, { "auxiliary_loss_clip": 0.0103885, "auxiliary_loss_mlp": 0.01000371, "balance_loss_clip": 0.99895287, "balance_loss_mlp": 1.0079844, "epoch": 0.6836615060874793, "flos": 58363999251840.0, "grad_norm": 0.7444853936884896, "language_loss": 0.59786415, "learning_rate": 9.090852953423674e-07, "loss": 0.61825633, "num_input_tokens_seen": 245389750, "router_z_loss_clip": 0.01416016, "router_z_loss_mlp": 0.21875, "step": 11371, "time_per_iteration": 3.0420830249786377 }, { "auxiliary_loss_clip": 0.01124328, "auxiliary_loss_mlp": 0.01029541, "balance_loss_clip": 1.01732767, "balance_loss_mlp": 1.03770494, "epoch": 0.6837216293401474, "flos": 12166895168640.0, "grad_norm": 1.5009587669145767, "language_loss": 0.63512695, "learning_rate": 9.087686473416766e-07, "loss": 0.65666562, "num_input_tokens_seen": 245407530, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 11372, "time_per_iteration": 2.5792288780212402 }, { "auxiliary_loss_clip": 0.01115992, "auxiliary_loss_mlp": 0.01030586, "balance_loss_clip": 1.01756287, "balance_loss_mlp": 1.03621149, "epoch": 0.6837817525928153, "flos": 22127581082880.0, "grad_norm": 1.759042261313284, "language_loss": 0.72085637, "learning_rate": 9.084520382842253e-07, "loss": 0.74232209, "num_input_tokens_seen": 245427000, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 11373, "time_per_iteration": 2.530263900756836 }, { "auxiliary_loss_clip": 0.01056401, "auxiliary_loss_mlp": 0.01002624, "balance_loss_clip": 1.0013541, "balance_loss_mlp": 1.00812173, "epoch": 0.6838418758454833, "flos": 65005928985600.0, "grad_norm": 1.2603456200848708, "language_loss": 0.56746161, "learning_rate": 9.081354681813136e-07, "loss": 0.5880518, "num_input_tokens_seen": 245491620, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.21875, "step": 11374, "time_per_iteration": 3.339992046356201 }, { "auxiliary_loss_clip": 0.01121289, "auxiliary_loss_mlp": 0.01028958, "balance_loss_clip": 1.01554728, "balance_loss_mlp": 1.03968072, "epoch": 0.6839019990981512, "flos": 21030833162880.0, "grad_norm": 1.6191222238440945, "language_loss": 0.73997992, "learning_rate": 9.078189370442386e-07, "loss": 0.76148236, "num_input_tokens_seen": 245511285, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 11375, "time_per_iteration": 2.5772807598114014 }, { "auxiliary_loss_clip": 0.01125497, "auxiliary_loss_mlp": 0.01034644, "balance_loss_clip": 1.02188265, "balance_loss_mlp": 1.03762114, "epoch": 0.6839621223508192, "flos": 24935543907840.0, "grad_norm": 1.7579440000459725, "language_loss": 0.7096132, "learning_rate": 9.07502444884296e-07, "loss": 0.73121464, "num_input_tokens_seen": 245532910, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 11376, "time_per_iteration": 2.664077043533325 }, { "auxiliary_loss_clip": 0.01110521, "auxiliary_loss_mlp": 0.01035581, "balance_loss_clip": 1.02175856, "balance_loss_mlp": 1.0367682, "epoch": 0.6840222456034871, "flos": 26358827391360.0, "grad_norm": 1.6915985621413825, "language_loss": 0.74412358, "learning_rate": 9.071859917127804e-07, "loss": 0.76558459, "num_input_tokens_seen": 245550540, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 11377, "time_per_iteration": 2.5409352779388428 }, { "auxiliary_loss_clip": 0.01047398, "auxiliary_loss_mlp": 0.01001693, "balance_loss_clip": 1.00043583, "balance_loss_mlp": 1.00744009, "epoch": 0.6840823688561551, "flos": 65988336936960.0, "grad_norm": 1.0057767389510839, "language_loss": 0.56846976, "learning_rate": 9.068695775409872e-07, "loss": 0.58896065, "num_input_tokens_seen": 245619570, "router_z_loss_clip": 0.01257324, "router_z_loss_mlp": 0.21875, "step": 11378, "time_per_iteration": 3.231229066848755 }, { "auxiliary_loss_clip": 0.01109351, "auxiliary_loss_mlp": 0.01034673, "balance_loss_clip": 1.02099371, "balance_loss_mlp": 1.03664899, "epoch": 0.684142492108823, "flos": 21397588980480.0, "grad_norm": 1.5908536322201796, "language_loss": 0.78548026, "learning_rate": 9.065532023802051e-07, "loss": 0.80692053, "num_input_tokens_seen": 245637980, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 11379, "time_per_iteration": 2.4977962970733643 }, { "auxiliary_loss_clip": 0.01103673, "auxiliary_loss_mlp": 0.01028063, "balance_loss_clip": 1.01638007, "balance_loss_mlp": 1.03715253, "epoch": 0.6842026153614911, "flos": 18801426700800.0, "grad_norm": 1.7721983919977118, "language_loss": 0.68815649, "learning_rate": 9.062368662417276e-07, "loss": 0.70947385, "num_input_tokens_seen": 245655690, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6640625, "step": 11380, "time_per_iteration": 2.538133144378662 }, { "auxiliary_loss_clip": 0.01125292, "auxiliary_loss_mlp": 0.01033611, "balance_loss_clip": 1.0205214, "balance_loss_mlp": 1.03645897, "epoch": 0.684262738614159, "flos": 19646405216640.0, "grad_norm": 2.0743911489292723, "language_loss": 0.78258395, "learning_rate": 9.059205691368421e-07, "loss": 0.80417299, "num_input_tokens_seen": 245671525, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 11381, "time_per_iteration": 2.557225465774536 }, { "auxiliary_loss_clip": 0.01037216, "auxiliary_loss_mlp": 0.01001326, "balance_loss_clip": 0.99999732, "balance_loss_mlp": 1.00720751, "epoch": 0.684322861866827, "flos": 62354462739840.0, "grad_norm": 0.8666897786848258, "language_loss": 0.67165601, "learning_rate": 9.056043110768385e-07, "loss": 0.69204152, "num_input_tokens_seen": 245724115, "router_z_loss_clip": 0.01330566, "router_z_loss_mlp": 0.21875, "step": 11382, "time_per_iteration": 3.050482988357544 }, { "auxiliary_loss_clip": 0.01037624, "auxiliary_loss_mlp": 0.01003286, "balance_loss_clip": 1.00203991, "balance_loss_mlp": 1.00703979, "epoch": 0.6843829851194949, "flos": 65805048812160.0, "grad_norm": 0.8273582740223545, "language_loss": 0.58159745, "learning_rate": 9.052880920730006e-07, "loss": 0.60200649, "num_input_tokens_seen": 245789245, "router_z_loss_clip": 0.01245117, "router_z_loss_mlp": 0.21875, "step": 11383, "time_per_iteration": 3.200329542160034 }, { "auxiliary_loss_clip": 0.01142296, "auxiliary_loss_mlp": 0.01026041, "balance_loss_clip": 1.01360714, "balance_loss_mlp": 1.03592718, "epoch": 0.6844431083721629, "flos": 27855153181440.0, "grad_norm": 1.8614069628969623, "language_loss": 0.79808873, "learning_rate": 9.049719121366153e-07, "loss": 0.81977212, "num_input_tokens_seen": 245812420, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 11384, "time_per_iteration": 2.68599271774292 }, { "auxiliary_loss_clip": 0.01118942, "auxiliary_loss_mlp": 0.01034009, "balance_loss_clip": 1.02081835, "balance_loss_mlp": 1.03583944, "epoch": 0.684503231624831, "flos": 18255010032000.0, "grad_norm": 1.9846589339152418, "language_loss": 0.76961231, "learning_rate": 9.046557712789667e-07, "loss": 0.79114187, "num_input_tokens_seen": 245829135, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 11385, "time_per_iteration": 2.548198699951172 }, { "auxiliary_loss_clip": 0.01143491, "auxiliary_loss_mlp": 0.01040389, "balance_loss_clip": 1.02575564, "balance_loss_mlp": 1.03882003, "epoch": 0.6845633548774989, "flos": 17639681120640.0, "grad_norm": 1.7862734387738692, "language_loss": 0.8457818, "learning_rate": 9.043396695113344e-07, "loss": 0.86762059, "num_input_tokens_seen": 245847140, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.77734375, "step": 11386, "time_per_iteration": 2.5021488666534424 }, { "auxiliary_loss_clip": 0.01118424, "auxiliary_loss_mlp": 0.01036788, "balance_loss_clip": 1.02467036, "balance_loss_mlp": 1.03870797, "epoch": 0.6846234781301669, "flos": 20807576179200.0, "grad_norm": 2.619803003681151, "language_loss": 0.83425939, "learning_rate": 9.040236068450016e-07, "loss": 0.85581148, "num_input_tokens_seen": 245862855, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7109375, "step": 11387, "time_per_iteration": 2.5948450565338135 }, { "auxiliary_loss_clip": 0.01114651, "auxiliary_loss_mlp": 0.01027408, "balance_loss_clip": 1.01478934, "balance_loss_mlp": 1.0356257, "epoch": 0.6846836013828348, "flos": 36101176485120.0, "grad_norm": 1.451409700803065, "language_loss": 0.72109497, "learning_rate": 9.037075832912473e-07, "loss": 0.74251556, "num_input_tokens_seen": 245885415, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 11388, "time_per_iteration": 2.7305283546447754 }, { "auxiliary_loss_clip": 0.01115221, "auxiliary_loss_mlp": 0.0102459, "balance_loss_clip": 1.01272821, "balance_loss_mlp": 1.03610969, "epoch": 0.6847437246355028, "flos": 43142468607360.0, "grad_norm": 1.7607838461589889, "language_loss": 0.62105417, "learning_rate": 9.033915988613492e-07, "loss": 0.6424523, "num_input_tokens_seen": 245906285, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69921875, "step": 11389, "time_per_iteration": 2.7086026668548584 }, { "auxiliary_loss_clip": 0.01045265, "auxiliary_loss_mlp": 0.01000996, "balance_loss_clip": 0.99974996, "balance_loss_mlp": 1.00690961, "epoch": 0.6848038478881707, "flos": 71663729552640.0, "grad_norm": 0.739215000048144, "language_loss": 0.55982381, "learning_rate": 9.030756535665834e-07, "loss": 0.58028638, "num_input_tokens_seen": 245967620, "router_z_loss_clip": 0.01245117, "router_z_loss_mlp": 0.21679688, "step": 11390, "time_per_iteration": 3.2475087642669678 }, { "auxiliary_loss_clip": 0.01137014, "auxiliary_loss_mlp": 0.01280154, "balance_loss_clip": 1.02018523, "balance_loss_mlp": 1.03924608, "epoch": 0.6848639711408387, "flos": 19937820257280.0, "grad_norm": 1.9166536171187452, "language_loss": 0.88152605, "learning_rate": 9.027597474182267e-07, "loss": 0.9056977, "num_input_tokens_seen": 245985075, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 11391, "time_per_iteration": 2.5431394577026367 }, { "auxiliary_loss_clip": 0.01148893, "auxiliary_loss_mlp": 0.01031712, "balance_loss_clip": 1.01924884, "balance_loss_mlp": 1.04023075, "epoch": 0.6849240943935067, "flos": 26867501844480.0, "grad_norm": 1.5798990393426688, "language_loss": 0.79161745, "learning_rate": 9.02443880427552e-07, "loss": 0.81342351, "num_input_tokens_seen": 246003560, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 11392, "time_per_iteration": 2.672312021255493 }, { "auxiliary_loss_clip": 0.01122653, "auxiliary_loss_mlp": 0.01029484, "balance_loss_clip": 1.01712155, "balance_loss_mlp": 1.03621864, "epoch": 0.6849842176461747, "flos": 13735365425280.0, "grad_norm": 2.201723463227377, "language_loss": 0.70983565, "learning_rate": 9.021280526058322e-07, "loss": 0.73135698, "num_input_tokens_seen": 246019600, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 11393, "time_per_iteration": 2.50504207611084 }, { "auxiliary_loss_clip": 0.01147542, "auxiliary_loss_mlp": 0.01033852, "balance_loss_clip": 1.01979673, "balance_loss_mlp": 1.03767598, "epoch": 0.6850443408988426, "flos": 24973070641920.0, "grad_norm": 1.613351751043866, "language_loss": 0.64393514, "learning_rate": 9.018122639643373e-07, "loss": 0.66574907, "num_input_tokens_seen": 246038920, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 11394, "time_per_iteration": 2.6544573307037354 }, { "auxiliary_loss_clip": 0.01128192, "auxiliary_loss_mlp": 0.01284726, "balance_loss_clip": 1.02439511, "balance_loss_mlp": 1.03779721, "epoch": 0.6851044641515106, "flos": 27744225004800.0, "grad_norm": 1.6797676061264624, "language_loss": 0.80353421, "learning_rate": 9.014965145143392e-07, "loss": 0.82766336, "num_input_tokens_seen": 246060490, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 11395, "time_per_iteration": 2.6138038635253906 }, { "auxiliary_loss_clip": 0.01136309, "auxiliary_loss_mlp": 0.01032088, "balance_loss_clip": 1.01815271, "balance_loss_mlp": 1.03862023, "epoch": 0.6851645874041785, "flos": 24351061800960.0, "grad_norm": 1.5751439476838676, "language_loss": 0.72692055, "learning_rate": 9.011808042671035e-07, "loss": 0.74860454, "num_input_tokens_seen": 246081465, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7109375, "step": 11396, "time_per_iteration": 2.6947100162506104 }, { "auxiliary_loss_clip": 0.01139032, "auxiliary_loss_mlp": 0.01028295, "balance_loss_clip": 1.01452625, "balance_loss_mlp": 1.03856945, "epoch": 0.6852247106568465, "flos": 15077849264640.0, "grad_norm": 2.1372972985854766, "language_loss": 0.78763026, "learning_rate": 9.00865133233899e-07, "loss": 0.80930352, "num_input_tokens_seen": 246096110, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 11397, "time_per_iteration": 3.9438135623931885 }, { "auxiliary_loss_clip": 0.01161248, "auxiliary_loss_mlp": 0.01028874, "balance_loss_clip": 1.0153079, "balance_loss_mlp": 1.03567696, "epoch": 0.6852848339095146, "flos": 18770005278720.0, "grad_norm": 1.7939930531527037, "language_loss": 0.71488482, "learning_rate": 9.005495014259905e-07, "loss": 0.73678601, "num_input_tokens_seen": 246114785, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.72265625, "step": 11398, "time_per_iteration": 2.600865364074707 }, { "auxiliary_loss_clip": 0.01141238, "auxiliary_loss_mlp": 0.01034753, "balance_loss_clip": 1.02268946, "balance_loss_mlp": 1.0360322, "epoch": 0.6853449571621825, "flos": 27854363082240.0, "grad_norm": 1.6504609868100235, "language_loss": 0.70692617, "learning_rate": 9.002339088546424e-07, "loss": 0.72868609, "num_input_tokens_seen": 246136375, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6953125, "step": 11399, "time_per_iteration": 2.6488826274871826 }, { "auxiliary_loss_clip": 0.01120157, "auxiliary_loss_mlp": 0.01031089, "balance_loss_clip": 1.01857841, "balance_loss_mlp": 1.03890669, "epoch": 0.6854050804148505, "flos": 18150510389760.0, "grad_norm": 1.723273329465479, "language_loss": 0.70361978, "learning_rate": 8.999183555311169e-07, "loss": 0.72513223, "num_input_tokens_seen": 246155090, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 11400, "time_per_iteration": 2.4919543266296387 }, { "auxiliary_loss_clip": 0.01155664, "auxiliary_loss_mlp": 0.01036539, "balance_loss_clip": 1.02025485, "balance_loss_mlp": 1.04060185, "epoch": 0.6854652036675184, "flos": 16326212492160.0, "grad_norm": 1.8627172438810056, "language_loss": 0.77883726, "learning_rate": 8.996028414666752e-07, "loss": 0.80075938, "num_input_tokens_seen": 246172645, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.79296875, "step": 11401, "time_per_iteration": 2.566129684448242 }, { "auxiliary_loss_clip": 0.01105982, "auxiliary_loss_mlp": 0.0103632, "balance_loss_clip": 1.02374387, "balance_loss_mlp": 1.03638351, "epoch": 0.6855253269201864, "flos": 14940814878720.0, "grad_norm": 1.7686284246385655, "language_loss": 0.75083357, "learning_rate": 8.992873666725786e-07, "loss": 0.77225661, "num_input_tokens_seen": 246189055, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 11402, "time_per_iteration": 2.51363468170166 }, { "auxiliary_loss_clip": 0.01118074, "auxiliary_loss_mlp": 0.01037634, "balance_loss_clip": 1.02427626, "balance_loss_mlp": 1.03599954, "epoch": 0.6855854501728543, "flos": 23037736826880.0, "grad_norm": 1.5655828236403224, "language_loss": 0.72833472, "learning_rate": 8.989719311600832e-07, "loss": 0.74989182, "num_input_tokens_seen": 246207990, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 11403, "time_per_iteration": 3.9651896953582764 }, { "auxiliary_loss_clip": 0.01106458, "auxiliary_loss_mlp": 0.01029275, "balance_loss_clip": 1.01693702, "balance_loss_mlp": 1.0372709, "epoch": 0.6856455734255223, "flos": 13253623194240.0, "grad_norm": 1.9656434292420868, "language_loss": 0.81696033, "learning_rate": 8.986565349404482e-07, "loss": 0.83831763, "num_input_tokens_seen": 246221595, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 11404, "time_per_iteration": 2.4810736179351807 }, { "auxiliary_loss_clip": 0.01126635, "auxiliary_loss_mlp": 0.01033396, "balance_loss_clip": 1.02098012, "balance_loss_mlp": 1.03841293, "epoch": 0.6857056966781903, "flos": 23333461499520.0, "grad_norm": 1.523756048928085, "language_loss": 0.78073812, "learning_rate": 8.983411780249284e-07, "loss": 0.80233836, "num_input_tokens_seen": 246242970, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 11405, "time_per_iteration": 2.5998613834381104 }, { "auxiliary_loss_clip": 0.01038483, "auxiliary_loss_mlp": 0.00998843, "balance_loss_clip": 0.99760294, "balance_loss_mlp": 1.00767636, "epoch": 0.6857658199308583, "flos": 61852647784320.0, "grad_norm": 0.7957852982805897, "language_loss": 0.61113596, "learning_rate": 8.980258604247781e-07, "loss": 0.63150918, "num_input_tokens_seen": 246300405, "router_z_loss_clip": 0.01239014, "router_z_loss_mlp": 0.21875, "step": 11406, "time_per_iteration": 3.178250789642334 }, { "auxiliary_loss_clip": 0.01133531, "auxiliary_loss_mlp": 0.01035247, "balance_loss_clip": 1.02138877, "balance_loss_mlp": 1.03558946, "epoch": 0.6858259431835262, "flos": 16654543735680.0, "grad_norm": 1.7591306706553242, "language_loss": 0.76760525, "learning_rate": 8.977105821512496e-07, "loss": 0.78929305, "num_input_tokens_seen": 246318780, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 11407, "time_per_iteration": 2.5621142387390137 }, { "auxiliary_loss_clip": 0.011273, "auxiliary_loss_mlp": 0.01040253, "balance_loss_clip": 1.02666879, "balance_loss_mlp": 1.0366559, "epoch": 0.6858860664361942, "flos": 21872974504320.0, "grad_norm": 3.4550148273350416, "language_loss": 0.71137297, "learning_rate": 8.973953432155956e-07, "loss": 0.73304856, "num_input_tokens_seen": 246339405, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 11408, "time_per_iteration": 2.5883028507232666 }, { "auxiliary_loss_clip": 0.01118481, "auxiliary_loss_mlp": 0.01029841, "balance_loss_clip": 1.01659131, "balance_loss_mlp": 1.03637505, "epoch": 0.6859461896888621, "flos": 15267637751040.0, "grad_norm": 1.9293175468904116, "language_loss": 0.695719, "learning_rate": 8.970801436290658e-07, "loss": 0.71720225, "num_input_tokens_seen": 246357055, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 11409, "time_per_iteration": 5.464868545532227 }, { "auxiliary_loss_clip": 0.01137824, "auxiliary_loss_mlp": 0.01027516, "balance_loss_clip": 1.01601815, "balance_loss_mlp": 1.03693581, "epoch": 0.6860063129415301, "flos": 18620293392000.0, "grad_norm": 1.7135832746854514, "language_loss": 0.78307104, "learning_rate": 8.967649834029085e-07, "loss": 0.80472445, "num_input_tokens_seen": 246374050, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.65625, "step": 11410, "time_per_iteration": 2.5872750282287598 }, { "auxiliary_loss_clip": 0.01118782, "auxiliary_loss_mlp": 0.01038122, "balance_loss_clip": 1.02405548, "balance_loss_mlp": 1.03596056, "epoch": 0.6860664361941982, "flos": 23951376190080.0, "grad_norm": 1.7437142838432884, "language_loss": 0.71768939, "learning_rate": 8.964498625483703e-07, "loss": 0.73925841, "num_input_tokens_seen": 246392910, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 11411, "time_per_iteration": 2.557950019836426 }, { "auxiliary_loss_clip": 0.01146672, "auxiliary_loss_mlp": 0.0102862, "balance_loss_clip": 1.01531053, "balance_loss_mlp": 1.03874195, "epoch": 0.6861265594468661, "flos": 20407782827520.0, "grad_norm": 3.343779541704904, "language_loss": 0.69781613, "learning_rate": 8.961347810766993e-07, "loss": 0.71956909, "num_input_tokens_seen": 246411540, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 11412, "time_per_iteration": 2.5296008586883545 }, { "auxiliary_loss_clip": 0.01108163, "auxiliary_loss_mlp": 0.01032198, "balance_loss_clip": 1.01890063, "balance_loss_mlp": 1.03664339, "epoch": 0.6861866826995341, "flos": 11428571111040.0, "grad_norm": 3.1640384970831543, "language_loss": 0.71674597, "learning_rate": 8.958197389991371e-07, "loss": 0.73814958, "num_input_tokens_seen": 246423295, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 11413, "time_per_iteration": 2.4542014598846436 }, { "auxiliary_loss_clip": 0.01117891, "auxiliary_loss_mlp": 0.01032375, "balance_loss_clip": 1.02010202, "balance_loss_mlp": 1.03708947, "epoch": 0.686246805952202, "flos": 15997593939840.0, "grad_norm": 1.7595550960575874, "language_loss": 0.73997295, "learning_rate": 8.955047363269288e-07, "loss": 0.76147556, "num_input_tokens_seen": 246441045, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.71875, "step": 11414, "time_per_iteration": 2.524388313293457 }, { "auxiliary_loss_clip": 0.01131855, "auxiliary_loss_mlp": 0.01031827, "balance_loss_clip": 1.01840973, "balance_loss_mlp": 1.04009354, "epoch": 0.68630692920487, "flos": 19826712512640.0, "grad_norm": 10.503144012622924, "language_loss": 0.86927927, "learning_rate": 8.95189773071316e-07, "loss": 0.89091611, "num_input_tokens_seen": 246456905, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 11415, "time_per_iteration": 2.5631296634674072 }, { "auxiliary_loss_clip": 0.01120057, "auxiliary_loss_mlp": 0.01035736, "balance_loss_clip": 1.02242649, "balance_loss_mlp": 1.03832197, "epoch": 0.6863670524575379, "flos": 26286216048000.0, "grad_norm": 1.6841580177470399, "language_loss": 0.67307377, "learning_rate": 8.948748492435369e-07, "loss": 0.6946317, "num_input_tokens_seen": 246477545, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 11416, "time_per_iteration": 2.5578956604003906 }, { "auxiliary_loss_clip": 0.01126666, "auxiliary_loss_mlp": 0.01033695, "balance_loss_clip": 1.02132082, "balance_loss_mlp": 1.03726339, "epoch": 0.686427175710206, "flos": 19173138595200.0, "grad_norm": 1.4386695008640569, "language_loss": 0.7614255, "learning_rate": 8.945599648548325e-07, "loss": 0.78302908, "num_input_tokens_seen": 246496705, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 11417, "time_per_iteration": 2.5477113723754883 }, { "auxiliary_loss_clip": 0.01126346, "auxiliary_loss_mlp": 0.01031793, "balance_loss_clip": 1.01952088, "balance_loss_mlp": 1.03737783, "epoch": 0.6864872989628739, "flos": 18916628595840.0, "grad_norm": 1.8216882116019977, "language_loss": 0.77516061, "learning_rate": 8.942451199164386e-07, "loss": 0.79674196, "num_input_tokens_seen": 246514860, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 11418, "time_per_iteration": 2.5306038856506348 }, { "auxiliary_loss_clip": 0.0114428, "auxiliary_loss_mlp": 0.01030262, "balance_loss_clip": 1.01815033, "balance_loss_mlp": 1.03843963, "epoch": 0.6865474222155419, "flos": 25956196865280.0, "grad_norm": 1.7269207068697214, "language_loss": 0.76078862, "learning_rate": 8.939303144395936e-07, "loss": 0.78253406, "num_input_tokens_seen": 246536145, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 11419, "time_per_iteration": 2.656003713607788 }, { "auxiliary_loss_clip": 0.01107413, "auxiliary_loss_mlp": 0.01038422, "balance_loss_clip": 1.02669764, "balance_loss_mlp": 1.03741097, "epoch": 0.6866075454682098, "flos": 18478087447680.0, "grad_norm": 1.6723209273176, "language_loss": 0.71473849, "learning_rate": 8.93615548435529e-07, "loss": 0.73619676, "num_input_tokens_seen": 246553265, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69921875, "step": 11420, "time_per_iteration": 2.524867296218872 }, { "auxiliary_loss_clip": 0.01132133, "auxiliary_loss_mlp": 0.01026578, "balance_loss_clip": 1.01457405, "balance_loss_mlp": 1.03612757, "epoch": 0.6866676687208778, "flos": 34239998298240.0, "grad_norm": 1.3696167961386037, "language_loss": 0.74475205, "learning_rate": 8.933008219154803e-07, "loss": 0.76633918, "num_input_tokens_seen": 246575130, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69921875, "step": 11421, "time_per_iteration": 2.6814165115356445 }, { "auxiliary_loss_clip": 0.01113372, "auxiliary_loss_mlp": 0.01031266, "balance_loss_clip": 1.01922607, "balance_loss_mlp": 1.03589678, "epoch": 0.6867277919735457, "flos": 21721754246400.0, "grad_norm": 1.6979342524831602, "language_loss": 0.77116323, "learning_rate": 8.929861348906784e-07, "loss": 0.79260957, "num_input_tokens_seen": 246593095, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 11422, "time_per_iteration": 2.578193187713623 }, { "auxiliary_loss_clip": 0.01118339, "auxiliary_loss_mlp": 0.01040977, "balance_loss_clip": 1.02752399, "balance_loss_mlp": 1.03824914, "epoch": 0.6867879152262137, "flos": 24097999507200.0, "grad_norm": 1.9286613028686104, "language_loss": 0.7704972, "learning_rate": 8.926714873723537e-07, "loss": 0.79209042, "num_input_tokens_seen": 246612165, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 11423, "time_per_iteration": 2.6795570850372314 }, { "auxiliary_loss_clip": 0.01137172, "auxiliary_loss_mlp": 0.01032798, "balance_loss_clip": 1.01936293, "balance_loss_mlp": 1.03973591, "epoch": 0.6868480384788818, "flos": 21615818060160.0, "grad_norm": 1.7555627948904946, "language_loss": 0.72733855, "learning_rate": 8.923568793717347e-07, "loss": 0.74903828, "num_input_tokens_seen": 246632065, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 11424, "time_per_iteration": 2.6060469150543213 }, { "auxiliary_loss_clip": 0.01125184, "auxiliary_loss_mlp": 0.01027835, "balance_loss_clip": 1.01563358, "balance_loss_mlp": 1.03680801, "epoch": 0.6869081617315497, "flos": 26286144220800.0, "grad_norm": 2.090370875388154, "language_loss": 0.65368664, "learning_rate": 8.920423109000501e-07, "loss": 0.67521679, "num_input_tokens_seen": 246651245, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 11425, "time_per_iteration": 2.6799912452697754 }, { "auxiliary_loss_clip": 0.01135058, "auxiliary_loss_mlp": 0.0102865, "balance_loss_clip": 1.01637745, "balance_loss_mlp": 1.03955376, "epoch": 0.6869682849842177, "flos": 21105096531840.0, "grad_norm": 1.377527294598981, "language_loss": 0.7157588, "learning_rate": 8.917277819685254e-07, "loss": 0.73739588, "num_input_tokens_seen": 246672225, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 11426, "time_per_iteration": 2.6803064346313477 }, { "auxiliary_loss_clip": 0.01141992, "auxiliary_loss_mlp": 0.01028192, "balance_loss_clip": 1.01603246, "balance_loss_mlp": 1.03581429, "epoch": 0.6870284082368856, "flos": 17092653920640.0, "grad_norm": 2.1443067690324344, "language_loss": 0.84933251, "learning_rate": 8.914132925883855e-07, "loss": 0.87103438, "num_input_tokens_seen": 246688385, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 11427, "time_per_iteration": 2.5438408851623535 }, { "auxiliary_loss_clip": 0.01114999, "auxiliary_loss_mlp": 0.01030797, "balance_loss_clip": 1.01898348, "balance_loss_mlp": 1.0380888, "epoch": 0.6870885314895536, "flos": 27308090067840.0, "grad_norm": 1.4951550221576415, "language_loss": 0.76003695, "learning_rate": 8.910988427708526e-07, "loss": 0.78149498, "num_input_tokens_seen": 246710730, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 11428, "time_per_iteration": 2.5769448280334473 }, { "auxiliary_loss_clip": 0.01138415, "auxiliary_loss_mlp": 0.01035399, "balance_loss_clip": 1.02188647, "balance_loss_mlp": 1.03818178, "epoch": 0.6871486547422215, "flos": 20814543417600.0, "grad_norm": 1.843429957063337, "language_loss": 0.72739458, "learning_rate": 8.907844325271511e-07, "loss": 0.74913275, "num_input_tokens_seen": 246730350, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 11429, "time_per_iteration": 2.632075548171997 }, { "auxiliary_loss_clip": 0.01129889, "auxiliary_loss_mlp": 0.01028932, "balance_loss_clip": 1.01648664, "balance_loss_mlp": 1.04025376, "epoch": 0.6872087779948896, "flos": 30154118330880.0, "grad_norm": 1.6723123161528788, "language_loss": 0.83126712, "learning_rate": 8.904700618684993e-07, "loss": 0.85285532, "num_input_tokens_seen": 246751700, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 11430, "time_per_iteration": 2.699007034301758 }, { "auxiliary_loss_clip": 0.01115783, "auxiliary_loss_mlp": 0.01036876, "balance_loss_clip": 1.02491331, "balance_loss_mlp": 1.03673351, "epoch": 0.6872689012475575, "flos": 20704584908160.0, "grad_norm": 3.1839663437900865, "language_loss": 0.70300388, "learning_rate": 8.901557308061163e-07, "loss": 0.72453046, "num_input_tokens_seen": 246769860, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.703125, "step": 11431, "time_per_iteration": 2.5738754272460938 }, { "auxiliary_loss_clip": 0.01126285, "auxiliary_loss_mlp": 0.01278139, "balance_loss_clip": 1.01847696, "balance_loss_mlp": 1.03637683, "epoch": 0.6873290245002255, "flos": 25520852027520.0, "grad_norm": 2.0821038588369016, "language_loss": 0.79967642, "learning_rate": 8.898414393512217e-07, "loss": 0.82372069, "num_input_tokens_seen": 246789905, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 11432, "time_per_iteration": 2.68798828125 }, { "auxiliary_loss_clip": 0.01122533, "auxiliary_loss_mlp": 0.01027833, "balance_loss_clip": 1.01631701, "balance_loss_mlp": 1.03729558, "epoch": 0.6873891477528934, "flos": 25191479289600.0, "grad_norm": 1.597269919019705, "language_loss": 0.67837304, "learning_rate": 8.89527187515029e-07, "loss": 0.69987667, "num_input_tokens_seen": 246808815, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.67578125, "step": 11433, "time_per_iteration": 2.5859062671661377 }, { "auxiliary_loss_clip": 0.01120566, "auxiliary_loss_mlp": 0.01038486, "balance_loss_clip": 1.02593279, "balance_loss_mlp": 1.04149532, "epoch": 0.6874492710055614, "flos": 35152380685440.0, "grad_norm": 2.4631017151182784, "language_loss": 0.72866678, "learning_rate": 8.892129753087554e-07, "loss": 0.75025737, "num_input_tokens_seen": 246829775, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 11434, "time_per_iteration": 2.688739061355591 }, { "auxiliary_loss_clip": 0.01135907, "auxiliary_loss_mlp": 0.01029524, "balance_loss_clip": 1.01689363, "balance_loss_mlp": 1.03964412, "epoch": 0.6875093942582293, "flos": 17822215059840.0, "grad_norm": 1.6418597885898578, "language_loss": 0.80708921, "learning_rate": 8.888988027436124e-07, "loss": 0.82874358, "num_input_tokens_seen": 246848045, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 11435, "time_per_iteration": 2.5700483322143555 }, { "auxiliary_loss_clip": 0.0110927, "auxiliary_loss_mlp": 0.01033336, "balance_loss_clip": 1.02021122, "balance_loss_mlp": 1.03707266, "epoch": 0.6875695175108973, "flos": 20704548994560.0, "grad_norm": 2.1082688948653363, "language_loss": 0.81354749, "learning_rate": 8.885846698308148e-07, "loss": 0.83497351, "num_input_tokens_seen": 246866095, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 11436, "time_per_iteration": 2.5466508865356445 }, { "auxiliary_loss_clip": 0.01130074, "auxiliary_loss_mlp": 0.01030734, "balance_loss_clip": 1.02008224, "balance_loss_mlp": 1.0371716, "epoch": 0.6876296407635654, "flos": 25374013228800.0, "grad_norm": 1.7252824606378527, "language_loss": 0.8210308, "learning_rate": 8.882705765815697e-07, "loss": 0.84263885, "num_input_tokens_seen": 246883975, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.65625, "step": 11437, "time_per_iteration": 2.5591354370117188 }, { "auxiliary_loss_clip": 0.01134019, "auxiliary_loss_mlp": 0.01037423, "balance_loss_clip": 1.02392888, "balance_loss_mlp": 1.04028869, "epoch": 0.6876897640162333, "flos": 23222317841280.0, "grad_norm": 2.3545253887004685, "language_loss": 0.78063107, "learning_rate": 8.879565230070889e-07, "loss": 0.80234545, "num_input_tokens_seen": 246901560, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 11438, "time_per_iteration": 3.9514284133911133 }, { "auxiliary_loss_clip": 0.01102135, "auxiliary_loss_mlp": 0.01029184, "balance_loss_clip": 1.01751912, "balance_loss_mlp": 1.03579795, "epoch": 0.6877498872689013, "flos": 27124335066240.0, "grad_norm": 1.7834710624440226, "language_loss": 0.72048128, "learning_rate": 8.876425091185793e-07, "loss": 0.74179447, "num_input_tokens_seen": 246922655, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 11439, "time_per_iteration": 2.577054023742676 }, { "auxiliary_loss_clip": 0.01115851, "auxiliary_loss_mlp": 0.0102801, "balance_loss_clip": 1.01531386, "balance_loss_mlp": 1.03709745, "epoch": 0.6878100105215692, "flos": 11581658876160.0, "grad_norm": 2.645657877225895, "language_loss": 0.7550503, "learning_rate": 8.873285349272472e-07, "loss": 0.7764889, "num_input_tokens_seen": 246940100, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11440, "time_per_iteration": 2.4588210582733154 }, { "auxiliary_loss_clip": 0.01131211, "auxiliary_loss_mlp": 0.01035452, "balance_loss_clip": 1.02379966, "balance_loss_mlp": 1.03548706, "epoch": 0.6878701337742372, "flos": 20303175444480.0, "grad_norm": 1.5538065303693018, "language_loss": 0.71824169, "learning_rate": 8.870146004442969e-07, "loss": 0.73990834, "num_input_tokens_seen": 246958545, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 11441, "time_per_iteration": 2.5395243167877197 }, { "auxiliary_loss_clip": 0.01139244, "auxiliary_loss_mlp": 0.01030064, "balance_loss_clip": 1.01661718, "balance_loss_mlp": 1.03813028, "epoch": 0.6879302570269051, "flos": 13840080549120.0, "grad_norm": 1.6028703417508048, "language_loss": 0.66464865, "learning_rate": 8.86700705680933e-07, "loss": 0.68634164, "num_input_tokens_seen": 246974805, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 11442, "time_per_iteration": 2.5304062366485596 }, { "auxiliary_loss_clip": 0.01118031, "auxiliary_loss_mlp": 0.01030471, "balance_loss_clip": 1.01849675, "balance_loss_mlp": 1.03951669, "epoch": 0.6879903802795732, "flos": 21324654414720.0, "grad_norm": 2.3417504910996616, "language_loss": 0.6949935, "learning_rate": 8.863868506483574e-07, "loss": 0.71647859, "num_input_tokens_seen": 246992505, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 11443, "time_per_iteration": 2.5593066215515137 }, { "auxiliary_loss_clip": 0.01130956, "auxiliary_loss_mlp": 0.0103313, "balance_loss_clip": 1.02040994, "balance_loss_mlp": 1.04159617, "epoch": 0.6880505035322411, "flos": 25152049134720.0, "grad_norm": 1.5691014004442614, "language_loss": 0.76281393, "learning_rate": 8.860730353577705e-07, "loss": 0.78445482, "num_input_tokens_seen": 247013370, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 11444, "time_per_iteration": 2.5786755084991455 }, { "auxiliary_loss_clip": 0.01105202, "auxiliary_loss_mlp": 0.01029311, "balance_loss_clip": 1.01739609, "balance_loss_mlp": 1.03658795, "epoch": 0.6881106267849091, "flos": 23215530170880.0, "grad_norm": 1.8849673941352814, "language_loss": 0.76774716, "learning_rate": 8.857592598203718e-07, "loss": 0.78909224, "num_input_tokens_seen": 247029855, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 11445, "time_per_iteration": 3.919660806655884 }, { "auxiliary_loss_clip": 0.01038152, "auxiliary_loss_mlp": 0.01003904, "balance_loss_clip": 1.00278306, "balance_loss_mlp": 1.00715101, "epoch": 0.688170750037577, "flos": 48484397312640.0, "grad_norm": 0.8164951889677471, "language_loss": 0.58435303, "learning_rate": 8.854455240473587e-07, "loss": 0.60477358, "num_input_tokens_seen": 247085030, "router_z_loss_clip": 0.01123047, "router_z_loss_mlp": 0.21777344, "step": 11446, "time_per_iteration": 3.1421258449554443 }, { "auxiliary_loss_clip": 0.01117635, "auxiliary_loss_mlp": 0.01028182, "balance_loss_clip": 1.01539648, "balance_loss_mlp": 1.0361197, "epoch": 0.688230873290245, "flos": 22783633038720.0, "grad_norm": 1.6755536401364193, "language_loss": 0.75684983, "learning_rate": 8.85131828049928e-07, "loss": 0.77830797, "num_input_tokens_seen": 247104840, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 11447, "time_per_iteration": 2.5495388507843018 }, { "auxiliary_loss_clip": 0.01127316, "auxiliary_loss_mlp": 0.01036266, "balance_loss_clip": 1.02376688, "balance_loss_mlp": 1.03729117, "epoch": 0.6882909965429129, "flos": 22455660931200.0, "grad_norm": 2.5415048171618695, "language_loss": 0.73459452, "learning_rate": 8.848181718392737e-07, "loss": 0.75623035, "num_input_tokens_seen": 247121905, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 11448, "time_per_iteration": 2.558654308319092 }, { "auxiliary_loss_clip": 0.01118758, "auxiliary_loss_mlp": 0.01033973, "balance_loss_clip": 1.02088416, "balance_loss_mlp": 1.03619874, "epoch": 0.688351119795581, "flos": 26214143408640.0, "grad_norm": 1.7298646651974687, "language_loss": 0.74682558, "learning_rate": 8.84504555426592e-07, "loss": 0.76835287, "num_input_tokens_seen": 247142375, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 11449, "time_per_iteration": 2.568652868270874 }, { "auxiliary_loss_clip": 0.01104954, "auxiliary_loss_mlp": 0.01035739, "balance_loss_clip": 1.02402639, "balance_loss_mlp": 1.03590345, "epoch": 0.6884112430482489, "flos": 22565260304640.0, "grad_norm": 1.7200938117033515, "language_loss": 0.69864273, "learning_rate": 8.841909788230715e-07, "loss": 0.72004974, "num_input_tokens_seen": 247161095, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 11450, "time_per_iteration": 4.218649864196777 }, { "auxiliary_loss_clip": 0.01138449, "auxiliary_loss_mlp": 0.01035762, "balance_loss_clip": 1.02235663, "balance_loss_mlp": 1.03649497, "epoch": 0.6884713663009169, "flos": 17341047446400.0, "grad_norm": 1.8212959181502857, "language_loss": 0.76361418, "learning_rate": 8.838774420399058e-07, "loss": 0.78535628, "num_input_tokens_seen": 247178565, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 11451, "time_per_iteration": 4.002967596054077 }, { "auxiliary_loss_clip": 0.01117525, "auxiliary_loss_mlp": 0.01030627, "balance_loss_clip": 1.01856923, "balance_loss_mlp": 1.03733611, "epoch": 0.6885314895535849, "flos": 26470832976000.0, "grad_norm": 1.3492320449378867, "language_loss": 0.69363546, "learning_rate": 8.835639450882821e-07, "loss": 0.71511698, "num_input_tokens_seen": 247202345, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.7109375, "step": 11452, "time_per_iteration": 2.5778260231018066 }, { "auxiliary_loss_clip": 0.0110736, "auxiliary_loss_mlp": 0.01035653, "balance_loss_clip": 1.02089453, "balance_loss_mlp": 1.03724408, "epoch": 0.6885916128062528, "flos": 20521548178560.0, "grad_norm": 2.1300285833220416, "language_loss": 0.71701479, "learning_rate": 8.832504879793912e-07, "loss": 0.73844492, "num_input_tokens_seen": 247219240, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.703125, "step": 11453, "time_per_iteration": 2.556244134902954 }, { "auxiliary_loss_clip": 0.01037989, "auxiliary_loss_mlp": 0.01249561, "balance_loss_clip": 1.00251126, "balance_loss_mlp": 1.00734258, "epoch": 0.6886517360589208, "flos": 70715795679360.0, "grad_norm": 0.7847607909097464, "language_loss": 0.50711966, "learning_rate": 8.829370707244162e-07, "loss": 0.52999508, "num_input_tokens_seen": 247272010, "router_z_loss_clip": 0.01135254, "router_z_loss_mlp": 0.21777344, "step": 11454, "time_per_iteration": 3.0135796070098877 }, { "auxiliary_loss_clip": 0.01113276, "auxiliary_loss_mlp": 0.01030664, "balance_loss_clip": 1.01871324, "balance_loss_mlp": 1.03874099, "epoch": 0.6887118593115887, "flos": 17893533513600.0, "grad_norm": 1.7024901050391121, "language_loss": 0.75339901, "learning_rate": 8.826236933345443e-07, "loss": 0.77483845, "num_input_tokens_seen": 247290630, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.66015625, "step": 11455, "time_per_iteration": 2.5019943714141846 }, { "auxiliary_loss_clip": 0.01126979, "auxiliary_loss_mlp": 0.01036597, "balance_loss_clip": 1.02331734, "balance_loss_mlp": 1.03564382, "epoch": 0.6887719825642568, "flos": 17453017117440.0, "grad_norm": 2.137397930547861, "language_loss": 0.72881877, "learning_rate": 8.823103558209586e-07, "loss": 0.7504546, "num_input_tokens_seen": 247304800, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 11456, "time_per_iteration": 2.545501232147217 }, { "auxiliary_loss_clip": 0.01124652, "auxiliary_loss_mlp": 0.01035411, "balance_loss_clip": 1.02334738, "balance_loss_mlp": 1.03891683, "epoch": 0.6888321058169247, "flos": 23070199743360.0, "grad_norm": 1.8007025032013324, "language_loss": 0.81081909, "learning_rate": 8.819970581948415e-07, "loss": 0.83241975, "num_input_tokens_seen": 247323450, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 11457, "time_per_iteration": 2.6041858196258545 }, { "auxiliary_loss_clip": 0.01120292, "auxiliary_loss_mlp": 0.01276855, "balance_loss_clip": 1.01694512, "balance_loss_mlp": 1.03976536, "epoch": 0.6888922290695927, "flos": 23368833417600.0, "grad_norm": 1.6608746689652565, "language_loss": 0.75939381, "learning_rate": 8.816838004673725e-07, "loss": 0.78336525, "num_input_tokens_seen": 247343845, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 11458, "time_per_iteration": 2.5595972537994385 }, { "auxiliary_loss_clip": 0.0110771, "auxiliary_loss_mlp": 0.01031615, "balance_loss_clip": 1.01942015, "balance_loss_mlp": 1.03716159, "epoch": 0.6889523523222606, "flos": 17631636474240.0, "grad_norm": 3.0812199441268993, "language_loss": 0.67689776, "learning_rate": 8.813705826497337e-07, "loss": 0.69829106, "num_input_tokens_seen": 247356650, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 11459, "time_per_iteration": 2.5009169578552246 }, { "auxiliary_loss_clip": 0.01135367, "auxiliary_loss_mlp": 0.01033577, "balance_loss_clip": 1.0220015, "balance_loss_mlp": 1.03776598, "epoch": 0.6890124755749286, "flos": 25228144097280.0, "grad_norm": 1.4585469490409333, "language_loss": 0.68864083, "learning_rate": 8.810574047531006e-07, "loss": 0.71033025, "num_input_tokens_seen": 247377340, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.703125, "step": 11460, "time_per_iteration": 2.6375925540924072 }, { "auxiliary_loss_clip": 0.01138095, "auxiliary_loss_mlp": 0.01033067, "balance_loss_clip": 1.01938152, "balance_loss_mlp": 1.03575325, "epoch": 0.6890725988275965, "flos": 20230240878720.0, "grad_norm": 2.140377781604627, "language_loss": 0.76582003, "learning_rate": 8.807442667886496e-07, "loss": 0.78753173, "num_input_tokens_seen": 247395805, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 11461, "time_per_iteration": 2.641423225402832 }, { "auxiliary_loss_clip": 0.01126698, "auxiliary_loss_mlp": 0.01038139, "balance_loss_clip": 1.02585411, "balance_loss_mlp": 1.03832293, "epoch": 0.6891327220802645, "flos": 14535311264640.0, "grad_norm": 1.6208551272932383, "language_loss": 0.69142294, "learning_rate": 8.804311687675574e-07, "loss": 0.71307135, "num_input_tokens_seen": 247413165, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 11462, "time_per_iteration": 2.552241325378418 }, { "auxiliary_loss_clip": 0.01118809, "auxiliary_loss_mlp": 0.01027526, "balance_loss_clip": 1.01434135, "balance_loss_mlp": 1.03825176, "epoch": 0.6891928453329325, "flos": 21139139646720.0, "grad_norm": 1.6037575840847427, "language_loss": 0.87244594, "learning_rate": 8.801181107009969e-07, "loss": 0.89390922, "num_input_tokens_seen": 247433140, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 11463, "time_per_iteration": 2.6772940158843994 }, { "auxiliary_loss_clip": 0.01139342, "auxiliary_loss_mlp": 0.01029386, "balance_loss_clip": 1.01842523, "balance_loss_mlp": 1.03703129, "epoch": 0.6892529685856005, "flos": 17858520731520.0, "grad_norm": 1.6510321993358255, "language_loss": 0.68465698, "learning_rate": 8.798050926001404e-07, "loss": 0.70634425, "num_input_tokens_seen": 247451265, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6640625, "step": 11464, "time_per_iteration": 2.6150965690612793 }, { "auxiliary_loss_clip": 0.01117636, "auxiliary_loss_mlp": 0.01035569, "balance_loss_clip": 1.02278948, "balance_loss_mlp": 1.03803849, "epoch": 0.6893130918382685, "flos": 29934811843200.0, "grad_norm": 1.7006579178307433, "language_loss": 0.64897782, "learning_rate": 8.794921144761578e-07, "loss": 0.67050982, "num_input_tokens_seen": 247471645, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 11465, "time_per_iteration": 2.6737325191497803 }, { "auxiliary_loss_clip": 0.01118886, "auxiliary_loss_mlp": 0.01038063, "balance_loss_clip": 1.02576649, "balance_loss_mlp": 1.0363847, "epoch": 0.6893732150909364, "flos": 24388516707840.0, "grad_norm": 1.4066753545990751, "language_loss": 0.72587609, "learning_rate": 8.79179176340221e-07, "loss": 0.74744558, "num_input_tokens_seen": 247491170, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.734375, "step": 11466, "time_per_iteration": 2.6224205493927 }, { "auxiliary_loss_clip": 0.01108718, "auxiliary_loss_mlp": 0.01028393, "balance_loss_clip": 1.01633477, "balance_loss_mlp": 1.03767252, "epoch": 0.6894333383436044, "flos": 16982874979200.0, "grad_norm": 1.887897818640751, "language_loss": 0.72092164, "learning_rate": 8.788662782034948e-07, "loss": 0.74229276, "num_input_tokens_seen": 247509005, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.7109375, "step": 11467, "time_per_iteration": 2.5136592388153076 }, { "auxiliary_loss_clip": 0.01127515, "auxiliary_loss_mlp": 0.01035132, "balance_loss_clip": 1.02111328, "balance_loss_mlp": 1.03631353, "epoch": 0.6894934615962723, "flos": 18985540838400.0, "grad_norm": 1.8720800430944016, "language_loss": 0.80992001, "learning_rate": 8.785534200771478e-07, "loss": 0.83154649, "num_input_tokens_seen": 247527050, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 11468, "time_per_iteration": 2.5345137119293213 }, { "auxiliary_loss_clip": 0.01111148, "auxiliary_loss_mlp": 0.01033511, "balance_loss_clip": 1.020648, "balance_loss_mlp": 1.03852916, "epoch": 0.6895535848489404, "flos": 34531664734080.0, "grad_norm": 1.5083147450518024, "language_loss": 0.65956312, "learning_rate": 8.782406019723441e-07, "loss": 0.68100971, "num_input_tokens_seen": 247547765, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 11469, "time_per_iteration": 2.632439136505127 }, { "auxiliary_loss_clip": 0.0103909, "auxiliary_loss_mlp": 0.01001284, "balance_loss_clip": 0.99997234, "balance_loss_mlp": 1.0084126, "epoch": 0.6896137081016083, "flos": 50075852273280.0, "grad_norm": 0.9614096418126257, "language_loss": 0.55228698, "learning_rate": 8.77927823900249e-07, "loss": 0.57269073, "num_input_tokens_seen": 247603515, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.21875, "step": 11470, "time_per_iteration": 3.0501551628112793 }, { "auxiliary_loss_clip": 0.01112042, "auxiliary_loss_mlp": 0.01034448, "balance_loss_clip": 1.02392173, "balance_loss_mlp": 1.03634977, "epoch": 0.6896738313542763, "flos": 19938215306880.0, "grad_norm": 1.7428062021918431, "language_loss": 0.77965015, "learning_rate": 8.776150858720222e-07, "loss": 0.80111504, "num_input_tokens_seen": 247622110, "router_z_loss_clip": 0.10546875, "router_z_loss_mlp": 0.66796875, "step": 11471, "time_per_iteration": 2.540346622467041 }, { "auxiliary_loss_clip": 0.01124645, "auxiliary_loss_mlp": 0.0103301, "balance_loss_clip": 1.0205524, "balance_loss_mlp": 1.03659582, "epoch": 0.6897339546069442, "flos": 21725489260800.0, "grad_norm": 1.6448973697395324, "language_loss": 0.78189248, "learning_rate": 8.773023878988266e-07, "loss": 0.803469, "num_input_tokens_seen": 247641905, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 11472, "time_per_iteration": 2.6831839084625244 }, { "auxiliary_loss_clip": 0.01121857, "auxiliary_loss_mlp": 0.01029232, "balance_loss_clip": 1.01728761, "balance_loss_mlp": 1.0340606, "epoch": 0.6897940778596122, "flos": 19826497031040.0, "grad_norm": 1.6182235199897952, "language_loss": 0.76209557, "learning_rate": 8.769897299918208e-07, "loss": 0.78360641, "num_input_tokens_seen": 247660945, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69921875, "step": 11473, "time_per_iteration": 2.5631039142608643 }, { "auxiliary_loss_clip": 0.0113802, "auxiliary_loss_mlp": 0.01036723, "balance_loss_clip": 1.02364016, "balance_loss_mlp": 1.03790021, "epoch": 0.6898542011122801, "flos": 17310056987520.0, "grad_norm": 1.9869681585472931, "language_loss": 0.75253302, "learning_rate": 8.766771121621628e-07, "loss": 0.77428049, "num_input_tokens_seen": 247678395, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 11474, "time_per_iteration": 2.598327159881592 }, { "auxiliary_loss_clip": 0.01153222, "auxiliary_loss_mlp": 0.0103239, "balance_loss_clip": 1.01963425, "balance_loss_mlp": 1.0384779, "epoch": 0.6899143243649482, "flos": 24754051463040.0, "grad_norm": 1.522592793207733, "language_loss": 0.74180162, "learning_rate": 8.763645344210091e-07, "loss": 0.76365775, "num_input_tokens_seen": 247698380, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 11475, "time_per_iteration": 2.6074230670928955 }, { "auxiliary_loss_clip": 0.01134911, "auxiliary_loss_mlp": 0.01030312, "balance_loss_clip": 1.01755047, "balance_loss_mlp": 1.03687191, "epoch": 0.6899744476176161, "flos": 17234536642560.0, "grad_norm": 2.6364934013450267, "language_loss": 0.88911748, "learning_rate": 8.76051996779515e-07, "loss": 0.9107697, "num_input_tokens_seen": 247716370, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 11476, "time_per_iteration": 2.62370228767395 }, { "auxiliary_loss_clip": 0.01106123, "auxiliary_loss_mlp": 0.01032933, "balance_loss_clip": 1.02078557, "balance_loss_mlp": 1.03653741, "epoch": 0.6900345708702841, "flos": 25410678036480.0, "grad_norm": 1.9380418427661288, "language_loss": 0.69930387, "learning_rate": 8.757394992488338e-07, "loss": 0.72069442, "num_input_tokens_seen": 247737335, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 11477, "time_per_iteration": 2.657217025756836 }, { "auxiliary_loss_clip": 0.01122902, "auxiliary_loss_mlp": 0.01041495, "balance_loss_clip": 1.0286268, "balance_loss_mlp": 1.03927112, "epoch": 0.6900946941229521, "flos": 23434190213760.0, "grad_norm": 2.2446442373170887, "language_loss": 0.68252242, "learning_rate": 8.754270418401173e-07, "loss": 0.70416641, "num_input_tokens_seen": 247756680, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75, "step": 11478, "time_per_iteration": 2.675300359725952 }, { "auxiliary_loss_clip": 0.01118161, "auxiliary_loss_mlp": 0.01032515, "balance_loss_clip": 1.02037382, "balance_loss_mlp": 1.0373354, "epoch": 0.69015481737562, "flos": 17820096157440.0, "grad_norm": 1.8109226722294332, "language_loss": 0.7637769, "learning_rate": 8.751146245645178e-07, "loss": 0.78528363, "num_input_tokens_seen": 247774265, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71875, "step": 11479, "time_per_iteration": 2.5967624187469482 }, { "auxiliary_loss_clip": 0.0111797, "auxiliary_loss_mlp": 0.01031816, "balance_loss_clip": 1.01866722, "balance_loss_mlp": 1.03766727, "epoch": 0.690214940628288, "flos": 17456500736640.0, "grad_norm": 1.4945184983031716, "language_loss": 0.78652573, "learning_rate": 8.748022474331835e-07, "loss": 0.80802357, "num_input_tokens_seen": 247792395, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 11480, "time_per_iteration": 4.024914264678955 }, { "auxiliary_loss_clip": 0.01135332, "auxiliary_loss_mlp": 0.01025855, "balance_loss_clip": 1.01321888, "balance_loss_mlp": 1.03855395, "epoch": 0.6902750638809559, "flos": 29566691308800.0, "grad_norm": 1.5082413219534623, "language_loss": 0.7550534, "learning_rate": 8.74489910457263e-07, "loss": 0.77666533, "num_input_tokens_seen": 247811985, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 11481, "time_per_iteration": 2.6420936584472656 }, { "auxiliary_loss_clip": 0.01132569, "auxiliary_loss_mlp": 0.01030126, "balance_loss_clip": 1.01748991, "balance_loss_mlp": 1.03560281, "epoch": 0.690335187133624, "flos": 25557121785600.0, "grad_norm": 2.7426535266604177, "language_loss": 0.69330114, "learning_rate": 8.741776136479014e-07, "loss": 0.71492815, "num_input_tokens_seen": 247831880, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 11482, "time_per_iteration": 2.596811056137085 }, { "auxiliary_loss_clip": 0.01108523, "auxiliary_loss_mlp": 0.01032438, "balance_loss_clip": 1.0198195, "balance_loss_mlp": 1.03762245, "epoch": 0.6903953103862919, "flos": 22488447070080.0, "grad_norm": 1.4466628135762756, "language_loss": 0.82809854, "learning_rate": 8.738653570162464e-07, "loss": 0.84950817, "num_input_tokens_seen": 247851170, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 11483, "time_per_iteration": 2.577728509902954 }, { "auxiliary_loss_clip": 0.01108404, "auxiliary_loss_mlp": 0.01030349, "balance_loss_clip": 1.01867867, "balance_loss_mlp": 1.03691471, "epoch": 0.6904554336389599, "flos": 26100521712000.0, "grad_norm": 1.795914920762958, "language_loss": 0.65142632, "learning_rate": 8.735531405734387e-07, "loss": 0.67281389, "num_input_tokens_seen": 247868950, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.71484375, "step": 11484, "time_per_iteration": 2.5384485721588135 }, { "auxiliary_loss_clip": 0.01122454, "auxiliary_loss_mlp": 0.01036147, "balance_loss_clip": 1.02420807, "balance_loss_mlp": 1.03526747, "epoch": 0.6905155568916278, "flos": 31171754545920.0, "grad_norm": 1.457756004215036, "language_loss": 0.73624253, "learning_rate": 8.732409643306223e-07, "loss": 0.75782853, "num_input_tokens_seen": 247889805, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 11485, "time_per_iteration": 2.666785478591919 }, { "auxiliary_loss_clip": 0.01119818, "auxiliary_loss_mlp": 0.01283496, "balance_loss_clip": 1.02343535, "balance_loss_mlp": 1.03877926, "epoch": 0.6905756801442958, "flos": 17639681120640.0, "grad_norm": 1.8560789735934808, "language_loss": 0.84724337, "learning_rate": 8.729288282989369e-07, "loss": 0.8712765, "num_input_tokens_seen": 247908585, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 11486, "time_per_iteration": 2.5067803859710693 }, { "auxiliary_loss_clip": 0.01152285, "auxiliary_loss_mlp": 0.0103296, "balance_loss_clip": 1.02010942, "balance_loss_mlp": 1.03735089, "epoch": 0.6906358033969637, "flos": 22343691260160.0, "grad_norm": 1.696053029998662, "language_loss": 0.72511876, "learning_rate": 8.72616732489524e-07, "loss": 0.74697125, "num_input_tokens_seen": 247928480, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 11487, "time_per_iteration": 3.986182689666748 }, { "auxiliary_loss_clip": 0.01046233, "auxiliary_loss_mlp": 0.0100352, "balance_loss_clip": 1.00232792, "balance_loss_mlp": 1.00664902, "epoch": 0.6906959266496318, "flos": 69747789081600.0, "grad_norm": 0.9010959496020438, "language_loss": 0.66695178, "learning_rate": 8.723046769135183e-07, "loss": 0.68744934, "num_input_tokens_seen": 247988855, "router_z_loss_clip": 0.01190186, "router_z_loss_mlp": 0.21777344, "step": 11488, "time_per_iteration": 3.1444807052612305 }, { "auxiliary_loss_clip": 0.01129847, "auxiliary_loss_mlp": 0.01036282, "balance_loss_clip": 1.02223969, "balance_loss_mlp": 1.03707957, "epoch": 0.6907560499022997, "flos": 21434253788160.0, "grad_norm": 1.6650281570151326, "language_loss": 0.74646884, "learning_rate": 8.719926615820587e-07, "loss": 0.76813018, "num_input_tokens_seen": 248007685, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 11489, "time_per_iteration": 2.5143609046936035 }, { "auxiliary_loss_clip": 0.01118893, "auxiliary_loss_mlp": 0.01034553, "balance_loss_clip": 1.02088571, "balance_loss_mlp": 1.03696346, "epoch": 0.6908161731549677, "flos": 14392207480320.0, "grad_norm": 2.073572013286774, "language_loss": 0.62657934, "learning_rate": 8.716806865062803e-07, "loss": 0.64811379, "num_input_tokens_seen": 248025145, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 11490, "time_per_iteration": 2.502866268157959 }, { "auxiliary_loss_clip": 0.01125635, "auxiliary_loss_mlp": 0.01030902, "balance_loss_clip": 1.01843286, "balance_loss_mlp": 1.03664017, "epoch": 0.6908762964076357, "flos": 20010970304640.0, "grad_norm": 2.093906306819275, "language_loss": 0.72863448, "learning_rate": 8.713687516973142e-07, "loss": 0.75019985, "num_input_tokens_seen": 248043750, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 11491, "time_per_iteration": 2.560422658920288 }, { "auxiliary_loss_clip": 0.01123949, "auxiliary_loss_mlp": 0.01287021, "balance_loss_clip": 1.02715898, "balance_loss_mlp": 1.03613472, "epoch": 0.6909364196603036, "flos": 28769079853440.0, "grad_norm": 1.352068508517847, "language_loss": 0.70277441, "learning_rate": 8.710568571662948e-07, "loss": 0.72688407, "num_input_tokens_seen": 248065765, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 11492, "time_per_iteration": 5.635981798171997 }, { "auxiliary_loss_clip": 0.01147663, "auxiliary_loss_mlp": 0.01033476, "balance_loss_clip": 1.01992226, "balance_loss_mlp": 1.03726029, "epoch": 0.6909965429129716, "flos": 22528128620160.0, "grad_norm": 1.8554703101133536, "language_loss": 0.74548233, "learning_rate": 8.707450029243524e-07, "loss": 0.76729369, "num_input_tokens_seen": 248083810, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75, "step": 11493, "time_per_iteration": 2.68057918548584 }, { "auxiliary_loss_clip": 0.01109253, "auxiliary_loss_mlp": 0.01033302, "balance_loss_clip": 1.02092218, "balance_loss_mlp": 1.03868747, "epoch": 0.6910566661656395, "flos": 18405942981120.0, "grad_norm": 1.8270626319043077, "language_loss": 0.7407943, "learning_rate": 8.70433188982616e-07, "loss": 0.76221991, "num_input_tokens_seen": 248103185, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 11494, "time_per_iteration": 2.5367178916931152 }, { "auxiliary_loss_clip": 0.01123586, "auxiliary_loss_mlp": 0.01031422, "balance_loss_clip": 1.01852322, "balance_loss_mlp": 1.0355103, "epoch": 0.6911167894183076, "flos": 30773972355840.0, "grad_norm": 1.7895201456674854, "language_loss": 0.68137491, "learning_rate": 8.701214153522127e-07, "loss": 0.70292497, "num_input_tokens_seen": 248125665, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 11495, "time_per_iteration": 2.698366641998291 }, { "auxiliary_loss_clip": 0.01123622, "auxiliary_loss_mlp": 0.01031083, "balance_loss_clip": 1.01814318, "balance_loss_mlp": 1.03435242, "epoch": 0.6911769126709755, "flos": 13735724561280.0, "grad_norm": 1.7642563215116758, "language_loss": 0.7401281, "learning_rate": 8.698096820442704e-07, "loss": 0.76167512, "num_input_tokens_seen": 248142545, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 11496, "time_per_iteration": 2.5640103816986084 }, { "auxiliary_loss_clip": 0.01131883, "auxiliary_loss_mlp": 0.01029501, "balance_loss_clip": 1.01767588, "balance_loss_mlp": 1.03543901, "epoch": 0.6912370359236435, "flos": 17566854295680.0, "grad_norm": 1.7478563350441574, "language_loss": 0.79862082, "learning_rate": 8.694979890699135e-07, "loss": 0.82023472, "num_input_tokens_seen": 248160225, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 11497, "time_per_iteration": 2.5344057083129883 }, { "auxiliary_loss_clip": 0.01138032, "auxiliary_loss_mlp": 0.01037619, "balance_loss_clip": 1.02426124, "balance_loss_mlp": 1.03747082, "epoch": 0.6912971591763114, "flos": 22090772620800.0, "grad_norm": 1.7746158144139141, "language_loss": 0.80560195, "learning_rate": 8.691863364402655e-07, "loss": 0.82735848, "num_input_tokens_seen": 248180430, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73828125, "step": 11498, "time_per_iteration": 2.6408276557922363 }, { "auxiliary_loss_clip": 0.01112972, "auxiliary_loss_mlp": 0.01031674, "balance_loss_clip": 1.01930547, "balance_loss_mlp": 1.03514004, "epoch": 0.6913572824289794, "flos": 29971476650880.0, "grad_norm": 1.5266006387566733, "language_loss": 0.86372119, "learning_rate": 8.688747241664471e-07, "loss": 0.88516766, "num_input_tokens_seen": 248202365, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 11499, "time_per_iteration": 2.6023824214935303 }, { "auxiliary_loss_clip": 0.01124316, "auxiliary_loss_mlp": 0.01280666, "balance_loss_clip": 1.02159274, "balance_loss_mlp": 1.03703272, "epoch": 0.6914174056816473, "flos": 20448936835200.0, "grad_norm": 2.5655880800756727, "language_loss": 0.75610149, "learning_rate": 8.68563152259582e-07, "loss": 0.78015125, "num_input_tokens_seen": 248221750, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 11500, "time_per_iteration": 2.592444896697998 }, { "auxiliary_loss_clip": 0.01153589, "auxiliary_loss_mlp": 0.01035384, "balance_loss_clip": 1.02260518, "balance_loss_mlp": 1.03725481, "epoch": 0.6914775289343154, "flos": 21282530739840.0, "grad_norm": 3.5439574846885464, "language_loss": 0.76769829, "learning_rate": 8.682516207307862e-07, "loss": 0.78958797, "num_input_tokens_seen": 248239535, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 11501, "time_per_iteration": 2.5907270908355713 }, { "auxiliary_loss_clip": 0.01119375, "auxiliary_loss_mlp": 0.01036581, "balance_loss_clip": 1.02337265, "balance_loss_mlp": 1.03697777, "epoch": 0.6915376521869833, "flos": 23878118401920.0, "grad_norm": 1.9108826818926032, "language_loss": 0.73681426, "learning_rate": 8.679401295911794e-07, "loss": 0.75837386, "num_input_tokens_seen": 248259055, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 11502, "time_per_iteration": 2.578219413757324 }, { "auxiliary_loss_clip": 0.0116462, "auxiliary_loss_mlp": 0.01035131, "balance_loss_clip": 1.02135634, "balance_loss_mlp": 1.03813338, "epoch": 0.6915977754396513, "flos": 11510268595200.0, "grad_norm": 2.1161399375277328, "language_loss": 0.73384595, "learning_rate": 8.676286788518774e-07, "loss": 0.75584346, "num_input_tokens_seen": 248276765, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73828125, "step": 11503, "time_per_iteration": 2.5984795093536377 }, { "auxiliary_loss_clip": 0.01115881, "auxiliary_loss_mlp": 0.01033848, "balance_loss_clip": 1.02179027, "balance_loss_mlp": 1.03602326, "epoch": 0.6916578986923193, "flos": 22601278667520.0, "grad_norm": 1.7580866580590846, "language_loss": 0.76963031, "learning_rate": 8.673172685239951e-07, "loss": 0.79112756, "num_input_tokens_seen": 248295310, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7109375, "step": 11504, "time_per_iteration": 2.5855159759521484 }, { "auxiliary_loss_clip": 0.01134252, "auxiliary_loss_mlp": 0.01029755, "balance_loss_clip": 1.01733983, "balance_loss_mlp": 1.03585386, "epoch": 0.6917180219449872, "flos": 23477355383040.0, "grad_norm": 1.7045561931263866, "language_loss": 0.73869759, "learning_rate": 8.670058986186459e-07, "loss": 0.76033765, "num_input_tokens_seen": 248315230, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 11505, "time_per_iteration": 2.6041548252105713 }, { "auxiliary_loss_clip": 0.0115123, "auxiliary_loss_mlp": 0.0128455, "balance_loss_clip": 1.02464426, "balance_loss_mlp": 1.03622746, "epoch": 0.6917781451976552, "flos": 23732536579200.0, "grad_norm": 3.687457091941571, "language_loss": 0.87434816, "learning_rate": 8.666945691469409e-07, "loss": 0.89870596, "num_input_tokens_seen": 248332980, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 11506, "time_per_iteration": 2.6775543689727783 }, { "auxiliary_loss_clip": 0.01124462, "auxiliary_loss_mlp": 0.01029676, "balance_loss_clip": 1.01715863, "balance_loss_mlp": 1.03702033, "epoch": 0.6918382684503231, "flos": 31466760946560.0, "grad_norm": 1.949961839889125, "language_loss": 0.70042777, "learning_rate": 8.663832801199933e-07, "loss": 0.72196913, "num_input_tokens_seen": 248352865, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 11507, "time_per_iteration": 2.597153902053833 }, { "auxiliary_loss_clip": 0.01127434, "auxiliary_loss_mlp": 0.01033275, "balance_loss_clip": 1.02047813, "balance_loss_mlp": 1.03565955, "epoch": 0.6918983917029912, "flos": 21650471706240.0, "grad_norm": 2.17932522637064, "language_loss": 0.77036464, "learning_rate": 8.660720315489087e-07, "loss": 0.7919718, "num_input_tokens_seen": 248371125, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 11508, "time_per_iteration": 2.5865538120269775 }, { "auxiliary_loss_clip": 0.01114598, "auxiliary_loss_mlp": 0.01033686, "balance_loss_clip": 1.02164626, "balance_loss_mlp": 1.03756201, "epoch": 0.6919585149556591, "flos": 25550082720000.0, "grad_norm": 1.7768968047159306, "language_loss": 0.75277311, "learning_rate": 8.657608234447972e-07, "loss": 0.77425599, "num_input_tokens_seen": 248390455, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 11509, "time_per_iteration": 2.5689446926116943 }, { "auxiliary_loss_clip": 0.01144599, "auxiliary_loss_mlp": 0.01033125, "balance_loss_clip": 1.02082849, "balance_loss_mlp": 1.0385201, "epoch": 0.6920186382083271, "flos": 23659781581440.0, "grad_norm": 1.547991899723378, "language_loss": 0.6444217, "learning_rate": 8.654496558187643e-07, "loss": 0.66619897, "num_input_tokens_seen": 248411305, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 11510, "time_per_iteration": 2.6431221961975098 }, { "auxiliary_loss_clip": 0.01140295, "auxiliary_loss_mlp": 0.01035687, "balance_loss_clip": 1.02128065, "balance_loss_mlp": 1.03944468, "epoch": 0.692078761460995, "flos": 19061959023360.0, "grad_norm": 1.7788569118285267, "language_loss": 0.75154293, "learning_rate": 8.651385286819149e-07, "loss": 0.77330279, "num_input_tokens_seen": 248430190, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.74609375, "step": 11511, "time_per_iteration": 2.510439872741699 }, { "auxiliary_loss_clip": 0.01133849, "auxiliary_loss_mlp": 0.0103258, "balance_loss_clip": 1.02096939, "balance_loss_mlp": 1.03714144, "epoch": 0.692138884713663, "flos": 29023291382400.0, "grad_norm": 1.626781896920418, "language_loss": 0.62307209, "learning_rate": 8.648274420453514e-07, "loss": 0.64473641, "num_input_tokens_seen": 248450830, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.69921875, "step": 11512, "time_per_iteration": 2.5951943397521973 }, { "auxiliary_loss_clip": 0.01135408, "auxiliary_loss_mlp": 0.01034716, "balance_loss_clip": 1.02175236, "balance_loss_mlp": 1.03590322, "epoch": 0.6921990079663309, "flos": 14757849976320.0, "grad_norm": 1.7541408172854291, "language_loss": 0.83098453, "learning_rate": 8.645163959201771e-07, "loss": 0.85268581, "num_input_tokens_seen": 248468585, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 11513, "time_per_iteration": 2.5091066360473633 }, { "auxiliary_loss_clip": 0.01150495, "auxiliary_loss_mlp": 0.01035735, "balance_loss_clip": 1.02385628, "balance_loss_mlp": 1.03653359, "epoch": 0.692259131218999, "flos": 23841848643840.0, "grad_norm": 1.41783883842568, "language_loss": 0.78090394, "learning_rate": 8.64205390317492e-07, "loss": 0.8027662, "num_input_tokens_seen": 248490535, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69921875, "step": 11514, "time_per_iteration": 2.6422603130340576 }, { "auxiliary_loss_clip": 0.01144589, "auxiliary_loss_mlp": 0.01035432, "balance_loss_clip": 1.0228374, "balance_loss_mlp": 1.03595698, "epoch": 0.6923192544716669, "flos": 19135073157120.0, "grad_norm": 2.416572853260162, "language_loss": 0.74549794, "learning_rate": 8.638944252483948e-07, "loss": 0.76729816, "num_input_tokens_seen": 248508575, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 11515, "time_per_iteration": 2.498450756072998 }, { "auxiliary_loss_clip": 0.01140829, "auxiliary_loss_mlp": 0.01033827, "balance_loss_clip": 1.02133965, "balance_loss_mlp": 1.03682697, "epoch": 0.6923793777243349, "flos": 28074639237120.0, "grad_norm": 2.021791362489911, "language_loss": 0.54096675, "learning_rate": 8.635835007239824e-07, "loss": 0.56271333, "num_input_tokens_seen": 248527025, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 11516, "time_per_iteration": 2.654163122177124 }, { "auxiliary_loss_clip": 0.01114061, "auxiliary_loss_mlp": 0.01033767, "balance_loss_clip": 1.02096939, "balance_loss_mlp": 1.03715551, "epoch": 0.6924395009770029, "flos": 16581250033920.0, "grad_norm": 2.028243661843294, "language_loss": 0.73083699, "learning_rate": 8.632726167553532e-07, "loss": 0.75231528, "num_input_tokens_seen": 248544275, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 11517, "time_per_iteration": 2.5143661499023438 }, { "auxiliary_loss_clip": 0.01110552, "auxiliary_loss_mlp": 0.01039966, "balance_loss_clip": 1.02608991, "balance_loss_mlp": 1.03749001, "epoch": 0.6924996242296708, "flos": 16655297921280.0, "grad_norm": 2.6373812551786378, "language_loss": 0.76203817, "learning_rate": 8.629617733535987e-07, "loss": 0.78354335, "num_input_tokens_seen": 248561870, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73046875, "step": 11518, "time_per_iteration": 2.5018503665924072 }, { "auxiliary_loss_clip": 0.01121028, "auxiliary_loss_mlp": 0.01036565, "balance_loss_clip": 1.02195597, "balance_loss_mlp": 1.03752708, "epoch": 0.6925597474823388, "flos": 34754167532160.0, "grad_norm": 1.3861163036535664, "language_loss": 0.6451124, "learning_rate": 8.626509705298146e-07, "loss": 0.66668832, "num_input_tokens_seen": 248588190, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75, "step": 11519, "time_per_iteration": 2.648400068283081 }, { "auxiliary_loss_clip": 0.01126756, "auxiliary_loss_mlp": 0.01035346, "balance_loss_clip": 1.02264404, "balance_loss_mlp": 1.03705347, "epoch": 0.6926198707350067, "flos": 21871717528320.0, "grad_norm": 1.879643323002809, "language_loss": 0.62438512, "learning_rate": 8.623402082950926e-07, "loss": 0.64600617, "num_input_tokens_seen": 248606460, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 11520, "time_per_iteration": 2.5343456268310547 }, { "auxiliary_loss_clip": 0.01142505, "auxiliary_loss_mlp": 0.01039134, "balance_loss_clip": 1.02425623, "balance_loss_mlp": 1.03788602, "epoch": 0.6926799939876748, "flos": 13006271162880.0, "grad_norm": 1.974053993499911, "language_loss": 0.77241886, "learning_rate": 8.620294866605204e-07, "loss": 0.79423523, "num_input_tokens_seen": 248623715, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 11521, "time_per_iteration": 2.5004923343658447 }, { "auxiliary_loss_clip": 0.01121294, "auxiliary_loss_mlp": 0.01033457, "balance_loss_clip": 1.0199213, "balance_loss_mlp": 1.03716326, "epoch": 0.6927401172403427, "flos": 16761234107520.0, "grad_norm": 1.7766874570875584, "language_loss": 0.82027626, "learning_rate": 8.617188056371894e-07, "loss": 0.84182382, "num_input_tokens_seen": 248640575, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 11522, "time_per_iteration": 3.9167776107788086 }, { "auxiliary_loss_clip": 0.01119708, "auxiliary_loss_mlp": 0.01033478, "balance_loss_clip": 1.0211277, "balance_loss_mlp": 1.03995323, "epoch": 0.6928002404930107, "flos": 25705648523520.0, "grad_norm": 1.3636179331494611, "language_loss": 0.76993591, "learning_rate": 8.614081652361855e-07, "loss": 0.79146779, "num_input_tokens_seen": 248663535, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 11523, "time_per_iteration": 2.565722703933716 }, { "auxiliary_loss_clip": 0.01132595, "auxiliary_loss_mlp": 0.01035935, "balance_loss_clip": 1.02424622, "balance_loss_mlp": 1.03634548, "epoch": 0.6928603637456786, "flos": 18588261438720.0, "grad_norm": 1.8364765242318244, "language_loss": 0.68263417, "learning_rate": 8.61097565468597e-07, "loss": 0.70431948, "num_input_tokens_seen": 248681125, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6953125, "step": 11524, "time_per_iteration": 2.506350040435791 }, { "auxiliary_loss_clip": 0.01136067, "auxiliary_loss_mlp": 0.01034703, "balance_loss_clip": 1.02119064, "balance_loss_mlp": 1.03679442, "epoch": 0.6929204869983466, "flos": 22200874784640.0, "grad_norm": 1.9546897107883723, "language_loss": 0.64329523, "learning_rate": 8.607870063455051e-07, "loss": 0.66500288, "num_input_tokens_seen": 248700555, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 11525, "time_per_iteration": 2.587184190750122 }, { "auxiliary_loss_clip": 0.01145867, "auxiliary_loss_mlp": 0.01039234, "balance_loss_clip": 1.02616835, "balance_loss_mlp": 1.03901994, "epoch": 0.6929806102510145, "flos": 17894754576000.0, "grad_norm": 2.001898359822949, "language_loss": 0.70351946, "learning_rate": 8.604764878779953e-07, "loss": 0.72537041, "num_input_tokens_seen": 248716095, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 11526, "time_per_iteration": 2.562670946121216 }, { "auxiliary_loss_clip": 0.01124106, "auxiliary_loss_mlp": 0.0103231, "balance_loss_clip": 1.02019882, "balance_loss_mlp": 1.03636897, "epoch": 0.6930407335036826, "flos": 19755178577280.0, "grad_norm": 1.8349294405007515, "language_loss": 0.76288903, "learning_rate": 8.601660100771486e-07, "loss": 0.78445327, "num_input_tokens_seen": 248735330, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 11527, "time_per_iteration": 2.5861244201660156 }, { "auxiliary_loss_clip": 0.01124636, "auxiliary_loss_mlp": 0.01029923, "balance_loss_clip": 1.01803184, "balance_loss_mlp": 1.03635335, "epoch": 0.6931008567563505, "flos": 21544248211200.0, "grad_norm": 1.4258156589152369, "language_loss": 0.79288065, "learning_rate": 8.598555729540449e-07, "loss": 0.81442618, "num_input_tokens_seen": 248754530, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 11528, "time_per_iteration": 3.9847426414489746 }, { "auxiliary_loss_clip": 0.01115016, "auxiliary_loss_mlp": 0.01035501, "balance_loss_clip": 1.02288902, "balance_loss_mlp": 1.03788328, "epoch": 0.6931609800090185, "flos": 26250018117120.0, "grad_norm": 1.3503898082922483, "language_loss": 0.75762063, "learning_rate": 8.595451765197624e-07, "loss": 0.77912581, "num_input_tokens_seen": 248775825, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 11529, "time_per_iteration": 2.5718884468078613 }, { "auxiliary_loss_clip": 0.01106368, "auxiliary_loss_mlp": 0.01280507, "balance_loss_clip": 1.02135015, "balance_loss_mlp": 1.03764772, "epoch": 0.6932211032616865, "flos": 32343376366080.0, "grad_norm": 1.9098613910073388, "language_loss": 0.72342658, "learning_rate": 8.592348207853795e-07, "loss": 0.74729538, "num_input_tokens_seen": 248796180, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 11530, "time_per_iteration": 2.5742781162261963 }, { "auxiliary_loss_clip": 0.01133582, "auxiliary_loss_mlp": 0.01035346, "balance_loss_clip": 1.02051663, "balance_loss_mlp": 1.04051805, "epoch": 0.6932812265143544, "flos": 22049079909120.0, "grad_norm": 1.6927138008951115, "language_loss": 0.78743064, "learning_rate": 8.589245057619714e-07, "loss": 0.80911994, "num_input_tokens_seen": 248814735, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.75390625, "step": 11531, "time_per_iteration": 2.5405771732330322 }, { "auxiliary_loss_clip": 0.01115156, "auxiliary_loss_mlp": 0.01031558, "balance_loss_clip": 1.01986969, "balance_loss_mlp": 1.03648639, "epoch": 0.6933413497670224, "flos": 26256626219520.0, "grad_norm": 1.3542312684036775, "language_loss": 0.69451398, "learning_rate": 8.586142314606126e-07, "loss": 0.71598113, "num_input_tokens_seen": 248839140, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6953125, "step": 11532, "time_per_iteration": 2.645061731338501 }, { "auxiliary_loss_clip": 0.01129224, "auxiliary_loss_mlp": 0.01027532, "balance_loss_clip": 1.01438308, "balance_loss_mlp": 1.03848255, "epoch": 0.6934014730196904, "flos": 19573003774080.0, "grad_norm": 1.6263752381504388, "language_loss": 0.66786671, "learning_rate": 8.583039978923751e-07, "loss": 0.68943423, "num_input_tokens_seen": 248858300, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 11533, "time_per_iteration": 2.56371808052063 }, { "auxiliary_loss_clip": 0.01129434, "auxiliary_loss_mlp": 0.01033609, "balance_loss_clip": 1.02013218, "balance_loss_mlp": 1.03875339, "epoch": 0.6934615962723584, "flos": 22119249127680.0, "grad_norm": 2.411874974700055, "language_loss": 0.58785182, "learning_rate": 8.579938050683326e-07, "loss": 0.60948229, "num_input_tokens_seen": 248876310, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 11534, "time_per_iteration": 4.39221978187561 }, { "auxiliary_loss_clip": 0.01133107, "auxiliary_loss_mlp": 0.010333, "balance_loss_clip": 1.0195787, "balance_loss_mlp": 1.03575563, "epoch": 0.6935217195250263, "flos": 21360816432000.0, "grad_norm": 1.8269899367866984, "language_loss": 0.71112156, "learning_rate": 8.57683652999553e-07, "loss": 0.73278558, "num_input_tokens_seen": 248895650, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 11535, "time_per_iteration": 2.6112284660339355 }, { "auxiliary_loss_clip": 0.01048477, "auxiliary_loss_mlp": 0.00999548, "balance_loss_clip": 0.99821913, "balance_loss_mlp": 1.00863218, "epoch": 0.6935818427776943, "flos": 64063813115520.0, "grad_norm": 0.7128189409700666, "language_loss": 0.59068292, "learning_rate": 8.573735416971046e-07, "loss": 0.61116314, "num_input_tokens_seen": 248963920, "router_z_loss_clip": 0.01330566, "router_z_loss_mlp": 0.21875, "step": 11536, "time_per_iteration": 3.3118138313293457 }, { "auxiliary_loss_clip": 0.01122055, "auxiliary_loss_mlp": 0.01281249, "balance_loss_clip": 1.02103722, "balance_loss_mlp": 1.03825068, "epoch": 0.6936419660303622, "flos": 20302564913280.0, "grad_norm": 1.934924544888596, "language_loss": 0.72724575, "learning_rate": 8.570634711720568e-07, "loss": 0.75127876, "num_input_tokens_seen": 248983380, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.74609375, "step": 11537, "time_per_iteration": 2.5735626220703125 }, { "auxiliary_loss_clip": 0.01039317, "auxiliary_loss_mlp": 0.0124763, "balance_loss_clip": 1.00035882, "balance_loss_mlp": 1.00837648, "epoch": 0.6937020892830302, "flos": 67182581347200.0, "grad_norm": 0.7487573176392698, "language_loss": 0.55542743, "learning_rate": 8.567534414354722e-07, "loss": 0.57829696, "num_input_tokens_seen": 249044680, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.21777344, "step": 11538, "time_per_iteration": 3.120731830596924 }, { "auxiliary_loss_clip": 0.01113283, "auxiliary_loss_mlp": 0.01031681, "balance_loss_clip": 1.01996899, "balance_loss_mlp": 1.03712726, "epoch": 0.6937622125356981, "flos": 23878190229120.0, "grad_norm": 1.4348959489603061, "language_loss": 0.77590156, "learning_rate": 8.564434524984172e-07, "loss": 0.79735124, "num_input_tokens_seen": 249061060, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 11539, "time_per_iteration": 2.6061651706695557 }, { "auxiliary_loss_clip": 0.01124446, "auxiliary_loss_mlp": 0.01031656, "balance_loss_clip": 1.01955068, "balance_loss_mlp": 1.03715968, "epoch": 0.6938223357883662, "flos": 28730619365760.0, "grad_norm": 1.8743758662649468, "language_loss": 0.63997185, "learning_rate": 8.561335043719531e-07, "loss": 0.66153282, "num_input_tokens_seen": 249081430, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 11540, "time_per_iteration": 2.6159634590148926 }, { "auxiliary_loss_clip": 0.01133893, "auxiliary_loss_mlp": 0.01031769, "balance_loss_clip": 1.01910341, "balance_loss_mlp": 1.03713512, "epoch": 0.6938824590410341, "flos": 28655027193600.0, "grad_norm": 1.3904199667051034, "language_loss": 0.86769938, "learning_rate": 8.558235970671434e-07, "loss": 0.88935602, "num_input_tokens_seen": 249103020, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 11541, "time_per_iteration": 2.675045967102051 }, { "auxiliary_loss_clip": 0.01109571, "auxiliary_loss_mlp": 0.01282853, "balance_loss_clip": 1.02334046, "balance_loss_mlp": 1.03773928, "epoch": 0.6939425822937021, "flos": 18983062800000.0, "grad_norm": 1.6891224526866333, "language_loss": 0.84451365, "learning_rate": 8.555137305950448e-07, "loss": 0.86843789, "num_input_tokens_seen": 249120810, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 11542, "time_per_iteration": 2.5174031257629395 }, { "auxiliary_loss_clip": 0.01117312, "auxiliary_loss_mlp": 0.01033252, "balance_loss_clip": 1.01929879, "balance_loss_mlp": 1.0364176, "epoch": 0.6940027055463701, "flos": 23075838178560.0, "grad_norm": 1.7735777574953295, "language_loss": 0.75292003, "learning_rate": 8.552039049667181e-07, "loss": 0.77442563, "num_input_tokens_seen": 249138050, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.72265625, "step": 11543, "time_per_iteration": 2.6265645027160645 }, { "auxiliary_loss_clip": 0.01124305, "auxiliary_loss_mlp": 0.01033021, "balance_loss_clip": 1.02065313, "balance_loss_mlp": 1.03648031, "epoch": 0.694062828799038, "flos": 18186564666240.0, "grad_norm": 1.5952672760247615, "language_loss": 0.75828785, "learning_rate": 8.548941201932191e-07, "loss": 0.77986109, "num_input_tokens_seen": 249155570, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 11544, "time_per_iteration": 2.509408712387085 }, { "auxiliary_loss_clip": 0.01146268, "auxiliary_loss_mlp": 0.01033831, "balance_loss_clip": 1.02131391, "balance_loss_mlp": 1.03766501, "epoch": 0.694122952051706, "flos": 17821532701440.0, "grad_norm": 2.1235851797706586, "language_loss": 0.70973498, "learning_rate": 8.545843762856033e-07, "loss": 0.73153603, "num_input_tokens_seen": 249172960, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 11545, "time_per_iteration": 2.5263569355010986 }, { "auxiliary_loss_clip": 0.01112772, "auxiliary_loss_mlp": 0.01026048, "balance_loss_clip": 1.01483059, "balance_loss_mlp": 1.03698087, "epoch": 0.694183075304374, "flos": 21215306436480.0, "grad_norm": 1.5533010767286972, "language_loss": 0.79464304, "learning_rate": 8.542746732549241e-07, "loss": 0.81603122, "num_input_tokens_seen": 249192450, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.66796875, "step": 11546, "time_per_iteration": 2.5148141384124756 }, { "auxiliary_loss_clip": 0.01134602, "auxiliary_loss_mlp": 0.01032992, "balance_loss_clip": 1.02056456, "balance_loss_mlp": 1.03740144, "epoch": 0.694243198557042, "flos": 24060508686720.0, "grad_norm": 1.3686963698501209, "language_loss": 0.78435248, "learning_rate": 8.539650111122363e-07, "loss": 0.80602837, "num_input_tokens_seen": 249214320, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 11547, "time_per_iteration": 2.5927131175994873 }, { "auxiliary_loss_clip": 0.0112629, "auxiliary_loss_mlp": 0.01286757, "balance_loss_clip": 1.02666473, "balance_loss_mlp": 1.03816581, "epoch": 0.6943033218097099, "flos": 21141869080320.0, "grad_norm": 2.105002998412319, "language_loss": 0.80778968, "learning_rate": 8.536553898685876e-07, "loss": 0.83192015, "num_input_tokens_seen": 249230925, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 11548, "time_per_iteration": 2.588773012161255 }, { "auxiliary_loss_clip": 0.01028925, "auxiliary_loss_mlp": 0.01248032, "balance_loss_clip": 1.00077236, "balance_loss_mlp": 1.00703847, "epoch": 0.6943634450623779, "flos": 57812015975040.0, "grad_norm": 0.690266888235356, "language_loss": 0.53775847, "learning_rate": 8.533458095350302e-07, "loss": 0.56052804, "num_input_tokens_seen": 249293975, "router_z_loss_clip": 0.01275635, "router_z_loss_mlp": 0.21875, "step": 11549, "time_per_iteration": 3.126960039138794 }, { "auxiliary_loss_clip": 0.01116455, "auxiliary_loss_mlp": 0.01033037, "balance_loss_clip": 1.02011478, "balance_loss_mlp": 1.03622878, "epoch": 0.6944235683150458, "flos": 30590684231040.0, "grad_norm": 1.6155853859694338, "language_loss": 0.73439938, "learning_rate": 8.530362701226111e-07, "loss": 0.7558943, "num_input_tokens_seen": 249315285, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11550, "time_per_iteration": 2.6102135181427 }, { "auxiliary_loss_clip": 0.01129797, "auxiliary_loss_mlp": 0.01037623, "balance_loss_clip": 1.0250051, "balance_loss_mlp": 1.03934515, "epoch": 0.6944836915677138, "flos": 19719447523200.0, "grad_norm": 1.9835661072508506, "language_loss": 0.73786902, "learning_rate": 8.527267716423774e-07, "loss": 0.75954324, "num_input_tokens_seen": 249333505, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 11551, "time_per_iteration": 2.5196375846862793 }, { "auxiliary_loss_clip": 0.01108321, "auxiliary_loss_mlp": 0.01034233, "balance_loss_clip": 1.02139413, "balance_loss_mlp": 1.03731835, "epoch": 0.6945438148203817, "flos": 24863579009280.0, "grad_norm": 1.5007977412039768, "language_loss": 0.84465939, "learning_rate": 8.524173141053739e-07, "loss": 0.86608493, "num_input_tokens_seen": 249354180, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11552, "time_per_iteration": 2.554957628250122 }, { "auxiliary_loss_clip": 0.01106912, "auxiliary_loss_mlp": 0.01037992, "balance_loss_clip": 1.02490234, "balance_loss_mlp": 1.03649461, "epoch": 0.6946039380730498, "flos": 33326646243840.0, "grad_norm": 1.48970798131636, "language_loss": 0.67541468, "learning_rate": 8.521078975226439e-07, "loss": 0.69686371, "num_input_tokens_seen": 249377035, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 11553, "time_per_iteration": 2.6402084827423096 }, { "auxiliary_loss_clip": 0.01130204, "auxiliary_loss_mlp": 0.01031018, "balance_loss_clip": 1.0174278, "balance_loss_mlp": 1.03806853, "epoch": 0.6946640613257177, "flos": 20850956830080.0, "grad_norm": 1.584545852759246, "language_loss": 0.79413462, "learning_rate": 8.517985219052317e-07, "loss": 0.8157469, "num_input_tokens_seen": 249396155, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.74609375, "step": 11554, "time_per_iteration": 2.555922746658325 }, { "auxiliary_loss_clip": 0.01116872, "auxiliary_loss_mlp": 0.01030631, "balance_loss_clip": 1.01786959, "balance_loss_mlp": 1.03526938, "epoch": 0.6947241845783857, "flos": 19354846521600.0, "grad_norm": 1.7357803642021004, "language_loss": 0.72281051, "learning_rate": 8.514891872641751e-07, "loss": 0.74428552, "num_input_tokens_seen": 249414555, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 11555, "time_per_iteration": 2.5532779693603516 }, { "auxiliary_loss_clip": 0.011339, "auxiliary_loss_mlp": 0.01032935, "balance_loss_clip": 1.02053142, "balance_loss_mlp": 1.03589845, "epoch": 0.6947843078310536, "flos": 27120240915840.0, "grad_norm": 1.9463375187870655, "language_loss": 0.78327072, "learning_rate": 8.511798936105162e-07, "loss": 0.80493909, "num_input_tokens_seen": 249433570, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 11556, "time_per_iteration": 2.5738563537597656 }, { "auxiliary_loss_clip": 0.01152196, "auxiliary_loss_mlp": 0.01037808, "balance_loss_clip": 1.02506495, "balance_loss_mlp": 1.03632116, "epoch": 0.6948444310837216, "flos": 28585109370240.0, "grad_norm": 2.3712186174413317, "language_loss": 0.6068809, "learning_rate": 8.508706409552908e-07, "loss": 0.62878096, "num_input_tokens_seen": 249453735, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 11557, "time_per_iteration": 2.6701457500457764 }, { "auxiliary_loss_clip": 0.01132832, "auxiliary_loss_mlp": 0.01038337, "balance_loss_clip": 1.02607656, "balance_loss_mlp": 1.03695345, "epoch": 0.6949045543363896, "flos": 15669262696320.0, "grad_norm": 1.6734135025238663, "language_loss": 0.85511214, "learning_rate": 8.505614293095378e-07, "loss": 0.87682384, "num_input_tokens_seen": 249470805, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69140625, "step": 11558, "time_per_iteration": 2.5774524211883545 }, { "auxiliary_loss_clip": 0.01118934, "auxiliary_loss_mlp": 0.01033829, "balance_loss_clip": 1.02050698, "balance_loss_mlp": 1.03736115, "epoch": 0.6949646775890576, "flos": 23259413612160.0, "grad_norm": 1.5454380925319495, "language_loss": 0.70599777, "learning_rate": 8.502522586842893e-07, "loss": 0.72752535, "num_input_tokens_seen": 249491150, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 11559, "time_per_iteration": 2.7005488872528076 }, { "auxiliary_loss_clip": 0.01133236, "auxiliary_loss_mlp": 0.01032476, "balance_loss_clip": 1.02037048, "balance_loss_mlp": 1.03693116, "epoch": 0.6950248008417256, "flos": 22382546797440.0, "grad_norm": 2.0905023350230403, "language_loss": 0.78665364, "learning_rate": 8.499431290905809e-07, "loss": 0.80831087, "num_input_tokens_seen": 249511560, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 11560, "time_per_iteration": 2.63751482963562 }, { "auxiliary_loss_clip": 0.01124496, "auxiliary_loss_mlp": 0.01033459, "balance_loss_clip": 1.0215323, "balance_loss_mlp": 1.03758693, "epoch": 0.6950849240943935, "flos": 23477355383040.0, "grad_norm": 1.627360055695505, "language_loss": 0.76673603, "learning_rate": 8.496340405394437e-07, "loss": 0.78831553, "num_input_tokens_seen": 249531910, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 11561, "time_per_iteration": 2.650009870529175 }, { "auxiliary_loss_clip": 0.01104356, "auxiliary_loss_mlp": 0.01033019, "balance_loss_clip": 1.01957798, "balance_loss_mlp": 1.03446066, "epoch": 0.6951450473470615, "flos": 17420554200960.0, "grad_norm": 1.9989919698715095, "language_loss": 0.78557408, "learning_rate": 8.493249930419089e-07, "loss": 0.80694783, "num_input_tokens_seen": 249550300, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 11562, "time_per_iteration": 2.670393228530884 }, { "auxiliary_loss_clip": 0.0111712, "auxiliary_loss_mlp": 0.010332, "balance_loss_clip": 1.02059317, "balance_loss_mlp": 1.03671145, "epoch": 0.6952051705997294, "flos": 20485745297280.0, "grad_norm": 2.3705778237195645, "language_loss": 0.69487733, "learning_rate": 8.490159866090043e-07, "loss": 0.71638054, "num_input_tokens_seen": 249567740, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 11563, "time_per_iteration": 4.151263475418091 }, { "auxiliary_loss_clip": 0.01112663, "auxiliary_loss_mlp": 0.01027301, "balance_loss_clip": 1.0161314, "balance_loss_mlp": 1.03606415, "epoch": 0.6952652938523974, "flos": 13989541040640.0, "grad_norm": 2.0699466885330833, "language_loss": 0.73235106, "learning_rate": 8.487070212517598e-07, "loss": 0.75375074, "num_input_tokens_seen": 249582700, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.671875, "step": 11564, "time_per_iteration": 2.6061410903930664 }, { "auxiliary_loss_clip": 0.01149214, "auxiliary_loss_mlp": 0.0103678, "balance_loss_clip": 1.02099669, "balance_loss_mlp": 1.04146814, "epoch": 0.6953254171050653, "flos": 30953956429440.0, "grad_norm": 1.9998287224264473, "language_loss": 0.71986169, "learning_rate": 8.483980969811994e-07, "loss": 0.74172163, "num_input_tokens_seen": 249602920, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8046875, "step": 11565, "time_per_iteration": 2.63208270072937 }, { "auxiliary_loss_clip": 0.0104544, "auxiliary_loss_mlp": 0.01003956, "balance_loss_clip": 1.00257289, "balance_loss_mlp": 1.00581491, "epoch": 0.6953855403577334, "flos": 61670257499520.0, "grad_norm": 0.8943740437715636, "language_loss": 0.58406538, "learning_rate": 8.480892138083482e-07, "loss": 0.60455936, "num_input_tokens_seen": 249660400, "router_z_loss_clip": 0.01385498, "router_z_loss_mlp": 0.21777344, "step": 11566, "time_per_iteration": 3.1313042640686035 }, { "auxiliary_loss_clip": 0.01136995, "auxiliary_loss_mlp": 0.01030898, "balance_loss_clip": 1.01732039, "balance_loss_mlp": 1.03566813, "epoch": 0.6954456636104013, "flos": 23039029716480.0, "grad_norm": 2.3656322973639607, "language_loss": 0.74477178, "learning_rate": 8.477803717442305e-07, "loss": 0.76645064, "num_input_tokens_seen": 249679335, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.74609375, "step": 11567, "time_per_iteration": 2.5602316856384277 }, { "auxiliary_loss_clip": 0.01122387, "auxiliary_loss_mlp": 0.0103343, "balance_loss_clip": 1.02097821, "balance_loss_mlp": 1.03535521, "epoch": 0.6955057868630693, "flos": 23918518224000.0, "grad_norm": 1.2473863103013711, "language_loss": 0.7681005, "learning_rate": 8.474715707998676e-07, "loss": 0.78965867, "num_input_tokens_seen": 249701805, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 11568, "time_per_iteration": 2.5837290287017822 }, { "auxiliary_loss_clip": 0.01117854, "auxiliary_loss_mlp": 0.01033405, "balance_loss_clip": 1.02159166, "balance_loss_mlp": 1.03887653, "epoch": 0.6955659101157372, "flos": 22594634651520.0, "grad_norm": 1.76746198775891, "language_loss": 0.72648716, "learning_rate": 8.471628109862794e-07, "loss": 0.74799979, "num_input_tokens_seen": 249720550, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 11569, "time_per_iteration": 2.508690118789673 }, { "auxiliary_loss_clip": 0.01126252, "auxiliary_loss_mlp": 0.01029947, "balance_loss_clip": 1.01708448, "balance_loss_mlp": 1.03638315, "epoch": 0.6956260333684052, "flos": 24572523104640.0, "grad_norm": 1.4473183080791336, "language_loss": 0.76900369, "learning_rate": 8.468540923144845e-07, "loss": 0.79056567, "num_input_tokens_seen": 249740325, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 11570, "time_per_iteration": 3.9445364475250244 }, { "auxiliary_loss_clip": 0.01117277, "auxiliary_loss_mlp": 0.0103208, "balance_loss_clip": 1.0187583, "balance_loss_mlp": 1.03653193, "epoch": 0.6956861566210732, "flos": 25846058787840.0, "grad_norm": 7.370745445184467, "language_loss": 0.74864763, "learning_rate": 8.465454147955023e-07, "loss": 0.77014124, "num_input_tokens_seen": 249760570, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 11571, "time_per_iteration": 2.56609845161438 }, { "auxiliary_loss_clip": 0.01110767, "auxiliary_loss_mlp": 0.01031555, "balance_loss_clip": 1.01919889, "balance_loss_mlp": 1.03490305, "epoch": 0.6957462798737412, "flos": 15301393557120.0, "grad_norm": 1.9066100390313292, "language_loss": 0.74522948, "learning_rate": 8.462367784403457e-07, "loss": 0.76665264, "num_input_tokens_seen": 249778290, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.66796875, "step": 11572, "time_per_iteration": 2.5725619792938232 }, { "auxiliary_loss_clip": 0.011234, "auxiliary_loss_mlp": 0.01028808, "balance_loss_clip": 1.01636875, "balance_loss_mlp": 1.0341146, "epoch": 0.6958064031264092, "flos": 36246830135040.0, "grad_norm": 2.0665659398371248, "language_loss": 0.69982934, "learning_rate": 8.459281832600314e-07, "loss": 0.72135139, "num_input_tokens_seen": 249800925, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 11573, "time_per_iteration": 2.744084596633911 }, { "auxiliary_loss_clip": 0.01130642, "auxiliary_loss_mlp": 0.01034448, "balance_loss_clip": 1.02095318, "balance_loss_mlp": 1.03878534, "epoch": 0.6958665263790771, "flos": 19208725994880.0, "grad_norm": 1.7038485931727714, "language_loss": 0.74608135, "learning_rate": 8.456196292655706e-07, "loss": 0.76773226, "num_input_tokens_seen": 249820500, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 11574, "time_per_iteration": 2.576648473739624 }, { "auxiliary_loss_clip": 0.01136478, "auxiliary_loss_mlp": 0.01032678, "balance_loss_clip": 1.02016687, "balance_loss_mlp": 1.03684807, "epoch": 0.6959266496317451, "flos": 21795838047360.0, "grad_norm": 1.6669807459714363, "language_loss": 0.74345636, "learning_rate": 8.453111164679776e-07, "loss": 0.76514792, "num_input_tokens_seen": 249839845, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 11575, "time_per_iteration": 5.625749826431274 }, { "auxiliary_loss_clip": 0.01117372, "auxiliary_loss_mlp": 0.01030173, "balance_loss_clip": 1.01678026, "balance_loss_mlp": 1.03610969, "epoch": 0.695986772884413, "flos": 20558248899840.0, "grad_norm": 3.189744810435978, "language_loss": 0.78172213, "learning_rate": 8.45002644878259e-07, "loss": 0.80319762, "num_input_tokens_seen": 249857400, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 11576, "time_per_iteration": 2.606302499771118 }, { "auxiliary_loss_clip": 0.01145513, "auxiliary_loss_mlp": 0.01031434, "balance_loss_clip": 1.01895833, "balance_loss_mlp": 1.03627014, "epoch": 0.696046896137081, "flos": 14936217937920.0, "grad_norm": 3.2123773652044942, "language_loss": 0.56739455, "learning_rate": 8.446942145074258e-07, "loss": 0.58916402, "num_input_tokens_seen": 249871645, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7421875, "step": 11577, "time_per_iteration": 2.557811975479126 }, { "auxiliary_loss_clip": 0.01156203, "auxiliary_loss_mlp": 0.01025287, "balance_loss_clip": 1.01464188, "balance_loss_mlp": 1.03538454, "epoch": 0.696107019389749, "flos": 30740216549760.0, "grad_norm": 1.331348848085455, "language_loss": 0.76568568, "learning_rate": 8.443858253664844e-07, "loss": 0.78750062, "num_input_tokens_seen": 249894215, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.66796875, "step": 11578, "time_per_iteration": 2.682413339614868 }, { "auxiliary_loss_clip": 0.0113498, "auxiliary_loss_mlp": 0.01034341, "balance_loss_clip": 1.02154374, "balance_loss_mlp": 1.03746951, "epoch": 0.696167142642417, "flos": 20776729374720.0, "grad_norm": 1.8029918531954503, "language_loss": 0.79710662, "learning_rate": 8.440774774664401e-07, "loss": 0.81879985, "num_input_tokens_seen": 249912850, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 11579, "time_per_iteration": 2.5891733169555664 }, { "auxiliary_loss_clip": 0.01130609, "auxiliary_loss_mlp": 0.01027423, "balance_loss_clip": 1.01459026, "balance_loss_mlp": 1.03429604, "epoch": 0.6962272658950849, "flos": 22565152563840.0, "grad_norm": 1.712073638392675, "language_loss": 0.7232424, "learning_rate": 8.437691708182975e-07, "loss": 0.74482274, "num_input_tokens_seen": 249932650, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 11580, "time_per_iteration": 2.573986053466797 }, { "auxiliary_loss_clip": 0.01127624, "auxiliary_loss_mlp": 0.01030481, "balance_loss_clip": 1.01704597, "balance_loss_mlp": 1.03721261, "epoch": 0.6962873891477529, "flos": 22200156512640.0, "grad_norm": 1.9094714116796054, "language_loss": 0.65783602, "learning_rate": 8.434609054330586e-07, "loss": 0.67941701, "num_input_tokens_seen": 249951205, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 11581, "time_per_iteration": 2.592005491256714 }, { "auxiliary_loss_clip": 0.01108483, "auxiliary_loss_mlp": 0.01030729, "balance_loss_clip": 1.0184983, "balance_loss_mlp": 1.03701639, "epoch": 0.6963475124004208, "flos": 12489695717760.0, "grad_norm": 1.9013764088877787, "language_loss": 0.76163715, "learning_rate": 8.431526813217254e-07, "loss": 0.7830292, "num_input_tokens_seen": 249967045, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.71484375, "step": 11582, "time_per_iteration": 2.5101821422576904 }, { "auxiliary_loss_clip": 0.01125177, "auxiliary_loss_mlp": 0.01029728, "balance_loss_clip": 1.01795053, "balance_loss_mlp": 1.03629208, "epoch": 0.6964076356530888, "flos": 17165085696000.0, "grad_norm": 1.8563349336756574, "language_loss": 0.69632578, "learning_rate": 8.428444984952962e-07, "loss": 0.71787488, "num_input_tokens_seen": 249984565, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.7109375, "step": 11583, "time_per_iteration": 2.554539680480957 }, { "auxiliary_loss_clip": 0.01135802, "auxiliary_loss_mlp": 0.01035323, "balance_loss_clip": 1.0221802, "balance_loss_mlp": 1.03765392, "epoch": 0.6964677589057569, "flos": 19937317466880.0, "grad_norm": 1.9473324681439859, "language_loss": 0.82361871, "learning_rate": 8.425363569647712e-07, "loss": 0.84533, "num_input_tokens_seen": 250004235, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 11584, "time_per_iteration": 2.5807323455810547 }, { "auxiliary_loss_clip": 0.01155708, "auxiliary_loss_mlp": 0.01034353, "balance_loss_clip": 1.02084696, "balance_loss_mlp": 1.03794491, "epoch": 0.6965278821584248, "flos": 22784064001920.0, "grad_norm": 2.3416358859729356, "language_loss": 0.79378796, "learning_rate": 8.422282567411463e-07, "loss": 0.81568855, "num_input_tokens_seen": 250017645, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 11585, "time_per_iteration": 2.576768398284912 }, { "auxiliary_loss_clip": 0.01104454, "auxiliary_loss_mlp": 0.0102784, "balance_loss_clip": 1.01639032, "balance_loss_mlp": 1.03636301, "epoch": 0.6965880054110928, "flos": 20047563285120.0, "grad_norm": 1.593473245386771, "language_loss": 0.77580988, "learning_rate": 8.419201978354167e-07, "loss": 0.79713285, "num_input_tokens_seen": 250037640, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 11586, "time_per_iteration": 2.5993564128875732 }, { "auxiliary_loss_clip": 0.01110089, "auxiliary_loss_mlp": 0.01032269, "balance_loss_clip": 1.02058089, "balance_loss_mlp": 1.03533649, "epoch": 0.6966481286637607, "flos": 21908238681600.0, "grad_norm": 1.586441630661087, "language_loss": 0.78160149, "learning_rate": 8.416121802585756e-07, "loss": 0.80302501, "num_input_tokens_seen": 250056490, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66015625, "step": 11587, "time_per_iteration": 2.573204517364502 }, { "auxiliary_loss_clip": 0.01121553, "auxiliary_loss_mlp": 0.01031878, "balance_loss_clip": 1.01908708, "balance_loss_mlp": 1.03626299, "epoch": 0.6967082519164287, "flos": 15633172506240.0, "grad_norm": 3.1012242357600255, "language_loss": 0.73028785, "learning_rate": 8.413042040216173e-07, "loss": 0.75182211, "num_input_tokens_seen": 250074285, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 11588, "time_per_iteration": 2.5963540077209473 }, { "auxiliary_loss_clip": 0.01122416, "auxiliary_loss_mlp": 0.01029176, "balance_loss_clip": 1.01746988, "balance_loss_mlp": 1.03472781, "epoch": 0.6967683751690966, "flos": 24024598064640.0, "grad_norm": 1.79018964440348, "language_loss": 0.75152582, "learning_rate": 8.409962691355303e-07, "loss": 0.77304173, "num_input_tokens_seen": 250093350, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69921875, "step": 11589, "time_per_iteration": 2.690282106399536 }, { "auxiliary_loss_clip": 0.01126077, "auxiliary_loss_mlp": 0.01031562, "balance_loss_clip": 1.01952827, "balance_loss_mlp": 1.03766978, "epoch": 0.6968284984217646, "flos": 31024700265600.0, "grad_norm": 1.7572435305421592, "language_loss": 0.63869506, "learning_rate": 8.406883756113059e-07, "loss": 0.66027141, "num_input_tokens_seen": 250114170, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 11590, "time_per_iteration": 2.686669111251831 }, { "auxiliary_loss_clip": 0.01062648, "auxiliary_loss_mlp": 0.01003775, "balance_loss_clip": 1.00251091, "balance_loss_mlp": 1.00534463, "epoch": 0.6968886216744326, "flos": 67622990002560.0, "grad_norm": 0.7501120571244518, "language_loss": 0.61283016, "learning_rate": 8.403805234599311e-07, "loss": 0.63349438, "num_input_tokens_seen": 250178250, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.21777344, "step": 11591, "time_per_iteration": 3.2634878158569336 }, { "auxiliary_loss_clip": 0.01154764, "auxiliary_loss_mlp": 0.0127868, "balance_loss_clip": 1.0182426, "balance_loss_mlp": 1.03900099, "epoch": 0.6969487449271006, "flos": 24863686750080.0, "grad_norm": 1.6398402503641285, "language_loss": 0.69307506, "learning_rate": 8.400727126923926e-07, "loss": 0.71740949, "num_input_tokens_seen": 250198420, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 11592, "time_per_iteration": 2.6883347034454346 }, { "auxiliary_loss_clip": 0.01136115, "auxiliary_loss_mlp": 0.01030617, "balance_loss_clip": 1.01895189, "balance_loss_mlp": 1.03981447, "epoch": 0.6970088681797685, "flos": 28767858791040.0, "grad_norm": 2.0754718145805233, "language_loss": 0.62645245, "learning_rate": 8.397649433196742e-07, "loss": 0.64811981, "num_input_tokens_seen": 250220650, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6953125, "step": 11593, "time_per_iteration": 2.6584744453430176 }, { "auxiliary_loss_clip": 0.01119979, "auxiliary_loss_mlp": 0.01026649, "balance_loss_clip": 1.01536572, "balance_loss_mlp": 1.03575993, "epoch": 0.6970689914324365, "flos": 27308556944640.0, "grad_norm": 1.6403215551579673, "language_loss": 0.54520309, "learning_rate": 8.394572153527617e-07, "loss": 0.56666934, "num_input_tokens_seen": 250241750, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.671875, "step": 11594, "time_per_iteration": 2.609941005706787 }, { "auxiliary_loss_clip": 0.01114422, "auxiliary_loss_mlp": 0.01026522, "balance_loss_clip": 1.01487482, "balance_loss_mlp": 1.03603148, "epoch": 0.6971291146851044, "flos": 19136258305920.0, "grad_norm": 1.7749377845257188, "language_loss": 0.76786429, "learning_rate": 8.391495288026365e-07, "loss": 0.78927374, "num_input_tokens_seen": 250259445, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 11595, "time_per_iteration": 2.521397829055786 }, { "auxiliary_loss_clip": 0.01129461, "auxiliary_loss_mlp": 0.01281669, "balance_loss_clip": 1.02184129, "balance_loss_mlp": 1.03844392, "epoch": 0.6971892379377724, "flos": 14610508387200.0, "grad_norm": 3.6369311577420347, "language_loss": 0.72123051, "learning_rate": 8.388418836802771e-07, "loss": 0.74534178, "num_input_tokens_seen": 250275640, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 11596, "time_per_iteration": 2.4954874515533447 }, { "auxiliary_loss_clip": 0.0110998, "auxiliary_loss_mlp": 0.01032994, "balance_loss_clip": 1.01964903, "balance_loss_mlp": 1.03926492, "epoch": 0.6972493611904405, "flos": 22307457415680.0, "grad_norm": 1.764570942261112, "language_loss": 0.76389658, "learning_rate": 8.385342799966646e-07, "loss": 0.7853263, "num_input_tokens_seen": 250296435, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 11597, "time_per_iteration": 2.5253992080688477 }, { "auxiliary_loss_clip": 0.01114746, "auxiliary_loss_mlp": 0.01031139, "balance_loss_clip": 1.01838386, "balance_loss_mlp": 1.03620815, "epoch": 0.6973094844431084, "flos": 17420374632960.0, "grad_norm": 1.8415299348536904, "language_loss": 0.75046384, "learning_rate": 8.382267177627762e-07, "loss": 0.77192271, "num_input_tokens_seen": 250314035, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 11598, "time_per_iteration": 2.498758554458618 }, { "auxiliary_loss_clip": 0.0112858, "auxiliary_loss_mlp": 0.01030753, "balance_loss_clip": 1.01733565, "balance_loss_mlp": 1.03755069, "epoch": 0.6973696076957764, "flos": 27235370983680.0, "grad_norm": 1.7330274804686154, "language_loss": 0.89394498, "learning_rate": 8.379191969895876e-07, "loss": 0.91553831, "num_input_tokens_seen": 250332995, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 11599, "time_per_iteration": 2.6383354663848877 }, { "auxiliary_loss_clip": 0.01123209, "auxiliary_loss_mlp": 0.01040193, "balance_loss_clip": 1.02623999, "balance_loss_mlp": 1.0393225, "epoch": 0.6974297309484443, "flos": 22018089450240.0, "grad_norm": 2.075189708073031, "language_loss": 0.69385779, "learning_rate": 8.37611717688073e-07, "loss": 0.71549177, "num_input_tokens_seen": 250352120, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 11600, "time_per_iteration": 2.5906200408935547 }, { "auxiliary_loss_clip": 0.01132303, "auxiliary_loss_mlp": 0.0103507, "balance_loss_clip": 1.02252364, "balance_loss_mlp": 1.03471041, "epoch": 0.6974898542011123, "flos": 28366449327360.0, "grad_norm": 1.670570635658903, "language_loss": 0.76976496, "learning_rate": 8.37304279869207e-07, "loss": 0.7914387, "num_input_tokens_seen": 250371705, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 11601, "time_per_iteration": 2.6251583099365234 }, { "auxiliary_loss_clip": 0.01109931, "auxiliary_loss_mlp": 0.01279019, "balance_loss_clip": 1.02053595, "balance_loss_mlp": 1.03380895, "epoch": 0.6975499774537802, "flos": 15232050351360.0, "grad_norm": 1.7263660751566463, "language_loss": 0.72118598, "learning_rate": 8.369968835439604e-07, "loss": 0.74507546, "num_input_tokens_seen": 250390485, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 11602, "time_per_iteration": 2.548170328140259 }, { "auxiliary_loss_clip": 0.01123913, "auxiliary_loss_mlp": 0.01284027, "balance_loss_clip": 1.0253427, "balance_loss_mlp": 1.03621376, "epoch": 0.6976101007064482, "flos": 22157422306560.0, "grad_norm": 1.9894666101430047, "language_loss": 0.76577699, "learning_rate": 8.366895287233033e-07, "loss": 0.78985637, "num_input_tokens_seen": 250407020, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 11603, "time_per_iteration": 2.660233736038208 }, { "auxiliary_loss_clip": 0.01141559, "auxiliary_loss_mlp": 0.01033682, "balance_loss_clip": 1.02091467, "balance_loss_mlp": 1.0365063, "epoch": 0.6976702239591162, "flos": 22273522041600.0, "grad_norm": 1.5685357976052736, "language_loss": 0.62056696, "learning_rate": 8.363822154182039e-07, "loss": 0.64231938, "num_input_tokens_seen": 250425880, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 11604, "time_per_iteration": 2.6325902938842773 }, { "auxiliary_loss_clip": 0.01117687, "auxiliary_loss_mlp": 0.01034005, "balance_loss_clip": 1.02023613, "balance_loss_mlp": 1.03648615, "epoch": 0.6977303472117842, "flos": 25848608653440.0, "grad_norm": 2.2100188134851346, "language_loss": 0.81519151, "learning_rate": 8.360749436396315e-07, "loss": 0.83670837, "num_input_tokens_seen": 250442925, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 11605, "time_per_iteration": 4.049005746841431 }, { "auxiliary_loss_clip": 0.01116014, "auxiliary_loss_mlp": 0.0103386, "balance_loss_clip": 1.02111697, "balance_loss_mlp": 1.03625536, "epoch": 0.6977904704644521, "flos": 20959586536320.0, "grad_norm": 1.710051066394821, "language_loss": 0.70516944, "learning_rate": 8.35767713398549e-07, "loss": 0.72666812, "num_input_tokens_seen": 250461220, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 11606, "time_per_iteration": 2.609341859817505 }, { "auxiliary_loss_clip": 0.01128543, "auxiliary_loss_mlp": 0.01030809, "balance_loss_clip": 1.01819634, "balance_loss_mlp": 1.03738761, "epoch": 0.6978505937171201, "flos": 22055041566720.0, "grad_norm": 1.921600042504683, "language_loss": 0.82170385, "learning_rate": 8.354605247059228e-07, "loss": 0.84329736, "num_input_tokens_seen": 250480975, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 11607, "time_per_iteration": 2.5914864540100098 }, { "auxiliary_loss_clip": 0.01128014, "auxiliary_loss_mlp": 0.0103274, "balance_loss_clip": 1.01924562, "balance_loss_mlp": 1.0362606, "epoch": 0.697910716969788, "flos": 20043720529920.0, "grad_norm": 1.9633681172848814, "language_loss": 0.78659737, "learning_rate": 8.351533775727147e-07, "loss": 0.80820489, "num_input_tokens_seen": 250497980, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 11608, "time_per_iteration": 2.5531253814697266 }, { "auxiliary_loss_clip": 0.01125733, "auxiliary_loss_mlp": 0.01035217, "balance_loss_clip": 1.02245009, "balance_loss_mlp": 1.03708196, "epoch": 0.697970840222456, "flos": 15888245961600.0, "grad_norm": 3.7000010789624898, "language_loss": 0.89807159, "learning_rate": 8.348462720098863e-07, "loss": 0.91968119, "num_input_tokens_seen": 250511910, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 11609, "time_per_iteration": 2.504011392593384 }, { "auxiliary_loss_clip": 0.01135154, "auxiliary_loss_mlp": 0.01032777, "balance_loss_clip": 1.01995027, "balance_loss_mlp": 1.0353713, "epoch": 0.698030963475124, "flos": 21215629658880.0, "grad_norm": 1.562281509382626, "language_loss": 0.63847363, "learning_rate": 8.345392080283972e-07, "loss": 0.66015291, "num_input_tokens_seen": 250531090, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 11610, "time_per_iteration": 2.570919990539551 }, { "auxiliary_loss_clip": 0.0113181, "auxiliary_loss_mlp": 0.01031314, "balance_loss_clip": 1.01986396, "balance_loss_mlp": 1.0354234, "epoch": 0.698091086727792, "flos": 33759728524800.0, "grad_norm": 1.8408884793726186, "language_loss": 0.84889174, "learning_rate": 8.342321856392054e-07, "loss": 0.87052298, "num_input_tokens_seen": 250551565, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.69921875, "step": 11611, "time_per_iteration": 4.0877039432525635 }, { "auxiliary_loss_clip": 0.0111734, "auxiliary_loss_mlp": 0.01036176, "balance_loss_clip": 1.02336752, "balance_loss_mlp": 1.03599441, "epoch": 0.69815120998046, "flos": 15887850912000.0, "grad_norm": 1.9094425939158346, "language_loss": 0.70873588, "learning_rate": 8.339252048532695e-07, "loss": 0.73027104, "num_input_tokens_seen": 250569625, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73046875, "step": 11612, "time_per_iteration": 2.4932029247283936 }, { "auxiliary_loss_clip": 0.0111536, "auxiliary_loss_mlp": 0.01032897, "balance_loss_clip": 1.02021945, "balance_loss_mlp": 1.03584504, "epoch": 0.6982113332331279, "flos": 18947044437120.0, "grad_norm": 1.5823360378798417, "language_loss": 0.80962288, "learning_rate": 8.33618265681542e-07, "loss": 0.83110541, "num_input_tokens_seen": 250586960, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11613, "time_per_iteration": 2.5897178649902344 }, { "auxiliary_loss_clip": 0.01114223, "auxiliary_loss_mlp": 0.01035395, "balance_loss_clip": 1.02386713, "balance_loss_mlp": 1.0365212, "epoch": 0.6982714564857959, "flos": 24389594115840.0, "grad_norm": 2.8129051904064597, "language_loss": 0.75425637, "learning_rate": 8.333113681349792e-07, "loss": 0.77575254, "num_input_tokens_seen": 250605080, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 11614, "time_per_iteration": 2.6722054481506348 }, { "auxiliary_loss_clip": 0.0115968, "auxiliary_loss_mlp": 0.01029486, "balance_loss_clip": 1.01677871, "balance_loss_mlp": 1.03644443, "epoch": 0.6983315797384638, "flos": 20083725302400.0, "grad_norm": 2.359073986236642, "language_loss": 0.77395582, "learning_rate": 8.330045122245326e-07, "loss": 0.79584754, "num_input_tokens_seen": 250623965, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11615, "time_per_iteration": 2.579179048538208 }, { "auxiliary_loss_clip": 0.01121251, "auxiliary_loss_mlp": 0.01032328, "balance_loss_clip": 1.02031803, "balance_loss_mlp": 1.03574252, "epoch": 0.6983917029911318, "flos": 13512431664000.0, "grad_norm": 1.9985327396118064, "language_loss": 0.72580636, "learning_rate": 8.326976979611528e-07, "loss": 0.74734217, "num_input_tokens_seen": 250640675, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.67578125, "step": 11616, "time_per_iteration": 2.533087730407715 }, { "auxiliary_loss_clip": 0.01107058, "auxiliary_loss_mlp": 0.0103649, "balance_loss_clip": 1.02402687, "balance_loss_mlp": 1.03800583, "epoch": 0.6984518262437998, "flos": 22018412672640.0, "grad_norm": 1.6364121241278204, "language_loss": 0.84126586, "learning_rate": 8.323909253557891e-07, "loss": 0.8627013, "num_input_tokens_seen": 250660295, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 11617, "time_per_iteration": 5.657101392745972 }, { "auxiliary_loss_clip": 0.01137516, "auxiliary_loss_mlp": 0.01039523, "balance_loss_clip": 1.02647555, "balance_loss_mlp": 1.03760409, "epoch": 0.6985119494964678, "flos": 18770615809920.0, "grad_norm": 2.9794474265205206, "language_loss": 0.58920056, "learning_rate": 8.320841944193904e-07, "loss": 0.61097091, "num_input_tokens_seen": 250678155, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 11618, "time_per_iteration": 2.555457592010498 }, { "auxiliary_loss_clip": 0.01125509, "auxiliary_loss_mlp": 0.01036085, "balance_loss_clip": 1.0241164, "balance_loss_mlp": 1.0373112, "epoch": 0.6985720727491357, "flos": 22382834106240.0, "grad_norm": 1.9519458382051567, "language_loss": 0.83228648, "learning_rate": 8.317775051629026e-07, "loss": 0.8539024, "num_input_tokens_seen": 250697230, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.703125, "step": 11619, "time_per_iteration": 2.632988691329956 }, { "auxiliary_loss_clip": 0.01110702, "auxiliary_loss_mlp": 0.0103012, "balance_loss_clip": 1.01920652, "balance_loss_mlp": 1.03423834, "epoch": 0.6986321960018037, "flos": 39567884785920.0, "grad_norm": 1.5753845526326375, "language_loss": 0.67325389, "learning_rate": 8.314708575972706e-07, "loss": 0.69466209, "num_input_tokens_seen": 250719865, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.67578125, "step": 11620, "time_per_iteration": 2.720465898513794 }, { "auxiliary_loss_clip": 0.01150895, "auxiliary_loss_mlp": 0.01034871, "balance_loss_clip": 1.02216315, "balance_loss_mlp": 1.03506756, "epoch": 0.6986923192544716, "flos": 17967725055360.0, "grad_norm": 2.1767847112236565, "language_loss": 0.72027016, "learning_rate": 8.311642517334371e-07, "loss": 0.74212778, "num_input_tokens_seen": 250736565, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 11621, "time_per_iteration": 2.5515224933624268 }, { "auxiliary_loss_clip": 0.01147695, "auxiliary_loss_mlp": 0.01036611, "balance_loss_clip": 1.02389753, "balance_loss_mlp": 1.03785944, "epoch": 0.6987524425071396, "flos": 25594325297280.0, "grad_norm": 1.6885026237726104, "language_loss": 0.68041539, "learning_rate": 8.308576875823463e-07, "loss": 0.70225847, "num_input_tokens_seen": 250757235, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73828125, "step": 11622, "time_per_iteration": 2.631762981414795 }, { "auxiliary_loss_clip": 0.01146434, "auxiliary_loss_mlp": 0.01029441, "balance_loss_clip": 1.01755643, "balance_loss_mlp": 1.03397512, "epoch": 0.6988125657598077, "flos": 17530081747200.0, "grad_norm": 2.136739662903997, "language_loss": 0.62601125, "learning_rate": 8.305511651549359e-07, "loss": 0.64777005, "num_input_tokens_seen": 250775585, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 11623, "time_per_iteration": 2.5257389545440674 }, { "auxiliary_loss_clip": 0.01110575, "auxiliary_loss_mlp": 0.01030148, "balance_loss_clip": 1.01695168, "balance_loss_mlp": 1.03560042, "epoch": 0.6988726890124756, "flos": 39165721136640.0, "grad_norm": 1.8275446443124148, "language_loss": 0.60723984, "learning_rate": 8.302446844621469e-07, "loss": 0.62864703, "num_input_tokens_seen": 250795725, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75, "step": 11624, "time_per_iteration": 2.677361011505127 }, { "auxiliary_loss_clip": 0.01115636, "auxiliary_loss_mlp": 0.01038078, "balance_loss_clip": 1.02458382, "balance_loss_mlp": 1.03363812, "epoch": 0.6989328122651436, "flos": 20193468330240.0, "grad_norm": 1.7573330205732696, "language_loss": 0.78362209, "learning_rate": 8.299382455149169e-07, "loss": 0.80515921, "num_input_tokens_seen": 250814555, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 11625, "time_per_iteration": 2.5139362812042236 }, { "auxiliary_loss_clip": 0.01130846, "auxiliary_loss_mlp": 0.01031187, "balance_loss_clip": 1.01940322, "balance_loss_mlp": 1.03667855, "epoch": 0.6989929355178115, "flos": 21834873152640.0, "grad_norm": 1.908286960362306, "language_loss": 0.65459442, "learning_rate": 8.296318483241797e-07, "loss": 0.67621481, "num_input_tokens_seen": 250833105, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 11626, "time_per_iteration": 2.589341163635254 }, { "auxiliary_loss_clip": 0.01127379, "auxiliary_loss_mlp": 0.01285468, "balance_loss_clip": 1.02493024, "balance_loss_mlp": 1.03757846, "epoch": 0.6990530587704795, "flos": 26322880855680.0, "grad_norm": 1.8089663949474568, "language_loss": 0.70148456, "learning_rate": 8.293254929008719e-07, "loss": 0.72561306, "num_input_tokens_seen": 250852570, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 11627, "time_per_iteration": 2.5613021850585938 }, { "auxiliary_loss_clip": 0.01104723, "auxiliary_loss_mlp": 0.01028036, "balance_loss_clip": 1.01703882, "balance_loss_mlp": 1.03734732, "epoch": 0.6991131820231474, "flos": 19828975069440.0, "grad_norm": 1.6708358857643275, "language_loss": 0.62665975, "learning_rate": 8.290191792559253e-07, "loss": 0.64798737, "num_input_tokens_seen": 250870500, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.671875, "step": 11628, "time_per_iteration": 2.5676679611206055 }, { "auxiliary_loss_clip": 0.01117486, "auxiliary_loss_mlp": 0.01034725, "balance_loss_clip": 1.02171886, "balance_loss_mlp": 1.03615785, "epoch": 0.6991733052758154, "flos": 33984817102080.0, "grad_norm": 2.0379174855270508, "language_loss": 0.68259734, "learning_rate": 8.287129074002735e-07, "loss": 0.70411944, "num_input_tokens_seen": 250892745, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 11629, "time_per_iteration": 2.5953617095947266 }, { "auxiliary_loss_clip": 0.01107468, "auxiliary_loss_mlp": 0.01035735, "balance_loss_clip": 1.02309847, "balance_loss_mlp": 1.03748107, "epoch": 0.6992334285284834, "flos": 15633136592640.0, "grad_norm": 1.7531092601825522, "language_loss": 0.72900832, "learning_rate": 8.284066773448437e-07, "loss": 0.75044036, "num_input_tokens_seen": 250910225, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 11630, "time_per_iteration": 2.5056991577148438 }, { "auxiliary_loss_clip": 0.0110689, "auxiliary_loss_mlp": 0.01034236, "balance_loss_clip": 1.02224326, "balance_loss_mlp": 1.03790808, "epoch": 0.6992935517811514, "flos": 21726279360000.0, "grad_norm": 1.6405134499702894, "language_loss": 0.74089885, "learning_rate": 8.281004891005666e-07, "loss": 0.76231009, "num_input_tokens_seen": 250929715, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 11631, "time_per_iteration": 2.4976401329040527 }, { "auxiliary_loss_clip": 0.01121374, "auxiliary_loss_mlp": 0.01036376, "balance_loss_clip": 1.02292967, "balance_loss_mlp": 1.03936553, "epoch": 0.6993536750338193, "flos": 20115254465280.0, "grad_norm": 2.11399093362421, "language_loss": 0.8939653, "learning_rate": 8.277943426783684e-07, "loss": 0.91554284, "num_input_tokens_seen": 250944230, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 11632, "time_per_iteration": 2.5537655353546143 }, { "auxiliary_loss_clip": 0.01128619, "auxiliary_loss_mlp": 0.01037097, "balance_loss_clip": 1.02438354, "balance_loss_mlp": 1.03849316, "epoch": 0.6994137982864873, "flos": 22010547594240.0, "grad_norm": 6.136441230659433, "language_loss": 0.80090421, "learning_rate": 8.274882380891752e-07, "loss": 0.82256138, "num_input_tokens_seen": 250961865, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 11633, "time_per_iteration": 2.5821218490600586 }, { "auxiliary_loss_clip": 0.01111634, "auxiliary_loss_mlp": 0.01032527, "balance_loss_clip": 1.01898444, "balance_loss_mlp": 1.03796077, "epoch": 0.6994739215391552, "flos": 25519020433920.0, "grad_norm": 1.9179702508597225, "language_loss": 0.67503792, "learning_rate": 8.271821753439097e-07, "loss": 0.69647956, "num_input_tokens_seen": 250982025, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73828125, "step": 11634, "time_per_iteration": 2.6453089714050293 }, { "auxiliary_loss_clip": 0.01054161, "auxiliary_loss_mlp": 0.01001216, "balance_loss_clip": 0.99989903, "balance_loss_mlp": 1.00588834, "epoch": 0.6995340447918232, "flos": 59128357691520.0, "grad_norm": 0.6845694627491541, "language_loss": 0.53200519, "learning_rate": 8.26876154453497e-07, "loss": 0.55255902, "num_input_tokens_seen": 251046900, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.21679688, "step": 11635, "time_per_iteration": 3.2944514751434326 }, { "auxiliary_loss_clip": 0.01125712, "auxiliary_loss_mlp": 0.01033051, "balance_loss_clip": 1.02017105, "balance_loss_mlp": 1.03746343, "epoch": 0.6995941680444913, "flos": 17967832796160.0, "grad_norm": 1.5096097923126108, "language_loss": 0.8225143, "learning_rate": 8.265701754288554e-07, "loss": 0.84410191, "num_input_tokens_seen": 251065050, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 11636, "time_per_iteration": 2.6027908325195312 }, { "auxiliary_loss_clip": 0.01107915, "auxiliary_loss_mlp": 0.01029284, "balance_loss_clip": 1.01707113, "balance_loss_mlp": 1.03644586, "epoch": 0.6996542912971592, "flos": 21980095839360.0, "grad_norm": 2.0264975429296403, "language_loss": 0.83263791, "learning_rate": 8.262642382809064e-07, "loss": 0.85400987, "num_input_tokens_seen": 251083355, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71484375, "step": 11637, "time_per_iteration": 2.522042751312256 }, { "auxiliary_loss_clip": 0.01140973, "auxiliary_loss_mlp": 0.01034751, "balance_loss_clip": 1.02253246, "balance_loss_mlp": 1.03651178, "epoch": 0.6997144145498272, "flos": 11686158518400.0, "grad_norm": 2.1706294629420695, "language_loss": 0.68244565, "learning_rate": 8.259583430205668e-07, "loss": 0.70420289, "num_input_tokens_seen": 251096420, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 11638, "time_per_iteration": 2.5454397201538086 }, { "auxiliary_loss_clip": 0.01120696, "auxiliary_loss_mlp": 0.01032033, "balance_loss_clip": 1.01858056, "balance_loss_mlp": 1.0375253, "epoch": 0.6997745378024951, "flos": 29607162958080.0, "grad_norm": 1.9842720299784502, "language_loss": 0.77930665, "learning_rate": 8.256524896587555e-07, "loss": 0.80083394, "num_input_tokens_seen": 251115410, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 11639, "time_per_iteration": 2.577955722808838 }, { "auxiliary_loss_clip": 0.01128643, "auxiliary_loss_mlp": 0.01041962, "balance_loss_clip": 1.02876592, "balance_loss_mlp": 1.03882122, "epoch": 0.6998346610551631, "flos": 20886616056960.0, "grad_norm": 2.1448751699931314, "language_loss": 0.82173133, "learning_rate": 8.253466782063854e-07, "loss": 0.84343737, "num_input_tokens_seen": 251133530, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 11640, "time_per_iteration": 2.5866265296936035 }, { "auxiliary_loss_clip": 0.01131948, "auxiliary_loss_mlp": 0.01029617, "balance_loss_clip": 1.01747549, "balance_loss_mlp": 1.03473473, "epoch": 0.699894784307831, "flos": 27163046949120.0, "grad_norm": 1.7836889723681222, "language_loss": 0.75369638, "learning_rate": 8.250409086743699e-07, "loss": 0.77531207, "num_input_tokens_seen": 251153985, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.70703125, "step": 11641, "time_per_iteration": 2.5915110111236572 }, { "auxiliary_loss_clip": 0.01125862, "auxiliary_loss_mlp": 0.01024261, "balance_loss_clip": 1.01091576, "balance_loss_mlp": 1.03702617, "epoch": 0.699954907560499, "flos": 20923640000640.0, "grad_norm": 2.1649125623060574, "language_loss": 0.78052783, "learning_rate": 8.247351810736234e-07, "loss": 0.80202901, "num_input_tokens_seen": 251173225, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 11642, "time_per_iteration": 2.524789333343506 }, { "auxiliary_loss_clip": 0.01123126, "auxiliary_loss_mlp": 0.0103668, "balance_loss_clip": 1.02200508, "balance_loss_mlp": 1.03927505, "epoch": 0.700015030813167, "flos": 28657792540800.0, "grad_norm": 2.2827500884903746, "language_loss": 0.74648249, "learning_rate": 8.244294954150539e-07, "loss": 0.76808059, "num_input_tokens_seen": 251192485, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75, "step": 11643, "time_per_iteration": 2.5776779651641846 }, { "auxiliary_loss_clip": 0.0113571, "auxiliary_loss_mlp": 0.01028548, "balance_loss_clip": 1.01572669, "balance_loss_mlp": 1.03696823, "epoch": 0.700075154065835, "flos": 29205286617600.0, "grad_norm": 1.5125116184831842, "language_loss": 0.60104179, "learning_rate": 8.241238517095723e-07, "loss": 0.62268442, "num_input_tokens_seen": 251214965, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 11644, "time_per_iteration": 2.6109492778778076 }, { "auxiliary_loss_clip": 0.01113904, "auxiliary_loss_mlp": 0.01028402, "balance_loss_clip": 1.01649308, "balance_loss_mlp": 1.03638327, "epoch": 0.7001352773185029, "flos": 23112431159040.0, "grad_norm": 1.6151926348227523, "language_loss": 0.81821156, "learning_rate": 8.238182499680853e-07, "loss": 0.83963466, "num_input_tokens_seen": 251234500, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 11645, "time_per_iteration": 2.5567710399627686 }, { "auxiliary_loss_clip": 0.01114638, "auxiliary_loss_mlp": 0.01032426, "balance_loss_clip": 1.02011776, "balance_loss_mlp": 1.0362761, "epoch": 0.7001954005711709, "flos": 21322858734720.0, "grad_norm": 2.1898003020018795, "language_loss": 0.68306017, "learning_rate": 8.235126902015006e-07, "loss": 0.70453084, "num_input_tokens_seen": 251254360, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 11646, "time_per_iteration": 2.4956746101379395 }, { "auxiliary_loss_clip": 0.01115809, "auxiliary_loss_mlp": 0.01032882, "balance_loss_clip": 1.02068734, "balance_loss_mlp": 1.03694034, "epoch": 0.7002555238238388, "flos": 24535822383360.0, "grad_norm": 1.536230869757328, "language_loss": 0.70866644, "learning_rate": 8.232071724207204e-07, "loss": 0.73015338, "num_input_tokens_seen": 251274790, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 11647, "time_per_iteration": 3.920212507247925 }, { "auxiliary_loss_clip": 0.01119122, "auxiliary_loss_mlp": 0.01034178, "balance_loss_clip": 1.02096939, "balance_loss_mlp": 1.03822947, "epoch": 0.7003156470765068, "flos": 39056552726400.0, "grad_norm": 1.6741873335014301, "language_loss": 0.71227157, "learning_rate": 8.229016966366498e-07, "loss": 0.73380452, "num_input_tokens_seen": 251296275, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 11648, "time_per_iteration": 2.7126686573028564 }, { "auxiliary_loss_clip": 0.01140391, "auxiliary_loss_mlp": 0.01031636, "balance_loss_clip": 1.01908326, "balance_loss_mlp": 1.03493989, "epoch": 0.7003757703291749, "flos": 28804092635520.0, "grad_norm": 1.7635368555585247, "language_loss": 0.7730602, "learning_rate": 8.225962628601897e-07, "loss": 0.79478049, "num_input_tokens_seen": 251317375, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 11649, "time_per_iteration": 2.6526601314544678 }, { "auxiliary_loss_clip": 0.01123357, "auxiliary_loss_mlp": 0.01035128, "balance_loss_clip": 1.02287292, "balance_loss_mlp": 1.03595638, "epoch": 0.7004358935818428, "flos": 15953854152960.0, "grad_norm": 1.6183828777046227, "language_loss": 0.78589344, "learning_rate": 8.222908711022404e-07, "loss": 0.80747831, "num_input_tokens_seen": 251333570, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69921875, "step": 11650, "time_per_iteration": 2.503960132598877 }, { "auxiliary_loss_clip": 0.01122798, "auxiliary_loss_mlp": 0.01029531, "balance_loss_clip": 1.01747942, "balance_loss_mlp": 1.03475952, "epoch": 0.7004960168345108, "flos": 20411984718720.0, "grad_norm": 1.8968055959033971, "language_loss": 0.77972841, "learning_rate": 8.219855213736999e-07, "loss": 0.80125165, "num_input_tokens_seen": 251351070, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.70703125, "step": 11651, "time_per_iteration": 2.5528814792633057 }, { "auxiliary_loss_clip": 0.01137659, "auxiliary_loss_mlp": 0.0103268, "balance_loss_clip": 1.02064538, "balance_loss_mlp": 1.03851593, "epoch": 0.7005561400871787, "flos": 17347547808000.0, "grad_norm": 1.5306695650429587, "language_loss": 0.69561481, "learning_rate": 8.216802136854673e-07, "loss": 0.71731824, "num_input_tokens_seen": 251370005, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.72265625, "step": 11652, "time_per_iteration": 2.5562450885772705 }, { "auxiliary_loss_clip": 0.01115852, "auxiliary_loss_mlp": 0.01030468, "balance_loss_clip": 1.01800466, "balance_loss_mlp": 1.03729415, "epoch": 0.7006162633398467, "flos": 25302120157440.0, "grad_norm": 1.336175741864956, "language_loss": 0.74380064, "learning_rate": 8.213749480484353e-07, "loss": 0.7652638, "num_input_tokens_seen": 251391210, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69921875, "step": 11653, "time_per_iteration": 3.907386541366577 }, { "auxiliary_loss_clip": 0.01125367, "auxiliary_loss_mlp": 0.01032943, "balance_loss_clip": 1.01932395, "balance_loss_mlp": 1.03610384, "epoch": 0.7006763865925146, "flos": 20668997508480.0, "grad_norm": 2.223313749431819, "language_loss": 0.70886993, "learning_rate": 8.210697244735006e-07, "loss": 0.73045301, "num_input_tokens_seen": 251411505, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 11654, "time_per_iteration": 2.590374708175659 }, { "auxiliary_loss_clip": 0.01144618, "auxiliary_loss_mlp": 0.01032997, "balance_loss_clip": 1.01924622, "balance_loss_mlp": 1.03652585, "epoch": 0.7007365098451827, "flos": 20046449963520.0, "grad_norm": 2.4050495020947023, "language_loss": 0.73415601, "learning_rate": 8.207645429715546e-07, "loss": 0.75593215, "num_input_tokens_seen": 251428975, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 11655, "time_per_iteration": 2.5710926055908203 }, { "auxiliary_loss_clip": 0.01126183, "auxiliary_loss_mlp": 0.0103955, "balance_loss_clip": 1.02514327, "balance_loss_mlp": 1.04021657, "epoch": 0.7007966330978506, "flos": 20777375819520.0, "grad_norm": 2.1265935608034363, "language_loss": 0.70298374, "learning_rate": 8.204594035534888e-07, "loss": 0.72464108, "num_input_tokens_seen": 251446940, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 11656, "time_per_iteration": 2.5959601402282715 }, { "auxiliary_loss_clip": 0.01139653, "auxiliary_loss_mlp": 0.01033284, "balance_loss_clip": 1.02188766, "balance_loss_mlp": 1.0348177, "epoch": 0.7008567563505186, "flos": 29638189330560.0, "grad_norm": 1.5984856435872963, "language_loss": 0.77816057, "learning_rate": 8.201543062301928e-07, "loss": 0.79988986, "num_input_tokens_seen": 251466205, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6953125, "step": 11657, "time_per_iteration": 2.602409601211548 }, { "auxiliary_loss_clip": 0.01112382, "auxiliary_loss_mlp": 0.01037419, "balance_loss_clip": 1.02317381, "balance_loss_mlp": 1.03783846, "epoch": 0.7009168796031865, "flos": 17092007475840.0, "grad_norm": 1.8769817297249582, "language_loss": 0.7750026, "learning_rate": 8.198492510125541e-07, "loss": 0.79650062, "num_input_tokens_seen": 251484820, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7421875, "step": 11658, "time_per_iteration": 3.9283523559570312 }, { "auxiliary_loss_clip": 0.01121793, "auxiliary_loss_mlp": 0.01027643, "balance_loss_clip": 1.0149591, "balance_loss_mlp": 1.03523254, "epoch": 0.7009770028558545, "flos": 20448972748800.0, "grad_norm": 2.005138965284253, "language_loss": 0.82531989, "learning_rate": 8.19544237911461e-07, "loss": 0.84681422, "num_input_tokens_seen": 251502670, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 11659, "time_per_iteration": 4.152260780334473 }, { "auxiliary_loss_clip": 0.01132499, "auxiliary_loss_mlp": 0.01030006, "balance_loss_clip": 1.01813817, "balance_loss_mlp": 1.03609085, "epoch": 0.7010371261085224, "flos": 19245139407360.0, "grad_norm": 1.8762464253388242, "language_loss": 0.69161463, "learning_rate": 8.192392669377963e-07, "loss": 0.71323967, "num_input_tokens_seen": 251521630, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 11660, "time_per_iteration": 2.52817702293396 }, { "auxiliary_loss_clip": 0.01108993, "auxiliary_loss_mlp": 0.01036529, "balance_loss_clip": 1.02246261, "balance_loss_mlp": 1.03609157, "epoch": 0.7010972493611904, "flos": 22127581082880.0, "grad_norm": 1.7734102581868192, "language_loss": 0.80681741, "learning_rate": 8.189343381024456e-07, "loss": 0.82827264, "num_input_tokens_seen": 251540105, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 11661, "time_per_iteration": 2.517400026321411 }, { "auxiliary_loss_clip": 0.01113368, "auxiliary_loss_mlp": 0.01036964, "balance_loss_clip": 1.02458966, "balance_loss_mlp": 1.03335667, "epoch": 0.7011573726138585, "flos": 31391132860800.0, "grad_norm": 1.4806120139428154, "language_loss": 0.78679955, "learning_rate": 8.186294514162897e-07, "loss": 0.80830282, "num_input_tokens_seen": 251560530, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 11662, "time_per_iteration": 2.587806224822998 }, { "auxiliary_loss_clip": 0.01117999, "auxiliary_loss_mlp": 0.01025914, "balance_loss_clip": 1.01302791, "balance_loss_mlp": 1.03630066, "epoch": 0.7012174958665264, "flos": 18150582216960.0, "grad_norm": 1.8988565585183312, "language_loss": 0.83496535, "learning_rate": 8.183246068902113e-07, "loss": 0.85640442, "num_input_tokens_seen": 251577930, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 11663, "time_per_iteration": 2.5169169902801514 }, { "auxiliary_loss_clip": 0.01034217, "auxiliary_loss_mlp": 0.00999822, "balance_loss_clip": 0.99845076, "balance_loss_mlp": 1.00406921, "epoch": 0.7012776191191944, "flos": 60651256567680.0, "grad_norm": 0.8655249905075323, "language_loss": 0.53772819, "learning_rate": 8.180198045350864e-07, "loss": 0.55806857, "num_input_tokens_seen": 251638820, "router_z_loss_clip": 0.01373291, "router_z_loss_mlp": 0.21484375, "step": 11664, "time_per_iteration": 3.1054210662841797 }, { "auxiliary_loss_clip": 0.01128666, "auxiliary_loss_mlp": 0.01033224, "balance_loss_clip": 1.02015877, "balance_loss_mlp": 1.03755295, "epoch": 0.7013377423718623, "flos": 27198598435200.0, "grad_norm": 9.25266792435823, "language_loss": 0.7851553, "learning_rate": 8.17715044361795e-07, "loss": 0.80677414, "num_input_tokens_seen": 251658070, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 11665, "time_per_iteration": 2.543694257736206 }, { "auxiliary_loss_clip": 0.01114827, "auxiliary_loss_mlp": 0.01030011, "balance_loss_clip": 1.01773834, "balance_loss_mlp": 1.03536153, "epoch": 0.7013978656245303, "flos": 16543543731840.0, "grad_norm": 2.3282388428701934, "language_loss": 0.77266818, "learning_rate": 8.174103263812124e-07, "loss": 0.79411662, "num_input_tokens_seen": 251671575, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 11666, "time_per_iteration": 2.5338621139526367 }, { "auxiliary_loss_clip": 0.01114915, "auxiliary_loss_mlp": 0.01032292, "balance_loss_clip": 1.01985252, "balance_loss_mlp": 1.03642797, "epoch": 0.7014579888771982, "flos": 23143780753920.0, "grad_norm": 1.9056494350765596, "language_loss": 0.80965257, "learning_rate": 8.171056506042135e-07, "loss": 0.83112466, "num_input_tokens_seen": 251689350, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 11667, "time_per_iteration": 2.6176979541778564 }, { "auxiliary_loss_clip": 0.01132218, "auxiliary_loss_mlp": 0.01035117, "balance_loss_clip": 1.02252269, "balance_loss_mlp": 1.03529286, "epoch": 0.7015181121298663, "flos": 25082095397760.0, "grad_norm": 2.846818945226934, "language_loss": 0.65721482, "learning_rate": 8.168010170416704e-07, "loss": 0.6788882, "num_input_tokens_seen": 251704635, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 11668, "time_per_iteration": 2.6245789527893066 }, { "auxiliary_loss_clip": 0.0112644, "auxiliary_loss_mlp": 0.01028435, "balance_loss_clip": 1.01557875, "balance_loss_mlp": 1.03630078, "epoch": 0.7015782353825342, "flos": 23327894891520.0, "grad_norm": 2.0113132274889445, "language_loss": 0.7638061, "learning_rate": 8.164964257044569e-07, "loss": 0.78535485, "num_input_tokens_seen": 251723035, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 11669, "time_per_iteration": 2.5392744541168213 }, { "auxiliary_loss_clip": 0.01043499, "auxiliary_loss_mlp": 0.01001283, "balance_loss_clip": 0.99996012, "balance_loss_mlp": 1.00436282, "epoch": 0.7016383586352022, "flos": 70397161107840.0, "grad_norm": 0.7572870102018326, "language_loss": 0.5457468, "learning_rate": 8.161918766034408e-07, "loss": 0.56619459, "num_input_tokens_seen": 251791630, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.21582031, "step": 11670, "time_per_iteration": 3.2756288051605225 }, { "auxiliary_loss_clip": 0.01124114, "auxiliary_loss_mlp": 0.01039525, "balance_loss_clip": 1.02641189, "balance_loss_mlp": 1.03590846, "epoch": 0.7016984818878701, "flos": 19792274348160.0, "grad_norm": 1.615861443577631, "language_loss": 0.81899357, "learning_rate": 8.158873697494908e-07, "loss": 0.84062994, "num_input_tokens_seen": 251809840, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 11671, "time_per_iteration": 2.5284173488616943 }, { "auxiliary_loss_clip": 0.01120431, "auxiliary_loss_mlp": 0.01035466, "balance_loss_clip": 1.02224612, "balance_loss_mlp": 1.03931415, "epoch": 0.7017586051405381, "flos": 12896923184640.0, "grad_norm": 1.8999788683278547, "language_loss": 0.75106561, "learning_rate": 8.155829051534753e-07, "loss": 0.77262455, "num_input_tokens_seen": 251827550, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 11672, "time_per_iteration": 2.4908061027526855 }, { "auxiliary_loss_clip": 0.01114952, "auxiliary_loss_mlp": 0.01030285, "balance_loss_clip": 1.01706445, "balance_loss_mlp": 1.03524661, "epoch": 0.701818728393206, "flos": 18332828847360.0, "grad_norm": 1.6057355743480444, "language_loss": 0.87332731, "learning_rate": 8.152784828262593e-07, "loss": 0.89477968, "num_input_tokens_seen": 251844880, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 11673, "time_per_iteration": 2.512892961502075 }, { "auxiliary_loss_clip": 0.0111299, "auxiliary_loss_mlp": 0.01029913, "balance_loss_clip": 1.01748598, "balance_loss_mlp": 1.0351162, "epoch": 0.701878851645874, "flos": 17384212615680.0, "grad_norm": 2.4403611371297504, "language_loss": 0.72864366, "learning_rate": 8.149741027787069e-07, "loss": 0.75007272, "num_input_tokens_seen": 251861025, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 11674, "time_per_iteration": 2.555863857269287 }, { "auxiliary_loss_clip": 0.01114442, "auxiliary_loss_mlp": 0.01281001, "balance_loss_clip": 1.0223608, "balance_loss_mlp": 1.03766656, "epoch": 0.701938974898542, "flos": 23915501481600.0, "grad_norm": 1.6117681756691162, "language_loss": 0.71955925, "learning_rate": 8.146697650216798e-07, "loss": 0.7435137, "num_input_tokens_seen": 251880175, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 11675, "time_per_iteration": 2.567979574203491 }, { "auxiliary_loss_clip": 0.01134678, "auxiliary_loss_mlp": 0.01028834, "balance_loss_clip": 1.01593518, "balance_loss_mlp": 1.03671837, "epoch": 0.70199909815121, "flos": 21795586652160.0, "grad_norm": 2.182209393873213, "language_loss": 0.50758076, "learning_rate": 8.143654695660412e-07, "loss": 0.52921587, "num_input_tokens_seen": 251899005, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 11676, "time_per_iteration": 2.5806052684783936 }, { "auxiliary_loss_clip": 0.01043376, "auxiliary_loss_mlp": 0.01249688, "balance_loss_clip": 1.00237048, "balance_loss_mlp": 1.00473177, "epoch": 0.702059221403878, "flos": 71715047109120.0, "grad_norm": 0.7529611467461693, "language_loss": 0.59201717, "learning_rate": 8.140612164226475e-07, "loss": 0.6149478, "num_input_tokens_seen": 251966790, "router_z_loss_clip": 0.01379395, "router_z_loss_mlp": 0.21679688, "step": 11677, "time_per_iteration": 3.2090530395507812 }, { "auxiliary_loss_clip": 0.01117411, "auxiliary_loss_mlp": 0.01028544, "balance_loss_clip": 1.01565766, "balance_loss_mlp": 1.0353415, "epoch": 0.7021193446565459, "flos": 28111052649600.0, "grad_norm": 2.567313175951336, "language_loss": 0.62663341, "learning_rate": 8.137570056023593e-07, "loss": 0.64809299, "num_input_tokens_seen": 251989315, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 11678, "time_per_iteration": 2.6524760723114014 }, { "auxiliary_loss_clip": 0.01106225, "auxiliary_loss_mlp": 0.0103563, "balance_loss_clip": 1.02302384, "balance_loss_mlp": 1.03609967, "epoch": 0.7021794679092139, "flos": 22924905229440.0, "grad_norm": 1.7078241745305194, "language_loss": 0.79324019, "learning_rate": 8.134528371160321e-07, "loss": 0.81465876, "num_input_tokens_seen": 252006620, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 11679, "time_per_iteration": 2.5509355068206787 }, { "auxiliary_loss_clip": 0.01125894, "auxiliary_loss_mlp": 0.01036597, "balance_loss_clip": 1.02517641, "balance_loss_mlp": 1.03883851, "epoch": 0.7022395911618818, "flos": 18077827219200.0, "grad_norm": 2.1942858719378036, "language_loss": 0.71007758, "learning_rate": 8.131487109745212e-07, "loss": 0.73170245, "num_input_tokens_seen": 252024570, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.69140625, "step": 11680, "time_per_iteration": 2.538181781768799 }, { "auxiliary_loss_clip": 0.01137504, "auxiliary_loss_mlp": 0.01032356, "balance_loss_clip": 1.01874804, "balance_loss_mlp": 1.03742957, "epoch": 0.7022997144145499, "flos": 16034294661120.0, "grad_norm": 1.6960050320780575, "language_loss": 0.74613881, "learning_rate": 8.128446271886789e-07, "loss": 0.76783735, "num_input_tokens_seen": 252042775, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 11681, "time_per_iteration": 2.5481576919555664 }, { "auxiliary_loss_clip": 0.0112416, "auxiliary_loss_mlp": 0.01034505, "balance_loss_clip": 1.02251291, "balance_loss_mlp": 1.03523338, "epoch": 0.7023598376672178, "flos": 26468678160000.0, "grad_norm": 1.511798873305423, "language_loss": 0.76995075, "learning_rate": 8.125405857693588e-07, "loss": 0.7915374, "num_input_tokens_seen": 252063690, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 11682, "time_per_iteration": 2.5855495929718018 }, { "auxiliary_loss_clip": 0.01136326, "auxiliary_loss_mlp": 0.01281667, "balance_loss_clip": 1.02187514, "balance_loss_mlp": 1.03648853, "epoch": 0.7024199609198858, "flos": 17055917285760.0, "grad_norm": 1.8902120675252128, "language_loss": 0.7322309, "learning_rate": 8.12236586727411e-07, "loss": 0.7564109, "num_input_tokens_seen": 252080335, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 11683, "time_per_iteration": 2.565732002258301 }, { "auxiliary_loss_clip": 0.01133905, "auxiliary_loss_mlp": 0.01029029, "balance_loss_clip": 1.01574326, "balance_loss_mlp": 1.03671145, "epoch": 0.7024800841725537, "flos": 25849039616640.0, "grad_norm": 1.7021077885604925, "language_loss": 0.71266532, "learning_rate": 8.119326300736837e-07, "loss": 0.73429465, "num_input_tokens_seen": 252101075, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 11684, "time_per_iteration": 2.5539588928222656 }, { "auxiliary_loss_clip": 0.01117044, "auxiliary_loss_mlp": 0.01032832, "balance_loss_clip": 1.01943302, "balance_loss_mlp": 1.03681695, "epoch": 0.7025402074252217, "flos": 23513014609920.0, "grad_norm": 2.004416880656261, "language_loss": 0.71738076, "learning_rate": 8.116287158190251e-07, "loss": 0.7388795, "num_input_tokens_seen": 252120510, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 11685, "time_per_iteration": 2.6465671062469482 }, { "auxiliary_loss_clip": 0.01116695, "auxiliary_loss_mlp": 0.01031533, "balance_loss_clip": 1.01976681, "balance_loss_mlp": 1.03633559, "epoch": 0.7026003306778896, "flos": 20150985519360.0, "grad_norm": 1.651901528399524, "language_loss": 0.84534049, "learning_rate": 8.113248439742808e-07, "loss": 0.86682278, "num_input_tokens_seen": 252137590, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.7109375, "step": 11686, "time_per_iteration": 2.5005099773406982 }, { "auxiliary_loss_clip": 0.01035141, "auxiliary_loss_mlp": 0.01004028, "balance_loss_clip": 1.00266349, "balance_loss_mlp": 1.00431752, "epoch": 0.7026604539305576, "flos": 64772400712320.0, "grad_norm": 0.9797037433334685, "language_loss": 0.69938278, "learning_rate": 8.110210145502949e-07, "loss": 0.71977448, "num_input_tokens_seen": 252199830, "router_z_loss_clip": 0.01367188, "router_z_loss_mlp": 0.21679688, "step": 11687, "time_per_iteration": 3.3182883262634277 }, { "auxiliary_loss_clip": 0.01135944, "auxiliary_loss_mlp": 0.01029011, "balance_loss_clip": 1.01592231, "balance_loss_mlp": 1.03782868, "epoch": 0.7027205771832256, "flos": 21871466133120.0, "grad_norm": 2.080654512897086, "language_loss": 0.77118909, "learning_rate": 8.107172275579099e-07, "loss": 0.79283869, "num_input_tokens_seen": 252217200, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 11688, "time_per_iteration": 3.995640277862549 }, { "auxiliary_loss_clip": 0.01106503, "auxiliary_loss_mlp": 0.01031782, "balance_loss_clip": 1.01904428, "balance_loss_mlp": 1.03583312, "epoch": 0.7027807004358936, "flos": 23367791923200.0, "grad_norm": 3.0676859093908195, "language_loss": 0.69082284, "learning_rate": 8.104134830079688e-07, "loss": 0.71220565, "num_input_tokens_seen": 252236105, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 11689, "time_per_iteration": 2.5629119873046875 }, { "auxiliary_loss_clip": 0.01126633, "auxiliary_loss_mlp": 0.01041382, "balance_loss_clip": 1.02746487, "balance_loss_mlp": 1.03572178, "epoch": 0.7028408236885616, "flos": 15304266645120.0, "grad_norm": 2.3831901335298338, "language_loss": 0.79530436, "learning_rate": 8.101097809113105e-07, "loss": 0.81698453, "num_input_tokens_seen": 252253315, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 11690, "time_per_iteration": 2.596174478530884 }, { "auxiliary_loss_clip": 0.01108041, "auxiliary_loss_mlp": 0.01038278, "balance_loss_clip": 1.02545702, "balance_loss_mlp": 1.03724515, "epoch": 0.7029009469412295, "flos": 22018197191040.0, "grad_norm": 1.622041343953723, "language_loss": 0.75766867, "learning_rate": 8.098061212787732e-07, "loss": 0.77913189, "num_input_tokens_seen": 252272765, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 11691, "time_per_iteration": 2.486422538757324 }, { "auxiliary_loss_clip": 0.01125268, "auxiliary_loss_mlp": 0.01027562, "balance_loss_clip": 1.01573634, "balance_loss_mlp": 1.03821588, "epoch": 0.7029610701938975, "flos": 21835519597440.0, "grad_norm": 1.7242784092288463, "language_loss": 0.81531298, "learning_rate": 8.095025041211932e-07, "loss": 0.83684123, "num_input_tokens_seen": 252290510, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 11692, "time_per_iteration": 2.545616388320923 }, { "auxiliary_loss_clip": 0.01123196, "auxiliary_loss_mlp": 0.01031131, "balance_loss_clip": 1.01801836, "balance_loss_mlp": 1.03488672, "epoch": 0.7030211934465654, "flos": 19135647774720.0, "grad_norm": 1.4646460499743819, "language_loss": 0.76414371, "learning_rate": 8.091989294494079e-07, "loss": 0.78568697, "num_input_tokens_seen": 252309365, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 11693, "time_per_iteration": 2.545222759246826 }, { "auxiliary_loss_clip": 0.01125656, "auxiliary_loss_mlp": 0.0103299, "balance_loss_clip": 1.02052116, "balance_loss_mlp": 1.03815711, "epoch": 0.7030813166992335, "flos": 38546010766080.0, "grad_norm": 1.4868689237123403, "language_loss": 0.68494153, "learning_rate": 8.088953972742482e-07, "loss": 0.70652801, "num_input_tokens_seen": 252333010, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 11694, "time_per_iteration": 4.088347673416138 }, { "auxiliary_loss_clip": 0.01107206, "auxiliary_loss_mlp": 0.01032967, "balance_loss_clip": 1.02047384, "balance_loss_mlp": 1.03562331, "epoch": 0.7031414399519014, "flos": 14720897859840.0, "grad_norm": 4.35630911492126, "language_loss": 0.75316989, "learning_rate": 8.085919076065488e-07, "loss": 0.77457166, "num_input_tokens_seen": 252351330, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 11695, "time_per_iteration": 2.533006429672241 }, { "auxiliary_loss_clip": 0.01127734, "auxiliary_loss_mlp": 0.01039065, "balance_loss_clip": 1.0258981, "balance_loss_mlp": 1.03714573, "epoch": 0.7032015632045694, "flos": 14027247342720.0, "grad_norm": 1.918148979115508, "language_loss": 0.7396276, "learning_rate": 8.082884604571394e-07, "loss": 0.76129562, "num_input_tokens_seen": 252369580, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 11696, "time_per_iteration": 2.518536329269409 }, { "auxiliary_loss_clip": 0.01126189, "auxiliary_loss_mlp": 0.0103148, "balance_loss_clip": 1.01792574, "balance_loss_mlp": 1.03629899, "epoch": 0.7032616864572373, "flos": 27637175496960.0, "grad_norm": 1.6295976596128117, "language_loss": 0.75167227, "learning_rate": 8.079850558368495e-07, "loss": 0.77324891, "num_input_tokens_seen": 252390525, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 11697, "time_per_iteration": 2.6481809616088867 }, { "auxiliary_loss_clip": 0.01148323, "auxiliary_loss_mlp": 0.01037708, "balance_loss_clip": 1.022771, "balance_loss_mlp": 1.03884339, "epoch": 0.7033218097099053, "flos": 17967294092160.0, "grad_norm": 1.7381771230666676, "language_loss": 0.8085795, "learning_rate": 8.076816937565061e-07, "loss": 0.83043987, "num_input_tokens_seen": 252407470, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.734375, "step": 11698, "time_per_iteration": 2.568655490875244 }, { "auxiliary_loss_clip": 0.0112567, "auxiliary_loss_mlp": 0.01038004, "balance_loss_clip": 1.02508819, "balance_loss_mlp": 1.03586614, "epoch": 0.7033819329625732, "flos": 19501721233920.0, "grad_norm": 1.3712887698588296, "language_loss": 0.84630913, "learning_rate": 8.073783742269364e-07, "loss": 0.86794585, "num_input_tokens_seen": 252427025, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 11699, "time_per_iteration": 2.5591204166412354 }, { "auxiliary_loss_clip": 0.01117475, "auxiliary_loss_mlp": 0.01031954, "balance_loss_clip": 1.01856685, "balance_loss_mlp": 1.03815985, "epoch": 0.7034420562152413, "flos": 23987645948160.0, "grad_norm": 1.8584647031734634, "language_loss": 0.79031181, "learning_rate": 8.070750972589658e-07, "loss": 0.81180608, "num_input_tokens_seen": 252445410, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 11700, "time_per_iteration": 5.701578617095947 }, { "auxiliary_loss_clip": 0.01115412, "auxiliary_loss_mlp": 0.01027651, "balance_loss_clip": 1.01400149, "balance_loss_mlp": 1.03482711, "epoch": 0.7035021794679092, "flos": 35043427756800.0, "grad_norm": 1.6398274942264361, "language_loss": 0.74083912, "learning_rate": 8.067718628634148e-07, "loss": 0.76226974, "num_input_tokens_seen": 252463905, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 11701, "time_per_iteration": 2.6784677505493164 }, { "auxiliary_loss_clip": 0.01120062, "auxiliary_loss_mlp": 0.01032603, "balance_loss_clip": 1.01825058, "balance_loss_mlp": 1.03827333, "epoch": 0.7035623027205772, "flos": 10997428164480.0, "grad_norm": 2.1611943991667912, "language_loss": 0.83897102, "learning_rate": 8.064686710511075e-07, "loss": 0.86049771, "num_input_tokens_seen": 252478655, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7265625, "step": 11702, "time_per_iteration": 2.530979633331299 }, { "auxiliary_loss_clip": 0.0110741, "auxiliary_loss_mlp": 0.01036963, "balance_loss_clip": 1.02439249, "balance_loss_mlp": 1.03592145, "epoch": 0.7036224259732452, "flos": 23623727304960.0, "grad_norm": 1.7392525168949273, "language_loss": 0.61003661, "learning_rate": 8.061655218328631e-07, "loss": 0.63148034, "num_input_tokens_seen": 252498740, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 11703, "time_per_iteration": 2.532123565673828 }, { "auxiliary_loss_clip": 0.01105826, "auxiliary_loss_mlp": 0.01029448, "balance_loss_clip": 1.0162518, "balance_loss_mlp": 1.03455329, "epoch": 0.7036825492259131, "flos": 31686175175040.0, "grad_norm": 2.139246702756548, "language_loss": 0.61136782, "learning_rate": 8.058624152195003e-07, "loss": 0.63272059, "num_input_tokens_seen": 252517800, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 11704, "time_per_iteration": 2.626338481903076 }, { "auxiliary_loss_clip": 0.01110924, "auxiliary_loss_mlp": 0.01031, "balance_loss_clip": 1.01962709, "balance_loss_mlp": 1.0339582, "epoch": 0.7037426724785811, "flos": 30192866127360.0, "grad_norm": 1.6351261585163683, "language_loss": 0.70712483, "learning_rate": 8.055593512218357e-07, "loss": 0.72854406, "num_input_tokens_seen": 252539620, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6796875, "step": 11705, "time_per_iteration": 2.6269636154174805 }, { "auxiliary_loss_clip": 0.01113449, "auxiliary_loss_mlp": 0.01031246, "balance_loss_clip": 1.01864529, "balance_loss_mlp": 1.03566706, "epoch": 0.703802795731249, "flos": 24311523905280.0, "grad_norm": 2.708379001016491, "language_loss": 0.62018776, "learning_rate": 8.052563298506858e-07, "loss": 0.64163476, "num_input_tokens_seen": 252557300, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 11706, "time_per_iteration": 2.5779757499694824 }, { "auxiliary_loss_clip": 0.01116453, "auxiliary_loss_mlp": 0.01028273, "balance_loss_clip": 1.01586914, "balance_loss_mlp": 1.03665721, "epoch": 0.7038629189839171, "flos": 22528954632960.0, "grad_norm": 1.8087538581313245, "language_loss": 0.68289161, "learning_rate": 8.049533511168645e-07, "loss": 0.70433885, "num_input_tokens_seen": 252576715, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 11707, "time_per_iteration": 2.5055367946624756 }, { "auxiliary_loss_clip": 0.01102757, "auxiliary_loss_mlp": 0.01029162, "balance_loss_clip": 1.01696086, "balance_loss_mlp": 1.03447425, "epoch": 0.703923042236585, "flos": 26250484993920.0, "grad_norm": 1.7876932776481929, "language_loss": 0.76308823, "learning_rate": 8.04650415031184e-07, "loss": 0.78440738, "num_input_tokens_seen": 252596190, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 11708, "time_per_iteration": 2.5838866233825684 }, { "auxiliary_loss_clip": 0.01122973, "auxiliary_loss_mlp": 0.0102854, "balance_loss_clip": 1.01644588, "balance_loss_mlp": 1.0362916, "epoch": 0.703983165489253, "flos": 19390254353280.0, "grad_norm": 2.2655107200013775, "language_loss": 0.7209996, "learning_rate": 8.043475216044547e-07, "loss": 0.74251473, "num_input_tokens_seen": 252613410, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 11709, "time_per_iteration": 2.544542074203491 }, { "auxiliary_loss_clip": 0.01113579, "auxiliary_loss_mlp": 0.01027294, "balance_loss_clip": 1.01527739, "balance_loss_mlp": 1.03463769, "epoch": 0.7040432887419209, "flos": 16683630773760.0, "grad_norm": 1.9092739235959972, "language_loss": 0.78715503, "learning_rate": 8.040446708474879e-07, "loss": 0.80856377, "num_input_tokens_seen": 252629150, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 11710, "time_per_iteration": 2.585263967514038 }, { "auxiliary_loss_clip": 0.0111784, "auxiliary_loss_mlp": 0.01032848, "balance_loss_clip": 1.01983047, "balance_loss_mlp": 1.03839779, "epoch": 0.7041034119945889, "flos": 21141402203520.0, "grad_norm": 1.6410384789120644, "language_loss": 0.77119303, "learning_rate": 8.037418627710892e-07, "loss": 0.79269999, "num_input_tokens_seen": 252648225, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 11711, "time_per_iteration": 2.6338775157928467 }, { "auxiliary_loss_clip": 0.01129897, "auxiliary_loss_mlp": 0.0102907, "balance_loss_clip": 1.01783442, "balance_loss_mlp": 1.03459775, "epoch": 0.7041635352472568, "flos": 16910299549440.0, "grad_norm": 2.1740556038832413, "language_loss": 0.74850899, "learning_rate": 8.034390973860672e-07, "loss": 0.77009869, "num_input_tokens_seen": 252665380, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6875, "step": 11712, "time_per_iteration": 2.6383626461029053 }, { "auxiliary_loss_clip": 0.01052306, "auxiliary_loss_mlp": 0.01001193, "balance_loss_clip": 0.99984038, "balance_loss_mlp": 1.00382876, "epoch": 0.7042236584999249, "flos": 71681219475840.0, "grad_norm": 0.8764600212207241, "language_loss": 0.64600885, "learning_rate": 8.031363747032256e-07, "loss": 0.66654384, "num_input_tokens_seen": 252727950, "router_z_loss_clip": 0.0135498, "router_z_loss_mlp": 0.21679688, "step": 11713, "time_per_iteration": 3.2400107383728027 }, { "auxiliary_loss_clip": 0.01112646, "auxiliary_loss_mlp": 0.01026405, "balance_loss_clip": 1.01452029, "balance_loss_mlp": 1.03355336, "epoch": 0.7042837817525928, "flos": 28658187590400.0, "grad_norm": 3.36463423774462, "language_loss": 0.73157376, "learning_rate": 8.028336947333682e-07, "loss": 0.75296426, "num_input_tokens_seen": 252746770, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.703125, "step": 11714, "time_per_iteration": 2.6669387817382812 }, { "auxiliary_loss_clip": 0.01122859, "auxiliary_loss_mlp": 0.01282821, "balance_loss_clip": 1.02340436, "balance_loss_mlp": 1.03564131, "epoch": 0.7043439050052608, "flos": 19753562465280.0, "grad_norm": 1.8769610987160363, "language_loss": 0.79494107, "learning_rate": 8.025310574872967e-07, "loss": 0.81899786, "num_input_tokens_seen": 252765610, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 11715, "time_per_iteration": 2.613255500793457 }, { "auxiliary_loss_clip": 0.01125931, "auxiliary_loss_mlp": 0.01035067, "balance_loss_clip": 1.02181733, "balance_loss_mlp": 1.0357393, "epoch": 0.7044040282579288, "flos": 11538529620480.0, "grad_norm": 2.259278332750592, "language_loss": 0.71362615, "learning_rate": 8.022284629758109e-07, "loss": 0.73523617, "num_input_tokens_seen": 252781610, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 11716, "time_per_iteration": 2.5042812824249268 }, { "auxiliary_loss_clip": 0.01145545, "auxiliary_loss_mlp": 0.01033786, "balance_loss_clip": 1.02118564, "balance_loss_mlp": 1.03772533, "epoch": 0.7044641515105967, "flos": 33656126722560.0, "grad_norm": 1.790390723006574, "language_loss": 0.66454536, "learning_rate": 8.019259112097117e-07, "loss": 0.68633866, "num_input_tokens_seen": 252800600, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 11717, "time_per_iteration": 2.6675374507904053 }, { "auxiliary_loss_clip": 0.0110731, "auxiliary_loss_mlp": 0.01029381, "balance_loss_clip": 1.01762104, "balance_loss_mlp": 1.03659129, "epoch": 0.7045242747632647, "flos": 26723859356160.0, "grad_norm": 1.4391767807118219, "language_loss": 0.74136758, "learning_rate": 8.016234021997934e-07, "loss": 0.76273453, "num_input_tokens_seen": 252822310, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.70703125, "step": 11718, "time_per_iteration": 2.534653902053833 }, { "auxiliary_loss_clip": 0.01115016, "auxiliary_loss_mlp": 0.01030684, "balance_loss_clip": 1.01925159, "balance_loss_mlp": 1.03665113, "epoch": 0.7045843980159326, "flos": 26797655848320.0, "grad_norm": 1.6240404840489886, "language_loss": 0.80044365, "learning_rate": 8.01320935956854e-07, "loss": 0.82190061, "num_input_tokens_seen": 252842355, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6953125, "step": 11719, "time_per_iteration": 2.5737972259521484 }, { "auxiliary_loss_clip": 0.01115987, "auxiliary_loss_mlp": 0.01036492, "balance_loss_clip": 1.02367663, "balance_loss_mlp": 1.03707409, "epoch": 0.7046445212686007, "flos": 41574824363520.0, "grad_norm": 1.624912531380609, "language_loss": 0.65702677, "learning_rate": 8.010185124916868e-07, "loss": 0.67855155, "num_input_tokens_seen": 252866785, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 11720, "time_per_iteration": 2.6762051582336426 }, { "auxiliary_loss_clip": 0.01128813, "auxiliary_loss_mlp": 0.01032648, "balance_loss_clip": 1.01989913, "balance_loss_mlp": 1.03866887, "epoch": 0.7047046445212686, "flos": 15560166113280.0, "grad_norm": 3.714505560575761, "language_loss": 0.80290604, "learning_rate": 8.007161318150851e-07, "loss": 0.82452059, "num_input_tokens_seen": 252881870, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 11721, "time_per_iteration": 2.55318284034729 }, { "auxiliary_loss_clip": 0.01052106, "auxiliary_loss_mlp": 0.01002033, "balance_loss_clip": 1.00072193, "balance_loss_mlp": 1.00397396, "epoch": 0.7047647677739366, "flos": 70410269571840.0, "grad_norm": 0.7618405128903322, "language_loss": 0.64767289, "learning_rate": 8.004137939378388e-07, "loss": 0.66821432, "num_input_tokens_seen": 252951300, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.21484375, "step": 11722, "time_per_iteration": 3.260162115097046 }, { "auxiliary_loss_clip": 0.01124839, "auxiliary_loss_mlp": 0.01034209, "balance_loss_clip": 1.02235913, "balance_loss_mlp": 1.0380559, "epoch": 0.7048248910266045, "flos": 23660032976640.0, "grad_norm": 1.6863071991303507, "language_loss": 0.66062367, "learning_rate": 8.0011149887074e-07, "loss": 0.68221414, "num_input_tokens_seen": 252971400, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 11723, "time_per_iteration": 2.6898179054260254 }, { "auxiliary_loss_clip": 0.01134864, "auxiliary_loss_mlp": 0.01029104, "balance_loss_clip": 1.01646161, "balance_loss_mlp": 1.03706455, "epoch": 0.7048850142792725, "flos": 21397158017280.0, "grad_norm": 1.7474011337013902, "language_loss": 0.81305867, "learning_rate": 7.998092466245739e-07, "loss": 0.83469832, "num_input_tokens_seen": 252989475, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 11724, "time_per_iteration": 2.5609261989593506 }, { "auxiliary_loss_clip": 0.01115304, "auxiliary_loss_mlp": 0.01036179, "balance_loss_clip": 1.02337611, "balance_loss_mlp": 1.03492188, "epoch": 0.7049451375319404, "flos": 21648101408640.0, "grad_norm": 1.601549778812927, "language_loss": 0.73323357, "learning_rate": 7.995070372101291e-07, "loss": 0.75474846, "num_input_tokens_seen": 253007220, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 11725, "time_per_iteration": 2.553948163986206 }, { "auxiliary_loss_clip": 0.01142789, "auxiliary_loss_mlp": 0.01028789, "balance_loss_clip": 1.01589036, "balance_loss_mlp": 1.03557599, "epoch": 0.7050052607846085, "flos": 14866802904960.0, "grad_norm": 1.9429453630383016, "language_loss": 0.78212678, "learning_rate": 7.992048706381896e-07, "loss": 0.80384254, "num_input_tokens_seen": 253025410, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11726, "time_per_iteration": 2.59024977684021 }, { "auxiliary_loss_clip": 0.01124807, "auxiliary_loss_mlp": 0.01036049, "balance_loss_clip": 1.02388394, "balance_loss_mlp": 1.03500211, "epoch": 0.7050653840372764, "flos": 19241763528960.0, "grad_norm": 1.9041764935783705, "language_loss": 0.70707577, "learning_rate": 7.989027469195409e-07, "loss": 0.72868431, "num_input_tokens_seen": 253043305, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.71875, "step": 11727, "time_per_iteration": 2.5362658500671387 }, { "auxiliary_loss_clip": 0.0109671, "auxiliary_loss_mlp": 0.01270115, "balance_loss_clip": 1.01228321, "balance_loss_mlp": 1.03198862, "epoch": 0.7051255072899444, "flos": 27780422935680.0, "grad_norm": 2.134443157560892, "language_loss": 0.69188958, "learning_rate": 7.98600666064962e-07, "loss": 0.71555781, "num_input_tokens_seen": 253062790, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6484375, "step": 11728, "time_per_iteration": 2.5824780464172363 }, { "auxiliary_loss_clip": 0.01132681, "auxiliary_loss_mlp": 0.01276618, "balance_loss_clip": 1.01778126, "balance_loss_mlp": 1.03708076, "epoch": 0.7051856305426124, "flos": 27892033470720.0, "grad_norm": 1.6424359973523535, "language_loss": 0.72619766, "learning_rate": 7.982986280852355e-07, "loss": 0.75029063, "num_input_tokens_seen": 253082055, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 11729, "time_per_iteration": 4.047966241836548 }, { "auxiliary_loss_clip": 0.01128541, "auxiliary_loss_mlp": 0.01033814, "balance_loss_clip": 1.02048659, "balance_loss_mlp": 1.03625357, "epoch": 0.7052457537952803, "flos": 25043563082880.0, "grad_norm": 1.6279006931671953, "language_loss": 0.78222597, "learning_rate": 7.97996632991141e-07, "loss": 0.80384958, "num_input_tokens_seen": 253102575, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.74609375, "step": 11730, "time_per_iteration": 2.5910484790802 }, { "auxiliary_loss_clip": 0.01106098, "auxiliary_loss_mlp": 0.01035042, "balance_loss_clip": 1.02246499, "balance_loss_mlp": 1.03484595, "epoch": 0.7053058770479483, "flos": 21871717528320.0, "grad_norm": 2.6268574247981955, "language_loss": 0.63544798, "learning_rate": 7.976946807934528e-07, "loss": 0.6568594, "num_input_tokens_seen": 253121290, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 11731, "time_per_iteration": 2.5135281085968018 }, { "auxiliary_loss_clip": 0.01143191, "auxiliary_loss_mlp": 0.01027972, "balance_loss_clip": 1.01519847, "balance_loss_mlp": 1.03504443, "epoch": 0.7053660003006162, "flos": 16398716094720.0, "grad_norm": 1.804408253885557, "language_loss": 0.74466646, "learning_rate": 7.973927715029499e-07, "loss": 0.76637805, "num_input_tokens_seen": 253139720, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 11732, "time_per_iteration": 2.5753774642944336 }, { "auxiliary_loss_clip": 0.0111395, "auxiliary_loss_mlp": 0.010293, "balance_loss_clip": 1.01749265, "balance_loss_mlp": 1.03603351, "epoch": 0.7054261235532843, "flos": 22711560399360.0, "grad_norm": 1.4951892085538125, "language_loss": 0.71034694, "learning_rate": 7.970909051304044e-07, "loss": 0.73177946, "num_input_tokens_seen": 253160250, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 11733, "time_per_iteration": 2.564598798751831 }, { "auxiliary_loss_clip": 0.01130513, "auxiliary_loss_mlp": 0.01033982, "balance_loss_clip": 1.02150726, "balance_loss_mlp": 1.03453493, "epoch": 0.7054862468059522, "flos": 13589711775360.0, "grad_norm": 1.9339502217708378, "language_loss": 0.73766661, "learning_rate": 7.967890816865921e-07, "loss": 0.75931156, "num_input_tokens_seen": 253178710, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 11734, "time_per_iteration": 2.5484538078308105 }, { "auxiliary_loss_clip": 0.01137679, "auxiliary_loss_mlp": 0.01035697, "balance_loss_clip": 1.02283394, "balance_loss_mlp": 1.03711176, "epoch": 0.7055463700586202, "flos": 15880704105600.0, "grad_norm": 2.9932758846722325, "language_loss": 0.68538058, "learning_rate": 7.964873011822808e-07, "loss": 0.70711434, "num_input_tokens_seen": 253194805, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 11735, "time_per_iteration": 2.6042799949645996 }, { "auxiliary_loss_clip": 0.01125103, "auxiliary_loss_mlp": 0.01037084, "balance_loss_clip": 1.02264214, "balance_loss_mlp": 1.03581655, "epoch": 0.7056064933112881, "flos": 23076161400960.0, "grad_norm": 1.7777964379770528, "language_loss": 0.72388703, "learning_rate": 7.961855636282427e-07, "loss": 0.74550891, "num_input_tokens_seen": 253213895, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.71875, "step": 11736, "time_per_iteration": 3.87506365776062 }, { "auxiliary_loss_clip": 0.01122998, "auxiliary_loss_mlp": 0.01024824, "balance_loss_clip": 1.01363015, "balance_loss_mlp": 1.03705144, "epoch": 0.7056666165639561, "flos": 24057168721920.0, "grad_norm": 1.860136543468803, "language_loss": 0.69038284, "learning_rate": 7.958838690352449e-07, "loss": 0.71186101, "num_input_tokens_seen": 253231620, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.68359375, "step": 11737, "time_per_iteration": 2.5820136070251465 }, { "auxiliary_loss_clip": 0.01136525, "auxiliary_loss_mlp": 0.0103707, "balance_loss_clip": 1.02370656, "balance_loss_mlp": 1.03788447, "epoch": 0.705726739816624, "flos": 17493237371520.0, "grad_norm": 1.8500020337270788, "language_loss": 0.67410505, "learning_rate": 7.955822174140549e-07, "loss": 0.69584107, "num_input_tokens_seen": 253249590, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 11738, "time_per_iteration": 2.5249650478363037 }, { "auxiliary_loss_clip": 0.01106605, "auxiliary_loss_mlp": 0.01037566, "balance_loss_clip": 1.02411962, "balance_loss_mlp": 1.03612721, "epoch": 0.7057868630692921, "flos": 51350426472960.0, "grad_norm": 2.5527506877896755, "language_loss": 0.74815035, "learning_rate": 7.952806087754364e-07, "loss": 0.76959217, "num_input_tokens_seen": 253273870, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 11739, "time_per_iteration": 2.8391897678375244 }, { "auxiliary_loss_clip": 0.01120182, "auxiliary_loss_mlp": 0.01273209, "balance_loss_clip": 1.01384223, "balance_loss_mlp": 1.03507257, "epoch": 0.70584698632196, "flos": 26102963836800.0, "grad_norm": 1.786492298178974, "language_loss": 0.71378434, "learning_rate": 7.949790431301557e-07, "loss": 0.73771822, "num_input_tokens_seen": 253293720, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 11740, "time_per_iteration": 2.576969623565674 }, { "auxiliary_loss_clip": 0.0113314, "auxiliary_loss_mlp": 0.01028249, "balance_loss_clip": 1.01676345, "balance_loss_mlp": 1.03755116, "epoch": 0.705907109574628, "flos": 21543134889600.0, "grad_norm": 1.770968162048259, "language_loss": 0.81880158, "learning_rate": 7.94677520488972e-07, "loss": 0.84041548, "num_input_tokens_seen": 253313700, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.69140625, "step": 11741, "time_per_iteration": 4.058112621307373 }, { "auxiliary_loss_clip": 0.01124579, "auxiliary_loss_mlp": 0.01034281, "balance_loss_clip": 1.02156746, "balance_loss_mlp": 1.03609085, "epoch": 0.705967232827296, "flos": 22710842127360.0, "grad_norm": 1.7816662288866199, "language_loss": 0.78470272, "learning_rate": 7.943760408626474e-07, "loss": 0.80629134, "num_input_tokens_seen": 253332425, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 11742, "time_per_iteration": 4.278971433639526 }, { "auxiliary_loss_clip": 0.01109134, "auxiliary_loss_mlp": 0.01030632, "balance_loss_clip": 1.01795459, "balance_loss_mlp": 1.03622627, "epoch": 0.7060273560799639, "flos": 28691225124480.0, "grad_norm": 1.704748429208701, "language_loss": 0.64076066, "learning_rate": 7.940746042619404e-07, "loss": 0.66215837, "num_input_tokens_seen": 253353620, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 11743, "time_per_iteration": 2.7095606327056885 }, { "auxiliary_loss_clip": 0.01125746, "auxiliary_loss_mlp": 0.01030369, "balance_loss_clip": 1.01740503, "balance_loss_mlp": 1.03566754, "epoch": 0.7060874793326319, "flos": 15706178899200.0, "grad_norm": 1.9270819545083293, "language_loss": 0.65855396, "learning_rate": 7.937732106976098e-07, "loss": 0.68011504, "num_input_tokens_seen": 253370930, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 11744, "time_per_iteration": 2.574098825454712 }, { "auxiliary_loss_clip": 0.0111381, "auxiliary_loss_mlp": 0.01031017, "balance_loss_clip": 1.01857734, "balance_loss_mlp": 1.0358758, "epoch": 0.7061476025852998, "flos": 21506757390720.0, "grad_norm": 1.703246955663434, "language_loss": 0.63663638, "learning_rate": 7.9347186018041e-07, "loss": 0.65808463, "num_input_tokens_seen": 253389810, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 11745, "time_per_iteration": 2.6629891395568848 }, { "auxiliary_loss_clip": 0.01133568, "auxiliary_loss_mlp": 0.01033578, "balance_loss_clip": 1.02131152, "balance_loss_mlp": 1.0362308, "epoch": 0.7062077258379679, "flos": 28181832399360.0, "grad_norm": 1.8126216964901227, "language_loss": 0.71736193, "learning_rate": 7.931705527210952e-07, "loss": 0.73903346, "num_input_tokens_seen": 253408685, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 11746, "time_per_iteration": 2.6124720573425293 }, { "auxiliary_loss_clip": 0.01150667, "auxiliary_loss_mlp": 0.01034322, "balance_loss_clip": 1.02110791, "balance_loss_mlp": 1.03611708, "epoch": 0.7062678490906358, "flos": 27853680723840.0, "grad_norm": 1.425233160115935, "language_loss": 0.78997266, "learning_rate": 7.928692883304206e-07, "loss": 0.81182253, "num_input_tokens_seen": 253429685, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 11747, "time_per_iteration": 2.6502647399902344 }, { "auxiliary_loss_clip": 0.01112555, "auxiliary_loss_mlp": 0.01030669, "balance_loss_clip": 1.01922488, "balance_loss_mlp": 1.03565657, "epoch": 0.7063279723433038, "flos": 23184862934400.0, "grad_norm": 3.3716811905617576, "language_loss": 0.65093428, "learning_rate": 7.925680670191344e-07, "loss": 0.6723665, "num_input_tokens_seen": 253448260, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 11748, "time_per_iteration": 2.5679244995117188 }, { "auxiliary_loss_clip": 0.01140428, "auxiliary_loss_mlp": 0.01039854, "balance_loss_clip": 1.02670491, "balance_loss_mlp": 1.0344497, "epoch": 0.7063880955959717, "flos": 20188655907840.0, "grad_norm": 1.8614834716052178, "language_loss": 0.79660094, "learning_rate": 7.922668887979889e-07, "loss": 0.81840372, "num_input_tokens_seen": 253467725, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 11749, "time_per_iteration": 2.587034225463867 }, { "auxiliary_loss_clip": 0.01025367, "auxiliary_loss_mlp": 0.01001364, "balance_loss_clip": 0.99997538, "balance_loss_mlp": 1.00355649, "epoch": 0.7064482188486397, "flos": 63668182763520.0, "grad_norm": 0.7900949657728047, "language_loss": 0.54033303, "learning_rate": 7.919657536777304e-07, "loss": 0.56060034, "num_input_tokens_seen": 253526940, "router_z_loss_clip": 0.01391602, "router_z_loss_mlp": 0.21875, "step": 11750, "time_per_iteration": 3.088942527770996 }, { "auxiliary_loss_clip": 0.01124903, "auxiliary_loss_mlp": 0.01036371, "balance_loss_clip": 1.02349019, "balance_loss_mlp": 1.03743708, "epoch": 0.7065083421013076, "flos": 25191227894400.0, "grad_norm": 1.6976916353045761, "language_loss": 0.78830498, "learning_rate": 7.916646616691085e-07, "loss": 0.80991769, "num_input_tokens_seen": 253546160, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 11751, "time_per_iteration": 2.6299593448638916 }, { "auxiliary_loss_clip": 0.01127344, "auxiliary_loss_mlp": 0.01030323, "balance_loss_clip": 1.01842618, "balance_loss_mlp": 1.03350544, "epoch": 0.7065684653539757, "flos": 22893699288960.0, "grad_norm": 1.8010566802404564, "language_loss": 0.68055016, "learning_rate": 7.913636127828651e-07, "loss": 0.70212686, "num_input_tokens_seen": 253565505, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 11752, "time_per_iteration": 2.61712384223938 }, { "auxiliary_loss_clip": 0.01150152, "auxiliary_loss_mlp": 0.01285962, "balance_loss_clip": 1.02667725, "balance_loss_mlp": 1.03633595, "epoch": 0.7066285886066436, "flos": 23550254035200.0, "grad_norm": 7.7082536439745875, "language_loss": 0.76817858, "learning_rate": 7.910626070297461e-07, "loss": 0.79253972, "num_input_tokens_seen": 253585125, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 11753, "time_per_iteration": 2.647080659866333 }, { "auxiliary_loss_clip": 0.01129608, "auxiliary_loss_mlp": 0.01279168, "balance_loss_clip": 1.02047038, "balance_loss_mlp": 1.03534162, "epoch": 0.7066887118593116, "flos": 21069293650560.0, "grad_norm": 2.021206863323333, "language_loss": 0.70930851, "learning_rate": 7.907616444204928e-07, "loss": 0.73339635, "num_input_tokens_seen": 253604815, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.67578125, "step": 11754, "time_per_iteration": 2.5904242992401123 }, { "auxiliary_loss_clip": 0.01132528, "auxiliary_loss_mlp": 0.0103547, "balance_loss_clip": 1.02271485, "balance_loss_mlp": 1.035007, "epoch": 0.7067488351119796, "flos": 21176307244800.0, "grad_norm": 1.6927794155910159, "language_loss": 0.89381748, "learning_rate": 7.90460724965846e-07, "loss": 0.91549742, "num_input_tokens_seen": 253622855, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11755, "time_per_iteration": 2.647106647491455 }, { "auxiliary_loss_clip": 0.01121322, "auxiliary_loss_mlp": 0.01282774, "balance_loss_clip": 1.02309108, "balance_loss_mlp": 1.03478742, "epoch": 0.7068089583646475, "flos": 20449224144000.0, "grad_norm": 5.899267093352281, "language_loss": 0.79812872, "learning_rate": 7.901598486765438e-07, "loss": 0.82216966, "num_input_tokens_seen": 253642760, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 11756, "time_per_iteration": 2.604221820831299 }, { "auxiliary_loss_clip": 0.01061241, "auxiliary_loss_mlp": 0.01001381, "balance_loss_clip": 1.00005758, "balance_loss_mlp": 1.00407708, "epoch": 0.7068690816173155, "flos": 59109179829120.0, "grad_norm": 0.8249022557141642, "language_loss": 0.60316718, "learning_rate": 7.89859015563326e-07, "loss": 0.62379336, "num_input_tokens_seen": 253695685, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.21777344, "step": 11757, "time_per_iteration": 2.997695207595825 }, { "auxiliary_loss_clip": 0.01133338, "auxiliary_loss_mlp": 0.01030452, "balance_loss_clip": 1.01774383, "balance_loss_mlp": 1.03619671, "epoch": 0.7069292048699835, "flos": 16251554073600.0, "grad_norm": 1.841805486249637, "language_loss": 0.80669779, "learning_rate": 7.895582256369256e-07, "loss": 0.82833564, "num_input_tokens_seen": 253713305, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11758, "time_per_iteration": 2.578122615814209 }, { "auxiliary_loss_clip": 0.01129179, "auxiliary_loss_mlp": 0.01280397, "balance_loss_clip": 1.02103353, "balance_loss_mlp": 1.03358173, "epoch": 0.7069893281226515, "flos": 41172768455040.0, "grad_norm": 1.6606920661332096, "language_loss": 0.7779094, "learning_rate": 7.892574789080793e-07, "loss": 0.80200517, "num_input_tokens_seen": 253736100, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 11759, "time_per_iteration": 2.8030548095703125 }, { "auxiliary_loss_clip": 0.01113988, "auxiliary_loss_mlp": 0.01030629, "balance_loss_clip": 1.01824284, "balance_loss_mlp": 1.03473401, "epoch": 0.7070494513753194, "flos": 24207275658240.0, "grad_norm": 1.5900257675542868, "language_loss": 0.67700183, "learning_rate": 7.88956775387519e-07, "loss": 0.698448, "num_input_tokens_seen": 253757350, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 11760, "time_per_iteration": 2.5652642250061035 }, { "auxiliary_loss_clip": 0.01134332, "auxiliary_loss_mlp": 0.01031855, "balance_loss_clip": 1.01998734, "balance_loss_mlp": 1.03635716, "epoch": 0.7071095746279874, "flos": 20185675079040.0, "grad_norm": 2.5837533973936178, "language_loss": 0.80363876, "learning_rate": 7.886561150859763e-07, "loss": 0.82530063, "num_input_tokens_seen": 253772855, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.7109375, "step": 11761, "time_per_iteration": 2.54240083694458 }, { "auxiliary_loss_clip": 0.01123516, "auxiliary_loss_mlp": 0.01033584, "balance_loss_clip": 1.02084684, "balance_loss_mlp": 1.03498805, "epoch": 0.7071696978806553, "flos": 18183045133440.0, "grad_norm": 2.3026794209023347, "language_loss": 0.7466557, "learning_rate": 7.883554980141811e-07, "loss": 0.76822674, "num_input_tokens_seen": 253790360, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 11762, "time_per_iteration": 2.5904414653778076 }, { "auxiliary_loss_clip": 0.01121661, "auxiliary_loss_mlp": 0.0102937, "balance_loss_clip": 1.01769304, "balance_loss_mlp": 1.0355376, "epoch": 0.7072298211333233, "flos": 24131719399680.0, "grad_norm": 1.7508637570678973, "language_loss": 0.76861167, "learning_rate": 7.880549241828604e-07, "loss": 0.79012197, "num_input_tokens_seen": 253810585, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.68359375, "step": 11763, "time_per_iteration": 2.62898850440979 }, { "auxiliary_loss_clip": 0.01131495, "auxiliary_loss_mlp": 0.01285442, "balance_loss_clip": 1.02560353, "balance_loss_mlp": 1.0354054, "epoch": 0.7072899443859912, "flos": 27198418867200.0, "grad_norm": 4.598357108887443, "language_loss": 0.79161048, "learning_rate": 7.877543936027437e-07, "loss": 0.81577986, "num_input_tokens_seen": 253829080, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 11764, "time_per_iteration": 2.651897430419922 }, { "auxiliary_loss_clip": 0.01144329, "auxiliary_loss_mlp": 0.01037078, "balance_loss_clip": 1.02381051, "balance_loss_mlp": 1.03649831, "epoch": 0.7073500676386593, "flos": 16435596384000.0, "grad_norm": 1.6326453783366721, "language_loss": 0.79572701, "learning_rate": 7.874539062845528e-07, "loss": 0.81754112, "num_input_tokens_seen": 253846780, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 11765, "time_per_iteration": 2.535557270050049 }, { "auxiliary_loss_clip": 0.01102337, "auxiliary_loss_mlp": 0.01027313, "balance_loss_clip": 1.01586854, "balance_loss_mlp": 1.03442585, "epoch": 0.7074101908913272, "flos": 27673732563840.0, "grad_norm": 1.7181786341505982, "language_loss": 0.68663168, "learning_rate": 7.871534622390137e-07, "loss": 0.70792818, "num_input_tokens_seen": 253867075, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 11766, "time_per_iteration": 2.563793897628784 }, { "auxiliary_loss_clip": 0.01122384, "auxiliary_loss_mlp": 0.01036726, "balance_loss_clip": 1.02367282, "balance_loss_mlp": 1.03420174, "epoch": 0.7074703141439952, "flos": 22238078296320.0, "grad_norm": 2.009576264951776, "language_loss": 0.64423209, "learning_rate": 7.868530614768478e-07, "loss": 0.66582322, "num_input_tokens_seen": 253885790, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 11767, "time_per_iteration": 2.5144753456115723 }, { "auxiliary_loss_clip": 0.01114202, "auxiliary_loss_mlp": 0.01029092, "balance_loss_clip": 1.01659322, "balance_loss_mlp": 1.03469741, "epoch": 0.7075304373966632, "flos": 29643217234560.0, "grad_norm": 6.7870293120919145, "language_loss": 0.52967048, "learning_rate": 7.865527040087756e-07, "loss": 0.55110347, "num_input_tokens_seen": 253907070, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 11768, "time_per_iteration": 2.6376495361328125 }, { "auxiliary_loss_clip": 0.01120286, "auxiliary_loss_mlp": 0.01276514, "balance_loss_clip": 1.01786482, "balance_loss_mlp": 1.03474474, "epoch": 0.7075905606493311, "flos": 19755214490880.0, "grad_norm": 1.886130265020927, "language_loss": 0.75649762, "learning_rate": 7.862523898455151e-07, "loss": 0.7804656, "num_input_tokens_seen": 253927290, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 11769, "time_per_iteration": 2.5150063037872314 }, { "auxiliary_loss_clip": 0.01044432, "auxiliary_loss_mlp": 0.01002465, "balance_loss_clip": 1.00111222, "balance_loss_mlp": 1.00449729, "epoch": 0.7076506839019991, "flos": 65716132694400.0, "grad_norm": 0.8498749485709713, "language_loss": 0.61992097, "learning_rate": 7.859521189977856e-07, "loss": 0.64038992, "num_input_tokens_seen": 253983440, "router_z_loss_clip": 0.0135498, "router_z_loss_mlp": 0.21679688, "step": 11770, "time_per_iteration": 3.097136974334717 }, { "auxiliary_loss_clip": 0.01134421, "auxiliary_loss_mlp": 0.01033734, "balance_loss_clip": 1.02096677, "balance_loss_mlp": 1.03634644, "epoch": 0.707710807154667, "flos": 23765286804480.0, "grad_norm": 1.6935012612436686, "language_loss": 0.76267087, "learning_rate": 7.856518914763019e-07, "loss": 0.78435242, "num_input_tokens_seen": 254003825, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 11771, "time_per_iteration": 4.004818916320801 }, { "auxiliary_loss_clip": 0.01110277, "auxiliary_loss_mlp": 0.01031184, "balance_loss_clip": 1.01959085, "balance_loss_mlp": 1.03413844, "epoch": 0.7077709304073351, "flos": 21251360712960.0, "grad_norm": 1.447629023729539, "language_loss": 0.71246886, "learning_rate": 7.853517072917786e-07, "loss": 0.7338835, "num_input_tokens_seen": 254023345, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 11772, "time_per_iteration": 2.5173354148864746 }, { "auxiliary_loss_clip": 0.01143406, "auxiliary_loss_mlp": 0.01030139, "balance_loss_clip": 1.01726449, "balance_loss_mlp": 1.03574038, "epoch": 0.707831053660003, "flos": 20740746925440.0, "grad_norm": 1.9483563193003723, "language_loss": 0.69708127, "learning_rate": 7.850515664549278e-07, "loss": 0.7188167, "num_input_tokens_seen": 254041815, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 11773, "time_per_iteration": 2.648911714553833 }, { "auxiliary_loss_clip": 0.01115209, "auxiliary_loss_mlp": 0.01032265, "balance_loss_clip": 1.02024841, "balance_loss_mlp": 1.03563571, "epoch": 0.707891176912671, "flos": 21980993679360.0, "grad_norm": 1.4807589550383808, "language_loss": 0.70121348, "learning_rate": 7.847514689764625e-07, "loss": 0.7226882, "num_input_tokens_seen": 254062065, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 11774, "time_per_iteration": 2.5955934524536133 }, { "auxiliary_loss_clip": 0.01120843, "auxiliary_loss_mlp": 0.01028853, "balance_loss_clip": 1.01711726, "balance_loss_mlp": 1.03644347, "epoch": 0.7079513001653389, "flos": 21068970428160.0, "grad_norm": 1.4710283704894371, "language_loss": 0.74355161, "learning_rate": 7.844514148670909e-07, "loss": 0.76504862, "num_input_tokens_seen": 254080605, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 11775, "time_per_iteration": 2.573528528213501 }, { "auxiliary_loss_clip": 0.01133687, "auxiliary_loss_mlp": 0.012765, "balance_loss_clip": 1.01731884, "balance_loss_mlp": 1.03693497, "epoch": 0.7080114234180069, "flos": 18040659621120.0, "grad_norm": 1.748027694859169, "language_loss": 0.87132776, "learning_rate": 7.841514041375206e-07, "loss": 0.89542967, "num_input_tokens_seen": 254098710, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 11776, "time_per_iteration": 2.580476760864258 }, { "auxiliary_loss_clip": 0.01113397, "auxiliary_loss_mlp": 0.01035055, "balance_loss_clip": 1.02322948, "balance_loss_mlp": 1.03525352, "epoch": 0.7080715466706748, "flos": 15122271409920.0, "grad_norm": 1.5490410432965844, "language_loss": 0.74945575, "learning_rate": 7.838514367984599e-07, "loss": 0.7709403, "num_input_tokens_seen": 254117200, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69140625, "step": 11777, "time_per_iteration": 3.9942126274108887 }, { "auxiliary_loss_clip": 0.01129537, "auxiliary_loss_mlp": 0.01034637, "balance_loss_clip": 1.02219152, "balance_loss_mlp": 1.03846979, "epoch": 0.7081316699233429, "flos": 14422802889600.0, "grad_norm": 2.293574900208399, "language_loss": 0.8232705, "learning_rate": 7.835515128606132e-07, "loss": 0.84491229, "num_input_tokens_seen": 254132115, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.734375, "step": 11778, "time_per_iteration": 2.576681137084961 }, { "auxiliary_loss_clip": 0.01119419, "auxiliary_loss_mlp": 0.01032038, "balance_loss_clip": 1.02108824, "balance_loss_mlp": 1.03583014, "epoch": 0.7081917931760108, "flos": 23222389668480.0, "grad_norm": 1.5007775921854831, "language_loss": 0.84844458, "learning_rate": 7.832516323346839e-07, "loss": 0.86995912, "num_input_tokens_seen": 254152285, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.65625, "step": 11779, "time_per_iteration": 2.6404149532318115 }, { "auxiliary_loss_clip": 0.01125612, "auxiliary_loss_mlp": 0.01034905, "balance_loss_clip": 1.02254319, "balance_loss_mlp": 1.03713548, "epoch": 0.7082519164286788, "flos": 39308429871360.0, "grad_norm": 2.02136138168992, "language_loss": 0.71955216, "learning_rate": 7.829517952313733e-07, "loss": 0.74115735, "num_input_tokens_seen": 254172805, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.70703125, "step": 11780, "time_per_iteration": 2.707073926925659 }, { "auxiliary_loss_clip": 0.01139944, "auxiliary_loss_mlp": 0.01032338, "balance_loss_clip": 1.02036893, "balance_loss_mlp": 1.03541315, "epoch": 0.7083120396813467, "flos": 21651154064640.0, "grad_norm": 2.52009540085245, "language_loss": 0.72873342, "learning_rate": 7.82652001561384e-07, "loss": 0.75045621, "num_input_tokens_seen": 254191890, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 11781, "time_per_iteration": 2.5690958499908447 }, { "auxiliary_loss_clip": 0.01113496, "auxiliary_loss_mlp": 0.0103348, "balance_loss_clip": 1.02139807, "balance_loss_mlp": 1.03594196, "epoch": 0.7083721629340147, "flos": 17567033863680.0, "grad_norm": 1.8639651897547045, "language_loss": 0.77401304, "learning_rate": 7.823522513354117e-07, "loss": 0.79548275, "num_input_tokens_seen": 254210150, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 11782, "time_per_iteration": 2.5371527671813965 }, { "auxiliary_loss_clip": 0.01119766, "auxiliary_loss_mlp": 0.010287, "balance_loss_clip": 1.01771486, "balance_loss_mlp": 1.03416622, "epoch": 0.7084322861866827, "flos": 29350509304320.0, "grad_norm": 2.868073291357221, "language_loss": 0.69819403, "learning_rate": 7.820525445641564e-07, "loss": 0.71967864, "num_input_tokens_seen": 254233015, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.68359375, "step": 11783, "time_per_iteration": 4.758159637451172 }, { "auxiliary_loss_clip": 0.01143564, "auxiliary_loss_mlp": 0.01032448, "balance_loss_clip": 1.01976418, "balance_loss_mlp": 1.03610539, "epoch": 0.7084924094393507, "flos": 20194294343040.0, "grad_norm": 1.6867656962822108, "language_loss": 0.78899604, "learning_rate": 7.817528812583125e-07, "loss": 0.81075621, "num_input_tokens_seen": 254251345, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 11784, "time_per_iteration": 4.111543655395508 }, { "auxiliary_loss_clip": 0.01105343, "auxiliary_loss_mlp": 0.01031131, "balance_loss_clip": 1.01905537, "balance_loss_mlp": 1.03645658, "epoch": 0.7085525326920187, "flos": 23477211728640.0, "grad_norm": 1.8168762516602535, "language_loss": 0.77242351, "learning_rate": 7.81453261428575e-07, "loss": 0.79378819, "num_input_tokens_seen": 254269905, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 11785, "time_per_iteration": 2.581491231918335 }, { "auxiliary_loss_clip": 0.01121816, "auxiliary_loss_mlp": 0.01033896, "balance_loss_clip": 1.0212301, "balance_loss_mlp": 1.03548813, "epoch": 0.7086126559446866, "flos": 25958818558080.0, "grad_norm": 1.8260300453120233, "language_loss": 0.78179103, "learning_rate": 7.811536850856351e-07, "loss": 0.80334812, "num_input_tokens_seen": 254289990, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 11786, "time_per_iteration": 2.705859661102295 }, { "auxiliary_loss_clip": 0.01106246, "auxiliary_loss_mlp": 0.01026587, "balance_loss_clip": 1.01405251, "balance_loss_mlp": 1.03521907, "epoch": 0.7086727791973546, "flos": 26724793109760.0, "grad_norm": 2.1979276308361784, "language_loss": 0.79258633, "learning_rate": 7.808541522401856e-07, "loss": 0.81391466, "num_input_tokens_seen": 254309085, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 11787, "time_per_iteration": 2.6312172412872314 }, { "auxiliary_loss_clip": 0.01114579, "auxiliary_loss_mlp": 0.01027452, "balance_loss_clip": 1.01548314, "balance_loss_mlp": 1.03668118, "epoch": 0.7087329024500225, "flos": 21683365585920.0, "grad_norm": 1.6715755065527056, "language_loss": 0.76738906, "learning_rate": 7.805546629029156e-07, "loss": 0.78880942, "num_input_tokens_seen": 254327045, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69140625, "step": 11788, "time_per_iteration": 2.5990161895751953 }, { "auxiliary_loss_clip": 0.01136677, "auxiliary_loss_mlp": 0.01031364, "balance_loss_clip": 1.0181613, "balance_loss_mlp": 1.0371449, "epoch": 0.7087930257026905, "flos": 17931060247680.0, "grad_norm": 1.9285189830679468, "language_loss": 0.68144774, "learning_rate": 7.802552170845126e-07, "loss": 0.70312816, "num_input_tokens_seen": 254344585, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 11789, "time_per_iteration": 2.6657426357269287 }, { "auxiliary_loss_clip": 0.01152065, "auxiliary_loss_mlp": 0.01033492, "balance_loss_clip": 1.02027726, "balance_loss_mlp": 1.03569961, "epoch": 0.7088531489553584, "flos": 18911528864640.0, "grad_norm": 1.6197764745787042, "language_loss": 0.77544308, "learning_rate": 7.799558147956637e-07, "loss": 0.79729867, "num_input_tokens_seen": 254362470, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 11790, "time_per_iteration": 2.627671003341675 }, { "auxiliary_loss_clip": 0.01127834, "auxiliary_loss_mlp": 0.01028721, "balance_loss_clip": 1.0157156, "balance_loss_mlp": 1.03644574, "epoch": 0.7089132722080265, "flos": 27380880979200.0, "grad_norm": 2.2288345273613253, "language_loss": 0.71371633, "learning_rate": 7.796564560470534e-07, "loss": 0.73528188, "num_input_tokens_seen": 254383190, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.73828125, "step": 11791, "time_per_iteration": 2.5816688537597656 }, { "auxiliary_loss_clip": 0.01115194, "auxiliary_loss_mlp": 0.01033255, "balance_loss_clip": 1.02043986, "balance_loss_mlp": 1.0365715, "epoch": 0.7089733954606944, "flos": 22162917087360.0, "grad_norm": 1.489703632927042, "language_loss": 0.8219161, "learning_rate": 7.793571408493649e-07, "loss": 0.8434006, "num_input_tokens_seen": 254403115, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 11792, "time_per_iteration": 2.5056023597717285 }, { "auxiliary_loss_clip": 0.01134442, "auxiliary_loss_mlp": 0.01031323, "balance_loss_clip": 1.01862752, "balance_loss_mlp": 1.03401947, "epoch": 0.7090335187133624, "flos": 24425827960320.0, "grad_norm": 1.9488414882841525, "language_loss": 0.64651561, "learning_rate": 7.790578692132794e-07, "loss": 0.66817331, "num_input_tokens_seen": 254421875, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 11793, "time_per_iteration": 2.5667600631713867 }, { "auxiliary_loss_clip": 0.0111471, "auxiliary_loss_mlp": 0.0102989, "balance_loss_clip": 1.01772428, "balance_loss_mlp": 1.03693199, "epoch": 0.7090936419660303, "flos": 21835232288640.0, "grad_norm": 1.8816899903193494, "language_loss": 0.70447582, "learning_rate": 7.787586411494788e-07, "loss": 0.72592175, "num_input_tokens_seen": 254440765, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 11794, "time_per_iteration": 2.55570912361145 }, { "auxiliary_loss_clip": 0.01125755, "auxiliary_loss_mlp": 0.0103546, "balance_loss_clip": 1.0235393, "balance_loss_mlp": 1.03934097, "epoch": 0.7091537652186983, "flos": 20082360585600.0, "grad_norm": 1.7461398101167256, "language_loss": 0.7621659, "learning_rate": 7.784594566686409e-07, "loss": 0.78377813, "num_input_tokens_seen": 254459480, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 11795, "time_per_iteration": 2.570291757583618 }, { "auxiliary_loss_clip": 0.01132943, "auxiliary_loss_mlp": 0.01033515, "balance_loss_clip": 1.02160585, "balance_loss_mlp": 1.03656197, "epoch": 0.7092138884713663, "flos": 13151565676800.0, "grad_norm": 3.414105750306028, "language_loss": 0.75069594, "learning_rate": 7.781603157814427e-07, "loss": 0.77236056, "num_input_tokens_seen": 254473985, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 11796, "time_per_iteration": 2.551567316055298 }, { "auxiliary_loss_clip": 0.01121303, "auxiliary_loss_mlp": 0.01038942, "balance_loss_clip": 1.02485776, "balance_loss_mlp": 1.03878784, "epoch": 0.7092740117240343, "flos": 21645982506240.0, "grad_norm": 1.5918571573143507, "language_loss": 0.73922408, "learning_rate": 7.778612184985592e-07, "loss": 0.76082653, "num_input_tokens_seen": 254492135, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 11797, "time_per_iteration": 2.5365381240844727 }, { "auxiliary_loss_clip": 0.01142921, "auxiliary_loss_mlp": 0.01031511, "balance_loss_clip": 1.01947701, "balance_loss_mlp": 1.0371716, "epoch": 0.7093341349767023, "flos": 21032521102080.0, "grad_norm": 1.4150955040684787, "language_loss": 0.7931931, "learning_rate": 7.775621648306665e-07, "loss": 0.81493735, "num_input_tokens_seen": 254512865, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.703125, "step": 11798, "time_per_iteration": 2.5848586559295654 }, { "auxiliary_loss_clip": 0.01122359, "auxiliary_loss_mlp": 0.01032596, "balance_loss_clip": 1.02003098, "balance_loss_mlp": 1.03475761, "epoch": 0.7093942582293702, "flos": 22017658487040.0, "grad_norm": 1.787288040896809, "language_loss": 0.66528583, "learning_rate": 7.772631547884343e-07, "loss": 0.68683541, "num_input_tokens_seen": 254532605, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 11799, "time_per_iteration": 2.5486223697662354 }, { "auxiliary_loss_clip": 0.01105438, "auxiliary_loss_mlp": 0.01028361, "balance_loss_clip": 1.01529574, "balance_loss_mlp": 1.0352422, "epoch": 0.7094543814820382, "flos": 27235586465280.0, "grad_norm": 1.595936682169011, "language_loss": 0.81365848, "learning_rate": 7.769641883825355e-07, "loss": 0.83499652, "num_input_tokens_seen": 254553780, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 11800, "time_per_iteration": 2.6253252029418945 }, { "auxiliary_loss_clip": 0.01120212, "auxiliary_loss_mlp": 0.01028121, "balance_loss_clip": 1.01677203, "balance_loss_mlp": 1.03420377, "epoch": 0.7095145047347061, "flos": 12089148180480.0, "grad_norm": 1.8782889088770085, "language_loss": 0.86662877, "learning_rate": 7.76665265623639e-07, "loss": 0.88811213, "num_input_tokens_seen": 254567510, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.68359375, "step": 11801, "time_per_iteration": 2.5960254669189453 }, { "auxiliary_loss_clip": 0.01106508, "auxiliary_loss_mlp": 0.01035793, "balance_loss_clip": 1.0238961, "balance_loss_mlp": 1.03597033, "epoch": 0.7095746279873741, "flos": 19383789905280.0, "grad_norm": 1.625417194196909, "language_loss": 0.76107621, "learning_rate": 7.763663865224122e-07, "loss": 0.78249925, "num_input_tokens_seen": 254585565, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 11802, "time_per_iteration": 2.590914011001587 }, { "auxiliary_loss_clip": 0.01119575, "auxiliary_loss_mlp": 0.01040075, "balance_loss_clip": 1.02782035, "balance_loss_mlp": 1.03836322, "epoch": 0.709634751240042, "flos": 21360600950400.0, "grad_norm": 1.735718667126886, "language_loss": 0.81677586, "learning_rate": 7.760675510895207e-07, "loss": 0.83837235, "num_input_tokens_seen": 254603465, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.72265625, "step": 11803, "time_per_iteration": 2.5575051307678223 }, { "auxiliary_loss_clip": 0.0110693, "auxiliary_loss_mlp": 0.01033063, "balance_loss_clip": 1.01987815, "balance_loss_mlp": 1.03492737, "epoch": 0.7096948744927101, "flos": 13917037438080.0, "grad_norm": 2.1753060210453232, "language_loss": 0.6769762, "learning_rate": 7.757687593356308e-07, "loss": 0.69837612, "num_input_tokens_seen": 254620500, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 11804, "time_per_iteration": 2.5329318046569824 }, { "auxiliary_loss_clip": 0.01108982, "auxiliary_loss_mlp": 0.01026462, "balance_loss_clip": 1.01537597, "balance_loss_mlp": 1.03420711, "epoch": 0.709754997745378, "flos": 30298335436800.0, "grad_norm": 2.062858658914514, "language_loss": 0.78277498, "learning_rate": 7.754700112714054e-07, "loss": 0.80412942, "num_input_tokens_seen": 254638565, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.65625, "step": 11805, "time_per_iteration": 2.5790860652923584 }, { "auxiliary_loss_clip": 0.01122429, "auxiliary_loss_mlp": 0.01281874, "balance_loss_clip": 1.02214336, "balance_loss_mlp": 1.03461957, "epoch": 0.709815120998046, "flos": 18515147304960.0, "grad_norm": 1.7563869606659863, "language_loss": 0.79188859, "learning_rate": 7.751713069075041e-07, "loss": 0.81593168, "num_input_tokens_seen": 254657505, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 11806, "time_per_iteration": 2.567713499069214 }, { "auxiliary_loss_clip": 0.01124951, "auxiliary_loss_mlp": 0.01036885, "balance_loss_clip": 1.02490449, "balance_loss_mlp": 1.0372535, "epoch": 0.7098752442507139, "flos": 22272588288000.0, "grad_norm": 1.9765868799075095, "language_loss": 0.55847675, "learning_rate": 7.74872646254589e-07, "loss": 0.58009505, "num_input_tokens_seen": 254674730, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69921875, "step": 11807, "time_per_iteration": 2.5355372428894043 }, { "auxiliary_loss_clip": 0.01109125, "auxiliary_loss_mlp": 0.01040106, "balance_loss_clip": 1.02759528, "balance_loss_mlp": 1.03525198, "epoch": 0.7099353675033819, "flos": 19275447507840.0, "grad_norm": 1.9520799608419088, "language_loss": 0.68659663, "learning_rate": 7.745740293233176e-07, "loss": 0.70808887, "num_input_tokens_seen": 254691665, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73828125, "step": 11808, "time_per_iteration": 2.541836977005005 }, { "auxiliary_loss_clip": 0.01115922, "auxiliary_loss_mlp": 0.01031203, "balance_loss_clip": 1.01934791, "balance_loss_mlp": 1.03502679, "epoch": 0.70999549075605, "flos": 21908525990400.0, "grad_norm": 1.9711258161198808, "language_loss": 0.71342254, "learning_rate": 7.742754561243469e-07, "loss": 0.7348938, "num_input_tokens_seen": 254711610, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.71875, "step": 11809, "time_per_iteration": 2.545727252960205 }, { "auxiliary_loss_clip": 0.01026083, "auxiliary_loss_mlp": 0.01002985, "balance_loss_clip": 1.00157833, "balance_loss_mlp": 1.00422716, "epoch": 0.7100556140087179, "flos": 70456053456000.0, "grad_norm": 0.7636868883787757, "language_loss": 0.59441298, "learning_rate": 7.739769266683318e-07, "loss": 0.61470371, "num_input_tokens_seen": 254772615, "router_z_loss_clip": 0.01403809, "router_z_loss_mlp": 0.21875, "step": 11810, "time_per_iteration": 3.258805751800537 }, { "auxiliary_loss_clip": 0.01162095, "auxiliary_loss_mlp": 0.01039508, "balance_loss_clip": 1.027951, "balance_loss_mlp": 1.03872037, "epoch": 0.7101157372613859, "flos": 23039568420480.0, "grad_norm": 1.6053374469188744, "language_loss": 0.7401374, "learning_rate": 7.73678440965927e-07, "loss": 0.76215345, "num_input_tokens_seen": 254791375, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6953125, "step": 11811, "time_per_iteration": 2.627955913543701 }, { "auxiliary_loss_clip": 0.01129042, "auxiliary_loss_mlp": 0.010279, "balance_loss_clip": 1.01612759, "balance_loss_mlp": 1.03541493, "epoch": 0.7101758605140538, "flos": 23185329811200.0, "grad_norm": 1.7848130926165238, "language_loss": 0.83766848, "learning_rate": 7.73379999027784e-07, "loss": 0.85923791, "num_input_tokens_seen": 254809300, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66796875, "step": 11812, "time_per_iteration": 2.579465627670288 }, { "auxiliary_loss_clip": 0.0110775, "auxiliary_loss_mlp": 0.01028918, "balance_loss_clip": 1.01624608, "balance_loss_mlp": 1.03639388, "epoch": 0.7102359837667218, "flos": 23696123166720.0, "grad_norm": 1.4940320902519302, "language_loss": 0.699314, "learning_rate": 7.730816008645537e-07, "loss": 0.72068071, "num_input_tokens_seen": 254829325, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 11813, "time_per_iteration": 3.9543092250823975 }, { "auxiliary_loss_clip": 0.01101442, "auxiliary_loss_mlp": 0.01025011, "balance_loss_clip": 1.01357865, "balance_loss_mlp": 1.03424585, "epoch": 0.7102961070193897, "flos": 19391116279680.0, "grad_norm": 1.8946511733731268, "language_loss": 0.815027, "learning_rate": 7.727832464868846e-07, "loss": 0.83629155, "num_input_tokens_seen": 254847690, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 11814, "time_per_iteration": 2.4927778244018555 }, { "auxiliary_loss_clip": 0.01106398, "auxiliary_loss_mlp": 0.01031776, "balance_loss_clip": 1.01954544, "balance_loss_mlp": 1.03755581, "epoch": 0.7103562302720577, "flos": 21507511576320.0, "grad_norm": 1.7463149268541112, "language_loss": 0.75729799, "learning_rate": 7.724849359054257e-07, "loss": 0.77867973, "num_input_tokens_seen": 254865960, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 11815, "time_per_iteration": 2.667189598083496 }, { "auxiliary_loss_clip": 0.01137416, "auxiliary_loss_mlp": 0.01030475, "balance_loss_clip": 1.01839924, "balance_loss_mlp": 1.03423893, "epoch": 0.7104163535247257, "flos": 14535059869440.0, "grad_norm": 2.716738306086288, "language_loss": 0.78636682, "learning_rate": 7.721866691308208e-07, "loss": 0.80804574, "num_input_tokens_seen": 254882815, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 11816, "time_per_iteration": 2.6467015743255615 }, { "auxiliary_loss_clip": 0.01124529, "auxiliary_loss_mlp": 0.01037963, "balance_loss_clip": 1.0264771, "balance_loss_mlp": 1.03652453, "epoch": 0.7104764767773937, "flos": 11400310085760.0, "grad_norm": 1.7492508419003758, "language_loss": 0.86436713, "learning_rate": 7.718884461737159e-07, "loss": 0.88599205, "num_input_tokens_seen": 254898705, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.69921875, "step": 11817, "time_per_iteration": 2.505369186401367 }, { "auxiliary_loss_clip": 0.01138406, "auxiliary_loss_mlp": 0.01030468, "balance_loss_clip": 1.01919103, "balance_loss_mlp": 1.03485107, "epoch": 0.7105366000300616, "flos": 11690432236800.0, "grad_norm": 2.1327500732516915, "language_loss": 0.84393239, "learning_rate": 7.715902670447532e-07, "loss": 0.86562115, "num_input_tokens_seen": 254913665, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6796875, "step": 11818, "time_per_iteration": 2.5007612705230713 }, { "auxiliary_loss_clip": 0.01103738, "auxiliary_loss_mlp": 0.01031689, "balance_loss_clip": 1.01837349, "balance_loss_mlp": 1.03354025, "epoch": 0.7105967232827296, "flos": 19354020508800.0, "grad_norm": 2.0439363685234118, "language_loss": 0.75300723, "learning_rate": 7.712921317545742e-07, "loss": 0.77436149, "num_input_tokens_seen": 254932140, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 11819, "time_per_iteration": 4.0022194385528564 }, { "auxiliary_loss_clip": 0.0114127, "auxiliary_loss_mlp": 0.01030375, "balance_loss_clip": 1.01663625, "balance_loss_mlp": 1.03558362, "epoch": 0.7106568465353975, "flos": 22930400010240.0, "grad_norm": 1.5996706214097756, "language_loss": 0.70785344, "learning_rate": 7.709940403138182e-07, "loss": 0.72956991, "num_input_tokens_seen": 254951580, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.703125, "step": 11820, "time_per_iteration": 2.6236159801483154 }, { "auxiliary_loss_clip": 0.0111366, "auxiliary_loss_mlp": 0.01027167, "balance_loss_clip": 1.0155561, "balance_loss_mlp": 1.03638029, "epoch": 0.7107169697880655, "flos": 19099665325440.0, "grad_norm": 1.8580054009739655, "language_loss": 0.75403696, "learning_rate": 7.706959927331232e-07, "loss": 0.77544528, "num_input_tokens_seen": 254969425, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 11821, "time_per_iteration": 2.50443959236145 }, { "auxiliary_loss_clip": 0.01062844, "auxiliary_loss_mlp": 0.01000329, "balance_loss_clip": 0.99894601, "balance_loss_mlp": 1.00484872, "epoch": 0.7107770930407336, "flos": 63638054231040.0, "grad_norm": 0.7701085806471544, "language_loss": 0.55130595, "learning_rate": 7.703979890231272e-07, "loss": 0.57193762, "num_input_tokens_seen": 255032680, "router_z_loss_clip": 0.01385498, "router_z_loss_mlp": 0.21875, "step": 11822, "time_per_iteration": 3.2302303314208984 }, { "auxiliary_loss_clip": 0.01108122, "auxiliary_loss_mlp": 0.01026314, "balance_loss_clip": 1.0139457, "balance_loss_mlp": 1.03375149, "epoch": 0.7108372162934015, "flos": 22054466949120.0, "grad_norm": 2.5217083164706215, "language_loss": 0.6033814, "learning_rate": 7.701000291944626e-07, "loss": 0.62472576, "num_input_tokens_seen": 255054400, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.65625, "step": 11823, "time_per_iteration": 2.6623587608337402 }, { "auxiliary_loss_clip": 0.01113449, "auxiliary_loss_mlp": 0.01029673, "balance_loss_clip": 1.01816297, "balance_loss_mlp": 1.03576398, "epoch": 0.7108973395460695, "flos": 19135144984320.0, "grad_norm": 1.8599360831511593, "language_loss": 0.71102273, "learning_rate": 7.69802113257765e-07, "loss": 0.732454, "num_input_tokens_seen": 255072785, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.69140625, "step": 11824, "time_per_iteration": 4.365592002868652 }, { "auxiliary_loss_clip": 0.01117458, "auxiliary_loss_mlp": 0.01276982, "balance_loss_clip": 1.01931906, "balance_loss_mlp": 1.03400421, "epoch": 0.7109574627987374, "flos": 17894431353600.0, "grad_norm": 1.9024229087549518, "language_loss": 0.72735304, "learning_rate": 7.695042412236656e-07, "loss": 0.75129747, "num_input_tokens_seen": 255091820, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.65234375, "step": 11825, "time_per_iteration": 4.030904769897461 }, { "auxiliary_loss_clip": 0.01123354, "auxiliary_loss_mlp": 0.01029484, "balance_loss_clip": 1.01767671, "balance_loss_mlp": 1.03476834, "epoch": 0.7110175860514054, "flos": 28979623422720.0, "grad_norm": 1.70517339726715, "language_loss": 0.79601276, "learning_rate": 7.692064131027947e-07, "loss": 0.81754112, "num_input_tokens_seen": 255111720, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.703125, "step": 11826, "time_per_iteration": 2.6939048767089844 }, { "auxiliary_loss_clip": 0.01110329, "auxiliary_loss_mlp": 0.01030946, "balance_loss_clip": 1.01963353, "balance_loss_mlp": 1.03329062, "epoch": 0.7110777093040733, "flos": 26173312623360.0, "grad_norm": 1.4849753262395553, "language_loss": 0.83138001, "learning_rate": 7.6890862890578e-07, "loss": 0.85279274, "num_input_tokens_seen": 255133495, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.68359375, "step": 11827, "time_per_iteration": 2.614384651184082 }, { "auxiliary_loss_clip": 0.01131596, "auxiliary_loss_mlp": 0.01033375, "balance_loss_clip": 1.0207926, "balance_loss_mlp": 1.03540254, "epoch": 0.7111378325567413, "flos": 26869943969280.0, "grad_norm": 1.3986444345116398, "language_loss": 0.62295651, "learning_rate": 7.686108886432512e-07, "loss": 0.64460623, "num_input_tokens_seen": 255156880, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 11828, "time_per_iteration": 2.6252548694610596 }, { "auxiliary_loss_clip": 0.01103352, "auxiliary_loss_mlp": 0.01031925, "balance_loss_clip": 1.01950359, "balance_loss_mlp": 1.0346632, "epoch": 0.7111979558094093, "flos": 27271820309760.0, "grad_norm": 1.6069991454547277, "language_loss": 0.72011918, "learning_rate": 7.683131923258308e-07, "loss": 0.74147195, "num_input_tokens_seen": 255178920, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 11829, "time_per_iteration": 2.5329301357269287 }, { "auxiliary_loss_clip": 0.01102506, "auxiliary_loss_mlp": 0.01030159, "balance_loss_clip": 1.01865506, "balance_loss_mlp": 1.03458881, "epoch": 0.7112580790620773, "flos": 25046938961280.0, "grad_norm": 1.9476843311544385, "language_loss": 0.80775827, "learning_rate": 7.680155399641448e-07, "loss": 0.82908487, "num_input_tokens_seen": 255198095, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 11830, "time_per_iteration": 2.589531660079956 }, { "auxiliary_loss_clip": 0.01113965, "auxiliary_loss_mlp": 0.01037453, "balance_loss_clip": 1.02577043, "balance_loss_mlp": 1.03563261, "epoch": 0.7113182023147452, "flos": 21646628951040.0, "grad_norm": 1.7565609812808818, "language_loss": 0.84147286, "learning_rate": 7.677179315688147e-07, "loss": 0.86298704, "num_input_tokens_seen": 255215860, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 11831, "time_per_iteration": 2.5011372566223145 }, { "auxiliary_loss_clip": 0.01122757, "auxiliary_loss_mlp": 0.01030431, "balance_loss_clip": 1.01874304, "balance_loss_mlp": 1.03557265, "epoch": 0.7113783255674132, "flos": 20996287257600.0, "grad_norm": 1.9888686518513123, "language_loss": 0.76874387, "learning_rate": 7.67420367150463e-07, "loss": 0.79027581, "num_input_tokens_seen": 255235425, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 11832, "time_per_iteration": 2.5258491039276123 }, { "auxiliary_loss_clip": 0.01035728, "auxiliary_loss_mlp": 0.01001806, "balance_loss_clip": 1.00050068, "balance_loss_mlp": 1.0051477, "epoch": 0.7114384488200811, "flos": 66771080161920.0, "grad_norm": 0.7452922641182331, "language_loss": 0.56553459, "learning_rate": 7.671228467197069e-07, "loss": 0.58590996, "num_input_tokens_seen": 255291680, "router_z_loss_clip": 0.01306152, "router_z_loss_mlp": 0.21875, "step": 11833, "time_per_iteration": 3.040184736251831 }, { "auxiliary_loss_clip": 0.01136649, "auxiliary_loss_mlp": 0.01035306, "balance_loss_clip": 1.02384996, "balance_loss_mlp": 1.03283668, "epoch": 0.7114985720727491, "flos": 25010058672000.0, "grad_norm": 1.6024430508798548, "language_loss": 0.70572823, "learning_rate": 7.668253702871652e-07, "loss": 0.72744781, "num_input_tokens_seen": 255313880, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 11834, "time_per_iteration": 2.6852872371673584 }, { "auxiliary_loss_clip": 0.01107885, "auxiliary_loss_mlp": 0.01029961, "balance_loss_clip": 1.01806939, "balance_loss_mlp": 1.03745592, "epoch": 0.7115586953254172, "flos": 21470128496640.0, "grad_norm": 1.9518086571404272, "language_loss": 0.79083204, "learning_rate": 7.665279378634548e-07, "loss": 0.81221044, "num_input_tokens_seen": 255332390, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 11835, "time_per_iteration": 2.489264965057373 }, { "auxiliary_loss_clip": 0.01110015, "auxiliary_loss_mlp": 0.01029063, "balance_loss_clip": 1.017887, "balance_loss_mlp": 1.03361928, "epoch": 0.7116188185780851, "flos": 28622600190720.0, "grad_norm": 1.9342752116002764, "language_loss": 0.75960374, "learning_rate": 7.662305494591883e-07, "loss": 0.78099453, "num_input_tokens_seen": 255354025, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6796875, "step": 11836, "time_per_iteration": 2.634476661682129 }, { "auxiliary_loss_clip": 0.0115537, "auxiliary_loss_mlp": 0.01031706, "balance_loss_clip": 1.01950479, "balance_loss_mlp": 1.03339016, "epoch": 0.7116789418307531, "flos": 25293608634240.0, "grad_norm": 1.9340320157943138, "language_loss": 0.70376325, "learning_rate": 7.659332050849803e-07, "loss": 0.72563404, "num_input_tokens_seen": 255371400, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 11837, "time_per_iteration": 2.579958915710449 }, { "auxiliary_loss_clip": 0.01117838, "auxiliary_loss_mlp": 0.01037258, "balance_loss_clip": 1.02368557, "balance_loss_mlp": 1.03709733, "epoch": 0.711739065083421, "flos": 25557301353600.0, "grad_norm": 2.152306319386469, "language_loss": 0.61575806, "learning_rate": 7.656359047514411e-07, "loss": 0.63730901, "num_input_tokens_seen": 255390710, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 11838, "time_per_iteration": 2.617579936981201 }, { "auxiliary_loss_clip": 0.01114565, "auxiliary_loss_mlp": 0.0103013, "balance_loss_clip": 1.01807809, "balance_loss_mlp": 1.03659749, "epoch": 0.711799188336089, "flos": 26140993361280.0, "grad_norm": 2.1112926060642025, "language_loss": 0.6749711, "learning_rate": 7.653386484691828e-07, "loss": 0.69641805, "num_input_tokens_seen": 255408790, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 11839, "time_per_iteration": 2.527482509613037 }, { "auxiliary_loss_clip": 0.01106512, "auxiliary_loss_mlp": 0.01031924, "balance_loss_clip": 1.02038479, "balance_loss_mlp": 1.03731513, "epoch": 0.7118593115887569, "flos": 21140648017920.0, "grad_norm": 1.8781260133856599, "language_loss": 0.83995545, "learning_rate": 7.650414362488107e-07, "loss": 0.86133981, "num_input_tokens_seen": 255426280, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.69140625, "step": 11840, "time_per_iteration": 2.54530930519104 }, { "auxiliary_loss_clip": 0.01126254, "auxiliary_loss_mlp": 0.01032901, "balance_loss_clip": 1.02081907, "balance_loss_mlp": 1.03661656, "epoch": 0.711919434841425, "flos": 14975684006400.0, "grad_norm": 1.9361949051405627, "language_loss": 0.76141781, "learning_rate": 7.647442681009337e-07, "loss": 0.78300935, "num_input_tokens_seen": 255442935, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.71875, "step": 11841, "time_per_iteration": 2.521681070327759 }, { "auxiliary_loss_clip": 0.01123897, "auxiliary_loss_mlp": 0.01031827, "balance_loss_clip": 1.01919055, "balance_loss_mlp": 1.03591228, "epoch": 0.7119795580940929, "flos": 16508997826560.0, "grad_norm": 1.9330254708257504, "language_loss": 0.75409484, "learning_rate": 7.644471440361564e-07, "loss": 0.77565205, "num_input_tokens_seen": 255460925, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 11842, "time_per_iteration": 2.531212091445923 }, { "auxiliary_loss_clip": 0.01036131, "auxiliary_loss_mlp": 0.0100165, "balance_loss_clip": 1.00036895, "balance_loss_mlp": 1.005198, "epoch": 0.7120396813467609, "flos": 66570736055040.0, "grad_norm": 0.8051751865308565, "language_loss": 0.61647558, "learning_rate": 7.641500640650825e-07, "loss": 0.63685346, "num_input_tokens_seen": 255521360, "router_z_loss_clip": 0.01281738, "router_z_loss_mlp": 0.21777344, "step": 11843, "time_per_iteration": 3.0973124504089355 }, { "auxiliary_loss_clip": 0.01102129, "auxiliary_loss_mlp": 0.01026254, "balance_loss_clip": 1.01436853, "balance_loss_mlp": 1.03441238, "epoch": 0.7120998045994288, "flos": 26432731624320.0, "grad_norm": 1.5153154592913363, "language_loss": 0.805089, "learning_rate": 7.638530281983128e-07, "loss": 0.8263728, "num_input_tokens_seen": 255541435, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.67578125, "step": 11844, "time_per_iteration": 2.5261664390563965 }, { "auxiliary_loss_clip": 0.01138675, "auxiliary_loss_mlp": 0.0103047, "balance_loss_clip": 1.01803648, "balance_loss_mlp": 1.03374124, "epoch": 0.7121599278520968, "flos": 16427982700800.0, "grad_norm": 2.1319269976083803, "language_loss": 0.78510499, "learning_rate": 7.635560364464504e-07, "loss": 0.80679643, "num_input_tokens_seen": 255558505, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 11845, "time_per_iteration": 2.515753746032715 }, { "auxiliary_loss_clip": 0.01111816, "auxiliary_loss_mlp": 0.01032569, "balance_loss_clip": 1.02070165, "balance_loss_mlp": 1.03547335, "epoch": 0.7122200511047647, "flos": 28949889939840.0, "grad_norm": 1.9943877285797824, "language_loss": 0.77085745, "learning_rate": 7.632590888200912e-07, "loss": 0.7923013, "num_input_tokens_seen": 255577815, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 11846, "time_per_iteration": 2.5883853435516357 }, { "auxiliary_loss_clip": 0.01114798, "auxiliary_loss_mlp": 0.01034142, "balance_loss_clip": 1.02175069, "balance_loss_mlp": 1.0359447, "epoch": 0.7122801743574327, "flos": 16471866142080.0, "grad_norm": 2.05848082962144, "language_loss": 0.58342695, "learning_rate": 7.629621853298343e-07, "loss": 0.60491639, "num_input_tokens_seen": 255595885, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 11847, "time_per_iteration": 2.479625940322876 }, { "auxiliary_loss_clip": 0.01108875, "auxiliary_loss_mlp": 0.01278412, "balance_loss_clip": 1.02016115, "balance_loss_mlp": 1.03464401, "epoch": 0.7123402976101008, "flos": 20631039811200.0, "grad_norm": 1.4312128529234722, "language_loss": 0.71473873, "learning_rate": 7.626653259862743e-07, "loss": 0.73861164, "num_input_tokens_seen": 255616750, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.65625, "step": 11848, "time_per_iteration": 2.5034146308898926 }, { "auxiliary_loss_clip": 0.01035405, "auxiliary_loss_mlp": 0.01004179, "balance_loss_clip": 1.00291526, "balance_loss_mlp": 1.00462723, "epoch": 0.7124004208627687, "flos": 62325734837760.0, "grad_norm": 0.8418734316492189, "language_loss": 0.63072622, "learning_rate": 7.623685108000075e-07, "loss": 0.65112203, "num_input_tokens_seen": 255677900, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.21679688, "step": 11849, "time_per_iteration": 3.097914695739746 }, { "auxiliary_loss_clip": 0.01122916, "auxiliary_loss_mlp": 0.01032264, "balance_loss_clip": 1.02036715, "balance_loss_mlp": 1.03467309, "epoch": 0.7124605441154367, "flos": 39675975788160.0, "grad_norm": 1.4720323762458247, "language_loss": 0.641132, "learning_rate": 7.620717397816243e-07, "loss": 0.66268378, "num_input_tokens_seen": 255699140, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69921875, "step": 11850, "time_per_iteration": 2.642489194869995 }, { "auxiliary_loss_clip": 0.01113173, "auxiliary_loss_mlp": 0.01031371, "balance_loss_clip": 1.01918793, "balance_loss_mlp": 1.03489375, "epoch": 0.7125206673681046, "flos": 28181868312960.0, "grad_norm": 1.7030019155611542, "language_loss": 0.69835365, "learning_rate": 7.617750129417157e-07, "loss": 0.71979916, "num_input_tokens_seen": 255719640, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 11851, "time_per_iteration": 2.557105541229248 }, { "auxiliary_loss_clip": 0.01112801, "auxiliary_loss_mlp": 0.01031724, "balance_loss_clip": 1.020298, "balance_loss_mlp": 1.03600049, "epoch": 0.7125807906207726, "flos": 26176939896960.0, "grad_norm": 1.531180213150648, "language_loss": 0.83078408, "learning_rate": 7.614783302908731e-07, "loss": 0.85222936, "num_input_tokens_seen": 255740450, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 11852, "time_per_iteration": 2.550132989883423 }, { "auxiliary_loss_clip": 0.0111716, "auxiliary_loss_mlp": 0.01280401, "balance_loss_clip": 1.01990151, "balance_loss_mlp": 1.03657866, "epoch": 0.7126409138734405, "flos": 17157328358400.0, "grad_norm": 2.256863386170249, "language_loss": 0.73007369, "learning_rate": 7.611816918396816e-07, "loss": 0.7540493, "num_input_tokens_seen": 255758070, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71484375, "step": 11853, "time_per_iteration": 2.5131916999816895 }, { "auxiliary_loss_clip": 0.01126121, "auxiliary_loss_mlp": 0.01030189, "balance_loss_clip": 1.01913846, "balance_loss_mlp": 1.03378212, "epoch": 0.7127010371261085, "flos": 18769933451520.0, "grad_norm": 1.7880579049078151, "language_loss": 0.92009431, "learning_rate": 7.608850975987297e-07, "loss": 0.94165742, "num_input_tokens_seen": 255775685, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.65625, "step": 11854, "time_per_iteration": 3.953376293182373 }, { "auxiliary_loss_clip": 0.01110657, "auxiliary_loss_mlp": 0.01028935, "balance_loss_clip": 1.01735425, "balance_loss_mlp": 1.03425813, "epoch": 0.7127611603787765, "flos": 20376433232640.0, "grad_norm": 2.0024845126844233, "language_loss": 0.795488, "learning_rate": 7.605885475786007e-07, "loss": 0.81688398, "num_input_tokens_seen": 255794750, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 11855, "time_per_iteration": 2.7039988040924072 }, { "auxiliary_loss_clip": 0.0113934, "auxiliary_loss_mlp": 0.0103925, "balance_loss_clip": 1.02631569, "balance_loss_mlp": 1.03435183, "epoch": 0.7128212836314445, "flos": 20449008662400.0, "grad_norm": 1.9593871500505329, "language_loss": 0.72931242, "learning_rate": 7.602920417898795e-07, "loss": 0.75109828, "num_input_tokens_seen": 255813325, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 11856, "time_per_iteration": 2.5981574058532715 }, { "auxiliary_loss_clip": 0.01117111, "auxiliary_loss_mlp": 0.01029493, "balance_loss_clip": 1.01736355, "balance_loss_mlp": 1.03709257, "epoch": 0.7128814068841124, "flos": 23440834229760.0, "grad_norm": 1.8683770285342896, "language_loss": 0.70076203, "learning_rate": 7.59995580243145e-07, "loss": 0.72222805, "num_input_tokens_seen": 255832470, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7109375, "step": 11857, "time_per_iteration": 2.6373836994171143 }, { "auxiliary_loss_clip": 0.0111109, "auxiliary_loss_mlp": 0.01030059, "balance_loss_clip": 1.01799488, "balance_loss_mlp": 1.03441644, "epoch": 0.7129415301367804, "flos": 18222942165120.0, "grad_norm": 2.1612035172043145, "language_loss": 0.85262126, "learning_rate": 7.596991629489793e-07, "loss": 0.8740328, "num_input_tokens_seen": 255849740, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 11858, "time_per_iteration": 2.6002795696258545 }, { "auxiliary_loss_clip": 0.0111116, "auxiliary_loss_mlp": 0.01033805, "balance_loss_clip": 1.02083552, "balance_loss_mlp": 1.0383364, "epoch": 0.7130016533894483, "flos": 15523896355200.0, "grad_norm": 1.7528317893196026, "language_loss": 0.80047166, "learning_rate": 7.594027899179602e-07, "loss": 0.82192135, "num_input_tokens_seen": 255866975, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 11859, "time_per_iteration": 2.5197157859802246 }, { "auxiliary_loss_clip": 0.01119617, "auxiliary_loss_mlp": 0.01032192, "balance_loss_clip": 1.02009237, "balance_loss_mlp": 1.03427458, "epoch": 0.7130617766421163, "flos": 57115668960000.0, "grad_norm": 1.2023873753311076, "language_loss": 0.69035625, "learning_rate": 7.591064611606642e-07, "loss": 0.71187437, "num_input_tokens_seen": 255892915, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.67578125, "step": 11860, "time_per_iteration": 4.2938172817230225 }, { "auxiliary_loss_clip": 0.01113237, "auxiliary_loss_mlp": 0.01030824, "balance_loss_clip": 1.01765752, "balance_loss_mlp": 1.03500617, "epoch": 0.7131218998947844, "flos": 19788252024960.0, "grad_norm": 1.5937089423986583, "language_loss": 0.64352858, "learning_rate": 7.58810176687666e-07, "loss": 0.66496909, "num_input_tokens_seen": 255911480, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 11861, "time_per_iteration": 2.5443172454833984 }, { "auxiliary_loss_clip": 0.01128368, "auxiliary_loss_mlp": 0.01030418, "balance_loss_clip": 1.01788318, "balance_loss_mlp": 1.03830588, "epoch": 0.7131820231474523, "flos": 26651894457600.0, "grad_norm": 1.7844200917849005, "language_loss": 0.67052126, "learning_rate": 7.585139365095412e-07, "loss": 0.69210911, "num_input_tokens_seen": 255931140, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 11862, "time_per_iteration": 2.5471019744873047 }, { "auxiliary_loss_clip": 0.01121272, "auxiliary_loss_mlp": 0.01034936, "balance_loss_clip": 1.02393913, "balance_loss_mlp": 1.03575385, "epoch": 0.7132421464001203, "flos": 29205609840000.0, "grad_norm": 2.8096153851315697, "language_loss": 0.66721177, "learning_rate": 7.582177406368591e-07, "loss": 0.68877387, "num_input_tokens_seen": 255951665, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.6796875, "step": 11863, "time_per_iteration": 2.636215925216675 }, { "auxiliary_loss_clip": 0.01148566, "auxiliary_loss_mlp": 0.01037473, "balance_loss_clip": 1.02450323, "balance_loss_mlp": 1.03895211, "epoch": 0.7133022696527882, "flos": 23073611535360.0, "grad_norm": 2.151745860807047, "language_loss": 0.65835744, "learning_rate": 7.579215890801923e-07, "loss": 0.68021786, "num_input_tokens_seen": 255970055, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 11864, "time_per_iteration": 2.604266405105591 }, { "auxiliary_loss_clip": 0.0111728, "auxiliary_loss_mlp": 0.01031219, "balance_loss_clip": 1.01855278, "balance_loss_mlp": 1.03707409, "epoch": 0.7133623929054562, "flos": 17457111267840.0, "grad_norm": 2.1057716902034893, "language_loss": 0.85704237, "learning_rate": 7.576254818501091e-07, "loss": 0.8785274, "num_input_tokens_seen": 255987720, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 11865, "time_per_iteration": 2.561349391937256 }, { "auxiliary_loss_clip": 0.01132763, "auxiliary_loss_mlp": 0.01030251, "balance_loss_clip": 1.01750779, "balance_loss_mlp": 1.0354749, "epoch": 0.7134225161581241, "flos": 19536554448000.0, "grad_norm": 2.1194133879515964, "language_loss": 0.74533874, "learning_rate": 7.573294189571766e-07, "loss": 0.76696885, "num_input_tokens_seen": 256005490, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11866, "time_per_iteration": 4.1267991065979 }, { "auxiliary_loss_clip": 0.01137033, "auxiliary_loss_mlp": 0.01036221, "balance_loss_clip": 1.02272606, "balance_loss_mlp": 1.03868651, "epoch": 0.7134826394107922, "flos": 26250089944320.0, "grad_norm": 2.0576537706812124, "language_loss": 0.70485806, "learning_rate": 7.570334004119606e-07, "loss": 0.72659057, "num_input_tokens_seen": 256026030, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 11867, "time_per_iteration": 4.046630859375 }, { "auxiliary_loss_clip": 0.01112818, "auxiliary_loss_mlp": 0.0102713, "balance_loss_clip": 1.01518512, "balance_loss_mlp": 1.03593969, "epoch": 0.7135427626634601, "flos": 15815311395840.0, "grad_norm": 2.166506419290184, "language_loss": 0.71737927, "learning_rate": 7.567374262250246e-07, "loss": 0.73877871, "num_input_tokens_seen": 256043680, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 11868, "time_per_iteration": 2.614428997039795 }, { "auxiliary_loss_clip": 0.01145749, "auxiliary_loss_mlp": 0.01030981, "balance_loss_clip": 1.01663458, "balance_loss_mlp": 1.03801942, "epoch": 0.7136028859161281, "flos": 18223409041920.0, "grad_norm": 2.804519444142392, "language_loss": 0.66409397, "learning_rate": 7.56441496406933e-07, "loss": 0.68586123, "num_input_tokens_seen": 256059705, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7265625, "step": 11869, "time_per_iteration": 2.5060248374938965 }, { "auxiliary_loss_clip": 0.01121961, "auxiliary_loss_mlp": 0.01026185, "balance_loss_clip": 1.01377487, "balance_loss_mlp": 1.03300238, "epoch": 0.713663009168796, "flos": 24314827956480.0, "grad_norm": 1.4612073260691802, "language_loss": 0.77738035, "learning_rate": 7.561456109682442e-07, "loss": 0.79886186, "num_input_tokens_seen": 256079785, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 11870, "time_per_iteration": 2.596874237060547 }, { "auxiliary_loss_clip": 0.01142227, "auxiliary_loss_mlp": 0.01032947, "balance_loss_clip": 1.01991725, "balance_loss_mlp": 1.03787649, "epoch": 0.713723132421464, "flos": 26538488242560.0, "grad_norm": 1.6527500352445763, "language_loss": 0.81195962, "learning_rate": 7.558497699195198e-07, "loss": 0.83371139, "num_input_tokens_seen": 256099000, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 11871, "time_per_iteration": 2.5478122234344482 }, { "auxiliary_loss_clip": 0.01133758, "auxiliary_loss_mlp": 0.01034512, "balance_loss_clip": 1.02134573, "balance_loss_mlp": 1.03523004, "epoch": 0.7137832556741319, "flos": 19865675790720.0, "grad_norm": 1.6039301163496564, "language_loss": 0.79065287, "learning_rate": 7.55553973271317e-07, "loss": 0.81233555, "num_input_tokens_seen": 256117985, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 11872, "time_per_iteration": 2.5169222354888916 }, { "auxiliary_loss_clip": 0.01116407, "auxiliary_loss_mlp": 0.01028802, "balance_loss_clip": 1.01517689, "balance_loss_mlp": 1.03622925, "epoch": 0.7138433789267999, "flos": 21688932193920.0, "grad_norm": 2.2971235781654964, "language_loss": 0.84048617, "learning_rate": 7.552582210341916e-07, "loss": 0.86193824, "num_input_tokens_seen": 256134350, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 11873, "time_per_iteration": 2.505392551422119 }, { "auxiliary_loss_clip": 0.01128589, "auxiliary_loss_mlp": 0.01030433, "balance_loss_clip": 1.0191555, "balance_loss_mlp": 1.03397846, "epoch": 0.713903502179468, "flos": 17602729004160.0, "grad_norm": 1.9709182932825395, "language_loss": 0.8633306, "learning_rate": 7.549625132186976e-07, "loss": 0.88492084, "num_input_tokens_seen": 256150610, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6796875, "step": 11874, "time_per_iteration": 2.60122013092041 }, { "auxiliary_loss_clip": 0.0111418, "auxiliary_loss_mlp": 0.0103251, "balance_loss_clip": 1.01995134, "balance_loss_mlp": 1.03425479, "epoch": 0.7139636254321359, "flos": 18040336398720.0, "grad_norm": 1.6763900003875067, "language_loss": 0.82673919, "learning_rate": 7.546668498353896e-07, "loss": 0.8482061, "num_input_tokens_seen": 256168620, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 11875, "time_per_iteration": 2.48189377784729 }, { "auxiliary_loss_clip": 0.01128714, "auxiliary_loss_mlp": 0.01037596, "balance_loss_clip": 1.02243233, "balance_loss_mlp": 1.03546357, "epoch": 0.7140237486848039, "flos": 23331127115520.0, "grad_norm": 1.9356513840782914, "language_loss": 0.69728005, "learning_rate": 7.543712308948185e-07, "loss": 0.71894312, "num_input_tokens_seen": 256186700, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7578125, "step": 11876, "time_per_iteration": 2.531872272491455 }, { "auxiliary_loss_clip": 0.01109388, "auxiliary_loss_mlp": 0.01033522, "balance_loss_clip": 1.020576, "balance_loss_mlp": 1.03761506, "epoch": 0.7140838719374718, "flos": 16837077674880.0, "grad_norm": 3.1909855581349573, "language_loss": 0.77626681, "learning_rate": 7.540756564075341e-07, "loss": 0.79769593, "num_input_tokens_seen": 256205390, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 11877, "time_per_iteration": 2.449892520904541 }, { "auxiliary_loss_clip": 0.01113809, "auxiliary_loss_mlp": 0.01033418, "balance_loss_clip": 1.0215323, "balance_loss_mlp": 1.03572249, "epoch": 0.7141439951901398, "flos": 21142012734720.0, "grad_norm": 1.8453954476181504, "language_loss": 0.69249851, "learning_rate": 7.537801263840837e-07, "loss": 0.71397078, "num_input_tokens_seen": 256224575, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 11878, "time_per_iteration": 2.551666021347046 }, { "auxiliary_loss_clip": 0.0113121, "auxiliary_loss_mlp": 0.01036195, "balance_loss_clip": 1.02491808, "balance_loss_mlp": 1.03657627, "epoch": 0.7142041184428077, "flos": 24717709877760.0, "grad_norm": 1.7807704784171283, "language_loss": 0.67511797, "learning_rate": 7.534846408350163e-07, "loss": 0.69679201, "num_input_tokens_seen": 256242130, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.68359375, "step": 11879, "time_per_iteration": 2.6251697540283203 }, { "auxiliary_loss_clip": 0.01142289, "auxiliary_loss_mlp": 0.01036197, "balance_loss_clip": 1.02307212, "balance_loss_mlp": 1.03597534, "epoch": 0.7142642416954758, "flos": 21908202768000.0, "grad_norm": 8.83514100779987, "language_loss": 0.68912983, "learning_rate": 7.531891997708752e-07, "loss": 0.71091473, "num_input_tokens_seen": 256261920, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 11880, "time_per_iteration": 2.6399030685424805 }, { "auxiliary_loss_clip": 0.01125412, "auxiliary_loss_mlp": 0.01034571, "balance_loss_clip": 1.02128482, "balance_loss_mlp": 1.0351367, "epoch": 0.7143243649481437, "flos": 20805636844800.0, "grad_norm": 2.2511668828825497, "language_loss": 0.80531096, "learning_rate": 7.528938032022036e-07, "loss": 0.82691085, "num_input_tokens_seen": 256277970, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 11881, "time_per_iteration": 2.501676559448242 }, { "auxiliary_loss_clip": 0.01130984, "auxiliary_loss_mlp": 0.01031283, "balance_loss_clip": 1.01974916, "balance_loss_mlp": 1.03459978, "epoch": 0.7143844882008117, "flos": 27235011847680.0, "grad_norm": 1.4554070194750042, "language_loss": 0.63588011, "learning_rate": 7.525984511395449e-07, "loss": 0.65750277, "num_input_tokens_seen": 256298205, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6953125, "step": 11882, "time_per_iteration": 2.6325461864471436 }, { "auxiliary_loss_clip": 0.01109784, "auxiliary_loss_mlp": 0.01031307, "balance_loss_clip": 1.0190165, "balance_loss_mlp": 1.03775096, "epoch": 0.7144446114534796, "flos": 17929623703680.0, "grad_norm": 3.6056905493032176, "language_loss": 0.68625236, "learning_rate": 7.523031435934386e-07, "loss": 0.7076633, "num_input_tokens_seen": 256316685, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71875, "step": 11883, "time_per_iteration": 2.535219430923462 }, { "auxiliary_loss_clip": 0.0111563, "auxiliary_loss_mlp": 0.01036764, "balance_loss_clip": 1.02407455, "balance_loss_mlp": 1.03771675, "epoch": 0.7145047347061476, "flos": 20740962407040.0, "grad_norm": 1.9165060824862723, "language_loss": 0.77753699, "learning_rate": 7.520078805744239e-07, "loss": 0.79906094, "num_input_tokens_seen": 256334205, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 11884, "time_per_iteration": 2.526987314224243 }, { "auxiliary_loss_clip": 0.01119606, "auxiliary_loss_mlp": 0.01033537, "balance_loss_clip": 1.01919055, "balance_loss_mlp": 1.03561473, "epoch": 0.7145648579588155, "flos": 21178605715200.0, "grad_norm": 2.16553878003356, "language_loss": 0.73754209, "learning_rate": 7.51712662093037e-07, "loss": 0.7590735, "num_input_tokens_seen": 256353340, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75, "step": 11885, "time_per_iteration": 2.516913414001465 }, { "auxiliary_loss_clip": 0.0104449, "auxiliary_loss_mlp": 0.0099793, "balance_loss_clip": 0.99677354, "balance_loss_mlp": 1.00465012, "epoch": 0.7146249812114835, "flos": 64784539509120.0, "grad_norm": 0.8895995322257613, "language_loss": 0.6638602, "learning_rate": 7.514174881598155e-07, "loss": 0.68428445, "num_input_tokens_seen": 256411550, "router_z_loss_clip": 0.01153564, "router_z_loss_mlp": 0.22070312, "step": 11886, "time_per_iteration": 3.0897514820098877 }, { "auxiliary_loss_clip": 0.01140412, "auxiliary_loss_mlp": 0.01030295, "balance_loss_clip": 1.01743865, "balance_loss_mlp": 1.03490543, "epoch": 0.7146851044641516, "flos": 18113881495680.0, "grad_norm": 1.5687069732096384, "language_loss": 0.75230253, "learning_rate": 7.511223587852906e-07, "loss": 0.77400959, "num_input_tokens_seen": 256430360, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 11887, "time_per_iteration": 2.540809154510498 }, { "auxiliary_loss_clip": 0.01113576, "auxiliary_loss_mlp": 0.01028649, "balance_loss_clip": 1.0167166, "balance_loss_mlp": 1.03501272, "epoch": 0.7147452277168195, "flos": 19243846517760.0, "grad_norm": 1.6944893885199106, "language_loss": 0.7143383, "learning_rate": 7.508272739799972e-07, "loss": 0.73576057, "num_input_tokens_seen": 256449750, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 11888, "time_per_iteration": 2.5276341438293457 }, { "auxiliary_loss_clip": 0.01117496, "auxiliary_loss_mlp": 0.01033954, "balance_loss_clip": 1.02120495, "balance_loss_mlp": 1.03573632, "epoch": 0.7148053509694875, "flos": 23764712186880.0, "grad_norm": 1.786600422742048, "language_loss": 0.84429801, "learning_rate": 7.50532233754465e-07, "loss": 0.86581254, "num_input_tokens_seen": 256467330, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 11889, "time_per_iteration": 2.50416898727417 }, { "auxiliary_loss_clip": 0.01141276, "auxiliary_loss_mlp": 0.01029845, "balance_loss_clip": 1.01761985, "balance_loss_mlp": 1.0357089, "epoch": 0.7148654742221554, "flos": 22485322586880.0, "grad_norm": 2.3486226270780795, "language_loss": 0.75599337, "learning_rate": 7.502372381192233e-07, "loss": 0.77770448, "num_input_tokens_seen": 256485705, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 11890, "time_per_iteration": 2.549016237258911 }, { "auxiliary_loss_clip": 0.01130319, "auxiliary_loss_mlp": 0.01030531, "balance_loss_clip": 1.01891947, "balance_loss_mlp": 1.03501797, "epoch": 0.7149255974748234, "flos": 24679213476480.0, "grad_norm": 1.5709151279890403, "language_loss": 0.7395308, "learning_rate": 7.499422870847991e-07, "loss": 0.76113927, "num_input_tokens_seen": 256504755, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 11891, "time_per_iteration": 2.6043343544006348 }, { "auxiliary_loss_clip": 0.01143065, "auxiliary_loss_mlp": 0.01035898, "balance_loss_clip": 1.02312493, "balance_loss_mlp": 1.03643858, "epoch": 0.7149857207274913, "flos": 18405583845120.0, "grad_norm": 1.8086578759505085, "language_loss": 0.67577153, "learning_rate": 7.4964738066172e-07, "loss": 0.69756114, "num_input_tokens_seen": 256523670, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 11892, "time_per_iteration": 2.573129653930664 }, { "auxiliary_loss_clip": 0.01135159, "auxiliary_loss_mlp": 0.0103239, "balance_loss_clip": 1.01849067, "balance_loss_mlp": 1.03526115, "epoch": 0.7150458439801594, "flos": 24969515195520.0, "grad_norm": 2.11569279522947, "language_loss": 0.73791724, "learning_rate": 7.493525188605095e-07, "loss": 0.75959277, "num_input_tokens_seen": 256542225, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73046875, "step": 11893, "time_per_iteration": 2.574272394180298 }, { "auxiliary_loss_clip": 0.01108818, "auxiliary_loss_mlp": 0.01031787, "balance_loss_clip": 1.01834583, "balance_loss_mlp": 1.03588712, "epoch": 0.7151059672328273, "flos": 16690777580160.0, "grad_norm": 2.467294532655441, "language_loss": 0.66100526, "learning_rate": 7.490577016916905e-07, "loss": 0.68241131, "num_input_tokens_seen": 256560730, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 11894, "time_per_iteration": 2.444427013397217 }, { "auxiliary_loss_clip": 0.01135234, "auxiliary_loss_mlp": 0.01032475, "balance_loss_clip": 1.01986277, "balance_loss_mlp": 1.0366832, "epoch": 0.7151660904854953, "flos": 27271820309760.0, "grad_norm": 1.936359949076732, "language_loss": 0.77917999, "learning_rate": 7.487629291657844e-07, "loss": 0.80085707, "num_input_tokens_seen": 256580505, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 11895, "time_per_iteration": 2.591738224029541 }, { "auxiliary_loss_clip": 0.01116461, "auxiliary_loss_mlp": 0.0103312, "balance_loss_clip": 1.01958394, "balance_loss_mlp": 1.03585327, "epoch": 0.7152262137381632, "flos": 18332254229760.0, "grad_norm": 2.0059276864903417, "language_loss": 0.69271231, "learning_rate": 7.484682012933107e-07, "loss": 0.71420813, "num_input_tokens_seen": 256597330, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 11896, "time_per_iteration": 3.897886037826538 }, { "auxiliary_loss_clip": 0.01124204, "auxiliary_loss_mlp": 0.01041846, "balance_loss_clip": 1.02837503, "balance_loss_mlp": 1.03467441, "epoch": 0.7152863369908312, "flos": 21799285752960.0, "grad_norm": 1.672856937980548, "language_loss": 0.86858892, "learning_rate": 7.481735180847876e-07, "loss": 0.89024949, "num_input_tokens_seen": 256616030, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 11897, "time_per_iteration": 2.61897349357605 }, { "auxiliary_loss_clip": 0.01122281, "auxiliary_loss_mlp": 0.01033835, "balance_loss_clip": 1.02106214, "balance_loss_mlp": 1.03598595, "epoch": 0.7153464602434991, "flos": 22158427887360.0, "grad_norm": 2.548678006985083, "language_loss": 0.78246379, "learning_rate": 7.478788795507309e-07, "loss": 0.80402493, "num_input_tokens_seen": 256635570, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 11898, "time_per_iteration": 2.554694414138794 }, { "auxiliary_loss_clip": 0.01127814, "auxiliary_loss_mlp": 0.01031222, "balance_loss_clip": 1.01762652, "balance_loss_mlp": 1.03630304, "epoch": 0.7154065834961671, "flos": 24716057852160.0, "grad_norm": 1.781257195624274, "language_loss": 0.72696781, "learning_rate": 7.47584285701657e-07, "loss": 0.74855816, "num_input_tokens_seen": 256655290, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 11899, "time_per_iteration": 2.561047315597534 }, { "auxiliary_loss_clip": 0.01113154, "auxiliary_loss_mlp": 0.01036529, "balance_loss_clip": 1.02323103, "balance_loss_mlp": 1.03402996, "epoch": 0.7154667067488351, "flos": 22601494149120.0, "grad_norm": 3.3377972146401818, "language_loss": 0.75458306, "learning_rate": 7.472897365480781e-07, "loss": 0.77607989, "num_input_tokens_seen": 256671605, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 11900, "time_per_iteration": 2.5177645683288574 }, { "auxiliary_loss_clip": 0.01117003, "auxiliary_loss_mlp": 0.01032269, "balance_loss_clip": 1.02054524, "balance_loss_mlp": 1.03673673, "epoch": 0.7155268300015031, "flos": 18771154513920.0, "grad_norm": 2.3454472010309835, "language_loss": 0.81009531, "learning_rate": 7.469952321005061e-07, "loss": 0.83158803, "num_input_tokens_seen": 256689680, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.71484375, "step": 11901, "time_per_iteration": 2.4854836463928223 }, { "auxiliary_loss_clip": 0.01137966, "auxiliary_loss_mlp": 0.01035056, "balance_loss_clip": 1.02190685, "balance_loss_mlp": 1.03674126, "epoch": 0.7155869532541711, "flos": 18296343607680.0, "grad_norm": 1.8476955384311669, "language_loss": 0.81739259, "learning_rate": 7.467007723694507e-07, "loss": 0.83912277, "num_input_tokens_seen": 256707760, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.74609375, "step": 11902, "time_per_iteration": 3.9115030765533447 }, { "auxiliary_loss_clip": 0.01131943, "auxiliary_loss_mlp": 0.01029801, "balance_loss_clip": 1.01763582, "balance_loss_mlp": 1.03475618, "epoch": 0.715647076506839, "flos": 11980805783040.0, "grad_norm": 1.7169978966136623, "language_loss": 0.68262208, "learning_rate": 7.464063573654222e-07, "loss": 0.70423949, "num_input_tokens_seen": 256724150, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 11903, "time_per_iteration": 2.5137062072753906 }, { "auxiliary_loss_clip": 0.01129506, "auxiliary_loss_mlp": 0.01030837, "balance_loss_clip": 1.01920247, "balance_loss_mlp": 1.03430355, "epoch": 0.715707199759507, "flos": 18951641377920.0, "grad_norm": 1.6163792580229224, "language_loss": 0.75937444, "learning_rate": 7.461119870989248e-07, "loss": 0.78097785, "num_input_tokens_seen": 256742780, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 11904, "time_per_iteration": 2.5771780014038086 }, { "auxiliary_loss_clip": 0.01124625, "auxiliary_loss_mlp": 0.01036673, "balance_loss_clip": 1.02316093, "balance_loss_mlp": 1.03637767, "epoch": 0.7157673230121749, "flos": 15304410299520.0, "grad_norm": 1.8014885683943833, "language_loss": 0.72040963, "learning_rate": 7.458176615804657e-07, "loss": 0.74202263, "num_input_tokens_seen": 256761355, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 11905, "time_per_iteration": 2.56868052482605 }, { "auxiliary_loss_clip": 0.01131047, "auxiliary_loss_mlp": 0.01033386, "balance_loss_clip": 1.02132201, "balance_loss_mlp": 1.03500032, "epoch": 0.715827446264843, "flos": 23221850964480.0, "grad_norm": 1.555989259185934, "language_loss": 0.77863449, "learning_rate": 7.455233808205483e-07, "loss": 0.80027878, "num_input_tokens_seen": 256781335, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6953125, "step": 11906, "time_per_iteration": 2.555938959121704 }, { "auxiliary_loss_clip": 0.01043873, "auxiliary_loss_mlp": 0.0099941, "balance_loss_clip": 0.99805051, "balance_loss_mlp": 1.00474501, "epoch": 0.7158875695175109, "flos": 60975421833600.0, "grad_norm": 0.7354960877787423, "language_loss": 0.5532223, "learning_rate": 7.452291448296744e-07, "loss": 0.57365513, "num_input_tokens_seen": 256838890, "router_z_loss_clip": 0.01361084, "router_z_loss_mlp": 0.21777344, "step": 11907, "time_per_iteration": 3.1184043884277344 }, { "auxiliary_loss_clip": 0.0111485, "auxiliary_loss_mlp": 0.01035051, "balance_loss_clip": 1.02245021, "balance_loss_mlp": 1.03492379, "epoch": 0.7159476927701789, "flos": 17128780024320.0, "grad_norm": 2.444466992432463, "language_loss": 0.69907475, "learning_rate": 7.44934953618344e-07, "loss": 0.72057378, "num_input_tokens_seen": 256858145, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 11908, "time_per_iteration": 3.9907078742980957 }, { "auxiliary_loss_clip": 0.01143427, "auxiliary_loss_mlp": 0.01033373, "balance_loss_clip": 1.01978302, "balance_loss_mlp": 1.03627443, "epoch": 0.7160078160228468, "flos": 22490601886080.0, "grad_norm": 1.7492270834125574, "language_loss": 0.71025264, "learning_rate": 7.446408071970576e-07, "loss": 0.73202062, "num_input_tokens_seen": 256878545, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 11909, "time_per_iteration": 4.245952129364014 }, { "auxiliary_loss_clip": 0.01115518, "auxiliary_loss_mlp": 0.01034179, "balance_loss_clip": 1.02212119, "balance_loss_mlp": 1.03561711, "epoch": 0.7160679392755148, "flos": 30590935626240.0, "grad_norm": 1.6358142805775637, "language_loss": 0.75052857, "learning_rate": 7.443467055763113e-07, "loss": 0.77202559, "num_input_tokens_seen": 256899920, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.7109375, "step": 11910, "time_per_iteration": 2.625933885574341 }, { "auxiliary_loss_clip": 0.01123569, "auxiliary_loss_mlp": 0.01031723, "balance_loss_clip": 1.02034402, "balance_loss_mlp": 1.03593016, "epoch": 0.7161280625281827, "flos": 21323648833920.0, "grad_norm": 1.6015480810663245, "language_loss": 0.76683968, "learning_rate": 7.440526487666014e-07, "loss": 0.78839266, "num_input_tokens_seen": 256918460, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.69921875, "step": 11911, "time_per_iteration": 2.6009585857391357 }, { "auxiliary_loss_clip": 0.01134603, "auxiliary_loss_mlp": 0.01030955, "balance_loss_clip": 1.01768756, "balance_loss_mlp": 1.03656054, "epoch": 0.7161881857808508, "flos": 61860078921600.0, "grad_norm": 1.8455661161695198, "language_loss": 0.58684444, "learning_rate": 7.437586367784217e-07, "loss": 0.6085, "num_input_tokens_seen": 256942015, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 11912, "time_per_iteration": 3.005626916885376 }, { "auxiliary_loss_clip": 0.01116034, "auxiliary_loss_mlp": 0.01034418, "balance_loss_clip": 1.02128732, "balance_loss_mlp": 1.03767204, "epoch": 0.7162483090335187, "flos": 20812101292800.0, "grad_norm": 1.677023769181008, "language_loss": 0.7770443, "learning_rate": 7.434646696222648e-07, "loss": 0.79854882, "num_input_tokens_seen": 256961065, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 11913, "time_per_iteration": 2.6017279624938965 }, { "auxiliary_loss_clip": 0.0111934, "auxiliary_loss_mlp": 0.01027745, "balance_loss_clip": 1.01638401, "balance_loss_mlp": 1.03410709, "epoch": 0.7163084322861867, "flos": 24097532630400.0, "grad_norm": 1.5953576623828187, "language_loss": 0.73893261, "learning_rate": 7.431707473086215e-07, "loss": 0.76040339, "num_input_tokens_seen": 256982165, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 11914, "time_per_iteration": 2.6026408672332764 }, { "auxiliary_loss_clip": 0.01128875, "auxiliary_loss_mlp": 0.01035132, "balance_loss_clip": 1.02315164, "balance_loss_mlp": 1.03500903, "epoch": 0.7163685555388547, "flos": 20080888128000.0, "grad_norm": 1.6052610471495659, "language_loss": 0.7395848, "learning_rate": 7.428768698479808e-07, "loss": 0.76122487, "num_input_tokens_seen": 256999825, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.67578125, "step": 11915, "time_per_iteration": 2.5692508220672607 }, { "auxiliary_loss_clip": 0.01105719, "auxiliary_loss_mlp": 0.01032821, "balance_loss_clip": 1.02048254, "balance_loss_mlp": 1.0343684, "epoch": 0.7164286787915226, "flos": 17456967613440.0, "grad_norm": 1.719595834420095, "language_loss": 0.80913049, "learning_rate": 7.425830372508324e-07, "loss": 0.83051586, "num_input_tokens_seen": 257017450, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71484375, "step": 11916, "time_per_iteration": 2.51489520072937 }, { "auxiliary_loss_clip": 0.01160601, "auxiliary_loss_mlp": 0.01032403, "balance_loss_clip": 1.02035058, "balance_loss_mlp": 1.03585005, "epoch": 0.7164888020441906, "flos": 19718908819200.0, "grad_norm": 1.933636734745716, "language_loss": 0.68175513, "learning_rate": 7.422892495276593e-07, "loss": 0.70368516, "num_input_tokens_seen": 257035465, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 11917, "time_per_iteration": 2.5895583629608154 }, { "auxiliary_loss_clip": 0.01123488, "auxiliary_loss_mlp": 0.01031819, "balance_loss_clip": 1.01910567, "balance_loss_mlp": 1.03544688, "epoch": 0.7165489252968585, "flos": 21470523546240.0, "grad_norm": 1.522363620204492, "language_loss": 0.76012158, "learning_rate": 7.419955066889485e-07, "loss": 0.78167462, "num_input_tokens_seen": 257053750, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11918, "time_per_iteration": 2.546103000640869 }, { "auxiliary_loss_clip": 0.01043985, "auxiliary_loss_mlp": 0.0100398, "balance_loss_clip": 1.0027703, "balance_loss_mlp": 1.00505567, "epoch": 0.7166090485495266, "flos": 69928060464000.0, "grad_norm": 0.6291303068001057, "language_loss": 0.53965741, "learning_rate": 7.417018087451812e-07, "loss": 0.56013709, "num_input_tokens_seen": 257121215, "router_z_loss_clip": 0.01208496, "router_z_loss_mlp": 0.21679688, "step": 11919, "time_per_iteration": 3.185166835784912 }, { "auxiliary_loss_clip": 0.01126337, "auxiliary_loss_mlp": 0.01031519, "balance_loss_clip": 1.01960373, "balance_loss_mlp": 1.03784537, "epoch": 0.7166691718021945, "flos": 27343892949120.0, "grad_norm": 1.6963521973485864, "language_loss": 0.68698204, "learning_rate": 7.414081557068412e-07, "loss": 0.70856059, "num_input_tokens_seen": 257143370, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 11920, "time_per_iteration": 2.5890653133392334 }, { "auxiliary_loss_clip": 0.01124852, "auxiliary_loss_mlp": 0.01037596, "balance_loss_clip": 1.02459025, "balance_loss_mlp": 1.03652489, "epoch": 0.7167292950548625, "flos": 30408868563840.0, "grad_norm": 1.9387650985419036, "language_loss": 0.74823308, "learning_rate": 7.411145475844052e-07, "loss": 0.76985753, "num_input_tokens_seen": 257162160, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 11921, "time_per_iteration": 2.608916997909546 }, { "auxiliary_loss_clip": 0.01137816, "auxiliary_loss_mlp": 0.01034559, "balance_loss_clip": 1.0192703, "balance_loss_mlp": 1.03708363, "epoch": 0.7167894183075304, "flos": 14571257800320.0, "grad_norm": 2.1300842112720124, "language_loss": 0.75672722, "learning_rate": 7.408209843883536e-07, "loss": 0.77845097, "num_input_tokens_seen": 257179300, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.73828125, "step": 11922, "time_per_iteration": 2.537220001220703 }, { "auxiliary_loss_clip": 0.01044404, "auxiliary_loss_mlp": 0.01004409, "balance_loss_clip": 1.00332439, "balance_loss_mlp": 1.00449872, "epoch": 0.7168495415601984, "flos": 64110674407680.0, "grad_norm": 0.7481204902121096, "language_loss": 0.55152154, "learning_rate": 7.405274661291619e-07, "loss": 0.57200968, "num_input_tokens_seen": 257235470, "router_z_loss_clip": 0.01086426, "router_z_loss_mlp": 0.21679688, "step": 11923, "time_per_iteration": 2.950328826904297 }, { "auxiliary_loss_clip": 0.01132923, "auxiliary_loss_mlp": 0.01033389, "balance_loss_clip": 1.02056825, "balance_loss_mlp": 1.03548265, "epoch": 0.7169096648128663, "flos": 24681440119680.0, "grad_norm": 1.677778547528008, "language_loss": 0.76836067, "learning_rate": 7.402339928173051e-07, "loss": 0.7900238, "num_input_tokens_seen": 257255850, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 11924, "time_per_iteration": 2.567828416824341 }, { "auxiliary_loss_clip": 0.01130071, "auxiliary_loss_mlp": 0.01028807, "balance_loss_clip": 1.01764894, "balance_loss_mlp": 1.03457367, "epoch": 0.7169697880655344, "flos": 20667525050880.0, "grad_norm": 2.036516797098429, "language_loss": 0.68409938, "learning_rate": 7.39940564463256e-07, "loss": 0.70568812, "num_input_tokens_seen": 257275425, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.69140625, "step": 11925, "time_per_iteration": 2.6304707527160645 }, { "auxiliary_loss_clip": 0.01117373, "auxiliary_loss_mlp": 0.010295, "balance_loss_clip": 1.01823473, "balance_loss_mlp": 1.0329299, "epoch": 0.7170299113182023, "flos": 21032700670080.0, "grad_norm": 1.5361470665880774, "language_loss": 0.7747463, "learning_rate": 7.396471810774876e-07, "loss": 0.79621506, "num_input_tokens_seen": 257295740, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6640625, "step": 11926, "time_per_iteration": 2.6361424922943115 }, { "auxiliary_loss_clip": 0.0111772, "auxiliary_loss_mlp": 0.01281587, "balance_loss_clip": 1.02144885, "balance_loss_mlp": 1.03471744, "epoch": 0.7170900345708703, "flos": 22893304239360.0, "grad_norm": 1.9854695373982583, "language_loss": 0.77157778, "learning_rate": 7.3935384267047e-07, "loss": 0.79557085, "num_input_tokens_seen": 257315970, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 11927, "time_per_iteration": 2.578803062438965 }, { "auxiliary_loss_clip": 0.01114785, "auxiliary_loss_mlp": 0.01029453, "balance_loss_clip": 1.01663208, "balance_loss_mlp": 1.03786898, "epoch": 0.7171501578235383, "flos": 15518688883200.0, "grad_norm": 2.0046574589983526, "language_loss": 0.68769181, "learning_rate": 7.390605492526696e-07, "loss": 0.70913416, "num_input_tokens_seen": 257334230, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 11928, "time_per_iteration": 2.4906742572784424 }, { "auxiliary_loss_clip": 0.01138583, "auxiliary_loss_mlp": 0.01030206, "balance_loss_clip": 1.01852918, "balance_loss_mlp": 1.03570533, "epoch": 0.7172102810762062, "flos": 26104292640000.0, "grad_norm": 1.745037595976899, "language_loss": 0.65308654, "learning_rate": 7.387673008345552e-07, "loss": 0.67477441, "num_input_tokens_seen": 257352145, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 11929, "time_per_iteration": 2.6313397884368896 }, { "auxiliary_loss_clip": 0.0112576, "auxiliary_loss_mlp": 0.01028519, "balance_loss_clip": 1.01606131, "balance_loss_mlp": 1.03633022, "epoch": 0.7172704043288742, "flos": 21506649649920.0, "grad_norm": 1.8649073945103338, "language_loss": 0.69622183, "learning_rate": 7.384740974265917e-07, "loss": 0.71776462, "num_input_tokens_seen": 257371460, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.71484375, "step": 11930, "time_per_iteration": 2.542630434036255 }, { "auxiliary_loss_clip": 0.01131887, "auxiliary_loss_mlp": 0.01028211, "balance_loss_clip": 1.01621866, "balance_loss_mlp": 1.03491616, "epoch": 0.7173305275815421, "flos": 18770939032320.0, "grad_norm": 1.923743922922832, "language_loss": 0.80771172, "learning_rate": 7.381809390392426e-07, "loss": 0.82931268, "num_input_tokens_seen": 257390800, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 11931, "time_per_iteration": 2.559703826904297 }, { "auxiliary_loss_clip": 0.01112191, "auxiliary_loss_mlp": 0.01029398, "balance_loss_clip": 1.01794803, "balance_loss_mlp": 1.0347414, "epoch": 0.7173906508342102, "flos": 16179876483840.0, "grad_norm": 1.8858617679501646, "language_loss": 0.78163534, "learning_rate": 7.378878256829695e-07, "loss": 0.80305123, "num_input_tokens_seen": 257407495, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 11932, "time_per_iteration": 2.4897940158843994 }, { "auxiliary_loss_clip": 0.01113425, "auxiliary_loss_mlp": 0.01029692, "balance_loss_clip": 1.01785493, "balance_loss_mlp": 1.03535819, "epoch": 0.7174507740868781, "flos": 26613864933120.0, "grad_norm": 1.6100768554654716, "language_loss": 0.74908578, "learning_rate": 7.375947573682344e-07, "loss": 0.77051699, "num_input_tokens_seen": 257429675, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 11933, "time_per_iteration": 2.616417169570923 }, { "auxiliary_loss_clip": 0.01113158, "auxiliary_loss_mlp": 0.01033211, "balance_loss_clip": 1.01847076, "balance_loss_mlp": 1.03837192, "epoch": 0.7175108973395461, "flos": 18432911116800.0, "grad_norm": 2.337419693582176, "language_loss": 0.69239956, "learning_rate": 7.373017341054939e-07, "loss": 0.71386325, "num_input_tokens_seen": 257442765, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.75, "step": 11934, "time_per_iteration": 2.444918155670166 }, { "auxiliary_loss_clip": 0.0111387, "auxiliary_loss_mlp": 0.01031954, "balance_loss_clip": 1.01931787, "balance_loss_mlp": 1.03493595, "epoch": 0.717571020592214, "flos": 23914962777600.0, "grad_norm": 2.0891224978899294, "language_loss": 0.86588228, "learning_rate": 7.370087559052072e-07, "loss": 0.88734055, "num_input_tokens_seen": 257459310, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 11935, "time_per_iteration": 2.6399619579315186 }, { "auxiliary_loss_clip": 0.01132242, "auxiliary_loss_mlp": 0.01031802, "balance_loss_clip": 1.01936877, "balance_loss_mlp": 1.03498363, "epoch": 0.717631143844882, "flos": 38256930109440.0, "grad_norm": 1.5622025282375591, "language_loss": 0.73899221, "learning_rate": 7.367158227778285e-07, "loss": 0.76063269, "num_input_tokens_seen": 257484750, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 11936, "time_per_iteration": 2.6723029613494873 }, { "auxiliary_loss_clip": 0.01025867, "auxiliary_loss_mlp": 0.01000527, "balance_loss_clip": 0.99944824, "balance_loss_mlp": 1.00423455, "epoch": 0.7176912670975499, "flos": 65515896328320.0, "grad_norm": 0.7541845815143959, "language_loss": 0.55972445, "learning_rate": 7.36422934733814e-07, "loss": 0.57998842, "num_input_tokens_seen": 257543110, "router_z_loss_clip": 0.01080322, "router_z_loss_mlp": 0.21679688, "step": 11937, "time_per_iteration": 4.451771020889282 }, { "auxiliary_loss_clip": 0.01119096, "auxiliary_loss_mlp": 0.01031639, "balance_loss_clip": 1.01794231, "balance_loss_mlp": 1.0374167, "epoch": 0.717751390350218, "flos": 31281066610560.0, "grad_norm": 2.4110663737346947, "language_loss": 0.5471065, "learning_rate": 7.361300917836131e-07, "loss": 0.56861389, "num_input_tokens_seen": 257567410, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 11938, "time_per_iteration": 2.6295342445373535 }, { "auxiliary_loss_clip": 0.01109597, "auxiliary_loss_mlp": 0.01027914, "balance_loss_clip": 1.0160886, "balance_loss_mlp": 1.03375077, "epoch": 0.7178115136028859, "flos": 19859031774720.0, "grad_norm": 1.8022119803464824, "language_loss": 0.76531637, "learning_rate": 7.358372939376789e-07, "loss": 0.78669143, "num_input_tokens_seen": 257586270, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 11939, "time_per_iteration": 2.5226199626922607 }, { "auxiliary_loss_clip": 0.01043838, "auxiliary_loss_mlp": 0.01001616, "balance_loss_clip": 1.00037038, "balance_loss_mlp": 1.0044322, "epoch": 0.7178716368555539, "flos": 64348655967360.0, "grad_norm": 0.7553129792028276, "language_loss": 0.61422038, "learning_rate": 7.355445412064598e-07, "loss": 0.63467491, "num_input_tokens_seen": 257647415, "router_z_loss_clip": 0.01245117, "router_z_loss_mlp": 0.21679688, "step": 11940, "time_per_iteration": 3.133127450942993 }, { "auxiliary_loss_clip": 0.01129843, "auxiliary_loss_mlp": 0.01030643, "balance_loss_clip": 1.01891828, "balance_loss_mlp": 1.03522003, "epoch": 0.7179317601082219, "flos": 26762607152640.0, "grad_norm": 1.50377613600341, "language_loss": 0.59040767, "learning_rate": 7.352518336004037e-07, "loss": 0.61201251, "num_input_tokens_seen": 257669795, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 11941, "time_per_iteration": 2.6490018367767334 }, { "auxiliary_loss_clip": 0.01122536, "auxiliary_loss_mlp": 0.01032142, "balance_loss_clip": 1.02010226, "balance_loss_mlp": 1.03600931, "epoch": 0.7179918833608898, "flos": 23513804709120.0, "grad_norm": 4.252185779701552, "language_loss": 0.79106259, "learning_rate": 7.349591711299561e-07, "loss": 0.81260943, "num_input_tokens_seen": 257687415, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 11942, "time_per_iteration": 2.6757001876831055 }, { "auxiliary_loss_clip": 0.01115603, "auxiliary_loss_mlp": 0.01039576, "balance_loss_clip": 1.02660024, "balance_loss_mlp": 1.03577948, "epoch": 0.7180520066135578, "flos": 17165588486400.0, "grad_norm": 2.2245320797926382, "language_loss": 0.66344583, "learning_rate": 7.34666553805561e-07, "loss": 0.68499756, "num_input_tokens_seen": 257706215, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 11943, "time_per_iteration": 2.497282028198242 }, { "auxiliary_loss_clip": 0.01103701, "auxiliary_loss_mlp": 0.01030327, "balance_loss_clip": 1.01770878, "balance_loss_mlp": 1.03568459, "epoch": 0.7181121298662257, "flos": 17566638814080.0, "grad_norm": 1.6409699436099918, "language_loss": 0.78658152, "learning_rate": 7.343739816376631e-07, "loss": 0.80792177, "num_input_tokens_seen": 257724740, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 11944, "time_per_iteration": 3.8868227005004883 }, { "auxiliary_loss_clip": 0.01128409, "auxiliary_loss_mlp": 0.01287233, "balance_loss_clip": 1.02712834, "balance_loss_mlp": 1.03774142, "epoch": 0.7181722531188938, "flos": 11947660508160.0, "grad_norm": 1.9569674604313037, "language_loss": 0.62557852, "learning_rate": 7.34081454636701e-07, "loss": 0.64973497, "num_input_tokens_seen": 257742060, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 11945, "time_per_iteration": 2.506345748901367 }, { "auxiliary_loss_clip": 0.01035386, "auxiliary_loss_mlp": 0.01000067, "balance_loss_clip": 0.99882716, "balance_loss_mlp": 1.00462019, "epoch": 0.7182323763715617, "flos": 65503649790720.0, "grad_norm": 0.703547808953433, "language_loss": 0.51060343, "learning_rate": 7.337889728131159e-07, "loss": 0.53095794, "num_input_tokens_seen": 257802250, "router_z_loss_clip": 0.01239014, "router_z_loss_mlp": 0.21679688, "step": 11946, "time_per_iteration": 3.010918378829956 }, { "auxiliary_loss_clip": 0.0113581, "auxiliary_loss_mlp": 0.01038479, "balance_loss_clip": 1.02364993, "balance_loss_mlp": 1.03694797, "epoch": 0.7182924996242297, "flos": 20630932070400.0, "grad_norm": 2.118219348025446, "language_loss": 0.74463528, "learning_rate": 7.334965361773453e-07, "loss": 0.76637816, "num_input_tokens_seen": 257821155, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7265625, "step": 11947, "time_per_iteration": 2.5692408084869385 }, { "auxiliary_loss_clip": 0.01137662, "auxiliary_loss_mlp": 0.01275321, "balance_loss_clip": 1.01562881, "balance_loss_mlp": 1.03327203, "epoch": 0.7183526228768976, "flos": 16216433550720.0, "grad_norm": 1.706711009912547, "language_loss": 0.72878087, "learning_rate": 7.332041447398256e-07, "loss": 0.75291073, "num_input_tokens_seen": 257839905, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 11948, "time_per_iteration": 2.509108066558838 }, { "auxiliary_loss_clip": 0.01126581, "auxiliary_loss_mlp": 0.01039586, "balance_loss_clip": 1.02604389, "balance_loss_mlp": 1.03708386, "epoch": 0.7184127461295656, "flos": 22232655342720.0, "grad_norm": 1.739138586328369, "language_loss": 0.71691167, "learning_rate": 7.329117985109908e-07, "loss": 0.73857331, "num_input_tokens_seen": 257860055, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71484375, "step": 11949, "time_per_iteration": 4.059336185455322 }, { "auxiliary_loss_clip": 0.0110344, "auxiliary_loss_mlp": 0.01278003, "balance_loss_clip": 1.01950383, "balance_loss_mlp": 1.03564918, "epoch": 0.7184728693822335, "flos": 27344503480320.0, "grad_norm": 1.9377675761645512, "language_loss": 0.7607336, "learning_rate": 7.326194975012759e-07, "loss": 0.78454804, "num_input_tokens_seen": 257879315, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.67578125, "step": 11950, "time_per_iteration": 4.049072504043579 }, { "auxiliary_loss_clip": 0.01109988, "auxiliary_loss_mlp": 0.01037167, "balance_loss_clip": 1.02334476, "balance_loss_mlp": 1.03852248, "epoch": 0.7185329926349016, "flos": 16508530949760.0, "grad_norm": 2.5646475381892317, "language_loss": 0.67626035, "learning_rate": 7.323272417211095e-07, "loss": 0.69773185, "num_input_tokens_seen": 257896570, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 11951, "time_per_iteration": 2.491605281829834 }, { "auxiliary_loss_clip": 0.01129738, "auxiliary_loss_mlp": 0.01035251, "balance_loss_clip": 1.02135158, "balance_loss_mlp": 1.03956091, "epoch": 0.7185931158875695, "flos": 23951052967680.0, "grad_norm": 1.920370633132741, "language_loss": 0.78159654, "learning_rate": 7.320350311809238e-07, "loss": 0.80324638, "num_input_tokens_seen": 257916855, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 11952, "time_per_iteration": 2.627823829650879 }, { "auxiliary_loss_clip": 0.01117947, "auxiliary_loss_mlp": 0.01031413, "balance_loss_clip": 1.01806808, "balance_loss_mlp": 1.03657699, "epoch": 0.7186532391402375, "flos": 26542007775360.0, "grad_norm": 1.8097971558462442, "language_loss": 0.74930388, "learning_rate": 7.317428658911456e-07, "loss": 0.77079743, "num_input_tokens_seen": 257937140, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 11953, "time_per_iteration": 2.5544230937957764 }, { "auxiliary_loss_clip": 0.0111603, "auxiliary_loss_mlp": 0.01031992, "balance_loss_clip": 1.01909935, "balance_loss_mlp": 1.03615522, "epoch": 0.7187133623929055, "flos": 22383049587840.0, "grad_norm": 1.8103549797365375, "language_loss": 0.73101819, "learning_rate": 7.314507458622033e-07, "loss": 0.75249839, "num_input_tokens_seen": 257956785, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11954, "time_per_iteration": 2.5713160037994385 }, { "auxiliary_loss_clip": 0.01126808, "auxiliary_loss_mlp": 0.01040087, "balance_loss_clip": 1.0272007, "balance_loss_mlp": 1.03809524, "epoch": 0.7187734856455734, "flos": 15779580341760.0, "grad_norm": 1.9004453911982824, "language_loss": 0.74174023, "learning_rate": 7.311586711045197e-07, "loss": 0.7634092, "num_input_tokens_seen": 257975455, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11955, "time_per_iteration": 2.564631938934326 }, { "auxiliary_loss_clip": 0.01133702, "auxiliary_loss_mlp": 0.01033328, "balance_loss_clip": 1.02084649, "balance_loss_mlp": 1.0361706, "epoch": 0.7188336088982414, "flos": 31759612531200.0, "grad_norm": 1.625947616448871, "language_loss": 0.73213661, "learning_rate": 7.308666416285198e-07, "loss": 0.75380683, "num_input_tokens_seen": 257996850, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 11956, "time_per_iteration": 2.6745684146881104 }, { "auxiliary_loss_clip": 0.01158783, "auxiliary_loss_mlp": 0.01033326, "balance_loss_clip": 1.02024329, "balance_loss_mlp": 1.03348172, "epoch": 0.7188937321509093, "flos": 21465208333440.0, "grad_norm": 1.722198674305708, "language_loss": 0.70764041, "learning_rate": 7.305746574446256e-07, "loss": 0.72956151, "num_input_tokens_seen": 258016145, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 11957, "time_per_iteration": 2.618929386138916 }, { "auxiliary_loss_clip": 0.01151783, "auxiliary_loss_mlp": 0.0104201, "balance_loss_clip": 1.02858758, "balance_loss_mlp": 1.03525662, "epoch": 0.7189538554035774, "flos": 27271497087360.0, "grad_norm": 1.8222121214865905, "language_loss": 0.73435128, "learning_rate": 7.302827185632552e-07, "loss": 0.75628924, "num_input_tokens_seen": 258035420, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 11958, "time_per_iteration": 2.6951234340667725 }, { "auxiliary_loss_clip": 0.01113966, "auxiliary_loss_mlp": 0.01037714, "balance_loss_clip": 1.02497065, "balance_loss_mlp": 1.03646874, "epoch": 0.7190139786562453, "flos": 21580625710080.0, "grad_norm": 2.2754736907657565, "language_loss": 0.84077799, "learning_rate": 7.29990824994829e-07, "loss": 0.86229479, "num_input_tokens_seen": 258053520, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 11959, "time_per_iteration": 2.6058011054992676 }, { "auxiliary_loss_clip": 0.01112083, "auxiliary_loss_mlp": 0.01033664, "balance_loss_clip": 1.02010942, "balance_loss_mlp": 1.03902507, "epoch": 0.7190741019089133, "flos": 26721237663360.0, "grad_norm": 1.5223712536542542, "language_loss": 0.81833839, "learning_rate": 7.296989767497635e-07, "loss": 0.83979589, "num_input_tokens_seen": 258073020, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 11960, "time_per_iteration": 2.594036102294922 }, { "auxiliary_loss_clip": 0.01124157, "auxiliary_loss_mlp": 0.01036883, "balance_loss_clip": 1.02309656, "balance_loss_mlp": 1.0375241, "epoch": 0.7191342251615812, "flos": 26104759516800.0, "grad_norm": 1.7563448755237194, "language_loss": 0.77754629, "learning_rate": 7.294071738384739e-07, "loss": 0.79915667, "num_input_tokens_seen": 258093155, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.6875, "step": 11961, "time_per_iteration": 2.616046905517578 }, { "auxiliary_loss_clip": 0.01110461, "auxiliary_loss_mlp": 0.01031264, "balance_loss_clip": 1.01951575, "balance_loss_mlp": 1.03528953, "epoch": 0.7191943484142492, "flos": 22967028904320.0, "grad_norm": 1.3497439095388981, "language_loss": 0.75077069, "learning_rate": 7.291154162713733e-07, "loss": 0.77218795, "num_input_tokens_seen": 258113905, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6640625, "step": 11962, "time_per_iteration": 2.5566112995147705 }, { "auxiliary_loss_clip": 0.0111518, "auxiliary_loss_mlp": 0.01030763, "balance_loss_clip": 1.01692891, "balance_loss_mlp": 1.03556311, "epoch": 0.7192544716669171, "flos": 22565332131840.0, "grad_norm": 4.474019668870703, "language_loss": 0.75142241, "learning_rate": 7.28823704058875e-07, "loss": 0.77288181, "num_input_tokens_seen": 258132820, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7109375, "step": 11963, "time_per_iteration": 2.545167922973633 }, { "auxiliary_loss_clip": 0.01136495, "auxiliary_loss_mlp": 0.01036138, "balance_loss_clip": 1.02193427, "balance_loss_mlp": 1.0367763, "epoch": 0.7193145949195852, "flos": 18982200873600.0, "grad_norm": 3.7402885500034153, "language_loss": 0.81235409, "learning_rate": 7.285320372113888e-07, "loss": 0.8340804, "num_input_tokens_seen": 258148055, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7265625, "step": 11964, "time_per_iteration": 2.5012242794036865 }, { "auxiliary_loss_clip": 0.01113628, "auxiliary_loss_mlp": 0.01031217, "balance_loss_clip": 1.01903987, "balance_loss_mlp": 1.03613448, "epoch": 0.7193747181722531, "flos": 18004246208640.0, "grad_norm": 1.8349862100761138, "language_loss": 0.74984133, "learning_rate": 7.282404157393239e-07, "loss": 0.77128977, "num_input_tokens_seen": 258165995, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 11965, "time_per_iteration": 2.5190975666046143 }, { "auxiliary_loss_clip": 0.01126051, "auxiliary_loss_mlp": 0.01033757, "balance_loss_clip": 1.02032185, "balance_loss_mlp": 1.0363822, "epoch": 0.7194348414249211, "flos": 24389414547840.0, "grad_norm": 1.4985954254512461, "language_loss": 0.77534187, "learning_rate": 7.279488396530862e-07, "loss": 0.79693991, "num_input_tokens_seen": 258186165, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 11966, "time_per_iteration": 2.5534660816192627 }, { "auxiliary_loss_clip": 0.01124341, "auxiliary_loss_mlp": 0.01029548, "balance_loss_clip": 1.01689422, "balance_loss_mlp": 1.03644896, "epoch": 0.7194949646775891, "flos": 22163455791360.0, "grad_norm": 2.811150566235383, "language_loss": 0.72733521, "learning_rate": 7.276573089630837e-07, "loss": 0.74887413, "num_input_tokens_seen": 258204595, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 11967, "time_per_iteration": 2.5458271503448486 }, { "auxiliary_loss_clip": 0.01121189, "auxiliary_loss_mlp": 0.0103191, "balance_loss_clip": 1.0206449, "balance_loss_mlp": 1.03821421, "epoch": 0.719555087930257, "flos": 20266366982400.0, "grad_norm": 1.5723912093429877, "language_loss": 0.8144238, "learning_rate": 7.273658236797176e-07, "loss": 0.83595479, "num_input_tokens_seen": 258223110, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.65234375, "step": 11968, "time_per_iteration": 2.533754587173462 }, { "auxiliary_loss_clip": 0.01121797, "auxiliary_loss_mlp": 0.01025216, "balance_loss_clip": 1.01311016, "balance_loss_mlp": 1.03535831, "epoch": 0.719615211182925, "flos": 24716309247360.0, "grad_norm": 1.7024030104758392, "language_loss": 0.76588678, "learning_rate": 7.270743838133923e-07, "loss": 0.78735685, "num_input_tokens_seen": 258242660, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 11969, "time_per_iteration": 2.631234645843506 }, { "auxiliary_loss_clip": 0.01131794, "auxiliary_loss_mlp": 0.0103127, "balance_loss_clip": 1.01803231, "balance_loss_mlp": 1.03391457, "epoch": 0.719675334435593, "flos": 20009641501440.0, "grad_norm": 1.750869084307273, "language_loss": 0.71078908, "learning_rate": 7.267829893745075e-07, "loss": 0.73241973, "num_input_tokens_seen": 258261850, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 11970, "time_per_iteration": 2.577347755432129 }, { "auxiliary_loss_clip": 0.01043015, "auxiliary_loss_mlp": 0.01000414, "balance_loss_clip": 0.99903691, "balance_loss_mlp": 1.00364542, "epoch": 0.719735457688261, "flos": 44199861177600.0, "grad_norm": 0.9220622044819065, "language_loss": 0.60842699, "learning_rate": 7.264916403734638e-07, "loss": 0.62886131, "num_input_tokens_seen": 258312570, "router_z_loss_clip": 0.01379395, "router_z_loss_mlp": 0.21679688, "step": 11971, "time_per_iteration": 3.0629048347473145 }, { "auxiliary_loss_clip": 0.01112157, "auxiliary_loss_mlp": 0.01031846, "balance_loss_clip": 1.01825655, "balance_loss_mlp": 1.03712678, "epoch": 0.7197955809409289, "flos": 16802890905600.0, "grad_norm": 1.802832913589439, "language_loss": 0.7989521, "learning_rate": 7.262003368206571e-07, "loss": 0.82039213, "num_input_tokens_seen": 258331600, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75, "step": 11972, "time_per_iteration": 2.532116651535034 }, { "auxiliary_loss_clip": 0.01128127, "auxiliary_loss_mlp": 0.01037823, "balance_loss_clip": 1.02452493, "balance_loss_mlp": 1.03856802, "epoch": 0.7198557041935969, "flos": 24535391420160.0, "grad_norm": 1.4919035253298232, "language_loss": 0.75635689, "learning_rate": 7.25909078726483e-07, "loss": 0.77801639, "num_input_tokens_seen": 258351785, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 11973, "time_per_iteration": 2.634169101715088 }, { "auxiliary_loss_clip": 0.01061394, "auxiliary_loss_mlp": 0.01000685, "balance_loss_clip": 0.99950469, "balance_loss_mlp": 1.00452018, "epoch": 0.7199158274462648, "flos": 70710839602560.0, "grad_norm": 0.8550692410725076, "language_loss": 0.57274228, "learning_rate": 7.256178661013376e-07, "loss": 0.59336305, "num_input_tokens_seen": 258404035, "router_z_loss_clip": 0.01177979, "router_z_loss_mlp": 0.21875, "step": 11974, "time_per_iteration": 3.063836097717285 }, { "auxiliary_loss_clip": 0.01125048, "auxiliary_loss_mlp": 0.01029467, "balance_loss_clip": 1.01638985, "balance_loss_mlp": 1.03426623, "epoch": 0.7199759506989328, "flos": 29347995352320.0, "grad_norm": 2.5448264394153006, "language_loss": 0.61757672, "learning_rate": 7.253266989556115e-07, "loss": 0.63912189, "num_input_tokens_seen": 258424850, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 11975, "time_per_iteration": 2.6520543098449707 }, { "auxiliary_loss_clip": 0.01113633, "auxiliary_loss_mlp": 0.01032703, "balance_loss_clip": 1.02025771, "balance_loss_mlp": 1.03468728, "epoch": 0.7200360739516007, "flos": 24640465680000.0, "grad_norm": 1.7243152079875947, "language_loss": 0.67718852, "learning_rate": 7.250355772996972e-07, "loss": 0.69865185, "num_input_tokens_seen": 258445485, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69921875, "step": 11976, "time_per_iteration": 2.5638861656188965 }, { "auxiliary_loss_clip": 0.01133379, "auxiliary_loss_mlp": 0.01031615, "balance_loss_clip": 1.01986146, "balance_loss_mlp": 1.03616333, "epoch": 0.7200961972042688, "flos": 20812855478400.0, "grad_norm": 2.3801046981840694, "language_loss": 0.67012751, "learning_rate": 7.247445011439836e-07, "loss": 0.69177747, "num_input_tokens_seen": 258464505, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.703125, "step": 11977, "time_per_iteration": 2.5620646476745605 }, { "auxiliary_loss_clip": 0.01126289, "auxiliary_loss_mlp": 0.01034398, "balance_loss_clip": 1.02138078, "balance_loss_mlp": 1.03717422, "epoch": 0.7201563204569367, "flos": 31245910174080.0, "grad_norm": 2.90006737563856, "language_loss": 0.75568724, "learning_rate": 7.244534704988582e-07, "loss": 0.77729416, "num_input_tokens_seen": 258487190, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 11978, "time_per_iteration": 4.005890130996704 }, { "auxiliary_loss_clip": 0.01103565, "auxiliary_loss_mlp": 0.01030834, "balance_loss_clip": 1.01826298, "balance_loss_mlp": 1.034374, "epoch": 0.7202164437096047, "flos": 26651391667200.0, "grad_norm": 1.771943331681026, "language_loss": 0.78479534, "learning_rate": 7.24162485374707e-07, "loss": 0.80613935, "num_input_tokens_seen": 258503790, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 11979, "time_per_iteration": 2.5155045986175537 }, { "auxiliary_loss_clip": 0.01116037, "auxiliary_loss_mlp": 0.01028282, "balance_loss_clip": 1.0153594, "balance_loss_mlp": 1.03625703, "epoch": 0.7202765669622727, "flos": 25959608657280.0, "grad_norm": 1.7452606575940812, "language_loss": 0.64800715, "learning_rate": 7.238715457819154e-07, "loss": 0.66945034, "num_input_tokens_seen": 258527335, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 11980, "time_per_iteration": 2.607738971710205 }, { "auxiliary_loss_clip": 0.01118618, "auxiliary_loss_mlp": 0.01037347, "balance_loss_clip": 1.02248788, "balance_loss_mlp": 1.03780961, "epoch": 0.7203366902149406, "flos": 28512354372480.0, "grad_norm": 1.8792954718291752, "language_loss": 0.66956079, "learning_rate": 7.235806517308656e-07, "loss": 0.69112039, "num_input_tokens_seen": 258546690, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.71875, "step": 11981, "time_per_iteration": 2.5864593982696533 }, { "auxiliary_loss_clip": 0.01126768, "auxiliary_loss_mlp": 0.01032904, "balance_loss_clip": 1.01968956, "balance_loss_mlp": 1.03683853, "epoch": 0.7203968134676086, "flos": 21106030285440.0, "grad_norm": 1.8090664264188108, "language_loss": 0.73999965, "learning_rate": 7.232898032319392e-07, "loss": 0.76159644, "num_input_tokens_seen": 258566340, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 11982, "time_per_iteration": 2.5912320613861084 }, { "auxiliary_loss_clip": 0.01123735, "auxiliary_loss_mlp": 0.01284703, "balance_loss_clip": 1.02591729, "balance_loss_mlp": 1.0343802, "epoch": 0.7204569367202766, "flos": 18332146488960.0, "grad_norm": 2.076956240273984, "language_loss": 0.65837568, "learning_rate": 7.229990002955148e-07, "loss": 0.68246007, "num_input_tokens_seen": 258584455, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.7109375, "step": 11983, "time_per_iteration": 2.5567307472229004 }, { "auxiliary_loss_clip": 0.01127168, "auxiliary_loss_mlp": 0.01035393, "balance_loss_clip": 1.02239895, "balance_loss_mlp": 1.0383091, "epoch": 0.7205170599729446, "flos": 23255103980160.0, "grad_norm": 1.7019304317436923, "language_loss": 0.66726363, "learning_rate": 7.227082429319726e-07, "loss": 0.68888927, "num_input_tokens_seen": 258604725, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 11984, "time_per_iteration": 2.586287260055542 }, { "auxiliary_loss_clip": 0.01122424, "auxiliary_loss_mlp": 0.01037041, "balance_loss_clip": 1.02294993, "balance_loss_mlp": 1.03939795, "epoch": 0.7205771832256125, "flos": 20120892900480.0, "grad_norm": 1.9210490496048762, "language_loss": 0.73504174, "learning_rate": 7.224175311516865e-07, "loss": 0.75663644, "num_input_tokens_seen": 258622885, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73828125, "step": 11985, "time_per_iteration": 3.895598888397217 }, { "auxiliary_loss_clip": 0.01116059, "auxiliary_loss_mlp": 0.0102912, "balance_loss_clip": 1.01700795, "balance_loss_mlp": 1.03814197, "epoch": 0.7206373064782805, "flos": 27703250565120.0, "grad_norm": 1.99225162675546, "language_loss": 0.62735021, "learning_rate": 7.221268649650328e-07, "loss": 0.64880204, "num_input_tokens_seen": 258644305, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 11986, "time_per_iteration": 2.5966124534606934 }, { "auxiliary_loss_clip": 0.01127208, "auxiliary_loss_mlp": 0.01035779, "balance_loss_clip": 1.02143824, "balance_loss_mlp": 1.03828597, "epoch": 0.7206974297309484, "flos": 17968156018560.0, "grad_norm": 1.6306645121996688, "language_loss": 0.72395718, "learning_rate": 7.218362443823842e-07, "loss": 0.74558705, "num_input_tokens_seen": 258661775, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7109375, "step": 11987, "time_per_iteration": 2.5591847896575928 }, { "auxiliary_loss_clip": 0.01127134, "auxiliary_loss_mlp": 0.01031623, "balance_loss_clip": 1.01802683, "balance_loss_mlp": 1.03678656, "epoch": 0.7207575529836164, "flos": 16983162288000.0, "grad_norm": 1.8155571386161005, "language_loss": 0.78647292, "learning_rate": 7.215456694141122e-07, "loss": 0.80806047, "num_input_tokens_seen": 258679830, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 11988, "time_per_iteration": 2.510684013366699 }, { "auxiliary_loss_clip": 0.01105735, "auxiliary_loss_mlp": 0.01029908, "balance_loss_clip": 1.01756418, "balance_loss_mlp": 1.03576708, "epoch": 0.7208176762362843, "flos": 18727594295040.0, "grad_norm": 1.9330442005763007, "language_loss": 0.79020631, "learning_rate": 7.212551400705868e-07, "loss": 0.81156278, "num_input_tokens_seen": 258697415, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 11989, "time_per_iteration": 2.4896957874298096 }, { "auxiliary_loss_clip": 0.01104945, "auxiliary_loss_mlp": 0.01035107, "balance_loss_clip": 1.02232754, "balance_loss_mlp": 1.03634095, "epoch": 0.7208777994889524, "flos": 18734489706240.0, "grad_norm": 1.9122016651134661, "language_loss": 0.82509208, "learning_rate": 7.209646563621754e-07, "loss": 0.84649253, "num_input_tokens_seen": 258716755, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 11990, "time_per_iteration": 3.9961845874786377 }, { "auxiliary_loss_clip": 0.01124744, "auxiliary_loss_mlp": 0.01033613, "balance_loss_clip": 1.02086997, "balance_loss_mlp": 1.03619826, "epoch": 0.7209379227416203, "flos": 14793437376000.0, "grad_norm": 3.5117274297362946, "language_loss": 0.76086843, "learning_rate": 7.206742182992467e-07, "loss": 0.78245205, "num_input_tokens_seen": 258733270, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11991, "time_per_iteration": 4.039175510406494 }, { "auxiliary_loss_clip": 0.011301, "auxiliary_loss_mlp": 0.01026873, "balance_loss_clip": 1.01492786, "balance_loss_mlp": 1.03524184, "epoch": 0.7209980459942883, "flos": 29636860527360.0, "grad_norm": 1.5968790215427584, "language_loss": 0.72195739, "learning_rate": 7.203838258921631e-07, "loss": 0.74352711, "num_input_tokens_seen": 258755270, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 11992, "time_per_iteration": 2.6666078567504883 }, { "auxiliary_loss_clip": 0.01115772, "auxiliary_loss_mlp": 0.01031307, "balance_loss_clip": 1.01880765, "balance_loss_mlp": 1.03608489, "epoch": 0.7210581692469563, "flos": 23477175815040.0, "grad_norm": 1.6912015200706465, "language_loss": 0.6643846, "learning_rate": 7.200934791512898e-07, "loss": 0.68585539, "num_input_tokens_seen": 258775340, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 11993, "time_per_iteration": 2.587714672088623 }, { "auxiliary_loss_clip": 0.01114663, "auxiliary_loss_mlp": 0.01029533, "balance_loss_clip": 1.01739144, "balance_loss_mlp": 1.03535652, "epoch": 0.7211182924996242, "flos": 26099839353600.0, "grad_norm": 1.7884732608317035, "language_loss": 0.65804279, "learning_rate": 7.198031780869878e-07, "loss": 0.67948472, "num_input_tokens_seen": 258794580, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.703125, "step": 11994, "time_per_iteration": 2.629864454269409 }, { "auxiliary_loss_clip": 0.01118508, "auxiliary_loss_mlp": 0.01034827, "balance_loss_clip": 1.02086711, "balance_loss_mlp": 1.03785336, "epoch": 0.7211784157522922, "flos": 17712076982400.0, "grad_norm": 1.731880796888898, "language_loss": 0.66771293, "learning_rate": 7.195129227096172e-07, "loss": 0.6892463, "num_input_tokens_seen": 258812330, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71484375, "step": 11995, "time_per_iteration": 2.5185546875 }, { "auxiliary_loss_clip": 0.01121859, "auxiliary_loss_mlp": 0.01030425, "balance_loss_clip": 1.01736546, "balance_loss_mlp": 1.03665829, "epoch": 0.7212385390049602, "flos": 24423637230720.0, "grad_norm": 1.7503420403122778, "language_loss": 0.79448915, "learning_rate": 7.192227130295363e-07, "loss": 0.81601191, "num_input_tokens_seen": 258831770, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6796875, "step": 11996, "time_per_iteration": 2.577585220336914 }, { "auxiliary_loss_clip": 0.01108941, "auxiliary_loss_mlp": 0.01033592, "balance_loss_clip": 1.02065754, "balance_loss_mlp": 1.03770471, "epoch": 0.7212986622576282, "flos": 28147250580480.0, "grad_norm": 1.964383308973384, "language_loss": 0.81706905, "learning_rate": 7.189325490571025e-07, "loss": 0.83849442, "num_input_tokens_seen": 258849090, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11997, "time_per_iteration": 2.5587189197540283 }, { "auxiliary_loss_clip": 0.01107921, "auxiliary_loss_mlp": 0.01034404, "balance_loss_clip": 1.02064741, "balance_loss_mlp": 1.0354712, "epoch": 0.7213587855102961, "flos": 21835770992640.0, "grad_norm": 3.2015083702496234, "language_loss": 0.66583419, "learning_rate": 7.18642430802671e-07, "loss": 0.68725741, "num_input_tokens_seen": 258868230, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 11998, "time_per_iteration": 2.504300117492676 }, { "auxiliary_loss_clip": 0.01106628, "auxiliary_loss_mlp": 0.0102979, "balance_loss_clip": 1.01768446, "balance_loss_mlp": 1.03651452, "epoch": 0.7214189087629641, "flos": 14611549881600.0, "grad_norm": 1.887044424966965, "language_loss": 0.72204489, "learning_rate": 7.183523582765952e-07, "loss": 0.74340904, "num_input_tokens_seen": 258885525, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 11999, "time_per_iteration": 2.473801612854004 }, { "auxiliary_loss_clip": 0.01125417, "auxiliary_loss_mlp": 0.01029764, "balance_loss_clip": 1.01616216, "balance_loss_mlp": 1.03663182, "epoch": 0.721479032015632, "flos": 19390864884480.0, "grad_norm": 2.2888711442945087, "language_loss": 0.83414471, "learning_rate": 7.18062331489226e-07, "loss": 0.8556965, "num_input_tokens_seen": 258903245, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 12000, "time_per_iteration": 2.599557876586914 }, { "auxiliary_loss_clip": 0.01117936, "auxiliary_loss_mlp": 0.01034253, "balance_loss_clip": 1.02135491, "balance_loss_mlp": 1.03778172, "epoch": 0.7215391552683, "flos": 18512884748160.0, "grad_norm": 2.4604924095196976, "language_loss": 0.77204883, "learning_rate": 7.177723504509161e-07, "loss": 0.7935707, "num_input_tokens_seen": 258921245, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 12001, "time_per_iteration": 2.4863805770874023 }, { "auxiliary_loss_clip": 0.01117748, "auxiliary_loss_mlp": 0.01040863, "balance_loss_clip": 1.02811909, "balance_loss_mlp": 1.03996086, "epoch": 0.721599278520968, "flos": 23258731253760.0, "grad_norm": 1.7845861165482553, "language_loss": 0.81659877, "learning_rate": 7.17482415172012e-07, "loss": 0.83818495, "num_input_tokens_seen": 258939425, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 12002, "time_per_iteration": 2.5664713382720947 }, { "auxiliary_loss_clip": 0.01119453, "auxiliary_loss_mlp": 0.01033485, "balance_loss_clip": 1.0197103, "balance_loss_mlp": 1.03890371, "epoch": 0.721659401773636, "flos": 39199045979520.0, "grad_norm": 2.95996732066922, "language_loss": 0.6225, "learning_rate": 7.171925256628609e-07, "loss": 0.64402938, "num_input_tokens_seen": 258960710, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 12003, "time_per_iteration": 2.680575370788574 }, { "auxiliary_loss_clip": 0.01108039, "auxiliary_loss_mlp": 0.01036577, "balance_loss_clip": 1.02290988, "balance_loss_mlp": 1.0364244, "epoch": 0.7217195250263039, "flos": 14939917038720.0, "grad_norm": 2.260500324869042, "language_loss": 0.6855638, "learning_rate": 7.169026819338099e-07, "loss": 0.70700997, "num_input_tokens_seen": 258978475, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 12004, "time_per_iteration": 2.5675389766693115 }, { "auxiliary_loss_clip": 0.01044774, "auxiliary_loss_mlp": 0.01000411, "balance_loss_clip": 0.99910563, "balance_loss_mlp": 1.00494981, "epoch": 0.7217796482789719, "flos": 70869206666880.0, "grad_norm": 0.8565309820536448, "language_loss": 0.54087073, "learning_rate": 7.166128839952006e-07, "loss": 0.56132257, "num_input_tokens_seen": 259037520, "router_z_loss_clip": 0.01306152, "router_z_loss_mlp": 0.21875, "step": 12005, "time_per_iteration": 3.145825147628784 }, { "auxiliary_loss_clip": 0.01134696, "auxiliary_loss_mlp": 0.01035765, "balance_loss_clip": 1.02194858, "balance_loss_mlp": 1.03756821, "epoch": 0.7218397715316398, "flos": 37451525402880.0, "grad_norm": 1.6503514183467376, "language_loss": 0.63410008, "learning_rate": 7.163231318573766e-07, "loss": 0.65580469, "num_input_tokens_seen": 259061325, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7109375, "step": 12006, "time_per_iteration": 2.6722540855407715 }, { "auxiliary_loss_clip": 0.01125501, "auxiliary_loss_mlp": 0.01032904, "balance_loss_clip": 1.02090561, "balance_loss_mlp": 1.03676033, "epoch": 0.7218998947843078, "flos": 22710662559360.0, "grad_norm": 1.6912596237869655, "language_loss": 0.91765833, "learning_rate": 7.160334255306775e-07, "loss": 0.93924242, "num_input_tokens_seen": 259078135, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 12007, "time_per_iteration": 2.563007116317749 }, { "auxiliary_loss_clip": 0.01135651, "auxiliary_loss_mlp": 0.01035235, "balance_loss_clip": 1.02163327, "balance_loss_mlp": 1.03599811, "epoch": 0.7219600180369758, "flos": 12167182477440.0, "grad_norm": 1.9175257962253054, "language_loss": 0.64271021, "learning_rate": 7.15743765025444e-07, "loss": 0.66441905, "num_input_tokens_seen": 259095910, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 12008, "time_per_iteration": 2.6125710010528564 }, { "auxiliary_loss_clip": 0.01117618, "auxiliary_loss_mlp": 0.0103545, "balance_loss_clip": 1.0218184, "balance_loss_mlp": 1.03634071, "epoch": 0.7220201412896438, "flos": 22596573985920.0, "grad_norm": 2.327147393912649, "language_loss": 0.78227675, "learning_rate": 7.154541503520109e-07, "loss": 0.80380744, "num_input_tokens_seen": 259114225, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 12009, "time_per_iteration": 2.669506549835205 }, { "auxiliary_loss_clip": 0.01120303, "auxiliary_loss_mlp": 0.0104026, "balance_loss_clip": 1.02552557, "balance_loss_mlp": 1.03836429, "epoch": 0.7220802645423118, "flos": 26718651884160.0, "grad_norm": 1.8395208963286374, "language_loss": 0.637716, "learning_rate": 7.151645815207152e-07, "loss": 0.65932167, "num_input_tokens_seen": 259134660, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.73046875, "step": 12010, "time_per_iteration": 2.5857620239257812 }, { "auxiliary_loss_clip": 0.01107355, "auxiliary_loss_mlp": 0.01033708, "balance_loss_clip": 1.02047014, "balance_loss_mlp": 1.03607106, "epoch": 0.7221403877949797, "flos": 24420548661120.0, "grad_norm": 1.6952878494218975, "language_loss": 0.77326912, "learning_rate": 7.14875058541891e-07, "loss": 0.79467976, "num_input_tokens_seen": 259153300, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 12011, "time_per_iteration": 2.5511438846588135 }, { "auxiliary_loss_clip": 0.01116648, "auxiliary_loss_mlp": 0.01032337, "balance_loss_clip": 1.02053523, "balance_loss_mlp": 1.03766429, "epoch": 0.7222005110476477, "flos": 23514379326720.0, "grad_norm": 1.8236737786597224, "language_loss": 0.79479897, "learning_rate": 7.145855814258699e-07, "loss": 0.81628883, "num_input_tokens_seen": 259172115, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 12012, "time_per_iteration": 2.5689008235931396 }, { "auxiliary_loss_clip": 0.01128803, "auxiliary_loss_mlp": 0.01031179, "balance_loss_clip": 1.01854253, "balance_loss_mlp": 1.03860617, "epoch": 0.7222606343003156, "flos": 23112538899840.0, "grad_norm": 2.407156566392568, "language_loss": 0.75693762, "learning_rate": 7.142961501829825e-07, "loss": 0.77853751, "num_input_tokens_seen": 259191345, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 12013, "time_per_iteration": 2.57320237159729 }, { "auxiliary_loss_clip": 0.01107089, "auxiliary_loss_mlp": 0.01029446, "balance_loss_clip": 1.01722145, "balance_loss_mlp": 1.03678417, "epoch": 0.7223207575529836, "flos": 24351169541760.0, "grad_norm": 1.4295354752854001, "language_loss": 0.75823367, "learning_rate": 7.140067648235588e-07, "loss": 0.77959895, "num_input_tokens_seen": 259211700, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 12014, "time_per_iteration": 2.578376531600952 }, { "auxiliary_loss_clip": 0.01117548, "auxiliary_loss_mlp": 0.01033554, "balance_loss_clip": 1.02091169, "balance_loss_mlp": 1.03645444, "epoch": 0.7223808808056515, "flos": 28330179569280.0, "grad_norm": 1.5395175055631605, "language_loss": 0.86730242, "learning_rate": 7.137174253579257e-07, "loss": 0.88881344, "num_input_tokens_seen": 259233825, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 12015, "time_per_iteration": 2.657158851623535 }, { "auxiliary_loss_clip": 0.01114319, "auxiliary_loss_mlp": 0.01034833, "balance_loss_clip": 1.02232218, "balance_loss_mlp": 1.03604162, "epoch": 0.7224410040583196, "flos": 21069437304960.0, "grad_norm": 1.887698673597811, "language_loss": 0.77827036, "learning_rate": 7.134281317964091e-07, "loss": 0.79976189, "num_input_tokens_seen": 259253055, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12016, "time_per_iteration": 2.5723397731781006 }, { "auxiliary_loss_clip": 0.01132271, "auxiliary_loss_mlp": 0.01040512, "balance_loss_clip": 1.02770865, "balance_loss_mlp": 1.03618312, "epoch": 0.7225011273109875, "flos": 26795429205120.0, "grad_norm": 1.436006216210064, "language_loss": 0.77697361, "learning_rate": 7.131388841493327e-07, "loss": 0.79870141, "num_input_tokens_seen": 259273420, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 12017, "time_per_iteration": 2.603888750076294 }, { "auxiliary_loss_clip": 0.01103108, "auxiliary_loss_mlp": 0.01028435, "balance_loss_clip": 1.01638913, "balance_loss_mlp": 1.03574979, "epoch": 0.7225612505636555, "flos": 23583578878080.0, "grad_norm": 1.982320938269473, "language_loss": 0.73767304, "learning_rate": 7.128496824270196e-07, "loss": 0.75898838, "num_input_tokens_seen": 259291000, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 12018, "time_per_iteration": 2.691445827484131 }, { "auxiliary_loss_clip": 0.0113075, "auxiliary_loss_mlp": 0.01030938, "balance_loss_clip": 1.01944613, "balance_loss_mlp": 1.03813171, "epoch": 0.7226213738163234, "flos": 20777627214720.0, "grad_norm": 1.6242745432413028, "language_loss": 0.77631378, "learning_rate": 7.125605266397903e-07, "loss": 0.7979306, "num_input_tokens_seen": 259312390, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6640625, "step": 12019, "time_per_iteration": 2.6208159923553467 }, { "auxiliary_loss_clip": 0.01123783, "auxiliary_loss_mlp": 0.01028615, "balance_loss_clip": 1.01582992, "balance_loss_mlp": 1.03582001, "epoch": 0.7226814970689914, "flos": 32635832901120.0, "grad_norm": 1.6231991333923947, "language_loss": 0.73517191, "learning_rate": 7.122714167979635e-07, "loss": 0.75669587, "num_input_tokens_seen": 259332645, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 12020, "time_per_iteration": 4.026370048522949 }, { "auxiliary_loss_clip": 0.01117433, "auxiliary_loss_mlp": 0.01031142, "balance_loss_clip": 1.01863146, "balance_loss_mlp": 1.03525782, "epoch": 0.7227416203216595, "flos": 22454368041600.0, "grad_norm": 1.553772675950699, "language_loss": 0.77111912, "learning_rate": 7.119823529118587e-07, "loss": 0.79260492, "num_input_tokens_seen": 259353810, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73046875, "step": 12021, "time_per_iteration": 2.5902657508850098 }, { "auxiliary_loss_clip": 0.01131941, "auxiliary_loss_mlp": 0.01032181, "balance_loss_clip": 1.019503, "balance_loss_mlp": 1.03557026, "epoch": 0.7228017435743274, "flos": 21652303299840.0, "grad_norm": 2.0407224801179575, "language_loss": 0.6817199, "learning_rate": 7.116933349917892e-07, "loss": 0.70336115, "num_input_tokens_seen": 259372460, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 12022, "time_per_iteration": 2.582306385040283 }, { "auxiliary_loss_clip": 0.01115613, "auxiliary_loss_mlp": 0.01032713, "balance_loss_clip": 1.01917064, "balance_loss_mlp": 1.03587115, "epoch": 0.7228618668269954, "flos": 29533474206720.0, "grad_norm": 1.8529848298108278, "language_loss": 0.69572192, "learning_rate": 7.114043630480713e-07, "loss": 0.71720517, "num_input_tokens_seen": 259393275, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 12023, "time_per_iteration": 2.603586435317993 }, { "auxiliary_loss_clip": 0.01122756, "auxiliary_loss_mlp": 0.0102788, "balance_loss_clip": 1.01498199, "balance_loss_mlp": 1.03447711, "epoch": 0.7229219900796633, "flos": 27453815544960.0, "grad_norm": 1.7549335997397586, "language_loss": 0.71278018, "learning_rate": 7.111154370910164e-07, "loss": 0.73428649, "num_input_tokens_seen": 259416205, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 12024, "time_per_iteration": 2.592195749282837 }, { "auxiliary_loss_clip": 0.01132813, "auxiliary_loss_mlp": 0.01036191, "balance_loss_clip": 1.02376378, "balance_loss_mlp": 1.03668523, "epoch": 0.7229821133323313, "flos": 16289368116480.0, "grad_norm": 1.6982827015873696, "language_loss": 0.75546122, "learning_rate": 7.108265571309376e-07, "loss": 0.77715123, "num_input_tokens_seen": 259433115, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 12025, "time_per_iteration": 2.527120351791382 }, { "auxiliary_loss_clip": 0.01114666, "auxiliary_loss_mlp": 0.01030119, "balance_loss_clip": 1.01816869, "balance_loss_mlp": 1.03734803, "epoch": 0.7230422365849992, "flos": 20412343854720.0, "grad_norm": 2.009477852111088, "language_loss": 0.76323593, "learning_rate": 7.105377231781414e-07, "loss": 0.7846837, "num_input_tokens_seen": 259450475, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.68359375, "step": 12026, "time_per_iteration": 2.6252362728118896 }, { "auxiliary_loss_clip": 0.01115232, "auxiliary_loss_mlp": 0.01040844, "balance_loss_clip": 1.02855968, "balance_loss_mlp": 1.03518891, "epoch": 0.7231023598376672, "flos": 25593499284480.0, "grad_norm": 2.283586437387154, "language_loss": 0.67822951, "learning_rate": 7.102489352429375e-07, "loss": 0.69979024, "num_input_tokens_seen": 259469355, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 12027, "time_per_iteration": 4.258789300918579 }, { "auxiliary_loss_clip": 0.01145393, "auxiliary_loss_mlp": 0.01032347, "balance_loss_clip": 1.01938868, "balance_loss_mlp": 1.03733993, "epoch": 0.7231624830903352, "flos": 25149607009920.0, "grad_norm": 2.1883564811876104, "language_loss": 0.79256666, "learning_rate": 7.099601933356314e-07, "loss": 0.81434405, "num_input_tokens_seen": 259486565, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 12028, "time_per_iteration": 2.6660802364349365 }, { "auxiliary_loss_clip": 0.01132374, "auxiliary_loss_mlp": 0.01027767, "balance_loss_clip": 1.01450539, "balance_loss_mlp": 1.03461409, "epoch": 0.7232226063430032, "flos": 21725740656000.0, "grad_norm": 1.8755364374779135, "language_loss": 0.82382607, "learning_rate": 7.096714974665279e-07, "loss": 0.84542745, "num_input_tokens_seen": 259505070, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 12029, "time_per_iteration": 2.561567544937134 }, { "auxiliary_loss_clip": 0.01043993, "auxiliary_loss_mlp": 0.01001773, "balance_loss_clip": 1.00045538, "balance_loss_mlp": 1.00457382, "epoch": 0.7232827295956711, "flos": 68436798491520.0, "grad_norm": 0.8116506571134082, "language_loss": 0.61756426, "learning_rate": 7.093828476459287e-07, "loss": 0.63802195, "num_input_tokens_seen": 259569135, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.21777344, "step": 12030, "time_per_iteration": 3.287327289581299 }, { "auxiliary_loss_clip": 0.01112579, "auxiliary_loss_mlp": 0.01029802, "balance_loss_clip": 1.01676655, "balance_loss_mlp": 1.0348742, "epoch": 0.7233428528483391, "flos": 20192642317440.0, "grad_norm": 1.6124145649006951, "language_loss": 0.78352094, "learning_rate": 7.090942438841365e-07, "loss": 0.80494475, "num_input_tokens_seen": 259587035, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 12031, "time_per_iteration": 2.549546957015991 }, { "auxiliary_loss_clip": 0.01117686, "auxiliary_loss_mlp": 0.01031048, "balance_loss_clip": 1.01834583, "balance_loss_mlp": 1.0369308, "epoch": 0.723402976101007, "flos": 23949472769280.0, "grad_norm": 1.8323676147600383, "language_loss": 0.81590945, "learning_rate": 7.088056861914509e-07, "loss": 0.83739686, "num_input_tokens_seen": 259606140, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 12032, "time_per_iteration": 4.038147687911987 }, { "auxiliary_loss_clip": 0.0113789, "auxiliary_loss_mlp": 0.01032142, "balance_loss_clip": 1.01950562, "balance_loss_mlp": 1.03792512, "epoch": 0.723463099353675, "flos": 20813394182400.0, "grad_norm": 1.9158360365299725, "language_loss": 0.75289512, "learning_rate": 7.085171745781676e-07, "loss": 0.7745955, "num_input_tokens_seen": 259624275, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73046875, "step": 12033, "time_per_iteration": 4.085987329483032 }, { "auxiliary_loss_clip": 0.01128962, "auxiliary_loss_mlp": 0.01029916, "balance_loss_clip": 1.01813841, "balance_loss_mlp": 1.03420603, "epoch": 0.723523222606343, "flos": 19098013299840.0, "grad_norm": 1.7323107595219327, "language_loss": 0.74975216, "learning_rate": 7.082287090545848e-07, "loss": 0.77134097, "num_input_tokens_seen": 259643465, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 12034, "time_per_iteration": 2.5441834926605225 }, { "auxiliary_loss_clip": 0.01122108, "auxiliary_loss_mlp": 0.01029954, "balance_loss_clip": 1.01829481, "balance_loss_mlp": 1.03499925, "epoch": 0.723583345859011, "flos": 26506994993280.0, "grad_norm": 1.8435231991318406, "language_loss": 0.80111629, "learning_rate": 7.079402896309967e-07, "loss": 0.82263696, "num_input_tokens_seen": 259662500, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6953125, "step": 12035, "time_per_iteration": 2.600916624069214 }, { "auxiliary_loss_clip": 0.01152408, "auxiliary_loss_mlp": 0.01030362, "balance_loss_clip": 1.01764882, "balance_loss_mlp": 1.03721499, "epoch": 0.723643469111679, "flos": 16033863697920.0, "grad_norm": 1.9522117087965876, "language_loss": 0.60765922, "learning_rate": 7.07651916317696e-07, "loss": 0.62948686, "num_input_tokens_seen": 259680140, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12036, "time_per_iteration": 2.5889816284179688 }, { "auxiliary_loss_clip": 0.0111071, "auxiliary_loss_mlp": 0.01033963, "balance_loss_clip": 1.02271533, "balance_loss_mlp": 1.03437638, "epoch": 0.7237035923643469, "flos": 21945549934080.0, "grad_norm": 1.9223302997250507, "language_loss": 0.6787895, "learning_rate": 7.073635891249734e-07, "loss": 0.7002362, "num_input_tokens_seen": 259700160, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6796875, "step": 12037, "time_per_iteration": 2.5777437686920166 }, { "auxiliary_loss_clip": 0.01105528, "auxiliary_loss_mlp": 0.01036179, "balance_loss_clip": 1.02314925, "balance_loss_mlp": 1.03567886, "epoch": 0.7237637156170149, "flos": 23583112001280.0, "grad_norm": 2.0927355183830083, "language_loss": 0.72421354, "learning_rate": 7.070753080631207e-07, "loss": 0.74563068, "num_input_tokens_seen": 259720525, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 12038, "time_per_iteration": 2.5326361656188965 }, { "auxiliary_loss_clip": 0.01106694, "auxiliary_loss_mlp": 0.01033601, "balance_loss_clip": 1.01958227, "balance_loss_mlp": 1.03645158, "epoch": 0.7238238388696828, "flos": 20594698225920.0, "grad_norm": 1.7440734145046481, "language_loss": 0.72302282, "learning_rate": 7.06787073142423e-07, "loss": 0.74442577, "num_input_tokens_seen": 259738680, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.703125, "step": 12039, "time_per_iteration": 2.5213985443115234 }, { "auxiliary_loss_clip": 0.01106594, "auxiliary_loss_mlp": 0.01031937, "balance_loss_clip": 1.0200398, "balance_loss_mlp": 1.03610015, "epoch": 0.7238839621223508, "flos": 24207024263040.0, "grad_norm": 2.2437178389285215, "language_loss": 0.76276439, "learning_rate": 7.06498884373169e-07, "loss": 0.78414971, "num_input_tokens_seen": 259758790, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 12040, "time_per_iteration": 2.4895827770233154 }, { "auxiliary_loss_clip": 0.01132476, "auxiliary_loss_mlp": 0.01034758, "balance_loss_clip": 1.02144802, "balance_loss_mlp": 1.03504562, "epoch": 0.7239440853750188, "flos": 14209745368320.0, "grad_norm": 1.755946628235432, "language_loss": 0.76850075, "learning_rate": 7.062107417656416e-07, "loss": 0.79017305, "num_input_tokens_seen": 259777370, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 12041, "time_per_iteration": 2.5007073879241943 }, { "auxiliary_loss_clip": 0.01114692, "auxiliary_loss_mlp": 0.01026177, "balance_loss_clip": 1.01415515, "balance_loss_mlp": 1.03433776, "epoch": 0.7240042086276868, "flos": 21614812479360.0, "grad_norm": 2.219535754564812, "language_loss": 0.6350081, "learning_rate": 7.059226453301264e-07, "loss": 0.65641677, "num_input_tokens_seen": 259794665, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.71484375, "step": 12042, "time_per_iteration": 2.4896297454833984 }, { "auxiliary_loss_clip": 0.01136912, "auxiliary_loss_mlp": 0.01032901, "balance_loss_clip": 1.0197103, "balance_loss_mlp": 1.03776503, "epoch": 0.7240643318803547, "flos": 23331450337920.0, "grad_norm": 1.9938854537888038, "language_loss": 0.83632457, "learning_rate": 7.056345950769016e-07, "loss": 0.85802269, "num_input_tokens_seen": 259811110, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 12043, "time_per_iteration": 2.5276424884796143 }, { "auxiliary_loss_clip": 0.01133258, "auxiliary_loss_mlp": 0.01029625, "balance_loss_clip": 1.01592231, "balance_loss_mlp": 1.03520286, "epoch": 0.7241244551330227, "flos": 24024849459840.0, "grad_norm": 1.938967680587569, "language_loss": 0.63983893, "learning_rate": 7.053465910162494e-07, "loss": 0.66146779, "num_input_tokens_seen": 259831080, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 12044, "time_per_iteration": 2.5175833702087402 }, { "auxiliary_loss_clip": 0.01107487, "auxiliary_loss_mlp": 0.01036495, "balance_loss_clip": 1.02273273, "balance_loss_mlp": 1.03778076, "epoch": 0.7241845783856906, "flos": 18730323728640.0, "grad_norm": 1.5518829841109374, "language_loss": 0.81681347, "learning_rate": 7.050586331584472e-07, "loss": 0.83825326, "num_input_tokens_seen": 259850135, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.6953125, "step": 12045, "time_per_iteration": 2.5027575492858887 }, { "auxiliary_loss_clip": 0.01102881, "auxiliary_loss_mlp": 0.01029172, "balance_loss_clip": 1.01662552, "balance_loss_mlp": 1.03493547, "epoch": 0.7242447016383586, "flos": 19498668577920.0, "grad_norm": 1.9780076722948017, "language_loss": 0.71726531, "learning_rate": 7.047707215137712e-07, "loss": 0.73858583, "num_input_tokens_seen": 259868185, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 12046, "time_per_iteration": 2.4422028064727783 }, { "auxiliary_loss_clip": 0.01132971, "auxiliary_loss_mlp": 0.01032607, "balance_loss_clip": 1.01943421, "balance_loss_mlp": 1.03516364, "epoch": 0.7243048248910267, "flos": 22163491704960.0, "grad_norm": 1.947823793955726, "language_loss": 0.70938516, "learning_rate": 7.044828560924967e-07, "loss": 0.73104084, "num_input_tokens_seen": 259887055, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 12047, "time_per_iteration": 2.5981521606445312 }, { "auxiliary_loss_clip": 0.01130046, "auxiliary_loss_mlp": 0.01038042, "balance_loss_clip": 1.024822, "balance_loss_mlp": 1.03832483, "epoch": 0.7243649481436946, "flos": 27672762896640.0, "grad_norm": 2.0866300284163266, "language_loss": 0.70014668, "learning_rate": 7.041950369048964e-07, "loss": 0.72182751, "num_input_tokens_seen": 259908295, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 12048, "time_per_iteration": 2.6743850708007812 }, { "auxiliary_loss_clip": 0.01043857, "auxiliary_loss_mlp": 0.00998936, "balance_loss_clip": 0.99755299, "balance_loss_mlp": 1.00407195, "epoch": 0.7244250713963626, "flos": 63244545759360.0, "grad_norm": 0.8170502712985724, "language_loss": 0.53747278, "learning_rate": 7.03907263961242e-07, "loss": 0.55790073, "num_input_tokens_seen": 259968475, "router_z_loss_clip": 0.01385498, "router_z_loss_mlp": 0.21679688, "step": 12049, "time_per_iteration": 3.213332176208496 }, { "auxiliary_loss_clip": 0.01129804, "auxiliary_loss_mlp": 0.01029905, "balance_loss_clip": 1.01529598, "balance_loss_mlp": 1.03712344, "epoch": 0.7244851946490305, "flos": 17967114524160.0, "grad_norm": 2.010129063767852, "language_loss": 0.59988868, "learning_rate": 7.036195372718028e-07, "loss": 0.62148571, "num_input_tokens_seen": 259984865, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.74609375, "step": 12050, "time_per_iteration": 2.529817581176758 }, { "auxiliary_loss_clip": 0.01062594, "auxiliary_loss_mlp": 0.0100213, "balance_loss_clip": 1.00079525, "balance_loss_mlp": 1.00506568, "epoch": 0.7245453179016985, "flos": 70943649603840.0, "grad_norm": 0.7200271423549672, "language_loss": 0.53315395, "learning_rate": 7.033318568468482e-07, "loss": 0.55380124, "num_input_tokens_seen": 260046735, "router_z_loss_clip": 0.0133667, "router_z_loss_mlp": 0.21777344, "step": 12051, "time_per_iteration": 3.2136738300323486 }, { "auxiliary_loss_clip": 0.01113766, "auxiliary_loss_mlp": 0.01031016, "balance_loss_clip": 1.01954234, "balance_loss_mlp": 1.0363009, "epoch": 0.7246054411543664, "flos": 24568464867840.0, "grad_norm": 1.5245015058456266, "language_loss": 0.72010231, "learning_rate": 7.030442226966445e-07, "loss": 0.74155015, "num_input_tokens_seen": 260067950, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.68359375, "step": 12052, "time_per_iteration": 2.548253059387207 }, { "auxiliary_loss_clip": 0.01115183, "auxiliary_loss_mlp": 0.01029901, "balance_loss_clip": 1.01625192, "balance_loss_mlp": 1.03637099, "epoch": 0.7246655644070344, "flos": 32338312548480.0, "grad_norm": 2.342677388376715, "language_loss": 0.7367276, "learning_rate": 7.02756634831456e-07, "loss": 0.75817841, "num_input_tokens_seen": 260087730, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.6953125, "step": 12053, "time_per_iteration": 2.6184394359588623 }, { "auxiliary_loss_clip": 0.01133349, "auxiliary_loss_mlp": 0.01034641, "balance_loss_clip": 1.02140927, "balance_loss_mlp": 1.03586304, "epoch": 0.7247256876597024, "flos": 21872471713920.0, "grad_norm": 1.9669043211799146, "language_loss": 0.78289181, "learning_rate": 7.024690932615458e-07, "loss": 0.80457169, "num_input_tokens_seen": 260107760, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 12054, "time_per_iteration": 2.511096715927124 }, { "auxiliary_loss_clip": 0.01133014, "auxiliary_loss_mlp": 0.01034714, "balance_loss_clip": 1.02165437, "balance_loss_mlp": 1.03687596, "epoch": 0.7247858109123704, "flos": 16213093585920.0, "grad_norm": 4.00125193401666, "language_loss": 0.68434399, "learning_rate": 7.021815979971772e-07, "loss": 0.70602131, "num_input_tokens_seen": 260123660, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 12055, "time_per_iteration": 2.4740967750549316 }, { "auxiliary_loss_clip": 0.01130645, "auxiliary_loss_mlp": 0.01033249, "balance_loss_clip": 1.02103019, "balance_loss_mlp": 1.03499353, "epoch": 0.7248459341650383, "flos": 20850705434880.0, "grad_norm": 1.8255908123874682, "language_loss": 0.73927265, "learning_rate": 7.018941490486079e-07, "loss": 0.76091158, "num_input_tokens_seen": 260142690, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 12056, "time_per_iteration": 2.5293195247650146 }, { "auxiliary_loss_clip": 0.01102388, "auxiliary_loss_mlp": 0.01025487, "balance_loss_clip": 1.01343453, "balance_loss_mlp": 1.03535604, "epoch": 0.7249060574177063, "flos": 25921794614400.0, "grad_norm": 1.4690517727018713, "language_loss": 0.70892048, "learning_rate": 7.016067464260977e-07, "loss": 0.73019922, "num_input_tokens_seen": 260162590, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.671875, "step": 12057, "time_per_iteration": 2.5336108207702637 }, { "auxiliary_loss_clip": 0.01144674, "auxiliary_loss_mlp": 0.01031317, "balance_loss_clip": 1.01801872, "balance_loss_mlp": 1.0375303, "epoch": 0.7249661806703742, "flos": 17345536646400.0, "grad_norm": 1.7735128515899345, "language_loss": 0.62552059, "learning_rate": 7.013193901399024e-07, "loss": 0.64728045, "num_input_tokens_seen": 260181065, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 12058, "time_per_iteration": 2.5585291385650635 }, { "auxiliary_loss_clip": 0.01143733, "auxiliary_loss_mlp": 0.01029593, "balance_loss_clip": 1.01618147, "balance_loss_mlp": 1.0370307, "epoch": 0.7250263039230422, "flos": 19574153009280.0, "grad_norm": 1.8647588017157404, "language_loss": 0.75087947, "learning_rate": 7.010320802002785e-07, "loss": 0.77261269, "num_input_tokens_seen": 260200330, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 12059, "time_per_iteration": 2.545722723007202 }, { "auxiliary_loss_clip": 0.01111286, "auxiliary_loss_mlp": 0.01032454, "balance_loss_clip": 1.02130234, "balance_loss_mlp": 1.03522325, "epoch": 0.7250864271757103, "flos": 21976648133760.0, "grad_norm": 1.570248315611706, "language_loss": 0.79345882, "learning_rate": 7.007448166174772e-07, "loss": 0.81489623, "num_input_tokens_seen": 260219975, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.671875, "step": 12060, "time_per_iteration": 2.574753999710083 }, { "auxiliary_loss_clip": 0.01116224, "auxiliary_loss_mlp": 0.01027626, "balance_loss_clip": 1.01476336, "balance_loss_mlp": 1.03664136, "epoch": 0.7251465504283782, "flos": 25012608537600.0, "grad_norm": 1.8198899285219872, "language_loss": 0.76244223, "learning_rate": 7.004575994017521e-07, "loss": 0.78388071, "num_input_tokens_seen": 260242025, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 12061, "time_per_iteration": 3.9842591285705566 }, { "auxiliary_loss_clip": 0.01104777, "auxiliary_loss_mlp": 0.01278112, "balance_loss_clip": 1.01931024, "balance_loss_mlp": 1.0368619, "epoch": 0.7252066736810462, "flos": 16690131135360.0, "grad_norm": 1.8527964700380581, "language_loss": 0.81268394, "learning_rate": 7.00170428563353e-07, "loss": 0.8365128, "num_input_tokens_seen": 260260015, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 12062, "time_per_iteration": 2.4988908767700195 }, { "auxiliary_loss_clip": 0.01147791, "auxiliary_loss_mlp": 0.01031151, "balance_loss_clip": 1.01731658, "balance_loss_mlp": 1.03873277, "epoch": 0.7252667969337141, "flos": 25703026830720.0, "grad_norm": 2.256248181680209, "language_loss": 0.68839049, "learning_rate": 6.998833041125263e-07, "loss": 0.71017992, "num_input_tokens_seen": 260278635, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 12063, "time_per_iteration": 2.5731887817382812 }, { "auxiliary_loss_clip": 0.01130329, "auxiliary_loss_mlp": 0.01029775, "balance_loss_clip": 1.01728177, "balance_loss_mlp": 1.03543615, "epoch": 0.7253269201863821, "flos": 18259930195200.0, "grad_norm": 1.4442623881201142, "language_loss": 0.69772607, "learning_rate": 6.995962260595207e-07, "loss": 0.71932709, "num_input_tokens_seen": 260298510, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 12064, "time_per_iteration": 2.564300060272217 }, { "auxiliary_loss_clip": 0.01125668, "auxiliary_loss_mlp": 0.01033065, "balance_loss_clip": 1.02029777, "balance_loss_mlp": 1.03817558, "epoch": 0.72538704343905, "flos": 20411805150720.0, "grad_norm": 1.7151268841250882, "language_loss": 0.90348572, "learning_rate": 6.99309194414581e-07, "loss": 0.92507303, "num_input_tokens_seen": 260317405, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 12065, "time_per_iteration": 2.56504487991333 }, { "auxiliary_loss_clip": 0.01035734, "auxiliary_loss_mlp": 0.0099964, "balance_loss_clip": 0.99835885, "balance_loss_mlp": 1.00507784, "epoch": 0.725447166691718, "flos": 70151209706880.0, "grad_norm": 0.6529678498034623, "language_loss": 0.56043106, "learning_rate": 6.990222091879506e-07, "loss": 0.58078474, "num_input_tokens_seen": 260388085, "router_z_loss_clip": 0.01281738, "router_z_loss_mlp": 0.21777344, "step": 12066, "time_per_iteration": 3.2560884952545166 }, { "auxiliary_loss_clip": 0.01138892, "auxiliary_loss_mlp": 0.01031547, "balance_loss_clip": 1.01975727, "balance_loss_mlp": 1.0340966, "epoch": 0.725507289944386, "flos": 27052334254080.0, "grad_norm": 1.8376718504745708, "language_loss": 0.76836759, "learning_rate": 6.987352703898699e-07, "loss": 0.79007196, "num_input_tokens_seen": 260406165, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6953125, "step": 12067, "time_per_iteration": 2.60367751121521 }, { "auxiliary_loss_clip": 0.01043654, "auxiliary_loss_mlp": 0.01001272, "balance_loss_clip": 0.99984187, "balance_loss_mlp": 1.00446081, "epoch": 0.725567413197054, "flos": 62921924778240.0, "grad_norm": 0.722228435268518, "language_loss": 0.57022738, "learning_rate": 6.984483780305812e-07, "loss": 0.59067667, "num_input_tokens_seen": 260461365, "router_z_loss_clip": 0.01428223, "router_z_loss_mlp": 0.21679688, "step": 12068, "time_per_iteration": 3.0346529483795166 }, { "auxiliary_loss_clip": 0.01061682, "auxiliary_loss_mlp": 0.01000012, "balance_loss_clip": 0.99876577, "balance_loss_mlp": 1.00465226, "epoch": 0.7256275364497219, "flos": 60295957188480.0, "grad_norm": 0.6628572511396886, "language_loss": 0.55459177, "learning_rate": 6.981615321203216e-07, "loss": 0.57520878, "num_input_tokens_seen": 260523795, "router_z_loss_clip": 0.01245117, "router_z_loss_mlp": 0.21679688, "step": 12069, "time_per_iteration": 4.657979249954224 }, { "auxiliary_loss_clip": 0.01130158, "auxiliary_loss_mlp": 0.0102544, "balance_loss_clip": 1.01410306, "balance_loss_mlp": 1.03659511, "epoch": 0.7256876597023899, "flos": 24498511130880.0, "grad_norm": 1.74072760360784, "language_loss": 0.79616523, "learning_rate": 6.978747326693283e-07, "loss": 0.81772119, "num_input_tokens_seen": 260544765, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 12070, "time_per_iteration": 2.654818296432495 }, { "auxiliary_loss_clip": 0.01118679, "auxiliary_loss_mlp": 0.01034999, "balance_loss_clip": 1.02318573, "balance_loss_mlp": 1.0359621, "epoch": 0.7257477829550578, "flos": 24352749740160.0, "grad_norm": 1.7865959727209393, "language_loss": 0.71780765, "learning_rate": 6.975879796878357e-07, "loss": 0.73934442, "num_input_tokens_seen": 260564340, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6484375, "step": 12071, "time_per_iteration": 2.5891671180725098 }, { "auxiliary_loss_clip": 0.01104315, "auxiliary_loss_mlp": 0.0103581, "balance_loss_clip": 1.02400208, "balance_loss_mlp": 1.03653955, "epoch": 0.7258079062077258, "flos": 17202217380480.0, "grad_norm": 1.8937547226141271, "language_loss": 0.70372897, "learning_rate": 6.973012731860792e-07, "loss": 0.7251302, "num_input_tokens_seen": 260582565, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 12072, "time_per_iteration": 2.5634078979492188 }, { "auxiliary_loss_clip": 0.01134775, "auxiliary_loss_mlp": 0.01030008, "balance_loss_clip": 1.01694226, "balance_loss_mlp": 1.03606868, "epoch": 0.7258680294603939, "flos": 21580338401280.0, "grad_norm": 1.8405207206957799, "language_loss": 0.78424037, "learning_rate": 6.97014613174288e-07, "loss": 0.80588824, "num_input_tokens_seen": 260601700, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 12073, "time_per_iteration": 2.6744272708892822 }, { "auxiliary_loss_clip": 0.01121772, "auxiliary_loss_mlp": 0.01026919, "balance_loss_clip": 1.01441419, "balance_loss_mlp": 1.03621268, "epoch": 0.7259281527130618, "flos": 34855399036800.0, "grad_norm": 1.488086646982578, "language_loss": 0.70363998, "learning_rate": 6.967279996626943e-07, "loss": 0.72512686, "num_input_tokens_seen": 260623040, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 12074, "time_per_iteration": 5.596317768096924 }, { "auxiliary_loss_clip": 0.01124441, "auxiliary_loss_mlp": 0.01030318, "balance_loss_clip": 1.01724076, "balance_loss_mlp": 1.03577256, "epoch": 0.7259882759657298, "flos": 25404644551680.0, "grad_norm": 3.5320018860371505, "language_loss": 0.74102175, "learning_rate": 6.964414326615251e-07, "loss": 0.76256931, "num_input_tokens_seen": 260642735, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12075, "time_per_iteration": 2.5606584548950195 }, { "auxiliary_loss_clip": 0.01115364, "auxiliary_loss_mlp": 0.01035843, "balance_loss_clip": 1.02270603, "balance_loss_mlp": 1.03633928, "epoch": 0.7260483992183977, "flos": 62953630531200.0, "grad_norm": 1.4546000217115571, "language_loss": 0.63562763, "learning_rate": 6.961549121810095e-07, "loss": 0.65713978, "num_input_tokens_seen": 260669935, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 12076, "time_per_iteration": 2.8700671195983887 }, { "auxiliary_loss_clip": 0.01130927, "auxiliary_loss_mlp": 0.01027222, "balance_loss_clip": 1.01482987, "balance_loss_mlp": 1.03440189, "epoch": 0.7261085224710657, "flos": 26467528924800.0, "grad_norm": 1.7838340932565553, "language_loss": 0.79203463, "learning_rate": 6.958684382313704e-07, "loss": 0.8136161, "num_input_tokens_seen": 260689605, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 12077, "time_per_iteration": 2.5833022594451904 }, { "auxiliary_loss_clip": 0.01026265, "auxiliary_loss_mlp": 0.01247715, "balance_loss_clip": 1.00047112, "balance_loss_mlp": 1.00446534, "epoch": 0.7261686457237336, "flos": 66772732187520.0, "grad_norm": 0.8865773862541374, "language_loss": 0.6492185, "learning_rate": 6.955820108228314e-07, "loss": 0.67195833, "num_input_tokens_seen": 260748265, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.21875, "step": 12078, "time_per_iteration": 3.1296133995056152 }, { "auxiliary_loss_clip": 0.01035084, "auxiliary_loss_mlp": 0.01002146, "balance_loss_clip": 1.00090075, "balance_loss_mlp": 1.00467372, "epoch": 0.7262287689764017, "flos": 69999594399360.0, "grad_norm": 0.7186811468341892, "language_loss": 0.59250677, "learning_rate": 6.952956299656166e-07, "loss": 0.61287904, "num_input_tokens_seen": 260816715, "router_z_loss_clip": 0.01245117, "router_z_loss_mlp": 0.21679688, "step": 12079, "time_per_iteration": 3.3328256607055664 }, { "auxiliary_loss_clip": 0.01138895, "auxiliary_loss_mlp": 0.01035135, "balance_loss_clip": 1.02170634, "balance_loss_mlp": 1.03514051, "epoch": 0.7262888922290696, "flos": 23805435231360.0, "grad_norm": 1.9855307101334316, "language_loss": 0.64710474, "learning_rate": 6.950092956699432e-07, "loss": 0.66884512, "num_input_tokens_seen": 260836765, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6875, "step": 12080, "time_per_iteration": 2.6180267333984375 }, { "auxiliary_loss_clip": 0.01140822, "auxiliary_loss_mlp": 0.01029781, "balance_loss_clip": 1.01666152, "balance_loss_mlp": 1.0346756, "epoch": 0.7263490154817376, "flos": 19500320603520.0, "grad_norm": 2.5192230533245845, "language_loss": 0.70345008, "learning_rate": 6.947230079460317e-07, "loss": 0.72515607, "num_input_tokens_seen": 260854610, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 12081, "time_per_iteration": 2.5297863483428955 }, { "auxiliary_loss_clip": 0.01109227, "auxiliary_loss_mlp": 0.01030232, "balance_loss_clip": 1.01787555, "balance_loss_mlp": 1.03782845, "epoch": 0.7264091387344055, "flos": 16286243633280.0, "grad_norm": 1.8318020059857356, "language_loss": 0.81357998, "learning_rate": 6.944367668040987e-07, "loss": 0.83497453, "num_input_tokens_seen": 260871620, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 12082, "time_per_iteration": 2.4581754207611084 }, { "auxiliary_loss_clip": 0.01120761, "auxiliary_loss_mlp": 0.01034012, "balance_loss_clip": 1.02032697, "balance_loss_mlp": 1.03638446, "epoch": 0.7264692619870735, "flos": 24352031468160.0, "grad_norm": 1.7148364610266185, "language_loss": 0.77181196, "learning_rate": 6.941505722543592e-07, "loss": 0.79335964, "num_input_tokens_seen": 260890490, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75390625, "step": 12083, "time_per_iteration": 2.550952196121216 }, { "auxiliary_loss_clip": 0.01108043, "auxiliary_loss_mlp": 0.01030109, "balance_loss_clip": 1.01756859, "balance_loss_mlp": 1.03745532, "epoch": 0.7265293852397414, "flos": 25119478477440.0, "grad_norm": 1.8142633239116064, "language_loss": 0.72564918, "learning_rate": 6.93864424307026e-07, "loss": 0.74703074, "num_input_tokens_seen": 260909700, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 12084, "time_per_iteration": 2.5109002590179443 }, { "auxiliary_loss_clip": 0.01109318, "auxiliary_loss_mlp": 0.01032637, "balance_loss_clip": 1.01985192, "balance_loss_mlp": 1.0378747, "epoch": 0.7265895084924094, "flos": 22638230784000.0, "grad_norm": 1.800808480334448, "language_loss": 0.77833271, "learning_rate": 6.935783229723125e-07, "loss": 0.79975224, "num_input_tokens_seen": 260929090, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 12085, "time_per_iteration": 2.5305440425872803 }, { "auxiliary_loss_clip": 0.01114591, "auxiliary_loss_mlp": 0.0103327, "balance_loss_clip": 1.02092564, "balance_loss_mlp": 1.03510332, "epoch": 0.7266496317450775, "flos": 23368222886400.0, "grad_norm": 1.7628286277502383, "language_loss": 0.72722989, "learning_rate": 6.932922682604279e-07, "loss": 0.74870849, "num_input_tokens_seen": 260946615, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 12086, "time_per_iteration": 2.5400214195251465 }, { "auxiliary_loss_clip": 0.01132888, "auxiliary_loss_mlp": 0.0103209, "balance_loss_clip": 1.01978207, "balance_loss_mlp": 1.03704906, "epoch": 0.7267097549977454, "flos": 28074603323520.0, "grad_norm": 1.693512018320774, "language_loss": 0.69562745, "learning_rate": 6.930062601815811e-07, "loss": 0.71727729, "num_input_tokens_seen": 260968515, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 12087, "time_per_iteration": 2.590493679046631 }, { "auxiliary_loss_clip": 0.01108526, "auxiliary_loss_mlp": 0.01035638, "balance_loss_clip": 1.02182174, "balance_loss_mlp": 1.0367806, "epoch": 0.7267698782504134, "flos": 22195523658240.0, "grad_norm": 2.6931674827717966, "language_loss": 0.78781152, "learning_rate": 6.927202987459781e-07, "loss": 0.8092531, "num_input_tokens_seen": 260986790, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 12088, "time_per_iteration": 2.474607229232788 }, { "auxiliary_loss_clip": 0.01115167, "auxiliary_loss_mlp": 0.01034851, "balance_loss_clip": 1.0226562, "balance_loss_mlp": 1.03511953, "epoch": 0.7268300015030813, "flos": 18514859996160.0, "grad_norm": 4.337941466141017, "language_loss": 0.73937315, "learning_rate": 6.924343839638264e-07, "loss": 0.76087332, "num_input_tokens_seen": 261004925, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 12089, "time_per_iteration": 2.485687017440796 }, { "auxiliary_loss_clip": 0.01126012, "auxiliary_loss_mlp": 0.01033199, "balance_loss_clip": 1.02050924, "balance_loss_mlp": 1.03722107, "epoch": 0.7268901247557493, "flos": 23986029836160.0, "grad_norm": 1.8475354151662118, "language_loss": 0.71409667, "learning_rate": 6.921485158453268e-07, "loss": 0.73568881, "num_input_tokens_seen": 261023895, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 12090, "time_per_iteration": 2.5728933811187744 }, { "auxiliary_loss_clip": 0.01150361, "auxiliary_loss_mlp": 0.01033788, "balance_loss_clip": 1.02054405, "balance_loss_mlp": 1.03519821, "epoch": 0.7269502480084172, "flos": 32088087429120.0, "grad_norm": 1.7826129527349523, "language_loss": 0.77149332, "learning_rate": 6.918626944006831e-07, "loss": 0.79333484, "num_input_tokens_seen": 261045445, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 12091, "time_per_iteration": 2.6570494174957275 }, { "auxiliary_loss_clip": 0.0111398, "auxiliary_loss_mlp": 0.01277597, "balance_loss_clip": 1.01825678, "balance_loss_mlp": 1.03482056, "epoch": 0.7270103712610853, "flos": 19062785036160.0, "grad_norm": 1.8099275745813017, "language_loss": 0.71220112, "learning_rate": 6.915769196400956e-07, "loss": 0.73611689, "num_input_tokens_seen": 261064275, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 12092, "time_per_iteration": 2.5305562019348145 }, { "auxiliary_loss_clip": 0.01113169, "auxiliary_loss_mlp": 0.01029042, "balance_loss_clip": 1.01651883, "balance_loss_mlp": 1.03581262, "epoch": 0.7270704945137532, "flos": 34532921710080.0, "grad_norm": 10.814941803500165, "language_loss": 0.61104679, "learning_rate": 6.912911915737607e-07, "loss": 0.63246888, "num_input_tokens_seen": 261083310, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 12093, "time_per_iteration": 2.6269924640655518 }, { "auxiliary_loss_clip": 0.01122447, "auxiliary_loss_mlp": 0.01036796, "balance_loss_clip": 1.02404094, "balance_loss_mlp": 1.03665209, "epoch": 0.7271306177664212, "flos": 21507583403520.0, "grad_norm": 1.5880642728998717, "language_loss": 0.75386679, "learning_rate": 6.910055102118775e-07, "loss": 0.77545923, "num_input_tokens_seen": 261103460, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 12094, "time_per_iteration": 2.592773199081421 }, { "auxiliary_loss_clip": 0.01129995, "auxiliary_loss_mlp": 0.01032127, "balance_loss_clip": 1.02009916, "balance_loss_mlp": 1.03411794, "epoch": 0.7271907410190891, "flos": 22272444633600.0, "grad_norm": 1.864183602104722, "language_loss": 0.84940052, "learning_rate": 6.907198755646397e-07, "loss": 0.87102175, "num_input_tokens_seen": 261121375, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 12095, "time_per_iteration": 2.647001028060913 }, { "auxiliary_loss_clip": 0.01114088, "auxiliary_loss_mlp": 0.01039329, "balance_loss_clip": 1.0255487, "balance_loss_mlp": 1.03507411, "epoch": 0.7272508642717571, "flos": 22893124671360.0, "grad_norm": 2.0179289269627962, "language_loss": 0.77654511, "learning_rate": 6.904342876422433e-07, "loss": 0.79807925, "num_input_tokens_seen": 261141105, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.703125, "step": 12096, "time_per_iteration": 2.5683672428131104 }, { "auxiliary_loss_clip": 0.01107711, "auxiliary_loss_mlp": 0.010367, "balance_loss_clip": 1.02412319, "balance_loss_mlp": 1.03833747, "epoch": 0.727310987524425, "flos": 11655886331520.0, "grad_norm": 1.9781088270649259, "language_loss": 0.72305042, "learning_rate": 6.90148746454877e-07, "loss": 0.7444945, "num_input_tokens_seen": 261159255, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 12097, "time_per_iteration": 2.4976015090942383 }, { "auxiliary_loss_clip": 0.01115894, "auxiliary_loss_mlp": 0.01285714, "balance_loss_clip": 1.02562118, "balance_loss_mlp": 1.03645217, "epoch": 0.727371110777093, "flos": 24535319592960.0, "grad_norm": 1.7392510289569794, "language_loss": 0.7656247, "learning_rate": 6.898632520127334e-07, "loss": 0.78964078, "num_input_tokens_seen": 261177960, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 12098, "time_per_iteration": 2.516009569168091 }, { "auxiliary_loss_clip": 0.01125435, "auxiliary_loss_mlp": 0.01030812, "balance_loss_clip": 1.01738858, "balance_loss_mlp": 1.03570771, "epoch": 0.7274312340297611, "flos": 74739835405440.0, "grad_norm": 1.8571483028311426, "language_loss": 0.67790306, "learning_rate": 6.895778043260001e-07, "loss": 0.69946551, "num_input_tokens_seen": 261205660, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 12099, "time_per_iteration": 2.9115042686462402 }, { "auxiliary_loss_clip": 0.01119359, "auxiliary_loss_mlp": 0.01035334, "balance_loss_clip": 1.02079654, "balance_loss_mlp": 1.03545046, "epoch": 0.727491357282429, "flos": 22342865247360.0, "grad_norm": 1.9666120435501395, "language_loss": 0.72445512, "learning_rate": 6.892924034048644e-07, "loss": 0.74600208, "num_input_tokens_seen": 261225185, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.74609375, "step": 12100, "time_per_iteration": 2.497751235961914 }, { "auxiliary_loss_clip": 0.0111567, "auxiliary_loss_mlp": 0.01035572, "balance_loss_clip": 1.02328134, "balance_loss_mlp": 1.03727531, "epoch": 0.727551480535097, "flos": 23297550877440.0, "grad_norm": 1.3140600399607534, "language_loss": 0.74603081, "learning_rate": 6.890070492595104e-07, "loss": 0.7675432, "num_input_tokens_seen": 261247965, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 12101, "time_per_iteration": 2.568650245666504 }, { "auxiliary_loss_clip": 0.01133851, "auxiliary_loss_mlp": 0.01028671, "balance_loss_clip": 1.01731682, "balance_loss_mlp": 1.03794241, "epoch": 0.7276116037877649, "flos": 21470559459840.0, "grad_norm": 1.715812065471722, "language_loss": 0.8241691, "learning_rate": 6.887217419001232e-07, "loss": 0.84579432, "num_input_tokens_seen": 261267585, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.69140625, "step": 12102, "time_per_iteration": 2.5017659664154053 }, { "auxiliary_loss_clip": 0.01113869, "auxiliary_loss_mlp": 0.01035055, "balance_loss_clip": 1.02296686, "balance_loss_mlp": 1.03603268, "epoch": 0.7276717270404329, "flos": 21464059098240.0, "grad_norm": 2.0049021654497468, "language_loss": 0.82023621, "learning_rate": 6.884364813368841e-07, "loss": 0.84172547, "num_input_tokens_seen": 261285200, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 12103, "time_per_iteration": 3.876591682434082 }, { "auxiliary_loss_clip": 0.01124797, "auxiliary_loss_mlp": 0.01029122, "balance_loss_clip": 1.01557374, "balance_loss_mlp": 1.03719115, "epoch": 0.7277318502931008, "flos": 16837221329280.0, "grad_norm": 2.079786910641987, "language_loss": 0.66450858, "learning_rate": 6.881512675799735e-07, "loss": 0.68604779, "num_input_tokens_seen": 261303645, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69921875, "step": 12104, "time_per_iteration": 2.5197389125823975 }, { "auxiliary_loss_clip": 0.01129544, "auxiliary_loss_mlp": 0.01032473, "balance_loss_clip": 1.02018833, "balance_loss_mlp": 1.03602874, "epoch": 0.7277919735457689, "flos": 33400550476800.0, "grad_norm": 1.7997341557913078, "language_loss": 0.65891135, "learning_rate": 6.878661006395687e-07, "loss": 0.6805315, "num_input_tokens_seen": 261323265, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.66796875, "step": 12105, "time_per_iteration": 2.657062530517578 }, { "auxiliary_loss_clip": 0.01108004, "auxiliary_loss_mlp": 0.01029336, "balance_loss_clip": 1.016837, "balance_loss_mlp": 1.03826606, "epoch": 0.7278520967984368, "flos": 19206499351680.0, "grad_norm": 2.0681447604917556, "language_loss": 0.75576162, "learning_rate": 6.875809805258488e-07, "loss": 0.77713501, "num_input_tokens_seen": 261339745, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12106, "time_per_iteration": 2.4423675537109375 }, { "auxiliary_loss_clip": 0.01118315, "auxiliary_loss_mlp": 0.01033058, "balance_loss_clip": 1.02019572, "balance_loss_mlp": 1.03854346, "epoch": 0.7279122200511048, "flos": 34094667870720.0, "grad_norm": 6.542937475331887, "language_loss": 0.70527142, "learning_rate": 6.872959072489872e-07, "loss": 0.72678518, "num_input_tokens_seen": 261359310, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 12107, "time_per_iteration": 2.619032859802246 }, { "auxiliary_loss_clip": 0.01114375, "auxiliary_loss_mlp": 0.01033405, "balance_loss_clip": 1.02095985, "balance_loss_mlp": 1.03589964, "epoch": 0.7279723433037727, "flos": 54599049348480.0, "grad_norm": 2.823187227293482, "language_loss": 0.75615036, "learning_rate": 6.870108808191574e-07, "loss": 0.77762818, "num_input_tokens_seen": 261384640, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 12108, "time_per_iteration": 2.8001954555511475 }, { "auxiliary_loss_clip": 0.01105926, "auxiliary_loss_mlp": 0.0103116, "balance_loss_clip": 1.01841044, "balance_loss_mlp": 1.03582883, "epoch": 0.7280324665564407, "flos": 36137482156800.0, "grad_norm": 1.6065660904065462, "language_loss": 0.67157024, "learning_rate": 6.867259012465331e-07, "loss": 0.69294107, "num_input_tokens_seen": 261405290, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12109, "time_per_iteration": 2.637219190597534 }, { "auxiliary_loss_clip": 0.01133637, "auxiliary_loss_mlp": 0.01030062, "balance_loss_clip": 1.01702082, "balance_loss_mlp": 1.03730857, "epoch": 0.7280925898091086, "flos": 11618539165440.0, "grad_norm": 2.0612767294463277, "language_loss": 0.63620019, "learning_rate": 6.864409685412822e-07, "loss": 0.65783715, "num_input_tokens_seen": 261419710, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 12110, "time_per_iteration": 3.8894941806793213 }, { "auxiliary_loss_clip": 0.01133096, "auxiliary_loss_mlp": 0.01030626, "balance_loss_clip": 1.01734638, "balance_loss_mlp": 1.03420854, "epoch": 0.7281527130617766, "flos": 34277094069120.0, "grad_norm": 2.03716007317814, "language_loss": 0.5852592, "learning_rate": 6.861560827135746e-07, "loss": 0.6068964, "num_input_tokens_seen": 261442385, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 12111, "time_per_iteration": 2.6279335021972656 }, { "auxiliary_loss_clip": 0.01152739, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 1.0193522, "balance_loss_mlp": 1.03790188, "epoch": 0.7282128363144446, "flos": 13918043018880.0, "grad_norm": 2.2506866639672047, "language_loss": 0.73845744, "learning_rate": 6.858712437735761e-07, "loss": 0.76030666, "num_input_tokens_seen": 261459805, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 12112, "time_per_iteration": 2.6105785369873047 }, { "auxiliary_loss_clip": 0.01111452, "auxiliary_loss_mlp": 0.01031148, "balance_loss_clip": 1.01916122, "balance_loss_mlp": 1.03394437, "epoch": 0.7282729595671126, "flos": 20777627214720.0, "grad_norm": 2.078779629123679, "language_loss": 0.66751546, "learning_rate": 6.855864517314541e-07, "loss": 0.68894148, "num_input_tokens_seen": 261477175, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.68359375, "step": 12113, "time_per_iteration": 2.5075364112854004 }, { "auxiliary_loss_clip": 0.01132308, "auxiliary_loss_mlp": 0.01033742, "balance_loss_clip": 1.02034283, "balance_loss_mlp": 1.03440213, "epoch": 0.7283330828197806, "flos": 16325422392960.0, "grad_norm": 1.5918374725552815, "language_loss": 0.73102134, "learning_rate": 6.853017065973692e-07, "loss": 0.75268185, "num_input_tokens_seen": 261494990, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 12114, "time_per_iteration": 2.5412821769714355 }, { "auxiliary_loss_clip": 0.01122879, "auxiliary_loss_mlp": 0.01031616, "balance_loss_clip": 1.01852095, "balance_loss_mlp": 1.03537929, "epoch": 0.7283932060724485, "flos": 27490193043840.0, "grad_norm": 2.0487220933503263, "language_loss": 0.68390346, "learning_rate": 6.850170083814852e-07, "loss": 0.70544839, "num_input_tokens_seen": 261514445, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 12115, "time_per_iteration": 4.066434860229492 }, { "auxiliary_loss_clip": 0.01107284, "auxiliary_loss_mlp": 0.01030352, "balance_loss_clip": 1.01784134, "balance_loss_mlp": 1.03590953, "epoch": 0.7284533293251165, "flos": 18367877543040.0, "grad_norm": 2.0422857501598726, "language_loss": 0.59706986, "learning_rate": 6.847323570939616e-07, "loss": 0.61844623, "num_input_tokens_seen": 261533565, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 12116, "time_per_iteration": 4.035883903503418 }, { "auxiliary_loss_clip": 0.01113841, "auxiliary_loss_mlp": 0.01030914, "balance_loss_clip": 1.01863527, "balance_loss_mlp": 1.03630006, "epoch": 0.7285134525777844, "flos": 21725525174400.0, "grad_norm": 2.3762601496677083, "language_loss": 0.73128176, "learning_rate": 6.844477527449568e-07, "loss": 0.7527293, "num_input_tokens_seen": 261553795, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 12117, "time_per_iteration": 2.5249693393707275 }, { "auxiliary_loss_clip": 0.01115049, "auxiliary_loss_mlp": 0.01028172, "balance_loss_clip": 1.01600623, "balance_loss_mlp": 1.03601575, "epoch": 0.7285735758304525, "flos": 20741357456640.0, "grad_norm": 1.913736293050699, "language_loss": 0.69509053, "learning_rate": 6.841631953446272e-07, "loss": 0.71652269, "num_input_tokens_seen": 261572565, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69921875, "step": 12118, "time_per_iteration": 2.5301437377929688 }, { "auxiliary_loss_clip": 0.0113721, "auxiliary_loss_mlp": 0.01032667, "balance_loss_clip": 1.02118123, "balance_loss_mlp": 1.03513849, "epoch": 0.7286336990831204, "flos": 17310954827520.0, "grad_norm": 1.8066036897126674, "language_loss": 0.84047025, "learning_rate": 6.838786849031291e-07, "loss": 0.86216903, "num_input_tokens_seen": 261590910, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6640625, "step": 12119, "time_per_iteration": 2.5849099159240723 }, { "auxiliary_loss_clip": 0.01105992, "auxiliary_loss_mlp": 0.01029609, "balance_loss_clip": 1.0177716, "balance_loss_mlp": 1.035918, "epoch": 0.7286938223357884, "flos": 19787390098560.0, "grad_norm": 2.036533301116985, "language_loss": 0.81865132, "learning_rate": 6.835942214306151e-07, "loss": 0.84000731, "num_input_tokens_seen": 261606005, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 12120, "time_per_iteration": 2.4592278003692627 }, { "auxiliary_loss_clip": 0.01135712, "auxiliary_loss_mlp": 0.01038101, "balance_loss_clip": 1.02591205, "balance_loss_mlp": 1.0386796, "epoch": 0.7287539455884563, "flos": 15340859625600.0, "grad_norm": 1.8109594380341894, "language_loss": 0.78909093, "learning_rate": 6.833098049372375e-07, "loss": 0.81082904, "num_input_tokens_seen": 261622305, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 12121, "time_per_iteration": 2.514432191848755 }, { "auxiliary_loss_clip": 0.01106454, "auxiliary_loss_mlp": 0.01032571, "balance_loss_clip": 1.01961899, "balance_loss_mlp": 1.03423715, "epoch": 0.7288140688411243, "flos": 25192484870400.0, "grad_norm": 2.347402280205935, "language_loss": 0.69248348, "learning_rate": 6.830254354331458e-07, "loss": 0.71387374, "num_input_tokens_seen": 261642465, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 12122, "time_per_iteration": 2.51283597946167 }, { "auxiliary_loss_clip": 0.01136923, "auxiliary_loss_mlp": 0.01033267, "balance_loss_clip": 1.0214057, "balance_loss_mlp": 1.03393352, "epoch": 0.7288741920937922, "flos": 23984162328960.0, "grad_norm": 1.6732520434762828, "language_loss": 0.8728956, "learning_rate": 6.827411129284886e-07, "loss": 0.89459753, "num_input_tokens_seen": 261661420, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.67578125, "step": 12123, "time_per_iteration": 2.5632426738739014 }, { "auxiliary_loss_clip": 0.01130078, "auxiliary_loss_mlp": 0.01032397, "balance_loss_clip": 1.02035129, "balance_loss_mlp": 1.03471088, "epoch": 0.7289343153464602, "flos": 22744921155840.0, "grad_norm": 3.050531104144931, "language_loss": 0.82849872, "learning_rate": 6.824568374334125e-07, "loss": 0.85012347, "num_input_tokens_seen": 261680865, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 12124, "time_per_iteration": 2.5630156993865967 }, { "auxiliary_loss_clip": 0.01122347, "auxiliary_loss_mlp": 0.01030163, "balance_loss_clip": 1.01811695, "balance_loss_mlp": 1.03647017, "epoch": 0.7289944385991282, "flos": 24900028335360.0, "grad_norm": 1.9973659134607713, "language_loss": 0.6713798, "learning_rate": 6.821726089580624e-07, "loss": 0.69290489, "num_input_tokens_seen": 261701455, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 12125, "time_per_iteration": 2.5733213424682617 }, { "auxiliary_loss_clip": 0.0111832, "auxiliary_loss_mlp": 0.01036677, "balance_loss_clip": 1.02309299, "balance_loss_mlp": 1.03636694, "epoch": 0.7290545618517962, "flos": 22967064817920.0, "grad_norm": 2.333006758721315, "language_loss": 0.75044537, "learning_rate": 6.818884275125831e-07, "loss": 0.77199531, "num_input_tokens_seen": 261721260, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 12126, "time_per_iteration": 2.5592172145843506 }, { "auxiliary_loss_clip": 0.01045971, "auxiliary_loss_mlp": 0.01000979, "balance_loss_clip": 0.99966192, "balance_loss_mlp": 1.00676799, "epoch": 0.7291146851044642, "flos": 61901523216000.0, "grad_norm": 0.8172941201977943, "language_loss": 0.58688819, "learning_rate": 6.816042931071142e-07, "loss": 0.60735768, "num_input_tokens_seen": 261779370, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.21679688, "step": 12127, "time_per_iteration": 3.100898504257202 }, { "auxiliary_loss_clip": 0.0113096, "auxiliary_loss_mlp": 0.010285, "balance_loss_clip": 1.01717472, "balance_loss_mlp": 1.03720117, "epoch": 0.7291748083571321, "flos": 23330947547520.0, "grad_norm": 2.0574567322603228, "language_loss": 0.6851244, "learning_rate": 6.813202057517973e-07, "loss": 0.70671898, "num_input_tokens_seen": 261798050, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 12128, "time_per_iteration": 2.6231446266174316 }, { "auxiliary_loss_clip": 0.01126329, "auxiliary_loss_mlp": 0.01032015, "balance_loss_clip": 1.01908088, "balance_loss_mlp": 1.03727889, "epoch": 0.7292349316098001, "flos": 28330000001280.0, "grad_norm": 2.305193879302649, "language_loss": 0.6526165, "learning_rate": 6.810361654567695e-07, "loss": 0.67419994, "num_input_tokens_seen": 261817660, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 12129, "time_per_iteration": 2.5715737342834473 }, { "auxiliary_loss_clip": 0.01102724, "auxiliary_loss_mlp": 0.01028201, "balance_loss_clip": 1.01546335, "balance_loss_mlp": 1.03565264, "epoch": 0.729295054862468, "flos": 24132222190080.0, "grad_norm": 2.060531661394977, "language_loss": 0.73978841, "learning_rate": 6.807521722321697e-07, "loss": 0.76109767, "num_input_tokens_seen": 261837935, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.671875, "step": 12130, "time_per_iteration": 2.5228214263916016 }, { "auxiliary_loss_clip": 0.01122667, "auxiliary_loss_mlp": 0.01031624, "balance_loss_clip": 1.01892185, "balance_loss_mlp": 1.03550911, "epoch": 0.7293551781151361, "flos": 22816239609600.0, "grad_norm": 1.619295832818318, "language_loss": 0.69596779, "learning_rate": 6.804682260881298e-07, "loss": 0.71751076, "num_input_tokens_seen": 261857575, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 12131, "time_per_iteration": 2.550715923309326 }, { "auxiliary_loss_clip": 0.01130905, "auxiliary_loss_mlp": 0.0103302, "balance_loss_clip": 1.02073598, "balance_loss_mlp": 1.03692985, "epoch": 0.729415301367804, "flos": 22126683242880.0, "grad_norm": 3.0757593730882395, "language_loss": 0.77204251, "learning_rate": 6.801843270347854e-07, "loss": 0.7936818, "num_input_tokens_seen": 261877265, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 12132, "time_per_iteration": 2.5609304904937744 }, { "auxiliary_loss_clip": 0.01114896, "auxiliary_loss_mlp": 0.0103402, "balance_loss_clip": 1.02200341, "balance_loss_mlp": 1.03667474, "epoch": 0.729475424620472, "flos": 12349608675840.0, "grad_norm": 1.9841450396420826, "language_loss": 0.79308385, "learning_rate": 6.799004750822672e-07, "loss": 0.81457305, "num_input_tokens_seen": 261893695, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 12133, "time_per_iteration": 2.543867349624634 }, { "auxiliary_loss_clip": 0.01140109, "auxiliary_loss_mlp": 0.01029544, "balance_loss_clip": 1.01726532, "balance_loss_mlp": 1.03485084, "epoch": 0.7295355478731399, "flos": 22195308176640.0, "grad_norm": 1.7996094612168456, "language_loss": 0.72073871, "learning_rate": 6.796166702407055e-07, "loss": 0.74243522, "num_input_tokens_seen": 261911825, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 12134, "time_per_iteration": 2.7475249767303467 }, { "auxiliary_loss_clip": 0.0112575, "auxiliary_loss_mlp": 0.01035343, "balance_loss_clip": 1.0223788, "balance_loss_mlp": 1.03823709, "epoch": 0.7295956711258079, "flos": 23222030532480.0, "grad_norm": 1.7034450345324295, "language_loss": 0.71259874, "learning_rate": 6.793329125202278e-07, "loss": 0.73420966, "num_input_tokens_seen": 261931190, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 12135, "time_per_iteration": 2.680656909942627 }, { "auxiliary_loss_clip": 0.0113551, "auxiliary_loss_mlp": 0.01034398, "balance_loss_clip": 1.02152371, "balance_loss_mlp": 1.03703523, "epoch": 0.7296557943784758, "flos": 31869104163840.0, "grad_norm": 1.9694802828824274, "language_loss": 0.61771518, "learning_rate": 6.790492019309628e-07, "loss": 0.63941431, "num_input_tokens_seen": 261951240, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 12136, "time_per_iteration": 2.6971092224121094 }, { "auxiliary_loss_clip": 0.01113047, "auxiliary_loss_mlp": 0.0128517, "balance_loss_clip": 1.02655411, "balance_loss_mlp": 1.03531003, "epoch": 0.7297159176311439, "flos": 26651714889600.0, "grad_norm": 1.899844761745923, "language_loss": 0.74552888, "learning_rate": 6.787655384830328e-07, "loss": 0.76951098, "num_input_tokens_seen": 261971605, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 12137, "time_per_iteration": 2.642807960510254 }, { "auxiliary_loss_clip": 0.01116861, "auxiliary_loss_mlp": 0.01283034, "balance_loss_clip": 1.02382445, "balance_loss_mlp": 1.03749919, "epoch": 0.7297760408838118, "flos": 24749562263040.0, "grad_norm": 1.7064536248569064, "language_loss": 0.74186885, "learning_rate": 6.784819221865619e-07, "loss": 0.76586783, "num_input_tokens_seen": 261990830, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 12138, "time_per_iteration": 2.6412692070007324 }, { "auxiliary_loss_clip": 0.01129705, "auxiliary_loss_mlp": 0.01029254, "balance_loss_clip": 1.01722622, "balance_loss_mlp": 1.03440034, "epoch": 0.7298361641364798, "flos": 18073768982400.0, "grad_norm": 2.6948137311843863, "language_loss": 0.71544355, "learning_rate": 6.781983530516722e-07, "loss": 0.73703319, "num_input_tokens_seen": 262008190, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 12139, "time_per_iteration": 2.5960469245910645 }, { "auxiliary_loss_clip": 0.01109937, "auxiliary_loss_mlp": 0.01028397, "balance_loss_clip": 1.01707232, "balance_loss_mlp": 1.03528965, "epoch": 0.7298962873891478, "flos": 29895597169920.0, "grad_norm": 1.4419718737041305, "language_loss": 0.73491871, "learning_rate": 6.779148310884832e-07, "loss": 0.756302, "num_input_tokens_seen": 262030460, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66015625, "step": 12140, "time_per_iteration": 2.6699275970458984 }, { "auxiliary_loss_clip": 0.01123888, "auxiliary_loss_mlp": 0.01031154, "balance_loss_clip": 1.01841688, "balance_loss_mlp": 1.03644252, "epoch": 0.7299564106418157, "flos": 32266096254720.0, "grad_norm": 1.4062658940546284, "language_loss": 0.55438077, "learning_rate": 6.776313563071132e-07, "loss": 0.57593119, "num_input_tokens_seen": 262050830, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 12141, "time_per_iteration": 2.630370855331421 }, { "auxiliary_loss_clip": 0.01129173, "auxiliary_loss_mlp": 0.01025023, "balance_loss_clip": 1.01347756, "balance_loss_mlp": 1.0354048, "epoch": 0.7300165338944837, "flos": 22930292269440.0, "grad_norm": 1.8101981377204266, "language_loss": 0.72809869, "learning_rate": 6.77347928717678e-07, "loss": 0.74964064, "num_input_tokens_seen": 262071245, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 12142, "time_per_iteration": 2.5593910217285156 }, { "auxiliary_loss_clip": 0.01102696, "auxiliary_loss_mlp": 0.01034407, "balance_loss_clip": 1.0226469, "balance_loss_mlp": 1.03481483, "epoch": 0.7300766571471516, "flos": 19828795501440.0, "grad_norm": 1.7950420433418914, "language_loss": 0.7346794, "learning_rate": 6.770645483302941e-07, "loss": 0.75605047, "num_input_tokens_seen": 262087525, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 12143, "time_per_iteration": 2.5885016918182373 }, { "auxiliary_loss_clip": 0.01133173, "auxiliary_loss_mlp": 0.01030398, "balance_loss_clip": 1.01804233, "balance_loss_mlp": 1.03662479, "epoch": 0.7301367803998197, "flos": 24347829576960.0, "grad_norm": 4.320547609178292, "language_loss": 0.66244549, "learning_rate": 6.767812151550722e-07, "loss": 0.6840812, "num_input_tokens_seen": 262107355, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 12144, "time_per_iteration": 4.019986391067505 }, { "auxiliary_loss_clip": 0.01123171, "auxiliary_loss_mlp": 0.01029647, "balance_loss_clip": 1.01729107, "balance_loss_mlp": 1.03586626, "epoch": 0.7301969036524876, "flos": 15304518040320.0, "grad_norm": 2.23680937534753, "language_loss": 0.78920925, "learning_rate": 6.764979292021256e-07, "loss": 0.81073749, "num_input_tokens_seen": 262125645, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 12145, "time_per_iteration": 2.5103988647460938 }, { "auxiliary_loss_clip": 0.01130693, "auxiliary_loss_mlp": 0.010331, "balance_loss_clip": 1.0206722, "balance_loss_mlp": 1.03662884, "epoch": 0.7302570269051556, "flos": 23507268433920.0, "grad_norm": 1.9108290283874765, "language_loss": 0.9137606, "learning_rate": 6.762146904815629e-07, "loss": 0.93539852, "num_input_tokens_seen": 262144075, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.671875, "step": 12146, "time_per_iteration": 2.5486416816711426 }, { "auxiliary_loss_clip": 0.01104156, "auxiliary_loss_mlp": 0.01027578, "balance_loss_clip": 1.01581812, "balance_loss_mlp": 1.03737187, "epoch": 0.7303171501578235, "flos": 20523056549760.0, "grad_norm": 1.6103249651012232, "language_loss": 0.62114185, "learning_rate": 6.759314990034939e-07, "loss": 0.64245921, "num_input_tokens_seen": 262165940, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.66796875, "step": 12147, "time_per_iteration": 2.581435441970825 }, { "auxiliary_loss_clip": 0.01107419, "auxiliary_loss_mlp": 0.01037743, "balance_loss_clip": 1.02467167, "balance_loss_mlp": 1.03719902, "epoch": 0.7303772734104915, "flos": 18332613365760.0, "grad_norm": 2.1644550524381443, "language_loss": 0.75564736, "learning_rate": 6.756483547780225e-07, "loss": 0.77709895, "num_input_tokens_seen": 262184520, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 12148, "time_per_iteration": 2.592263698577881 }, { "auxiliary_loss_clip": 0.01045034, "auxiliary_loss_mlp": 0.01248962, "balance_loss_clip": 1.00167358, "balance_loss_mlp": 1.00562787, "epoch": 0.7304373966631594, "flos": 60654776100480.0, "grad_norm": 0.7120769746849595, "language_loss": 0.56678605, "learning_rate": 6.753652578152555e-07, "loss": 0.58972597, "num_input_tokens_seen": 262247070, "router_z_loss_clip": 0.01379395, "router_z_loss_mlp": 0.21875, "step": 12149, "time_per_iteration": 3.166290044784546 }, { "auxiliary_loss_clip": 0.01123421, "auxiliary_loss_mlp": 0.01032012, "balance_loss_clip": 1.01932847, "balance_loss_mlp": 1.03482902, "epoch": 0.7304975199158275, "flos": 19828077229440.0, "grad_norm": 1.900757431049318, "language_loss": 0.73740268, "learning_rate": 6.75082208125295e-07, "loss": 0.75895703, "num_input_tokens_seen": 262266605, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 12150, "time_per_iteration": 2.578439235687256 }, { "auxiliary_loss_clip": 0.01107991, "auxiliary_loss_mlp": 0.01032447, "balance_loss_clip": 1.01898789, "balance_loss_mlp": 1.03663266, "epoch": 0.7305576431684954, "flos": 13223997452160.0, "grad_norm": 2.1540258729194792, "language_loss": 0.84037119, "learning_rate": 6.747992057182423e-07, "loss": 0.86177552, "num_input_tokens_seen": 262283880, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 12151, "time_per_iteration": 3.8525328636169434 }, { "auxiliary_loss_clip": 0.01133866, "auxiliary_loss_mlp": 0.01035239, "balance_loss_clip": 1.02208447, "balance_loss_mlp": 1.0359962, "epoch": 0.7306177664211634, "flos": 24060472773120.0, "grad_norm": 2.1052405392585736, "language_loss": 0.78173226, "learning_rate": 6.745162506041972e-07, "loss": 0.80342335, "num_input_tokens_seen": 262304155, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 12152, "time_per_iteration": 2.558547019958496 }, { "auxiliary_loss_clip": 0.01133132, "auxiliary_loss_mlp": 0.01032824, "balance_loss_clip": 1.02019989, "balance_loss_mlp": 1.03753054, "epoch": 0.7306778896738314, "flos": 27089106802560.0, "grad_norm": 1.5004907722547582, "language_loss": 0.79465163, "learning_rate": 6.742333427932577e-07, "loss": 0.81631112, "num_input_tokens_seen": 262325660, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 12153, "time_per_iteration": 2.640615701675415 }, { "auxiliary_loss_clip": 0.01115771, "auxiliary_loss_mlp": 0.01033265, "balance_loss_clip": 1.02039003, "balance_loss_mlp": 1.03591919, "epoch": 0.7307380129264993, "flos": 16690669839360.0, "grad_norm": 1.6419550258962035, "language_loss": 0.67729396, "learning_rate": 6.739504822955195e-07, "loss": 0.69878435, "num_input_tokens_seen": 262344075, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 12154, "time_per_iteration": 2.556596279144287 }, { "auxiliary_loss_clip": 0.01141759, "auxiliary_loss_mlp": 0.01034805, "balance_loss_clip": 1.02197766, "balance_loss_mlp": 1.0347302, "epoch": 0.7307981361791673, "flos": 21725740656000.0, "grad_norm": 1.8740083845312234, "language_loss": 0.66372675, "learning_rate": 6.736676691210772e-07, "loss": 0.6854924, "num_input_tokens_seen": 262363305, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 12155, "time_per_iteration": 2.619086980819702 }, { "auxiliary_loss_clip": 0.01118675, "auxiliary_loss_mlp": 0.01031462, "balance_loss_clip": 1.01917791, "balance_loss_mlp": 1.03329992, "epoch": 0.7308582594318352, "flos": 18040659621120.0, "grad_norm": 1.824272072274117, "language_loss": 0.81827533, "learning_rate": 6.733849032800247e-07, "loss": 0.83977664, "num_input_tokens_seen": 262380730, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 12156, "time_per_iteration": 2.580556869506836 }, { "auxiliary_loss_clip": 0.01128444, "auxiliary_loss_mlp": 0.01035385, "balance_loss_clip": 1.02382755, "balance_loss_mlp": 1.03476179, "epoch": 0.7309183826845033, "flos": 13844964798720.0, "grad_norm": 2.0395882984631375, "language_loss": 0.75371057, "learning_rate": 6.731021847824528e-07, "loss": 0.7753489, "num_input_tokens_seen": 262395480, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 12157, "time_per_iteration": 4.094721555709839 }, { "auxiliary_loss_clip": 0.0110898, "auxiliary_loss_mlp": 0.01028232, "balance_loss_clip": 1.01690102, "balance_loss_mlp": 1.03401232, "epoch": 0.7309785059371712, "flos": 17019216564480.0, "grad_norm": 1.878650558487531, "language_loss": 0.72470343, "learning_rate": 6.728195136384502e-07, "loss": 0.74607551, "num_input_tokens_seen": 262413340, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6640625, "step": 12158, "time_per_iteration": 4.035585880279541 }, { "auxiliary_loss_clip": 0.01126014, "auxiliary_loss_mlp": 0.01033866, "balance_loss_clip": 1.02139044, "balance_loss_mlp": 1.03720903, "epoch": 0.7310386291898392, "flos": 26502398052480.0, "grad_norm": 2.6842042605865397, "language_loss": 0.86199331, "learning_rate": 6.725368898581049e-07, "loss": 0.88359213, "num_input_tokens_seen": 262433455, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 12159, "time_per_iteration": 2.5607049465179443 }, { "auxiliary_loss_clip": 0.01125234, "auxiliary_loss_mlp": 0.01034004, "balance_loss_clip": 1.02052176, "balance_loss_mlp": 1.03512895, "epoch": 0.7310987524425071, "flos": 16945922862720.0, "grad_norm": 1.9279251864046338, "language_loss": 0.73555994, "learning_rate": 6.722543134515046e-07, "loss": 0.75715232, "num_input_tokens_seen": 262450335, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 12160, "time_per_iteration": 2.5774059295654297 }, { "auxiliary_loss_clip": 0.01117162, "auxiliary_loss_mlp": 0.01032904, "balance_loss_clip": 1.01933801, "balance_loss_mlp": 1.03645265, "epoch": 0.7311588756951751, "flos": 13845288021120.0, "grad_norm": 2.110711012058187, "language_loss": 0.73294938, "learning_rate": 6.719717844287314e-07, "loss": 0.75445002, "num_input_tokens_seen": 262468240, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 12161, "time_per_iteration": 2.5002636909484863 }, { "auxiliary_loss_clip": 0.01138849, "auxiliary_loss_mlp": 0.01281057, "balance_loss_clip": 1.02037203, "balance_loss_mlp": 1.03788745, "epoch": 0.731218998947843, "flos": 28767894704640.0, "grad_norm": 3.1419290516565916, "language_loss": 0.69587052, "learning_rate": 6.716893027998695e-07, "loss": 0.72006959, "num_input_tokens_seen": 262487045, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 12162, "time_per_iteration": 2.620030641555786 }, { "auxiliary_loss_clip": 0.01117892, "auxiliary_loss_mlp": 0.01034485, "balance_loss_clip": 1.02112126, "balance_loss_mlp": 1.0372045, "epoch": 0.7312791222005111, "flos": 27088783580160.0, "grad_norm": 1.7082443110940178, "language_loss": 0.66939384, "learning_rate": 6.71406868574999e-07, "loss": 0.69091761, "num_input_tokens_seen": 262504855, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 12163, "time_per_iteration": 2.551830530166626 }, { "auxiliary_loss_clip": 0.01122368, "auxiliary_loss_mlp": 0.01029848, "balance_loss_clip": 1.01632977, "balance_loss_mlp": 1.03453302, "epoch": 0.731339245453179, "flos": 20924035050240.0, "grad_norm": 1.5386065196967642, "language_loss": 0.68757641, "learning_rate": 6.71124481764201e-07, "loss": 0.70909858, "num_input_tokens_seen": 262524920, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 12164, "time_per_iteration": 2.590738296508789 }, { "auxiliary_loss_clip": 0.01140537, "auxiliary_loss_mlp": 0.01030883, "balance_loss_clip": 1.01881361, "balance_loss_mlp": 1.03786469, "epoch": 0.731399368705847, "flos": 23075694524160.0, "grad_norm": 1.5462296592633764, "language_loss": 0.73281658, "learning_rate": 6.708421423775507e-07, "loss": 0.75453079, "num_input_tokens_seen": 262545725, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.671875, "step": 12165, "time_per_iteration": 2.5651190280914307 }, { "auxiliary_loss_clip": 0.01116527, "auxiliary_loss_mlp": 0.01038083, "balance_loss_clip": 1.02504754, "balance_loss_mlp": 1.03649759, "epoch": 0.731459491958515, "flos": 23582681038080.0, "grad_norm": 1.873275548427225, "language_loss": 0.76455677, "learning_rate": 6.705598504251262e-07, "loss": 0.78610283, "num_input_tokens_seen": 262565480, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12166, "time_per_iteration": 2.53840708732605 }, { "auxiliary_loss_clip": 0.01122188, "auxiliary_loss_mlp": 0.01031851, "balance_loss_clip": 1.01939368, "balance_loss_mlp": 1.03575087, "epoch": 0.7315196152111829, "flos": 22379278659840.0, "grad_norm": 4.603842536557336, "language_loss": 0.79528046, "learning_rate": 6.702776059170014e-07, "loss": 0.81682086, "num_input_tokens_seen": 262584145, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 12167, "time_per_iteration": 2.507084608078003 }, { "auxiliary_loss_clip": 0.01114635, "auxiliary_loss_mlp": 0.01039337, "balance_loss_clip": 1.02711236, "balance_loss_mlp": 1.03701866, "epoch": 0.7315797384638509, "flos": 26177047637760.0, "grad_norm": 1.6368998946071245, "language_loss": 0.7682935, "learning_rate": 6.699954088632471e-07, "loss": 0.78983325, "num_input_tokens_seen": 262604045, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 12168, "time_per_iteration": 2.5509135723114014 }, { "auxiliary_loss_clip": 0.01115958, "auxiliary_loss_mlp": 0.01040174, "balance_loss_clip": 1.02630401, "balance_loss_mlp": 1.0357362, "epoch": 0.7316398617165188, "flos": 21506326427520.0, "grad_norm": 1.9209955534973255, "language_loss": 0.8170805, "learning_rate": 6.697132592739363e-07, "loss": 0.83864188, "num_input_tokens_seen": 262624540, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7109375, "step": 12169, "time_per_iteration": 2.5189223289489746 }, { "auxiliary_loss_clip": 0.01117788, "auxiliary_loss_mlp": 0.01036537, "balance_loss_clip": 1.02351308, "balance_loss_mlp": 1.03730536, "epoch": 0.7316999849691869, "flos": 30482557315200.0, "grad_norm": 2.1656358831560545, "language_loss": 0.70119381, "learning_rate": 6.694311571591371e-07, "loss": 0.72273701, "num_input_tokens_seen": 262644545, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 12170, "time_per_iteration": 2.630079507827759 }, { "auxiliary_loss_clip": 0.01103939, "auxiliary_loss_mlp": 0.01032325, "balance_loss_clip": 1.01838303, "balance_loss_mlp": 1.03480709, "epoch": 0.7317601082218548, "flos": 21543781334400.0, "grad_norm": 2.308181001874524, "language_loss": 0.69856715, "learning_rate": 6.691491025289173e-07, "loss": 0.71992981, "num_input_tokens_seen": 262662570, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.6875, "step": 12171, "time_per_iteration": 2.527238607406616 }, { "auxiliary_loss_clip": 0.01106013, "auxiliary_loss_mlp": 0.01037437, "balance_loss_clip": 1.02462792, "balance_loss_mlp": 1.03689659, "epoch": 0.7318202314745228, "flos": 33251592775680.0, "grad_norm": 2.0980185635032407, "language_loss": 0.65708768, "learning_rate": 6.688670953933422e-07, "loss": 0.67852217, "num_input_tokens_seen": 262683245, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 12172, "time_per_iteration": 2.7237517833709717 }, { "auxiliary_loss_clip": 0.01107934, "auxiliary_loss_mlp": 0.01030947, "balance_loss_clip": 1.01787007, "balance_loss_mlp": 1.03758621, "epoch": 0.7318803547271907, "flos": 20157054917760.0, "grad_norm": 2.7325260798789923, "language_loss": 0.61230695, "learning_rate": 6.685851357624769e-07, "loss": 0.63369572, "num_input_tokens_seen": 262701585, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 12173, "time_per_iteration": 2.5413806438446045 }, { "auxiliary_loss_clip": 0.01141505, "auxiliary_loss_mlp": 0.01030537, "balance_loss_clip": 1.01859796, "balance_loss_mlp": 1.03630316, "epoch": 0.7319404779798587, "flos": 20558536208640.0, "grad_norm": 2.191389783349126, "language_loss": 0.73704123, "learning_rate": 6.683032236463833e-07, "loss": 0.7587617, "num_input_tokens_seen": 262719295, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 12174, "time_per_iteration": 2.606921672821045 }, { "auxiliary_loss_clip": 0.0110335, "auxiliary_loss_mlp": 0.01026531, "balance_loss_clip": 1.01439512, "balance_loss_mlp": 1.03599954, "epoch": 0.7320006012325266, "flos": 28695391102080.0, "grad_norm": 1.6882259323098707, "language_loss": 0.81010628, "learning_rate": 6.680213590551222e-07, "loss": 0.83140504, "num_input_tokens_seen": 262739995, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 12175, "time_per_iteration": 2.606602430343628 }, { "auxiliary_loss_clip": 0.01123768, "auxiliary_loss_mlp": 0.01032537, "balance_loss_clip": 1.02030635, "balance_loss_mlp": 1.03567052, "epoch": 0.7320607244851947, "flos": 16362697731840.0, "grad_norm": 2.188094132065135, "language_loss": 0.76414621, "learning_rate": 6.67739541998752e-07, "loss": 0.78570926, "num_input_tokens_seen": 262757680, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 12176, "time_per_iteration": 2.6266543865203857 }, { "auxiliary_loss_clip": 0.01114889, "auxiliary_loss_mlp": 0.01033197, "balance_loss_clip": 1.02083468, "balance_loss_mlp": 1.03777289, "epoch": 0.7321208477378626, "flos": 20955097336320.0, "grad_norm": 1.4739040157625547, "language_loss": 0.76334357, "learning_rate": 6.674577724873316e-07, "loss": 0.78482437, "num_input_tokens_seen": 262776990, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.68359375, "step": 12177, "time_per_iteration": 2.4991562366485596 }, { "auxiliary_loss_clip": 0.01130386, "auxiliary_loss_mlp": 0.01035833, "balance_loss_clip": 1.02413237, "balance_loss_mlp": 1.03401518, "epoch": 0.7321809709905306, "flos": 13845072539520.0, "grad_norm": 2.241088753002608, "language_loss": 0.743716, "learning_rate": 6.671760505309143e-07, "loss": 0.76537824, "num_input_tokens_seen": 262795440, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69921875, "step": 12178, "time_per_iteration": 2.511544704437256 }, { "auxiliary_loss_clip": 0.01125472, "auxiliary_loss_mlp": 0.01033353, "balance_loss_clip": 1.02037168, "balance_loss_mlp": 1.03569591, "epoch": 0.7322410942431986, "flos": 26979938392320.0, "grad_norm": 1.9409850727920237, "language_loss": 0.82438254, "learning_rate": 6.66894376139556e-07, "loss": 0.84597087, "num_input_tokens_seen": 262816385, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 12179, "time_per_iteration": 2.554614782333374 }, { "auxiliary_loss_clip": 0.01120197, "auxiliary_loss_mlp": 0.01029917, "balance_loss_clip": 1.01779985, "balance_loss_mlp": 1.03599596, "epoch": 0.7323012174958665, "flos": 17639717034240.0, "grad_norm": 1.473876980236355, "language_loss": 0.74397099, "learning_rate": 6.666127493233084e-07, "loss": 0.76547211, "num_input_tokens_seen": 262834955, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.66015625, "step": 12180, "time_per_iteration": 2.515637159347534 }, { "auxiliary_loss_clip": 0.01116351, "auxiliary_loss_mlp": 0.01276111, "balance_loss_clip": 1.01536024, "balance_loss_mlp": 1.03405333, "epoch": 0.7323613407485345, "flos": 32342765834880.0, "grad_norm": 1.8732123626140738, "language_loss": 0.7931416, "learning_rate": 6.663311700922218e-07, "loss": 0.81706619, "num_input_tokens_seen": 262853555, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 12181, "time_per_iteration": 2.578842878341675 }, { "auxiliary_loss_clip": 0.01117988, "auxiliary_loss_mlp": 0.01279196, "balance_loss_clip": 1.01927197, "balance_loss_mlp": 1.03800595, "epoch": 0.7324214640012024, "flos": 18362777811840.0, "grad_norm": 1.9017192913962118, "language_loss": 0.72059846, "learning_rate": 6.660496384563452e-07, "loss": 0.74457026, "num_input_tokens_seen": 262870975, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12182, "time_per_iteration": 2.5044679641723633 }, { "auxiliary_loss_clip": 0.01109113, "auxiliary_loss_mlp": 0.01036551, "balance_loss_clip": 1.02404022, "balance_loss_mlp": 1.03771091, "epoch": 0.7324815872538705, "flos": 30812289189120.0, "grad_norm": 1.6152005368326743, "language_loss": 0.71263289, "learning_rate": 6.657681544257249e-07, "loss": 0.73408949, "num_input_tokens_seen": 262892635, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 12183, "time_per_iteration": 2.5657830238342285 }, { "auxiliary_loss_clip": 0.01120647, "auxiliary_loss_mlp": 0.01038676, "balance_loss_clip": 1.02486563, "balance_loss_mlp": 1.03879213, "epoch": 0.7325417105065384, "flos": 21505069451520.0, "grad_norm": 2.053187481522423, "language_loss": 0.72820681, "learning_rate": 6.654867180104085e-07, "loss": 0.74980003, "num_input_tokens_seen": 262910725, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 12184, "time_per_iteration": 2.537778615951538 }, { "auxiliary_loss_clip": 0.01045717, "auxiliary_loss_mlp": 0.01003152, "balance_loss_clip": 1.00179338, "balance_loss_mlp": 1.00588334, "epoch": 0.7326018337592064, "flos": 67257742556160.0, "grad_norm": 0.7532834551247231, "language_loss": 0.6514675, "learning_rate": 6.652053292204371e-07, "loss": 0.67195618, "num_input_tokens_seen": 262974150, "router_z_loss_clip": 0.01361084, "router_z_loss_mlp": 0.21875, "step": 12185, "time_per_iteration": 3.1209514141082764 }, { "auxiliary_loss_clip": 0.0110572, "auxiliary_loss_mlp": 0.01027325, "balance_loss_clip": 1.01418245, "balance_loss_mlp": 1.03443754, "epoch": 0.7326619570118743, "flos": 22857070394880.0, "grad_norm": 1.9738667849341454, "language_loss": 0.80218709, "learning_rate": 6.649239880658546e-07, "loss": 0.82351756, "num_input_tokens_seen": 262993370, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 12186, "time_per_iteration": 3.927349090576172 }, { "auxiliary_loss_clip": 0.01114911, "auxiliary_loss_mlp": 0.01034095, "balance_loss_clip": 1.02027869, "balance_loss_mlp": 1.03445077, "epoch": 0.7327220802645423, "flos": 23327499841920.0, "grad_norm": 1.7561444457428685, "language_loss": 0.73178065, "learning_rate": 6.646426945567008e-07, "loss": 0.75327075, "num_input_tokens_seen": 263012665, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7109375, "step": 12187, "time_per_iteration": 2.5616185665130615 }, { "auxiliary_loss_clip": 0.01108061, "auxiliary_loss_mlp": 0.01039212, "balance_loss_clip": 1.02659392, "balance_loss_mlp": 1.03665745, "epoch": 0.7327822035172102, "flos": 23180661043200.0, "grad_norm": 1.6595215847361668, "language_loss": 0.8900128, "learning_rate": 6.64361448703014e-07, "loss": 0.91148549, "num_input_tokens_seen": 263031475, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 12188, "time_per_iteration": 2.5957515239715576 }, { "auxiliary_loss_clip": 0.01144033, "auxiliary_loss_mlp": 0.0103633, "balance_loss_clip": 1.02278805, "balance_loss_mlp": 1.03547287, "epoch": 0.7328423267698783, "flos": 21066600130560.0, "grad_norm": 2.1204801152944768, "language_loss": 0.74430394, "learning_rate": 6.64080250514831e-07, "loss": 0.76610756, "num_input_tokens_seen": 263051445, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 12189, "time_per_iteration": 2.6200175285339355 }, { "auxiliary_loss_clip": 0.0113411, "auxiliary_loss_mlp": 0.01032445, "balance_loss_clip": 1.0189029, "balance_loss_mlp": 1.03608918, "epoch": 0.7329024500225462, "flos": 21689578638720.0, "grad_norm": 1.5931184894422934, "language_loss": 0.82085764, "learning_rate": 6.637991000021883e-07, "loss": 0.84252322, "num_input_tokens_seen": 263070835, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 12190, "time_per_iteration": 2.585402011871338 }, { "auxiliary_loss_clip": 0.01135731, "auxiliary_loss_mlp": 0.01038766, "balance_loss_clip": 1.0248723, "balance_loss_mlp": 1.0375036, "epoch": 0.7329625732752142, "flos": 24164038661760.0, "grad_norm": 1.7724352355018775, "language_loss": 0.71916211, "learning_rate": 6.635179971751184e-07, "loss": 0.74090707, "num_input_tokens_seen": 263090070, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7109375, "step": 12191, "time_per_iteration": 2.643873691558838 }, { "auxiliary_loss_clip": 0.01107313, "auxiliary_loss_mlp": 0.01035855, "balance_loss_clip": 1.02214575, "balance_loss_mlp": 1.03603363, "epoch": 0.7330226965278822, "flos": 30077915627520.0, "grad_norm": 1.5573132252955673, "language_loss": 0.69380522, "learning_rate": 6.632369420436532e-07, "loss": 0.7152369, "num_input_tokens_seen": 263110030, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 12192, "time_per_iteration": 2.5868608951568604 }, { "auxiliary_loss_clip": 0.01112424, "auxiliary_loss_mlp": 0.01031999, "balance_loss_clip": 1.01965451, "balance_loss_mlp": 1.03454828, "epoch": 0.7330828197805501, "flos": 23368294713600.0, "grad_norm": 1.5045127016705313, "language_loss": 0.73111439, "learning_rate": 6.629559346178226e-07, "loss": 0.75255859, "num_input_tokens_seen": 263129735, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 12193, "time_per_iteration": 3.9155220985412598 }, { "auxiliary_loss_clip": 0.01115963, "auxiliary_loss_mlp": 0.0103588, "balance_loss_clip": 1.02227247, "balance_loss_mlp": 1.03458095, "epoch": 0.7331429430332181, "flos": 21032808410880.0, "grad_norm": 1.6803307669822742, "language_loss": 0.76926416, "learning_rate": 6.626749749076566e-07, "loss": 0.79078257, "num_input_tokens_seen": 263149100, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.72265625, "step": 12194, "time_per_iteration": 2.5517632961273193 }, { "auxiliary_loss_clip": 0.01115477, "auxiliary_loss_mlp": 0.01030237, "balance_loss_clip": 1.01686192, "balance_loss_mlp": 1.03709078, "epoch": 0.733203066285886, "flos": 14647891466880.0, "grad_norm": 1.6835483504063105, "language_loss": 0.7096796, "learning_rate": 6.623940629231793e-07, "loss": 0.73113668, "num_input_tokens_seen": 263166620, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 12195, "time_per_iteration": 2.4725863933563232 }, { "auxiliary_loss_clip": 0.01106561, "auxiliary_loss_mlp": 0.01037159, "balance_loss_clip": 1.02464843, "balance_loss_mlp": 1.03546023, "epoch": 0.7332631895385541, "flos": 17165301177600.0, "grad_norm": 2.100905828998338, "language_loss": 0.72038627, "learning_rate": 6.621131986744179e-07, "loss": 0.74182343, "num_input_tokens_seen": 263184780, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 12196, "time_per_iteration": 2.500861883163452 }, { "auxiliary_loss_clip": 0.01104601, "auxiliary_loss_mlp": 0.01031894, "balance_loss_clip": 1.01881123, "balance_loss_mlp": 1.03551626, "epoch": 0.733323312791222, "flos": 28658151676800.0, "grad_norm": 1.6188313305429305, "language_loss": 0.71339804, "learning_rate": 6.618323821713956e-07, "loss": 0.73476303, "num_input_tokens_seen": 263204625, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 12197, "time_per_iteration": 2.5410664081573486 }, { "auxiliary_loss_clip": 0.01153416, "auxiliary_loss_mlp": 0.01288202, "balance_loss_clip": 1.0266819, "balance_loss_mlp": 1.0355022, "epoch": 0.73338343604389, "flos": 16618417632000.0, "grad_norm": 1.9924437157731396, "language_loss": 0.78172678, "learning_rate": 6.615516134241321e-07, "loss": 0.80614299, "num_input_tokens_seen": 263221565, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7421875, "step": 12198, "time_per_iteration": 2.6602659225463867 }, { "auxiliary_loss_clip": 0.01150836, "auxiliary_loss_mlp": 0.01032268, "balance_loss_clip": 1.02013862, "balance_loss_mlp": 1.03607869, "epoch": 0.7334435592965579, "flos": 21142084561920.0, "grad_norm": 1.735915672326466, "language_loss": 0.74081242, "learning_rate": 6.612708924426496e-07, "loss": 0.76264346, "num_input_tokens_seen": 263240620, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 12199, "time_per_iteration": 5.585903167724609 }, { "auxiliary_loss_clip": 0.01127931, "auxiliary_loss_mlp": 0.01036029, "balance_loss_clip": 1.0226773, "balance_loss_mlp": 1.0373913, "epoch": 0.7335036825492259, "flos": 17125332318720.0, "grad_norm": 2.2963962320966416, "language_loss": 0.77285308, "learning_rate": 6.609902192369643e-07, "loss": 0.79449266, "num_input_tokens_seen": 263254365, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 12200, "time_per_iteration": 2.588008165359497 }, { "auxiliary_loss_clip": 0.01133632, "auxiliary_loss_mlp": 0.01031816, "balance_loss_clip": 1.01940656, "balance_loss_mlp": 1.03658104, "epoch": 0.7335638058018938, "flos": 23731818307200.0, "grad_norm": 1.8154948393143489, "language_loss": 0.61602843, "learning_rate": 6.60709593817095e-07, "loss": 0.63768291, "num_input_tokens_seen": 263275880, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 12201, "time_per_iteration": 2.612086057662964 }, { "auxiliary_loss_clip": 0.01132292, "auxiliary_loss_mlp": 0.01274001, "balance_loss_clip": 1.01485646, "balance_loss_mlp": 1.03586411, "epoch": 0.7336239290545619, "flos": 34933289679360.0, "grad_norm": 1.7477290623676212, "language_loss": 0.52060741, "learning_rate": 6.604290161930541e-07, "loss": 0.54467034, "num_input_tokens_seen": 263298315, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 12202, "time_per_iteration": 2.6693918704986572 }, { "auxiliary_loss_clip": 0.01123444, "auxiliary_loss_mlp": 0.01028169, "balance_loss_clip": 1.0155623, "balance_loss_mlp": 1.03614116, "epoch": 0.7336840523072298, "flos": 21103049456640.0, "grad_norm": 1.6886895348144215, "language_loss": 0.68382883, "learning_rate": 6.601484863748565e-07, "loss": 0.70534503, "num_input_tokens_seen": 263318615, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 12203, "time_per_iteration": 2.549607276916504 }, { "auxiliary_loss_clip": 0.01127763, "auxiliary_loss_mlp": 0.01038959, "balance_loss_clip": 1.02558327, "balance_loss_mlp": 1.03573465, "epoch": 0.7337441755598978, "flos": 24024418496640.0, "grad_norm": 2.363157889850197, "language_loss": 0.66003609, "learning_rate": 6.598680043725129e-07, "loss": 0.68170333, "num_input_tokens_seen": 263336705, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73828125, "step": 12204, "time_per_iteration": 2.595898389816284 }, { "auxiliary_loss_clip": 0.01123325, "auxiliary_loss_mlp": 0.01032754, "balance_loss_clip": 1.02018952, "balance_loss_mlp": 1.03688979, "epoch": 0.7338042988125658, "flos": 22711309004160.0, "grad_norm": 1.903529461113738, "language_loss": 0.77652335, "learning_rate": 6.59587570196033e-07, "loss": 0.7980842, "num_input_tokens_seen": 263355065, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 12205, "time_per_iteration": 2.5849337577819824 }, { "auxiliary_loss_clip": 0.01140799, "auxiliary_loss_mlp": 0.01031194, "balance_loss_clip": 1.01904035, "balance_loss_mlp": 1.03689384, "epoch": 0.7338644220652337, "flos": 21360996000000.0, "grad_norm": 1.5756562990286649, "language_loss": 0.79773784, "learning_rate": 6.593071838554239e-07, "loss": 0.81945783, "num_input_tokens_seen": 263374460, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 12206, "time_per_iteration": 2.5453872680664062 }, { "auxiliary_loss_clip": 0.01131929, "auxiliary_loss_mlp": 0.01029769, "balance_loss_clip": 1.01662052, "balance_loss_mlp": 1.03492522, "epoch": 0.7339245453179017, "flos": 30920236536960.0, "grad_norm": 1.8911127003685548, "language_loss": 0.71326184, "learning_rate": 6.590268453606936e-07, "loss": 0.73487878, "num_input_tokens_seen": 263393610, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 12207, "time_per_iteration": 2.581480026245117 }, { "auxiliary_loss_clip": 0.01044661, "auxiliary_loss_mlp": 0.01002218, "balance_loss_clip": 1.00104356, "balance_loss_mlp": 1.00529885, "epoch": 0.7339846685705697, "flos": 67899429072000.0, "grad_norm": 0.7814351273541423, "language_loss": 0.5483098, "learning_rate": 6.587465547218456e-07, "loss": 0.56877863, "num_input_tokens_seen": 263450340, "router_z_loss_clip": 0.01171875, "router_z_loss_mlp": 0.21484375, "step": 12208, "time_per_iteration": 3.2135066986083984 }, { "auxiliary_loss_clip": 0.01104825, "auxiliary_loss_mlp": 0.01030525, "balance_loss_clip": 1.01939714, "balance_loss_mlp": 1.03599989, "epoch": 0.7340447918232377, "flos": 22236749493120.0, "grad_norm": 1.4883323243831186, "language_loss": 0.80463845, "learning_rate": 6.584663119488832e-07, "loss": 0.82599193, "num_input_tokens_seen": 263471735, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6875, "step": 12209, "time_per_iteration": 2.5022027492523193 }, { "auxiliary_loss_clip": 0.0110312, "auxiliary_loss_mlp": 0.01030258, "balance_loss_clip": 1.01803887, "balance_loss_mlp": 1.03437042, "epoch": 0.7341049150759056, "flos": 23764784014080.0, "grad_norm": 1.498201325685599, "language_loss": 0.78961122, "learning_rate": 6.581861170518064e-07, "loss": 0.81094503, "num_input_tokens_seen": 263493245, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 12210, "time_per_iteration": 2.515187978744507 }, { "auxiliary_loss_clip": 0.01103966, "auxiliary_loss_mlp": 0.01030025, "balance_loss_clip": 1.0173893, "balance_loss_mlp": 1.03600812, "epoch": 0.7341650383285736, "flos": 17236547804160.0, "grad_norm": 2.0724978430930943, "language_loss": 0.76562071, "learning_rate": 6.579059700406171e-07, "loss": 0.7869606, "num_input_tokens_seen": 263511660, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 12211, "time_per_iteration": 2.4556357860565186 }, { "auxiliary_loss_clip": 0.01124755, "auxiliary_loss_mlp": 0.01029565, "balance_loss_clip": 1.01681006, "balance_loss_mlp": 1.03572154, "epoch": 0.7342251615812415, "flos": 23403953940480.0, "grad_norm": 1.9838141717971474, "language_loss": 0.71947455, "learning_rate": 6.576258709253106e-07, "loss": 0.74101782, "num_input_tokens_seen": 263530875, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 12212, "time_per_iteration": 2.5690085887908936 }, { "auxiliary_loss_clip": 0.01107649, "auxiliary_loss_mlp": 0.010315, "balance_loss_clip": 1.01841664, "balance_loss_mlp": 1.03765297, "epoch": 0.7342852848339095, "flos": 22747183712640.0, "grad_norm": 1.6732815269296157, "language_loss": 0.68831581, "learning_rate": 6.573458197158833e-07, "loss": 0.70970726, "num_input_tokens_seen": 263551585, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 12213, "time_per_iteration": 2.5118775367736816 }, { "auxiliary_loss_clip": 0.01026482, "auxiliary_loss_mlp": 0.01001148, "balance_loss_clip": 0.99990857, "balance_loss_mlp": 1.00490499, "epoch": 0.7343454080865774, "flos": 53942353925760.0, "grad_norm": 0.7223989353037218, "language_loss": 0.54311967, "learning_rate": 6.570658164223311e-07, "loss": 0.56339598, "num_input_tokens_seen": 263609545, "router_z_loss_clip": 0.01239014, "router_z_loss_mlp": 0.21582031, "step": 12214, "time_per_iteration": 3.0182933807373047 }, { "auxiliary_loss_clip": 0.01106683, "auxiliary_loss_mlp": 0.01029367, "balance_loss_clip": 1.01671863, "balance_loss_mlp": 1.03483617, "epoch": 0.7344055313392455, "flos": 12166859255040.0, "grad_norm": 1.955328359878252, "language_loss": 0.70341331, "learning_rate": 6.567858610546442e-07, "loss": 0.72477376, "num_input_tokens_seen": 263627880, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 12215, "time_per_iteration": 2.5011110305786133 }, { "auxiliary_loss_clip": 0.01120446, "auxiliary_loss_mlp": 0.01029973, "balance_loss_clip": 1.01835573, "balance_loss_mlp": 1.03587162, "epoch": 0.7344656545919134, "flos": 18550052346240.0, "grad_norm": 1.9095955679396075, "language_loss": 0.7287513, "learning_rate": 6.565059536228153e-07, "loss": 0.75025553, "num_input_tokens_seen": 263645665, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 12216, "time_per_iteration": 2.6142778396606445 }, { "auxiliary_loss_clip": 0.01111469, "auxiliary_loss_mlp": 0.0103844, "balance_loss_clip": 1.02297878, "balance_loss_mlp": 1.03636479, "epoch": 0.7345257778445814, "flos": 23661649088640.0, "grad_norm": 2.2665396552486645, "language_loss": 0.78112251, "learning_rate": 6.562260941368325e-07, "loss": 0.8026216, "num_input_tokens_seen": 263668170, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.75, "step": 12217, "time_per_iteration": 2.5760228633880615 }, { "auxiliary_loss_clip": 0.01129208, "auxiliary_loss_mlp": 0.01025562, "balance_loss_clip": 1.01394534, "balance_loss_mlp": 1.03439283, "epoch": 0.7345859010972494, "flos": 13808659127040.0, "grad_norm": 1.9602104039527048, "language_loss": 0.77955544, "learning_rate": 6.55946282606685e-07, "loss": 0.80110312, "num_input_tokens_seen": 263684190, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 12218, "time_per_iteration": 2.542818784713745 }, { "auxiliary_loss_clip": 0.01139045, "auxiliary_loss_mlp": 0.01029273, "balance_loss_clip": 1.01635671, "balance_loss_mlp": 1.03515244, "epoch": 0.7346460243499173, "flos": 22272731942400.0, "grad_norm": 1.861800678855167, "language_loss": 0.7203002, "learning_rate": 6.556665190423562e-07, "loss": 0.74198341, "num_input_tokens_seen": 263702095, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 12219, "time_per_iteration": 2.5502564907073975 }, { "auxiliary_loss_clip": 0.01119275, "auxiliary_loss_mlp": 0.01030767, "balance_loss_clip": 1.01877403, "balance_loss_mlp": 1.0340929, "epoch": 0.7347061476025853, "flos": 23255247634560.0, "grad_norm": 1.6407885013734111, "language_loss": 0.74450713, "learning_rate": 6.553868034538319e-07, "loss": 0.76600754, "num_input_tokens_seen": 263721385, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 12220, "time_per_iteration": 2.5225894451141357 }, { "auxiliary_loss_clip": 0.01116031, "auxiliary_loss_mlp": 0.01029186, "balance_loss_clip": 1.01663303, "balance_loss_mlp": 1.03555322, "epoch": 0.7347662708552533, "flos": 15267565923840.0, "grad_norm": 1.9180364651844852, "language_loss": 0.66002703, "learning_rate": 6.55107135851094e-07, "loss": 0.68147916, "num_input_tokens_seen": 263737835, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 12221, "time_per_iteration": 2.4859583377838135 }, { "auxiliary_loss_clip": 0.01122972, "auxiliary_loss_mlp": 0.01031945, "balance_loss_clip": 1.01975608, "balance_loss_mlp": 1.03559732, "epoch": 0.7348263941079213, "flos": 24859987649280.0, "grad_norm": 1.7014708010140016, "language_loss": 0.69316828, "learning_rate": 6.548275162441228e-07, "loss": 0.71471739, "num_input_tokens_seen": 263756480, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 12222, "time_per_iteration": 2.55436635017395 }, { "auxiliary_loss_clip": 0.01138972, "auxiliary_loss_mlp": 0.01029376, "balance_loss_clip": 1.01798606, "balance_loss_mlp": 1.03516293, "epoch": 0.7348865173605892, "flos": 24352103295360.0, "grad_norm": 1.9861690687291547, "language_loss": 0.65604007, "learning_rate": 6.545479446428965e-07, "loss": 0.67772353, "num_input_tokens_seen": 263776440, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6875, "step": 12223, "time_per_iteration": 2.6103715896606445 }, { "auxiliary_loss_clip": 0.01116469, "auxiliary_loss_mlp": 0.01031809, "balance_loss_clip": 1.01932836, "balance_loss_mlp": 1.03641987, "epoch": 0.7349466406132572, "flos": 20004613597440.0, "grad_norm": 1.8097510967386445, "language_loss": 0.72133362, "learning_rate": 6.542684210573948e-07, "loss": 0.74281645, "num_input_tokens_seen": 263793700, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 12224, "time_per_iteration": 2.5506904125213623 }, { "auxiliary_loss_clip": 0.01132822, "auxiliary_loss_mlp": 0.01031369, "balance_loss_clip": 1.01897144, "balance_loss_mlp": 1.03583252, "epoch": 0.7350067638659251, "flos": 29825068815360.0, "grad_norm": 1.6520995398823417, "language_loss": 0.72641373, "learning_rate": 6.5398894549759e-07, "loss": 0.74805564, "num_input_tokens_seen": 263814620, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 12225, "time_per_iteration": 2.6314613819122314 }, { "auxiliary_loss_clip": 0.01109549, "auxiliary_loss_mlp": 0.01030391, "balance_loss_clip": 1.01665819, "balance_loss_mlp": 1.03637755, "epoch": 0.7350668871185931, "flos": 21866150920320.0, "grad_norm": 1.7663580750822596, "language_loss": 0.76369274, "learning_rate": 6.53709517973458e-07, "loss": 0.78509212, "num_input_tokens_seen": 263832725, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73046875, "step": 12226, "time_per_iteration": 2.491612195968628 }, { "auxiliary_loss_clip": 0.01122842, "auxiliary_loss_mlp": 0.01032318, "balance_loss_clip": 1.01918721, "balance_loss_mlp": 1.03569686, "epoch": 0.735127010371261, "flos": 22566122231040.0, "grad_norm": 2.011560800640259, "language_loss": 0.67022097, "learning_rate": 6.534301384949703e-07, "loss": 0.69177258, "num_input_tokens_seen": 263853850, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69140625, "step": 12227, "time_per_iteration": 2.5459747314453125 }, { "auxiliary_loss_clip": 0.0111411, "auxiliary_loss_mlp": 0.01034546, "balance_loss_clip": 1.02232063, "balance_loss_mlp": 1.03453183, "epoch": 0.7351871336239291, "flos": 25884339707520.0, "grad_norm": 1.5650023015921106, "language_loss": 0.63774753, "learning_rate": 6.531508070720972e-07, "loss": 0.65923417, "num_input_tokens_seen": 263874760, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 12228, "time_per_iteration": 3.9292569160461426 }, { "auxiliary_loss_clip": 0.01122762, "auxiliary_loss_mlp": 0.01032538, "balance_loss_clip": 1.02110577, "balance_loss_mlp": 1.03532171, "epoch": 0.735247256876597, "flos": 17932173569280.0, "grad_norm": 1.5579044147454006, "language_loss": 0.63635981, "learning_rate": 6.528715237148073e-07, "loss": 0.65791279, "num_input_tokens_seen": 263893390, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6953125, "step": 12229, "time_per_iteration": 2.6623239517211914 }, { "auxiliary_loss_clip": 0.01125974, "auxiliary_loss_mlp": 0.01035405, "balance_loss_clip": 1.0221014, "balance_loss_mlp": 1.03609681, "epoch": 0.735307380129265, "flos": 28875159694080.0, "grad_norm": 1.919944634792023, "language_loss": 0.73106533, "learning_rate": 6.525922884330668e-07, "loss": 0.75267911, "num_input_tokens_seen": 263911180, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 12230, "time_per_iteration": 2.5716142654418945 }, { "auxiliary_loss_clip": 0.01150918, "auxiliary_loss_mlp": 0.01033443, "balance_loss_clip": 1.02034807, "balance_loss_mlp": 1.03620207, "epoch": 0.7353675033819329, "flos": 13625658311040.0, "grad_norm": 2.1401664294696454, "language_loss": 0.72397488, "learning_rate": 6.523131012368428e-07, "loss": 0.7458185, "num_input_tokens_seen": 263928975, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12231, "time_per_iteration": 2.591911554336548 }, { "auxiliary_loss_clip": 0.01116572, "auxiliary_loss_mlp": 0.01039171, "balance_loss_clip": 1.02499747, "balance_loss_mlp": 1.03682101, "epoch": 0.7354276266346009, "flos": 19463081178240.0, "grad_norm": 2.3857443179133915, "language_loss": 0.63724756, "learning_rate": 6.520339621360964e-07, "loss": 0.65880501, "num_input_tokens_seen": 263944495, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7109375, "step": 12232, "time_per_iteration": 2.5013115406036377 }, { "auxiliary_loss_clip": 0.01124568, "auxiliary_loss_mlp": 0.01031363, "balance_loss_clip": 1.01844633, "balance_loss_mlp": 1.03559136, "epoch": 0.735487749887269, "flos": 15771858917760.0, "grad_norm": 1.7452283481076725, "language_loss": 0.75273013, "learning_rate": 6.51754871140791e-07, "loss": 0.77428949, "num_input_tokens_seen": 263961325, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 12233, "time_per_iteration": 2.5184178352355957 }, { "auxiliary_loss_clip": 0.01105961, "auxiliary_loss_mlp": 0.01030105, "balance_loss_clip": 1.01712918, "balance_loss_mlp": 1.03546596, "epoch": 0.7355478731399369, "flos": 18260648467200.0, "grad_norm": 2.1966115554224497, "language_loss": 0.73283732, "learning_rate": 6.514758282608856e-07, "loss": 0.75419796, "num_input_tokens_seen": 263980445, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 12234, "time_per_iteration": 3.9082770347595215 }, { "auxiliary_loss_clip": 0.01116344, "auxiliary_loss_mlp": 0.01029245, "balance_loss_clip": 1.01622748, "balance_loss_mlp": 1.03650963, "epoch": 0.7356079963926049, "flos": 26542043688960.0, "grad_norm": 2.0982731311545595, "language_loss": 0.59368646, "learning_rate": 6.511968335063405e-07, "loss": 0.61514235, "num_input_tokens_seen": 263999330, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 12235, "time_per_iteration": 2.564251184463501 }, { "auxiliary_loss_clip": 0.01123202, "auxiliary_loss_mlp": 0.01029303, "balance_loss_clip": 1.01737642, "balance_loss_mlp": 1.03696942, "epoch": 0.7356681196452728, "flos": 10778624467200.0, "grad_norm": 2.2426423616674165, "language_loss": 0.86037469, "learning_rate": 6.509178868871092e-07, "loss": 0.88189977, "num_input_tokens_seen": 264014150, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 12236, "time_per_iteration": 2.529439687728882 }, { "auxiliary_loss_clip": 0.01115469, "auxiliary_loss_mlp": 0.01274741, "balance_loss_clip": 1.01551771, "balance_loss_mlp": 1.0355016, "epoch": 0.7357282428979408, "flos": 19718693337600.0, "grad_norm": 1.8089320940794844, "language_loss": 0.69544661, "learning_rate": 6.506389884131494e-07, "loss": 0.71934879, "num_input_tokens_seen": 264033140, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 12237, "time_per_iteration": 2.5236074924468994 }, { "auxiliary_loss_clip": 0.01166457, "auxiliary_loss_mlp": 0.01028042, "balance_loss_clip": 1.01593649, "balance_loss_mlp": 1.034477, "epoch": 0.7357883661506087, "flos": 19464014931840.0, "grad_norm": 1.6182298836976898, "language_loss": 0.72223592, "learning_rate": 6.503601380944128e-07, "loss": 0.74418092, "num_input_tokens_seen": 264052105, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 12238, "time_per_iteration": 2.578054189682007 }, { "auxiliary_loss_clip": 0.01135459, "auxiliary_loss_mlp": 0.01028667, "balance_loss_clip": 1.01552415, "balance_loss_mlp": 1.03435922, "epoch": 0.7358484894032767, "flos": 27123006263040.0, "grad_norm": 1.9529283742405845, "language_loss": 0.72300565, "learning_rate": 6.500813359408513e-07, "loss": 0.74464691, "num_input_tokens_seen": 264070690, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.74609375, "step": 12239, "time_per_iteration": 2.6389331817626953 }, { "auxiliary_loss_clip": 0.01119616, "auxiliary_loss_mlp": 0.01032516, "balance_loss_clip": 1.02017832, "balance_loss_mlp": 1.03487039, "epoch": 0.7359086126559446, "flos": 24502282058880.0, "grad_norm": 1.3579358491495572, "language_loss": 0.78872454, "learning_rate": 6.498025819624138e-07, "loss": 0.81024587, "num_input_tokens_seen": 264094225, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.66796875, "step": 12240, "time_per_iteration": 4.33468222618103 }, { "auxiliary_loss_clip": 0.01115531, "auxiliary_loss_mlp": 0.01038266, "balance_loss_clip": 1.02555275, "balance_loss_mlp": 1.03697538, "epoch": 0.7359687359086127, "flos": 23331270769920.0, "grad_norm": 1.6906538424273816, "language_loss": 0.82630086, "learning_rate": 6.495238761690503e-07, "loss": 0.84783882, "num_input_tokens_seen": 264113190, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 12241, "time_per_iteration": 4.111192464828491 }, { "auxiliary_loss_clip": 0.01113252, "auxiliary_loss_mlp": 0.01028159, "balance_loss_clip": 1.0156008, "balance_loss_mlp": 1.0350014, "epoch": 0.7360288591612806, "flos": 20193396503040.0, "grad_norm": 1.8233941441183459, "language_loss": 0.78972888, "learning_rate": 6.492452185707052e-07, "loss": 0.81114298, "num_input_tokens_seen": 264132050, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12242, "time_per_iteration": 2.4889166355133057 }, { "auxiliary_loss_clip": 0.01141453, "auxiliary_loss_mlp": 0.01027549, "balance_loss_clip": 1.01538932, "balance_loss_mlp": 1.03660989, "epoch": 0.7360889824139486, "flos": 24972783333120.0, "grad_norm": 1.73852895921119, "language_loss": 0.79178905, "learning_rate": 6.489666091773231e-07, "loss": 0.81347907, "num_input_tokens_seen": 264152800, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 12243, "time_per_iteration": 2.6164252758026123 }, { "auxiliary_loss_clip": 0.0111613, "auxiliary_loss_mlp": 0.01034329, "balance_loss_clip": 1.02112091, "balance_loss_mlp": 1.0352844, "epoch": 0.7361491056666165, "flos": 15012312900480.0, "grad_norm": 8.238028290968998, "language_loss": 0.7432214, "learning_rate": 6.486880479988481e-07, "loss": 0.76472598, "num_input_tokens_seen": 264169650, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 12244, "time_per_iteration": 2.526484489440918 }, { "auxiliary_loss_clip": 0.01113912, "auxiliary_loss_mlp": 0.01032934, "balance_loss_clip": 1.01999354, "balance_loss_mlp": 1.03414083, "epoch": 0.7362092289192845, "flos": 22930400010240.0, "grad_norm": 2.0065564060904513, "language_loss": 0.69225264, "learning_rate": 6.484095350452205e-07, "loss": 0.7137211, "num_input_tokens_seen": 264190530, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 12245, "time_per_iteration": 2.5925862789154053 }, { "auxiliary_loss_clip": 0.01128285, "auxiliary_loss_mlp": 0.01034006, "balance_loss_clip": 1.02090526, "balance_loss_mlp": 1.03472769, "epoch": 0.7362693521719526, "flos": 20702681487360.0, "grad_norm": 1.5278540219372245, "language_loss": 0.73146081, "learning_rate": 6.4813107032638e-07, "loss": 0.75308371, "num_input_tokens_seen": 264210820, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.671875, "step": 12246, "time_per_iteration": 2.580479860305786 }, { "auxiliary_loss_clip": 0.01117578, "auxiliary_loss_mlp": 0.0102435, "balance_loss_clip": 1.01264954, "balance_loss_mlp": 1.0336107, "epoch": 0.7363294754246205, "flos": 13111381336320.0, "grad_norm": 2.0666437332971195, "language_loss": 0.73844272, "learning_rate": 6.478526538522638e-07, "loss": 0.75986201, "num_input_tokens_seen": 264227430, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 12247, "time_per_iteration": 2.4980926513671875 }, { "auxiliary_loss_clip": 0.01114938, "auxiliary_loss_mlp": 0.01030217, "balance_loss_clip": 1.01856422, "balance_loss_mlp": 1.03917205, "epoch": 0.7363895986772885, "flos": 14027426910720.0, "grad_norm": 2.0422389225175555, "language_loss": 0.7410841, "learning_rate": 6.475742856328093e-07, "loss": 0.76253569, "num_input_tokens_seen": 264245230, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 12248, "time_per_iteration": 2.50314998626709 }, { "auxiliary_loss_clip": 0.01122839, "auxiliary_loss_mlp": 0.01036656, "balance_loss_clip": 1.02357936, "balance_loss_mlp": 1.03497958, "epoch": 0.7364497219299564, "flos": 19719986227200.0, "grad_norm": 1.7869805671862395, "language_loss": 0.72668999, "learning_rate": 6.472959656779482e-07, "loss": 0.74828494, "num_input_tokens_seen": 264263945, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 12249, "time_per_iteration": 2.5792300701141357 }, { "auxiliary_loss_clip": 0.01124511, "auxiliary_loss_mlp": 0.01030855, "balance_loss_clip": 1.01781964, "balance_loss_mlp": 1.03715038, "epoch": 0.7365098451826244, "flos": 21361391049600.0, "grad_norm": 1.888849185497976, "language_loss": 0.77197599, "learning_rate": 6.470176939976153e-07, "loss": 0.79352963, "num_input_tokens_seen": 264281500, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 12250, "time_per_iteration": 2.5617995262145996 }, { "auxiliary_loss_clip": 0.01142984, "auxiliary_loss_mlp": 0.01029474, "balance_loss_clip": 1.01618791, "balance_loss_mlp": 1.03625822, "epoch": 0.7365699684352923, "flos": 23368222886400.0, "grad_norm": 1.665774438095798, "language_loss": 0.71315491, "learning_rate": 6.467394706017402e-07, "loss": 0.73487949, "num_input_tokens_seen": 264301625, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 12251, "time_per_iteration": 2.6338822841644287 }, { "auxiliary_loss_clip": 0.01113318, "auxiliary_loss_mlp": 0.01031891, "balance_loss_clip": 1.01958847, "balance_loss_mlp": 1.03478885, "epoch": 0.7366300916879603, "flos": 59524879927680.0, "grad_norm": 1.491185174557051, "language_loss": 0.65820909, "learning_rate": 6.464612955002535e-07, "loss": 0.67966115, "num_input_tokens_seen": 264323975, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 12252, "time_per_iteration": 2.8546886444091797 }, { "auxiliary_loss_clip": 0.01146954, "auxiliary_loss_mlp": 0.01035229, "balance_loss_clip": 1.0226469, "balance_loss_mlp": 1.0395391, "epoch": 0.7366902149406283, "flos": 20923137210240.0, "grad_norm": 1.5542517090691703, "language_loss": 0.79447258, "learning_rate": 6.461831687030801e-07, "loss": 0.81629443, "num_input_tokens_seen": 264343785, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 12253, "time_per_iteration": 2.5828685760498047 }, { "auxiliary_loss_clip": 0.01099396, "auxiliary_loss_mlp": 0.01276499, "balance_loss_clip": 1.01759052, "balance_loss_mlp": 1.03293359, "epoch": 0.7367503381932963, "flos": 17348158339200.0, "grad_norm": 1.9542860774834052, "language_loss": 0.75730902, "learning_rate": 6.459050902201477e-07, "loss": 0.78106803, "num_input_tokens_seen": 264361130, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6640625, "step": 12254, "time_per_iteration": 2.4852118492126465 }, { "auxiliary_loss_clip": 0.01122947, "auxiliary_loss_mlp": 0.01034312, "balance_loss_clip": 1.02164578, "balance_loss_mlp": 1.03534138, "epoch": 0.7368104614459642, "flos": 17821317219840.0, "grad_norm": 3.7383922985833666, "language_loss": 0.69745743, "learning_rate": 6.456270600613795e-07, "loss": 0.71903002, "num_input_tokens_seen": 264376965, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 12255, "time_per_iteration": 2.48750376701355 }, { "auxiliary_loss_clip": 0.01114283, "auxiliary_loss_mlp": 0.01028279, "balance_loss_clip": 1.01521921, "balance_loss_mlp": 1.0357343, "epoch": 0.7368705846986322, "flos": 24606099342720.0, "grad_norm": 1.7792920531606053, "language_loss": 0.75374556, "learning_rate": 6.453490782366977e-07, "loss": 0.77517116, "num_input_tokens_seen": 264396310, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 12256, "time_per_iteration": 2.542698860168457 }, { "auxiliary_loss_clip": 0.01116052, "auxiliary_loss_mlp": 0.01032583, "balance_loss_clip": 1.01944613, "balance_loss_mlp": 1.03608584, "epoch": 0.7369307079513001, "flos": 34970169968640.0, "grad_norm": 1.664198465250681, "language_loss": 0.73444462, "learning_rate": 6.450711447560227e-07, "loss": 0.7559309, "num_input_tokens_seen": 264418085, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12257, "time_per_iteration": 2.627373695373535 }, { "auxiliary_loss_clip": 0.01113312, "auxiliary_loss_mlp": 0.01036629, "balance_loss_clip": 1.02427912, "balance_loss_mlp": 1.03476024, "epoch": 0.7369908312039681, "flos": 21214588164480.0, "grad_norm": 1.5398963722992498, "language_loss": 0.77868903, "learning_rate": 6.447932596292731e-07, "loss": 0.80018842, "num_input_tokens_seen": 264437595, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 12258, "time_per_iteration": 2.6668317317962646 }, { "auxiliary_loss_clip": 0.01117773, "auxiliary_loss_mlp": 0.01035004, "balance_loss_clip": 1.02268398, "balance_loss_mlp": 1.03750336, "epoch": 0.7370509544566362, "flos": 23623655477760.0, "grad_norm": 1.427435164739765, "language_loss": 0.66246212, "learning_rate": 6.44515422866366e-07, "loss": 0.68398988, "num_input_tokens_seen": 264457385, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71484375, "step": 12259, "time_per_iteration": 2.5394437313079834 }, { "auxiliary_loss_clip": 0.01115267, "auxiliary_loss_mlp": 0.01032664, "balance_loss_clip": 1.01932454, "balance_loss_mlp": 1.03661716, "epoch": 0.7371110777093041, "flos": 24827704300800.0, "grad_norm": 1.5727874798767458, "language_loss": 0.73285025, "learning_rate": 6.442376344772165e-07, "loss": 0.7543295, "num_input_tokens_seen": 264477205, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 12260, "time_per_iteration": 2.5660645961761475 }, { "auxiliary_loss_clip": 0.01117712, "auxiliary_loss_mlp": 0.01030738, "balance_loss_clip": 1.01800644, "balance_loss_mlp": 1.03706765, "epoch": 0.7371712009619721, "flos": 23149491016320.0, "grad_norm": 1.6085302971429645, "language_loss": 0.73502183, "learning_rate": 6.439598944717386e-07, "loss": 0.75650632, "num_input_tokens_seen": 264497195, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 12261, "time_per_iteration": 2.5236027240753174 }, { "auxiliary_loss_clip": 0.0112554, "auxiliary_loss_mlp": 0.01036772, "balance_loss_clip": 1.02266979, "balance_loss_mlp": 1.03515577, "epoch": 0.73723132421464, "flos": 23112898035840.0, "grad_norm": 2.3431851988791412, "language_loss": 0.66767067, "learning_rate": 6.436822028598441e-07, "loss": 0.6892938, "num_input_tokens_seen": 264516950, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7265625, "step": 12262, "time_per_iteration": 2.5591115951538086 }, { "auxiliary_loss_clip": 0.01106619, "auxiliary_loss_mlp": 0.01031063, "balance_loss_clip": 1.01721692, "balance_loss_mlp": 1.03503394, "epoch": 0.737291447467308, "flos": 19273328605440.0, "grad_norm": 1.7664801431631705, "language_loss": 0.88576889, "learning_rate": 6.434045596514431e-07, "loss": 0.90714574, "num_input_tokens_seen": 264532675, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 12263, "time_per_iteration": 2.505232334136963 }, { "auxiliary_loss_clip": 0.01099805, "auxiliary_loss_mlp": 0.01027754, "balance_loss_clip": 1.0166198, "balance_loss_mlp": 1.03411996, "epoch": 0.7373515707199759, "flos": 25118257415040.0, "grad_norm": 11.208101903621067, "language_loss": 0.67280924, "learning_rate": 6.431269648564428e-07, "loss": 0.69408488, "num_input_tokens_seen": 264555635, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.65625, "step": 12264, "time_per_iteration": 2.578049421310425 }, { "auxiliary_loss_clip": 0.01100528, "auxiliary_loss_mlp": 0.01028727, "balance_loss_clip": 1.01680565, "balance_loss_mlp": 1.03365803, "epoch": 0.737411693972644, "flos": 32408481767040.0, "grad_norm": 1.7738660080640005, "language_loss": 0.80252147, "learning_rate": 6.428494184847524e-07, "loss": 0.82381403, "num_input_tokens_seen": 264573140, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66796875, "step": 12265, "time_per_iteration": 2.57637357711792 }, { "auxiliary_loss_clip": 0.01118109, "auxiliary_loss_mlp": 0.01032802, "balance_loss_clip": 1.02005887, "balance_loss_mlp": 1.03590655, "epoch": 0.7374718172253119, "flos": 24315797623680.0, "grad_norm": 1.7846685597874241, "language_loss": 0.74319494, "learning_rate": 6.425719205462737e-07, "loss": 0.76470405, "num_input_tokens_seen": 264591610, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 12266, "time_per_iteration": 2.5750949382781982 }, { "auxiliary_loss_clip": 0.0111582, "auxiliary_loss_mlp": 0.0103932, "balance_loss_clip": 1.02575958, "balance_loss_mlp": 1.03582418, "epoch": 0.7375319404779799, "flos": 27156115624320.0, "grad_norm": 1.587145927822232, "language_loss": 0.73198014, "learning_rate": 6.422944710509121e-07, "loss": 0.75353152, "num_input_tokens_seen": 264611170, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 12267, "time_per_iteration": 2.59509015083313 }, { "auxiliary_loss_clip": 0.01129659, "auxiliary_loss_mlp": 0.01030821, "balance_loss_clip": 1.01869118, "balance_loss_mlp": 1.03470016, "epoch": 0.7375920637306478, "flos": 18879999701760.0, "grad_norm": 2.0202086933045833, "language_loss": 0.83231139, "learning_rate": 6.420170700085687e-07, "loss": 0.85391623, "num_input_tokens_seen": 264629365, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 12268, "time_per_iteration": 2.6685941219329834 }, { "auxiliary_loss_clip": 0.01103967, "auxiliary_loss_mlp": 0.01041433, "balance_loss_clip": 1.02895164, "balance_loss_mlp": 1.03603125, "epoch": 0.7376521869833158, "flos": 15669765486720.0, "grad_norm": 2.405237827730977, "language_loss": 0.73489392, "learning_rate": 6.417397174291426e-07, "loss": 0.75634789, "num_input_tokens_seen": 264647915, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 12269, "time_per_iteration": 3.9347140789031982 }, { "auxiliary_loss_clip": 0.01125381, "auxiliary_loss_mlp": 0.01032449, "balance_loss_clip": 1.01938355, "balance_loss_mlp": 1.03524327, "epoch": 0.7377123102359837, "flos": 36971973901440.0, "grad_norm": 1.8932582574656311, "language_loss": 0.70333922, "learning_rate": 6.414624133225317e-07, "loss": 0.72491753, "num_input_tokens_seen": 264669620, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 12270, "time_per_iteration": 2.6465070247650146 }, { "auxiliary_loss_clip": 0.01111741, "auxiliary_loss_mlp": 0.01029627, "balance_loss_clip": 1.01795018, "balance_loss_mlp": 1.03406966, "epoch": 0.7377724334886517, "flos": 24496284487680.0, "grad_norm": 2.2634188015727355, "language_loss": 0.69297141, "learning_rate": 6.411851576986331e-07, "loss": 0.71438509, "num_input_tokens_seen": 264689345, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 12271, "time_per_iteration": 2.524191379547119 }, { "auxiliary_loss_clip": 0.01125245, "auxiliary_loss_mlp": 0.01033831, "balance_loss_clip": 1.02060473, "balance_loss_mlp": 1.03524661, "epoch": 0.7378325567413198, "flos": 24390025079040.0, "grad_norm": 2.481084405320237, "language_loss": 0.67625582, "learning_rate": 6.409079505673418e-07, "loss": 0.69784659, "num_input_tokens_seen": 264707625, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 12272, "time_per_iteration": 2.5445473194122314 }, { "auxiliary_loss_clip": 0.01119557, "auxiliary_loss_mlp": 0.01032492, "balance_loss_clip": 1.02015996, "balance_loss_mlp": 1.03522468, "epoch": 0.7378926799939877, "flos": 17416388223360.0, "grad_norm": 1.7558885576277468, "language_loss": 0.78255522, "learning_rate": 6.406307919385483e-07, "loss": 0.80407566, "num_input_tokens_seen": 264725575, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 12273, "time_per_iteration": 2.5404415130615234 }, { "auxiliary_loss_clip": 0.01102109, "auxiliary_loss_mlp": 0.01033713, "balance_loss_clip": 1.02128601, "balance_loss_mlp": 1.03334391, "epoch": 0.7379528032466557, "flos": 18474208778880.0, "grad_norm": 2.0762759478257724, "language_loss": 0.83796412, "learning_rate": 6.40353681822146e-07, "loss": 0.85932231, "num_input_tokens_seen": 264742855, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 12274, "time_per_iteration": 2.5028269290924072 }, { "auxiliary_loss_clip": 0.0112838, "auxiliary_loss_mlp": 0.01276224, "balance_loss_clip": 1.01510108, "balance_loss_mlp": 1.03598189, "epoch": 0.7380129264993236, "flos": 17821999578240.0, "grad_norm": 2.10593745614291, "language_loss": 0.73577976, "learning_rate": 6.400766202280232e-07, "loss": 0.75982583, "num_input_tokens_seen": 264761155, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.74609375, "step": 12275, "time_per_iteration": 2.5375144481658936 }, { "auxiliary_loss_clip": 0.01136594, "auxiliary_loss_mlp": 0.01039287, "balance_loss_clip": 1.02570856, "balance_loss_mlp": 1.03662586, "epoch": 0.7380730497519916, "flos": 22997372918400.0, "grad_norm": 1.736076454451931, "language_loss": 0.73112231, "learning_rate": 6.397996071660676e-07, "loss": 0.75288117, "num_input_tokens_seen": 264780660, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 12276, "time_per_iteration": 3.9972500801086426 }, { "auxiliary_loss_clip": 0.01108483, "auxiliary_loss_mlp": 0.01031789, "balance_loss_clip": 1.01912928, "balance_loss_mlp": 1.03655744, "epoch": 0.7381331730046595, "flos": 20266259241600.0, "grad_norm": 2.258601483639579, "language_loss": 0.77493262, "learning_rate": 6.395226426461646e-07, "loss": 0.79633534, "num_input_tokens_seen": 264798850, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 12277, "time_per_iteration": 2.5964550971984863 }, { "auxiliary_loss_clip": 0.01103851, "auxiliary_loss_mlp": 0.0103961, "balance_loss_clip": 1.02688444, "balance_loss_mlp": 1.03554392, "epoch": 0.7381932962573275, "flos": 19754532132480.0, "grad_norm": 1.6875793105272034, "language_loss": 0.78683007, "learning_rate": 6.392457266781996e-07, "loss": 0.80826467, "num_input_tokens_seen": 264816795, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 12278, "time_per_iteration": 2.464493751525879 }, { "auxiliary_loss_clip": 0.01114973, "auxiliary_loss_mlp": 0.01279255, "balance_loss_clip": 1.01979601, "balance_loss_mlp": 1.03535223, "epoch": 0.7382534195099955, "flos": 17305316392320.0, "grad_norm": 1.9831659668037807, "language_loss": 0.72350097, "learning_rate": 6.389688592720543e-07, "loss": 0.7474432, "num_input_tokens_seen": 264834105, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 12279, "time_per_iteration": 2.548251152038574 }, { "auxiliary_loss_clip": 0.01114708, "auxiliary_loss_mlp": 0.01038699, "balance_loss_clip": 1.02470946, "balance_loss_mlp": 1.03374565, "epoch": 0.7383135427626635, "flos": 18697358021760.0, "grad_norm": 2.3631495487506644, "language_loss": 0.85612321, "learning_rate": 6.386920404376095e-07, "loss": 0.87765723, "num_input_tokens_seen": 264850895, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.72265625, "step": 12280, "time_per_iteration": 2.470703125 }, { "auxiliary_loss_clip": 0.01133892, "auxiliary_loss_mlp": 0.01030035, "balance_loss_clip": 1.01705933, "balance_loss_mlp": 1.03599262, "epoch": 0.7383736660153314, "flos": 20881300844160.0, "grad_norm": 3.103901886403304, "language_loss": 0.72367471, "learning_rate": 6.384152701847434e-07, "loss": 0.745314, "num_input_tokens_seen": 264869505, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 12281, "time_per_iteration": 2.581366777420044 }, { "auxiliary_loss_clip": 0.01141177, "auxiliary_loss_mlp": 0.01033753, "balance_loss_clip": 1.0204556, "balance_loss_mlp": 1.03488493, "epoch": 0.7384337892679994, "flos": 20663215418880.0, "grad_norm": 1.6109629430560108, "language_loss": 0.60842121, "learning_rate": 6.38138548523335e-07, "loss": 0.63017052, "num_input_tokens_seen": 264886915, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 12282, "time_per_iteration": 3.9956719875335693 }, { "auxiliary_loss_clip": 0.01133938, "auxiliary_loss_mlp": 0.01029618, "balance_loss_clip": 1.01645768, "balance_loss_mlp": 1.03615332, "epoch": 0.7384939125206673, "flos": 29169627390720.0, "grad_norm": 1.7369326523427306, "language_loss": 0.6773901, "learning_rate": 6.378618754632576e-07, "loss": 0.69902563, "num_input_tokens_seen": 264910350, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 12283, "time_per_iteration": 4.1229588985443115 }, { "auxiliary_loss_clip": 0.01106145, "auxiliary_loss_mlp": 0.01284201, "balance_loss_clip": 1.02423739, "balance_loss_mlp": 1.03710091, "epoch": 0.7385540357733353, "flos": 36312833376000.0, "grad_norm": 1.7426674218526308, "language_loss": 0.75883359, "learning_rate": 6.375852510143867e-07, "loss": 0.78273702, "num_input_tokens_seen": 264930705, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69140625, "step": 12284, "time_per_iteration": 2.65901517868042 }, { "auxiliary_loss_clip": 0.0111511, "auxiliary_loss_mlp": 0.01030228, "balance_loss_clip": 1.01719809, "balance_loss_mlp": 1.03482246, "epoch": 0.7386141590260034, "flos": 20302600826880.0, "grad_norm": 2.1274918388140573, "language_loss": 0.69187009, "learning_rate": 6.373086751865935e-07, "loss": 0.71332353, "num_input_tokens_seen": 264946975, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 12285, "time_per_iteration": 2.5720410346984863 }, { "auxiliary_loss_clip": 0.0112767, "auxiliary_loss_mlp": 0.01032972, "balance_loss_clip": 1.01928091, "balance_loss_mlp": 1.03558993, "epoch": 0.7386742822786713, "flos": 25483792170240.0, "grad_norm": 2.1548810896706105, "language_loss": 0.79890859, "learning_rate": 6.370321479897485e-07, "loss": 0.82051504, "num_input_tokens_seen": 264967665, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 12286, "time_per_iteration": 2.6461386680603027 }, { "auxiliary_loss_clip": 0.01139639, "auxiliary_loss_mlp": 0.01029594, "balance_loss_clip": 1.01684451, "balance_loss_mlp": 1.03488517, "epoch": 0.7387344055313393, "flos": 13771958405760.0, "grad_norm": 2.1859474281916476, "language_loss": 0.65285373, "learning_rate": 6.367556694337199e-07, "loss": 0.67454612, "num_input_tokens_seen": 264985480, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 12287, "time_per_iteration": 2.6405043601989746 }, { "auxiliary_loss_clip": 0.01119472, "auxiliary_loss_mlp": 0.01028963, "balance_loss_clip": 1.01667261, "balance_loss_mlp": 1.03465009, "epoch": 0.7387945287840072, "flos": 27855189095040.0, "grad_norm": 1.7169694646125044, "language_loss": 0.76621777, "learning_rate": 6.364792395283744e-07, "loss": 0.78770214, "num_input_tokens_seen": 265004790, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 12288, "time_per_iteration": 2.6106209754943848 }, { "auxiliary_loss_clip": 0.01118187, "auxiliary_loss_mlp": 0.01280612, "balance_loss_clip": 1.02022803, "balance_loss_mlp": 1.03670609, "epoch": 0.7388546520366752, "flos": 44233039388160.0, "grad_norm": 1.6553799658582222, "language_loss": 0.58339775, "learning_rate": 6.362028582835788e-07, "loss": 0.60738575, "num_input_tokens_seen": 265028790, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 12289, "time_per_iteration": 2.752814531326294 }, { "auxiliary_loss_clip": 0.01027486, "auxiliary_loss_mlp": 0.01251375, "balance_loss_clip": 1.00414431, "balance_loss_mlp": 1.00590479, "epoch": 0.7389147752893431, "flos": 70680890638080.0, "grad_norm": 0.6392927226554402, "language_loss": 0.49330664, "learning_rate": 6.359265257091937e-07, "loss": 0.51609528, "num_input_tokens_seen": 265096660, "router_z_loss_clip": 0.01226807, "router_z_loss_mlp": 0.21582031, "step": 12290, "time_per_iteration": 3.220038890838623 }, { "auxiliary_loss_clip": 0.01116445, "auxiliary_loss_mlp": 0.0103229, "balance_loss_clip": 1.01840854, "balance_loss_mlp": 1.03566146, "epoch": 0.7389748985420111, "flos": 25994980575360.0, "grad_norm": 1.9954091764722057, "language_loss": 0.67175025, "learning_rate": 6.356502418150827e-07, "loss": 0.69323766, "num_input_tokens_seen": 265116375, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 12291, "time_per_iteration": 2.57578182220459 }, { "auxiliary_loss_clip": 0.01062703, "auxiliary_loss_mlp": 0.01012655, "balance_loss_clip": 1.01140308, "balance_loss_mlp": 1.00604558, "epoch": 0.7390350217946791, "flos": 54403661318400.0, "grad_norm": 0.9929115269965743, "language_loss": 0.60855752, "learning_rate": 6.353740066111051e-07, "loss": 0.62931108, "num_input_tokens_seen": 265161230, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.21484375, "step": 12292, "time_per_iteration": 2.891098737716675 }, { "auxiliary_loss_clip": 0.01141756, "auxiliary_loss_mlp": 0.01032793, "balance_loss_clip": 1.02099741, "balance_loss_mlp": 1.03704846, "epoch": 0.7390951450473471, "flos": 32196968530560.0, "grad_norm": 2.4466400965359165, "language_loss": 0.66742933, "learning_rate": 6.350978201071189e-07, "loss": 0.68917477, "num_input_tokens_seen": 265182515, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69140625, "step": 12293, "time_per_iteration": 2.664376974105835 }, { "auxiliary_loss_clip": 0.01113672, "auxiliary_loss_mlp": 0.01034299, "balance_loss_clip": 1.02175248, "balance_loss_mlp": 1.03522217, "epoch": 0.739155268300015, "flos": 16684241304960.0, "grad_norm": 1.774957779755938, "language_loss": 0.8347165, "learning_rate": 6.3482168231298e-07, "loss": 0.85619622, "num_input_tokens_seen": 265198160, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12294, "time_per_iteration": 2.5144498348236084 }, { "auxiliary_loss_clip": 0.01129836, "auxiliary_loss_mlp": 0.01033983, "balance_loss_clip": 1.02099538, "balance_loss_mlp": 1.03460443, "epoch": 0.739215391552683, "flos": 31649761762560.0, "grad_norm": 1.4722545801655926, "language_loss": 0.7271679, "learning_rate": 6.345455932385442e-07, "loss": 0.74880612, "num_input_tokens_seen": 265218480, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 12295, "time_per_iteration": 2.6123623847961426 }, { "auxiliary_loss_clip": 0.01111377, "auxiliary_loss_mlp": 0.01036399, "balance_loss_clip": 1.02416205, "balance_loss_mlp": 1.03562748, "epoch": 0.7392755148053509, "flos": 29718522097920.0, "grad_norm": 1.8779217514392261, "language_loss": 0.78968441, "learning_rate": 6.342695528936637e-07, "loss": 0.81116217, "num_input_tokens_seen": 265240165, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 12296, "time_per_iteration": 2.6276628971099854 }, { "auxiliary_loss_clip": 0.01106472, "auxiliary_loss_mlp": 0.01029457, "balance_loss_clip": 1.01575947, "balance_loss_mlp": 1.03664207, "epoch": 0.7393356380580189, "flos": 37050475075200.0, "grad_norm": 1.9390096061867366, "language_loss": 0.66429865, "learning_rate": 6.3399356128819e-07, "loss": 0.68565798, "num_input_tokens_seen": 265263295, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.69921875, "step": 12297, "time_per_iteration": 2.644869804382324 }, { "auxiliary_loss_clip": 0.01124126, "auxiliary_loss_mlp": 0.01032386, "balance_loss_clip": 1.01952386, "balance_loss_mlp": 1.03605056, "epoch": 0.739395761310687, "flos": 19719627091200.0, "grad_norm": 6.656444119074827, "language_loss": 0.68715727, "learning_rate": 6.337176184319715e-07, "loss": 0.70872235, "num_input_tokens_seen": 265282740, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 12298, "time_per_iteration": 2.546565055847168 }, { "auxiliary_loss_clip": 0.01124307, "auxiliary_loss_mlp": 0.01031645, "balance_loss_clip": 1.0190506, "balance_loss_mlp": 1.03539205, "epoch": 0.7394558845633549, "flos": 11801504067840.0, "grad_norm": 1.7909901204749024, "language_loss": 0.74763155, "learning_rate": 6.33441724334858e-07, "loss": 0.76919103, "num_input_tokens_seen": 265300175, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 12299, "time_per_iteration": 2.545093059539795 }, { "auxiliary_loss_clip": 0.01045318, "auxiliary_loss_mlp": 0.01004987, "balance_loss_clip": 1.00371742, "balance_loss_mlp": 1.00564182, "epoch": 0.7395160078160229, "flos": 66195827850240.0, "grad_norm": 0.7163890251001025, "language_loss": 0.60879791, "learning_rate": 6.33165879006693e-07, "loss": 0.62930095, "num_input_tokens_seen": 265363275, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.21582031, "step": 12300, "time_per_iteration": 3.171102285385132 }, { "auxiliary_loss_clip": 0.01124287, "auxiliary_loss_mlp": 0.0102821, "balance_loss_clip": 1.01537132, "balance_loss_mlp": 1.03690207, "epoch": 0.7395761310686908, "flos": 21249708687360.0, "grad_norm": 1.6138473831823965, "language_loss": 0.80161202, "learning_rate": 6.328900824573222e-07, "loss": 0.82313699, "num_input_tokens_seen": 265382935, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 12301, "time_per_iteration": 2.543426990509033 }, { "auxiliary_loss_clip": 0.01103768, "auxiliary_loss_mlp": 0.01026584, "balance_loss_clip": 1.01490152, "balance_loss_mlp": 1.03458548, "epoch": 0.7396362543213588, "flos": 25955299025280.0, "grad_norm": 1.5844604584858704, "language_loss": 0.73275149, "learning_rate": 6.326143346965887e-07, "loss": 0.75405502, "num_input_tokens_seen": 265403245, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 12302, "time_per_iteration": 2.5828351974487305 }, { "auxiliary_loss_clip": 0.01106322, "auxiliary_loss_mlp": 0.01038546, "balance_loss_clip": 1.02580225, "balance_loss_mlp": 1.03510249, "epoch": 0.7396963775740267, "flos": 27377936064000.0, "grad_norm": 1.7874044541025949, "language_loss": 0.73841095, "learning_rate": 6.323386357343308e-07, "loss": 0.75985956, "num_input_tokens_seen": 265423105, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 12303, "time_per_iteration": 2.528878927230835 }, { "auxiliary_loss_clip": 0.01102634, "auxiliary_loss_mlp": 0.01028518, "balance_loss_clip": 1.01649618, "balance_loss_mlp": 1.03440809, "epoch": 0.7397565008266948, "flos": 25520133755520.0, "grad_norm": 2.0287521058218565, "language_loss": 0.53882164, "learning_rate": 6.320629855803897e-07, "loss": 0.56013316, "num_input_tokens_seen": 265443445, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 12304, "time_per_iteration": 2.514188289642334 }, { "auxiliary_loss_clip": 0.0112344, "auxiliary_loss_mlp": 0.01030508, "balance_loss_clip": 1.01724589, "balance_loss_mlp": 1.03584385, "epoch": 0.7398166240793627, "flos": 23727760070400.0, "grad_norm": 4.064818626472812, "language_loss": 0.84387082, "learning_rate": 6.317873842446011e-07, "loss": 0.86541027, "num_input_tokens_seen": 265462085, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 12305, "time_per_iteration": 2.5999276638031006 }, { "auxiliary_loss_clip": 0.01044825, "auxiliary_loss_mlp": 0.01000171, "balance_loss_clip": 0.99880618, "balance_loss_mlp": 1.00527692, "epoch": 0.7398767473320307, "flos": 67267582882560.0, "grad_norm": 0.8767079233514465, "language_loss": 0.57728612, "learning_rate": 6.315118317368027e-07, "loss": 0.59773612, "num_input_tokens_seen": 265521190, "router_z_loss_clip": 0.01367188, "router_z_loss_mlp": 0.21484375, "step": 12306, "time_per_iteration": 3.16927170753479 }, { "auxiliary_loss_clip": 0.01126818, "auxiliary_loss_mlp": 0.01031311, "balance_loss_clip": 1.01797104, "balance_loss_mlp": 1.03743982, "epoch": 0.7399368705846986, "flos": 22018699981440.0, "grad_norm": 1.968269296220247, "language_loss": 0.81412721, "learning_rate": 6.312363280668253e-07, "loss": 0.8357085, "num_input_tokens_seen": 265539705, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 12307, "time_per_iteration": 2.592970609664917 }, { "auxiliary_loss_clip": 0.01112622, "auxiliary_loss_mlp": 0.01032669, "balance_loss_clip": 1.0207659, "balance_loss_mlp": 1.03652227, "epoch": 0.7399969938373666, "flos": 14173870659840.0, "grad_norm": 1.914173456183972, "language_loss": 0.6992358, "learning_rate": 6.309608732445035e-07, "loss": 0.7206887, "num_input_tokens_seen": 265555855, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 12308, "time_per_iteration": 2.5404884815216064 }, { "auxiliary_loss_clip": 0.01119058, "auxiliary_loss_mlp": 0.01029315, "balance_loss_clip": 1.01738226, "balance_loss_mlp": 1.03416288, "epoch": 0.7400571170900345, "flos": 25301473712640.0, "grad_norm": 1.787491158440563, "language_loss": 0.81402636, "learning_rate": 6.306854672796664e-07, "loss": 0.83551002, "num_input_tokens_seen": 265575455, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 12309, "time_per_iteration": 2.558058500289917 }, { "auxiliary_loss_clip": 0.01116236, "auxiliary_loss_mlp": 0.0103321, "balance_loss_clip": 1.01998997, "balance_loss_mlp": 1.0357579, "epoch": 0.7401172403427025, "flos": 22711344917760.0, "grad_norm": 2.0565047228957334, "language_loss": 0.72323203, "learning_rate": 6.304101101821426e-07, "loss": 0.74472654, "num_input_tokens_seen": 265595250, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 12310, "time_per_iteration": 2.5520153045654297 }, { "auxiliary_loss_clip": 0.01134384, "auxiliary_loss_mlp": 0.01037354, "balance_loss_clip": 1.02349043, "balance_loss_mlp": 1.0365566, "epoch": 0.7401773635953706, "flos": 18067448188800.0, "grad_norm": 2.4618370676097534, "language_loss": 0.88941115, "learning_rate": 6.301348019617585e-07, "loss": 0.91112852, "num_input_tokens_seen": 265606945, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7109375, "step": 12311, "time_per_iteration": 3.954346179962158 }, { "auxiliary_loss_clip": 0.01147684, "auxiliary_loss_mlp": 0.01027866, "balance_loss_clip": 1.01584351, "balance_loss_mlp": 1.03331089, "epoch": 0.7402374868480385, "flos": 22712135016960.0, "grad_norm": 1.6489547227856982, "language_loss": 0.80179578, "learning_rate": 6.298595426283399e-07, "loss": 0.8235513, "num_input_tokens_seen": 265626115, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69921875, "step": 12312, "time_per_iteration": 2.6077733039855957 }, { "auxiliary_loss_clip": 0.01140275, "auxiliary_loss_mlp": 0.01028699, "balance_loss_clip": 1.01658142, "balance_loss_mlp": 1.03581786, "epoch": 0.7402976101007065, "flos": 22856675345280.0, "grad_norm": 1.6232861585346863, "language_loss": 0.78081083, "learning_rate": 6.295843321917102e-07, "loss": 0.80250061, "num_input_tokens_seen": 265646520, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 12313, "time_per_iteration": 2.6013002395629883 }, { "auxiliary_loss_clip": 0.01140814, "auxiliary_loss_mlp": 0.01037944, "balance_loss_clip": 1.02446115, "balance_loss_mlp": 1.033553, "epoch": 0.7403577333533744, "flos": 12345801834240.0, "grad_norm": 1.8174954610201917, "language_loss": 0.78629351, "learning_rate": 6.293091706616905e-07, "loss": 0.80808115, "num_input_tokens_seen": 265661875, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 12314, "time_per_iteration": 2.5595247745513916 }, { "auxiliary_loss_clip": 0.0103495, "auxiliary_loss_mlp": 0.01002159, "balance_loss_clip": 1.00087726, "balance_loss_mlp": 1.0052036, "epoch": 0.7404178566060424, "flos": 60327270869760.0, "grad_norm": 0.8308390360880534, "language_loss": 0.55198807, "learning_rate": 6.290340580480997e-07, "loss": 0.5723592, "num_input_tokens_seen": 265721255, "router_z_loss_clip": 0.01281738, "router_z_loss_mlp": 0.21484375, "step": 12315, "time_per_iteration": 3.1621265411376953 }, { "auxiliary_loss_clip": 0.01111124, "auxiliary_loss_mlp": 0.010322, "balance_loss_clip": 1.01996279, "balance_loss_mlp": 1.0356288, "epoch": 0.7404779798587103, "flos": 32014650072960.0, "grad_norm": 1.4553112967700783, "language_loss": 0.79521221, "learning_rate": 6.287589943607584e-07, "loss": 0.8166455, "num_input_tokens_seen": 265743970, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.66796875, "step": 12316, "time_per_iteration": 2.606501817703247 }, { "auxiliary_loss_clip": 0.01111875, "auxiliary_loss_mlp": 0.01027612, "balance_loss_clip": 1.01519084, "balance_loss_mlp": 1.03333676, "epoch": 0.7405381031113784, "flos": 12889704551040.0, "grad_norm": 1.8628600812513034, "language_loss": 0.7497319, "learning_rate": 6.284839796094806e-07, "loss": 0.77112675, "num_input_tokens_seen": 265760890, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 12317, "time_per_iteration": 3.9746899604797363 }, { "auxiliary_loss_clip": 0.01118902, "auxiliary_loss_mlp": 0.01034376, "balance_loss_clip": 1.02221072, "balance_loss_mlp": 1.03484011, "epoch": 0.7405982263640463, "flos": 20229127557120.0, "grad_norm": 1.6532163734541743, "language_loss": 0.81396675, "learning_rate": 6.28209013804081e-07, "loss": 0.83549953, "num_input_tokens_seen": 265779600, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.66015625, "step": 12318, "time_per_iteration": 2.598388910293579 }, { "auxiliary_loss_clip": 0.01122237, "auxiliary_loss_mlp": 0.01030777, "balance_loss_clip": 1.01827753, "balance_loss_mlp": 1.03381467, "epoch": 0.7406583496167143, "flos": 17567213431680.0, "grad_norm": 2.2317519144117934, "language_loss": 0.7680608, "learning_rate": 6.279340969543742e-07, "loss": 0.78959095, "num_input_tokens_seen": 265797030, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 12319, "time_per_iteration": 2.5629491806030273 }, { "auxiliary_loss_clip": 0.0112113, "auxiliary_loss_mlp": 0.01030279, "balance_loss_clip": 1.01858437, "balance_loss_mlp": 1.03493381, "epoch": 0.7407184728693822, "flos": 18295733076480.0, "grad_norm": 4.723566593702287, "language_loss": 0.63576245, "learning_rate": 6.27659229070169e-07, "loss": 0.65727651, "num_input_tokens_seen": 265815055, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 12320, "time_per_iteration": 2.5281360149383545 }, { "auxiliary_loss_clip": 0.01116637, "auxiliary_loss_mlp": 0.01036476, "balance_loss_clip": 1.02381635, "balance_loss_mlp": 1.03587222, "epoch": 0.7407785961220502, "flos": 16690562098560.0, "grad_norm": 2.8375374204454498, "language_loss": 0.82070982, "learning_rate": 6.273844101612765e-07, "loss": 0.84224099, "num_input_tokens_seen": 265828480, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 12321, "time_per_iteration": 2.480504274368286 }, { "auxiliary_loss_clip": 0.01129065, "auxiliary_loss_mlp": 0.01045741, "balance_loss_clip": 1.03122091, "balance_loss_mlp": 1.03637099, "epoch": 0.7408387193747181, "flos": 22088330496000.0, "grad_norm": 22.97860357706183, "language_loss": 0.72275954, "learning_rate": 6.271096402375027e-07, "loss": 0.74450761, "num_input_tokens_seen": 265845825, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75390625, "step": 12322, "time_per_iteration": 2.5836806297302246 }, { "auxiliary_loss_clip": 0.01113707, "auxiliary_loss_mlp": 0.01280817, "balance_loss_clip": 1.02136588, "balance_loss_mlp": 1.03579569, "epoch": 0.7408988426273861, "flos": 24236721832320.0, "grad_norm": 1.7408126074334815, "language_loss": 0.63930523, "learning_rate": 6.268349193086557e-07, "loss": 0.66325045, "num_input_tokens_seen": 265866335, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 12323, "time_per_iteration": 2.5969340801239014 }, { "auxiliary_loss_clip": 0.01132578, "auxiliary_loss_mlp": 0.01028443, "balance_loss_clip": 1.01539564, "balance_loss_mlp": 1.03526902, "epoch": 0.7409589658800542, "flos": 29023004073600.0, "grad_norm": 1.357589401211911, "language_loss": 0.7603116, "learning_rate": 6.26560247384537e-07, "loss": 0.78192186, "num_input_tokens_seen": 265888945, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12324, "time_per_iteration": 5.641865491867065 }, { "auxiliary_loss_clip": 0.01121563, "auxiliary_loss_mlp": 0.01026018, "balance_loss_clip": 1.01384664, "balance_loss_mlp": 1.03306508, "epoch": 0.7410190891327221, "flos": 19351362902400.0, "grad_norm": 1.9287610056747215, "language_loss": 0.74933678, "learning_rate": 6.262856244749508e-07, "loss": 0.77081263, "num_input_tokens_seen": 265908030, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 12325, "time_per_iteration": 2.545260429382324 }, { "auxiliary_loss_clip": 0.01144661, "auxiliary_loss_mlp": 0.01029656, "balance_loss_clip": 1.01598251, "balance_loss_mlp": 1.03523672, "epoch": 0.7410792123853901, "flos": 22747650589440.0, "grad_norm": 1.831744916877436, "language_loss": 0.68558407, "learning_rate": 6.260110505896971e-07, "loss": 0.70732725, "num_input_tokens_seen": 265927030, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 12326, "time_per_iteration": 2.6150424480438232 }, { "auxiliary_loss_clip": 0.01131815, "auxiliary_loss_mlp": 0.01031307, "balance_loss_clip": 1.01890302, "balance_loss_mlp": 1.03502965, "epoch": 0.741139335638058, "flos": 25372433030400.0, "grad_norm": 1.7239850186703556, "language_loss": 0.89522231, "learning_rate": 6.257365257385748e-07, "loss": 0.91685355, "num_input_tokens_seen": 265945490, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 12327, "time_per_iteration": 2.6738476753234863 }, { "auxiliary_loss_clip": 0.01105558, "auxiliary_loss_mlp": 0.01032055, "balance_loss_clip": 1.01978827, "balance_loss_mlp": 1.03617215, "epoch": 0.741199458890726, "flos": 18585639745920.0, "grad_norm": 1.885038473066877, "language_loss": 0.85301125, "learning_rate": 6.2546204993138e-07, "loss": 0.87438738, "num_input_tokens_seen": 265963265, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 12328, "time_per_iteration": 2.4978153705596924 }, { "auxiliary_loss_clip": 0.0111126, "auxiliary_loss_mlp": 0.01030918, "balance_loss_clip": 1.0175252, "balance_loss_mlp": 1.0345211, "epoch": 0.7412595821433939, "flos": 22127078292480.0, "grad_norm": 1.7916144776436418, "language_loss": 0.66496336, "learning_rate": 6.251876231779103e-07, "loss": 0.68638521, "num_input_tokens_seen": 265982270, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6796875, "step": 12329, "time_per_iteration": 2.545027256011963 }, { "auxiliary_loss_clip": 0.01136724, "auxiliary_loss_mlp": 0.01030164, "balance_loss_clip": 1.01740813, "balance_loss_mlp": 1.03673196, "epoch": 0.741319705396062, "flos": 29169699217920.0, "grad_norm": 1.7986024743662032, "language_loss": 0.6627236, "learning_rate": 6.249132454879564e-07, "loss": 0.68439245, "num_input_tokens_seen": 266003835, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 12330, "time_per_iteration": 2.5771596431732178 }, { "auxiliary_loss_clip": 0.01141307, "auxiliary_loss_mlp": 0.0103438, "balance_loss_clip": 1.01972294, "balance_loss_mlp": 1.03890538, "epoch": 0.7413798286487299, "flos": 20667489137280.0, "grad_norm": 2.3570432980576714, "language_loss": 0.85327095, "learning_rate": 6.246389168713127e-07, "loss": 0.87502784, "num_input_tokens_seen": 266021595, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75, "step": 12331, "time_per_iteration": 2.547640323638916 }, { "auxiliary_loss_clip": 0.01159636, "auxiliary_loss_mlp": 0.01029129, "balance_loss_clip": 1.01621294, "balance_loss_mlp": 1.03448045, "epoch": 0.7414399519013979, "flos": 16398895662720.0, "grad_norm": 1.8963553221963132, "language_loss": 0.69936466, "learning_rate": 6.243646373377678e-07, "loss": 0.72125232, "num_input_tokens_seen": 266039860, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 12332, "time_per_iteration": 2.8620760440826416 }, { "auxiliary_loss_clip": 0.01105767, "auxiliary_loss_mlp": 0.01037298, "balance_loss_clip": 1.02370191, "balance_loss_mlp": 1.03522253, "epoch": 0.7415000751540658, "flos": 25630235919360.0, "grad_norm": 1.8028272928654199, "language_loss": 0.63541985, "learning_rate": 6.240904068971107e-07, "loss": 0.65685046, "num_input_tokens_seen": 266058050, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 12333, "time_per_iteration": 2.53550124168396 }, { "auxiliary_loss_clip": 0.01121109, "auxiliary_loss_mlp": 0.01037252, "balance_loss_clip": 1.0250988, "balance_loss_mlp": 1.03724265, "epoch": 0.7415601984067338, "flos": 24499732193280.0, "grad_norm": 1.5976713717952467, "language_loss": 0.71425873, "learning_rate": 6.238162255591275e-07, "loss": 0.73584235, "num_input_tokens_seen": 266078060, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.66015625, "step": 12334, "time_per_iteration": 2.596057891845703 }, { "auxiliary_loss_clip": 0.01130758, "auxiliary_loss_mlp": 0.01027971, "balance_loss_clip": 1.01556098, "balance_loss_mlp": 1.03387904, "epoch": 0.7416203216594017, "flos": 20887154760960.0, "grad_norm": 2.6443970748535377, "language_loss": 0.82256651, "learning_rate": 6.235420933336026e-07, "loss": 0.84415388, "num_input_tokens_seen": 266097110, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 12335, "time_per_iteration": 2.565228223800659 }, { "auxiliary_loss_clip": 0.01105484, "auxiliary_loss_mlp": 0.01031022, "balance_loss_clip": 1.0174799, "balance_loss_mlp": 1.03530049, "epoch": 0.7416804449120697, "flos": 15624265933440.0, "grad_norm": 2.693400803134491, "language_loss": 0.74517512, "learning_rate": 6.232680102303212e-07, "loss": 0.76654017, "num_input_tokens_seen": 266110870, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 12336, "time_per_iteration": 2.4631106853485107 }, { "auxiliary_loss_clip": 0.01143785, "auxiliary_loss_mlp": 0.01032662, "balance_loss_clip": 1.01981747, "balance_loss_mlp": 1.03665686, "epoch": 0.7417405681647377, "flos": 17120483982720.0, "grad_norm": 1.8165565663935117, "language_loss": 0.7324537, "learning_rate": 6.229939762590617e-07, "loss": 0.75421822, "num_input_tokens_seen": 266127845, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 12337, "time_per_iteration": 2.519272565841675 }, { "auxiliary_loss_clip": 0.01122932, "auxiliary_loss_mlp": 0.01033096, "balance_loss_clip": 1.02093041, "balance_loss_mlp": 1.03745031, "epoch": 0.7418006914174057, "flos": 18880322924160.0, "grad_norm": 1.7425699936844394, "language_loss": 0.76446462, "learning_rate": 6.22719991429606e-07, "loss": 0.78602493, "num_input_tokens_seen": 266145400, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 12338, "time_per_iteration": 2.5434608459472656 }, { "auxiliary_loss_clip": 0.0111081, "auxiliary_loss_mlp": 0.01028118, "balance_loss_clip": 1.01664424, "balance_loss_mlp": 1.03396487, "epoch": 0.7418608146700737, "flos": 21580733450880.0, "grad_norm": 1.792422290498987, "language_loss": 0.73075813, "learning_rate": 6.224460557517301e-07, "loss": 0.75214744, "num_input_tokens_seen": 266164430, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 12339, "time_per_iteration": 2.5203335285186768 }, { "auxiliary_loss_clip": 0.01123001, "auxiliary_loss_mlp": 0.01031096, "balance_loss_clip": 1.01795912, "balance_loss_mlp": 1.03516221, "epoch": 0.7419209379227416, "flos": 22340459036160.0, "grad_norm": 1.9069792788211577, "language_loss": 0.79635435, "learning_rate": 6.221721692352123e-07, "loss": 0.81789529, "num_input_tokens_seen": 266183855, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 12340, "time_per_iteration": 2.5659754276275635 }, { "auxiliary_loss_clip": 0.01131794, "auxiliary_loss_mlp": 0.010369, "balance_loss_clip": 1.02415681, "balance_loss_mlp": 1.03427482, "epoch": 0.7419810611754096, "flos": 16762275601920.0, "grad_norm": 1.610460126870071, "language_loss": 0.75599831, "learning_rate": 6.218983318898243e-07, "loss": 0.77768528, "num_input_tokens_seen": 266202085, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12341, "time_per_iteration": 2.5390853881835938 }, { "auxiliary_loss_clip": 0.0111579, "auxiliary_loss_mlp": 0.01033404, "balance_loss_clip": 1.02061319, "balance_loss_mlp": 1.03485346, "epoch": 0.7420411844280775, "flos": 26212958259840.0, "grad_norm": 1.4584527127270062, "language_loss": 0.80037081, "learning_rate": 6.216245437253407e-07, "loss": 0.8218627, "num_input_tokens_seen": 266223445, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6328125, "step": 12342, "time_per_iteration": 2.658043146133423 }, { "auxiliary_loss_clip": 0.01130058, "auxiliary_loss_mlp": 0.0102912, "balance_loss_clip": 1.01677597, "balance_loss_mlp": 1.03464448, "epoch": 0.7421013076807456, "flos": 68529371840640.0, "grad_norm": 1.7558145910560665, "language_loss": 0.77586424, "learning_rate": 6.213508047515314e-07, "loss": 0.79745603, "num_input_tokens_seen": 266246575, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 12343, "time_per_iteration": 2.9414138793945312 }, { "auxiliary_loss_clip": 0.01119187, "auxiliary_loss_mlp": 0.01032156, "balance_loss_clip": 1.01993716, "balance_loss_mlp": 1.03269911, "epoch": 0.7421614309334135, "flos": 24425325169920.0, "grad_norm": 3.6718457029476075, "language_loss": 0.6768195, "learning_rate": 6.210771149781655e-07, "loss": 0.69833291, "num_input_tokens_seen": 266266055, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 12344, "time_per_iteration": 2.6130263805389404 }, { "auxiliary_loss_clip": 0.01140414, "auxiliary_loss_mlp": 0.010359, "balance_loss_clip": 1.0221194, "balance_loss_mlp": 1.03455234, "epoch": 0.7422215541860815, "flos": 12311076360960.0, "grad_norm": 2.132056018774216, "language_loss": 0.80747384, "learning_rate": 6.208034744150099e-07, "loss": 0.82923698, "num_input_tokens_seen": 266282240, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.70703125, "step": 12345, "time_per_iteration": 2.5221457481384277 }, { "auxiliary_loss_clip": 0.0113304, "auxiliary_loss_mlp": 0.01036554, "balance_loss_clip": 1.02385855, "balance_loss_mlp": 1.03541958, "epoch": 0.7422816774387494, "flos": 19645579203840.0, "grad_norm": 3.7031001857498698, "language_loss": 0.70975608, "learning_rate": 6.205298830718317e-07, "loss": 0.73145205, "num_input_tokens_seen": 266300980, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 12346, "time_per_iteration": 2.518285036087036 }, { "auxiliary_loss_clip": 0.01114824, "auxiliary_loss_mlp": 0.01032066, "balance_loss_clip": 1.01953745, "balance_loss_mlp": 1.0358963, "epoch": 0.7423418006914174, "flos": 32015978876160.0, "grad_norm": 1.5590445293014192, "language_loss": 0.73349667, "learning_rate": 6.202563409583931e-07, "loss": 0.75496554, "num_input_tokens_seen": 266322215, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 12347, "time_per_iteration": 2.6017775535583496 }, { "auxiliary_loss_clip": 0.01126731, "auxiliary_loss_mlp": 0.01032409, "balance_loss_clip": 1.01962399, "balance_loss_mlp": 1.03566861, "epoch": 0.7424019239440853, "flos": 18916951818240.0, "grad_norm": 1.758420747684508, "language_loss": 0.80725831, "learning_rate": 6.199828480844558e-07, "loss": 0.82884979, "num_input_tokens_seen": 266341600, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73046875, "step": 12348, "time_per_iteration": 2.52729868888855 }, { "auxiliary_loss_clip": 0.011036, "auxiliary_loss_mlp": 0.01033251, "balance_loss_clip": 1.02099085, "balance_loss_mlp": 1.03491521, "epoch": 0.7424620471967533, "flos": 35876518871040.0, "grad_norm": 2.134146863091454, "language_loss": 0.72350919, "learning_rate": 6.197094044597814e-07, "loss": 0.74487764, "num_input_tokens_seen": 266362895, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 12349, "time_per_iteration": 2.647427797317505 }, { "auxiliary_loss_clip": 0.01124676, "auxiliary_loss_mlp": 0.01033583, "balance_loss_clip": 1.01909351, "balance_loss_mlp": 1.0335741, "epoch": 0.7425221704494213, "flos": 27016603200000.0, "grad_norm": 2.3272519992426326, "language_loss": 0.79348403, "learning_rate": 6.19436010094128e-07, "loss": 0.81506664, "num_input_tokens_seen": 266384015, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.734375, "step": 12350, "time_per_iteration": 2.5537784099578857 }, { "auxiliary_loss_clip": 0.01138997, "auxiliary_loss_mlp": 0.01033532, "balance_loss_clip": 1.02104449, "balance_loss_mlp": 1.03446102, "epoch": 0.7425822937020893, "flos": 34167135559680.0, "grad_norm": 2.8138136821584636, "language_loss": 0.74901474, "learning_rate": 6.191626649972521e-07, "loss": 0.77074003, "num_input_tokens_seen": 266405990, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12351, "time_per_iteration": 2.6949656009674072 }, { "auxiliary_loss_clip": 0.01147173, "auxiliary_loss_mlp": 0.01024414, "balance_loss_clip": 1.0121181, "balance_loss_mlp": 1.03464293, "epoch": 0.7426424169547573, "flos": 21283572234240.0, "grad_norm": 1.8479227502941773, "language_loss": 0.81487119, "learning_rate": 6.188893691789081e-07, "loss": 0.83658707, "num_input_tokens_seen": 266424260, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 12352, "time_per_iteration": 4.033181428909302 }, { "auxiliary_loss_clip": 0.01114755, "auxiliary_loss_mlp": 0.0103621, "balance_loss_clip": 1.02418733, "balance_loss_mlp": 1.03660786, "epoch": 0.7427025402074252, "flos": 22448442297600.0, "grad_norm": 1.7689958024674852, "language_loss": 0.71862274, "learning_rate": 6.186161226488511e-07, "loss": 0.74013245, "num_input_tokens_seen": 266444580, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 12353, "time_per_iteration": 2.5896120071411133 }, { "auxiliary_loss_clip": 0.01113981, "auxiliary_loss_mlp": 0.01032604, "balance_loss_clip": 1.02082586, "balance_loss_mlp": 1.03635323, "epoch": 0.7427626634600932, "flos": 22524609087360.0, "grad_norm": 1.8310989168993572, "language_loss": 0.72105581, "learning_rate": 6.183429254168302e-07, "loss": 0.74252164, "num_input_tokens_seen": 266465640, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 12354, "time_per_iteration": 2.536749839782715 }, { "auxiliary_loss_clip": 0.01034261, "auxiliary_loss_mlp": 0.0100261, "balance_loss_clip": 1.00117326, "balance_loss_mlp": 1.00374806, "epoch": 0.7428227867127611, "flos": 67209477655680.0, "grad_norm": 0.6950486469781187, "language_loss": 0.59589112, "learning_rate": 6.180697774925967e-07, "loss": 0.61625981, "num_input_tokens_seen": 266531950, "router_z_loss_clip": 0.01434326, "router_z_loss_mlp": 0.21679688, "step": 12355, "time_per_iteration": 3.255192279815674 }, { "auxiliary_loss_clip": 0.0110493, "auxiliary_loss_mlp": 0.01030673, "balance_loss_clip": 1.01781583, "balance_loss_mlp": 1.03446126, "epoch": 0.7428829099654292, "flos": 14721221082240.0, "grad_norm": 1.8589979339928608, "language_loss": 0.67381185, "learning_rate": 6.177966788858977e-07, "loss": 0.6951679, "num_input_tokens_seen": 266550665, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 12356, "time_per_iteration": 2.476846933364868 }, { "auxiliary_loss_clip": 0.01113792, "auxiliary_loss_mlp": 0.01280183, "balance_loss_clip": 1.02075005, "balance_loss_mlp": 1.03279197, "epoch": 0.7429430332180971, "flos": 48646496413440.0, "grad_norm": 1.7760508857853445, "language_loss": 0.71857476, "learning_rate": 6.175236296064807e-07, "loss": 0.74251449, "num_input_tokens_seen": 266572455, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 12357, "time_per_iteration": 2.8160884380340576 }, { "auxiliary_loss_clip": 0.01120189, "auxiliary_loss_mlp": 0.01030733, "balance_loss_clip": 1.01871061, "balance_loss_mlp": 1.03660429, "epoch": 0.7430031564707651, "flos": 16764071281920.0, "grad_norm": 1.856442675723748, "language_loss": 0.65410149, "learning_rate": 6.172506296640883e-07, "loss": 0.67561072, "num_input_tokens_seen": 266590895, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.65625, "step": 12358, "time_per_iteration": 2.5537867546081543 }, { "auxiliary_loss_clip": 0.01142313, "auxiliary_loss_mlp": 0.01033748, "balance_loss_clip": 1.01991415, "balance_loss_mlp": 1.03530526, "epoch": 0.743063279723433, "flos": 23870576545920.0, "grad_norm": 2.3489374840281725, "language_loss": 0.80719554, "learning_rate": 6.169776790684644e-07, "loss": 0.82895613, "num_input_tokens_seen": 266607660, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 12359, "time_per_iteration": 3.91042423248291 }, { "auxiliary_loss_clip": 0.01111197, "auxiliary_loss_mlp": 0.01030459, "balance_loss_clip": 1.01798332, "balance_loss_mlp": 1.03254175, "epoch": 0.743123402976101, "flos": 14391704689920.0, "grad_norm": 2.289548573801093, "language_loss": 0.68137306, "learning_rate": 6.167047778293497e-07, "loss": 0.7027896, "num_input_tokens_seen": 266624260, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 12360, "time_per_iteration": 2.4897871017456055 }, { "auxiliary_loss_clip": 0.01110801, "auxiliary_loss_mlp": 0.01275901, "balance_loss_clip": 1.01614964, "balance_loss_mlp": 1.03304887, "epoch": 0.7431835262287689, "flos": 27454318335360.0, "grad_norm": 1.8332585664687122, "language_loss": 0.727175, "learning_rate": 6.164319259564834e-07, "loss": 0.75104201, "num_input_tokens_seen": 266644210, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 12361, "time_per_iteration": 2.5472662448883057 }, { "auxiliary_loss_clip": 0.01141648, "auxiliary_loss_mlp": 0.01033914, "balance_loss_clip": 1.0212481, "balance_loss_mlp": 1.03514755, "epoch": 0.743243649481437, "flos": 20959514709120.0, "grad_norm": 2.220976920314029, "language_loss": 0.56105584, "learning_rate": 6.161591234596024e-07, "loss": 0.58281147, "num_input_tokens_seen": 266664230, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 12362, "time_per_iteration": 2.5529229640960693 }, { "auxiliary_loss_clip": 0.01132683, "auxiliary_loss_mlp": 0.01031425, "balance_loss_clip": 1.01905751, "balance_loss_mlp": 1.03693938, "epoch": 0.7433037727341049, "flos": 22783166161920.0, "grad_norm": 1.7316549021899812, "language_loss": 0.77296293, "learning_rate": 6.158863703484427e-07, "loss": 0.79460406, "num_input_tokens_seen": 266683270, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 12363, "time_per_iteration": 2.536158561706543 }, { "auxiliary_loss_clip": 0.01121463, "auxiliary_loss_mlp": 0.01033721, "balance_loss_clip": 1.02176476, "balance_loss_mlp": 1.03495669, "epoch": 0.7433638959867729, "flos": 22196708807040.0, "grad_norm": 1.7797106254362194, "language_loss": 0.77784348, "learning_rate": 6.156136666327383e-07, "loss": 0.79939532, "num_input_tokens_seen": 266701235, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 12364, "time_per_iteration": 2.6703476905822754 }, { "auxiliary_loss_clip": 0.01110854, "auxiliary_loss_mlp": 0.01032063, "balance_loss_clip": 1.01975477, "balance_loss_mlp": 1.0327487, "epoch": 0.7434240192394409, "flos": 23296760778240.0, "grad_norm": 1.5945581772254014, "language_loss": 0.78558224, "learning_rate": 6.153410123222202e-07, "loss": 0.80701143, "num_input_tokens_seen": 266721495, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 12365, "time_per_iteration": 4.082525014877319 }, { "auxiliary_loss_clip": 0.01125484, "auxiliary_loss_mlp": 0.01032352, "balance_loss_clip": 1.01926327, "balance_loss_mlp": 1.035743, "epoch": 0.7434841424921088, "flos": 54009575251200.0, "grad_norm": 2.686875610614869, "language_loss": 0.76814133, "learning_rate": 6.150684074266203e-07, "loss": 0.7897197, "num_input_tokens_seen": 266747400, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 12366, "time_per_iteration": 4.286290884017944 }, { "auxiliary_loss_clip": 0.01110075, "auxiliary_loss_mlp": 0.01027639, "balance_loss_clip": 1.01667738, "balance_loss_mlp": 1.03469133, "epoch": 0.7435442657447768, "flos": 21433966479360.0, "grad_norm": 1.4237897078428912, "language_loss": 0.71163881, "learning_rate": 6.147958519556664e-07, "loss": 0.73301595, "num_input_tokens_seen": 266767630, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6640625, "step": 12367, "time_per_iteration": 2.5538086891174316 }, { "auxiliary_loss_clip": 0.01133687, "auxiliary_loss_mlp": 0.01034675, "balance_loss_clip": 1.02222943, "balance_loss_mlp": 1.03521216, "epoch": 0.7436043889974447, "flos": 24499408970880.0, "grad_norm": 1.5525321461522097, "language_loss": 0.74423695, "learning_rate": 6.145233459190855e-07, "loss": 0.76592052, "num_input_tokens_seen": 266788015, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 12368, "time_per_iteration": 2.5836079120635986 }, { "auxiliary_loss_clip": 0.01110411, "auxiliary_loss_mlp": 0.01033756, "balance_loss_clip": 1.0228188, "balance_loss_mlp": 1.03511453, "epoch": 0.7436645122501128, "flos": 40698388512000.0, "grad_norm": 1.803418923248207, "language_loss": 0.69175136, "learning_rate": 6.142508893266019e-07, "loss": 0.71319294, "num_input_tokens_seen": 266809010, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6640625, "step": 12369, "time_per_iteration": 2.7241241931915283 }, { "auxiliary_loss_clip": 0.01121975, "auxiliary_loss_mlp": 0.01276367, "balance_loss_clip": 1.01716399, "balance_loss_mlp": 1.03503191, "epoch": 0.7437246355027807, "flos": 18908835344640.0, "grad_norm": 2.104092745088158, "language_loss": 0.75700915, "learning_rate": 6.139784821879406e-07, "loss": 0.78099251, "num_input_tokens_seen": 266825390, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 12370, "time_per_iteration": 2.5427119731903076 }, { "auxiliary_loss_clip": 0.01103406, "auxiliary_loss_mlp": 0.01032863, "balance_loss_clip": 1.02034593, "balance_loss_mlp": 1.03567243, "epoch": 0.7437847587554487, "flos": 21543817248000.0, "grad_norm": 1.377810414153789, "language_loss": 0.7816714, "learning_rate": 6.137061245128208e-07, "loss": 0.80303407, "num_input_tokens_seen": 266844675, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 12371, "time_per_iteration": 2.5764002799987793 }, { "auxiliary_loss_clip": 0.01122384, "auxiliary_loss_mlp": 0.01027424, "balance_loss_clip": 1.01496625, "balance_loss_mlp": 1.03534997, "epoch": 0.7438448820081166, "flos": 27782470010880.0, "grad_norm": 2.418848703205563, "language_loss": 0.6913178, "learning_rate": 6.13433816310964e-07, "loss": 0.71281588, "num_input_tokens_seen": 266865160, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 12372, "time_per_iteration": 2.6269690990448 }, { "auxiliary_loss_clip": 0.01120697, "auxiliary_loss_mlp": 0.0103163, "balance_loss_clip": 1.01936328, "balance_loss_mlp": 1.03466821, "epoch": 0.7439050052607846, "flos": 17967832796160.0, "grad_norm": 2.228058617975886, "language_loss": 0.75503445, "learning_rate": 6.131615575920879e-07, "loss": 0.77655774, "num_input_tokens_seen": 266883285, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 12373, "time_per_iteration": 2.585810899734497 }, { "auxiliary_loss_clip": 0.011413, "auxiliary_loss_mlp": 0.01035402, "balance_loss_clip": 1.02190185, "balance_loss_mlp": 1.03375435, "epoch": 0.7439651285134525, "flos": 22958696949120.0, "grad_norm": 1.849699474982119, "language_loss": 0.76977003, "learning_rate": 6.128893483659081e-07, "loss": 0.79153711, "num_input_tokens_seen": 266900960, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 12374, "time_per_iteration": 2.594045400619507 }, { "auxiliary_loss_clip": 0.01149587, "auxiliary_loss_mlp": 0.01033679, "balance_loss_clip": 1.0205301, "balance_loss_mlp": 1.03464246, "epoch": 0.7440252517661206, "flos": 18806777827200.0, "grad_norm": 2.272495906826284, "language_loss": 0.76856661, "learning_rate": 6.126171886421389e-07, "loss": 0.79039919, "num_input_tokens_seen": 266917710, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 12375, "time_per_iteration": 2.5506293773651123 }, { "auxiliary_loss_clip": 0.01109879, "auxiliary_loss_mlp": 0.01027487, "balance_loss_clip": 1.01616251, "balance_loss_mlp": 1.03370321, "epoch": 0.7440853750187885, "flos": 20266295155200.0, "grad_norm": 1.787336197696655, "language_loss": 0.77222717, "learning_rate": 6.123450784304942e-07, "loss": 0.7936008, "num_input_tokens_seen": 266934220, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 12376, "time_per_iteration": 2.5108373165130615 }, { "auxiliary_loss_clip": 0.0113185, "auxiliary_loss_mlp": 0.01028606, "balance_loss_clip": 1.01612449, "balance_loss_mlp": 1.03452778, "epoch": 0.7441454982714565, "flos": 25337276593920.0, "grad_norm": 1.6590600847973365, "language_loss": 0.79437941, "learning_rate": 6.120730177406848e-07, "loss": 0.81598401, "num_input_tokens_seen": 266955210, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 12377, "time_per_iteration": 2.5996503829956055 }, { "auxiliary_loss_clip": 0.01115108, "auxiliary_loss_mlp": 0.01028278, "balance_loss_clip": 1.01519465, "balance_loss_mlp": 1.03474379, "epoch": 0.7442056215241245, "flos": 64480910866560.0, "grad_norm": 1.8635424718884597, "language_loss": 0.67640525, "learning_rate": 6.118010065824177e-07, "loss": 0.69783914, "num_input_tokens_seen": 266976555, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12378, "time_per_iteration": 3.048156261444092 }, { "auxiliary_loss_clip": 0.01133995, "auxiliary_loss_mlp": 0.01036572, "balance_loss_clip": 1.02238631, "balance_loss_mlp": 1.03599501, "epoch": 0.7442657447767924, "flos": 31285376242560.0, "grad_norm": 1.667361074900578, "language_loss": 0.71346164, "learning_rate": 6.115290449654027e-07, "loss": 0.73516738, "num_input_tokens_seen": 266997640, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.71484375, "step": 12379, "time_per_iteration": 2.6430795192718506 }, { "auxiliary_loss_clip": 0.01130168, "auxiliary_loss_mlp": 0.01283807, "balance_loss_clip": 1.02397144, "balance_loss_mlp": 1.03391981, "epoch": 0.7443258680294604, "flos": 20807899401600.0, "grad_norm": 2.0163424359096935, "language_loss": 0.65155065, "learning_rate": 6.112571328993443e-07, "loss": 0.67569035, "num_input_tokens_seen": 267016165, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 12380, "time_per_iteration": 2.636235237121582 }, { "auxiliary_loss_clip": 0.0111536, "auxiliary_loss_mlp": 0.01030541, "balance_loss_clip": 1.01813745, "balance_loss_mlp": 1.03576374, "epoch": 0.7443859912821283, "flos": 22199833290240.0, "grad_norm": 1.8915511772118734, "language_loss": 0.78738564, "learning_rate": 6.109852703939466e-07, "loss": 0.80884463, "num_input_tokens_seen": 267034075, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 12381, "time_per_iteration": 2.5876543521881104 }, { "auxiliary_loss_clip": 0.01135433, "auxiliary_loss_mlp": 0.0103432, "balance_loss_clip": 1.02099836, "balance_loss_mlp": 1.0369668, "epoch": 0.7444461145347964, "flos": 22017838055040.0, "grad_norm": 2.93689125850084, "language_loss": 0.7285459, "learning_rate": 6.107134574589111e-07, "loss": 0.75024343, "num_input_tokens_seen": 267053645, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 12382, "time_per_iteration": 2.627506732940674 }, { "auxiliary_loss_clip": 0.01051455, "auxiliary_loss_mlp": 0.01004388, "balance_loss_clip": 1.00284982, "balance_loss_mlp": 1.00350809, "epoch": 0.7445062377874643, "flos": 70559047704960.0, "grad_norm": 0.6740077501251022, "language_loss": 0.54665661, "learning_rate": 6.104416941039392e-07, "loss": 0.56721497, "num_input_tokens_seen": 267121830, "router_z_loss_clip": 0.01538086, "router_z_loss_mlp": 0.21679688, "step": 12383, "time_per_iteration": 3.3995466232299805 }, { "auxiliary_loss_clip": 0.01122576, "auxiliary_loss_mlp": 0.01030312, "balance_loss_clip": 1.01783133, "balance_loss_mlp": 1.03346455, "epoch": 0.7445663610401323, "flos": 22164425458560.0, "grad_norm": 2.0460278028701047, "language_loss": 0.7600168, "learning_rate": 6.101699803387288e-07, "loss": 0.7815457, "num_input_tokens_seen": 267141145, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 12384, "time_per_iteration": 2.60184907913208 }, { "auxiliary_loss_clip": 0.01111841, "auxiliary_loss_mlp": 0.01031205, "balance_loss_clip": 1.01833677, "balance_loss_mlp": 1.0335753, "epoch": 0.7446264842928002, "flos": 24170251714560.0, "grad_norm": 2.0280946785917444, "language_loss": 0.79457462, "learning_rate": 6.098983161729769e-07, "loss": 0.81600511, "num_input_tokens_seen": 267159280, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 12385, "time_per_iteration": 2.6055378913879395 }, { "auxiliary_loss_clip": 0.0113547, "auxiliary_loss_mlp": 0.01036941, "balance_loss_clip": 1.02261841, "balance_loss_mlp": 1.03619003, "epoch": 0.7446866075454682, "flos": 24134556574080.0, "grad_norm": 1.7313917105928398, "language_loss": 0.81686115, "learning_rate": 6.096267016163777e-07, "loss": 0.83858526, "num_input_tokens_seen": 267179390, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7265625, "step": 12386, "time_per_iteration": 2.59184193611145 }, { "auxiliary_loss_clip": 0.01112223, "auxiliary_loss_mlp": 0.01031471, "balance_loss_clip": 1.01921046, "balance_loss_mlp": 1.03589678, "epoch": 0.7447467307981361, "flos": 23548063305600.0, "grad_norm": 1.6913472865673602, "language_loss": 0.7099604, "learning_rate": 6.09355136678626e-07, "loss": 0.73139733, "num_input_tokens_seen": 267198165, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.671875, "step": 12387, "time_per_iteration": 2.574881076812744 }, { "auxiliary_loss_clip": 0.01131244, "auxiliary_loss_mlp": 0.01030421, "balance_loss_clip": 1.01824367, "balance_loss_mlp": 1.03518665, "epoch": 0.7448068540508042, "flos": 19567832215680.0, "grad_norm": 1.9918675906037557, "language_loss": 0.70142829, "learning_rate": 6.090836213694115e-07, "loss": 0.72304493, "num_input_tokens_seen": 267214520, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 12388, "time_per_iteration": 2.515174627304077 }, { "auxiliary_loss_clip": 0.01112538, "auxiliary_loss_mlp": 0.01031661, "balance_loss_clip": 1.01932907, "balance_loss_mlp": 1.03456521, "epoch": 0.7448669773034721, "flos": 21839721488640.0, "grad_norm": 1.9490982836502477, "language_loss": 0.85171658, "learning_rate": 6.088121556984249e-07, "loss": 0.87315857, "num_input_tokens_seen": 267236555, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 12389, "time_per_iteration": 2.5980985164642334 }, { "auxiliary_loss_clip": 0.01138632, "auxiliary_loss_mlp": 0.01036488, "balance_loss_clip": 1.02407813, "balance_loss_mlp": 1.03338051, "epoch": 0.7449271005561401, "flos": 25155389099520.0, "grad_norm": 1.9721881763507443, "language_loss": 0.79376471, "learning_rate": 6.085407396753541e-07, "loss": 0.81551588, "num_input_tokens_seen": 267254800, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 12390, "time_per_iteration": 2.553408145904541 }, { "auxiliary_loss_clip": 0.01125734, "auxiliary_loss_mlp": 0.0103204, "balance_loss_clip": 1.01870012, "balance_loss_mlp": 1.03723443, "epoch": 0.7449872238088081, "flos": 22273342473600.0, "grad_norm": 1.8881775966151835, "language_loss": 0.84867448, "learning_rate": 6.082693733098851e-07, "loss": 0.87025219, "num_input_tokens_seen": 267274610, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.70703125, "step": 12391, "time_per_iteration": 2.517456531524658 }, { "auxiliary_loss_clip": 0.01104324, "auxiliary_loss_mlp": 0.01029351, "balance_loss_clip": 1.0172987, "balance_loss_mlp": 1.03502357, "epoch": 0.745047347061476, "flos": 20594805966720.0, "grad_norm": 1.66052817453698, "language_loss": 0.73481113, "learning_rate": 6.079980566117022e-07, "loss": 0.75614786, "num_input_tokens_seen": 267292600, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6953125, "step": 12392, "time_per_iteration": 2.4571938514709473 }, { "auxiliary_loss_clip": 0.01109446, "auxiliary_loss_mlp": 0.01037106, "balance_loss_clip": 1.02548337, "balance_loss_mlp": 1.03373003, "epoch": 0.745107470314144, "flos": 22127545169280.0, "grad_norm": 1.8345645332083709, "language_loss": 0.76822591, "learning_rate": 6.077267895904872e-07, "loss": 0.78969145, "num_input_tokens_seen": 267311295, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 12393, "time_per_iteration": 2.492281436920166 }, { "auxiliary_loss_clip": 0.01115281, "auxiliary_loss_mlp": 0.01036784, "balance_loss_clip": 1.02302718, "balance_loss_mlp": 1.03722143, "epoch": 0.745167593566812, "flos": 22236498097920.0, "grad_norm": 2.0087158401767846, "language_loss": 0.72239804, "learning_rate": 6.074555722559232e-07, "loss": 0.74391866, "num_input_tokens_seen": 267328390, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.6953125, "step": 12394, "time_per_iteration": 3.935915946960449 }, { "auxiliary_loss_clip": 0.01151255, "auxiliary_loss_mlp": 0.0127866, "balance_loss_clip": 1.01875138, "balance_loss_mlp": 1.03618956, "epoch": 0.74522771681948, "flos": 20666232161280.0, "grad_norm": 3.8464707502336224, "language_loss": 0.81541336, "learning_rate": 6.071844046176863e-07, "loss": 0.83971244, "num_input_tokens_seen": 267348185, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12395, "time_per_iteration": 2.560309648513794 }, { "auxiliary_loss_clip": 0.01118082, "auxiliary_loss_mlp": 0.01039239, "balance_loss_clip": 1.02620339, "balance_loss_mlp": 1.03690183, "epoch": 0.7452878400721479, "flos": 21106999952640.0, "grad_norm": 2.548272906387803, "language_loss": 0.71362221, "learning_rate": 6.069132866854561e-07, "loss": 0.7351954, "num_input_tokens_seen": 267367010, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 12396, "time_per_iteration": 2.5315356254577637 }, { "auxiliary_loss_clip": 0.01142692, "auxiliary_loss_mlp": 0.01032931, "balance_loss_clip": 1.02007413, "balance_loss_mlp": 1.03578877, "epoch": 0.7453479633248159, "flos": 26688056474880.0, "grad_norm": 1.9013946970840907, "language_loss": 0.68293232, "learning_rate": 6.06642218468907e-07, "loss": 0.70468855, "num_input_tokens_seen": 267386605, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 12397, "time_per_iteration": 2.640815019607544 }, { "auxiliary_loss_clip": 0.01103897, "auxiliary_loss_mlp": 0.01040658, "balance_loss_clip": 1.02764022, "balance_loss_mlp": 1.03517056, "epoch": 0.7454080865774838, "flos": 17016056167680.0, "grad_norm": 1.7295127619164, "language_loss": 0.76719713, "learning_rate": 6.063711999777132e-07, "loss": 0.7886427, "num_input_tokens_seen": 267404135, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 12398, "time_per_iteration": 2.481516122817993 }, { "auxiliary_loss_clip": 0.01111088, "auxiliary_loss_mlp": 0.01029033, "balance_loss_clip": 1.01640248, "balance_loss_mlp": 1.0346005, "epoch": 0.7454682098301518, "flos": 21323900229120.0, "grad_norm": 1.6889418489172514, "language_loss": 0.77932549, "learning_rate": 6.061002312215457e-07, "loss": 0.80072671, "num_input_tokens_seen": 267423120, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 12399, "time_per_iteration": 2.550330638885498 }, { "auxiliary_loss_clip": 0.01118015, "auxiliary_loss_mlp": 0.01035574, "balance_loss_clip": 1.02413011, "balance_loss_mlp": 1.03416538, "epoch": 0.7455283330828197, "flos": 17858341163520.0, "grad_norm": 2.2593682098060612, "language_loss": 0.73723364, "learning_rate": 6.058293122100761e-07, "loss": 0.75876951, "num_input_tokens_seen": 267441250, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66015625, "step": 12400, "time_per_iteration": 3.985426664352417 }, { "auxiliary_loss_clip": 0.01110308, "auxiliary_loss_mlp": 0.01033031, "balance_loss_clip": 1.0208478, "balance_loss_mlp": 1.03296638, "epoch": 0.7455884563354878, "flos": 30774259664640.0, "grad_norm": 1.8272615871967808, "language_loss": 0.81923687, "learning_rate": 6.055584429529721e-07, "loss": 0.84067023, "num_input_tokens_seen": 267462820, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 12401, "time_per_iteration": 2.566725015640259 }, { "auxiliary_loss_clip": 0.01130489, "auxiliary_loss_mlp": 0.01034417, "balance_loss_clip": 1.02255607, "balance_loss_mlp": 1.03566909, "epoch": 0.7456485795881557, "flos": 23185545292800.0, "grad_norm": 1.9080062956295893, "language_loss": 0.65134346, "learning_rate": 6.052876234599003e-07, "loss": 0.67299259, "num_input_tokens_seen": 267483065, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.68359375, "step": 12402, "time_per_iteration": 2.5670344829559326 }, { "auxiliary_loss_clip": 0.01129346, "auxiliary_loss_mlp": 0.01031201, "balance_loss_clip": 1.01974463, "balance_loss_mlp": 1.03412914, "epoch": 0.7457087028408237, "flos": 38727144074880.0, "grad_norm": 2.0552710820765054, "language_loss": 0.73109353, "learning_rate": 6.050168537405249e-07, "loss": 0.75269896, "num_input_tokens_seen": 267504825, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 12403, "time_per_iteration": 2.666877508163452 }, { "auxiliary_loss_clip": 0.01050842, "auxiliary_loss_mlp": 0.00998256, "balance_loss_clip": 0.99680787, "balance_loss_mlp": 1.00353813, "epoch": 0.7457688260934917, "flos": 56043737337600.0, "grad_norm": 0.8356489067315386, "language_loss": 0.58894205, "learning_rate": 6.04746133804511e-07, "loss": 0.60943305, "num_input_tokens_seen": 267559260, "router_z_loss_clip": 0.01446533, "router_z_loss_mlp": 0.21582031, "step": 12404, "time_per_iteration": 2.97414493560791 }, { "auxiliary_loss_clip": 0.01103085, "auxiliary_loss_mlp": 0.01035763, "balance_loss_clip": 1.02358603, "balance_loss_mlp": 1.03374314, "epoch": 0.7458289493461596, "flos": 20116152305280.0, "grad_norm": 1.4556172380900865, "language_loss": 0.77845764, "learning_rate": 6.044754636615172e-07, "loss": 0.79984617, "num_input_tokens_seen": 267578720, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 12405, "time_per_iteration": 2.5517942905426025 }, { "auxiliary_loss_clip": 0.01120526, "auxiliary_loss_mlp": 0.01036064, "balance_loss_clip": 1.02360106, "balance_loss_mlp": 1.0350492, "epoch": 0.7458890725988276, "flos": 20193073280640.0, "grad_norm": 1.761489662216733, "language_loss": 0.69123244, "learning_rate": 6.042048433212052e-07, "loss": 0.71279842, "num_input_tokens_seen": 267598250, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.67578125, "step": 12406, "time_per_iteration": 2.5136358737945557 }, { "auxiliary_loss_clip": 0.01163287, "auxiliary_loss_mlp": 0.01035286, "balance_loss_clip": 1.02160072, "balance_loss_mlp": 1.03612208, "epoch": 0.7459491958514956, "flos": 17018749687680.0, "grad_norm": 1.6218641067209518, "language_loss": 0.64957082, "learning_rate": 6.039342727932319e-07, "loss": 0.67155653, "num_input_tokens_seen": 267615430, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 12407, "time_per_iteration": 5.424073934555054 }, { "auxiliary_loss_clip": 0.01132607, "auxiliary_loss_mlp": 0.01031323, "balance_loss_clip": 1.01769161, "balance_loss_mlp": 1.03406811, "epoch": 0.7460093191041636, "flos": 25078719519360.0, "grad_norm": 1.8192918631921642, "language_loss": 0.71914959, "learning_rate": 6.036637520872531e-07, "loss": 0.74078894, "num_input_tokens_seen": 267635075, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 12408, "time_per_iteration": 2.6234681606292725 }, { "auxiliary_loss_clip": 0.01117247, "auxiliary_loss_mlp": 0.01034065, "balance_loss_clip": 1.02055895, "balance_loss_mlp": 1.03633738, "epoch": 0.7460694423568315, "flos": 21908525990400.0, "grad_norm": 1.7424504300883412, "language_loss": 0.72245193, "learning_rate": 6.033932812129234e-07, "loss": 0.74396503, "num_input_tokens_seen": 267654105, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 12409, "time_per_iteration": 2.4935920238494873 }, { "auxiliary_loss_clip": 0.01109921, "auxiliary_loss_mlp": 0.01034552, "balance_loss_clip": 1.0232029, "balance_loss_mlp": 1.03465509, "epoch": 0.7461295656094995, "flos": 21215737399680.0, "grad_norm": 1.6244812532545174, "language_loss": 0.66126454, "learning_rate": 6.031228601798944e-07, "loss": 0.68270922, "num_input_tokens_seen": 267673090, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6640625, "step": 12410, "time_per_iteration": 2.5464789867401123 }, { "auxiliary_loss_clip": 0.0111123, "auxiliary_loss_mlp": 0.01280695, "balance_loss_clip": 1.02090693, "balance_loss_mlp": 1.03424585, "epoch": 0.7461896888621674, "flos": 22346851656960.0, "grad_norm": 2.4802901058567706, "language_loss": 0.84510022, "learning_rate": 6.028524889978184e-07, "loss": 0.86901951, "num_input_tokens_seen": 267690605, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 12411, "time_per_iteration": 2.513075828552246 }, { "auxiliary_loss_clip": 0.01133887, "auxiliary_loss_mlp": 0.0102918, "balance_loss_clip": 1.01632953, "balance_loss_mlp": 1.03586006, "epoch": 0.7462498121148354, "flos": 25482930243840.0, "grad_norm": 1.5120432179911958, "language_loss": 0.78154361, "learning_rate": 6.025821676763421e-07, "loss": 0.80317426, "num_input_tokens_seen": 267710540, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 12412, "time_per_iteration": 2.5615015029907227 }, { "auxiliary_loss_clip": 0.01132671, "auxiliary_loss_mlp": 0.01037673, "balance_loss_clip": 1.02571654, "balance_loss_mlp": 1.03500724, "epoch": 0.7463099353675033, "flos": 33947936812800.0, "grad_norm": 1.851467543182368, "language_loss": 0.62358403, "learning_rate": 6.023118962251141e-07, "loss": 0.64528745, "num_input_tokens_seen": 267730780, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 12413, "time_per_iteration": 2.6090705394744873 }, { "auxiliary_loss_clip": 0.01036046, "auxiliary_loss_mlp": 0.01249094, "balance_loss_clip": 1.00168502, "balance_loss_mlp": 1.00494814, "epoch": 0.7463700586201714, "flos": 62767723691520.0, "grad_norm": 0.715593159138689, "language_loss": 0.54918152, "learning_rate": 6.020416746537793e-07, "loss": 0.57203293, "num_input_tokens_seen": 267794240, "router_z_loss_clip": 0.01446533, "router_z_loss_mlp": 0.21875, "step": 12414, "time_per_iteration": 3.1815669536590576 }, { "auxiliary_loss_clip": 0.01121651, "auxiliary_loss_mlp": 0.01032037, "balance_loss_clip": 1.01905537, "balance_loss_mlp": 1.03332925, "epoch": 0.7464301818728393, "flos": 33432654257280.0, "grad_norm": 1.5354587836581073, "language_loss": 0.54926372, "learning_rate": 6.01771502971981e-07, "loss": 0.5708006, "num_input_tokens_seen": 267817190, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 12415, "time_per_iteration": 2.6180293560028076 }, { "auxiliary_loss_clip": 0.01149061, "auxiliary_loss_mlp": 0.01034101, "balance_loss_clip": 1.02103531, "balance_loss_mlp": 1.03509092, "epoch": 0.7464903051255073, "flos": 26869872142080.0, "grad_norm": 1.5447496107178271, "language_loss": 0.74985957, "learning_rate": 6.015013811893608e-07, "loss": 0.7716912, "num_input_tokens_seen": 267836245, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 12416, "time_per_iteration": 2.653242349624634 }, { "auxiliary_loss_clip": 0.01140054, "auxiliary_loss_mlp": 0.01037888, "balance_loss_clip": 1.02534699, "balance_loss_mlp": 1.0353632, "epoch": 0.7465504283781753, "flos": 44086954775040.0, "grad_norm": 1.5440287611305756, "language_loss": 0.69494641, "learning_rate": 6.012313093155598e-07, "loss": 0.71672583, "num_input_tokens_seen": 267858310, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 12417, "time_per_iteration": 2.794734239578247 }, { "auxiliary_loss_clip": 0.01133787, "auxiliary_loss_mlp": 0.0103306, "balance_loss_clip": 1.01994658, "balance_loss_mlp": 1.03663778, "epoch": 0.7466105516308432, "flos": 19676102785920.0, "grad_norm": 1.7505942051327108, "language_loss": 0.73749608, "learning_rate": 6.009612873602143e-07, "loss": 0.75916451, "num_input_tokens_seen": 267876345, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12418, "time_per_iteration": 2.5263419151306152 }, { "auxiliary_loss_clip": 0.01113, "auxiliary_loss_mlp": 0.01029248, "balance_loss_clip": 1.01699972, "balance_loss_mlp": 1.03480411, "epoch": 0.7466706748835112, "flos": 20520722165760.0, "grad_norm": 2.516562646337158, "language_loss": 0.68898463, "learning_rate": 6.006913153329623e-07, "loss": 0.7104072, "num_input_tokens_seen": 267896740, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 12419, "time_per_iteration": 2.607264995574951 }, { "auxiliary_loss_clip": 0.01099062, "auxiliary_loss_mlp": 0.0103454, "balance_loss_clip": 1.023036, "balance_loss_mlp": 1.03376484, "epoch": 0.7467307981361792, "flos": 21690260997120.0, "grad_norm": 2.0193725902552497, "language_loss": 0.74520874, "learning_rate": 6.004213932434373e-07, "loss": 0.76654476, "num_input_tokens_seen": 267914765, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.65234375, "step": 12420, "time_per_iteration": 2.5062851905822754 }, { "auxiliary_loss_clip": 0.01117314, "auxiliary_loss_mlp": 0.01029433, "balance_loss_clip": 1.01641512, "balance_loss_mlp": 1.03803992, "epoch": 0.7467909213888472, "flos": 19573686132480.0, "grad_norm": 1.8137814535951988, "language_loss": 0.67323089, "learning_rate": 6.001515211012736e-07, "loss": 0.69469833, "num_input_tokens_seen": 267934085, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 12421, "time_per_iteration": 2.487535238265991 }, { "auxiliary_loss_clip": 0.01126428, "auxiliary_loss_mlp": 0.01031985, "balance_loss_clip": 1.01804924, "balance_loss_mlp": 1.03598857, "epoch": 0.7468510446415151, "flos": 23695225326720.0, "grad_norm": 1.857482733414429, "language_loss": 0.72692859, "learning_rate": 5.998816989161008e-07, "loss": 0.74851274, "num_input_tokens_seen": 267955170, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7265625, "step": 12422, "time_per_iteration": 2.56063175201416 }, { "auxiliary_loss_clip": 0.01130665, "auxiliary_loss_mlp": 0.01030926, "balance_loss_clip": 1.01825404, "balance_loss_mlp": 1.03586483, "epoch": 0.7469111678941831, "flos": 29315783831040.0, "grad_norm": 1.9806182851006422, "language_loss": 0.74632913, "learning_rate": 5.996119266975479e-07, "loss": 0.76794505, "num_input_tokens_seen": 267974980, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 12423, "time_per_iteration": 2.6307387351989746 }, { "auxiliary_loss_clip": 0.01109635, "auxiliary_loss_mlp": 0.01028332, "balance_loss_clip": 1.01674509, "balance_loss_mlp": 1.03261054, "epoch": 0.746971291146851, "flos": 21798639308160.0, "grad_norm": 1.519193768145952, "language_loss": 0.67694795, "learning_rate": 5.993422044552445e-07, "loss": 0.6983276, "num_input_tokens_seen": 267994985, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 12424, "time_per_iteration": 2.5684759616851807 }, { "auxiliary_loss_clip": 0.01117266, "auxiliary_loss_mlp": 0.0103897, "balance_loss_clip": 1.02613759, "balance_loss_mlp": 1.03656149, "epoch": 0.747031414399519, "flos": 36245070368640.0, "grad_norm": 3.775165616734161, "language_loss": 0.74403381, "learning_rate": 5.990725321988137e-07, "loss": 0.76559621, "num_input_tokens_seen": 268014985, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 12425, "time_per_iteration": 2.6920878887176514 }, { "auxiliary_loss_clip": 0.01130586, "auxiliary_loss_mlp": 0.01029204, "balance_loss_clip": 1.01691341, "balance_loss_mlp": 1.03458977, "epoch": 0.7470915376521869, "flos": 19974916028160.0, "grad_norm": 2.078652151146108, "language_loss": 0.69352376, "learning_rate": 5.988029099378811e-07, "loss": 0.71512163, "num_input_tokens_seen": 268034395, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 12426, "time_per_iteration": 2.5931661128997803 }, { "auxiliary_loss_clip": 0.01127317, "auxiliary_loss_mlp": 0.01037193, "balance_loss_clip": 1.02475917, "balance_loss_mlp": 1.03458393, "epoch": 0.747151660904855, "flos": 20084299920000.0, "grad_norm": 1.4297559411040264, "language_loss": 0.65618777, "learning_rate": 5.985333376820679e-07, "loss": 0.67783284, "num_input_tokens_seen": 268054485, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6640625, "step": 12427, "time_per_iteration": 2.5585453510284424 }, { "auxiliary_loss_clip": 0.01105228, "auxiliary_loss_mlp": 0.01030267, "balance_loss_clip": 1.01704705, "balance_loss_mlp": 1.03517544, "epoch": 0.7472117841575229, "flos": 16290373697280.0, "grad_norm": 1.503095426942143, "language_loss": 0.74402028, "learning_rate": 5.982638154409958e-07, "loss": 0.76537526, "num_input_tokens_seen": 268072250, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69921875, "step": 12428, "time_per_iteration": 2.461993932723999 }, { "auxiliary_loss_clip": 0.01117033, "auxiliary_loss_mlp": 0.01031876, "balance_loss_clip": 1.01790488, "balance_loss_mlp": 1.0352428, "epoch": 0.7472719074101909, "flos": 21389939383680.0, "grad_norm": 2.4763061440660117, "language_loss": 0.58056962, "learning_rate": 5.979943432242814e-07, "loss": 0.60205865, "num_input_tokens_seen": 268089840, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73046875, "step": 12429, "time_per_iteration": 2.564387798309326 }, { "auxiliary_loss_clip": 0.0111684, "auxiliary_loss_mlp": 0.01030435, "balance_loss_clip": 1.01714945, "balance_loss_mlp": 1.03567076, "epoch": 0.7473320306628589, "flos": 29643289061760.0, "grad_norm": 1.7270350983460045, "language_loss": 0.60530758, "learning_rate": 5.977249210415429e-07, "loss": 0.62678027, "num_input_tokens_seen": 268109360, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 12430, "time_per_iteration": 2.57480788230896 }, { "auxiliary_loss_clip": 0.01122108, "auxiliary_loss_mlp": 0.01031496, "balance_loss_clip": 1.01903844, "balance_loss_mlp": 1.03567171, "epoch": 0.7473921539155268, "flos": 24136100858880.0, "grad_norm": 1.425312125813824, "language_loss": 0.75548565, "learning_rate": 5.974555489023951e-07, "loss": 0.77702177, "num_input_tokens_seen": 268131840, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 12431, "time_per_iteration": 2.5757856369018555 }, { "auxiliary_loss_clip": 0.01113521, "auxiliary_loss_mlp": 0.01031694, "balance_loss_clip": 1.01917696, "balance_loss_mlp": 1.03474951, "epoch": 0.7474522771681948, "flos": 17487958072320.0, "grad_norm": 1.868874530091181, "language_loss": 0.75525641, "learning_rate": 5.971862268164511e-07, "loss": 0.7767086, "num_input_tokens_seen": 268148300, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 12432, "time_per_iteration": 2.4842371940612793 }, { "auxiliary_loss_clip": 0.01127231, "auxiliary_loss_mlp": 0.01031389, "balance_loss_clip": 1.01733983, "balance_loss_mlp": 1.03543186, "epoch": 0.7475124004208628, "flos": 16727298733440.0, "grad_norm": 5.635402290374739, "language_loss": 0.70339185, "learning_rate": 5.969169547933213e-07, "loss": 0.72497803, "num_input_tokens_seen": 268166450, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73828125, "step": 12433, "time_per_iteration": 2.4972076416015625 }, { "auxiliary_loss_clip": 0.01140245, "auxiliary_loss_mlp": 0.01032714, "balance_loss_clip": 1.01908255, "balance_loss_mlp": 1.03446841, "epoch": 0.7475725236735308, "flos": 19720237622400.0, "grad_norm": 1.697707198945631, "language_loss": 0.66978359, "learning_rate": 5.966477328426176e-07, "loss": 0.69151318, "num_input_tokens_seen": 268186165, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 12434, "time_per_iteration": 2.5767459869384766 }, { "auxiliary_loss_clip": 0.0110082, "auxiliary_loss_mlp": 0.01026607, "balance_loss_clip": 1.01522279, "balance_loss_mlp": 1.0360117, "epoch": 0.7476326469261987, "flos": 26286000566400.0, "grad_norm": 1.4937845007974588, "language_loss": 0.79582769, "learning_rate": 5.963785609739453e-07, "loss": 0.81710196, "num_input_tokens_seen": 268208145, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6484375, "step": 12435, "time_per_iteration": 3.945082187652588 }, { "auxiliary_loss_clip": 0.01133325, "auxiliary_loss_mlp": 0.01029807, "balance_loss_clip": 1.01712298, "balance_loss_mlp": 1.03498185, "epoch": 0.7476927701788667, "flos": 31831828824960.0, "grad_norm": 1.8898633816428505, "language_loss": 0.67621386, "learning_rate": 5.961094391969121e-07, "loss": 0.69784516, "num_input_tokens_seen": 268228345, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 12436, "time_per_iteration": 2.6543688774108887 }, { "auxiliary_loss_clip": 0.01135079, "auxiliary_loss_mlp": 0.01032414, "balance_loss_clip": 1.01983702, "balance_loss_mlp": 1.03565013, "epoch": 0.7477528934315346, "flos": 31795487239680.0, "grad_norm": 1.6865321979700394, "language_loss": 0.70848429, "learning_rate": 5.958403675211219e-07, "loss": 0.73015922, "num_input_tokens_seen": 268250260, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 12437, "time_per_iteration": 2.6449999809265137 }, { "auxiliary_loss_clip": 0.01124868, "auxiliary_loss_mlp": 0.01026472, "balance_loss_clip": 1.01507556, "balance_loss_mlp": 1.03286266, "epoch": 0.7478130166842026, "flos": 20371979946240.0, "grad_norm": 2.464395624534378, "language_loss": 0.67300439, "learning_rate": 5.955713459561768e-07, "loss": 0.69451785, "num_input_tokens_seen": 268268440, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.65234375, "step": 12438, "time_per_iteration": 2.6014244556427 }, { "auxiliary_loss_clip": 0.01123019, "auxiliary_loss_mlp": 0.01030037, "balance_loss_clip": 1.01787186, "balance_loss_mlp": 1.03573704, "epoch": 0.7478731399368705, "flos": 18148930191360.0, "grad_norm": 1.8283009082066646, "language_loss": 0.80581212, "learning_rate": 5.953023745116781e-07, "loss": 0.82734275, "num_input_tokens_seen": 268285765, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 12439, "time_per_iteration": 2.5427184104919434 }, { "auxiliary_loss_clip": 0.01128286, "auxiliary_loss_mlp": 0.01037787, "balance_loss_clip": 1.02585471, "balance_loss_mlp": 1.03372681, "epoch": 0.7479332631895386, "flos": 15267889146240.0, "grad_norm": 2.015141593386828, "language_loss": 0.70914626, "learning_rate": 5.950334531972234e-07, "loss": 0.73080701, "num_input_tokens_seen": 268304015, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 12440, "time_per_iteration": 2.5524253845214844 }, { "auxiliary_loss_clip": 0.01121107, "auxiliary_loss_mlp": 0.01028386, "balance_loss_clip": 1.01673341, "balance_loss_mlp": 1.03689742, "epoch": 0.7479933864422065, "flos": 21142515525120.0, "grad_norm": 1.6805947735892053, "language_loss": 0.74109101, "learning_rate": 5.947645820224123e-07, "loss": 0.762586, "num_input_tokens_seen": 268323290, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.66796875, "step": 12441, "time_per_iteration": 4.025407314300537 }, { "auxiliary_loss_clip": 0.0111605, "auxiliary_loss_mlp": 0.01281763, "balance_loss_clip": 1.0212431, "balance_loss_mlp": 1.03547645, "epoch": 0.7480535096948745, "flos": 14392027912320.0, "grad_norm": 1.9933079308673678, "language_loss": 0.82389349, "learning_rate": 5.94495760996837e-07, "loss": 0.84787166, "num_input_tokens_seen": 268339490, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 12442, "time_per_iteration": 2.5534656047821045 }, { "auxiliary_loss_clip": 0.01151688, "auxiliary_loss_mlp": 0.01032735, "balance_loss_clip": 1.01963425, "balance_loss_mlp": 1.03603888, "epoch": 0.7481136329475425, "flos": 27344683048320.0, "grad_norm": 2.7924003481845694, "language_loss": 0.62760782, "learning_rate": 5.942269901300934e-07, "loss": 0.64945203, "num_input_tokens_seen": 268359865, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 12443, "time_per_iteration": 2.6015048027038574 }, { "auxiliary_loss_clip": 0.01128928, "auxiliary_loss_mlp": 0.01029869, "balance_loss_clip": 1.01824605, "balance_loss_mlp": 1.03436828, "epoch": 0.7481737562002104, "flos": 19531454716800.0, "grad_norm": 1.7441568449689253, "language_loss": 0.71469855, "learning_rate": 5.939582694317717e-07, "loss": 0.73628652, "num_input_tokens_seen": 268377065, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 12444, "time_per_iteration": 2.528578281402588 }, { "auxiliary_loss_clip": 0.01140132, "auxiliary_loss_mlp": 0.01029429, "balance_loss_clip": 1.01681709, "balance_loss_mlp": 1.03518963, "epoch": 0.7482338794528784, "flos": 21760035166080.0, "grad_norm": 1.3816300263093968, "language_loss": 0.68959928, "learning_rate": 5.936895989114641e-07, "loss": 0.71129489, "num_input_tokens_seen": 268396935, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 12445, "time_per_iteration": 2.542057752609253 }, { "auxiliary_loss_clip": 0.01110974, "auxiliary_loss_mlp": 0.01024286, "balance_loss_clip": 1.01252639, "balance_loss_mlp": 1.03445697, "epoch": 0.7482940027055464, "flos": 18697358021760.0, "grad_norm": 1.562359177199196, "language_loss": 0.73884153, "learning_rate": 5.934209785787559e-07, "loss": 0.76019412, "num_input_tokens_seen": 268414460, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.67578125, "step": 12446, "time_per_iteration": 2.4684362411499023 }, { "auxiliary_loss_clip": 0.01126124, "auxiliary_loss_mlp": 0.0128253, "balance_loss_clip": 1.02176785, "balance_loss_mlp": 1.0360055, "epoch": 0.7483541259582144, "flos": 15998024903040.0, "grad_norm": 2.4598683639767196, "language_loss": 0.73024583, "learning_rate": 5.931524084432353e-07, "loss": 0.75433242, "num_input_tokens_seen": 268432225, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 12447, "time_per_iteration": 2.506983757019043 }, { "auxiliary_loss_clip": 0.0113711, "auxiliary_loss_mlp": 0.01029529, "balance_loss_clip": 1.01757824, "balance_loss_mlp": 1.03277993, "epoch": 0.7484142492108823, "flos": 25556295772800.0, "grad_norm": 2.519623633771113, "language_loss": 0.71961427, "learning_rate": 5.928838885144864e-07, "loss": 0.74128067, "num_input_tokens_seen": 268449270, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 12448, "time_per_iteration": 4.083613395690918 }, { "auxiliary_loss_clip": 0.0110588, "auxiliary_loss_mlp": 0.01035555, "balance_loss_clip": 1.02332377, "balance_loss_mlp": 1.03547359, "epoch": 0.7484743724635503, "flos": 22887737631360.0, "grad_norm": 1.734451421175611, "language_loss": 0.73792326, "learning_rate": 5.926154188020922e-07, "loss": 0.75933766, "num_input_tokens_seen": 268467250, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 12449, "time_per_iteration": 3.9595909118652344 }, { "auxiliary_loss_clip": 0.01136841, "auxiliary_loss_mlp": 0.01035155, "balance_loss_clip": 1.02207732, "balance_loss_mlp": 1.03709435, "epoch": 0.7485344957162182, "flos": 25300288563840.0, "grad_norm": 1.9330448089937475, "language_loss": 0.61423206, "learning_rate": 5.923469993156327e-07, "loss": 0.63595212, "num_input_tokens_seen": 268487270, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 12450, "time_per_iteration": 2.586575508117676 }, { "auxiliary_loss_clip": 0.01098067, "auxiliary_loss_mlp": 0.01028429, "balance_loss_clip": 1.01687765, "balance_loss_mlp": 1.03220463, "epoch": 0.7485946189688862, "flos": 27053016612480.0, "grad_norm": 1.999382745879527, "language_loss": 0.70622134, "learning_rate": 5.920786300646892e-07, "loss": 0.72748631, "num_input_tokens_seen": 268508020, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.65625, "step": 12451, "time_per_iteration": 2.550636053085327 }, { "auxiliary_loss_clip": 0.01115962, "auxiliary_loss_mlp": 0.01027671, "balance_loss_clip": 1.01489782, "balance_loss_mlp": 1.03506851, "epoch": 0.7486547422215541, "flos": 26906752431360.0, "grad_norm": 1.9123089802596753, "language_loss": 0.80581528, "learning_rate": 5.918103110588364e-07, "loss": 0.82725155, "num_input_tokens_seen": 268527375, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 12452, "time_per_iteration": 2.5278067588806152 }, { "auxiliary_loss_clip": 0.01111511, "auxiliary_loss_mlp": 0.01034289, "balance_loss_clip": 1.02116394, "balance_loss_mlp": 1.0383718, "epoch": 0.7487148654742222, "flos": 22346277039360.0, "grad_norm": 2.616945076674802, "language_loss": 0.71769345, "learning_rate": 5.91542042307652e-07, "loss": 0.73915148, "num_input_tokens_seen": 268544870, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 12453, "time_per_iteration": 2.5393128395080566 }, { "auxiliary_loss_clip": 0.01125028, "auxiliary_loss_mlp": 0.01031465, "balance_loss_clip": 1.01911485, "balance_loss_mlp": 1.03722131, "epoch": 0.7487749887268901, "flos": 23038814234880.0, "grad_norm": 1.5321203607805278, "language_loss": 0.73755634, "learning_rate": 5.912738238207091e-07, "loss": 0.7591213, "num_input_tokens_seen": 268564580, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 12454, "time_per_iteration": 2.5559933185577393 }, { "auxiliary_loss_clip": 0.01043819, "auxiliary_loss_mlp": 0.01005204, "balance_loss_clip": 1.00368404, "balance_loss_mlp": 1.00388074, "epoch": 0.7488351119795581, "flos": 71525294536320.0, "grad_norm": 0.7351419137597036, "language_loss": 0.59439445, "learning_rate": 5.9100565560758e-07, "loss": 0.61488473, "num_input_tokens_seen": 268629550, "router_z_loss_clip": 0.01519775, "router_z_loss_mlp": 0.21582031, "step": 12455, "time_per_iteration": 3.337402105331421 }, { "auxiliary_loss_clip": 0.0112973, "auxiliary_loss_mlp": 0.01027715, "balance_loss_clip": 1.01539421, "balance_loss_mlp": 1.0348295, "epoch": 0.748895235232226, "flos": 17196255722880.0, "grad_norm": 1.7577251000658924, "language_loss": 0.79554594, "learning_rate": 5.907375376778343e-07, "loss": 0.81712043, "num_input_tokens_seen": 268646645, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 12456, "time_per_iteration": 2.533933401107788 }, { "auxiliary_loss_clip": 0.01034105, "auxiliary_loss_mlp": 0.01000925, "balance_loss_clip": 0.99939901, "balance_loss_mlp": 1.00360227, "epoch": 0.748955358484894, "flos": 58979256336000.0, "grad_norm": 0.9607924961247806, "language_loss": 0.61484748, "learning_rate": 5.904694700410404e-07, "loss": 0.63519776, "num_input_tokens_seen": 268702275, "router_z_loss_clip": 0.01525879, "router_z_loss_mlp": 0.21679688, "step": 12457, "time_per_iteration": 3.12215518951416 }, { "auxiliary_loss_clip": 0.0112784, "auxiliary_loss_mlp": 0.01035633, "balance_loss_clip": 1.02263927, "balance_loss_mlp": 1.03860211, "epoch": 0.749015481737562, "flos": 11360413054080.0, "grad_norm": 2.088083887877489, "language_loss": 0.67951399, "learning_rate": 5.902014527067667e-07, "loss": 0.70114869, "num_input_tokens_seen": 268716265, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 12458, "time_per_iteration": 2.501121997833252 }, { "auxiliary_loss_clip": 0.01137305, "auxiliary_loss_mlp": 0.01030719, "balance_loss_clip": 1.01850629, "balance_loss_mlp": 1.033149, "epoch": 0.74907560499023, "flos": 21106497162240.0, "grad_norm": 1.7763205627821541, "language_loss": 0.80217922, "learning_rate": 5.899334856845753e-07, "loss": 0.82385939, "num_input_tokens_seen": 268734330, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 12459, "time_per_iteration": 2.5630104541778564 }, { "auxiliary_loss_clip": 0.01129443, "auxiliary_loss_mlp": 0.0103208, "balance_loss_clip": 1.01913404, "balance_loss_mlp": 1.0347842, "epoch": 0.749135728242898, "flos": 22268027260800.0, "grad_norm": 1.4594948028150978, "language_loss": 0.80478215, "learning_rate": 5.896655689840313e-07, "loss": 0.82639736, "num_input_tokens_seen": 268753500, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.67578125, "step": 12460, "time_per_iteration": 2.5546116828918457 }, { "auxiliary_loss_clip": 0.01121923, "auxiliary_loss_mlp": 0.01029294, "balance_loss_clip": 1.01591897, "balance_loss_mlp": 1.0351994, "epoch": 0.7491958514955659, "flos": 24057527857920.0, "grad_norm": 1.7048042520474624, "language_loss": 0.86050844, "learning_rate": 5.893977026146955e-07, "loss": 0.88202059, "num_input_tokens_seen": 268772055, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6875, "step": 12461, "time_per_iteration": 2.5253713130950928 }, { "auxiliary_loss_clip": 0.01110625, "auxiliary_loss_mlp": 0.0103171, "balance_loss_clip": 1.01977718, "balance_loss_mlp": 1.03460658, "epoch": 0.7492559747482339, "flos": 24492118510080.0, "grad_norm": 2.7192563542907675, "language_loss": 0.69553995, "learning_rate": 5.89129886586127e-07, "loss": 0.71696329, "num_input_tokens_seen": 268792265, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66796875, "step": 12462, "time_per_iteration": 2.561210870742798 }, { "auxiliary_loss_clip": 0.01104742, "auxiliary_loss_mlp": 0.01032402, "balance_loss_clip": 1.01956964, "balance_loss_mlp": 1.03572893, "epoch": 0.7493160980009018, "flos": 27745338326400.0, "grad_norm": 1.8320041068171427, "language_loss": 0.70327699, "learning_rate": 5.888621209078833e-07, "loss": 0.72464848, "num_input_tokens_seen": 268812735, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 12463, "time_per_iteration": 2.5940518379211426 }, { "auxiliary_loss_clip": 0.01131843, "auxiliary_loss_mlp": 0.01031617, "balance_loss_clip": 1.01982749, "balance_loss_mlp": 1.03653288, "epoch": 0.7493762212535698, "flos": 30226190970240.0, "grad_norm": 1.9213390280885434, "language_loss": 0.77598709, "learning_rate": 5.885944055895208e-07, "loss": 0.79762173, "num_input_tokens_seen": 268833090, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 12464, "time_per_iteration": 2.679452896118164 }, { "auxiliary_loss_clip": 0.01118605, "auxiliary_loss_mlp": 0.01025698, "balance_loss_clip": 1.01390243, "balance_loss_mlp": 1.03220081, "epoch": 0.7494363445062378, "flos": 21944472526080.0, "grad_norm": 1.6900999256877882, "language_loss": 0.783701, "learning_rate": 5.883267406405938e-07, "loss": 0.80514401, "num_input_tokens_seen": 268851880, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 12465, "time_per_iteration": 2.5593457221984863 }, { "auxiliary_loss_clip": 0.01139127, "auxiliary_loss_mlp": 0.01037167, "balance_loss_clip": 1.02306414, "balance_loss_mlp": 1.03704429, "epoch": 0.7494964677589058, "flos": 12490342162560.0, "grad_norm": 2.3593930895707547, "language_loss": 0.74548715, "learning_rate": 5.88059126070654e-07, "loss": 0.76725012, "num_input_tokens_seen": 268867910, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 12466, "time_per_iteration": 2.5824155807495117 }, { "auxiliary_loss_clip": 0.0111507, "auxiliary_loss_mlp": 0.01031789, "balance_loss_clip": 1.01912284, "balance_loss_mlp": 1.03553343, "epoch": 0.7495565910115737, "flos": 21653057485440.0, "grad_norm": 2.095699411611017, "language_loss": 0.66652083, "learning_rate": 5.877915618892521e-07, "loss": 0.68798947, "num_input_tokens_seen": 268887260, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12467, "time_per_iteration": 2.5122783184051514 }, { "auxiliary_loss_clip": 0.01140511, "auxiliary_loss_mlp": 0.0128326, "balance_loss_clip": 1.02355421, "balance_loss_mlp": 1.03507853, "epoch": 0.7496167142642417, "flos": 15268535591040.0, "grad_norm": 2.5404599999843716, "language_loss": 0.76680923, "learning_rate": 5.875240481059367e-07, "loss": 0.79104698, "num_input_tokens_seen": 268902520, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 12468, "time_per_iteration": 2.5682075023651123 }, { "auxiliary_loss_clip": 0.01130694, "auxiliary_loss_mlp": 0.0103175, "balance_loss_clip": 1.01831484, "balance_loss_mlp": 1.03574669, "epoch": 0.7496768375169096, "flos": 22054933825920.0, "grad_norm": 1.9316833817635288, "language_loss": 0.69140112, "learning_rate": 5.872565847302547e-07, "loss": 0.71302551, "num_input_tokens_seen": 268920970, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6796875, "step": 12469, "time_per_iteration": 2.51393723487854 }, { "auxiliary_loss_clip": 0.01123721, "auxiliary_loss_mlp": 0.01032103, "balance_loss_clip": 1.0190022, "balance_loss_mlp": 1.03565645, "epoch": 0.7497369607695776, "flos": 19057038860160.0, "grad_norm": 2.503590334410866, "language_loss": 0.69481891, "learning_rate": 5.869891717717505e-07, "loss": 0.71637714, "num_input_tokens_seen": 268936600, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12470, "time_per_iteration": 2.5200400352478027 }, { "auxiliary_loss_clip": 0.01125754, "auxiliary_loss_mlp": 0.0103338, "balance_loss_clip": 1.01934314, "balance_loss_mlp": 1.03468776, "epoch": 0.7497970840222457, "flos": 21617434172160.0, "grad_norm": 2.209180635788495, "language_loss": 0.75395429, "learning_rate": 5.867218092399688e-07, "loss": 0.7755456, "num_input_tokens_seen": 268956560, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 12471, "time_per_iteration": 2.5223171710968018 }, { "auxiliary_loss_clip": 0.01123069, "auxiliary_loss_mlp": 0.01029547, "balance_loss_clip": 1.01669049, "balance_loss_mlp": 1.03489995, "epoch": 0.7498572072749136, "flos": 13735580906880.0, "grad_norm": 2.0064730503108055, "language_loss": 0.76953959, "learning_rate": 5.864544971444503e-07, "loss": 0.79106569, "num_input_tokens_seen": 268973945, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 12472, "time_per_iteration": 2.4771740436553955 }, { "auxiliary_loss_clip": 0.01110313, "auxiliary_loss_mlp": 0.01277148, "balance_loss_clip": 1.01832604, "balance_loss_mlp": 1.03320706, "epoch": 0.7499173305275816, "flos": 22966526113920.0, "grad_norm": 1.4929202754978732, "language_loss": 0.84003007, "learning_rate": 5.861872354947345e-07, "loss": 0.8639046, "num_input_tokens_seen": 268993245, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 12473, "time_per_iteration": 2.5251238346099854 }, { "auxiliary_loss_clip": 0.01136602, "auxiliary_loss_mlp": 0.01035985, "balance_loss_clip": 1.02249646, "balance_loss_mlp": 1.03716564, "epoch": 0.7499774537802495, "flos": 22740467869440.0, "grad_norm": 2.1170135183811305, "language_loss": 0.7390067, "learning_rate": 5.859200243003592e-07, "loss": 0.76073253, "num_input_tokens_seen": 269012125, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 12474, "time_per_iteration": 2.515277147293091 }, { "auxiliary_loss_clip": 0.01127821, "auxiliary_loss_mlp": 0.01037455, "balance_loss_clip": 1.02441978, "balance_loss_mlp": 1.03603387, "epoch": 0.7500375770329175, "flos": 18296559089280.0, "grad_norm": 1.7870019052607793, "language_loss": 0.74901098, "learning_rate": 5.856528635708619e-07, "loss": 0.77066374, "num_input_tokens_seen": 269030545, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 12475, "time_per_iteration": 2.4922800064086914 }, { "auxiliary_loss_clip": 0.01119994, "auxiliary_loss_mlp": 0.01037919, "balance_loss_clip": 1.02454352, "balance_loss_mlp": 1.03712702, "epoch": 0.7500977002855854, "flos": 19169978198400.0, "grad_norm": 1.894906545081729, "language_loss": 0.80309224, "learning_rate": 5.853857533157747e-07, "loss": 0.82467133, "num_input_tokens_seen": 269048180, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73828125, "step": 12476, "time_per_iteration": 2.4930739402770996 }, { "auxiliary_loss_clip": 0.01130591, "auxiliary_loss_mlp": 0.01032332, "balance_loss_clip": 1.02018452, "balance_loss_mlp": 1.03427458, "epoch": 0.7501578235382534, "flos": 22163886754560.0, "grad_norm": 1.588453913404838, "language_loss": 0.77693236, "learning_rate": 5.851186935446316e-07, "loss": 0.79856151, "num_input_tokens_seen": 269068600, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 12477, "time_per_iteration": 4.028241157531738 }, { "auxiliary_loss_clip": 0.01043002, "auxiliary_loss_mlp": 0.01001405, "balance_loss_clip": 0.99999279, "balance_loss_mlp": 1.00372171, "epoch": 0.7502179467909214, "flos": 64465040033280.0, "grad_norm": 0.8064529583797861, "language_loss": 0.54436648, "learning_rate": 5.848516842669626e-07, "loss": 0.56481051, "num_input_tokens_seen": 269119045, "router_z_loss_clip": 0.01409912, "router_z_loss_mlp": 0.21679688, "step": 12478, "time_per_iteration": 3.1475231647491455 }, { "auxiliary_loss_clip": 0.01120327, "auxiliary_loss_mlp": 0.0103371, "balance_loss_clip": 1.02171779, "balance_loss_mlp": 1.03496099, "epoch": 0.7502780700435894, "flos": 20478275268480.0, "grad_norm": 1.6426761810289185, "language_loss": 0.80238557, "learning_rate": 5.845847254922971e-07, "loss": 0.82392597, "num_input_tokens_seen": 269136755, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 12479, "time_per_iteration": 2.4970970153808594 }, { "auxiliary_loss_clip": 0.01118232, "auxiliary_loss_mlp": 0.01032366, "balance_loss_clip": 1.01877618, "balance_loss_mlp": 1.03691971, "epoch": 0.7503381932962573, "flos": 20445273648000.0, "grad_norm": 2.378671007008747, "language_loss": 0.6226269, "learning_rate": 5.843178172301613e-07, "loss": 0.64413285, "num_input_tokens_seen": 269156120, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 12480, "time_per_iteration": 2.5274391174316406 }, { "auxiliary_loss_clip": 0.01118827, "auxiliary_loss_mlp": 0.0103614, "balance_loss_clip": 1.02468431, "balance_loss_mlp": 1.03477407, "epoch": 0.7503983165489253, "flos": 22381936266240.0, "grad_norm": 2.771237400871557, "language_loss": 0.77676183, "learning_rate": 5.840509594900813e-07, "loss": 0.79831147, "num_input_tokens_seen": 269175650, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66015625, "step": 12481, "time_per_iteration": 2.539619207382202 }, { "auxiliary_loss_clip": 0.01131678, "auxiliary_loss_mlp": 0.01031023, "balance_loss_clip": 1.01799893, "balance_loss_mlp": 1.03309965, "epoch": 0.7504584398015932, "flos": 24899453717760.0, "grad_norm": 1.9526687142742876, "language_loss": 0.7133826, "learning_rate": 5.837841522815805e-07, "loss": 0.73500961, "num_input_tokens_seen": 269197080, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 12482, "time_per_iteration": 2.593351125717163 }, { "auxiliary_loss_clip": 0.0110591, "auxiliary_loss_mlp": 0.01036744, "balance_loss_clip": 1.02422738, "balance_loss_mlp": 1.03569007, "epoch": 0.7505185630542612, "flos": 25885237547520.0, "grad_norm": 3.3205779700952545, "language_loss": 0.70130378, "learning_rate": 5.835173956141805e-07, "loss": 0.72273034, "num_input_tokens_seen": 269218600, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 12483, "time_per_iteration": 3.899379014968872 }, { "auxiliary_loss_clip": 0.01104905, "auxiliary_loss_mlp": 0.01026037, "balance_loss_clip": 1.01500392, "balance_loss_mlp": 1.03541875, "epoch": 0.7505786863069293, "flos": 23143852581120.0, "grad_norm": 2.120220420077143, "language_loss": 0.74730819, "learning_rate": 5.83250689497401e-07, "loss": 0.76861763, "num_input_tokens_seen": 269239245, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6953125, "step": 12484, "time_per_iteration": 2.497337579727173 }, { "auxiliary_loss_clip": 0.01120779, "auxiliary_loss_mlp": 0.01029399, "balance_loss_clip": 1.01710868, "balance_loss_mlp": 1.03501892, "epoch": 0.7506388095595972, "flos": 16983377769600.0, "grad_norm": 8.47667741761115, "language_loss": 0.84202331, "learning_rate": 5.829840339407599e-07, "loss": 0.86352515, "num_input_tokens_seen": 269258520, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 12485, "time_per_iteration": 2.5024287700653076 }, { "auxiliary_loss_clip": 0.01129637, "auxiliary_loss_mlp": 0.01032231, "balance_loss_clip": 1.02008963, "balance_loss_mlp": 1.03445721, "epoch": 0.7506989328122652, "flos": 22344984149760.0, "grad_norm": 1.4161655532073263, "language_loss": 0.78214431, "learning_rate": 5.827174289537738e-07, "loss": 0.80376303, "num_input_tokens_seen": 269278320, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 12486, "time_per_iteration": 2.5583691596984863 }, { "auxiliary_loss_clip": 0.01117741, "auxiliary_loss_mlp": 0.01032887, "balance_loss_clip": 1.01904047, "balance_loss_mlp": 1.03687549, "epoch": 0.7507590560649331, "flos": 25776069137280.0, "grad_norm": 4.49436621687914, "language_loss": 0.72492838, "learning_rate": 5.824508745459562e-07, "loss": 0.74643463, "num_input_tokens_seen": 269298025, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 12487, "time_per_iteration": 2.5030951499938965 }, { "auxiliary_loss_clip": 0.01122288, "auxiliary_loss_mlp": 0.01026249, "balance_loss_clip": 1.01410198, "balance_loss_mlp": 1.0340488, "epoch": 0.7508191793176011, "flos": 24279420124800.0, "grad_norm": 2.3089212298929094, "language_loss": 0.67060912, "learning_rate": 5.82184370726821e-07, "loss": 0.6920945, "num_input_tokens_seen": 269316770, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.70703125, "step": 12488, "time_per_iteration": 2.5713648796081543 }, { "auxiliary_loss_clip": 0.01149608, "auxiliary_loss_mlp": 0.01029069, "balance_loss_clip": 1.01622391, "balance_loss_mlp": 1.03523755, "epoch": 0.750879302570269, "flos": 19899575251200.0, "grad_norm": 1.767766916845489, "language_loss": 0.77355933, "learning_rate": 5.819179175058789e-07, "loss": 0.79534608, "num_input_tokens_seen": 269334755, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 12489, "time_per_iteration": 2.5404655933380127 }, { "auxiliary_loss_clip": 0.01110738, "auxiliary_loss_mlp": 0.01031113, "balance_loss_clip": 1.01984227, "balance_loss_mlp": 1.03486586, "epoch": 0.750939425822937, "flos": 29205681667200.0, "grad_norm": 1.6901740818725144, "language_loss": 0.75051463, "learning_rate": 5.816515148926384e-07, "loss": 0.77193314, "num_input_tokens_seen": 269353810, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.671875, "step": 12490, "time_per_iteration": 4.124460458755493 }, { "auxiliary_loss_clip": 0.01104508, "auxiliary_loss_mlp": 0.01028598, "balance_loss_clip": 1.01634336, "balance_loss_mlp": 1.03574955, "epoch": 0.750999549075605, "flos": 21142300043520.0, "grad_norm": 1.561266454007901, "language_loss": 0.78225732, "learning_rate": 5.813851628966062e-07, "loss": 0.80358839, "num_input_tokens_seen": 269372910, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 12491, "time_per_iteration": 2.4608914852142334 }, { "auxiliary_loss_clip": 0.01109428, "auxiliary_loss_mlp": 0.01030873, "balance_loss_clip": 1.0193572, "balance_loss_mlp": 1.03313875, "epoch": 0.751059672328273, "flos": 23547740083200.0, "grad_norm": 1.6417055171234622, "language_loss": 0.76081884, "learning_rate": 5.811188615272899e-07, "loss": 0.78222185, "num_input_tokens_seen": 269391545, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 12492, "time_per_iteration": 2.512681245803833 }, { "auxiliary_loss_clip": 0.01128416, "auxiliary_loss_mlp": 0.01032597, "balance_loss_clip": 1.02086663, "balance_loss_mlp": 1.0325979, "epoch": 0.7511197955809409, "flos": 18989742729600.0, "grad_norm": 1.79375073746905, "language_loss": 0.70918393, "learning_rate": 5.808526107941902e-07, "loss": 0.73079407, "num_input_tokens_seen": 269408530, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 12493, "time_per_iteration": 2.5409233570098877 }, { "auxiliary_loss_clip": 0.01114062, "auxiliary_loss_mlp": 0.01273449, "balance_loss_clip": 1.01437879, "balance_loss_mlp": 1.03590333, "epoch": 0.7511799188336089, "flos": 22046961006720.0, "grad_norm": 1.758930869323038, "language_loss": 0.80527055, "learning_rate": 5.80586410706811e-07, "loss": 0.82914567, "num_input_tokens_seen": 269425930, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 12494, "time_per_iteration": 2.520751714706421 }, { "auxiliary_loss_clip": 0.01105292, "auxiliary_loss_mlp": 0.01028557, "balance_loss_clip": 1.01574802, "balance_loss_mlp": 1.03481877, "epoch": 0.7512400420862768, "flos": 16467125546880.0, "grad_norm": 2.5564395964877633, "language_loss": 0.7840032, "learning_rate": 5.803202612746518e-07, "loss": 0.80534172, "num_input_tokens_seen": 269443945, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 12495, "time_per_iteration": 2.5341992378234863 }, { "auxiliary_loss_clip": 0.01104246, "auxiliary_loss_mlp": 0.01284753, "balance_loss_clip": 1.02511263, "balance_loss_mlp": 1.03532958, "epoch": 0.7513001653389448, "flos": 20448326304000.0, "grad_norm": 1.745366443483108, "language_loss": 0.7100293, "learning_rate": 5.800541625072104e-07, "loss": 0.73391932, "num_input_tokens_seen": 269463625, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 12496, "time_per_iteration": 2.5055251121520996 }, { "auxiliary_loss_clip": 0.01140878, "auxiliary_loss_mlp": 0.0103183, "balance_loss_clip": 1.01993918, "balance_loss_mlp": 1.0357306, "epoch": 0.7513602885916129, "flos": 23476816679040.0, "grad_norm": 2.6325545735385267, "language_loss": 0.78096235, "learning_rate": 5.797881144139829e-07, "loss": 0.80268943, "num_input_tokens_seen": 269483415, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 12497, "time_per_iteration": 2.6179065704345703 }, { "auxiliary_loss_clip": 0.0111427, "auxiliary_loss_mlp": 0.01035641, "balance_loss_clip": 1.02264118, "balance_loss_mlp": 1.03488934, "epoch": 0.7514204118442808, "flos": 26797224885120.0, "grad_norm": 1.444094964705949, "language_loss": 0.77052069, "learning_rate": 5.795221170044648e-07, "loss": 0.79201978, "num_input_tokens_seen": 269504635, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 12498, "time_per_iteration": 2.5865182876586914 }, { "auxiliary_loss_clip": 0.01121036, "auxiliary_loss_mlp": 0.01032886, "balance_loss_clip": 1.02153754, "balance_loss_mlp": 1.0369879, "epoch": 0.7514805350969488, "flos": 19865639877120.0, "grad_norm": 1.9465406449972271, "language_loss": 0.73965609, "learning_rate": 5.792561702881493e-07, "loss": 0.7611953, "num_input_tokens_seen": 269523955, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.66015625, "step": 12499, "time_per_iteration": 2.5931220054626465 }, { "auxiliary_loss_clip": 0.01109976, "auxiliary_loss_mlp": 0.0102708, "balance_loss_clip": 1.01518846, "balance_loss_mlp": 1.03333092, "epoch": 0.7515406583496167, "flos": 24571553437440.0, "grad_norm": 1.8614752165324304, "language_loss": 0.79503417, "learning_rate": 5.789902742745251e-07, "loss": 0.8164047, "num_input_tokens_seen": 269544410, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 12500, "time_per_iteration": 2.5612685680389404 }, { "auxiliary_loss_clip": 0.01121406, "auxiliary_loss_mlp": 0.01038511, "balance_loss_clip": 1.02389026, "balance_loss_mlp": 1.03717148, "epoch": 0.7516007816022847, "flos": 20120246455680.0, "grad_norm": 2.1774808868175275, "language_loss": 0.73386621, "learning_rate": 5.787244289730835e-07, "loss": 0.75546539, "num_input_tokens_seen": 269563315, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75390625, "step": 12501, "time_per_iteration": 2.5210180282592773 }, { "auxiliary_loss_clip": 0.01120136, "auxiliary_loss_mlp": 0.0102333, "balance_loss_clip": 1.01062179, "balance_loss_mlp": 1.03383851, "epoch": 0.7516609048549526, "flos": 22784638619520.0, "grad_norm": 1.8447419788957897, "language_loss": 0.78740466, "learning_rate": 5.784586343933111e-07, "loss": 0.80883932, "num_input_tokens_seen": 269583950, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 12502, "time_per_iteration": 2.62017822265625 }, { "auxiliary_loss_clip": 0.01123177, "auxiliary_loss_mlp": 0.0103417, "balance_loss_clip": 1.02178419, "balance_loss_mlp": 1.0355792, "epoch": 0.7517210281076206, "flos": 10634012311680.0, "grad_norm": 2.4824739464755017, "language_loss": 0.70384139, "learning_rate": 5.781928905446933e-07, "loss": 0.72541487, "num_input_tokens_seen": 269600120, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 12503, "time_per_iteration": 2.494915723800659 }, { "auxiliary_loss_clip": 0.01024505, "auxiliary_loss_mlp": 0.01004779, "balance_loss_clip": 1.00340259, "balance_loss_mlp": 1.00289142, "epoch": 0.7517811513602886, "flos": 66052221275520.0, "grad_norm": 0.9835972686231906, "language_loss": 0.64025974, "learning_rate": 5.779271974367132e-07, "loss": 0.66055256, "num_input_tokens_seen": 269659815, "router_z_loss_clip": 0.01379395, "router_z_loss_mlp": 0.21679688, "step": 12504, "time_per_iteration": 3.012113332748413 }, { "auxiliary_loss_clip": 0.01126342, "auxiliary_loss_mlp": 0.01033568, "balance_loss_clip": 1.02192712, "balance_loss_mlp": 1.03327572, "epoch": 0.7518412746129566, "flos": 37268345018880.0, "grad_norm": 1.6966307461053187, "language_loss": 0.6855942, "learning_rate": 5.776615550788548e-07, "loss": 0.70719326, "num_input_tokens_seen": 269684565, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6640625, "step": 12505, "time_per_iteration": 2.693699836730957 }, { "auxiliary_loss_clip": 0.01032808, "auxiliary_loss_mlp": 0.01252211, "balance_loss_clip": 1.00487161, "balance_loss_mlp": 1.00283325, "epoch": 0.7519013978656245, "flos": 60518567727360.0, "grad_norm": 0.6489277418787472, "language_loss": 0.55126739, "learning_rate": 5.773959634805956e-07, "loss": 0.57411754, "num_input_tokens_seen": 269752325, "router_z_loss_clip": 0.01397705, "router_z_loss_mlp": 0.21679688, "step": 12506, "time_per_iteration": 3.197248935699463 }, { "auxiliary_loss_clip": 0.01117792, "auxiliary_loss_mlp": 0.01038, "balance_loss_clip": 1.0251615, "balance_loss_mlp": 1.03745675, "epoch": 0.7519615211182925, "flos": 18806885568000.0, "grad_norm": 1.741186961261477, "language_loss": 0.78319287, "learning_rate": 5.771304226514155e-07, "loss": 0.8047508, "num_input_tokens_seen": 269770630, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 12507, "time_per_iteration": 2.5698792934417725 }, { "auxiliary_loss_clip": 0.01120724, "auxiliary_loss_mlp": 0.01034769, "balance_loss_clip": 1.02328944, "balance_loss_mlp": 1.03604257, "epoch": 0.7520216443709604, "flos": 14575244209920.0, "grad_norm": 1.7111334401868101, "language_loss": 0.7124815, "learning_rate": 5.768649326007902e-07, "loss": 0.73403645, "num_input_tokens_seen": 269787280, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66796875, "step": 12508, "time_per_iteration": 2.5143425464630127 }, { "auxiliary_loss_clip": 0.01111191, "auxiliary_loss_mlp": 0.01031018, "balance_loss_clip": 1.01925802, "balance_loss_mlp": 1.03465199, "epoch": 0.7520817676236284, "flos": 17056599644160.0, "grad_norm": 1.7173446496731997, "language_loss": 0.72335708, "learning_rate": 5.765994933381957e-07, "loss": 0.74477923, "num_input_tokens_seen": 269805205, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 12509, "time_per_iteration": 2.4713687896728516 }, { "auxiliary_loss_clip": 0.01109401, "auxiliary_loss_mlp": 0.01033357, "balance_loss_clip": 1.01997018, "balance_loss_mlp": 1.03599977, "epoch": 0.7521418908762965, "flos": 25666397936640.0, "grad_norm": 1.622937287017869, "language_loss": 0.61985576, "learning_rate": 5.763341048731028e-07, "loss": 0.64128339, "num_input_tokens_seen": 269824820, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 12510, "time_per_iteration": 2.5333077907562256 }, { "auxiliary_loss_clip": 0.0111031, "auxiliary_loss_mlp": 0.01031736, "balance_loss_clip": 1.01969552, "balance_loss_mlp": 1.03527892, "epoch": 0.7522020141289644, "flos": 20886759711360.0, "grad_norm": 1.7032652132694333, "language_loss": 0.81459117, "learning_rate": 5.760687672149842e-07, "loss": 0.83601165, "num_input_tokens_seen": 269842825, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6640625, "step": 12511, "time_per_iteration": 2.5540497303009033 }, { "auxiliary_loss_clip": 0.01126631, "auxiliary_loss_mlp": 0.01035722, "balance_loss_clip": 1.02219772, "balance_loss_mlp": 1.03567123, "epoch": 0.7522621373816324, "flos": 12640305444480.0, "grad_norm": 2.265149714012028, "language_loss": 0.7609604, "learning_rate": 5.758034803733085e-07, "loss": 0.78258389, "num_input_tokens_seen": 269859000, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 12512, "time_per_iteration": 2.603818655014038 }, { "auxiliary_loss_clip": 0.01106441, "auxiliary_loss_mlp": 0.01028215, "balance_loss_clip": 1.01665735, "balance_loss_mlp": 1.03269696, "epoch": 0.7523222606343003, "flos": 25626141768960.0, "grad_norm": 3.6545402802031046, "language_loss": 0.82223803, "learning_rate": 5.755382443575429e-07, "loss": 0.8435846, "num_input_tokens_seen": 269878895, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6484375, "step": 12513, "time_per_iteration": 2.5427188873291016 }, { "auxiliary_loss_clip": 0.01115661, "auxiliary_loss_mlp": 0.01034611, "balance_loss_clip": 1.02105057, "balance_loss_mlp": 1.03513491, "epoch": 0.7523823838869683, "flos": 20448900921600.0, "grad_norm": 1.8452150703375416, "language_loss": 0.7479434, "learning_rate": 5.752730591771535e-07, "loss": 0.76944613, "num_input_tokens_seen": 269897280, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 12514, "time_per_iteration": 2.477987289428711 }, { "auxiliary_loss_clip": 0.01108521, "auxiliary_loss_mlp": 0.01030064, "balance_loss_clip": 1.01898944, "balance_loss_mlp": 1.03367996, "epoch": 0.7524425071396362, "flos": 14720610551040.0, "grad_norm": 1.8834272884588403, "language_loss": 0.69017577, "learning_rate": 5.750079248416031e-07, "loss": 0.71156156, "num_input_tokens_seen": 269914640, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.65625, "step": 12515, "time_per_iteration": 2.4682767391204834 }, { "auxiliary_loss_clip": 0.01114412, "auxiliary_loss_mlp": 0.01033253, "balance_loss_clip": 1.02167225, "balance_loss_mlp": 1.03626871, "epoch": 0.7525026303923043, "flos": 30592048947840.0, "grad_norm": 2.181969768995124, "language_loss": 0.70221722, "learning_rate": 5.747428413603554e-07, "loss": 0.72369391, "num_input_tokens_seen": 269934960, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 12516, "time_per_iteration": 2.578371524810791 }, { "auxiliary_loss_clip": 0.01137836, "auxiliary_loss_mlp": 0.01032489, "balance_loss_clip": 1.02012706, "balance_loss_mlp": 1.03668582, "epoch": 0.7525627536449722, "flos": 24791757765120.0, "grad_norm": 1.6068412800596603, "language_loss": 0.89573342, "learning_rate": 5.744778087428686e-07, "loss": 0.91743666, "num_input_tokens_seen": 269956655, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.66015625, "step": 12517, "time_per_iteration": 2.6082446575164795 }, { "auxiliary_loss_clip": 0.01123604, "auxiliary_loss_mlp": 0.01035765, "balance_loss_clip": 1.02374244, "balance_loss_mlp": 1.03539729, "epoch": 0.7526228768976402, "flos": 20779782030720.0, "grad_norm": 1.9847950946053283, "language_loss": 0.74234635, "learning_rate": 5.742128269986022e-07, "loss": 0.7639401, "num_input_tokens_seen": 269976835, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 12518, "time_per_iteration": 2.5360054969787598 }, { "auxiliary_loss_clip": 0.01060599, "auxiliary_loss_mlp": 0.01001651, "balance_loss_clip": 1.00031018, "balance_loss_mlp": 1.00306797, "epoch": 0.7526830001503081, "flos": 66559243703040.0, "grad_norm": 0.7292841232114401, "language_loss": 0.55761451, "learning_rate": 5.739478961370126e-07, "loss": 0.578237, "num_input_tokens_seen": 270040630, "router_z_loss_clip": 0.01342773, "router_z_loss_mlp": 0.21484375, "step": 12519, "time_per_iteration": 4.626740217208862 }, { "auxiliary_loss_clip": 0.01110709, "auxiliary_loss_mlp": 0.01032618, "balance_loss_clip": 1.02162099, "balance_loss_mlp": 1.03341961, "epoch": 0.7527431234029761, "flos": 23477894087040.0, "grad_norm": 1.4614382442976297, "language_loss": 0.77777755, "learning_rate": 5.736830161675544e-07, "loss": 0.79921079, "num_input_tokens_seen": 270059695, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.6796875, "step": 12520, "time_per_iteration": 2.5156164169311523 }, { "auxiliary_loss_clip": 0.01111718, "auxiliary_loss_mlp": 0.01036466, "balance_loss_clip": 1.02474141, "balance_loss_mlp": 1.03421509, "epoch": 0.752803246655644, "flos": 22049546785920.0, "grad_norm": 2.053424872863303, "language_loss": 0.73899841, "learning_rate": 5.734181870996797e-07, "loss": 0.76048023, "num_input_tokens_seen": 270078420, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 12521, "time_per_iteration": 2.555081605911255 }, { "auxiliary_loss_clip": 0.01128331, "auxiliary_loss_mlp": 0.01036128, "balance_loss_clip": 1.02311051, "balance_loss_mlp": 1.03723013, "epoch": 0.752863369908312, "flos": 30153795108480.0, "grad_norm": 2.0459020574267974, "language_loss": 0.66970235, "learning_rate": 5.731534089428413e-07, "loss": 0.69134694, "num_input_tokens_seen": 270097040, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 12522, "time_per_iteration": 2.636096239089966 }, { "auxiliary_loss_clip": 0.01116112, "auxiliary_loss_mlp": 0.01038386, "balance_loss_clip": 1.02593505, "balance_loss_mlp": 1.03639698, "epoch": 0.7529234931609801, "flos": 24567638855040.0, "grad_norm": 1.4999399262026982, "language_loss": 0.78384268, "learning_rate": 5.728886817064866e-07, "loss": 0.80538768, "num_input_tokens_seen": 270116365, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 12523, "time_per_iteration": 2.5570216178894043 }, { "auxiliary_loss_clip": 0.01102122, "auxiliary_loss_mlp": 0.0102808, "balance_loss_clip": 1.01619506, "balance_loss_mlp": 1.03467619, "epoch": 0.752983616413648, "flos": 23112395245440.0, "grad_norm": 2.0815366457305564, "language_loss": 0.80409539, "learning_rate": 5.726240054000644e-07, "loss": 0.82539737, "num_input_tokens_seen": 270135395, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 12524, "time_per_iteration": 3.911132335662842 }, { "auxiliary_loss_clip": 0.0111288, "auxiliary_loss_mlp": 0.01277132, "balance_loss_clip": 1.01876855, "balance_loss_mlp": 1.03580046, "epoch": 0.753043739666316, "flos": 24316946858880.0, "grad_norm": 1.522473249268187, "language_loss": 0.74146879, "learning_rate": 5.723593800330191e-07, "loss": 0.76536894, "num_input_tokens_seen": 270156425, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 12525, "time_per_iteration": 2.566685914993286 }, { "auxiliary_loss_clip": 0.01129774, "auxiliary_loss_mlp": 0.01034323, "balance_loss_clip": 1.02224112, "balance_loss_mlp": 1.03535318, "epoch": 0.7531038629189839, "flos": 24243294021120.0, "grad_norm": 1.8716148259264536, "language_loss": 0.72342169, "learning_rate": 5.720948056147965e-07, "loss": 0.74506271, "num_input_tokens_seen": 270176905, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 12526, "time_per_iteration": 2.6451187133789062 }, { "auxiliary_loss_clip": 0.01118651, "auxiliary_loss_mlp": 0.01026465, "balance_loss_clip": 1.01488388, "balance_loss_mlp": 1.03357208, "epoch": 0.7531639861716519, "flos": 30188807890560.0, "grad_norm": 1.6761906094557166, "language_loss": 0.71786231, "learning_rate": 5.71830282154836e-07, "loss": 0.73931348, "num_input_tokens_seen": 270196640, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 12527, "time_per_iteration": 2.6462435722351074 }, { "auxiliary_loss_clip": 0.01128307, "auxiliary_loss_mlp": 0.01025789, "balance_loss_clip": 1.01454151, "balance_loss_mlp": 1.03468847, "epoch": 0.7532241094243198, "flos": 18223193560320.0, "grad_norm": 1.780499928765543, "language_loss": 0.81283867, "learning_rate": 5.715658096625797e-07, "loss": 0.83437961, "num_input_tokens_seen": 270213905, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.671875, "step": 12528, "time_per_iteration": 2.5527780055999756 }, { "auxiliary_loss_clip": 0.01149219, "auxiliary_loss_mlp": 0.01031159, "balance_loss_clip": 1.01830816, "balance_loss_mlp": 1.03485012, "epoch": 0.7532842326769879, "flos": 20881049448960.0, "grad_norm": 2.0917368591824665, "language_loss": 0.85400569, "learning_rate": 5.71301388147466e-07, "loss": 0.87580949, "num_input_tokens_seen": 270231995, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 12529, "time_per_iteration": 2.5824151039123535 }, { "auxiliary_loss_clip": 0.0113496, "auxiliary_loss_mlp": 0.01032874, "balance_loss_clip": 1.02007699, "balance_loss_mlp": 1.03666568, "epoch": 0.7533443559296558, "flos": 18078689145600.0, "grad_norm": 1.9723950528451706, "language_loss": 0.73646063, "learning_rate": 5.710370176189292e-07, "loss": 0.75813901, "num_input_tokens_seen": 270251480, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 12530, "time_per_iteration": 2.5480732917785645 }, { "auxiliary_loss_clip": 0.01060475, "auxiliary_loss_mlp": 0.01001193, "balance_loss_clip": 0.99983454, "balance_loss_mlp": 1.00261295, "epoch": 0.7534044791823238, "flos": 50254830766080.0, "grad_norm": 0.8277537909442526, "language_loss": 0.6367327, "learning_rate": 5.707726980864062e-07, "loss": 0.65734935, "num_input_tokens_seen": 270306480, "router_z_loss_clip": 0.01361084, "router_z_loss_mlp": 0.21679688, "step": 12531, "time_per_iteration": 3.0311832427978516 }, { "auxiliary_loss_clip": 0.01124853, "auxiliary_loss_mlp": 0.01030303, "balance_loss_clip": 1.01708269, "balance_loss_mlp": 1.0355221, "epoch": 0.7534646024349917, "flos": 20850274471680.0, "grad_norm": 1.746533624832592, "language_loss": 0.69739181, "learning_rate": 5.705084295593287e-07, "loss": 0.71894336, "num_input_tokens_seen": 270324595, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 12532, "time_per_iteration": 5.6209423542022705 }, { "auxiliary_loss_clip": 0.01118209, "auxiliary_loss_mlp": 0.01029049, "balance_loss_clip": 1.01835012, "balance_loss_mlp": 1.03460348, "epoch": 0.7535247256876597, "flos": 23071779941760.0, "grad_norm": 1.5096258240413765, "language_loss": 0.77498877, "learning_rate": 5.702442120471296e-07, "loss": 0.79646134, "num_input_tokens_seen": 270344375, "router_z_loss_clip": 0.10693359, "router_z_loss_mlp": 0.65234375, "step": 12533, "time_per_iteration": 2.5631537437438965 }, { "auxiliary_loss_clip": 0.01033577, "auxiliary_loss_mlp": 0.01001378, "balance_loss_clip": 0.99992961, "balance_loss_mlp": 1.00276232, "epoch": 0.7535848489403276, "flos": 58623418252800.0, "grad_norm": 0.7817779456408362, "language_loss": 0.5731374, "learning_rate": 5.699800455592354e-07, "loss": 0.59348691, "num_input_tokens_seen": 270405235, "router_z_loss_clip": 0.01446533, "router_z_loss_mlp": 0.21679688, "step": 12534, "time_per_iteration": 3.148210287094116 }, { "auxiliary_loss_clip": 0.01119558, "auxiliary_loss_mlp": 0.01028768, "balance_loss_clip": 1.01656127, "balance_loss_mlp": 1.03306842, "epoch": 0.7536449721929956, "flos": 26577882483840.0, "grad_norm": 1.4223809194966222, "language_loss": 0.7120676, "learning_rate": 5.697159301050756e-07, "loss": 0.73355091, "num_input_tokens_seen": 270425820, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 12535, "time_per_iteration": 2.595353841781616 }, { "auxiliary_loss_clip": 0.01122017, "auxiliary_loss_mlp": 0.01027378, "balance_loss_clip": 1.01434183, "balance_loss_mlp": 1.03271508, "epoch": 0.7537050954456637, "flos": 25735992537600.0, "grad_norm": 2.255489777766896, "language_loss": 0.80766737, "learning_rate": 5.69451865694075e-07, "loss": 0.82916135, "num_input_tokens_seen": 270447120, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 12536, "time_per_iteration": 2.5425031185150146 }, { "auxiliary_loss_clip": 0.01117263, "auxiliary_loss_mlp": 0.01027823, "balance_loss_clip": 1.01587188, "balance_loss_mlp": 1.03271246, "epoch": 0.7537652186983316, "flos": 30224431203840.0, "grad_norm": 1.8649275164212218, "language_loss": 0.74481952, "learning_rate": 5.691878523356574e-07, "loss": 0.7662704, "num_input_tokens_seen": 270468680, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6640625, "step": 12537, "time_per_iteration": 2.622483730316162 }, { "auxiliary_loss_clip": 0.01104898, "auxiliary_loss_mlp": 0.01030784, "balance_loss_clip": 1.01832056, "balance_loss_mlp": 1.03434181, "epoch": 0.7538253419509996, "flos": 12641239198080.0, "grad_norm": 1.6365973602690878, "language_loss": 0.74367893, "learning_rate": 5.689238900392445e-07, "loss": 0.76503569, "num_input_tokens_seen": 270486310, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.70703125, "step": 12538, "time_per_iteration": 2.4853734970092773 }, { "auxiliary_loss_clip": 0.0113136, "auxiliary_loss_mlp": 0.01031696, "balance_loss_clip": 1.01941156, "balance_loss_mlp": 1.03499222, "epoch": 0.7538854652036675, "flos": 23185976256000.0, "grad_norm": 1.5260450097328777, "language_loss": 0.67742765, "learning_rate": 5.686599788142581e-07, "loss": 0.69905823, "num_input_tokens_seen": 270507210, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 12539, "time_per_iteration": 2.5662953853607178 }, { "auxiliary_loss_clip": 0.01102864, "auxiliary_loss_mlp": 0.01026842, "balance_loss_clip": 1.01437902, "balance_loss_mlp": 1.03532743, "epoch": 0.7539455884563355, "flos": 23186227651200.0, "grad_norm": 1.675653322203641, "language_loss": 0.74665487, "learning_rate": 5.683961186701138e-07, "loss": 0.76795185, "num_input_tokens_seen": 270525250, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.67578125, "step": 12540, "time_per_iteration": 2.524477958679199 }, { "auxiliary_loss_clip": 0.01105299, "auxiliary_loss_mlp": 0.01027152, "balance_loss_clip": 1.01427126, "balance_loss_mlp": 1.03456235, "epoch": 0.7540057117090034, "flos": 13181155505280.0, "grad_norm": 2.6060478363490893, "language_loss": 0.72822118, "learning_rate": 5.6813230961623e-07, "loss": 0.74954569, "num_input_tokens_seen": 270539295, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 12541, "time_per_iteration": 2.490368366241455 }, { "auxiliary_loss_clip": 0.01103829, "auxiliary_loss_mlp": 0.01029106, "balance_loss_clip": 1.0167563, "balance_loss_mlp": 1.0347476, "epoch": 0.7540658349616715, "flos": 45478134478080.0, "grad_norm": 1.6562043244122995, "language_loss": 0.71960318, "learning_rate": 5.678685516620206e-07, "loss": 0.74093258, "num_input_tokens_seen": 270562815, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 12542, "time_per_iteration": 2.7382190227508545 }, { "auxiliary_loss_clip": 0.01119765, "auxiliary_loss_mlp": 0.01023582, "balance_loss_clip": 1.01139855, "balance_loss_mlp": 1.03366208, "epoch": 0.7541259582143394, "flos": 19930817105280.0, "grad_norm": 1.7665366969549472, "language_loss": 0.84421563, "learning_rate": 5.676048448168995e-07, "loss": 0.8656491, "num_input_tokens_seen": 270579055, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 12543, "time_per_iteration": 2.6258351802825928 }, { "auxiliary_loss_clip": 0.01133558, "auxiliary_loss_mlp": 0.01030734, "balance_loss_clip": 1.01832461, "balance_loss_mlp": 1.03943968, "epoch": 0.7541860814670074, "flos": 27198239299200.0, "grad_norm": 2.1914959877743443, "language_loss": 0.73689795, "learning_rate": 5.673411890902766e-07, "loss": 0.75854087, "num_input_tokens_seen": 270599080, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 12544, "time_per_iteration": 2.640260696411133 }, { "auxiliary_loss_clip": 0.01113746, "auxiliary_loss_mlp": 0.01031023, "balance_loss_clip": 1.01894677, "balance_loss_mlp": 1.03577161, "epoch": 0.7542462047196753, "flos": 21324151624320.0, "grad_norm": 1.9508377552786673, "language_loss": 0.68350297, "learning_rate": 5.670775844915607e-07, "loss": 0.70495069, "num_input_tokens_seen": 270618715, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 12545, "time_per_iteration": 2.528404712677002 }, { "auxiliary_loss_clip": 0.01121511, "auxiliary_loss_mlp": 0.01030831, "balance_loss_clip": 1.01893377, "balance_loss_mlp": 1.03323233, "epoch": 0.7543063279723433, "flos": 11940944664960.0, "grad_norm": 1.6975660198186593, "language_loss": 0.68816221, "learning_rate": 5.668140310301612e-07, "loss": 0.70968562, "num_input_tokens_seen": 270635695, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 12546, "time_per_iteration": 2.4877095222473145 }, { "auxiliary_loss_clip": 0.01112661, "auxiliary_loss_mlp": 0.01032927, "balance_loss_clip": 1.01960516, "balance_loss_mlp": 1.03412926, "epoch": 0.7543664512250112, "flos": 22819974624000.0, "grad_norm": 2.2943698823482603, "language_loss": 0.73889542, "learning_rate": 5.665505287154812e-07, "loss": 0.76035124, "num_input_tokens_seen": 270654325, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 12547, "time_per_iteration": 2.5543875694274902 }, { "auxiliary_loss_clip": 0.011031, "auxiliary_loss_mlp": 0.01030116, "balance_loss_clip": 1.01682377, "balance_loss_mlp": 1.03430653, "epoch": 0.7544265744776792, "flos": 20923855482240.0, "grad_norm": 1.6390806288516793, "language_loss": 0.6751771, "learning_rate": 5.662870775569262e-07, "loss": 0.69650924, "num_input_tokens_seen": 270674260, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6875, "step": 12548, "time_per_iteration": 2.544442653656006 }, { "auxiliary_loss_clip": 0.01123236, "auxiliary_loss_mlp": 0.01031424, "balance_loss_clip": 1.01860881, "balance_loss_mlp": 1.03457367, "epoch": 0.7544866977303473, "flos": 15195493284480.0, "grad_norm": 2.0366242379789186, "language_loss": 0.86949271, "learning_rate": 5.660236775638971e-07, "loss": 0.89103931, "num_input_tokens_seen": 270692200, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 12549, "time_per_iteration": 2.522045373916626 }, { "auxiliary_loss_clip": 0.01135258, "auxiliary_loss_mlp": 0.01032821, "balance_loss_clip": 1.02082801, "balance_loss_mlp": 1.03415608, "epoch": 0.7545468209830152, "flos": 27083683848960.0, "grad_norm": 1.685132033529304, "language_loss": 0.77108043, "learning_rate": 5.657603287457946e-07, "loss": 0.79276121, "num_input_tokens_seen": 270709675, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66015625, "step": 12550, "time_per_iteration": 2.616374969482422 }, { "auxiliary_loss_clip": 0.01104031, "auxiliary_loss_mlp": 0.01026926, "balance_loss_clip": 1.01514852, "balance_loss_mlp": 1.03399169, "epoch": 0.7546069442356832, "flos": 26871703735680.0, "grad_norm": 1.4843318633976263, "language_loss": 0.69875884, "learning_rate": 5.654970311120159e-07, "loss": 0.7200684, "num_input_tokens_seen": 270733055, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69921875, "step": 12551, "time_per_iteration": 2.6131067276000977 }, { "auxiliary_loss_clip": 0.01123258, "auxiliary_loss_mlp": 0.0103001, "balance_loss_clip": 1.01695061, "balance_loss_mlp": 1.03698874, "epoch": 0.7546670674883511, "flos": 15743131015680.0, "grad_norm": 2.4726913800922277, "language_loss": 0.86530364, "learning_rate": 5.65233784671959e-07, "loss": 0.88683629, "num_input_tokens_seen": 270749275, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 12552, "time_per_iteration": 2.5760884284973145 }, { "auxiliary_loss_clip": 0.01113592, "auxiliary_loss_mlp": 0.01031415, "balance_loss_clip": 1.01930964, "balance_loss_mlp": 1.03530502, "epoch": 0.7547271907410191, "flos": 23477714519040.0, "grad_norm": 2.044620265979936, "language_loss": 0.78058541, "learning_rate": 5.649705894350176e-07, "loss": 0.80203545, "num_input_tokens_seen": 270768230, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 12553, "time_per_iteration": 2.54097843170166 }, { "auxiliary_loss_clip": 0.01114622, "auxiliary_loss_mlp": 0.01031998, "balance_loss_clip": 1.0194515, "balance_loss_mlp": 1.03573203, "epoch": 0.754787313993687, "flos": 31722804069120.0, "grad_norm": 2.1535983174283855, "language_loss": 0.62281168, "learning_rate": 5.647074454105845e-07, "loss": 0.64427793, "num_input_tokens_seen": 270786285, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 12554, "time_per_iteration": 2.6012942790985107 }, { "auxiliary_loss_clip": 0.01122298, "auxiliary_loss_mlp": 0.01033857, "balance_loss_clip": 1.02150726, "balance_loss_mlp": 1.03496134, "epoch": 0.7548474372463551, "flos": 27563055782400.0, "grad_norm": 2.4794354320718295, "language_loss": 0.73538971, "learning_rate": 5.6444435260805e-07, "loss": 0.75695127, "num_input_tokens_seen": 270805505, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 12555, "time_per_iteration": 2.5835108757019043 }, { "auxiliary_loss_clip": 0.01125584, "auxiliary_loss_mlp": 0.01030566, "balance_loss_clip": 1.01812696, "balance_loss_mlp": 1.03714013, "epoch": 0.754907560499023, "flos": 19318576763520.0, "grad_norm": 2.112254981051235, "language_loss": 0.78552514, "learning_rate": 5.64181311036805e-07, "loss": 0.80708671, "num_input_tokens_seen": 270824610, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 12556, "time_per_iteration": 2.6143338680267334 }, { "auxiliary_loss_clip": 0.01033183, "auxiliary_loss_mlp": 0.01001558, "balance_loss_clip": 1.00027013, "balance_loss_mlp": 1.00287271, "epoch": 0.754967683751691, "flos": 69744628684800.0, "grad_norm": 0.7083345653847571, "language_loss": 0.50447112, "learning_rate": 5.639183207062346e-07, "loss": 0.52481848, "num_input_tokens_seen": 270886155, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.21582031, "step": 12557, "time_per_iteration": 3.1195642948150635 }, { "auxiliary_loss_clip": 0.01116936, "auxiliary_loss_mlp": 0.01032231, "balance_loss_clip": 1.01920772, "balance_loss_mlp": 1.03738856, "epoch": 0.7550278070043589, "flos": 24421913377920.0, "grad_norm": 1.6045749574127235, "language_loss": 0.711025, "learning_rate": 5.636553816257257e-07, "loss": 0.73251671, "num_input_tokens_seen": 270905325, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12558, "time_per_iteration": 2.535980463027954 }, { "auxiliary_loss_clip": 0.01121604, "auxiliary_loss_mlp": 0.01029383, "balance_loss_clip": 1.01688397, "balance_loss_mlp": 1.03476083, "epoch": 0.7550879302570269, "flos": 32634611838720.0, "grad_norm": 1.7046729492786148, "language_loss": 0.8009603, "learning_rate": 5.633924938046617e-07, "loss": 0.82247019, "num_input_tokens_seen": 270927535, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 12559, "time_per_iteration": 2.6289570331573486 }, { "auxiliary_loss_clip": 0.0114085, "auxiliary_loss_mlp": 0.01030317, "balance_loss_clip": 1.01725173, "balance_loss_mlp": 1.03464484, "epoch": 0.7551480535096948, "flos": 21795550738560.0, "grad_norm": 1.6880547750486825, "language_loss": 0.78634018, "learning_rate": 5.631296572524242e-07, "loss": 0.80805182, "num_input_tokens_seen": 270946920, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12560, "time_per_iteration": 4.153886079788208 }, { "auxiliary_loss_clip": 0.01136494, "auxiliary_loss_mlp": 0.01033341, "balance_loss_clip": 1.02135468, "balance_loss_mlp": 1.0340333, "epoch": 0.7552081767623628, "flos": 18515111391360.0, "grad_norm": 1.666823917509684, "language_loss": 0.70417506, "learning_rate": 5.628668719783931e-07, "loss": 0.72587341, "num_input_tokens_seen": 270965705, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 12561, "time_per_iteration": 2.544696569442749 }, { "auxiliary_loss_clip": 0.01120873, "auxiliary_loss_mlp": 0.01277583, "balance_loss_clip": 1.01844978, "balance_loss_mlp": 1.03362226, "epoch": 0.7552683000150308, "flos": 27634805199360.0, "grad_norm": 1.5902140879162727, "language_loss": 0.75624168, "learning_rate": 5.62604137991946e-07, "loss": 0.78022623, "num_input_tokens_seen": 270986550, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 12562, "time_per_iteration": 2.584412097930908 }, { "auxiliary_loss_clip": 0.01121009, "auxiliary_loss_mlp": 0.01029448, "balance_loss_clip": 1.01735985, "balance_loss_mlp": 1.03460002, "epoch": 0.7553284232676988, "flos": 20302924049280.0, "grad_norm": 3.604140003812585, "language_loss": 0.75929046, "learning_rate": 5.62341455302461e-07, "loss": 0.78079498, "num_input_tokens_seen": 271006250, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 12563, "time_per_iteration": 2.5543127059936523 }, { "auxiliary_loss_clip": 0.01139555, "auxiliary_loss_mlp": 0.01032993, "balance_loss_clip": 1.01799631, "balance_loss_mlp": 1.03624225, "epoch": 0.7553885465203668, "flos": 33255471444480.0, "grad_norm": 14.412194268435545, "language_loss": 0.67249209, "learning_rate": 5.620788239193102e-07, "loss": 0.69421756, "num_input_tokens_seen": 271025575, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.76171875, "step": 12564, "time_per_iteration": 2.6189496517181396 }, { "auxiliary_loss_clip": 0.01126204, "auxiliary_loss_mlp": 0.0103532, "balance_loss_clip": 1.02245116, "balance_loss_mlp": 1.03648949, "epoch": 0.7554486697730347, "flos": 21616249023360.0, "grad_norm": 1.9297236449331776, "language_loss": 0.68553007, "learning_rate": 5.618162438518678e-07, "loss": 0.70714533, "num_input_tokens_seen": 271045805, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 12565, "time_per_iteration": 2.5178592205047607 }, { "auxiliary_loss_clip": 0.01125174, "auxiliary_loss_mlp": 0.01029872, "balance_loss_clip": 1.0163182, "balance_loss_mlp": 1.03433204, "epoch": 0.7555087930257027, "flos": 27632973605760.0, "grad_norm": 1.7778906007721902, "language_loss": 0.75161034, "learning_rate": 5.615537151095044e-07, "loss": 0.77316082, "num_input_tokens_seen": 271066065, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 12566, "time_per_iteration": 3.929128408432007 }, { "auxiliary_loss_clip": 0.01117711, "auxiliary_loss_mlp": 0.01035956, "balance_loss_clip": 1.02157915, "balance_loss_mlp": 1.03687692, "epoch": 0.7555689162783706, "flos": 23621644316160.0, "grad_norm": 2.253028951245271, "language_loss": 0.73801428, "learning_rate": 5.612912377015886e-07, "loss": 0.75955093, "num_input_tokens_seen": 271085870, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.71875, "step": 12567, "time_per_iteration": 2.62453293800354 }, { "auxiliary_loss_clip": 0.01103678, "auxiliary_loss_mlp": 0.01027903, "balance_loss_clip": 1.01591063, "balance_loss_mlp": 1.03442407, "epoch": 0.7556290395310387, "flos": 24863076218880.0, "grad_norm": 2.5535858871002834, "language_loss": 0.63347805, "learning_rate": 5.610288116374873e-07, "loss": 0.65479386, "num_input_tokens_seen": 271104260, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 12568, "time_per_iteration": 2.5714752674102783 }, { "auxiliary_loss_clip": 0.01166052, "auxiliary_loss_mlp": 0.01032196, "balance_loss_clip": 1.02026927, "balance_loss_mlp": 1.03498256, "epoch": 0.7556891627837066, "flos": 43543770330240.0, "grad_norm": 2.511787327390024, "language_loss": 0.66264403, "learning_rate": 5.607664369265668e-07, "loss": 0.68462658, "num_input_tokens_seen": 271125745, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 12569, "time_per_iteration": 2.808485984802246 }, { "auxiliary_loss_clip": 0.01118479, "auxiliary_loss_mlp": 0.01038799, "balance_loss_clip": 1.02422571, "balance_loss_mlp": 1.03685725, "epoch": 0.7557492860363746, "flos": 26650924790400.0, "grad_norm": 4.224541257510619, "language_loss": 0.66847986, "learning_rate": 5.6050411357819e-07, "loss": 0.69005263, "num_input_tokens_seen": 271147145, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.73046875, "step": 12570, "time_per_iteration": 2.5870001316070557 }, { "auxiliary_loss_clip": 0.01105167, "auxiliary_loss_mlp": 0.01032137, "balance_loss_clip": 1.01907158, "balance_loss_mlp": 1.03507197, "epoch": 0.7558094092890425, "flos": 55182885010560.0, "grad_norm": 2.1516781163325898, "language_loss": 0.71615249, "learning_rate": 5.602418416017185e-07, "loss": 0.73752546, "num_input_tokens_seen": 271170865, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 12571, "time_per_iteration": 2.8762307167053223 }, { "auxiliary_loss_clip": 0.01142057, "auxiliary_loss_mlp": 0.01031969, "balance_loss_clip": 1.01900506, "balance_loss_mlp": 1.03479743, "epoch": 0.7558695325417105, "flos": 23988292392960.0, "grad_norm": 1.6304087055270848, "language_loss": 0.73464978, "learning_rate": 5.599796210065118e-07, "loss": 0.75638998, "num_input_tokens_seen": 271191450, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 12572, "time_per_iteration": 2.5912117958068848 }, { "auxiliary_loss_clip": 0.0112593, "auxiliary_loss_mlp": 0.01033746, "balance_loss_clip": 1.02131295, "balance_loss_mlp": 1.0375303, "epoch": 0.7559296557943784, "flos": 14611262572800.0, "grad_norm": 2.262153995272303, "language_loss": 0.76679528, "learning_rate": 5.597174518019292e-07, "loss": 0.78839201, "num_input_tokens_seen": 271207335, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 12573, "time_per_iteration": 4.039926290512085 }, { "auxiliary_loss_clip": 0.01135439, "auxiliary_loss_mlp": 0.0103537, "balance_loss_clip": 1.02232277, "balance_loss_mlp": 1.03628755, "epoch": 0.7559897790470465, "flos": 18550483309440.0, "grad_norm": 2.673461585254328, "language_loss": 0.68911195, "learning_rate": 5.594553339973254e-07, "loss": 0.71081996, "num_input_tokens_seen": 271226895, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 12574, "time_per_iteration": 3.9816927909851074 }, { "auxiliary_loss_clip": 0.01129577, "auxiliary_loss_mlp": 0.01034992, "balance_loss_clip": 1.02189112, "balance_loss_mlp": 1.03366601, "epoch": 0.7560499022997144, "flos": 17967868709760.0, "grad_norm": 2.577198935632697, "language_loss": 0.71392488, "learning_rate": 5.591932676020545e-07, "loss": 0.73557055, "num_input_tokens_seen": 271244375, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69140625, "step": 12575, "time_per_iteration": 2.5369458198547363 }, { "auxiliary_loss_clip": 0.01102147, "auxiliary_loss_mlp": 0.01280904, "balance_loss_clip": 1.02211034, "balance_loss_mlp": 1.0333395, "epoch": 0.7561100255523824, "flos": 15737815802880.0, "grad_norm": 3.100108305302492, "language_loss": 0.72583079, "learning_rate": 5.589312526254705e-07, "loss": 0.74966133, "num_input_tokens_seen": 271259530, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 12576, "time_per_iteration": 2.523970365524292 }, { "auxiliary_loss_clip": 0.01129907, "auxiliary_loss_mlp": 0.01033426, "balance_loss_clip": 1.02068901, "balance_loss_mlp": 1.03525472, "epoch": 0.7561701488050504, "flos": 15888102307200.0, "grad_norm": 1.8136009335889591, "language_loss": 0.6738261, "learning_rate": 5.586692890769231e-07, "loss": 0.69545937, "num_input_tokens_seen": 271276835, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 12577, "time_per_iteration": 2.581774950027466 }, { "auxiliary_loss_clip": 0.01121345, "auxiliary_loss_mlp": 0.01037304, "balance_loss_clip": 1.02485251, "balance_loss_mlp": 1.03428388, "epoch": 0.7562302720577183, "flos": 20339157893760.0, "grad_norm": 2.4584677880962538, "language_loss": 0.78341544, "learning_rate": 5.584073769657613e-07, "loss": 0.80500185, "num_input_tokens_seen": 271296275, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 12578, "time_per_iteration": 2.7229716777801514 }, { "auxiliary_loss_clip": 0.01113263, "auxiliary_loss_mlp": 0.01034012, "balance_loss_clip": 1.02128625, "balance_loss_mlp": 1.03390145, "epoch": 0.7562903953103863, "flos": 20812209033600.0, "grad_norm": 1.669556305831909, "language_loss": 0.75973308, "learning_rate": 5.581455163013314e-07, "loss": 0.78120577, "num_input_tokens_seen": 271315685, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 12579, "time_per_iteration": 2.567152976989746 }, { "auxiliary_loss_clip": 0.01147404, "auxiliary_loss_mlp": 0.01034259, "balance_loss_clip": 1.02027023, "balance_loss_mlp": 1.03668439, "epoch": 0.7563505185630542, "flos": 37596999484800.0, "grad_norm": 1.78753114042843, "language_loss": 0.62601954, "learning_rate": 5.5788370709298e-07, "loss": 0.64783621, "num_input_tokens_seen": 271336790, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 12580, "time_per_iteration": 2.7083849906921387 }, { "auxiliary_loss_clip": 0.01142038, "auxiliary_loss_mlp": 0.01032274, "balance_loss_clip": 1.0193696, "balance_loss_mlp": 1.03705692, "epoch": 0.7564106418157223, "flos": 20230995064320.0, "grad_norm": 1.7245066652746999, "language_loss": 0.75076354, "learning_rate": 5.576219493500487e-07, "loss": 0.77250671, "num_input_tokens_seen": 271355470, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 12581, "time_per_iteration": 2.579272985458374 }, { "auxiliary_loss_clip": 0.01133516, "auxiliary_loss_mlp": 0.01031846, "balance_loss_clip": 1.01933479, "balance_loss_mlp": 1.03523421, "epoch": 0.7564707650683902, "flos": 24754877475840.0, "grad_norm": 1.91218327272038, "language_loss": 0.63288152, "learning_rate": 5.573602430818803e-07, "loss": 0.65453511, "num_input_tokens_seen": 271375810, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 12582, "time_per_iteration": 2.5905847549438477 }, { "auxiliary_loss_clip": 0.01024617, "auxiliary_loss_mlp": 0.00999794, "balance_loss_clip": 0.99854833, "balance_loss_mlp": 1.00310755, "epoch": 0.7565308883210582, "flos": 48530076433920.0, "grad_norm": 0.9535426116252613, "language_loss": 0.60791934, "learning_rate": 5.570985882978139e-07, "loss": 0.62816346, "num_input_tokens_seen": 271424775, "router_z_loss_clip": 0.01245117, "router_z_loss_mlp": 0.21484375, "step": 12583, "time_per_iteration": 2.8534018993377686 }, { "auxiliary_loss_clip": 0.01124928, "auxiliary_loss_mlp": 0.01035183, "balance_loss_clip": 1.02234387, "balance_loss_mlp": 1.03615785, "epoch": 0.7565910115737261, "flos": 12495082757760.0, "grad_norm": 2.0892232177328096, "language_loss": 0.78951615, "learning_rate": 5.568369850071872e-07, "loss": 0.81111729, "num_input_tokens_seen": 271440500, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 12584, "time_per_iteration": 2.5023739337921143 }, { "auxiliary_loss_clip": 0.01121452, "auxiliary_loss_mlp": 0.01029874, "balance_loss_clip": 1.01765513, "balance_loss_mlp": 1.03585672, "epoch": 0.7566511348263941, "flos": 21173003193600.0, "grad_norm": 1.8362545689088674, "language_loss": 0.77686286, "learning_rate": 5.565754332193357e-07, "loss": 0.79837614, "num_input_tokens_seen": 271458180, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 12585, "time_per_iteration": 2.5351171493530273 }, { "auxiliary_loss_clip": 0.01109918, "auxiliary_loss_mlp": 0.01038684, "balance_loss_clip": 1.0250349, "balance_loss_mlp": 1.03792322, "epoch": 0.756711258079062, "flos": 21754827694080.0, "grad_norm": 2.323021372181212, "language_loss": 0.82937288, "learning_rate": 5.563139329435948e-07, "loss": 0.85085893, "num_input_tokens_seen": 271475730, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 12586, "time_per_iteration": 2.5063538551330566 }, { "auxiliary_loss_clip": 0.01109405, "auxiliary_loss_mlp": 0.01031953, "balance_loss_clip": 1.01965046, "balance_loss_mlp": 1.03712416, "epoch": 0.75677138133173, "flos": 22382905933440.0, "grad_norm": 2.212211166482523, "language_loss": 0.83496547, "learning_rate": 5.560524841892959e-07, "loss": 0.85637903, "num_input_tokens_seen": 271495030, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.72265625, "step": 12587, "time_per_iteration": 2.483198404312134 }, { "auxiliary_loss_clip": 0.01131147, "auxiliary_loss_mlp": 0.01027612, "balance_loss_clip": 1.01505315, "balance_loss_mlp": 1.03529942, "epoch": 0.756831504584398, "flos": 22708974620160.0, "grad_norm": 1.5133052510371094, "language_loss": 0.71425164, "learning_rate": 5.557910869657696e-07, "loss": 0.73583925, "num_input_tokens_seen": 271515355, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12588, "time_per_iteration": 2.5597453117370605 }, { "auxiliary_loss_clip": 0.01108424, "auxiliary_loss_mlp": 0.01277965, "balance_loss_clip": 1.01808465, "balance_loss_mlp": 1.03685594, "epoch": 0.756891627837066, "flos": 24098358643200.0, "grad_norm": 4.029648565451226, "language_loss": 0.6838097, "learning_rate": 5.555297412823444e-07, "loss": 0.70767355, "num_input_tokens_seen": 271535090, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 12589, "time_per_iteration": 2.581064224243164 }, { "auxiliary_loss_clip": 0.01112871, "auxiliary_loss_mlp": 0.01028588, "balance_loss_clip": 1.01563609, "balance_loss_mlp": 1.03419709, "epoch": 0.756951751089734, "flos": 19749001438080.0, "grad_norm": 1.6224135209580597, "language_loss": 0.92357886, "learning_rate": 5.552684471483471e-07, "loss": 0.9449935, "num_input_tokens_seen": 271551075, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 12590, "time_per_iteration": 2.609304189682007 }, { "auxiliary_loss_clip": 0.01119782, "auxiliary_loss_mlp": 0.01032129, "balance_loss_clip": 1.01963615, "balance_loss_mlp": 1.03410506, "epoch": 0.7570118743424019, "flos": 35079266551680.0, "grad_norm": 1.7975908750926577, "language_loss": 0.6531539, "learning_rate": 5.550072045731027e-07, "loss": 0.67467296, "num_input_tokens_seen": 271571035, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 12591, "time_per_iteration": 2.7140748500823975 }, { "auxiliary_loss_clip": 0.01113391, "auxiliary_loss_mlp": 0.01025158, "balance_loss_clip": 1.01344538, "balance_loss_mlp": 1.03645837, "epoch": 0.7570719975950699, "flos": 25594540778880.0, "grad_norm": 1.6688663769440464, "language_loss": 0.73323238, "learning_rate": 5.547460135659336e-07, "loss": 0.75461781, "num_input_tokens_seen": 271592950, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 12592, "time_per_iteration": 2.617475986480713 }, { "auxiliary_loss_clip": 0.01114767, "auxiliary_loss_mlp": 0.01033534, "balance_loss_clip": 1.02092123, "balance_loss_mlp": 1.03445315, "epoch": 0.7571321208477378, "flos": 10816223028480.0, "grad_norm": 2.063388893958438, "language_loss": 0.71616715, "learning_rate": 5.544848741361627e-07, "loss": 0.73765016, "num_input_tokens_seen": 271608835, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 12593, "time_per_iteration": 2.565877676010132 }, { "auxiliary_loss_clip": 0.01107928, "auxiliary_loss_mlp": 0.01031693, "balance_loss_clip": 1.01947403, "balance_loss_mlp": 1.03642106, "epoch": 0.7571922441004059, "flos": 18260109763200.0, "grad_norm": 2.6859256851285562, "language_loss": 0.6648531, "learning_rate": 5.542237862931074e-07, "loss": 0.68624926, "num_input_tokens_seen": 271627730, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71484375, "step": 12594, "time_per_iteration": 2.544008255004883 }, { "auxiliary_loss_clip": 0.01134754, "auxiliary_loss_mlp": 0.01035787, "balance_loss_clip": 1.02324009, "balance_loss_mlp": 1.03606033, "epoch": 0.7572523673530738, "flos": 22890502978560.0, "grad_norm": 1.6253974895395467, "language_loss": 0.80717552, "learning_rate": 5.539627500460866e-07, "loss": 0.82888091, "num_input_tokens_seen": 271646415, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 12595, "time_per_iteration": 2.744553804397583 }, { "auxiliary_loss_clip": 0.01131872, "auxiliary_loss_mlp": 0.01033056, "balance_loss_clip": 1.02089036, "balance_loss_mlp": 1.03658104, "epoch": 0.7573124906057418, "flos": 20996323171200.0, "grad_norm": 1.9001172163347235, "language_loss": 0.71410686, "learning_rate": 5.537017654044152e-07, "loss": 0.73575616, "num_input_tokens_seen": 271666240, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 12596, "time_per_iteration": 2.6208651065826416 }, { "auxiliary_loss_clip": 0.01173689, "auxiliary_loss_mlp": 0.01031389, "balance_loss_clip": 1.01954603, "balance_loss_mlp": 1.03477073, "epoch": 0.7573726138584097, "flos": 20886292834560.0, "grad_norm": 1.665929230273102, "language_loss": 0.8029207, "learning_rate": 5.534408323774085e-07, "loss": 0.8249715, "num_input_tokens_seen": 271686370, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 12597, "time_per_iteration": 2.6539242267608643 }, { "auxiliary_loss_clip": 0.01138625, "auxiliary_loss_mlp": 0.01031893, "balance_loss_clip": 1.01861894, "balance_loss_mlp": 1.03919256, "epoch": 0.7574327371110777, "flos": 24530507170560.0, "grad_norm": 1.600719570525996, "language_loss": 0.82802194, "learning_rate": 5.531799509743762e-07, "loss": 0.84972715, "num_input_tokens_seen": 271705050, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 12598, "time_per_iteration": 2.5672972202301025 }, { "auxiliary_loss_clip": 0.01122449, "auxiliary_loss_mlp": 0.0102444, "balance_loss_clip": 1.012954, "balance_loss_mlp": 1.03639805, "epoch": 0.7574928603637456, "flos": 23364523785600.0, "grad_norm": 1.6884456467004958, "language_loss": 0.62487543, "learning_rate": 5.529191212046305e-07, "loss": 0.6463443, "num_input_tokens_seen": 271724915, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 12599, "time_per_iteration": 2.52404522895813 }, { "auxiliary_loss_clip": 0.01126903, "auxiliary_loss_mlp": 0.01279714, "balance_loss_clip": 1.01886344, "balance_loss_mlp": 1.03623438, "epoch": 0.7575529836164137, "flos": 13516274419200.0, "grad_norm": 2.4961446351248995, "language_loss": 0.6320309, "learning_rate": 5.52658343077479e-07, "loss": 0.65609711, "num_input_tokens_seen": 271742410, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 12600, "time_per_iteration": 2.5545401573181152 }, { "auxiliary_loss_clip": 0.01121432, "auxiliary_loss_mlp": 0.01029561, "balance_loss_clip": 1.01731777, "balance_loss_mlp": 1.03557432, "epoch": 0.7576131068690816, "flos": 19646584784640.0, "grad_norm": 2.0102352033811184, "language_loss": 0.66237122, "learning_rate": 5.523976166022282e-07, "loss": 0.68388116, "num_input_tokens_seen": 271761425, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 12601, "time_per_iteration": 2.5401546955108643 }, { "auxiliary_loss_clip": 0.01120747, "auxiliary_loss_mlp": 0.01029052, "balance_loss_clip": 1.01728606, "balance_loss_mlp": 1.03521299, "epoch": 0.7576732301217496, "flos": 20048245643520.0, "grad_norm": 2.16238573330673, "language_loss": 0.67797887, "learning_rate": 5.521369417881823e-07, "loss": 0.69947684, "num_input_tokens_seen": 271780875, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.671875, "step": 12602, "time_per_iteration": 3.974874258041382 }, { "auxiliary_loss_clip": 0.01115646, "auxiliary_loss_mlp": 0.01031926, "balance_loss_clip": 1.01893854, "balance_loss_mlp": 1.03575826, "epoch": 0.7577333533744176, "flos": 15377093470080.0, "grad_norm": 1.7737895378321518, "language_loss": 0.66876459, "learning_rate": 5.518763186446451e-07, "loss": 0.69024026, "num_input_tokens_seen": 271799490, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 12603, "time_per_iteration": 2.538630723953247 }, { "auxiliary_loss_clip": 0.01118802, "auxiliary_loss_mlp": 0.01031928, "balance_loss_clip": 1.02043009, "balance_loss_mlp": 1.03289723, "epoch": 0.7577934766270855, "flos": 17894862316800.0, "grad_norm": 9.893023869124727, "language_loss": 0.61700284, "learning_rate": 5.516157471809178e-07, "loss": 0.63851017, "num_input_tokens_seen": 271817040, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 12604, "time_per_iteration": 2.543255090713501 }, { "auxiliary_loss_clip": 0.0110705, "auxiliary_loss_mlp": 0.01034986, "balance_loss_clip": 1.02153969, "balance_loss_mlp": 1.03798509, "epoch": 0.7578535998797535, "flos": 21613770984960.0, "grad_norm": 2.01721349041137, "language_loss": 0.80015242, "learning_rate": 5.513552274062974e-07, "loss": 0.82157278, "num_input_tokens_seen": 271835480, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69140625, "step": 12605, "time_per_iteration": 2.6068005561828613 }, { "auxiliary_loss_clip": 0.0110877, "auxiliary_loss_mlp": 0.01028134, "balance_loss_clip": 1.0147469, "balance_loss_mlp": 1.03588939, "epoch": 0.7579137231324214, "flos": 18478374756480.0, "grad_norm": 3.464166283298906, "language_loss": 0.79676604, "learning_rate": 5.510947593300832e-07, "loss": 0.81813514, "num_input_tokens_seen": 271849835, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 12606, "time_per_iteration": 2.590364694595337 }, { "auxiliary_loss_clip": 0.0111145, "auxiliary_loss_mlp": 0.01027708, "balance_loss_clip": 1.01675868, "balance_loss_mlp": 1.03581512, "epoch": 0.7579738463850895, "flos": 23255032152960.0, "grad_norm": 1.4380362128234128, "language_loss": 0.72925663, "learning_rate": 5.508343429615703e-07, "loss": 0.7506482, "num_input_tokens_seen": 271869560, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6640625, "step": 12607, "time_per_iteration": 3.9668705463409424 }, { "auxiliary_loss_clip": 0.01125758, "auxiliary_loss_mlp": 0.01031524, "balance_loss_clip": 1.0184412, "balance_loss_mlp": 1.03539252, "epoch": 0.7580339696377574, "flos": 14027031861120.0, "grad_norm": 2.200614827396965, "language_loss": 0.75281084, "learning_rate": 5.505739783100516e-07, "loss": 0.77438366, "num_input_tokens_seen": 271887950, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 12608, "time_per_iteration": 2.5384721755981445 }, { "auxiliary_loss_clip": 0.01129377, "auxiliary_loss_mlp": 0.01280306, "balance_loss_clip": 1.02098346, "balance_loss_mlp": 1.03469706, "epoch": 0.7580940928904254, "flos": 25082777756160.0, "grad_norm": 1.5976667306077306, "language_loss": 0.71658826, "learning_rate": 5.503136653848188e-07, "loss": 0.74068511, "num_input_tokens_seen": 271907700, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 12609, "time_per_iteration": 2.5811610221862793 }, { "auxiliary_loss_clip": 0.01106822, "auxiliary_loss_mlp": 0.01031291, "balance_loss_clip": 1.01760554, "balance_loss_mlp": 1.03513145, "epoch": 0.7581542161430933, "flos": 23836425690240.0, "grad_norm": 1.9172822212286804, "language_loss": 0.8153137, "learning_rate": 5.500534041951637e-07, "loss": 0.83669484, "num_input_tokens_seen": 271926840, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 12610, "time_per_iteration": 2.514037609100342 }, { "auxiliary_loss_clip": 0.0112644, "auxiliary_loss_mlp": 0.0103171, "balance_loss_clip": 1.01868081, "balance_loss_mlp": 1.03636372, "epoch": 0.7582143393957613, "flos": 22237000888320.0, "grad_norm": 1.636882371403364, "language_loss": 0.7051506, "learning_rate": 5.497931947503713e-07, "loss": 0.72673213, "num_input_tokens_seen": 271946465, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 12611, "time_per_iteration": 2.5155065059661865 }, { "auxiliary_loss_clip": 0.01104874, "auxiliary_loss_mlp": 0.01029452, "balance_loss_clip": 1.01671457, "balance_loss_mlp": 1.03562641, "epoch": 0.7582744626484292, "flos": 21106389421440.0, "grad_norm": 1.5107263781045257, "language_loss": 0.70706975, "learning_rate": 5.495330370597302e-07, "loss": 0.72841299, "num_input_tokens_seen": 271967295, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 12612, "time_per_iteration": 2.5720574855804443 }, { "auxiliary_loss_clip": 0.01151502, "auxiliary_loss_mlp": 0.01038411, "balance_loss_clip": 1.02641273, "balance_loss_mlp": 1.03588665, "epoch": 0.7583345859010973, "flos": 24604770539520.0, "grad_norm": 1.7914758210278017, "language_loss": 0.59660578, "learning_rate": 5.492729311325232e-07, "loss": 0.61850494, "num_input_tokens_seen": 271987960, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 12613, "time_per_iteration": 2.6236727237701416 }, { "auxiliary_loss_clip": 0.01123562, "auxiliary_loss_mlp": 0.01035107, "balance_loss_clip": 1.02237499, "balance_loss_mlp": 1.03555477, "epoch": 0.7583947091537652, "flos": 33546814657920.0, "grad_norm": 1.6297291466471866, "language_loss": 0.59708846, "learning_rate": 5.490128769780351e-07, "loss": 0.61867511, "num_input_tokens_seen": 272011780, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 12614, "time_per_iteration": 2.6118500232696533 }, { "auxiliary_loss_clip": 0.0113223, "auxiliary_loss_mlp": 0.01028917, "balance_loss_clip": 1.01593554, "balance_loss_mlp": 1.03444576, "epoch": 0.7584548324064332, "flos": 20121000641280.0, "grad_norm": 1.6998488388657933, "language_loss": 0.73357636, "learning_rate": 5.487528746055436e-07, "loss": 0.75518787, "num_input_tokens_seen": 272030825, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12615, "time_per_iteration": 5.505589246749878 }, { "auxiliary_loss_clip": 0.01043488, "auxiliary_loss_mlp": 0.0099977, "balance_loss_clip": 0.99823248, "balance_loss_mlp": 1.00443995, "epoch": 0.7585149556591012, "flos": 70402584061440.0, "grad_norm": 0.8051971449552078, "language_loss": 0.67711562, "learning_rate": 5.484929240243294e-07, "loss": 0.69754821, "num_input_tokens_seen": 272095825, "router_z_loss_clip": 0.01531982, "router_z_loss_mlp": 0.21484375, "step": 12616, "time_per_iteration": 3.16146183013916 }, { "auxiliary_loss_clip": 0.01135374, "auxiliary_loss_mlp": 0.0103523, "balance_loss_clip": 1.02215242, "balance_loss_mlp": 1.03584695, "epoch": 0.7585750789117691, "flos": 16143786293760.0, "grad_norm": 1.8327654200787438, "language_loss": 0.84940398, "learning_rate": 5.482330252436693e-07, "loss": 0.87110996, "num_input_tokens_seen": 272113950, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 12617, "time_per_iteration": 2.5487117767333984 }, { "auxiliary_loss_clip": 0.01133873, "auxiliary_loss_mlp": 0.01030106, "balance_loss_clip": 1.01822627, "balance_loss_mlp": 1.03734386, "epoch": 0.7586352021644371, "flos": 17493165544320.0, "grad_norm": 5.002217914342775, "language_loss": 0.74901128, "learning_rate": 5.479731782728381e-07, "loss": 0.7706511, "num_input_tokens_seen": 272130315, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.703125, "step": 12618, "time_per_iteration": 2.5312230587005615 }, { "auxiliary_loss_clip": 0.01128771, "auxiliary_loss_mlp": 0.0103645, "balance_loss_clip": 1.02299714, "balance_loss_mlp": 1.03652573, "epoch": 0.758695325417105, "flos": 17275187859840.0, "grad_norm": 2.7885341821207645, "language_loss": 0.77212501, "learning_rate": 5.477133831211091e-07, "loss": 0.79377723, "num_input_tokens_seen": 272149080, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 12619, "time_per_iteration": 2.5474183559417725 }, { "auxiliary_loss_clip": 0.01131593, "auxiliary_loss_mlp": 0.01034201, "balance_loss_clip": 1.02127898, "balance_loss_mlp": 1.03517771, "epoch": 0.7587554486697731, "flos": 29495660163840.0, "grad_norm": 1.6980684879920147, "language_loss": 0.82217246, "learning_rate": 5.474536397977529e-07, "loss": 0.84383041, "num_input_tokens_seen": 272168285, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 12620, "time_per_iteration": 2.65039324760437 }, { "auxiliary_loss_clip": 0.01125895, "auxiliary_loss_mlp": 0.01038447, "balance_loss_clip": 1.02429664, "balance_loss_mlp": 1.03556418, "epoch": 0.758815571922441, "flos": 16100800692480.0, "grad_norm": 3.4517624088042367, "language_loss": 0.82451826, "learning_rate": 5.471939483120413e-07, "loss": 0.84616166, "num_input_tokens_seen": 272184585, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7265625, "step": 12621, "time_per_iteration": 2.6257739067077637 }, { "auxiliary_loss_clip": 0.01124604, "auxiliary_loss_mlp": 0.01033279, "balance_loss_clip": 1.02057779, "balance_loss_mlp": 1.03674626, "epoch": 0.758875695175109, "flos": 16143714466560.0, "grad_norm": 2.071330870143761, "language_loss": 0.73205411, "learning_rate": 5.469343086732396e-07, "loss": 0.75363296, "num_input_tokens_seen": 272200205, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12622, "time_per_iteration": 2.5233945846557617 }, { "auxiliary_loss_clip": 0.0103397, "auxiliary_loss_mlp": 0.00998851, "balance_loss_clip": 0.99744433, "balance_loss_mlp": 1.00366211, "epoch": 0.7589358184277769, "flos": 68462006860800.0, "grad_norm": 0.8463746586095935, "language_loss": 0.60888994, "learning_rate": 5.466747208906151e-07, "loss": 0.6292181, "num_input_tokens_seen": 272259670, "router_z_loss_clip": 0.01403809, "router_z_loss_mlp": 0.21679688, "step": 12623, "time_per_iteration": 3.189070463180542 }, { "auxiliary_loss_clip": 0.01103807, "auxiliary_loss_mlp": 0.01034188, "balance_loss_clip": 1.022488, "balance_loss_mlp": 1.03611517, "epoch": 0.7589959416804449, "flos": 20047311889920.0, "grad_norm": 1.9871573385432961, "language_loss": 0.67554241, "learning_rate": 5.464151849734313e-07, "loss": 0.69692242, "num_input_tokens_seen": 272277925, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.67578125, "step": 12624, "time_per_iteration": 2.5216238498687744 }, { "auxiliary_loss_clip": 0.01103291, "auxiliary_loss_mlp": 0.01027529, "balance_loss_clip": 1.0148927, "balance_loss_mlp": 1.0353018, "epoch": 0.7590560649331128, "flos": 18771800958720.0, "grad_norm": 1.7718801803947404, "language_loss": 0.76503456, "learning_rate": 5.461557009309507e-07, "loss": 0.7863428, "num_input_tokens_seen": 272296010, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 12625, "time_per_iteration": 2.4677624702453613 }, { "auxiliary_loss_clip": 0.01122625, "auxiliary_loss_mlp": 0.01036062, "balance_loss_clip": 1.02327645, "balance_loss_mlp": 1.03675199, "epoch": 0.7591161881857809, "flos": 29825284296960.0, "grad_norm": 2.0013482273807073, "language_loss": 0.63008159, "learning_rate": 5.458962687724327e-07, "loss": 0.65166843, "num_input_tokens_seen": 272318330, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 12626, "time_per_iteration": 2.6104934215545654 }, { "auxiliary_loss_clip": 0.01142589, "auxiliary_loss_mlp": 0.0128954, "balance_loss_clip": 1.02920961, "balance_loss_mlp": 1.03681087, "epoch": 0.7591763114384488, "flos": 20302708567680.0, "grad_norm": 4.137211059473716, "language_loss": 0.73570466, "learning_rate": 5.456368885071377e-07, "loss": 0.76002598, "num_input_tokens_seen": 272335265, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 12627, "time_per_iteration": 2.518848180770874 }, { "auxiliary_loss_clip": 0.01115525, "auxiliary_loss_mlp": 0.01031733, "balance_loss_clip": 1.01909065, "balance_loss_mlp": 1.03608, "epoch": 0.7592364346911168, "flos": 20813609664000.0, "grad_norm": 2.263911325788374, "language_loss": 0.68306077, "learning_rate": 5.453775601443198e-07, "loss": 0.70453328, "num_input_tokens_seen": 272354795, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 12628, "time_per_iteration": 2.5175576210021973 }, { "auxiliary_loss_clip": 0.01123778, "auxiliary_loss_mlp": 0.01037218, "balance_loss_clip": 1.02303839, "balance_loss_mlp": 1.03947449, "epoch": 0.7592965579437848, "flos": 21251504367360.0, "grad_norm": 3.1019998812851757, "language_loss": 0.63016689, "learning_rate": 5.451182836932357e-07, "loss": 0.65177691, "num_input_tokens_seen": 272372875, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75, "step": 12629, "time_per_iteration": 2.552704095840454 }, { "auxiliary_loss_clip": 0.01116831, "auxiliary_loss_mlp": 0.0103056, "balance_loss_clip": 1.01813853, "balance_loss_mlp": 1.03269362, "epoch": 0.7593566811964527, "flos": 26213604704640.0, "grad_norm": 1.4366520209881246, "language_loss": 0.77431607, "learning_rate": 5.448590591631371e-07, "loss": 0.79579002, "num_input_tokens_seen": 272394715, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6640625, "step": 12630, "time_per_iteration": 2.633056879043579 }, { "auxiliary_loss_clip": 0.01025401, "auxiliary_loss_mlp": 0.01001336, "balance_loss_clip": 0.9998402, "balance_loss_mlp": 1.00393343, "epoch": 0.7594168044491207, "flos": 71237255374080.0, "grad_norm": 0.820200471030131, "language_loss": 0.61536151, "learning_rate": 5.445998865632766e-07, "loss": 0.63562888, "num_input_tokens_seen": 272458775, "router_z_loss_clip": 0.01495361, "router_z_loss_mlp": 0.21484375, "step": 12631, "time_per_iteration": 3.168328046798706 }, { "auxiliary_loss_clip": 0.01138431, "auxiliary_loss_mlp": 0.01284297, "balance_loss_clip": 1.02241147, "balance_loss_mlp": 1.03738225, "epoch": 0.7594769277017887, "flos": 26613326229120.0, "grad_norm": 1.6229061246647452, "language_loss": 0.73817372, "learning_rate": 5.443407659029013e-07, "loss": 0.76240098, "num_input_tokens_seen": 272479355, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.74609375, "step": 12632, "time_per_iteration": 2.5687620639801025 }, { "auxiliary_loss_clip": 0.01124797, "auxiliary_loss_mlp": 0.01031449, "balance_loss_clip": 1.01825213, "balance_loss_mlp": 1.03670311, "epoch": 0.7595370509544567, "flos": 17595941333760.0, "grad_norm": 2.40452613917319, "language_loss": 0.7452594, "learning_rate": 5.440816971912605e-07, "loss": 0.76682186, "num_input_tokens_seen": 272493555, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 12633, "time_per_iteration": 2.4716475009918213 }, { "auxiliary_loss_clip": 0.01122912, "auxiliary_loss_mlp": 0.01028763, "balance_loss_clip": 1.01676452, "balance_loss_mlp": 1.03606892, "epoch": 0.7595971742071246, "flos": 18002953319040.0, "grad_norm": 1.823687597328671, "language_loss": 0.73353159, "learning_rate": 5.438226804375991e-07, "loss": 0.75504839, "num_input_tokens_seen": 272508925, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 12634, "time_per_iteration": 2.492053508758545 }, { "auxiliary_loss_clip": 0.01107121, "auxiliary_loss_mlp": 0.01032484, "balance_loss_clip": 1.01947236, "balance_loss_mlp": 1.03711665, "epoch": 0.7596572974597926, "flos": 28840326480000.0, "grad_norm": 1.6733437938584197, "language_loss": 0.64798826, "learning_rate": 5.435637156511597e-07, "loss": 0.66938436, "num_input_tokens_seen": 272528805, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 12635, "time_per_iteration": 2.557587146759033 }, { "auxiliary_loss_clip": 0.01144379, "auxiliary_loss_mlp": 0.01032698, "balance_loss_clip": 1.01959074, "balance_loss_mlp": 1.03595841, "epoch": 0.7597174207124605, "flos": 14282823588480.0, "grad_norm": 1.7520922985493022, "language_loss": 0.6896733, "learning_rate": 5.43304802841185e-07, "loss": 0.71144408, "num_input_tokens_seen": 272546655, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 12636, "time_per_iteration": 2.6043546199798584 }, { "auxiliary_loss_clip": 0.0112651, "auxiliary_loss_mlp": 0.01034894, "balance_loss_clip": 1.0225203, "balance_loss_mlp": 1.0372026, "epoch": 0.7597775439651285, "flos": 21688932193920.0, "grad_norm": 2.02037955258398, "language_loss": 0.81402725, "learning_rate": 5.430459420169154e-07, "loss": 0.83564126, "num_input_tokens_seen": 272564010, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 12637, "time_per_iteration": 2.6246979236602783 }, { "auxiliary_loss_clip": 0.0111256, "auxiliary_loss_mlp": 0.01033311, "balance_loss_clip": 1.02125847, "balance_loss_mlp": 1.0333097, "epoch": 0.7598376672177964, "flos": 36101248312320.0, "grad_norm": 1.7540365081061025, "language_loss": 0.66779834, "learning_rate": 5.42787133187588e-07, "loss": 0.68925709, "num_input_tokens_seen": 272585840, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.703125, "step": 12638, "time_per_iteration": 2.701946973800659 }, { "auxiliary_loss_clip": 0.01107295, "auxiliary_loss_mlp": 0.01039066, "balance_loss_clip": 1.02463019, "balance_loss_mlp": 1.0363605, "epoch": 0.7598977904704645, "flos": 18332326056960.0, "grad_norm": 2.0109293280470077, "language_loss": 0.65045828, "learning_rate": 5.425283763624388e-07, "loss": 0.67192185, "num_input_tokens_seen": 272602300, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7109375, "step": 12639, "time_per_iteration": 2.4898393154144287 }, { "auxiliary_loss_clip": 0.01114285, "auxiliary_loss_mlp": 0.01031868, "balance_loss_clip": 1.01863587, "balance_loss_mlp": 1.03592718, "epoch": 0.7599579137231324, "flos": 20192642317440.0, "grad_norm": 1.9880127011475965, "language_loss": 0.69865716, "learning_rate": 5.422696715507036e-07, "loss": 0.72011864, "num_input_tokens_seen": 272619595, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 12640, "time_per_iteration": 2.551833391189575 }, { "auxiliary_loss_clip": 0.01127495, "auxiliary_loss_mlp": 0.0103454, "balance_loss_clip": 1.02081943, "balance_loss_mlp": 1.03725171, "epoch": 0.7600180369758004, "flos": 24024849459840.0, "grad_norm": 1.5954927458630521, "language_loss": 0.66931403, "learning_rate": 5.420110187616138e-07, "loss": 0.69093436, "num_input_tokens_seen": 272638825, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 12641, "time_per_iteration": 2.589448928833008 }, { "auxiliary_loss_clip": 0.01114478, "auxiliary_loss_mlp": 0.01035639, "balance_loss_clip": 1.02232325, "balance_loss_mlp": 1.03401077, "epoch": 0.7600781602284684, "flos": 18989527248000.0, "grad_norm": 2.2609144171483258, "language_loss": 0.66845536, "learning_rate": 5.417524180044007e-07, "loss": 0.68995655, "num_input_tokens_seen": 272657240, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 12642, "time_per_iteration": 2.545835494995117 }, { "auxiliary_loss_clip": 0.01123145, "auxiliary_loss_mlp": 0.01034629, "balance_loss_clip": 1.02234435, "balance_loss_mlp": 1.03786087, "epoch": 0.7601382834811363, "flos": 26067520091520.0, "grad_norm": 2.008562093794108, "language_loss": 0.75356525, "learning_rate": 5.414938692882918e-07, "loss": 0.77514303, "num_input_tokens_seen": 272677520, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.67578125, "step": 12643, "time_per_iteration": 4.001697778701782 }, { "auxiliary_loss_clip": 0.01133689, "auxiliary_loss_mlp": 0.01037355, "balance_loss_clip": 1.02365756, "balance_loss_mlp": 1.03615749, "epoch": 0.7601984067338043, "flos": 18844232734080.0, "grad_norm": 1.9455949637412895, "language_loss": 0.79236996, "learning_rate": 5.412353726225165e-07, "loss": 0.81408036, "num_input_tokens_seen": 272696770, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 12644, "time_per_iteration": 2.512956142425537 }, { "auxiliary_loss_clip": 0.01122619, "auxiliary_loss_mlp": 0.01029486, "balance_loss_clip": 1.01751089, "balance_loss_mlp": 1.034446, "epoch": 0.7602585299864723, "flos": 24646391424000.0, "grad_norm": 1.5327610317987164, "language_loss": 0.80353415, "learning_rate": 5.409769280162971e-07, "loss": 0.82505512, "num_input_tokens_seen": 272718340, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69921875, "step": 12645, "time_per_iteration": 2.593092441558838 }, { "auxiliary_loss_clip": 0.01127172, "auxiliary_loss_mlp": 0.01033481, "balance_loss_clip": 1.0205406, "balance_loss_mlp": 1.03671718, "epoch": 0.7603186532391403, "flos": 23842100039040.0, "grad_norm": 1.4491923013150296, "language_loss": 0.73169476, "learning_rate": 5.407185354788584e-07, "loss": 0.75330132, "num_input_tokens_seen": 272739575, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 12646, "time_per_iteration": 2.6107871532440186 }, { "auxiliary_loss_clip": 0.01111554, "auxiliary_loss_mlp": 0.01034707, "balance_loss_clip": 1.02270257, "balance_loss_mlp": 1.03443873, "epoch": 0.7603787764918082, "flos": 22199905117440.0, "grad_norm": 2.1419699908285748, "language_loss": 0.67693031, "learning_rate": 5.40460195019421e-07, "loss": 0.69839299, "num_input_tokens_seen": 272758710, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 12647, "time_per_iteration": 2.5396034717559814 }, { "auxiliary_loss_clip": 0.01034204, "auxiliary_loss_mlp": 0.01004702, "balance_loss_clip": 1.00336099, "balance_loss_mlp": 1.00343919, "epoch": 0.7604388997444762, "flos": 54086894254080.0, "grad_norm": 0.6847863399985771, "language_loss": 0.48975232, "learning_rate": 5.402019066472061e-07, "loss": 0.51014137, "num_input_tokens_seen": 272814855, "router_z_loss_clip": 0.01342773, "router_z_loss_mlp": 0.21679688, "step": 12648, "time_per_iteration": 3.112856388092041 }, { "auxiliary_loss_clip": 0.01110977, "auxiliary_loss_mlp": 0.01035756, "balance_loss_clip": 1.02347779, "balance_loss_mlp": 1.0344038, "epoch": 0.7604990229971441, "flos": 19681920789120.0, "grad_norm": 2.098137984008392, "language_loss": 0.7653901, "learning_rate": 5.399436703714295e-07, "loss": 0.78685737, "num_input_tokens_seen": 272834400, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.67578125, "step": 12649, "time_per_iteration": 3.9595141410827637 }, { "auxiliary_loss_clip": 0.01131071, "auxiliary_loss_mlp": 0.01035647, "balance_loss_clip": 1.02295673, "balance_loss_mlp": 1.03549671, "epoch": 0.7605591462498121, "flos": 25228036356480.0, "grad_norm": 1.6024071559150606, "language_loss": 0.68672824, "learning_rate": 5.39685486201307e-07, "loss": 0.70839548, "num_input_tokens_seen": 272854760, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 12650, "time_per_iteration": 2.66070818901062 }, { "auxiliary_loss_clip": 0.0103383, "auxiliary_loss_mlp": 0.01003307, "balance_loss_clip": 1.00194216, "balance_loss_mlp": 1.00343704, "epoch": 0.76061926950248, "flos": 66783757662720.0, "grad_norm": 0.7504712617311377, "language_loss": 0.62767804, "learning_rate": 5.394273541460543e-07, "loss": 0.64804935, "num_input_tokens_seen": 272919030, "router_z_loss_clip": 0.01367188, "router_z_loss_mlp": 0.21679688, "step": 12651, "time_per_iteration": 3.140227794647217 }, { "auxiliary_loss_clip": 0.01130511, "auxiliary_loss_mlp": 0.01032872, "balance_loss_clip": 1.02081347, "balance_loss_mlp": 1.03553998, "epoch": 0.7606793927551481, "flos": 25338354001920.0, "grad_norm": 1.499451363702502, "language_loss": 0.71431869, "learning_rate": 5.39169274214881e-07, "loss": 0.7359525, "num_input_tokens_seen": 272938925, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.68359375, "step": 12652, "time_per_iteration": 2.6631219387054443 }, { "auxiliary_loss_clip": 0.01152005, "auxiliary_loss_mlp": 0.01034761, "balance_loss_clip": 1.02164769, "balance_loss_mlp": 1.03614783, "epoch": 0.760739516007816, "flos": 18223624523520.0, "grad_norm": 2.1436606811517005, "language_loss": 0.80894518, "learning_rate": 5.389112464169994e-07, "loss": 0.83081281, "num_input_tokens_seen": 272954945, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 12653, "time_per_iteration": 2.5666277408599854 }, { "auxiliary_loss_clip": 0.01111179, "auxiliary_loss_mlp": 0.0102973, "balance_loss_clip": 1.01714158, "balance_loss_mlp": 1.03799939, "epoch": 0.760799639260484, "flos": 22559119079040.0, "grad_norm": 1.5738011062255486, "language_loss": 0.79994035, "learning_rate": 5.386532707616169e-07, "loss": 0.82134938, "num_input_tokens_seen": 272972855, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.73046875, "step": 12654, "time_per_iteration": 2.5140788555145264 }, { "auxiliary_loss_clip": 0.01134827, "auxiliary_loss_mlp": 0.01037752, "balance_loss_clip": 1.02468061, "balance_loss_mlp": 1.03536093, "epoch": 0.760859762513152, "flos": 22309324922880.0, "grad_norm": 2.5289875203204377, "language_loss": 0.79208493, "learning_rate": 5.383953472579401e-07, "loss": 0.81381077, "num_input_tokens_seen": 272989895, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 12655, "time_per_iteration": 2.5376780033111572 }, { "auxiliary_loss_clip": 0.01110046, "auxiliary_loss_mlp": 0.01025701, "balance_loss_clip": 1.01542521, "balance_loss_mlp": 1.0343833, "epoch": 0.7609198857658199, "flos": 24863902231680.0, "grad_norm": 1.5134040539223592, "language_loss": 0.68466187, "learning_rate": 5.381374759151733e-07, "loss": 0.70601928, "num_input_tokens_seen": 273011695, "router_z_loss_clip": 0.10253906, "router_z_loss_mlp": 0.66796875, "step": 12656, "time_per_iteration": 4.033804893493652 }, { "auxiliary_loss_clip": 0.0111676, "auxiliary_loss_mlp": 0.01030168, "balance_loss_clip": 1.0163523, "balance_loss_mlp": 1.03383136, "epoch": 0.760980009018488, "flos": 16836790366080.0, "grad_norm": 2.0246494586617314, "language_loss": 0.73267925, "learning_rate": 5.378796567425198e-07, "loss": 0.75414848, "num_input_tokens_seen": 273028815, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 12657, "time_per_iteration": 4.027318477630615 }, { "auxiliary_loss_clip": 0.01043366, "auxiliary_loss_mlp": 0.01000599, "balance_loss_clip": 0.9992696, "balance_loss_mlp": 1.00362587, "epoch": 0.7610401322711559, "flos": 61230603029760.0, "grad_norm": 0.8539752806555699, "language_loss": 0.64928895, "learning_rate": 5.376218897491809e-07, "loss": 0.66972864, "num_input_tokens_seen": 273084080, "router_z_loss_clip": 0.01330566, "router_z_loss_mlp": 0.21484375, "step": 12658, "time_per_iteration": 3.0582163333892822 }, { "auxiliary_loss_clip": 0.01111463, "auxiliary_loss_mlp": 0.01031798, "balance_loss_clip": 1.01925731, "balance_loss_mlp": 1.03538656, "epoch": 0.7611002555238239, "flos": 19640730867840.0, "grad_norm": 1.644660722206215, "language_loss": 0.79162312, "learning_rate": 5.373641749443547e-07, "loss": 0.81305575, "num_input_tokens_seen": 273102295, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.66796875, "step": 12659, "time_per_iteration": 2.4965713024139404 }, { "auxiliary_loss_clip": 0.01111585, "auxiliary_loss_mlp": 0.01028577, "balance_loss_clip": 1.01562488, "balance_loss_mlp": 1.0334518, "epoch": 0.7611603787764918, "flos": 26872206526080.0, "grad_norm": 1.5017969493864676, "language_loss": 0.69049567, "learning_rate": 5.371065123372383e-07, "loss": 0.71189725, "num_input_tokens_seen": 273123400, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 12660, "time_per_iteration": 2.6401617527008057 }, { "auxiliary_loss_clip": 0.01107041, "auxiliary_loss_mlp": 0.01029152, "balance_loss_clip": 1.01662934, "balance_loss_mlp": 1.03521419, "epoch": 0.7612205020291598, "flos": 27344252085120.0, "grad_norm": 1.574723557878772, "language_loss": 0.70305693, "learning_rate": 5.368489019370283e-07, "loss": 0.72441894, "num_input_tokens_seen": 273145150, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 12661, "time_per_iteration": 2.5829789638519287 }, { "auxiliary_loss_clip": 0.01111748, "auxiliary_loss_mlp": 0.01032472, "balance_loss_clip": 1.02047408, "balance_loss_mlp": 1.03304493, "epoch": 0.7612806252818277, "flos": 29314598682240.0, "grad_norm": 1.5498970994537196, "language_loss": 0.83211911, "learning_rate": 5.365913437529166e-07, "loss": 0.85356128, "num_input_tokens_seen": 273165180, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69921875, "step": 12662, "time_per_iteration": 2.667036294937134 }, { "auxiliary_loss_clip": 0.0112189, "auxiliary_loss_mlp": 0.01043919, "balance_loss_clip": 1.03152072, "balance_loss_mlp": 1.03361988, "epoch": 0.7613407485344957, "flos": 19026048401280.0, "grad_norm": 1.639158881861939, "language_loss": 0.68587399, "learning_rate": 5.363338377940958e-07, "loss": 0.70753211, "num_input_tokens_seen": 273184005, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 12663, "time_per_iteration": 2.551509380340576 }, { "auxiliary_loss_clip": 0.01110926, "auxiliary_loss_mlp": 0.01026223, "balance_loss_clip": 1.01448107, "balance_loss_mlp": 1.03489184, "epoch": 0.7614008717871636, "flos": 23256037733760.0, "grad_norm": 1.651600830333798, "language_loss": 0.70328832, "learning_rate": 5.360763840697553e-07, "loss": 0.7246598, "num_input_tokens_seen": 273203565, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.671875, "step": 12664, "time_per_iteration": 2.6217217445373535 }, { "auxiliary_loss_clip": 0.0111991, "auxiliary_loss_mlp": 0.01036776, "balance_loss_clip": 1.02297115, "balance_loss_mlp": 1.03783667, "epoch": 0.7614609950398317, "flos": 21579907438080.0, "grad_norm": 1.6372904493426732, "language_loss": 0.79312754, "learning_rate": 5.358189825890833e-07, "loss": 0.8146944, "num_input_tokens_seen": 273221645, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 12665, "time_per_iteration": 2.581601142883301 }, { "auxiliary_loss_clip": 0.01104544, "auxiliary_loss_mlp": 0.01283438, "balance_loss_clip": 1.02453542, "balance_loss_mlp": 1.0375433, "epoch": 0.7615211182924996, "flos": 29277897960960.0, "grad_norm": 1.673722689966027, "language_loss": 0.8790977, "learning_rate": 5.355616333612651e-07, "loss": 0.90297747, "num_input_tokens_seen": 273242040, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 12666, "time_per_iteration": 2.591512441635132 }, { "auxiliary_loss_clip": 0.01140768, "auxiliary_loss_mlp": 0.01031829, "balance_loss_clip": 1.01823902, "balance_loss_mlp": 1.03446913, "epoch": 0.7615812415451676, "flos": 13261129136640.0, "grad_norm": 2.548152026803145, "language_loss": 0.8345843, "learning_rate": 5.35304336395485e-07, "loss": 0.85631025, "num_input_tokens_seen": 273257365, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 12667, "time_per_iteration": 2.637744188308716 }, { "auxiliary_loss_clip": 0.01121754, "auxiliary_loss_mlp": 0.0103793, "balance_loss_clip": 1.02633119, "balance_loss_mlp": 1.03610253, "epoch": 0.7616413647978356, "flos": 18584741905920.0, "grad_norm": 1.8555439418341118, "language_loss": 0.78521931, "learning_rate": 5.350470917009264e-07, "loss": 0.80681616, "num_input_tokens_seen": 273274710, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 12668, "time_per_iteration": 2.6583340167999268 }, { "auxiliary_loss_clip": 0.01130227, "auxiliary_loss_mlp": 0.01033022, "balance_loss_clip": 1.02086246, "balance_loss_mlp": 1.03459001, "epoch": 0.7617014880505035, "flos": 18516188799360.0, "grad_norm": 1.747313556774847, "language_loss": 0.63917923, "learning_rate": 5.347898992867677e-07, "loss": 0.66081172, "num_input_tokens_seen": 273292870, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 12669, "time_per_iteration": 2.5751473903656006 }, { "auxiliary_loss_clip": 0.01125221, "auxiliary_loss_mlp": 0.01037964, "balance_loss_clip": 1.02431452, "balance_loss_mlp": 1.0352149, "epoch": 0.7617616113031715, "flos": 24973178382720.0, "grad_norm": 2.195829504545357, "language_loss": 0.66637844, "learning_rate": 5.345327591621891e-07, "loss": 0.68801028, "num_input_tokens_seen": 273312375, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 12670, "time_per_iteration": 2.5977330207824707 }, { "auxiliary_loss_clip": 0.0111624, "auxiliary_loss_mlp": 0.01034378, "balance_loss_clip": 1.02159309, "balance_loss_mlp": 1.03536701, "epoch": 0.7618217345558395, "flos": 23295036925440.0, "grad_norm": 1.56177636055812, "language_loss": 0.73374951, "learning_rate": 5.342756713363668e-07, "loss": 0.7552557, "num_input_tokens_seen": 273332590, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 12671, "time_per_iteration": 2.5767743587493896 }, { "auxiliary_loss_clip": 0.01129024, "auxiliary_loss_mlp": 0.01032075, "balance_loss_clip": 1.02024913, "balance_loss_mlp": 1.03421283, "epoch": 0.7618818578085075, "flos": 25482894330240.0, "grad_norm": 1.7777711327082466, "language_loss": 0.73188317, "learning_rate": 5.340186358184753e-07, "loss": 0.75349414, "num_input_tokens_seen": 273352885, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 12672, "time_per_iteration": 2.6102938652038574 }, { "auxiliary_loss_clip": 0.0104243, "auxiliary_loss_mlp": 0.01000793, "balance_loss_clip": 0.99953538, "balance_loss_mlp": 1.00326836, "epoch": 0.7619419810611754, "flos": 61151994115200.0, "grad_norm": 0.7594484879317328, "language_loss": 0.56683636, "learning_rate": 5.337616526176873e-07, "loss": 0.58726859, "num_input_tokens_seen": 273411730, "router_z_loss_clip": 0.01257324, "router_z_loss_mlp": 0.21484375, "step": 12673, "time_per_iteration": 3.1651690006256104 }, { "auxiliary_loss_clip": 0.01115612, "auxiliary_loss_mlp": 0.01030236, "balance_loss_clip": 1.01705134, "balance_loss_mlp": 1.03565907, "epoch": 0.7620021043138434, "flos": 23258659426560.0, "grad_norm": 1.6835585244180338, "language_loss": 0.7471931, "learning_rate": 5.33504721743175e-07, "loss": 0.7686516, "num_input_tokens_seen": 273430020, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 12674, "time_per_iteration": 2.709160566329956 }, { "auxiliary_loss_clip": 0.01122765, "auxiliary_loss_mlp": 0.01026569, "balance_loss_clip": 1.01478505, "balance_loss_mlp": 1.0360806, "epoch": 0.7620622275665113, "flos": 25082490447360.0, "grad_norm": 2.0233966682229, "language_loss": 0.72514611, "learning_rate": 5.332478432041065e-07, "loss": 0.74663943, "num_input_tokens_seen": 273448690, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 12675, "time_per_iteration": 2.668121099472046 }, { "auxiliary_loss_clip": 0.01112096, "auxiliary_loss_mlp": 0.01029189, "balance_loss_clip": 1.01745319, "balance_loss_mlp": 1.03592515, "epoch": 0.7621223508191793, "flos": 20155007842560.0, "grad_norm": 1.9991796260135506, "language_loss": 0.7260884, "learning_rate": 5.329910170096499e-07, "loss": 0.74750125, "num_input_tokens_seen": 273465190, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 12676, "time_per_iteration": 2.5577380657196045 }, { "auxiliary_loss_clip": 0.01139094, "auxiliary_loss_mlp": 0.01282494, "balance_loss_clip": 1.02244937, "balance_loss_mlp": 1.03737044, "epoch": 0.7621824740718472, "flos": 18000187971840.0, "grad_norm": 2.844361013915223, "language_loss": 0.54098535, "learning_rate": 5.327342431689696e-07, "loss": 0.56520128, "num_input_tokens_seen": 273478620, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.74609375, "step": 12677, "time_per_iteration": 2.6845297813415527 }, { "auxiliary_loss_clip": 0.01140665, "auxiliary_loss_mlp": 0.010334, "balance_loss_clip": 1.0206387, "balance_loss_mlp": 1.03490496, "epoch": 0.7622425973245153, "flos": 21725668828800.0, "grad_norm": 2.196661357300335, "language_loss": 0.78729463, "learning_rate": 5.324775216912312e-07, "loss": 0.8090353, "num_input_tokens_seen": 273497635, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 12678, "time_per_iteration": 2.570535182952881 }, { "auxiliary_loss_clip": 0.01134567, "auxiliary_loss_mlp": 0.01033297, "balance_loss_clip": 1.02039254, "balance_loss_mlp": 1.03604865, "epoch": 0.7623027205771832, "flos": 19718549683200.0, "grad_norm": 2.034084666172568, "language_loss": 0.77627319, "learning_rate": 5.322208525855942e-07, "loss": 0.79795182, "num_input_tokens_seen": 273513955, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 12679, "time_per_iteration": 2.634788990020752 }, { "auxiliary_loss_clip": 0.01114718, "auxiliary_loss_mlp": 0.01025286, "balance_loss_clip": 1.01259565, "balance_loss_mlp": 1.03509212, "epoch": 0.7623628438298512, "flos": 23988831096960.0, "grad_norm": 1.5811614975959505, "language_loss": 0.79919595, "learning_rate": 5.319642358612191e-07, "loss": 0.82059598, "num_input_tokens_seen": 273533970, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 12680, "time_per_iteration": 2.58100962638855 }, { "auxiliary_loss_clip": 0.01125713, "auxiliary_loss_mlp": 0.01031232, "balance_loss_clip": 1.01696301, "balance_loss_mlp": 1.03658056, "epoch": 0.7624229670825191, "flos": 22345702421760.0, "grad_norm": 2.0044224255848513, "language_loss": 0.62981755, "learning_rate": 5.317076715272652e-07, "loss": 0.65138704, "num_input_tokens_seen": 273553090, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.71484375, "step": 12681, "time_per_iteration": 2.6168792247772217 }, { "auxiliary_loss_clip": 0.01129882, "auxiliary_loss_mlp": 0.01033696, "balance_loss_clip": 1.02131569, "balance_loss_mlp": 1.03479981, "epoch": 0.7624830903351871, "flos": 22711775880960.0, "grad_norm": 1.7912784637198655, "language_loss": 0.7580806, "learning_rate": 5.314511595928867e-07, "loss": 0.77971637, "num_input_tokens_seen": 273572460, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 12682, "time_per_iteration": 2.5477101802825928 }, { "auxiliary_loss_clip": 0.01119606, "auxiliary_loss_mlp": 0.01030334, "balance_loss_clip": 1.01867497, "balance_loss_mlp": 1.03477943, "epoch": 0.7625432135878552, "flos": 25593714766080.0, "grad_norm": 1.6218870350265238, "language_loss": 0.68256444, "learning_rate": 5.311947000672392e-07, "loss": 0.70406383, "num_input_tokens_seen": 273592815, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 12683, "time_per_iteration": 2.6585073471069336 }, { "auxiliary_loss_clip": 0.01132611, "auxiliary_loss_mlp": 0.01278249, "balance_loss_clip": 1.01862645, "balance_loss_mlp": 1.03568697, "epoch": 0.7626033368405231, "flos": 23987645948160.0, "grad_norm": 1.9565105934205345, "language_loss": 0.83120668, "learning_rate": 5.309382929594739e-07, "loss": 0.85531527, "num_input_tokens_seen": 273611790, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 12684, "time_per_iteration": 2.574448823928833 }, { "auxiliary_loss_clip": 0.01123093, "auxiliary_loss_mlp": 0.01039417, "balance_loss_clip": 1.02555895, "balance_loss_mlp": 1.03467524, "epoch": 0.7626634600931911, "flos": 12599115523200.0, "grad_norm": 1.8223517930403563, "language_loss": 0.8297562, "learning_rate": 5.306819382787433e-07, "loss": 0.8513813, "num_input_tokens_seen": 273628340, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7109375, "step": 12685, "time_per_iteration": 3.893387794494629 }, { "auxiliary_loss_clip": 0.01133692, "auxiliary_loss_mlp": 0.01271565, "balance_loss_clip": 1.012236, "balance_loss_mlp": 1.03634501, "epoch": 0.762723583345859, "flos": 26322593546880.0, "grad_norm": 1.9548230979087209, "language_loss": 0.77063525, "learning_rate": 5.304256360341936e-07, "loss": 0.79468781, "num_input_tokens_seen": 273646585, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 12686, "time_per_iteration": 2.6788742542266846 }, { "auxiliary_loss_clip": 0.0114492, "auxiliary_loss_mlp": 0.01041994, "balance_loss_clip": 1.02727795, "balance_loss_mlp": 1.03630042, "epoch": 0.762783706598527, "flos": 21907053532800.0, "grad_norm": 1.6203048374029552, "language_loss": 0.72268575, "learning_rate": 5.301693862349734e-07, "loss": 0.74455488, "num_input_tokens_seen": 273665410, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7265625, "step": 12687, "time_per_iteration": 2.5770797729492188 }, { "auxiliary_loss_clip": 0.01123333, "auxiliary_loss_mlp": 0.01038691, "balance_loss_clip": 1.02515531, "balance_loss_mlp": 1.03574276, "epoch": 0.7628438298511949, "flos": 15339782217600.0, "grad_norm": 2.054900490900355, "language_loss": 0.64904994, "learning_rate": 5.299131888902271e-07, "loss": 0.67067015, "num_input_tokens_seen": 273683035, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6953125, "step": 12688, "time_per_iteration": 2.542018175125122 }, { "auxiliary_loss_clip": 0.01112663, "auxiliary_loss_mlp": 0.01028303, "balance_loss_clip": 1.01634634, "balance_loss_mlp": 1.03633714, "epoch": 0.7629039531038629, "flos": 13006307076480.0, "grad_norm": 1.9758032022061043, "language_loss": 0.70784605, "learning_rate": 5.296570440090973e-07, "loss": 0.72925574, "num_input_tokens_seen": 273700130, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 12689, "time_per_iteration": 2.4858920574188232 }, { "auxiliary_loss_clip": 0.01137964, "auxiliary_loss_mlp": 0.01040506, "balance_loss_clip": 1.02745271, "balance_loss_mlp": 1.03726351, "epoch": 0.7629640763565309, "flos": 26171660597760.0, "grad_norm": 1.7337893013464005, "language_loss": 0.69369239, "learning_rate": 5.29400951600725e-07, "loss": 0.71547711, "num_input_tokens_seen": 273720310, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 12690, "time_per_iteration": 2.62296462059021 }, { "auxiliary_loss_clip": 0.01124459, "auxiliary_loss_mlp": 0.01033605, "balance_loss_clip": 1.02073681, "balance_loss_mlp": 1.03645086, "epoch": 0.7630241996091989, "flos": 36793713680640.0, "grad_norm": 2.097738678863644, "language_loss": 0.69612199, "learning_rate": 5.291449116742503e-07, "loss": 0.71770263, "num_input_tokens_seen": 273744475, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 12691, "time_per_iteration": 4.033323287963867 }, { "auxiliary_loss_clip": 0.01112482, "auxiliary_loss_mlp": 0.01033713, "balance_loss_clip": 1.02249527, "balance_loss_mlp": 1.03484511, "epoch": 0.7630843228618668, "flos": 21835160461440.0, "grad_norm": 2.7981823429043624, "language_loss": 0.81924617, "learning_rate": 5.288889242388105e-07, "loss": 0.84070814, "num_input_tokens_seen": 273764635, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.68359375, "step": 12692, "time_per_iteration": 2.5781702995300293 }, { "auxiliary_loss_clip": 0.01136501, "auxiliary_loss_mlp": 0.01030386, "balance_loss_clip": 1.01853704, "balance_loss_mlp": 1.03821659, "epoch": 0.7631444461145348, "flos": 12640520926080.0, "grad_norm": 2.0944268542733417, "language_loss": 0.77003503, "learning_rate": 5.286329893035406e-07, "loss": 0.79170394, "num_input_tokens_seen": 273780115, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.71875, "step": 12693, "time_per_iteration": 2.585507392883301 }, { "auxiliary_loss_clip": 0.01133587, "auxiliary_loss_mlp": 0.01032153, "balance_loss_clip": 1.01887298, "balance_loss_mlp": 1.03637338, "epoch": 0.7632045693672027, "flos": 16836610798080.0, "grad_norm": 2.066077356899339, "language_loss": 0.72660977, "learning_rate": 5.283771068775747e-07, "loss": 0.74826717, "num_input_tokens_seen": 273796605, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 12694, "time_per_iteration": 2.5884156227111816 }, { "auxiliary_loss_clip": 0.01111074, "auxiliary_loss_mlp": 0.01026921, "balance_loss_clip": 1.01429081, "balance_loss_mlp": 1.03380275, "epoch": 0.7632646926198707, "flos": 22017335264640.0, "grad_norm": 2.0230179529254064, "language_loss": 0.70488393, "learning_rate": 5.281212769700442e-07, "loss": 0.72626388, "num_input_tokens_seen": 273816515, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 12695, "time_per_iteration": 2.5552682876586914 }, { "auxiliary_loss_clip": 0.01105342, "auxiliary_loss_mlp": 0.01028582, "balance_loss_clip": 1.0167979, "balance_loss_mlp": 1.03573096, "epoch": 0.7633248158725388, "flos": 23114011357440.0, "grad_norm": 1.4824365197278901, "language_loss": 0.72698832, "learning_rate": 5.278654995900793e-07, "loss": 0.74832761, "num_input_tokens_seen": 273837060, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 12696, "time_per_iteration": 2.5861761569976807 }, { "auxiliary_loss_clip": 0.01149859, "auxiliary_loss_mlp": 0.01034129, "balance_loss_clip": 1.02049756, "balance_loss_mlp": 1.03499532, "epoch": 0.7633849391252067, "flos": 10889839952640.0, "grad_norm": 2.3881333749429334, "language_loss": 0.71646905, "learning_rate": 5.276097747468074e-07, "loss": 0.73830891, "num_input_tokens_seen": 273853365, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 12697, "time_per_iteration": 2.563267707824707 }, { "auxiliary_loss_clip": 0.01132764, "auxiliary_loss_mlp": 0.01030691, "balance_loss_clip": 1.01867521, "balance_loss_mlp": 1.03755975, "epoch": 0.7634450623778747, "flos": 20994168355200.0, "grad_norm": 1.915780389991051, "language_loss": 0.6649859, "learning_rate": 5.273541024493565e-07, "loss": 0.68662041, "num_input_tokens_seen": 273870750, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 12698, "time_per_iteration": 4.16024923324585 }, { "auxiliary_loss_clip": 0.01113939, "auxiliary_loss_mlp": 0.01028389, "balance_loss_clip": 1.01552677, "balance_loss_mlp": 1.03435826, "epoch": 0.7635051856305426, "flos": 18882046776960.0, "grad_norm": 1.7046104291476325, "language_loss": 0.71970981, "learning_rate": 5.27098482706848e-07, "loss": 0.74113303, "num_input_tokens_seen": 273890890, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 12699, "time_per_iteration": 4.126791954040527 }, { "auxiliary_loss_clip": 0.01123111, "auxiliary_loss_mlp": 0.01031189, "balance_loss_clip": 1.01938152, "balance_loss_mlp": 1.03651261, "epoch": 0.7635653088832106, "flos": 34786989584640.0, "grad_norm": 2.3974257079488384, "language_loss": 0.72705221, "learning_rate": 5.268429155284069e-07, "loss": 0.74859524, "num_input_tokens_seen": 273914015, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 12700, "time_per_iteration": 2.6793127059936523 }, { "auxiliary_loss_clip": 0.01104736, "auxiliary_loss_mlp": 0.01030224, "balance_loss_clip": 1.01799273, "balance_loss_mlp": 1.03458393, "epoch": 0.7636254321358785, "flos": 23178434400000.0, "grad_norm": 2.0571866112456028, "language_loss": 0.6891222, "learning_rate": 5.265874009231519e-07, "loss": 0.71047175, "num_input_tokens_seen": 273927415, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 12701, "time_per_iteration": 2.500991106033325 }, { "auxiliary_loss_clip": 0.01125445, "auxiliary_loss_mlp": 0.01030352, "balance_loss_clip": 1.01790667, "balance_loss_mlp": 1.03667688, "epoch": 0.7636855553885465, "flos": 21325229032320.0, "grad_norm": 1.6418961068223916, "language_loss": 0.64388812, "learning_rate": 5.263319389002037e-07, "loss": 0.66544604, "num_input_tokens_seen": 273946690, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 12702, "time_per_iteration": 2.579150676727295 }, { "auxiliary_loss_clip": 0.01133139, "auxiliary_loss_mlp": 0.01030595, "balance_loss_clip": 1.0184896, "balance_loss_mlp": 1.03585792, "epoch": 0.7637456786412145, "flos": 28658079849600.0, "grad_norm": 3.8843979233245216, "language_loss": 0.65473831, "learning_rate": 5.260765294686767e-07, "loss": 0.67637563, "num_input_tokens_seen": 273966870, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.7109375, "step": 12703, "time_per_iteration": 2.725080966949463 }, { "auxiliary_loss_clip": 0.01125182, "auxiliary_loss_mlp": 0.01027613, "balance_loss_clip": 1.01532888, "balance_loss_mlp": 1.03805161, "epoch": 0.7638058018938825, "flos": 21907269014400.0, "grad_norm": 2.0873556563337403, "language_loss": 0.83901525, "learning_rate": 5.258211726376875e-07, "loss": 0.86054313, "num_input_tokens_seen": 273986360, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 12704, "time_per_iteration": 2.6162216663360596 }, { "auxiliary_loss_clip": 0.01125726, "auxiliary_loss_mlp": 0.01032808, "balance_loss_clip": 1.02047038, "balance_loss_mlp": 1.03724957, "epoch": 0.7638659251465504, "flos": 29643899592960.0, "grad_norm": 1.5050577333336033, "language_loss": 0.67937082, "learning_rate": 5.255658684163488e-07, "loss": 0.70095617, "num_input_tokens_seen": 274009745, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 12705, "time_per_iteration": 2.6186866760253906 }, { "auxiliary_loss_clip": 0.01121479, "auxiliary_loss_mlp": 0.01026678, "balance_loss_clip": 1.01397598, "balance_loss_mlp": 1.03546095, "epoch": 0.7639260483992184, "flos": 26141172929280.0, "grad_norm": 1.6916135677487292, "language_loss": 0.73697889, "learning_rate": 5.253106168137715e-07, "loss": 0.7584604, "num_input_tokens_seen": 274028775, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 12706, "time_per_iteration": 2.6130287647247314 }, { "auxiliary_loss_clip": 0.01114792, "auxiliary_loss_mlp": 0.01032297, "balance_loss_clip": 1.01949978, "balance_loss_mlp": 1.03591704, "epoch": 0.7639861716518863, "flos": 20156695781760.0, "grad_norm": 2.691844008653138, "language_loss": 0.77934885, "learning_rate": 5.250554178390643e-07, "loss": 0.80081975, "num_input_tokens_seen": 274047520, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 12707, "time_per_iteration": 2.577009916305542 }, { "auxiliary_loss_clip": 0.01122795, "auxiliary_loss_mlp": 0.01030757, "balance_loss_clip": 1.01796603, "balance_loss_mlp": 1.03466833, "epoch": 0.7640462949045543, "flos": 18583125793920.0, "grad_norm": 1.9866882318493095, "language_loss": 0.79982042, "learning_rate": 5.248002715013358e-07, "loss": 0.82135606, "num_input_tokens_seen": 274065350, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 12708, "time_per_iteration": 2.5609662532806396 }, { "auxiliary_loss_clip": 0.01032852, "auxiliary_loss_mlp": 0.01002126, "balance_loss_clip": 1.00068939, "balance_loss_mlp": 1.00267792, "epoch": 0.7641064181572224, "flos": 68321991646080.0, "grad_norm": 0.8239227979199653, "language_loss": 0.5645951, "learning_rate": 5.245451778096914e-07, "loss": 0.58494496, "num_input_tokens_seen": 274122315, "router_z_loss_clip": 0.01434326, "router_z_loss_mlp": 0.21386719, "step": 12709, "time_per_iteration": 3.191039562225342 }, { "auxiliary_loss_clip": 0.01119196, "auxiliary_loss_mlp": 0.01037647, "balance_loss_clip": 1.02341926, "balance_loss_mlp": 1.03690124, "epoch": 0.7641665414098903, "flos": 17968982031360.0, "grad_norm": 1.6728980666171638, "language_loss": 0.63418382, "learning_rate": 5.242901367732333e-07, "loss": 0.65575224, "num_input_tokens_seen": 274140555, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.734375, "step": 12710, "time_per_iteration": 2.5589427947998047 }, { "auxiliary_loss_clip": 0.01143579, "auxiliary_loss_mlp": 0.01284765, "balance_loss_clip": 1.02415705, "balance_loss_mlp": 1.03592825, "epoch": 0.7642266646625583, "flos": 21252078984960.0, "grad_norm": 2.7930110958312575, "language_loss": 0.65201044, "learning_rate": 5.240351484010648e-07, "loss": 0.67629385, "num_input_tokens_seen": 274161125, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 12711, "time_per_iteration": 2.5773167610168457 }, { "auxiliary_loss_clip": 0.01120827, "auxiliary_loss_mlp": 0.01031411, "balance_loss_clip": 1.01913238, "balance_loss_mlp": 1.03405118, "epoch": 0.7642867879152262, "flos": 22747794243840.0, "grad_norm": 1.5665210862557486, "language_loss": 0.72894204, "learning_rate": 5.237802127022853e-07, "loss": 0.75046444, "num_input_tokens_seen": 274180835, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 12712, "time_per_iteration": 2.545891523361206 }, { "auxiliary_loss_clip": 0.01129557, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 1.02017426, "balance_loss_mlp": 1.03531623, "epoch": 0.7643469111678942, "flos": 23332132696320.0, "grad_norm": 1.497260701813097, "language_loss": 0.80411363, "learning_rate": 5.235253296859925e-07, "loss": 0.82573104, "num_input_tokens_seen": 274201190, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 12713, "time_per_iteration": 2.647496461868286 }, { "auxiliary_loss_clip": 0.01117571, "auxiliary_loss_mlp": 0.01277849, "balance_loss_clip": 1.01735091, "balance_loss_mlp": 1.0367763, "epoch": 0.7644070344205621, "flos": 19857092440320.0, "grad_norm": 2.104947074169152, "language_loss": 0.8302083, "learning_rate": 5.232704993612822e-07, "loss": 0.85416251, "num_input_tokens_seen": 274217595, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 12714, "time_per_iteration": 2.471970558166504 }, { "auxiliary_loss_clip": 0.01117268, "auxiliary_loss_mlp": 0.01040016, "balance_loss_clip": 1.02703381, "balance_loss_mlp": 1.0362823, "epoch": 0.7644671576732301, "flos": 22090628966400.0, "grad_norm": 1.5827889524247856, "language_loss": 0.72640598, "learning_rate": 5.230157217372506e-07, "loss": 0.74797881, "num_input_tokens_seen": 274237885, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 12715, "time_per_iteration": 2.6184322834014893 }, { "auxiliary_loss_clip": 0.01132057, "auxiliary_loss_mlp": 0.01025882, "balance_loss_clip": 1.01347804, "balance_loss_mlp": 1.0355351, "epoch": 0.7645272809258981, "flos": 25481421872640.0, "grad_norm": 1.7849954059884199, "language_loss": 0.63261306, "learning_rate": 5.227609968229871e-07, "loss": 0.65419245, "num_input_tokens_seen": 274258820, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 12716, "time_per_iteration": 2.6331894397735596 }, { "auxiliary_loss_clip": 0.0110874, "auxiliary_loss_mlp": 0.01034793, "balance_loss_clip": 1.02063715, "balance_loss_mlp": 1.03573418, "epoch": 0.7645874041785661, "flos": 21541877913600.0, "grad_norm": 1.5074366680494677, "language_loss": 0.7978701, "learning_rate": 5.225063246275844e-07, "loss": 0.81930542, "num_input_tokens_seen": 274278835, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.73046875, "step": 12717, "time_per_iteration": 2.6378047466278076 }, { "auxiliary_loss_clip": 0.01105018, "auxiliary_loss_mlp": 0.01034981, "balance_loss_clip": 1.02244067, "balance_loss_mlp": 1.03576994, "epoch": 0.764647527431234, "flos": 20630896156800.0, "grad_norm": 2.8949980433750517, "language_loss": 0.66349828, "learning_rate": 5.222517051601301e-07, "loss": 0.68489826, "num_input_tokens_seen": 274297110, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 12718, "time_per_iteration": 2.5924386978149414 }, { "auxiliary_loss_clip": 0.01126754, "auxiliary_loss_mlp": 0.01035689, "balance_loss_clip": 1.02372074, "balance_loss_mlp": 1.03291321, "epoch": 0.764707650683902, "flos": 21434074220160.0, "grad_norm": 2.7620787466220342, "language_loss": 0.78070927, "learning_rate": 5.219971384297121e-07, "loss": 0.80233371, "num_input_tokens_seen": 274315610, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66796875, "step": 12719, "time_per_iteration": 2.6236093044281006 }, { "auxiliary_loss_clip": 0.01106003, "auxiliary_loss_mlp": 0.01032692, "balance_loss_clip": 1.0192039, "balance_loss_mlp": 1.03522635, "epoch": 0.7647677739365699, "flos": 22711201263360.0, "grad_norm": 1.7842487239513674, "language_loss": 0.69818556, "learning_rate": 5.217426244454133e-07, "loss": 0.71957254, "num_input_tokens_seen": 274333975, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 12720, "time_per_iteration": 2.5350964069366455 }, { "auxiliary_loss_clip": 0.01132706, "auxiliary_loss_mlp": 0.01035349, "balance_loss_clip": 1.02254581, "balance_loss_mlp": 1.03649616, "epoch": 0.7648278971892379, "flos": 21324115710720.0, "grad_norm": 1.6990817099519206, "language_loss": 0.73886323, "learning_rate": 5.214881632163182e-07, "loss": 0.76054382, "num_input_tokens_seen": 274353695, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 12721, "time_per_iteration": 2.5914993286132812 }, { "auxiliary_loss_clip": 0.01133156, "auxiliary_loss_mlp": 0.01030772, "balance_loss_clip": 1.01788533, "balance_loss_mlp": 1.03627264, "epoch": 0.764888020441906, "flos": 20667345482880.0, "grad_norm": 1.9776280901638346, "language_loss": 0.7361806, "learning_rate": 5.212337547515076e-07, "loss": 0.75781989, "num_input_tokens_seen": 274371120, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 12722, "time_per_iteration": 2.489161252975464 }, { "auxiliary_loss_clip": 0.0111349, "auxiliary_loss_mlp": 0.01037458, "balance_loss_clip": 1.02475595, "balance_loss_mlp": 1.03539658, "epoch": 0.7649481436945739, "flos": 25082526360960.0, "grad_norm": 1.9342463653188975, "language_loss": 0.74103975, "learning_rate": 5.209793990600601e-07, "loss": 0.76254916, "num_input_tokens_seen": 274389665, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 12723, "time_per_iteration": 2.661121129989624 }, { "auxiliary_loss_clip": 0.01106016, "auxiliary_loss_mlp": 0.01030173, "balance_loss_clip": 1.01740575, "balance_loss_mlp": 1.03399515, "epoch": 0.7650082669472419, "flos": 24900890261760.0, "grad_norm": 1.8635265372050824, "language_loss": 0.72917008, "learning_rate": 5.207250961510536e-07, "loss": 0.75053203, "num_input_tokens_seen": 274408750, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 12724, "time_per_iteration": 2.604163646697998 }, { "auxiliary_loss_clip": 0.01111009, "auxiliary_loss_mlp": 0.01026593, "balance_loss_clip": 1.0144875, "balance_loss_mlp": 1.03526902, "epoch": 0.7650683901999098, "flos": 14647388676480.0, "grad_norm": 2.6665937163035673, "language_loss": 0.84075236, "learning_rate": 5.204708460335632e-07, "loss": 0.86212838, "num_input_tokens_seen": 274424600, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.66796875, "step": 12725, "time_per_iteration": 2.5853121280670166 }, { "auxiliary_loss_clip": 0.01132336, "auxiliary_loss_mlp": 0.01035145, "balance_loss_clip": 1.02156162, "balance_loss_mlp": 1.03566349, "epoch": 0.7651285134525778, "flos": 26352434770560.0, "grad_norm": 1.8157297237714278, "language_loss": 0.77549678, "learning_rate": 5.202166487166626e-07, "loss": 0.79717159, "num_input_tokens_seen": 274443075, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6953125, "step": 12726, "time_per_iteration": 3.9948105812072754 }, { "auxiliary_loss_clip": 0.01114734, "auxiliary_loss_mlp": 0.01031591, "balance_loss_clip": 1.0190438, "balance_loss_mlp": 1.0351696, "epoch": 0.7651886367052457, "flos": 26646866553600.0, "grad_norm": 1.6715920384894762, "language_loss": 0.70410264, "learning_rate": 5.199625042094227e-07, "loss": 0.72556591, "num_input_tokens_seen": 274463240, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 12727, "time_per_iteration": 2.57788348197937 }, { "auxiliary_loss_clip": 0.01099424, "auxiliary_loss_mlp": 0.01027548, "balance_loss_clip": 1.01542473, "balance_loss_mlp": 1.03248143, "epoch": 0.7652487599579137, "flos": 25702847262720.0, "grad_norm": 1.772272205219716, "language_loss": 0.79668742, "learning_rate": 5.197084125209144e-07, "loss": 0.81795716, "num_input_tokens_seen": 274482750, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.66796875, "step": 12728, "time_per_iteration": 2.5587565898895264 }, { "auxiliary_loss_clip": 0.01107033, "auxiliary_loss_mlp": 0.01030404, "balance_loss_clip": 1.01712441, "balance_loss_mlp": 1.03574395, "epoch": 0.7653088832105817, "flos": 28585576247040.0, "grad_norm": 1.823430571799684, "language_loss": 0.55384254, "learning_rate": 5.19454373660205e-07, "loss": 0.57521689, "num_input_tokens_seen": 274503545, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 12729, "time_per_iteration": 2.5540497303009033 }, { "auxiliary_loss_clip": 0.01120677, "auxiliary_loss_mlp": 0.01278055, "balance_loss_clip": 1.01824903, "balance_loss_mlp": 1.03379905, "epoch": 0.7653690064632497, "flos": 23366750428800.0, "grad_norm": 1.7183056563727435, "language_loss": 0.78059095, "learning_rate": 5.192003876363609e-07, "loss": 0.8045783, "num_input_tokens_seen": 274523825, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 12730, "time_per_iteration": 2.5752382278442383 }, { "auxiliary_loss_clip": 0.01118994, "auxiliary_loss_mlp": 0.01038907, "balance_loss_clip": 1.02575278, "balance_loss_mlp": 1.03922081, "epoch": 0.7654291297159176, "flos": 15773905992960.0, "grad_norm": 1.8544836732074474, "language_loss": 0.68980992, "learning_rate": 5.18946454458445e-07, "loss": 0.71138895, "num_input_tokens_seen": 274541625, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12731, "time_per_iteration": 2.459199905395508 }, { "auxiliary_loss_clip": 0.0110752, "auxiliary_loss_mlp": 0.01033417, "balance_loss_clip": 1.02067339, "balance_loss_mlp": 1.03578067, "epoch": 0.7654892529685856, "flos": 18033800123520.0, "grad_norm": 2.009982547186321, "language_loss": 0.70317787, "learning_rate": 5.18692574135522e-07, "loss": 0.7245872, "num_input_tokens_seen": 274557580, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 12732, "time_per_iteration": 3.963165044784546 }, { "auxiliary_loss_clip": 0.01125007, "auxiliary_loss_mlp": 0.01031083, "balance_loss_clip": 1.01801789, "balance_loss_mlp": 1.035869, "epoch": 0.7655493762212535, "flos": 27236017428480.0, "grad_norm": 1.59780156592592, "language_loss": 0.78196013, "learning_rate": 5.184387466766491e-07, "loss": 0.80352104, "num_input_tokens_seen": 274578135, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 12733, "time_per_iteration": 2.567960262298584 }, { "auxiliary_loss_clip": 0.01112406, "auxiliary_loss_mlp": 0.01030836, "balance_loss_clip": 1.01849174, "balance_loss_mlp": 1.03548419, "epoch": 0.7656094994739215, "flos": 20773964027520.0, "grad_norm": 1.8157246789167913, "language_loss": 0.77054089, "learning_rate": 5.181849720908868e-07, "loss": 0.79197335, "num_input_tokens_seen": 274595655, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 12734, "time_per_iteration": 2.4968574047088623 }, { "auxiliary_loss_clip": 0.01125796, "auxiliary_loss_mlp": 0.01031904, "balance_loss_clip": 1.01844561, "balance_loss_mlp": 1.03581786, "epoch": 0.7656696227265896, "flos": 23039245198080.0, "grad_norm": 1.6631440914685294, "language_loss": 0.73386508, "learning_rate": 5.17931250387291e-07, "loss": 0.75544202, "num_input_tokens_seen": 274616305, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 12735, "time_per_iteration": 2.555227279663086 }, { "auxiliary_loss_clip": 0.01123751, "auxiliary_loss_mlp": 0.01030444, "balance_loss_clip": 1.01787305, "balance_loss_mlp": 1.03534067, "epoch": 0.7657297459792575, "flos": 27525636789120.0, "grad_norm": 1.4065298934891088, "language_loss": 0.72712713, "learning_rate": 5.176775815749175e-07, "loss": 0.74866915, "num_input_tokens_seen": 274638110, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 12736, "time_per_iteration": 2.623239755630493 }, { "auxiliary_loss_clip": 0.01118928, "auxiliary_loss_mlp": 0.01288858, "balance_loss_clip": 1.02652717, "balance_loss_mlp": 1.03739512, "epoch": 0.7657898692319255, "flos": 17128456801920.0, "grad_norm": 1.9810269246795333, "language_loss": 0.77636671, "learning_rate": 5.174239656628167e-07, "loss": 0.8004446, "num_input_tokens_seen": 274656565, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.7265625, "step": 12737, "time_per_iteration": 2.4703121185302734 }, { "auxiliary_loss_clip": 0.01122288, "auxiliary_loss_mlp": 0.01031843, "balance_loss_clip": 1.01842606, "balance_loss_mlp": 1.03402925, "epoch": 0.7658499924845934, "flos": 21465747037440.0, "grad_norm": 1.6924369146079559, "language_loss": 0.76794553, "learning_rate": 5.171704026600418e-07, "loss": 0.78948689, "num_input_tokens_seen": 274674215, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 12738, "time_per_iteration": 2.483989953994751 }, { "auxiliary_loss_clip": 0.01126058, "auxiliary_loss_mlp": 0.01029834, "balance_loss_clip": 1.01651824, "balance_loss_mlp": 1.03614712, "epoch": 0.7659101157372614, "flos": 29496665744640.0, "grad_norm": 1.8819235115561754, "language_loss": 0.62777638, "learning_rate": 5.169168925756415e-07, "loss": 0.64933532, "num_input_tokens_seen": 274693445, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 12739, "time_per_iteration": 2.6421327590942383 }, { "auxiliary_loss_clip": 0.01106934, "auxiliary_loss_mlp": 0.01033754, "balance_loss_clip": 1.02042592, "balance_loss_mlp": 1.03641689, "epoch": 0.7659702389899293, "flos": 18551812112640.0, "grad_norm": 1.974408416804648, "language_loss": 0.78959441, "learning_rate": 5.166634354186612e-07, "loss": 0.8110013, "num_input_tokens_seen": 274712815, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 12740, "time_per_iteration": 4.77795672416687 }, { "auxiliary_loss_clip": 0.0111338, "auxiliary_loss_mlp": 0.01033828, "balance_loss_clip": 1.02050018, "balance_loss_mlp": 1.03432405, "epoch": 0.7660303622425974, "flos": 23549176627200.0, "grad_norm": 1.6609043098595606, "language_loss": 0.65286863, "learning_rate": 5.164100311981478e-07, "loss": 0.67434072, "num_input_tokens_seen": 274732690, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 12741, "time_per_iteration": 2.557974338531494 }, { "auxiliary_loss_clip": 0.0112096, "auxiliary_loss_mlp": 0.01028055, "balance_loss_clip": 1.01549053, "balance_loss_mlp": 1.03497016, "epoch": 0.7660904854952653, "flos": 18916736336640.0, "grad_norm": 1.8810647850673778, "language_loss": 0.76006114, "learning_rate": 5.161566799231443e-07, "loss": 0.7815513, "num_input_tokens_seen": 274752460, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 12742, "time_per_iteration": 2.5600907802581787 }, { "auxiliary_loss_clip": 0.01120928, "auxiliary_loss_mlp": 0.01031012, "balance_loss_clip": 1.01918054, "balance_loss_mlp": 1.03629446, "epoch": 0.7661506087479333, "flos": 23147515768320.0, "grad_norm": 1.753624674812096, "language_loss": 0.76690942, "learning_rate": 5.159033816026919e-07, "loss": 0.78842878, "num_input_tokens_seen": 274773070, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 12743, "time_per_iteration": 2.550027847290039 }, { "auxiliary_loss_clip": 0.01121726, "auxiliary_loss_mlp": 0.01032359, "balance_loss_clip": 1.01966882, "balance_loss_mlp": 1.03462076, "epoch": 0.7662107320006012, "flos": 17565776887680.0, "grad_norm": 2.1146001776904875, "language_loss": 0.74688077, "learning_rate": 5.156501362458297e-07, "loss": 0.76842165, "num_input_tokens_seen": 274790220, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 12744, "time_per_iteration": 2.554790735244751 }, { "auxiliary_loss_clip": 0.01115181, "auxiliary_loss_mlp": 0.01033664, "balance_loss_clip": 1.02068269, "balance_loss_mlp": 1.03534257, "epoch": 0.7662708552532692, "flos": 22303075956480.0, "grad_norm": 3.274100699357842, "language_loss": 0.71182388, "learning_rate": 5.153969438615964e-07, "loss": 0.73331237, "num_input_tokens_seen": 274805095, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 12745, "time_per_iteration": 2.5502889156341553 }, { "auxiliary_loss_clip": 0.01114681, "auxiliary_loss_mlp": 0.01034667, "balance_loss_clip": 1.02172661, "balance_loss_mlp": 1.03511024, "epoch": 0.7663309785059371, "flos": 15742053607680.0, "grad_norm": 2.3923479822819638, "language_loss": 0.76909488, "learning_rate": 5.151438044590273e-07, "loss": 0.79058838, "num_input_tokens_seen": 274821800, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 12746, "time_per_iteration": 2.523869037628174 }, { "auxiliary_loss_clip": 0.01051344, "auxiliary_loss_mlp": 0.0100399, "balance_loss_clip": 1.00267303, "balance_loss_mlp": 1.00326061, "epoch": 0.7663911017586051, "flos": 62163312514560.0, "grad_norm": 0.6613285421267806, "language_loss": 0.56776494, "learning_rate": 5.148907180471565e-07, "loss": 0.58831829, "num_input_tokens_seen": 274886970, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.21582031, "step": 12747, "time_per_iteration": 3.239047050476074 }, { "auxiliary_loss_clip": 0.01101382, "auxiliary_loss_mlp": 0.01030127, "balance_loss_clip": 1.01939785, "balance_loss_mlp": 1.03367734, "epoch": 0.7664512250112732, "flos": 26506025326080.0, "grad_norm": 1.5671704900140657, "language_loss": 0.7240901, "learning_rate": 5.146376846350151e-07, "loss": 0.7454052, "num_input_tokens_seen": 274907240, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.67578125, "step": 12748, "time_per_iteration": 2.5954058170318604 }, { "auxiliary_loss_clip": 0.0112949, "auxiliary_loss_mlp": 0.01033276, "balance_loss_clip": 1.02018654, "balance_loss_mlp": 1.03379285, "epoch": 0.7665113482639411, "flos": 16249542912000.0, "grad_norm": 1.8557511996574092, "language_loss": 0.69051009, "learning_rate": 5.143847042316351e-07, "loss": 0.71213776, "num_input_tokens_seen": 274924650, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 12749, "time_per_iteration": 2.510802984237671 }, { "auxiliary_loss_clip": 0.01114389, "auxiliary_loss_mlp": 0.0102983, "balance_loss_clip": 1.01715779, "balance_loss_mlp": 1.03578794, "epoch": 0.7665714715166091, "flos": 27197880163200.0, "grad_norm": 2.0472253267011267, "language_loss": 0.7340821, "learning_rate": 5.141317768460425e-07, "loss": 0.75552428, "num_input_tokens_seen": 274944550, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 12750, "time_per_iteration": 2.6372199058532715 }, { "auxiliary_loss_clip": 0.01110226, "auxiliary_loss_mlp": 0.01029511, "balance_loss_clip": 1.01789355, "balance_loss_mlp": 1.0345, "epoch": 0.766631594769277, "flos": 21067785279360.0, "grad_norm": 1.9076913004036873, "language_loss": 0.75581223, "learning_rate": 5.13878902487265e-07, "loss": 0.77720958, "num_input_tokens_seen": 274961330, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 12751, "time_per_iteration": 2.485734462738037 }, { "auxiliary_loss_clip": 0.01132887, "auxiliary_loss_mlp": 0.01035279, "balance_loss_clip": 1.02251816, "balance_loss_mlp": 1.03710437, "epoch": 0.766691718021945, "flos": 24097963593600.0, "grad_norm": 1.6848242803692288, "language_loss": 0.61442935, "learning_rate": 5.136260811643263e-07, "loss": 0.63611102, "num_input_tokens_seen": 274981655, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 12752, "time_per_iteration": 2.573601245880127 }, { "auxiliary_loss_clip": 0.01115633, "auxiliary_loss_mlp": 0.0103431, "balance_loss_clip": 1.02111983, "balance_loss_mlp": 1.03581381, "epoch": 0.7667518412746129, "flos": 23440654661760.0, "grad_norm": 3.202846966113566, "language_loss": 0.69214082, "learning_rate": 5.133733128862506e-07, "loss": 0.71364021, "num_input_tokens_seen": 274999970, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 12753, "time_per_iteration": 2.5523245334625244 }, { "auxiliary_loss_clip": 0.01114606, "auxiliary_loss_mlp": 0.01037599, "balance_loss_clip": 1.02484345, "balance_loss_mlp": 1.03646672, "epoch": 0.766811964527281, "flos": 18148786536960.0, "grad_norm": 2.3161368848017103, "language_loss": 0.6198988, "learning_rate": 5.131205976620565e-07, "loss": 0.6414209, "num_input_tokens_seen": 275015805, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 12754, "time_per_iteration": 2.5438029766082764 }, { "auxiliary_loss_clip": 0.01105877, "auxiliary_loss_mlp": 0.01030285, "balance_loss_clip": 1.0183816, "balance_loss_mlp": 1.03795218, "epoch": 0.7668720877799489, "flos": 19536051657600.0, "grad_norm": 2.3074142541720777, "language_loss": 0.79617137, "learning_rate": 5.128679355007633e-07, "loss": 0.81753302, "num_input_tokens_seen": 275031810, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 12755, "time_per_iteration": 2.487649917602539 }, { "auxiliary_loss_clip": 0.0110462, "auxiliary_loss_mlp": 0.01029066, "balance_loss_clip": 1.01703215, "balance_loss_mlp": 1.03611505, "epoch": 0.7669322110326169, "flos": 22674320974080.0, "grad_norm": 1.7231821639974356, "language_loss": 0.70069933, "learning_rate": 5.126153264113891e-07, "loss": 0.72203618, "num_input_tokens_seen": 275049325, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 12756, "time_per_iteration": 2.5465970039367676 }, { "auxiliary_loss_clip": 0.01113551, "auxiliary_loss_mlp": 0.01036879, "balance_loss_clip": 1.02474928, "balance_loss_mlp": 1.03530455, "epoch": 0.7669923342852848, "flos": 26469396432000.0, "grad_norm": 1.4788031458317634, "language_loss": 0.7030127, "learning_rate": 5.123627704029465e-07, "loss": 0.72451699, "num_input_tokens_seen": 275070865, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 12757, "time_per_iteration": 2.5494461059570312 }, { "auxiliary_loss_clip": 0.01124814, "auxiliary_loss_mlp": 0.01037255, "balance_loss_clip": 1.02357602, "balance_loss_mlp": 1.03747976, "epoch": 0.7670524575379528, "flos": 22856136641280.0, "grad_norm": 1.9086506309253137, "language_loss": 0.75864822, "learning_rate": 5.121102674844509e-07, "loss": 0.78026885, "num_input_tokens_seen": 275088015, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.69921875, "step": 12758, "time_per_iteration": 2.5738635063171387 }, { "auxiliary_loss_clip": 0.01111213, "auxiliary_loss_mlp": 0.01035645, "balance_loss_clip": 1.02387881, "balance_loss_mlp": 1.03517985, "epoch": 0.7671125807906207, "flos": 22452141398400.0, "grad_norm": 1.8444900627218639, "language_loss": 0.76319593, "learning_rate": 5.118578176649124e-07, "loss": 0.78466451, "num_input_tokens_seen": 275106975, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 12759, "time_per_iteration": 2.554633140563965 }, { "auxiliary_loss_clip": 0.01119982, "auxiliary_loss_mlp": 0.01028082, "balance_loss_clip": 1.01619065, "balance_loss_mlp": 1.03247666, "epoch": 0.7671727040432887, "flos": 35371543518720.0, "grad_norm": 2.644906001127598, "language_loss": 0.68155575, "learning_rate": 5.116054209533404e-07, "loss": 0.70303643, "num_input_tokens_seen": 275129560, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 12760, "time_per_iteration": 2.699108839035034 }, { "auxiliary_loss_clip": 0.01124175, "auxiliary_loss_mlp": 0.01034863, "balance_loss_clip": 1.0220418, "balance_loss_mlp": 1.0371877, "epoch": 0.7672328272959568, "flos": 22494947431680.0, "grad_norm": 1.51753043952528, "language_loss": 0.79346859, "learning_rate": 5.113530773587418e-07, "loss": 0.81505901, "num_input_tokens_seen": 275151180, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 12761, "time_per_iteration": 2.627028703689575 }, { "auxiliary_loss_clip": 0.01119774, "auxiliary_loss_mlp": 0.01029687, "balance_loss_clip": 1.01792121, "balance_loss_mlp": 1.0354296, "epoch": 0.7672929505486247, "flos": 22815557251200.0, "grad_norm": 1.6658753449047772, "language_loss": 0.66570985, "learning_rate": 5.111007868901232e-07, "loss": 0.68720448, "num_input_tokens_seen": 275170605, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6640625, "step": 12762, "time_per_iteration": 2.630009651184082 }, { "auxiliary_loss_clip": 0.01114995, "auxiliary_loss_mlp": 0.0103066, "balance_loss_clip": 1.01781487, "balance_loss_mlp": 1.03537083, "epoch": 0.7673530738012927, "flos": 20338834671360.0, "grad_norm": 2.1077246922609563, "language_loss": 0.74674058, "learning_rate": 5.108485495564876e-07, "loss": 0.76819718, "num_input_tokens_seen": 275188750, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 12763, "time_per_iteration": 2.528388023376465 }, { "auxiliary_loss_clip": 0.01124819, "auxiliary_loss_mlp": 0.0103099, "balance_loss_clip": 1.01825213, "balance_loss_mlp": 1.03854513, "epoch": 0.7674131970539606, "flos": 34933576988160.0, "grad_norm": 1.6834405093040863, "language_loss": 0.70176876, "learning_rate": 5.105963653668366e-07, "loss": 0.7233268, "num_input_tokens_seen": 275211365, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 12764, "time_per_iteration": 2.65989089012146 }, { "auxiliary_loss_clip": 0.01113559, "auxiliary_loss_mlp": 0.01028379, "balance_loss_clip": 1.01606452, "balance_loss_mlp": 1.03626752, "epoch": 0.7674733203066286, "flos": 28328850766080.0, "grad_norm": 1.6020332494967413, "language_loss": 0.69515663, "learning_rate": 5.103442343301696e-07, "loss": 0.71657598, "num_input_tokens_seen": 275231670, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 12765, "time_per_iteration": 2.6093127727508545 }, { "auxiliary_loss_clip": 0.01119842, "auxiliary_loss_mlp": 0.01027953, "balance_loss_clip": 1.01693761, "balance_loss_mlp": 1.03342485, "epoch": 0.7675334435592965, "flos": 16289727252480.0, "grad_norm": 2.2127719855983514, "language_loss": 0.60983682, "learning_rate": 5.100921564554863e-07, "loss": 0.63131475, "num_input_tokens_seen": 275249425, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.68359375, "step": 12766, "time_per_iteration": 2.5141241550445557 }, { "auxiliary_loss_clip": 0.01052582, "auxiliary_loss_mlp": 0.01003179, "balance_loss_clip": 1.00191581, "balance_loss_mlp": 1.0041064, "epoch": 0.7675935668119646, "flos": 64826232220800.0, "grad_norm": 0.731501876850627, "language_loss": 0.6076566, "learning_rate": 5.098401317517802e-07, "loss": 0.62821424, "num_input_tokens_seen": 275312485, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.21484375, "step": 12767, "time_per_iteration": 3.2869224548339844 }, { "auxiliary_loss_clip": 0.01120358, "auxiliary_loss_mlp": 0.0102639, "balance_loss_clip": 1.0148263, "balance_loss_mlp": 1.03530002, "epoch": 0.7676536900646325, "flos": 22675398382080.0, "grad_norm": 1.8029675577165876, "language_loss": 0.69357133, "learning_rate": 5.095881602280472e-07, "loss": 0.71503878, "num_input_tokens_seen": 275331680, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 12768, "time_per_iteration": 3.9288125038146973 }, { "auxiliary_loss_clip": 0.01120114, "auxiliary_loss_mlp": 0.01033467, "balance_loss_clip": 1.02043164, "balance_loss_mlp": 1.03862405, "epoch": 0.7677138133173005, "flos": 26939682224640.0, "grad_norm": 3.445043912433553, "language_loss": 0.70503455, "learning_rate": 5.093362418932796e-07, "loss": 0.72657037, "num_input_tokens_seen": 275351615, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 12769, "time_per_iteration": 2.5414180755615234 }, { "auxiliary_loss_clip": 0.01116237, "auxiliary_loss_mlp": 0.01026245, "balance_loss_clip": 1.01316142, "balance_loss_mlp": 1.03637445, "epoch": 0.7677739365699684, "flos": 23799545400960.0, "grad_norm": 2.2442036199480406, "language_loss": 0.8043443, "learning_rate": 5.090843767564659e-07, "loss": 0.82576919, "num_input_tokens_seen": 275368815, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12770, "time_per_iteration": 2.610762596130371 }, { "auxiliary_loss_clip": 0.01128764, "auxiliary_loss_mlp": 0.01030109, "balance_loss_clip": 1.01880836, "balance_loss_mlp": 1.03515458, "epoch": 0.7678340598226364, "flos": 34455497944320.0, "grad_norm": 1.5765838229071678, "language_loss": 0.78442574, "learning_rate": 5.088325648265961e-07, "loss": 0.80601442, "num_input_tokens_seen": 275389345, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 12771, "time_per_iteration": 2.6649301052093506 }, { "auxiliary_loss_clip": 0.01104064, "auxiliary_loss_mlp": 0.01029866, "balance_loss_clip": 1.01745665, "balance_loss_mlp": 1.03467226, "epoch": 0.7678941830753043, "flos": 23841740903040.0, "grad_norm": 1.7470010892288006, "language_loss": 0.68339169, "learning_rate": 5.085808061126559e-07, "loss": 0.70473099, "num_input_tokens_seen": 275411240, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 12772, "time_per_iteration": 2.569836139678955 }, { "auxiliary_loss_clip": 0.01109431, "auxiliary_loss_mlp": 0.01025497, "balance_loss_clip": 1.01404727, "balance_loss_mlp": 1.03464937, "epoch": 0.7679543063279723, "flos": 25410929431680.0, "grad_norm": 1.6271364729234958, "language_loss": 0.73082769, "learning_rate": 5.083291006236317e-07, "loss": 0.75217688, "num_input_tokens_seen": 275432010, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66015625, "step": 12773, "time_per_iteration": 2.638444423675537 }, { "auxiliary_loss_clip": 0.01127329, "auxiliary_loss_mlp": 0.01028436, "balance_loss_clip": 1.01718843, "balance_loss_mlp": 1.03421021, "epoch": 0.7680144295806404, "flos": 27962382257280.0, "grad_norm": 1.5966338366902433, "language_loss": 0.80727601, "learning_rate": 5.080774483685033e-07, "loss": 0.8288337, "num_input_tokens_seen": 275453710, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 12774, "time_per_iteration": 4.071376323699951 }, { "auxiliary_loss_clip": 0.01113031, "auxiliary_loss_mlp": 0.0127888, "balance_loss_clip": 1.01996851, "balance_loss_mlp": 1.03626299, "epoch": 0.7680745528333083, "flos": 20412810731520.0, "grad_norm": 1.6117695296434016, "language_loss": 0.6989519, "learning_rate": 5.078258493562539e-07, "loss": 0.72287095, "num_input_tokens_seen": 275472915, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 12775, "time_per_iteration": 2.612353801727295 }, { "auxiliary_loss_clip": 0.01061015, "auxiliary_loss_mlp": 0.01005034, "balance_loss_clip": 1.00376451, "balance_loss_mlp": 1.00420475, "epoch": 0.7681346760859763, "flos": 68401103351040.0, "grad_norm": 0.6934777602080356, "language_loss": 0.56806952, "learning_rate": 5.075743035958617e-07, "loss": 0.58873004, "num_input_tokens_seen": 275534785, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.21582031, "step": 12776, "time_per_iteration": 3.236389636993408 }, { "auxiliary_loss_clip": 0.0111112, "auxiliary_loss_mlp": 0.01034115, "balance_loss_clip": 1.02243841, "balance_loss_mlp": 1.03425741, "epoch": 0.7681947993386442, "flos": 21251468453760.0, "grad_norm": 1.7809182534833645, "language_loss": 0.73796964, "learning_rate": 5.073228110963035e-07, "loss": 0.75942206, "num_input_tokens_seen": 275553205, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 12777, "time_per_iteration": 2.5857863426208496 }, { "auxiliary_loss_clip": 0.01130132, "auxiliary_loss_mlp": 0.01034405, "balance_loss_clip": 1.0222342, "balance_loss_mlp": 1.03590322, "epoch": 0.7682549225913122, "flos": 21397696721280.0, "grad_norm": 2.059240843024706, "language_loss": 0.70741284, "learning_rate": 5.070713718665538e-07, "loss": 0.72905815, "num_input_tokens_seen": 275571490, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 12778, "time_per_iteration": 2.525007486343384 }, { "auxiliary_loss_clip": 0.01112141, "auxiliary_loss_mlp": 0.01031291, "balance_loss_clip": 1.01967359, "balance_loss_mlp": 1.03507328, "epoch": 0.7683150458439801, "flos": 23038921975680.0, "grad_norm": 1.6970084310084868, "language_loss": 0.70088899, "learning_rate": 5.068199859155875e-07, "loss": 0.72232336, "num_input_tokens_seen": 275589665, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 12779, "time_per_iteration": 2.5322751998901367 }, { "auxiliary_loss_clip": 0.01124537, "auxiliary_loss_mlp": 0.0102978, "balance_loss_clip": 1.01738262, "balance_loss_mlp": 1.03761458, "epoch": 0.7683751690966482, "flos": 67332397996800.0, "grad_norm": 1.5806297800189013, "language_loss": 0.58675921, "learning_rate": 5.065686532523748e-07, "loss": 0.60830235, "num_input_tokens_seen": 275615605, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 12780, "time_per_iteration": 2.9141829013824463 }, { "auxiliary_loss_clip": 0.01117404, "auxiliary_loss_mlp": 0.01039798, "balance_loss_clip": 1.02504563, "balance_loss_mlp": 1.03544819, "epoch": 0.7684352923493161, "flos": 21798890703360.0, "grad_norm": 1.7891145414907574, "language_loss": 0.68432331, "learning_rate": 5.063173738858852e-07, "loss": 0.7058953, "num_input_tokens_seen": 275634965, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.734375, "step": 12781, "time_per_iteration": 5.648432970046997 }, { "auxiliary_loss_clip": 0.01138116, "auxiliary_loss_mlp": 0.01031729, "balance_loss_clip": 1.01892579, "balance_loss_mlp": 1.03388917, "epoch": 0.7684954156019841, "flos": 25847603072640.0, "grad_norm": 1.506853730541853, "language_loss": 0.79379779, "learning_rate": 5.060661478250858e-07, "loss": 0.81549621, "num_input_tokens_seen": 275655785, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.68359375, "step": 12782, "time_per_iteration": 2.6012237071990967 }, { "auxiliary_loss_clip": 0.01123267, "auxiliary_loss_mlp": 0.01029836, "balance_loss_clip": 1.01712823, "balance_loss_mlp": 1.03484607, "epoch": 0.768555538854652, "flos": 25447378757760.0, "grad_norm": 14.96382580888413, "language_loss": 0.66786301, "learning_rate": 5.05814975078944e-07, "loss": 0.68939406, "num_input_tokens_seen": 275676160, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12783, "time_per_iteration": 2.5614535808563232 }, { "auxiliary_loss_clip": 0.01123476, "auxiliary_loss_mlp": 0.0103667, "balance_loss_clip": 1.02330017, "balance_loss_mlp": 1.03592634, "epoch": 0.76861566210732, "flos": 19646369303040.0, "grad_norm": 1.6909382984438444, "language_loss": 0.68874937, "learning_rate": 5.055638556564217e-07, "loss": 0.71035081, "num_input_tokens_seen": 275695660, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.69921875, "step": 12784, "time_per_iteration": 2.513514280319214 }, { "auxiliary_loss_clip": 0.01138919, "auxiliary_loss_mlp": 0.01026817, "balance_loss_clip": 1.01588511, "balance_loss_mlp": 1.03470612, "epoch": 0.7686757853599879, "flos": 22419032037120.0, "grad_norm": 1.785768137358516, "language_loss": 0.80400372, "learning_rate": 5.053127895664804e-07, "loss": 0.82566106, "num_input_tokens_seen": 275714025, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6875, "step": 12785, "time_per_iteration": 2.5401813983917236 }, { "auxiliary_loss_clip": 0.01113849, "auxiliary_loss_mlp": 0.01034465, "balance_loss_clip": 1.0211314, "balance_loss_mlp": 1.03516078, "epoch": 0.768735908612656, "flos": 47774262453120.0, "grad_norm": 1.5778567091144806, "language_loss": 0.77304089, "learning_rate": 5.050617768180823e-07, "loss": 0.79452401, "num_input_tokens_seen": 275737300, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 12786, "time_per_iteration": 2.747681140899658 }, { "auxiliary_loss_clip": 0.01102204, "auxiliary_loss_mlp": 0.0103112, "balance_loss_clip": 1.01834655, "balance_loss_mlp": 1.03405845, "epoch": 0.7687960318653239, "flos": 30263179000320.0, "grad_norm": 2.01207431278482, "language_loss": 0.58561742, "learning_rate": 5.048108174201826e-07, "loss": 0.60695064, "num_input_tokens_seen": 275757895, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 12787, "time_per_iteration": 2.5640718936920166 }, { "auxiliary_loss_clip": 0.01117947, "auxiliary_loss_mlp": 0.01029331, "balance_loss_clip": 1.01671314, "balance_loss_mlp": 1.03213143, "epoch": 0.7688561551179919, "flos": 19573434737280.0, "grad_norm": 1.6161399260818523, "language_loss": 0.7581706, "learning_rate": 5.045599113817394e-07, "loss": 0.77964336, "num_input_tokens_seen": 275776745, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 12788, "time_per_iteration": 2.5344808101654053 }, { "auxiliary_loss_clip": 0.01142528, "auxiliary_loss_mlp": 0.01285762, "balance_loss_clip": 1.02559495, "balance_loss_mlp": 1.03670502, "epoch": 0.7689162783706599, "flos": 22783776693120.0, "grad_norm": 1.8618061776650974, "language_loss": 0.66884434, "learning_rate": 5.043090587117056e-07, "loss": 0.69312716, "num_input_tokens_seen": 275797205, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 12789, "time_per_iteration": 2.5525522232055664 }, { "auxiliary_loss_clip": 0.01123745, "auxiliary_loss_mlp": 0.01035823, "balance_loss_clip": 1.02189326, "balance_loss_mlp": 1.0353477, "epoch": 0.7689764016233278, "flos": 34204195416960.0, "grad_norm": 1.801443495961803, "language_loss": 0.68681991, "learning_rate": 5.040582594190352e-07, "loss": 0.70841563, "num_input_tokens_seen": 275817935, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.70703125, "step": 12790, "time_per_iteration": 2.617788076400757 }, { "auxiliary_loss_clip": 0.01121989, "auxiliary_loss_mlp": 0.0103235, "balance_loss_clip": 1.01986861, "balance_loss_mlp": 1.03413808, "epoch": 0.7690365248759958, "flos": 17274469587840.0, "grad_norm": 1.7594776622178725, "language_loss": 0.68783599, "learning_rate": 5.038075135126765e-07, "loss": 0.70937932, "num_input_tokens_seen": 275837145, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 12791, "time_per_iteration": 2.6079094409942627 }, { "auxiliary_loss_clip": 0.01110681, "auxiliary_loss_mlp": 0.01033284, "balance_loss_clip": 1.02139306, "balance_loss_mlp": 1.03530979, "epoch": 0.7690966481286637, "flos": 18223157646720.0, "grad_norm": 1.8042164250406316, "language_loss": 0.79741859, "learning_rate": 5.035568210015795e-07, "loss": 0.81885827, "num_input_tokens_seen": 275855705, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66796875, "step": 12792, "time_per_iteration": 2.5133047103881836 }, { "auxiliary_loss_clip": 0.01136992, "auxiliary_loss_mlp": 0.0128092, "balance_loss_clip": 1.02010953, "balance_loss_mlp": 1.03857255, "epoch": 0.7691567713813318, "flos": 21537568281600.0, "grad_norm": 1.7059652448357134, "language_loss": 0.72736698, "learning_rate": 5.033061818946902e-07, "loss": 0.75154603, "num_input_tokens_seen": 275873930, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 12793, "time_per_iteration": 2.5283098220825195 }, { "auxiliary_loss_clip": 0.01132057, "auxiliary_loss_mlp": 0.0103415, "balance_loss_clip": 1.0217278, "balance_loss_mlp": 1.03600073, "epoch": 0.7692168946339997, "flos": 39379999720320.0, "grad_norm": 1.6434697604883495, "language_loss": 0.63616282, "learning_rate": 5.030555962009532e-07, "loss": 0.65782487, "num_input_tokens_seen": 275895895, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 12794, "time_per_iteration": 2.705476760864258 }, { "auxiliary_loss_clip": 0.01126866, "auxiliary_loss_mlp": 0.01029383, "balance_loss_clip": 1.01716971, "balance_loss_mlp": 1.03744936, "epoch": 0.7692770178866677, "flos": 25009950931200.0, "grad_norm": 1.8244519940706847, "language_loss": 0.7647813, "learning_rate": 5.028050639293111e-07, "loss": 0.78634381, "num_input_tokens_seen": 275917825, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 12795, "time_per_iteration": 2.6059155464172363 }, { "auxiliary_loss_clip": 0.01130863, "auxiliary_loss_mlp": 0.01029961, "balance_loss_clip": 1.01689577, "balance_loss_mlp": 1.03379166, "epoch": 0.7693371411393356, "flos": 24716273333760.0, "grad_norm": 1.5225908557306607, "language_loss": 0.71491992, "learning_rate": 5.025545850887054e-07, "loss": 0.7365281, "num_input_tokens_seen": 275937890, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12796, "time_per_iteration": 2.5518338680267334 }, { "auxiliary_loss_clip": 0.01103667, "auxiliary_loss_mlp": 0.01025705, "balance_loss_clip": 1.0131942, "balance_loss_mlp": 1.033988, "epoch": 0.7693972643920036, "flos": 15924803028480.0, "grad_norm": 1.735401986785093, "language_loss": 0.64916277, "learning_rate": 5.023041596880748e-07, "loss": 0.67045653, "num_input_tokens_seen": 275954495, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12797, "time_per_iteration": 2.4949095249176025 }, { "auxiliary_loss_clip": 0.0113346, "auxiliary_loss_mlp": 0.01032427, "balance_loss_clip": 1.01887846, "balance_loss_mlp": 1.03636885, "epoch": 0.7694573876446715, "flos": 25405901527680.0, "grad_norm": 1.907598694218816, "language_loss": 0.91442573, "learning_rate": 5.02053787736356e-07, "loss": 0.93608463, "num_input_tokens_seen": 275972395, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 12798, "time_per_iteration": 2.5851190090179443 }, { "auxiliary_loss_clip": 0.01138891, "auxiliary_loss_mlp": 0.01026088, "balance_loss_clip": 1.01367795, "balance_loss_mlp": 1.03359222, "epoch": 0.7695175108973396, "flos": 16654220513280.0, "grad_norm": 2.00399049710198, "language_loss": 0.82472527, "learning_rate": 5.018034692424843e-07, "loss": 0.84637505, "num_input_tokens_seen": 275989020, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 12799, "time_per_iteration": 2.5692808628082275 }, { "auxiliary_loss_clip": 0.01150061, "auxiliary_loss_mlp": 0.0103103, "balance_loss_clip": 1.01795244, "balance_loss_mlp": 1.03619075, "epoch": 0.7695776341500075, "flos": 13626520237440.0, "grad_norm": 1.9668917915850437, "language_loss": 0.77532864, "learning_rate": 5.015532042153933e-07, "loss": 0.79713953, "num_input_tokens_seen": 276006525, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 12800, "time_per_iteration": 2.569735050201416 }, { "auxiliary_loss_clip": 0.01126548, "auxiliary_loss_mlp": 0.01027035, "balance_loss_clip": 1.01331377, "balance_loss_mlp": 1.03484583, "epoch": 0.7696377574026755, "flos": 24276690691200.0, "grad_norm": 2.091260113235571, "language_loss": 0.83570874, "learning_rate": 5.013029926640138e-07, "loss": 0.85724455, "num_input_tokens_seen": 276027130, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 12801, "time_per_iteration": 2.6418020725250244 }, { "auxiliary_loss_clip": 0.0112121, "auxiliary_loss_mlp": 0.01029766, "balance_loss_clip": 1.01693928, "balance_loss_mlp": 1.03510571, "epoch": 0.7696978806553435, "flos": 20923137210240.0, "grad_norm": 1.8434540961846388, "language_loss": 0.72301418, "learning_rate": 5.010528345972749e-07, "loss": 0.744524, "num_input_tokens_seen": 276045715, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 12802, "time_per_iteration": 2.5497219562530518 }, { "auxiliary_loss_clip": 0.01120776, "auxiliary_loss_mlp": 0.01031092, "balance_loss_clip": 1.01901054, "balance_loss_mlp": 1.03489184, "epoch": 0.7697580039080114, "flos": 22929609911040.0, "grad_norm": 1.786874232938472, "language_loss": 0.75973094, "learning_rate": 5.008027300241056e-07, "loss": 0.78124964, "num_input_tokens_seen": 276065375, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 12803, "time_per_iteration": 2.574552536010742 }, { "auxiliary_loss_clip": 0.01042658, "auxiliary_loss_mlp": 0.01001101, "balance_loss_clip": 0.99979556, "balance_loss_mlp": 1.0034678, "epoch": 0.7698181271606794, "flos": 68717654933760.0, "grad_norm": 0.7537164268213943, "language_loss": 0.55877477, "learning_rate": 5.005526789534294e-07, "loss": 0.57921237, "num_input_tokens_seen": 276131405, "router_z_loss_clip": 0.01306152, "router_z_loss_mlp": 0.21582031, "step": 12804, "time_per_iteration": 3.186469554901123 }, { "auxiliary_loss_clip": 0.01042436, "auxiliary_loss_mlp": 0.01003037, "balance_loss_clip": 1.00173175, "balance_loss_mlp": 1.00298762, "epoch": 0.7698782504133473, "flos": 67409716999680.0, "grad_norm": 0.747752461557944, "language_loss": 0.54027236, "learning_rate": 5.003026813941715e-07, "loss": 0.56072712, "num_input_tokens_seen": 276200755, "router_z_loss_clip": 0.01306152, "router_z_loss_mlp": 0.21679688, "step": 12805, "time_per_iteration": 3.310330390930176 }, { "auxiliary_loss_clip": 0.01135241, "auxiliary_loss_mlp": 0.01032089, "balance_loss_clip": 1.01920807, "balance_loss_mlp": 1.03873289, "epoch": 0.7699383736660154, "flos": 22488842119680.0, "grad_norm": 2.5175966304514845, "language_loss": 0.72823501, "learning_rate": 5.000527373552528e-07, "loss": 0.74990833, "num_input_tokens_seen": 276217880, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 12806, "time_per_iteration": 2.68855881690979 }, { "auxiliary_loss_clip": 0.01122747, "auxiliary_loss_mlp": 0.01034206, "balance_loss_clip": 1.02125955, "balance_loss_mlp": 1.0349412, "epoch": 0.7699984969186833, "flos": 21539723097600.0, "grad_norm": 1.767321450972394, "language_loss": 0.75166172, "learning_rate": 4.998028468455946e-07, "loss": 0.77323127, "num_input_tokens_seen": 276234810, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 12807, "time_per_iteration": 2.618276834487915 }, { "auxiliary_loss_clip": 0.01118186, "auxiliary_loss_mlp": 0.01030969, "balance_loss_clip": 1.0176059, "balance_loss_mlp": 1.03652382, "epoch": 0.7700586201713513, "flos": 21719096640000.0, "grad_norm": 1.9933253307754009, "language_loss": 0.80092001, "learning_rate": 4.995530098741128e-07, "loss": 0.82241166, "num_input_tokens_seen": 276252850, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 12808, "time_per_iteration": 2.568138599395752 }, { "auxiliary_loss_clip": 0.01116804, "auxiliary_loss_mlp": 0.01034678, "balance_loss_clip": 1.02158308, "balance_loss_mlp": 1.03528023, "epoch": 0.7701187434240192, "flos": 27856015107840.0, "grad_norm": 1.8711390785173152, "language_loss": 0.79216802, "learning_rate": 4.993032264497248e-07, "loss": 0.81368279, "num_input_tokens_seen": 276272525, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 12809, "time_per_iteration": 2.6145663261413574 }, { "auxiliary_loss_clip": 0.01126827, "auxiliary_loss_mlp": 0.01025138, "balance_loss_clip": 1.01342607, "balance_loss_mlp": 1.03410304, "epoch": 0.7701788666766872, "flos": 28621307301120.0, "grad_norm": 1.646140822438635, "language_loss": 0.70388389, "learning_rate": 4.990534965813446e-07, "loss": 0.72540355, "num_input_tokens_seen": 276294210, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66015625, "step": 12810, "time_per_iteration": 4.0273191928863525 }, { "auxiliary_loss_clip": 0.01125713, "auxiliary_loss_mlp": 0.01034397, "balance_loss_clip": 1.02070594, "balance_loss_mlp": 1.03558457, "epoch": 0.7702389899293551, "flos": 14246446089600.0, "grad_norm": 2.284484065169696, "language_loss": 0.78218031, "learning_rate": 4.988038202778842e-07, "loss": 0.80378139, "num_input_tokens_seen": 276310290, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 12811, "time_per_iteration": 2.5803301334381104 }, { "auxiliary_loss_clip": 0.01043051, "auxiliary_loss_mlp": 0.01001445, "balance_loss_clip": 1.00021672, "balance_loss_mlp": 1.00326633, "epoch": 0.7702991131820232, "flos": 70574128439040.0, "grad_norm": 0.8108579094265195, "language_loss": 0.56731856, "learning_rate": 4.985541975482533e-07, "loss": 0.58776355, "num_input_tokens_seen": 276371715, "router_z_loss_clip": 0.01226807, "router_z_loss_mlp": 0.21386719, "step": 12812, "time_per_iteration": 3.2226693630218506 }, { "auxiliary_loss_clip": 0.0113262, "auxiliary_loss_mlp": 0.01033947, "balance_loss_clip": 1.02144241, "balance_loss_mlp": 1.03541386, "epoch": 0.7703592364346911, "flos": 25480021242240.0, "grad_norm": 1.6537977299628672, "language_loss": 0.72181541, "learning_rate": 4.983046284013615e-07, "loss": 0.74348116, "num_input_tokens_seen": 276389895, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 12813, "time_per_iteration": 2.659302234649658 }, { "auxiliary_loss_clip": 0.01113651, "auxiliary_loss_mlp": 0.01030023, "balance_loss_clip": 1.01642752, "balance_loss_mlp": 1.03422356, "epoch": 0.7704193596873591, "flos": 19280906375040.0, "grad_norm": 1.573377898754671, "language_loss": 0.66102505, "learning_rate": 4.980551128461152e-07, "loss": 0.68246174, "num_input_tokens_seen": 276408990, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 12814, "time_per_iteration": 2.591442823410034 }, { "auxiliary_loss_clip": 0.0112249, "auxiliary_loss_mlp": 0.01032302, "balance_loss_clip": 1.02033985, "balance_loss_mlp": 1.03514862, "epoch": 0.7704794829400271, "flos": 23658452778240.0, "grad_norm": 1.927360987175474, "language_loss": 0.65663469, "learning_rate": 4.978056508914175e-07, "loss": 0.67818266, "num_input_tokens_seen": 276428190, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 12815, "time_per_iteration": 2.5820388793945312 }, { "auxiliary_loss_clip": 0.01115261, "auxiliary_loss_mlp": 0.01032782, "balance_loss_clip": 1.01987803, "balance_loss_mlp": 1.03602004, "epoch": 0.770539606192695, "flos": 18989311766400.0, "grad_norm": 2.145229958008069, "language_loss": 0.65259445, "learning_rate": 4.975562425461723e-07, "loss": 0.67407489, "num_input_tokens_seen": 276446855, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 12816, "time_per_iteration": 3.948298692703247 }, { "auxiliary_loss_clip": 0.01114343, "auxiliary_loss_mlp": 0.01033828, "balance_loss_clip": 1.02111983, "balance_loss_mlp": 1.03640199, "epoch": 0.770599729445363, "flos": 11830160142720.0, "grad_norm": 2.0508688093377008, "language_loss": 0.71943343, "learning_rate": 4.973068878192803e-07, "loss": 0.74091518, "num_input_tokens_seen": 276462000, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 12817, "time_per_iteration": 2.4791719913482666 }, { "auxiliary_loss_clip": 0.01139459, "auxiliary_loss_mlp": 0.01033478, "balance_loss_clip": 1.02121103, "balance_loss_mlp": 1.0348897, "epoch": 0.770659852698031, "flos": 17822610109440.0, "grad_norm": 2.433061948936357, "language_loss": 0.6126827, "learning_rate": 4.9705758671964e-07, "loss": 0.63441205, "num_input_tokens_seen": 276481190, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 12818, "time_per_iteration": 2.569068670272827 }, { "auxiliary_loss_clip": 0.01111308, "auxiliary_loss_mlp": 0.0102873, "balance_loss_clip": 1.01762533, "balance_loss_mlp": 1.03530061, "epoch": 0.770719975950699, "flos": 21871968923520.0, "grad_norm": 2.4471025498114494, "language_loss": 0.67220098, "learning_rate": 4.96808339256148e-07, "loss": 0.69360137, "num_input_tokens_seen": 276499520, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.671875, "step": 12819, "time_per_iteration": 2.5598134994506836 }, { "auxiliary_loss_clip": 0.01106162, "auxiliary_loss_mlp": 0.01028289, "balance_loss_clip": 1.01582551, "balance_loss_mlp": 1.03618574, "epoch": 0.7707800992033669, "flos": 21325049464320.0, "grad_norm": 1.6472784501191293, "language_loss": 0.57567, "learning_rate": 4.965591454377005e-07, "loss": 0.59701449, "num_input_tokens_seen": 276519110, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 12820, "time_per_iteration": 2.5147111415863037 }, { "auxiliary_loss_clip": 0.01122059, "auxiliary_loss_mlp": 0.01030803, "balance_loss_clip": 1.01817262, "balance_loss_mlp": 1.03487885, "epoch": 0.7708402224560349, "flos": 28179426188160.0, "grad_norm": 1.840438808245855, "language_loss": 0.80708724, "learning_rate": 4.96310005273189e-07, "loss": 0.82861584, "num_input_tokens_seen": 276538805, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 12821, "time_per_iteration": 2.666212558746338 }, { "auxiliary_loss_clip": 0.01105513, "auxiliary_loss_mlp": 0.01033326, "balance_loss_clip": 1.02066612, "balance_loss_mlp": 1.03581023, "epoch": 0.7709003457087028, "flos": 15377057556480.0, "grad_norm": 2.010935089378442, "language_loss": 0.682468, "learning_rate": 4.960609187715057e-07, "loss": 0.70385635, "num_input_tokens_seen": 276554770, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 12822, "time_per_iteration": 2.4660086631774902 }, { "auxiliary_loss_clip": 0.0112141, "auxiliary_loss_mlp": 0.01036043, "balance_loss_clip": 1.02352571, "balance_loss_mlp": 1.03556705, "epoch": 0.7709604689613708, "flos": 30621854257920.0, "grad_norm": 1.5858235765115518, "language_loss": 0.72125268, "learning_rate": 4.958118859415393e-07, "loss": 0.74282718, "num_input_tokens_seen": 276574535, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 12823, "time_per_iteration": 5.601003885269165 }, { "auxiliary_loss_clip": 0.01119015, "auxiliary_loss_mlp": 0.01037178, "balance_loss_clip": 1.02289104, "balance_loss_mlp": 1.03672075, "epoch": 0.7710205922140387, "flos": 20301272023680.0, "grad_norm": 6.944050045153098, "language_loss": 0.7649678, "learning_rate": 4.955629067921785e-07, "loss": 0.78652972, "num_input_tokens_seen": 276592925, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.734375, "step": 12824, "time_per_iteration": 2.526991605758667 }, { "auxiliary_loss_clip": 0.01120702, "auxiliary_loss_mlp": 0.01027507, "balance_loss_clip": 1.01481724, "balance_loss_mlp": 1.03416109, "epoch": 0.7710807154667068, "flos": 19644214487040.0, "grad_norm": 4.68145715963981, "language_loss": 0.71974218, "learning_rate": 4.953139813323066e-07, "loss": 0.74122423, "num_input_tokens_seen": 276610540, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 12825, "time_per_iteration": 2.674546003341675 }, { "auxiliary_loss_clip": 0.01101543, "auxiliary_loss_mlp": 0.01032558, "balance_loss_clip": 1.02154291, "balance_loss_mlp": 1.03544736, "epoch": 0.7711408387193747, "flos": 20006337450240.0, "grad_norm": 1.4186507644110204, "language_loss": 0.7358672, "learning_rate": 4.950651095708087e-07, "loss": 0.75720817, "num_input_tokens_seen": 276629200, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66015625, "step": 12826, "time_per_iteration": 2.4956562519073486 }, { "auxiliary_loss_clip": 0.01105715, "auxiliary_loss_mlp": 0.01032276, "balance_loss_clip": 1.0193541, "balance_loss_mlp": 1.03364396, "epoch": 0.7712009619720427, "flos": 24971131307520.0, "grad_norm": 1.6020281989572065, "language_loss": 0.81785285, "learning_rate": 4.948162915165659e-07, "loss": 0.83923274, "num_input_tokens_seen": 276648655, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 12827, "time_per_iteration": 2.5495572090148926 }, { "auxiliary_loss_clip": 0.01024718, "auxiliary_loss_mlp": 0.0100098, "balance_loss_clip": 0.99979401, "balance_loss_mlp": 1.00304198, "epoch": 0.7712610852247107, "flos": 63249681404160.0, "grad_norm": 0.8584284580150721, "language_loss": 0.55120873, "learning_rate": 4.945675271784577e-07, "loss": 0.57146567, "num_input_tokens_seen": 276716500, "router_z_loss_clip": 0.01184082, "router_z_loss_mlp": 0.21679688, "step": 12828, "time_per_iteration": 3.2570135593414307 }, { "auxiliary_loss_clip": 0.01129727, "auxiliary_loss_mlp": 0.01029759, "balance_loss_clip": 1.0180645, "balance_loss_mlp": 1.03539932, "epoch": 0.7713212084773786, "flos": 18697860812160.0, "grad_norm": 1.713926834246484, "language_loss": 0.69592148, "learning_rate": 4.943188165653622e-07, "loss": 0.71751636, "num_input_tokens_seen": 276733535, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.66796875, "step": 12829, "time_per_iteration": 2.580303192138672 }, { "auxiliary_loss_clip": 0.01126738, "auxiliary_loss_mlp": 0.01031897, "balance_loss_clip": 1.01863503, "balance_loss_mlp": 1.03647864, "epoch": 0.7713813317300466, "flos": 14173367869440.0, "grad_norm": 2.397605435398398, "language_loss": 0.79419649, "learning_rate": 4.940701596861552e-07, "loss": 0.8157829, "num_input_tokens_seen": 276749575, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 12830, "time_per_iteration": 2.512185573577881 }, { "auxiliary_loss_clip": 0.01114823, "auxiliary_loss_mlp": 0.01032362, "balance_loss_clip": 1.01878417, "balance_loss_mlp": 1.03424311, "epoch": 0.7714414549827145, "flos": 25703960584320.0, "grad_norm": 1.6574462986723664, "language_loss": 0.78205907, "learning_rate": 4.938215565497102e-07, "loss": 0.80353093, "num_input_tokens_seen": 276769460, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 12831, "time_per_iteration": 2.5732879638671875 }, { "auxiliary_loss_clip": 0.0112206, "auxiliary_loss_mlp": 0.01027662, "balance_loss_clip": 1.01529384, "balance_loss_mlp": 1.03416061, "epoch": 0.7715015782353826, "flos": 30555312312960.0, "grad_norm": 1.4113748470295833, "language_loss": 0.61155117, "learning_rate": 4.935730071648992e-07, "loss": 0.63304842, "num_input_tokens_seen": 276790820, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 12832, "time_per_iteration": 2.6645753383636475 }, { "auxiliary_loss_clip": 0.01122997, "auxiliary_loss_mlp": 0.01037168, "balance_loss_clip": 1.02438855, "balance_loss_mlp": 1.03410411, "epoch": 0.7715617014880505, "flos": 20229343038720.0, "grad_norm": 1.5548367451288718, "language_loss": 0.79397881, "learning_rate": 4.933245115405928e-07, "loss": 0.81558043, "num_input_tokens_seen": 276811345, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 12833, "time_per_iteration": 2.576941728591919 }, { "auxiliary_loss_clip": 0.01059615, "auxiliary_loss_mlp": 0.0100093, "balance_loss_clip": 0.99977964, "balance_loss_mlp": 1.00313985, "epoch": 0.7716218247407185, "flos": 63664770971520.0, "grad_norm": 0.8507829692632357, "language_loss": 0.55424082, "learning_rate": 4.930760696856593e-07, "loss": 0.57484627, "num_input_tokens_seen": 276870950, "router_z_loss_clip": 0.01147461, "router_z_loss_mlp": 0.21484375, "step": 12834, "time_per_iteration": 3.243528127670288 }, { "auxiliary_loss_clip": 0.01140738, "auxiliary_loss_mlp": 0.01030597, "balance_loss_clip": 1.0180974, "balance_loss_mlp": 1.03537631, "epoch": 0.7716819479933864, "flos": 19791807471360.0, "grad_norm": 2.803771595307876, "language_loss": 0.73076963, "learning_rate": 4.928276816089643e-07, "loss": 0.75248289, "num_input_tokens_seen": 276890760, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12835, "time_per_iteration": 2.623699188232422 }, { "auxiliary_loss_clip": 0.01129925, "auxiliary_loss_mlp": 0.01280625, "balance_loss_clip": 1.02103257, "balance_loss_mlp": 1.03328812, "epoch": 0.7717420712460544, "flos": 18442176825600.0, "grad_norm": 1.621902092996426, "language_loss": 0.70318413, "learning_rate": 4.92579347319372e-07, "loss": 0.72728956, "num_input_tokens_seen": 276909625, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 12836, "time_per_iteration": 2.605360746383667 }, { "auxiliary_loss_clip": 0.01119334, "auxiliary_loss_mlp": 0.01031165, "balance_loss_clip": 1.01901722, "balance_loss_mlp": 1.03473806, "epoch": 0.7718021944987223, "flos": 35189476456320.0, "grad_norm": 1.6869907960238057, "language_loss": 0.59337914, "learning_rate": 4.923310668257466e-07, "loss": 0.61488414, "num_input_tokens_seen": 276930760, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 12837, "time_per_iteration": 2.6511847972869873 }, { "auxiliary_loss_clip": 0.01033392, "auxiliary_loss_mlp": 0.00999316, "balance_loss_clip": 0.9980759, "balance_loss_mlp": 1.00269032, "epoch": 0.7718623177513904, "flos": 67923167961600.0, "grad_norm": 0.8818873017652621, "language_loss": 0.55813622, "learning_rate": 4.920828401369457e-07, "loss": 0.57846332, "num_input_tokens_seen": 276989580, "router_z_loss_clip": 0.01239014, "router_z_loss_mlp": 0.21484375, "step": 12838, "time_per_iteration": 3.1023871898651123 }, { "auxiliary_loss_clip": 0.01111452, "auxiliary_loss_mlp": 0.01025055, "balance_loss_clip": 1.01370001, "balance_loss_mlp": 1.03456187, "epoch": 0.7719224410040583, "flos": 18581401941120.0, "grad_norm": 2.3659352699180927, "language_loss": 0.69507623, "learning_rate": 4.918346672618303e-07, "loss": 0.71644139, "num_input_tokens_seen": 277005450, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.68359375, "step": 12839, "time_per_iteration": 2.4369149208068848 }, { "auxiliary_loss_clip": 0.0111125, "auxiliary_loss_mlp": 0.01025739, "balance_loss_clip": 1.0137527, "balance_loss_mlp": 1.03504777, "epoch": 0.7719825642567263, "flos": 23075802264960.0, "grad_norm": 1.853874920981627, "language_loss": 0.79463899, "learning_rate": 4.915865482092554e-07, "loss": 0.81600893, "num_input_tokens_seen": 277023055, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 12840, "time_per_iteration": 2.5218968391418457 }, { "auxiliary_loss_clip": 0.01113032, "auxiliary_loss_mlp": 0.01280023, "balance_loss_clip": 1.02015948, "balance_loss_mlp": 1.03534985, "epoch": 0.7720426875093943, "flos": 20339086066560.0, "grad_norm": 1.9557159141091287, "language_loss": 0.80034167, "learning_rate": 4.913384829880778e-07, "loss": 0.82427216, "num_input_tokens_seen": 277041150, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 12841, "time_per_iteration": 2.5030057430267334 }, { "auxiliary_loss_clip": 0.01130913, "auxiliary_loss_mlp": 0.01028944, "balance_loss_clip": 1.01613522, "balance_loss_mlp": 1.03494501, "epoch": 0.7721028107620622, "flos": 23880704181120.0, "grad_norm": 1.4693181191639124, "language_loss": 0.76533651, "learning_rate": 4.910904716071476e-07, "loss": 0.78693497, "num_input_tokens_seen": 277063895, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 12842, "time_per_iteration": 2.6106646060943604 }, { "auxiliary_loss_clip": 0.01143986, "auxiliary_loss_mlp": 0.01280703, "balance_loss_clip": 1.02159679, "balance_loss_mlp": 1.03635049, "epoch": 0.7721629340147302, "flos": 26651571235200.0, "grad_norm": 1.689183089964437, "language_loss": 0.68496031, "learning_rate": 4.908425140753178e-07, "loss": 0.70920724, "num_input_tokens_seen": 277084045, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7109375, "step": 12843, "time_per_iteration": 2.5730862617492676 }, { "auxiliary_loss_clip": 0.01122541, "auxiliary_loss_mlp": 0.0102865, "balance_loss_clip": 1.01630569, "balance_loss_mlp": 1.03284168, "epoch": 0.7722230572673981, "flos": 21178857110400.0, "grad_norm": 1.9090436689812418, "language_loss": 0.73500526, "learning_rate": 4.905946104014373e-07, "loss": 0.75651723, "num_input_tokens_seen": 277102625, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71875, "step": 12844, "time_per_iteration": 2.5277788639068604 }, { "auxiliary_loss_clip": 0.01110741, "auxiliary_loss_mlp": 0.0103064, "balance_loss_clip": 1.01873636, "balance_loss_mlp": 1.03362167, "epoch": 0.7722831805200662, "flos": 27964644814080.0, "grad_norm": 1.4705098995371355, "language_loss": 0.71471453, "learning_rate": 4.903467605943515e-07, "loss": 0.73612833, "num_input_tokens_seen": 277123210, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 12845, "time_per_iteration": 2.5284085273742676 }, { "auxiliary_loss_clip": 0.01099671, "auxiliary_loss_mlp": 0.01031064, "balance_loss_clip": 1.01917922, "balance_loss_mlp": 1.03335834, "epoch": 0.7723433037727341, "flos": 33875576864640.0, "grad_norm": 1.8308031612495719, "language_loss": 0.64373785, "learning_rate": 4.900989646629068e-07, "loss": 0.6650452, "num_input_tokens_seen": 277144895, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6640625, "step": 12846, "time_per_iteration": 2.633483648300171 }, { "auxiliary_loss_clip": 0.01033942, "auxiliary_loss_mlp": 0.0124754, "balance_loss_clip": 1.00053489, "balance_loss_mlp": 1.00323725, "epoch": 0.7724034270254021, "flos": 62848271940480.0, "grad_norm": 0.8683678046544717, "language_loss": 0.61778075, "learning_rate": 4.898512226159461e-07, "loss": 0.64059561, "num_input_tokens_seen": 277205160, "router_z_loss_clip": 0.01184082, "router_z_loss_mlp": 0.21484375, "step": 12847, "time_per_iteration": 3.085268974304199 }, { "auxiliary_loss_clip": 0.01101572, "auxiliary_loss_mlp": 0.01029893, "balance_loss_clip": 1.01789427, "balance_loss_mlp": 1.03202701, "epoch": 0.77246355027807, "flos": 23295467888640.0, "grad_norm": 1.701244086908314, "language_loss": 0.78665578, "learning_rate": 4.896035344623108e-07, "loss": 0.8079704, "num_input_tokens_seen": 277223005, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 12848, "time_per_iteration": 2.607201099395752 }, { "auxiliary_loss_clip": 0.01133366, "auxiliary_loss_mlp": 0.01031072, "balance_loss_clip": 1.01932418, "balance_loss_mlp": 1.03621316, "epoch": 0.772523673530738, "flos": 20121287950080.0, "grad_norm": 1.6885382336896546, "language_loss": 0.72589469, "learning_rate": 4.893559002108396e-07, "loss": 0.74753904, "num_input_tokens_seen": 277241785, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.70703125, "step": 12849, "time_per_iteration": 2.5413405895233154 }, { "auxiliary_loss_clip": 0.0110216, "auxiliary_loss_mlp": 0.01030789, "balance_loss_clip": 1.01878452, "balance_loss_mlp": 1.03344834, "epoch": 0.7725837967834059, "flos": 17820096157440.0, "grad_norm": 1.7809292437284998, "language_loss": 0.78840053, "learning_rate": 4.891083198703711e-07, "loss": 0.80972999, "num_input_tokens_seen": 277259050, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 12850, "time_per_iteration": 2.4978229999542236 }, { "auxiliary_loss_clip": 0.01147561, "auxiliary_loss_mlp": 0.01281383, "balance_loss_clip": 1.02221894, "balance_loss_mlp": 1.03216386, "epoch": 0.772643920036074, "flos": 27198921657600.0, "grad_norm": 1.5792373948918381, "language_loss": 0.79926115, "learning_rate": 4.888607934497402e-07, "loss": 0.82355058, "num_input_tokens_seen": 277278235, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7109375, "step": 12851, "time_per_iteration": 3.9575932025909424 }, { "auxiliary_loss_clip": 0.01111332, "auxiliary_loss_mlp": 0.01027642, "balance_loss_clip": 1.01522052, "balance_loss_mlp": 1.03425431, "epoch": 0.7727040432887419, "flos": 21579512388480.0, "grad_norm": 1.519936533740312, "language_loss": 0.73871261, "learning_rate": 4.886133209577803e-07, "loss": 0.76010233, "num_input_tokens_seen": 277298355, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 12852, "time_per_iteration": 2.518401622772217 }, { "auxiliary_loss_clip": 0.01099865, "auxiliary_loss_mlp": 0.01036973, "balance_loss_clip": 1.02495098, "balance_loss_mlp": 1.03195834, "epoch": 0.7727641665414099, "flos": 22236641752320.0, "grad_norm": 1.7329231335859574, "language_loss": 0.82135469, "learning_rate": 4.883659024033228e-07, "loss": 0.84272301, "num_input_tokens_seen": 277316095, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 12853, "time_per_iteration": 2.5883424282073975 }, { "auxiliary_loss_clip": 0.01024766, "auxiliary_loss_mlp": 0.01002927, "balance_loss_clip": 1.00174069, "balance_loss_mlp": 1.00327897, "epoch": 0.7728242897940779, "flos": 54832221463680.0, "grad_norm": 0.7873648553464976, "language_loss": 0.5456664, "learning_rate": 4.88118537795199e-07, "loss": 0.5659433, "num_input_tokens_seen": 277380130, "router_z_loss_clip": 0.01184082, "router_z_loss_mlp": 0.21484375, "step": 12854, "time_per_iteration": 3.1525349617004395 }, { "auxiliary_loss_clip": 0.01114338, "auxiliary_loss_mlp": 0.01029876, "balance_loss_clip": 1.01648879, "balance_loss_mlp": 1.03455925, "epoch": 0.7728844130467458, "flos": 34461962392320.0, "grad_norm": 1.5024966628869547, "language_loss": 0.71620655, "learning_rate": 4.878712271422342e-07, "loss": 0.73764873, "num_input_tokens_seen": 277404015, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 12855, "time_per_iteration": 2.706857204437256 }, { "auxiliary_loss_clip": 0.01103261, "auxiliary_loss_mlp": 0.01029185, "balance_loss_clip": 1.01635814, "balance_loss_mlp": 1.03448558, "epoch": 0.7729445362994138, "flos": 18916341287040.0, "grad_norm": 1.7116086215503235, "language_loss": 0.67996168, "learning_rate": 4.876239704532566e-07, "loss": 0.70128614, "num_input_tokens_seen": 277421375, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 12856, "time_per_iteration": 2.5135488510131836 }, { "auxiliary_loss_clip": 0.01131923, "auxiliary_loss_mlp": 0.01028705, "balance_loss_clip": 1.01710618, "balance_loss_mlp": 1.03500736, "epoch": 0.7730046595520818, "flos": 22200048771840.0, "grad_norm": 2.071221976310875, "language_loss": 0.78986013, "learning_rate": 4.873767677370884e-07, "loss": 0.8114664, "num_input_tokens_seen": 277440170, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.703125, "step": 12857, "time_per_iteration": 4.031262397766113 }, { "auxiliary_loss_clip": 0.01146977, "auxiliary_loss_mlp": 0.0127837, "balance_loss_clip": 1.0191251, "balance_loss_mlp": 1.03551507, "epoch": 0.7730647828047498, "flos": 13552328695680.0, "grad_norm": 2.3756826899341, "language_loss": 0.7822237, "learning_rate": 4.871296190025535e-07, "loss": 0.80647719, "num_input_tokens_seen": 277456880, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.671875, "step": 12858, "time_per_iteration": 2.7222206592559814 }, { "auxiliary_loss_clip": 0.01103329, "auxiliary_loss_mlp": 0.01029755, "balance_loss_clip": 1.01739848, "balance_loss_mlp": 1.03356028, "epoch": 0.7731249060574177, "flos": 21976037602560.0, "grad_norm": 1.9175356174766625, "language_loss": 0.76535803, "learning_rate": 4.868825242584704e-07, "loss": 0.78668892, "num_input_tokens_seen": 277475365, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 12859, "time_per_iteration": 2.561296224594116 }, { "auxiliary_loss_clip": 0.01110044, "auxiliary_loss_mlp": 0.0102982, "balance_loss_clip": 1.01878142, "balance_loss_mlp": 1.0359956, "epoch": 0.7731850293100857, "flos": 22601817371520.0, "grad_norm": 3.228511615782021, "language_loss": 0.67745107, "learning_rate": 4.866354835136575e-07, "loss": 0.69884968, "num_input_tokens_seen": 277494975, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.65234375, "step": 12860, "time_per_iteration": 2.556891679763794 }, { "auxiliary_loss_clip": 0.01105556, "auxiliary_loss_mlp": 0.01033718, "balance_loss_clip": 1.02120137, "balance_loss_mlp": 1.03373241, "epoch": 0.7732451525627536, "flos": 14098422142080.0, "grad_norm": 2.2583898436249554, "language_loss": 0.7475847, "learning_rate": 4.863884967769323e-07, "loss": 0.7689774, "num_input_tokens_seen": 277510520, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 12861, "time_per_iteration": 2.4672796726226807 }, { "auxiliary_loss_clip": 0.01116602, "auxiliary_loss_mlp": 0.01029218, "balance_loss_clip": 1.01636159, "balance_loss_mlp": 1.03585923, "epoch": 0.7733052758154216, "flos": 21470020755840.0, "grad_norm": 1.7843255241237506, "language_loss": 0.74752378, "learning_rate": 4.86141564057107e-07, "loss": 0.76898205, "num_input_tokens_seen": 277530505, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 12862, "time_per_iteration": 2.5365638732910156 }, { "auxiliary_loss_clip": 0.01135798, "auxiliary_loss_mlp": 0.01030437, "balance_loss_clip": 1.01930332, "balance_loss_mlp": 1.03389072, "epoch": 0.7733653990680895, "flos": 21394284929280.0, "grad_norm": 1.52815320132478, "language_loss": 0.83155811, "learning_rate": 4.858946853629957e-07, "loss": 0.85322046, "num_input_tokens_seen": 277550810, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.66796875, "step": 12863, "time_per_iteration": 2.5521960258483887 }, { "auxiliary_loss_clip": 0.01102484, "auxiliary_loss_mlp": 0.01029313, "balance_loss_clip": 1.01728499, "balance_loss_mlp": 1.0336709, "epoch": 0.7734255223207576, "flos": 17676058619520.0, "grad_norm": 2.039928107497407, "language_loss": 0.73024285, "learning_rate": 4.856478607034085e-07, "loss": 0.75156081, "num_input_tokens_seen": 277567680, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 12864, "time_per_iteration": 3.9759674072265625 }, { "auxiliary_loss_clip": 0.01119566, "auxiliary_loss_mlp": 0.01028624, "balance_loss_clip": 1.01764464, "balance_loss_mlp": 1.03230953, "epoch": 0.7734856455734255, "flos": 25230837617280.0, "grad_norm": 1.9010506618271283, "language_loss": 0.82080108, "learning_rate": 4.854010900871534e-07, "loss": 0.84228301, "num_input_tokens_seen": 277588970, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.6953125, "step": 12865, "time_per_iteration": 4.09395432472229 }, { "auxiliary_loss_clip": 0.01108333, "auxiliary_loss_mlp": 0.01032353, "balance_loss_clip": 1.01830435, "balance_loss_mlp": 1.03658426, "epoch": 0.7735457688260935, "flos": 23433112805760.0, "grad_norm": 2.504923611140167, "language_loss": 0.7167685, "learning_rate": 4.851543735230372e-07, "loss": 0.73817539, "num_input_tokens_seen": 277605450, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71875, "step": 12866, "time_per_iteration": 2.5186009407043457 }, { "auxiliary_loss_clip": 0.01141519, "auxiliary_loss_mlp": 0.01277544, "balance_loss_clip": 1.01777923, "balance_loss_mlp": 1.0363766, "epoch": 0.7736058920787615, "flos": 18729246320640.0, "grad_norm": 2.4384257081414504, "language_loss": 0.64680195, "learning_rate": 4.849077110198652e-07, "loss": 0.67099261, "num_input_tokens_seen": 277622530, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 12867, "time_per_iteration": 2.5251286029815674 }, { "auxiliary_loss_clip": 0.01111679, "auxiliary_loss_mlp": 0.01031994, "balance_loss_clip": 1.02029979, "balance_loss_mlp": 1.03479457, "epoch": 0.7736660153314294, "flos": 22893304239360.0, "grad_norm": 1.5925946634764063, "language_loss": 0.71041042, "learning_rate": 4.846611025864398e-07, "loss": 0.73184717, "num_input_tokens_seen": 277642700, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 12868, "time_per_iteration": 2.5381412506103516 }, { "auxiliary_loss_clip": 0.01114991, "auxiliary_loss_mlp": 0.01025759, "balance_loss_clip": 1.01386225, "balance_loss_mlp": 1.03608954, "epoch": 0.7737261385840974, "flos": 13800901789440.0, "grad_norm": 2.3096139854490194, "language_loss": 0.77851689, "learning_rate": 4.844145482315616e-07, "loss": 0.79992437, "num_input_tokens_seen": 277660005, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69921875, "step": 12869, "time_per_iteration": 2.525426149368286 }, { "auxiliary_loss_clip": 0.01108078, "auxiliary_loss_mlp": 0.01029157, "balance_loss_clip": 1.01681244, "balance_loss_mlp": 1.03314137, "epoch": 0.7737862618367654, "flos": 28730727106560.0, "grad_norm": 1.5823436037296594, "language_loss": 0.73849446, "learning_rate": 4.841680479640291e-07, "loss": 0.75986683, "num_input_tokens_seen": 277682890, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.66015625, "step": 12870, "time_per_iteration": 2.645232915878296 }, { "auxiliary_loss_clip": 0.01113172, "auxiliary_loss_mlp": 0.01025287, "balance_loss_clip": 1.01341975, "balance_loss_mlp": 1.03477049, "epoch": 0.7738463850894334, "flos": 17018570119680.0, "grad_norm": 1.9089189922639453, "language_loss": 0.75767201, "learning_rate": 4.839216017926409e-07, "loss": 0.77905661, "num_input_tokens_seen": 277699330, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 12871, "time_per_iteration": 2.551344394683838 }, { "auxiliary_loss_clip": 0.01119935, "auxiliary_loss_mlp": 0.01029899, "balance_loss_clip": 1.01831186, "balance_loss_mlp": 1.03400099, "epoch": 0.7739065083421013, "flos": 20704010290560.0, "grad_norm": 1.741952996973686, "language_loss": 0.69108987, "learning_rate": 4.836752097261898e-07, "loss": 0.71258825, "num_input_tokens_seen": 277718750, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 12872, "time_per_iteration": 2.623398542404175 }, { "auxiliary_loss_clip": 0.01104266, "auxiliary_loss_mlp": 0.01032128, "balance_loss_clip": 1.01873469, "balance_loss_mlp": 1.03432703, "epoch": 0.7739666315947693, "flos": 20697222620160.0, "grad_norm": 2.0051518931627657, "language_loss": 0.8506546, "learning_rate": 4.834288717734707e-07, "loss": 0.87201852, "num_input_tokens_seen": 277734645, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.69921875, "step": 12873, "time_per_iteration": 2.5032477378845215 }, { "auxiliary_loss_clip": 0.01104595, "auxiliary_loss_mlp": 0.01032851, "balance_loss_clip": 1.02040601, "balance_loss_mlp": 1.03463066, "epoch": 0.7740267548474372, "flos": 29570677718400.0, "grad_norm": 2.6072519001040644, "language_loss": 0.66027188, "learning_rate": 4.831825879432744e-07, "loss": 0.68164635, "num_input_tokens_seen": 277755535, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 12874, "time_per_iteration": 2.6465859413146973 }, { "auxiliary_loss_clip": 0.01140234, "auxiliary_loss_mlp": 0.01033257, "balance_loss_clip": 1.02075207, "balance_loss_mlp": 1.03588629, "epoch": 0.7740868781001052, "flos": 23659099223040.0, "grad_norm": 1.4905588959637797, "language_loss": 0.62495422, "learning_rate": 4.829363582443888e-07, "loss": 0.64668912, "num_input_tokens_seen": 277775585, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 12875, "time_per_iteration": 2.61246395111084 }, { "auxiliary_loss_clip": 0.01112621, "auxiliary_loss_mlp": 0.01032326, "balance_loss_clip": 1.01974916, "balance_loss_mlp": 1.03355694, "epoch": 0.7741470013527731, "flos": 24717314828160.0, "grad_norm": 2.0605190247593304, "language_loss": 0.65723324, "learning_rate": 4.826901826856029e-07, "loss": 0.67868268, "num_input_tokens_seen": 277794795, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 12876, "time_per_iteration": 2.609738826751709 }, { "auxiliary_loss_clip": 0.01118986, "auxiliary_loss_mlp": 0.01035316, "balance_loss_clip": 1.02351391, "balance_loss_mlp": 1.03362644, "epoch": 0.7742071246054412, "flos": 21871645701120.0, "grad_norm": 2.310772706081631, "language_loss": 0.71520698, "learning_rate": 4.824440612757006e-07, "loss": 0.73675001, "num_input_tokens_seen": 277813235, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 12877, "time_per_iteration": 2.5448157787323 }, { "auxiliary_loss_clip": 0.01135011, "auxiliary_loss_mlp": 0.01029278, "balance_loss_clip": 1.01614726, "balance_loss_mlp": 1.03538644, "epoch": 0.7742672478581091, "flos": 22674249146880.0, "grad_norm": 1.850572626332875, "language_loss": 0.82735908, "learning_rate": 4.821979940234675e-07, "loss": 0.84900194, "num_input_tokens_seen": 277832560, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 12878, "time_per_iteration": 2.6246039867401123 }, { "auxiliary_loss_clip": 0.01123324, "auxiliary_loss_mlp": 0.01033867, "balance_loss_clip": 1.02027118, "balance_loss_mlp": 1.03518677, "epoch": 0.7743273711107771, "flos": 18840892769280.0, "grad_norm": 1.7985404844941162, "language_loss": 0.73445666, "learning_rate": 4.819519809376824e-07, "loss": 0.75602853, "num_input_tokens_seen": 277850120, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 12879, "time_per_iteration": 2.5457603931427 }, { "auxiliary_loss_clip": 0.01116801, "auxiliary_loss_mlp": 0.01029233, "balance_loss_clip": 1.01678181, "balance_loss_mlp": 1.03317285, "epoch": 0.7743874943634451, "flos": 28729326476160.0, "grad_norm": 2.4728080979601117, "language_loss": 0.79558665, "learning_rate": 4.81706022027127e-07, "loss": 0.817047, "num_input_tokens_seen": 277871020, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.66015625, "step": 12880, "time_per_iteration": 2.6954874992370605 }, { "auxiliary_loss_clip": 0.01120534, "auxiliary_loss_mlp": 0.01037912, "balance_loss_clip": 1.02506709, "balance_loss_mlp": 1.03519726, "epoch": 0.774447617616113, "flos": 21909639312000.0, "grad_norm": 1.4748767556344462, "language_loss": 0.70248652, "learning_rate": 4.814601173005781e-07, "loss": 0.72407091, "num_input_tokens_seen": 277891525, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 12881, "time_per_iteration": 2.5837252140045166 }, { "auxiliary_loss_clip": 0.01124589, "auxiliary_loss_mlp": 0.01037546, "balance_loss_clip": 1.0245285, "balance_loss_mlp": 1.03469062, "epoch": 0.774507740868781, "flos": 19500643825920.0, "grad_norm": 2.2069146353626956, "language_loss": 0.84510374, "learning_rate": 4.812142667668113e-07, "loss": 0.86672515, "num_input_tokens_seen": 277910425, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 12882, "time_per_iteration": 2.6230061054229736 }, { "auxiliary_loss_clip": 0.01052763, "auxiliary_loss_mlp": 0.01001909, "balance_loss_clip": 1.00083065, "balance_loss_mlp": 1.00452471, "epoch": 0.774567864121449, "flos": 59426560402560.0, "grad_norm": 0.7699036918385954, "language_loss": 0.60440528, "learning_rate": 4.809684704346e-07, "loss": 0.62495196, "num_input_tokens_seen": 277972795, "router_z_loss_clip": 0.01080322, "router_z_loss_mlp": 0.21484375, "step": 12883, "time_per_iteration": 3.1437184810638428 }, { "auxiliary_loss_clip": 0.01141111, "auxiliary_loss_mlp": 0.01030963, "balance_loss_clip": 1.01779652, "balance_loss_mlp": 1.03507268, "epoch": 0.774627987374117, "flos": 13225326255360.0, "grad_norm": 1.7744228855256758, "language_loss": 0.72908348, "learning_rate": 4.807227283127173e-07, "loss": 0.75080425, "num_input_tokens_seen": 277990675, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 12884, "time_per_iteration": 2.5599873065948486 }, { "auxiliary_loss_clip": 0.01111728, "auxiliary_loss_mlp": 0.01034562, "balance_loss_clip": 1.02335083, "balance_loss_mlp": 1.03496075, "epoch": 0.7746881106267849, "flos": 21394033534080.0, "grad_norm": 1.653044419259296, "language_loss": 0.81609857, "learning_rate": 4.804770404099323e-07, "loss": 0.83756143, "num_input_tokens_seen": 278010050, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6796875, "step": 12885, "time_per_iteration": 2.5657174587249756 }, { "auxiliary_loss_clip": 0.01101594, "auxiliary_loss_mlp": 0.01032929, "balance_loss_clip": 1.02159798, "balance_loss_mlp": 1.03417802, "epoch": 0.7747482338794529, "flos": 25629338079360.0, "grad_norm": 1.7468008798970895, "language_loss": 0.64215541, "learning_rate": 4.80231406735013e-07, "loss": 0.66350067, "num_input_tokens_seen": 278030660, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.67578125, "step": 12886, "time_per_iteration": 2.5493619441986084 }, { "auxiliary_loss_clip": 0.01130055, "auxiliary_loss_mlp": 0.0103252, "balance_loss_clip": 1.02098656, "balance_loss_mlp": 1.03255677, "epoch": 0.7748083571321208, "flos": 11546933402880.0, "grad_norm": 1.8896478527153113, "language_loss": 0.69598508, "learning_rate": 4.79985827296725e-07, "loss": 0.71761084, "num_input_tokens_seen": 278047645, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.70703125, "step": 12887, "time_per_iteration": 2.5175116062164307 }, { "auxiliary_loss_clip": 0.01109841, "auxiliary_loss_mlp": 0.01027191, "balance_loss_clip": 1.01560998, "balance_loss_mlp": 1.03356934, "epoch": 0.7748684803847888, "flos": 19062425900160.0, "grad_norm": 1.6416506239978927, "language_loss": 0.7056638, "learning_rate": 4.79740302103834e-07, "loss": 0.72703409, "num_input_tokens_seen": 278066170, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 12888, "time_per_iteration": 2.5181734561920166 }, { "auxiliary_loss_clip": 0.01112984, "auxiliary_loss_mlp": 0.01032514, "balance_loss_clip": 1.01995587, "balance_loss_mlp": 1.03342867, "epoch": 0.7749286036374567, "flos": 22273162905600.0, "grad_norm": 1.5727958652410554, "language_loss": 0.8169142, "learning_rate": 4.794948311651004e-07, "loss": 0.83836919, "num_input_tokens_seen": 278085545, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 12889, "time_per_iteration": 2.5069401264190674 }, { "auxiliary_loss_clip": 0.0113125, "auxiliary_loss_mlp": 0.01030171, "balance_loss_clip": 1.0180949, "balance_loss_mlp": 1.03590882, "epoch": 0.7749887268901248, "flos": 20192462749440.0, "grad_norm": 1.8505030957149493, "language_loss": 0.79178566, "learning_rate": 4.792494144892845e-07, "loss": 0.81339985, "num_input_tokens_seen": 278102995, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 12890, "time_per_iteration": 2.5422332286834717 }, { "auxiliary_loss_clip": 0.01135869, "auxiliary_loss_mlp": 0.01030888, "balance_loss_clip": 1.01849604, "balance_loss_mlp": 1.03895724, "epoch": 0.7750488501427927, "flos": 20337541781760.0, "grad_norm": 1.7656273257681914, "language_loss": 0.66461778, "learning_rate": 4.790040520851464e-07, "loss": 0.68628538, "num_input_tokens_seen": 278121460, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 12891, "time_per_iteration": 2.535642385482788 }, { "auxiliary_loss_clip": 0.011123, "auxiliary_loss_mlp": 0.01034603, "balance_loss_clip": 1.02224731, "balance_loss_mlp": 1.03414345, "epoch": 0.7751089733954607, "flos": 28364043116160.0, "grad_norm": 1.3888843003814504, "language_loss": 0.7856552, "learning_rate": 4.7875874396144e-07, "loss": 0.80712426, "num_input_tokens_seen": 278143905, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 12892, "time_per_iteration": 3.969975233078003 }, { "auxiliary_loss_clip": 0.0111652, "auxiliary_loss_mlp": 0.01030222, "balance_loss_clip": 1.01750207, "balance_loss_mlp": 1.0362494, "epoch": 0.7751690966481286, "flos": 16943803960320.0, "grad_norm": 2.593299036841953, "language_loss": 0.67082155, "learning_rate": 4.785134901269214e-07, "loss": 0.69228899, "num_input_tokens_seen": 278160850, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 12893, "time_per_iteration": 2.5235674381256104 }, { "auxiliary_loss_clip": 0.01033547, "auxiliary_loss_mlp": 0.01000371, "balance_loss_clip": 0.99911368, "balance_loss_mlp": 1.0032289, "epoch": 0.7752292199007966, "flos": 65668050339840.0, "grad_norm": 0.8212135508651982, "language_loss": 0.5851841, "learning_rate": 4.782682905903424e-07, "loss": 0.60552323, "num_input_tokens_seen": 278219950, "router_z_loss_clip": 0.01257324, "router_z_loss_mlp": 0.21582031, "step": 12894, "time_per_iteration": 3.1588492393493652 }, { "auxiliary_loss_clip": 0.01123687, "auxiliary_loss_mlp": 0.01030889, "balance_loss_clip": 1.01816893, "balance_loss_mlp": 1.03536344, "epoch": 0.7752893431534646, "flos": 20594662312320.0, "grad_norm": 2.495216949439512, "language_loss": 0.78163755, "learning_rate": 4.780231453604544e-07, "loss": 0.80318338, "num_input_tokens_seen": 278237805, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 12895, "time_per_iteration": 2.5489790439605713 }, { "auxiliary_loss_clip": 0.01132069, "auxiliary_loss_mlp": 0.01028157, "balance_loss_clip": 1.01553845, "balance_loss_mlp": 1.03632414, "epoch": 0.7753494664061326, "flos": 20485350247680.0, "grad_norm": 1.746489087604427, "language_loss": 0.67744929, "learning_rate": 4.777780544460046e-07, "loss": 0.69905156, "num_input_tokens_seen": 278257660, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 12896, "time_per_iteration": 2.5959060192108154 }, { "auxiliary_loss_clip": 0.01120409, "auxiliary_loss_mlp": 0.01033807, "balance_loss_clip": 1.0225594, "balance_loss_mlp": 1.03405917, "epoch": 0.7754095896588006, "flos": 20265900105600.0, "grad_norm": 1.6341414206870126, "language_loss": 0.68890905, "learning_rate": 4.775330178557409e-07, "loss": 0.71045119, "num_input_tokens_seen": 278275110, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.69140625, "step": 12897, "time_per_iteration": 2.5983219146728516 }, { "auxiliary_loss_clip": 0.01106385, "auxiliary_loss_mlp": 0.01036056, "balance_loss_clip": 1.0229075, "balance_loss_mlp": 1.03668702, "epoch": 0.7754697129114685, "flos": 23331091201920.0, "grad_norm": 1.6578229543210126, "language_loss": 0.75154227, "learning_rate": 4.772880355984073e-07, "loss": 0.77296668, "num_input_tokens_seen": 278293035, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 12898, "time_per_iteration": 2.572249174118042 }, { "auxiliary_loss_clip": 0.01113902, "auxiliary_loss_mlp": 0.01277617, "balance_loss_clip": 1.01705635, "balance_loss_mlp": 1.03508663, "epoch": 0.7755298361641365, "flos": 17347619635200.0, "grad_norm": 1.8944971289538712, "language_loss": 0.70101833, "learning_rate": 4.77043107682747e-07, "loss": 0.72493351, "num_input_tokens_seen": 278311010, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 12899, "time_per_iteration": 4.010085582733154 }, { "auxiliary_loss_clip": 0.01130032, "auxiliary_loss_mlp": 0.0103404, "balance_loss_clip": 1.02121282, "balance_loss_mlp": 1.03483641, "epoch": 0.7755899594168044, "flos": 19645866512640.0, "grad_norm": 1.877513559064657, "language_loss": 0.75330937, "learning_rate": 4.767982341175001e-07, "loss": 0.77495009, "num_input_tokens_seen": 278329900, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 12900, "time_per_iteration": 2.6196205615997314 }, { "auxiliary_loss_clip": 0.01108479, "auxiliary_loss_mlp": 0.01035774, "balance_loss_clip": 1.02456284, "balance_loss_mlp": 1.03205585, "epoch": 0.7756500826694724, "flos": 27414457217280.0, "grad_norm": 1.9717743552151705, "language_loss": 0.77754843, "learning_rate": 4.765534149114068e-07, "loss": 0.79899096, "num_input_tokens_seen": 278349980, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.671875, "step": 12901, "time_per_iteration": 2.601173162460327 }, { "auxiliary_loss_clip": 0.01113914, "auxiliary_loss_mlp": 0.01030821, "balance_loss_clip": 1.0184114, "balance_loss_mlp": 1.03711188, "epoch": 0.7757102059221404, "flos": 28730511624960.0, "grad_norm": 1.3976331622299736, "language_loss": 0.76901793, "learning_rate": 4.763086500732032e-07, "loss": 0.7904653, "num_input_tokens_seen": 278372485, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 12902, "time_per_iteration": 2.5789308547973633 }, { "auxiliary_loss_clip": 0.01111076, "auxiliary_loss_mlp": 0.01029558, "balance_loss_clip": 1.01799428, "balance_loss_mlp": 1.03407836, "epoch": 0.7757703291748084, "flos": 22486795044480.0, "grad_norm": 1.7409267169515361, "language_loss": 0.73335993, "learning_rate": 4.7606393961162437e-07, "loss": 0.75476623, "num_input_tokens_seen": 278391660, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.68359375, "step": 12903, "time_per_iteration": 2.4893877506256104 }, { "auxiliary_loss_clip": 0.01120381, "auxiliary_loss_mlp": 0.0102833, "balance_loss_clip": 1.0159446, "balance_loss_mlp": 1.0351721, "epoch": 0.7758304524274763, "flos": 21430159637760.0, "grad_norm": 1.9730780391232645, "language_loss": 0.76453203, "learning_rate": 4.7581928353540357e-07, "loss": 0.78601915, "num_input_tokens_seen": 278409125, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 12904, "time_per_iteration": 2.562756061553955 }, { "auxiliary_loss_clip": 0.01100128, "auxiliary_loss_mlp": 0.01026096, "balance_loss_clip": 1.01459205, "balance_loss_mlp": 1.03389788, "epoch": 0.7758905756801443, "flos": 23659242877440.0, "grad_norm": 2.0627378260598515, "language_loss": 0.68441123, "learning_rate": 4.75574681853272e-07, "loss": 0.70567346, "num_input_tokens_seen": 278429450, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 12905, "time_per_iteration": 2.490920305252075 }, { "auxiliary_loss_clip": 0.01110447, "auxiliary_loss_mlp": 0.01276481, "balance_loss_clip": 1.01753342, "balance_loss_mlp": 1.03507257, "epoch": 0.7759506989328122, "flos": 28365479660160.0, "grad_norm": 1.6324177349752327, "language_loss": 0.67436993, "learning_rate": 4.7533013457395865e-07, "loss": 0.69823921, "num_input_tokens_seen": 278449925, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 12906, "time_per_iteration": 4.120813369750977 }, { "auxiliary_loss_clip": 0.01131227, "auxiliary_loss_mlp": 0.01030414, "balance_loss_clip": 1.01815379, "balance_loss_mlp": 1.03531623, "epoch": 0.7760108221854802, "flos": 14902785354240.0, "grad_norm": 1.9133403277914667, "language_loss": 0.81383735, "learning_rate": 4.750856417061904e-07, "loss": 0.83545375, "num_input_tokens_seen": 278467255, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 12907, "time_per_iteration": 2.517503261566162 }, { "auxiliary_loss_clip": 0.01110623, "auxiliary_loss_mlp": 0.01033534, "balance_loss_clip": 1.02197087, "balance_loss_mlp": 1.03561258, "epoch": 0.7760709454381483, "flos": 14792503622400.0, "grad_norm": 2.4145883252529274, "language_loss": 0.67559683, "learning_rate": 4.7484120325869414e-07, "loss": 0.69703841, "num_input_tokens_seen": 278484250, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66015625, "step": 12908, "time_per_iteration": 2.4464175701141357 }, { "auxiliary_loss_clip": 0.0113367, "auxiliary_loss_mlp": 0.01039045, "balance_loss_clip": 1.02714205, "balance_loss_mlp": 1.03620768, "epoch": 0.7761310686908162, "flos": 17379831156480.0, "grad_norm": 1.982779863767315, "language_loss": 0.70221007, "learning_rate": 4.74596819240191e-07, "loss": 0.72393721, "num_input_tokens_seen": 278502740, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 12909, "time_per_iteration": 2.553724527359009 }, { "auxiliary_loss_clip": 0.01110736, "auxiliary_loss_mlp": 0.01034484, "balance_loss_clip": 1.02312899, "balance_loss_mlp": 1.03512287, "epoch": 0.7761911919434842, "flos": 25556547168000.0, "grad_norm": 1.584349635620452, "language_loss": 0.67695332, "learning_rate": 4.74352489659404e-07, "loss": 0.6984055, "num_input_tokens_seen": 278523890, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 12910, "time_per_iteration": 2.5718348026275635 }, { "auxiliary_loss_clip": 0.01139887, "auxiliary_loss_mlp": 0.0103494, "balance_loss_clip": 1.02321005, "balance_loss_mlp": 1.03630495, "epoch": 0.7762513151961521, "flos": 23179763203200.0, "grad_norm": 1.7370848752383736, "language_loss": 0.7181375, "learning_rate": 4.741082145250519e-07, "loss": 0.73988569, "num_input_tokens_seen": 278543185, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 12911, "time_per_iteration": 2.5986404418945312 }, { "auxiliary_loss_clip": 0.01107595, "auxiliary_loss_mlp": 0.01276738, "balance_loss_clip": 1.01676083, "balance_loss_mlp": 1.03724551, "epoch": 0.7763114384488201, "flos": 21689614552320.0, "grad_norm": 2.3959937848883652, "language_loss": 0.63313496, "learning_rate": 4.738639938458535e-07, "loss": 0.65697825, "num_input_tokens_seen": 278559220, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 12912, "time_per_iteration": 2.5081610679626465 }, { "auxiliary_loss_clip": 0.0104177, "auxiliary_loss_mlp": 0.01000174, "balance_loss_clip": 0.99898195, "balance_loss_mlp": 1.00272822, "epoch": 0.776371561701488, "flos": 69025554316800.0, "grad_norm": 0.765908227901167, "language_loss": 0.5323956, "learning_rate": 4.736198276305223e-07, "loss": 0.55281502, "num_input_tokens_seen": 278618185, "router_z_loss_clip": 0.01190186, "router_z_loss_mlp": 0.21484375, "step": 12913, "time_per_iteration": 3.229994058609009 }, { "auxiliary_loss_clip": 0.01124443, "auxiliary_loss_mlp": 0.01032778, "balance_loss_clip": 1.02027309, "balance_loss_mlp": 1.036659, "epoch": 0.776431684954156, "flos": 22893914770560.0, "grad_norm": 2.2311713569838436, "language_loss": 0.62260556, "learning_rate": 4.7337571588777406e-07, "loss": 0.64417779, "num_input_tokens_seen": 278636210, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 12914, "time_per_iteration": 2.550816059112549 }, { "auxiliary_loss_clip": 0.01120172, "auxiliary_loss_mlp": 0.010304, "balance_loss_clip": 1.01809788, "balance_loss_mlp": 1.03409696, "epoch": 0.776491808206824, "flos": 20261554560000.0, "grad_norm": 1.8868524393409691, "language_loss": 0.82355922, "learning_rate": 4.731316586263192e-07, "loss": 0.845065, "num_input_tokens_seen": 278653305, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 12915, "time_per_iteration": 2.5240423679351807 }, { "auxiliary_loss_clip": 0.01115484, "auxiliary_loss_mlp": 0.01035012, "balance_loss_clip": 1.02182126, "balance_loss_mlp": 1.03437519, "epoch": 0.776551931459492, "flos": 26759051706240.0, "grad_norm": 1.9562008833522893, "language_loss": 0.75066733, "learning_rate": 4.72887655854868e-07, "loss": 0.77217227, "num_input_tokens_seen": 278671850, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 12916, "time_per_iteration": 2.523249626159668 }, { "auxiliary_loss_clip": 0.01148743, "auxiliary_loss_mlp": 0.01029905, "balance_loss_clip": 1.01756716, "balance_loss_mlp": 1.03465843, "epoch": 0.7766120547121599, "flos": 52665080250240.0, "grad_norm": 1.700486697515939, "language_loss": 0.65603328, "learning_rate": 4.7264370758212766e-07, "loss": 0.67781979, "num_input_tokens_seen": 278697860, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 12917, "time_per_iteration": 2.833946943283081 }, { "auxiliary_loss_clip": 0.01111144, "auxiliary_loss_mlp": 0.01031903, "balance_loss_clip": 1.01994085, "balance_loss_mlp": 1.03332901, "epoch": 0.7766721779648279, "flos": 25156215112320.0, "grad_norm": 1.5111868687628216, "language_loss": 0.64297056, "learning_rate": 4.723998138168055e-07, "loss": 0.66440099, "num_input_tokens_seen": 278720655, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69140625, "step": 12918, "time_per_iteration": 2.5461111068725586 }, { "auxiliary_loss_clip": 0.01102682, "auxiliary_loss_mlp": 0.01034255, "balance_loss_clip": 1.02247071, "balance_loss_mlp": 1.03477693, "epoch": 0.7767323012174958, "flos": 23760761690880.0, "grad_norm": 2.108666880243232, "language_loss": 0.73880565, "learning_rate": 4.7215597456760426e-07, "loss": 0.76017499, "num_input_tokens_seen": 278737375, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 12919, "time_per_iteration": 2.477415084838867 }, { "auxiliary_loss_clip": 0.01139434, "auxiliary_loss_mlp": 0.01031453, "balance_loss_clip": 1.01872158, "balance_loss_mlp": 1.03480387, "epoch": 0.7767924244701638, "flos": 22086642556800.0, "grad_norm": 1.8221913271499122, "language_loss": 0.78929859, "learning_rate": 4.719121898432255e-07, "loss": 0.8110075, "num_input_tokens_seen": 278756510, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 12920, "time_per_iteration": 2.5726921558380127 }, { "auxiliary_loss_clip": 0.01114317, "auxiliary_loss_mlp": 0.01029663, "balance_loss_clip": 1.01747417, "balance_loss_mlp": 1.03556716, "epoch": 0.7768525477228319, "flos": 21981640124160.0, "grad_norm": 1.6407549932649643, "language_loss": 0.71006161, "learning_rate": 4.7166845965237033e-07, "loss": 0.73150146, "num_input_tokens_seen": 278775410, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 12921, "time_per_iteration": 2.487826108932495 }, { "auxiliary_loss_clip": 0.01116381, "auxiliary_loss_mlp": 0.0103199, "balance_loss_clip": 1.01868081, "balance_loss_mlp": 1.03574586, "epoch": 0.7769126709754998, "flos": 21794581071360.0, "grad_norm": 1.7010496040675005, "language_loss": 0.75899088, "learning_rate": 4.7142478400373686e-07, "loss": 0.7804746, "num_input_tokens_seen": 278794260, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 12922, "time_per_iteration": 2.5503368377685547 }, { "auxiliary_loss_clip": 0.01140407, "auxiliary_loss_mlp": 0.01038039, "balance_loss_clip": 1.02521777, "balance_loss_mlp": 1.03598762, "epoch": 0.7769727942281678, "flos": 20047994248320.0, "grad_norm": 1.684186799967409, "language_loss": 0.8031776, "learning_rate": 4.7118116290602074e-07, "loss": 0.82496214, "num_input_tokens_seen": 278813290, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 12923, "time_per_iteration": 2.5509274005889893 }, { "auxiliary_loss_clip": 0.01131886, "auxiliary_loss_mlp": 0.01036494, "balance_loss_clip": 1.02434707, "balance_loss_mlp": 1.03622055, "epoch": 0.7770329174808357, "flos": 21686777377920.0, "grad_norm": 1.8673669999223899, "language_loss": 0.92047882, "learning_rate": 4.709375963679156e-07, "loss": 0.94216263, "num_input_tokens_seen": 278830610, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 12924, "time_per_iteration": 2.559769868850708 }, { "auxiliary_loss_clip": 0.01111224, "auxiliary_loss_mlp": 0.01032002, "balance_loss_clip": 1.01952696, "balance_loss_mlp": 1.03287363, "epoch": 0.7770930407335037, "flos": 25849255098240.0, "grad_norm": 2.0209974786236895, "language_loss": 0.65698797, "learning_rate": 4.7069408439811574e-07, "loss": 0.67842019, "num_input_tokens_seen": 278849530, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12925, "time_per_iteration": 2.5732555389404297 }, { "auxiliary_loss_clip": 0.0113088, "auxiliary_loss_mlp": 0.01035899, "balance_loss_clip": 1.02385259, "balance_loss_mlp": 1.03445411, "epoch": 0.7771531639861716, "flos": 24347865490560.0, "grad_norm": 1.8763678600246034, "language_loss": 0.71888793, "learning_rate": 4.70450627005309e-07, "loss": 0.74055576, "num_input_tokens_seen": 278869005, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69921875, "step": 12926, "time_per_iteration": 2.5696380138397217 }, { "auxiliary_loss_clip": 0.01114759, "auxiliary_loss_mlp": 0.01028089, "balance_loss_clip": 1.01454115, "balance_loss_mlp": 1.03466892, "epoch": 0.7772132872388396, "flos": 25629948610560.0, "grad_norm": 1.7858803844119078, "language_loss": 0.65428817, "learning_rate": 4.702072241981854e-07, "loss": 0.6757167, "num_input_tokens_seen": 278888790, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71484375, "step": 12927, "time_per_iteration": 2.567661762237549 }, { "auxiliary_loss_clip": 0.01121843, "auxiliary_loss_mlp": 0.0103614, "balance_loss_clip": 1.02276492, "balance_loss_mlp": 1.03515029, "epoch": 0.7772734104915076, "flos": 26067412350720.0, "grad_norm": 1.9009104496321223, "language_loss": 0.71999586, "learning_rate": 4.699638759854303e-07, "loss": 0.74157572, "num_input_tokens_seen": 278908150, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.69140625, "step": 12928, "time_per_iteration": 2.6015048027038574 }, { "auxiliary_loss_clip": 0.01136236, "auxiliary_loss_mlp": 0.01032196, "balance_loss_clip": 1.01955962, "balance_loss_mlp": 1.03343344, "epoch": 0.7773335337441756, "flos": 22925048883840.0, "grad_norm": 1.6747669627288002, "language_loss": 0.74344802, "learning_rate": 4.6972058237573e-07, "loss": 0.76513231, "num_input_tokens_seen": 278927425, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.67578125, "step": 12929, "time_per_iteration": 2.533127546310425 }, { "auxiliary_loss_clip": 0.01130871, "auxiliary_loss_mlp": 0.01030962, "balance_loss_clip": 1.01807547, "balance_loss_mlp": 1.03475642, "epoch": 0.7773936569968435, "flos": 20776765288320.0, "grad_norm": 1.9538389610324844, "language_loss": 0.77743477, "learning_rate": 4.6947734337776456e-07, "loss": 0.79905313, "num_input_tokens_seen": 278946475, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 12930, "time_per_iteration": 2.525379180908203 }, { "auxiliary_loss_clip": 0.01104183, "auxiliary_loss_mlp": 0.01027824, "balance_loss_clip": 1.01546216, "balance_loss_mlp": 1.03601813, "epoch": 0.7774537802495115, "flos": 20372267255040.0, "grad_norm": 1.8505955182188507, "language_loss": 0.79577971, "learning_rate": 4.6923415900021623e-07, "loss": 0.81709969, "num_input_tokens_seen": 278964345, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 12931, "time_per_iteration": 2.488795757293701 }, { "auxiliary_loss_clip": 0.01051092, "auxiliary_loss_mlp": 0.01001501, "balance_loss_clip": 1.00029147, "balance_loss_mlp": 1.00250494, "epoch": 0.7775139035021794, "flos": 53912081738880.0, "grad_norm": 0.8125321689051351, "language_loss": 0.59799552, "learning_rate": 4.689910292517634e-07, "loss": 0.61852145, "num_input_tokens_seen": 279022380, "router_z_loss_clip": 0.01208496, "router_z_loss_mlp": 0.21582031, "step": 12932, "time_per_iteration": 3.114569902420044 }, { "auxiliary_loss_clip": 0.01115112, "auxiliary_loss_mlp": 0.01030269, "balance_loss_clip": 1.01829457, "balance_loss_mlp": 1.03610086, "epoch": 0.7775740267548474, "flos": 28842481296000.0, "grad_norm": 1.5142199843393696, "language_loss": 0.75939989, "learning_rate": 4.687479541410824e-07, "loss": 0.78085363, "num_input_tokens_seen": 279044275, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.703125, "step": 12933, "time_per_iteration": 2.575417995452881 }, { "auxiliary_loss_clip": 0.01122639, "auxiliary_loss_mlp": 0.01034006, "balance_loss_clip": 1.02113771, "balance_loss_mlp": 1.03473544, "epoch": 0.7776341500075155, "flos": 21872471713920.0, "grad_norm": 2.00192665871519, "language_loss": 0.73146629, "learning_rate": 4.685049336768478e-07, "loss": 0.7530328, "num_input_tokens_seen": 279063375, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 12934, "time_per_iteration": 4.016272068023682 }, { "auxiliary_loss_clip": 0.01140868, "auxiliary_loss_mlp": 0.01026781, "balance_loss_clip": 1.0139308, "balance_loss_mlp": 1.03583479, "epoch": 0.7776942732601834, "flos": 20229845829120.0, "grad_norm": 2.2094619180364696, "language_loss": 0.7031787, "learning_rate": 4.682619678677331e-07, "loss": 0.72485518, "num_input_tokens_seen": 279082680, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 12935, "time_per_iteration": 2.5429093837738037 }, { "auxiliary_loss_clip": 0.01124206, "auxiliary_loss_mlp": 0.01286701, "balance_loss_clip": 1.02713132, "balance_loss_mlp": 1.03582227, "epoch": 0.7777543965128514, "flos": 22231829329920.0, "grad_norm": 1.832296808670594, "language_loss": 0.83286422, "learning_rate": 4.680190567224085e-07, "loss": 0.85697329, "num_input_tokens_seen": 279099805, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12936, "time_per_iteration": 2.5649566650390625 }, { "auxiliary_loss_clip": 0.01115368, "auxiliary_loss_mlp": 0.01028177, "balance_loss_clip": 1.01659608, "balance_loss_mlp": 1.0345912, "epoch": 0.7778145197655193, "flos": 14501950508160.0, "grad_norm": 2.397089412729092, "language_loss": 0.67564994, "learning_rate": 4.677762002495422e-07, "loss": 0.69708538, "num_input_tokens_seen": 279117975, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.71875, "step": 12937, "time_per_iteration": 2.563667058944702 }, { "auxiliary_loss_clip": 0.01105438, "auxiliary_loss_mlp": 0.01029624, "balance_loss_clip": 1.01713645, "balance_loss_mlp": 1.0347476, "epoch": 0.7778746430181873, "flos": 21140288881920.0, "grad_norm": 1.541541483623798, "language_loss": 0.86982691, "learning_rate": 4.6753339845780293e-07, "loss": 0.89117759, "num_input_tokens_seen": 279137255, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 12938, "time_per_iteration": 2.527919054031372 }, { "auxiliary_loss_clip": 0.01114157, "auxiliary_loss_mlp": 0.01027692, "balance_loss_clip": 1.01490104, "balance_loss_mlp": 1.03540492, "epoch": 0.7779347662708552, "flos": 20266366982400.0, "grad_norm": 3.07901661471271, "language_loss": 0.85179377, "learning_rate": 4.6729065135585456e-07, "loss": 0.87321228, "num_input_tokens_seen": 279154500, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 12939, "time_per_iteration": 2.5179667472839355 }, { "auxiliary_loss_clip": 0.01101862, "auxiliary_loss_mlp": 0.01276426, "balance_loss_clip": 1.01780248, "balance_loss_mlp": 1.03513956, "epoch": 0.7779948895235232, "flos": 19209013303680.0, "grad_norm": 2.0070031201762757, "language_loss": 0.68626899, "learning_rate": 4.6704795895236016e-07, "loss": 0.71005189, "num_input_tokens_seen": 279173635, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.66796875, "step": 12940, "time_per_iteration": 3.921017646789551 }, { "auxiliary_loss_clip": 0.01131584, "auxiliary_loss_mlp": 0.01027201, "balance_loss_clip": 1.01520205, "balance_loss_mlp": 1.03436863, "epoch": 0.7780550127761912, "flos": 23914711382400.0, "grad_norm": 1.6997708876440776, "language_loss": 0.77752805, "learning_rate": 4.668053212559804e-07, "loss": 0.7991159, "num_input_tokens_seen": 279194430, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 12941, "time_per_iteration": 2.559110164642334 }, { "auxiliary_loss_clip": 0.01116892, "auxiliary_loss_mlp": 0.01037801, "balance_loss_clip": 1.02378237, "balance_loss_mlp": 1.03504944, "epoch": 0.7781151360288592, "flos": 32415951795840.0, "grad_norm": 2.3644859687657145, "language_loss": 0.73413515, "learning_rate": 4.6656273827537586e-07, "loss": 0.75568199, "num_input_tokens_seen": 279212920, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 12942, "time_per_iteration": 2.5973691940307617 }, { "auxiliary_loss_clip": 0.01137886, "auxiliary_loss_mlp": 0.01033077, "balance_loss_clip": 1.02101898, "balance_loss_mlp": 1.03428864, "epoch": 0.7781752592815271, "flos": 22346384780160.0, "grad_norm": 1.9656236623065624, "language_loss": 0.68155015, "learning_rate": 4.6632021001920163e-07, "loss": 0.70325983, "num_input_tokens_seen": 279232310, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 12943, "time_per_iteration": 2.577817916870117 }, { "auxiliary_loss_clip": 0.01042512, "auxiliary_loss_mlp": 0.01003197, "balance_loss_clip": 1.0020293, "balance_loss_mlp": 1.00256157, "epoch": 0.7782353825341951, "flos": 70460183520000.0, "grad_norm": 0.7721360904355062, "language_loss": 0.58509851, "learning_rate": 4.660777364961148e-07, "loss": 0.60555565, "num_input_tokens_seen": 279295375, "router_z_loss_clip": 0.01165771, "router_z_loss_mlp": 0.21582031, "step": 12944, "time_per_iteration": 3.3696541786193848 }, { "auxiliary_loss_clip": 0.01106337, "auxiliary_loss_mlp": 0.01036418, "balance_loss_clip": 1.02288783, "balance_loss_mlp": 1.03491545, "epoch": 0.778295505786863, "flos": 19062569554560.0, "grad_norm": 1.851202743262301, "language_loss": 0.67792201, "learning_rate": 4.6583531771476716e-07, "loss": 0.69934958, "num_input_tokens_seen": 279313660, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 12945, "time_per_iteration": 2.517111301422119 }, { "auxiliary_loss_clip": 0.01115102, "auxiliary_loss_mlp": 0.01033835, "balance_loss_clip": 1.02116275, "balance_loss_mlp": 1.03611469, "epoch": 0.778355629039531, "flos": 20999734963200.0, "grad_norm": 1.9397193803759145, "language_loss": 0.69181013, "learning_rate": 4.655929536838117e-07, "loss": 0.71329951, "num_input_tokens_seen": 279334495, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 12946, "time_per_iteration": 2.522360324859619 }, { "auxiliary_loss_clip": 0.01105394, "auxiliary_loss_mlp": 0.01030745, "balance_loss_clip": 1.01786458, "balance_loss_mlp": 1.03596377, "epoch": 0.7784157522921991, "flos": 21398091770880.0, "grad_norm": 1.9829439873435721, "language_loss": 0.65731734, "learning_rate": 4.6535064441189574e-07, "loss": 0.67867869, "num_input_tokens_seen": 279352985, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 12947, "time_per_iteration": 3.8967738151550293 }, { "auxiliary_loss_clip": 0.01147968, "auxiliary_loss_mlp": 0.01034594, "balance_loss_clip": 1.02167177, "balance_loss_mlp": 1.03498673, "epoch": 0.778475875544867, "flos": 20813861059200.0, "grad_norm": 1.8450644499110695, "language_loss": 0.65269804, "learning_rate": 4.651083899076682e-07, "loss": 0.67452365, "num_input_tokens_seen": 279371360, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.68359375, "step": 12948, "time_per_iteration": 4.104079246520996 }, { "auxiliary_loss_clip": 0.01104556, "auxiliary_loss_mlp": 0.01033031, "balance_loss_clip": 1.02031112, "balance_loss_mlp": 1.03497195, "epoch": 0.778535998797535, "flos": 14355363104640.0, "grad_norm": 1.7984696526386525, "language_loss": 0.75417709, "learning_rate": 4.648661901797746e-07, "loss": 0.77555299, "num_input_tokens_seen": 279389400, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 12949, "time_per_iteration": 2.5660767555236816 }, { "auxiliary_loss_clip": 0.01107946, "auxiliary_loss_mlp": 0.01030317, "balance_loss_clip": 1.01783621, "balance_loss_mlp": 1.03554904, "epoch": 0.7785961220502029, "flos": 19209552007680.0, "grad_norm": 1.5515218196539071, "language_loss": 0.68824524, "learning_rate": 4.646240452368566e-07, "loss": 0.70962787, "num_input_tokens_seen": 279409715, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 12950, "time_per_iteration": 2.497981071472168 }, { "auxiliary_loss_clip": 0.01114254, "auxiliary_loss_mlp": 0.01029359, "balance_loss_clip": 1.0169251, "balance_loss_mlp": 1.03441048, "epoch": 0.7786562453028709, "flos": 25738757884800.0, "grad_norm": 1.6787902088064763, "language_loss": 0.72174191, "learning_rate": 4.643819550875576e-07, "loss": 0.74317801, "num_input_tokens_seen": 279427705, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 12951, "time_per_iteration": 2.578963041305542 }, { "auxiliary_loss_clip": 0.01120372, "auxiliary_loss_mlp": 0.01032471, "balance_loss_clip": 1.02011538, "balance_loss_mlp": 1.0351615, "epoch": 0.7787163685555388, "flos": 25739440243200.0, "grad_norm": 1.8703322763609065, "language_loss": 0.65499508, "learning_rate": 4.641399197405167e-07, "loss": 0.67652363, "num_input_tokens_seen": 279448215, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.671875, "step": 12952, "time_per_iteration": 2.5811288356781006 }, { "auxiliary_loss_clip": 0.01114705, "auxiliary_loss_mlp": 0.01029254, "balance_loss_clip": 1.01759553, "balance_loss_mlp": 1.03254914, "epoch": 0.7787764918082068, "flos": 22747722416640.0, "grad_norm": 1.5259468012280268, "language_loss": 0.81150973, "learning_rate": 4.6389793920437116e-07, "loss": 0.8329494, "num_input_tokens_seen": 279466260, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.64453125, "step": 12953, "time_per_iteration": 2.596653938293457 }, { "auxiliary_loss_clip": 0.01109732, "auxiliary_loss_mlp": 0.0103386, "balance_loss_clip": 1.02134347, "balance_loss_mlp": 1.03817534, "epoch": 0.7788366150608748, "flos": 15190860430080.0, "grad_norm": 2.440167762405104, "language_loss": 0.76415378, "learning_rate": 4.636560134877563e-07, "loss": 0.78558969, "num_input_tokens_seen": 279484520, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 12954, "time_per_iteration": 2.5497586727142334 }, { "auxiliary_loss_clip": 0.0111451, "auxiliary_loss_mlp": 0.01029766, "balance_loss_clip": 1.01781499, "balance_loss_mlp": 1.03478861, "epoch": 0.7788967383135428, "flos": 21210242618880.0, "grad_norm": 1.4920903889854575, "language_loss": 0.72763145, "learning_rate": 4.6341414259930703e-07, "loss": 0.74907416, "num_input_tokens_seen": 279503130, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.70703125, "step": 12955, "time_per_iteration": 2.544325113296509 }, { "auxiliary_loss_clip": 0.01122004, "auxiliary_loss_mlp": 0.0103005, "balance_loss_clip": 1.01748514, "balance_loss_mlp": 1.03356016, "epoch": 0.7789568615662107, "flos": 21682970536320.0, "grad_norm": 1.4326149866352835, "language_loss": 0.68582022, "learning_rate": 4.6317232654765434e-07, "loss": 0.70734078, "num_input_tokens_seen": 279521930, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 12956, "time_per_iteration": 2.531273603439331 }, { "auxiliary_loss_clip": 0.011311, "auxiliary_loss_mlp": 0.01031959, "balance_loss_clip": 1.01956153, "balance_loss_mlp": 1.03548717, "epoch": 0.7790169848188787, "flos": 26360371676160.0, "grad_norm": 1.9463373387291283, "language_loss": 0.75949526, "learning_rate": 4.6293056534142814e-07, "loss": 0.78112584, "num_input_tokens_seen": 279542375, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 12957, "time_per_iteration": 2.574737071990967 }, { "auxiliary_loss_clip": 0.01157632, "auxiliary_loss_mlp": 0.01028276, "balance_loss_clip": 1.01620615, "balance_loss_mlp": 1.03460729, "epoch": 0.7790771080715466, "flos": 25516183259520.0, "grad_norm": 1.7038024519812036, "language_loss": 0.77227604, "learning_rate": 4.6268885898925593e-07, "loss": 0.79413515, "num_input_tokens_seen": 279561885, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6953125, "step": 12958, "time_per_iteration": 2.608771800994873 }, { "auxiliary_loss_clip": 0.0111544, "auxiliary_loss_mlp": 0.01035443, "balance_loss_clip": 1.02251482, "balance_loss_mlp": 1.03586006, "epoch": 0.7791372313242146, "flos": 16034186920320.0, "grad_norm": 2.0275794784847516, "language_loss": 0.72005534, "learning_rate": 4.6244720749976473e-07, "loss": 0.74156415, "num_input_tokens_seen": 279579965, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 12959, "time_per_iteration": 2.5083086490631104 }, { "auxiliary_loss_clip": 0.01113189, "auxiliary_loss_mlp": 0.01280406, "balance_loss_clip": 1.02122009, "balance_loss_mlp": 1.03468537, "epoch": 0.7791973545768827, "flos": 23842207779840.0, "grad_norm": 1.6925868825766153, "language_loss": 0.77742541, "learning_rate": 4.622056108815766e-07, "loss": 0.80136132, "num_input_tokens_seen": 279599030, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 12960, "time_per_iteration": 2.5291149616241455 }, { "auxiliary_loss_clip": 0.01115881, "auxiliary_loss_mlp": 0.01037584, "balance_loss_clip": 1.02451313, "balance_loss_mlp": 1.03608382, "epoch": 0.7792574778295506, "flos": 24168384207360.0, "grad_norm": 1.8804572781813116, "language_loss": 0.75002837, "learning_rate": 4.619640691433151e-07, "loss": 0.77156299, "num_input_tokens_seen": 279614400, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12961, "time_per_iteration": 2.6106417179107666 }, { "auxiliary_loss_clip": 0.01120158, "auxiliary_loss_mlp": 0.01036008, "balance_loss_clip": 1.024158, "balance_loss_mlp": 1.03742135, "epoch": 0.7793176010822186, "flos": 21464921024640.0, "grad_norm": 1.6168297054074154, "language_loss": 0.73853898, "learning_rate": 4.617225822935997e-07, "loss": 0.7601006, "num_input_tokens_seen": 279633745, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6484375, "step": 12962, "time_per_iteration": 2.54213285446167 }, { "auxiliary_loss_clip": 0.01112976, "auxiliary_loss_mlp": 0.01027237, "balance_loss_clip": 1.0151912, "balance_loss_mlp": 1.03563094, "epoch": 0.7793777243348865, "flos": 20666699038080.0, "grad_norm": 1.8289307822266672, "language_loss": 0.6954869, "learning_rate": 4.614811503410483e-07, "loss": 0.71688902, "num_input_tokens_seen": 279651165, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 12963, "time_per_iteration": 2.560976505279541 }, { "auxiliary_loss_clip": 0.01112704, "auxiliary_loss_mlp": 0.0127945, "balance_loss_clip": 1.02022243, "balance_loss_mlp": 1.03584445, "epoch": 0.7794378475875545, "flos": 27125771610240.0, "grad_norm": 1.8050732553501847, "language_loss": 0.63949418, "learning_rate": 4.6123977329427724e-07, "loss": 0.66341573, "num_input_tokens_seen": 279671175, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 12964, "time_per_iteration": 2.5108482837677 }, { "auxiliary_loss_clip": 0.01122426, "auxiliary_loss_mlp": 0.01033757, "balance_loss_clip": 1.02159762, "balance_loss_mlp": 1.03377032, "epoch": 0.7794979708402224, "flos": 28074136446720.0, "grad_norm": 2.1267892302676152, "language_loss": 0.76343077, "learning_rate": 4.609984511618998e-07, "loss": 0.78499264, "num_input_tokens_seen": 279688675, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7109375, "step": 12965, "time_per_iteration": 2.5606744289398193 }, { "auxiliary_loss_clip": 0.01106352, "auxiliary_loss_mlp": 0.01030125, "balance_loss_clip": 1.01876998, "balance_loss_mlp": 1.033566, "epoch": 0.7795580940928905, "flos": 26869548919680.0, "grad_norm": 1.5344188820401965, "language_loss": 0.72621202, "learning_rate": 4.6075718395253016e-07, "loss": 0.74757683, "num_input_tokens_seen": 279710245, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.63671875, "step": 12966, "time_per_iteration": 2.5955159664154053 }, { "auxiliary_loss_clip": 0.01120248, "auxiliary_loss_mlp": 0.01274716, "balance_loss_clip": 1.01669765, "balance_loss_mlp": 1.03620863, "epoch": 0.7796182173455584, "flos": 23835384195840.0, "grad_norm": 1.606283354386164, "language_loss": 0.74257588, "learning_rate": 4.605159716747762e-07, "loss": 0.76652551, "num_input_tokens_seen": 279729045, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6640625, "step": 12967, "time_per_iteration": 2.6395304203033447 }, { "auxiliary_loss_clip": 0.01105339, "auxiliary_loss_mlp": 0.01030026, "balance_loss_clip": 1.01663256, "balance_loss_mlp": 1.03528547, "epoch": 0.7796783405982264, "flos": 19792238434560.0, "grad_norm": 2.200020304005409, "language_loss": 0.72109616, "learning_rate": 4.6027481433724746e-07, "loss": 0.74244976, "num_input_tokens_seen": 279748350, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 12968, "time_per_iteration": 2.5306358337402344 }, { "auxiliary_loss_clip": 0.01122067, "auxiliary_loss_mlp": 0.01036399, "balance_loss_clip": 1.02376306, "balance_loss_mlp": 1.03566623, "epoch": 0.7797384638508943, "flos": 15450207603840.0, "grad_norm": 2.455503781107384, "language_loss": 0.60441697, "learning_rate": 4.6003371194855e-07, "loss": 0.62600166, "num_input_tokens_seen": 279765620, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 12969, "time_per_iteration": 2.5474743843078613 }, { "auxiliary_loss_clip": 0.01107547, "auxiliary_loss_mlp": 0.01032687, "balance_loss_clip": 1.0213027, "balance_loss_mlp": 1.03345001, "epoch": 0.7797985871035623, "flos": 20922742160640.0, "grad_norm": 1.8140303365189252, "language_loss": 0.69990391, "learning_rate": 4.5979266451728825e-07, "loss": 0.7213062, "num_input_tokens_seen": 279782485, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.65234375, "step": 12970, "time_per_iteration": 2.499318838119507 }, { "auxiliary_loss_clip": 0.0113318, "auxiliary_loss_mlp": 0.01280328, "balance_loss_clip": 1.01958776, "balance_loss_mlp": 1.03705335, "epoch": 0.7798587103562302, "flos": 36937212514560.0, "grad_norm": 1.833192442524493, "language_loss": 0.72225654, "learning_rate": 4.5955167205206355e-07, "loss": 0.74639165, "num_input_tokens_seen": 279804170, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.6953125, "step": 12971, "time_per_iteration": 2.674513816833496 }, { "auxiliary_loss_clip": 0.01136703, "auxiliary_loss_mlp": 0.01032418, "balance_loss_clip": 1.01972842, "balance_loss_mlp": 1.03604126, "epoch": 0.7799188336088982, "flos": 22419283432320.0, "grad_norm": 2.1414441465364398, "language_loss": 0.74438119, "learning_rate": 4.593107345614782e-07, "loss": 0.76607239, "num_input_tokens_seen": 279823730, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73046875, "step": 12972, "time_per_iteration": 2.5118043422698975 }, { "auxiliary_loss_clip": 0.01104533, "auxiliary_loss_mlp": 0.0102659, "balance_loss_clip": 1.01400161, "balance_loss_mlp": 1.03503489, "epoch": 0.7799789568615663, "flos": 18880466578560.0, "grad_norm": 1.8487591261726783, "language_loss": 0.71329963, "learning_rate": 4.590698520541292e-07, "loss": 0.73461086, "num_input_tokens_seen": 279843035, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 12973, "time_per_iteration": 2.560366630554199 }, { "auxiliary_loss_clip": 0.01111395, "auxiliary_loss_mlp": 0.01030619, "balance_loss_clip": 1.01911557, "balance_loss_mlp": 1.0347631, "epoch": 0.7800390801142342, "flos": 20262272832000.0, "grad_norm": 1.8490154271288306, "language_loss": 0.77392423, "learning_rate": 4.588290245386135e-07, "loss": 0.79534441, "num_input_tokens_seen": 279861450, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 12974, "time_per_iteration": 2.513758659362793 }, { "auxiliary_loss_clip": 0.01129076, "auxiliary_loss_mlp": 0.0103208, "balance_loss_clip": 1.01957512, "balance_loss_mlp": 1.0346688, "epoch": 0.7800992033669022, "flos": 16690310703360.0, "grad_norm": 3.264113797185815, "language_loss": 0.69231796, "learning_rate": 4.585882520235251e-07, "loss": 0.71392953, "num_input_tokens_seen": 279878660, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 12975, "time_per_iteration": 3.9293367862701416 }, { "auxiliary_loss_clip": 0.01127443, "auxiliary_loss_mlp": 0.01029708, "balance_loss_clip": 1.01781082, "balance_loss_mlp": 1.03455162, "epoch": 0.7801593266195701, "flos": 18585208782720.0, "grad_norm": 2.221455897052593, "language_loss": 0.82026881, "learning_rate": 4.583475345174581e-07, "loss": 0.84184027, "num_input_tokens_seen": 279895685, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6640625, "step": 12976, "time_per_iteration": 2.5485363006591797 }, { "auxiliary_loss_clip": 0.01123368, "auxiliary_loss_mlp": 0.01275074, "balance_loss_clip": 1.01467991, "balance_loss_mlp": 1.03475642, "epoch": 0.7802194498722381, "flos": 25484941405440.0, "grad_norm": 1.4208990000571355, "language_loss": 0.65970016, "learning_rate": 4.5810687202900087e-07, "loss": 0.68368459, "num_input_tokens_seen": 279917240, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 12977, "time_per_iteration": 2.5264132022857666 }, { "auxiliary_loss_clip": 0.01120288, "auxiliary_loss_mlp": 0.01028734, "balance_loss_clip": 1.01703334, "balance_loss_mlp": 1.03570485, "epoch": 0.780279573124906, "flos": 31176315573120.0, "grad_norm": 1.4618674667818865, "language_loss": 0.74405551, "learning_rate": 4.578662645667437e-07, "loss": 0.76554567, "num_input_tokens_seen": 279938665, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 12978, "time_per_iteration": 2.6162800788879395 }, { "auxiliary_loss_clip": 0.01124753, "auxiliary_loss_mlp": 0.01034196, "balance_loss_clip": 1.02101207, "balance_loss_mlp": 1.0357548, "epoch": 0.780339696377574, "flos": 26944027770240.0, "grad_norm": 3.0647087129024113, "language_loss": 0.61993015, "learning_rate": 4.576257121392728e-07, "loss": 0.64151967, "num_input_tokens_seen": 279957965, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 12979, "time_per_iteration": 2.520806312561035 }, { "auxiliary_loss_clip": 0.01103829, "auxiliary_loss_mlp": 0.01026886, "balance_loss_clip": 1.01463127, "balance_loss_mlp": 1.03538764, "epoch": 0.780399819630242, "flos": 27957426180480.0, "grad_norm": 1.9140582020899144, "language_loss": 0.76547945, "learning_rate": 4.5738521475517265e-07, "loss": 0.78678656, "num_input_tokens_seen": 279977490, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 12980, "time_per_iteration": 2.5799660682678223 }, { "auxiliary_loss_clip": 0.01106971, "auxiliary_loss_mlp": 0.01030837, "balance_loss_clip": 1.01881492, "balance_loss_mlp": 1.03590035, "epoch": 0.78045994288291, "flos": 22486795044480.0, "grad_norm": 2.1604108248478484, "language_loss": 0.77997524, "learning_rate": 4.571447724230262e-07, "loss": 0.80135334, "num_input_tokens_seen": 279994220, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 12981, "time_per_iteration": 2.501335382461548 }, { "auxiliary_loss_clip": 0.01135039, "auxiliary_loss_mlp": 0.01034402, "balance_loss_clip": 1.02178991, "balance_loss_mlp": 1.037714, "epoch": 0.7805200661355779, "flos": 20850849089280.0, "grad_norm": 1.92457879610777, "language_loss": 0.72949952, "learning_rate": 4.569043851514134e-07, "loss": 0.75119394, "num_input_tokens_seen": 280012590, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 12982, "time_per_iteration": 4.050354242324829 }, { "auxiliary_loss_clip": 0.01122713, "auxiliary_loss_mlp": 0.01032362, "balance_loss_clip": 1.02065587, "balance_loss_mlp": 1.03409219, "epoch": 0.7805801893882459, "flos": 25665966973440.0, "grad_norm": 1.4358466878889466, "language_loss": 0.73231953, "learning_rate": 4.5666405294891497e-07, "loss": 0.75387025, "num_input_tokens_seen": 280033700, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.703125, "step": 12983, "time_per_iteration": 2.563016176223755 }, { "auxiliary_loss_clip": 0.01121785, "auxiliary_loss_mlp": 0.01028319, "balance_loss_clip": 1.01674414, "balance_loss_mlp": 1.03524113, "epoch": 0.7806403126409138, "flos": 11327806483200.0, "grad_norm": 1.9476106875213608, "language_loss": 0.7453934, "learning_rate": 4.564237758241054e-07, "loss": 0.76689446, "num_input_tokens_seen": 280052215, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6875, "step": 12984, "time_per_iteration": 2.538794755935669 }, { "auxiliary_loss_clip": 0.01121406, "auxiliary_loss_mlp": 0.01032889, "balance_loss_clip": 1.02114117, "balance_loss_mlp": 1.03456867, "epoch": 0.7807004358935818, "flos": 19573362910080.0, "grad_norm": 2.061581047322281, "language_loss": 0.81725287, "learning_rate": 4.561835537855614e-07, "loss": 0.83879584, "num_input_tokens_seen": 280070525, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 12985, "time_per_iteration": 2.50693678855896 }, { "auxiliary_loss_clip": 0.01120883, "auxiliary_loss_mlp": 0.01030753, "balance_loss_clip": 1.01843262, "balance_loss_mlp": 1.03522551, "epoch": 0.7807605591462499, "flos": 19135827342720.0, "grad_norm": 1.814745408995619, "language_loss": 0.76714134, "learning_rate": 4.559433868418552e-07, "loss": 0.78865767, "num_input_tokens_seen": 280089855, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 12986, "time_per_iteration": 2.48016095161438 }, { "auxiliary_loss_clip": 0.01113771, "auxiliary_loss_mlp": 0.01035825, "balance_loss_clip": 1.0233674, "balance_loss_mlp": 1.03536546, "epoch": 0.7808206823989178, "flos": 32374654133760.0, "grad_norm": 3.3682445692383496, "language_loss": 0.74267447, "learning_rate": 4.557032750015577e-07, "loss": 0.76417041, "num_input_tokens_seen": 280109960, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 12987, "time_per_iteration": 2.608381748199463 }, { "auxiliary_loss_clip": 0.0103295, "auxiliary_loss_mlp": 0.01002172, "balance_loss_clip": 1.00094461, "balance_loss_mlp": 1.00296211, "epoch": 0.7808808056515858, "flos": 55050235061760.0, "grad_norm": 0.735994775751893, "language_loss": 0.55151957, "learning_rate": 4.554632182732372e-07, "loss": 0.5718708, "num_input_tokens_seen": 280169805, "router_z_loss_clip": 0.01226807, "router_z_loss_mlp": 0.21582031, "step": 12988, "time_per_iteration": 3.130237579345703 }, { "auxiliary_loss_clip": 0.0111854, "auxiliary_loss_mlp": 0.01036272, "balance_loss_clip": 1.02424955, "balance_loss_mlp": 1.03440642, "epoch": 0.7809409289042537, "flos": 12859468277760.0, "grad_norm": 3.075120441729081, "language_loss": 0.81006134, "learning_rate": 4.5522321666546216e-07, "loss": 0.83160943, "num_input_tokens_seen": 280184630, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.65625, "step": 12989, "time_per_iteration": 4.078670978546143 }, { "auxiliary_loss_clip": 0.01128578, "auxiliary_loss_mlp": 0.01029227, "balance_loss_clip": 1.01709139, "balance_loss_mlp": 1.03403378, "epoch": 0.7810010521569217, "flos": 21687244254720.0, "grad_norm": 1.7381691484101403, "language_loss": 0.70381922, "learning_rate": 4.5498327018679683e-07, "loss": 0.72539729, "num_input_tokens_seen": 280203880, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 12990, "time_per_iteration": 4.141098499298096 }, { "auxiliary_loss_clip": 0.01122428, "auxiliary_loss_mlp": 0.01030815, "balance_loss_clip": 1.01706421, "balance_loss_mlp": 1.03502107, "epoch": 0.7810611754095896, "flos": 16757068129920.0, "grad_norm": 2.186918948321238, "language_loss": 0.77534401, "learning_rate": 4.5474337884580436e-07, "loss": 0.79687643, "num_input_tokens_seen": 280220460, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.6953125, "step": 12991, "time_per_iteration": 2.5584664344787598 }, { "auxiliary_loss_clip": 0.0113052, "auxiliary_loss_mlp": 0.01034732, "balance_loss_clip": 1.02195835, "balance_loss_mlp": 1.03404117, "epoch": 0.7811212986622577, "flos": 43507464658560.0, "grad_norm": 1.7786112288707572, "language_loss": 0.66113651, "learning_rate": 4.545035426510453e-07, "loss": 0.68278897, "num_input_tokens_seen": 280242680, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 12992, "time_per_iteration": 2.7463455200195312 }, { "auxiliary_loss_clip": 0.01106458, "auxiliary_loss_mlp": 0.01033397, "balance_loss_clip": 1.02092171, "balance_loss_mlp": 1.03659892, "epoch": 0.7811814219149256, "flos": 21757700782080.0, "grad_norm": 1.946420959964169, "language_loss": 0.61878872, "learning_rate": 4.5426376161108025e-07, "loss": 0.64018726, "num_input_tokens_seen": 280260655, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 12993, "time_per_iteration": 2.5225703716278076 }, { "auxiliary_loss_clip": 0.01121642, "auxiliary_loss_mlp": 0.01029533, "balance_loss_clip": 1.01684904, "balance_loss_mlp": 1.03516066, "epoch": 0.7812415451675936, "flos": 24061514267520.0, "grad_norm": 1.4587500161703981, "language_loss": 0.68220544, "learning_rate": 4.540240357344649e-07, "loss": 0.70371723, "num_input_tokens_seen": 280281185, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 12994, "time_per_iteration": 2.6189587116241455 }, { "auxiliary_loss_clip": 0.01102986, "auxiliary_loss_mlp": 0.01026479, "balance_loss_clip": 1.01445091, "balance_loss_mlp": 1.03472292, "epoch": 0.7813016684202615, "flos": 18989706816000.0, "grad_norm": 1.9587750037165126, "language_loss": 0.69246602, "learning_rate": 4.537843650297546e-07, "loss": 0.71376073, "num_input_tokens_seen": 280298255, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 12995, "time_per_iteration": 2.529268980026245 }, { "auxiliary_loss_clip": 0.01126609, "auxiliary_loss_mlp": 0.01031935, "balance_loss_clip": 1.02040195, "balance_loss_mlp": 1.03304183, "epoch": 0.7813617916729295, "flos": 25260786581760.0, "grad_norm": 1.8690814347741291, "language_loss": 0.75111532, "learning_rate": 4.53544749505504e-07, "loss": 0.77270079, "num_input_tokens_seen": 280319000, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 12996, "time_per_iteration": 2.5954678058624268 }, { "auxiliary_loss_clip": 0.01107952, "auxiliary_loss_mlp": 0.01032283, "balance_loss_clip": 1.0195694, "balance_loss_mlp": 1.03556919, "epoch": 0.7814219149255974, "flos": 17966037116160.0, "grad_norm": 3.112909821810861, "language_loss": 0.68115628, "learning_rate": 4.533051891702622e-07, "loss": 0.70255864, "num_input_tokens_seen": 280336375, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 12997, "time_per_iteration": 2.495358467102051 }, { "auxiliary_loss_clip": 0.01108414, "auxiliary_loss_mlp": 0.01032363, "balance_loss_clip": 1.01894641, "balance_loss_mlp": 1.03533578, "epoch": 0.7814820381782654, "flos": 25776176878080.0, "grad_norm": 1.7952963819473686, "language_loss": 0.82171458, "learning_rate": 4.5306568403258015e-07, "loss": 0.84312236, "num_input_tokens_seen": 280358760, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 12998, "time_per_iteration": 2.5717873573303223 }, { "auxiliary_loss_clip": 0.01121776, "auxiliary_loss_mlp": 0.01035895, "balance_loss_clip": 1.02262664, "balance_loss_mlp": 1.03365576, "epoch": 0.7815421614309335, "flos": 20519572930560.0, "grad_norm": 2.2641606354273502, "language_loss": 0.743334, "learning_rate": 4.528262341010043e-07, "loss": 0.7649107, "num_input_tokens_seen": 280377085, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 12999, "time_per_iteration": 2.5536386966705322 }, { "auxiliary_loss_clip": 0.01117182, "auxiliary_loss_mlp": 0.01036482, "balance_loss_clip": 1.02305293, "balance_loss_mlp": 1.0362761, "epoch": 0.7816022846836014, "flos": 21287666384640.0, "grad_norm": 1.5812611358257325, "language_loss": 0.84347212, "learning_rate": 4.5258683938408124e-07, "loss": 0.86500877, "num_input_tokens_seen": 280395465, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 13000, "time_per_iteration": 2.542778491973877 }, { "auxiliary_loss_clip": 0.01115632, "auxiliary_loss_mlp": 0.01030705, "balance_loss_clip": 1.01791346, "balance_loss_mlp": 1.03654373, "epoch": 0.7816624079362694, "flos": 19208402772480.0, "grad_norm": 1.6855188255238525, "language_loss": 0.66059697, "learning_rate": 4.5234749989035247e-07, "loss": 0.6820603, "num_input_tokens_seen": 280412775, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 13001, "time_per_iteration": 2.5608627796173096 }, { "auxiliary_loss_clip": 0.01132645, "auxiliary_loss_mlp": 0.01032255, "balance_loss_clip": 1.0204829, "balance_loss_mlp": 1.03574526, "epoch": 0.7817225311889373, "flos": 26104687689600.0, "grad_norm": 1.5387674221148206, "language_loss": 0.66930974, "learning_rate": 4.521082156283609e-07, "loss": 0.69095874, "num_input_tokens_seen": 280432905, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.703125, "step": 13002, "time_per_iteration": 2.6368613243103027 }, { "auxiliary_loss_clip": 0.01117291, "auxiliary_loss_mlp": 0.01034517, "balance_loss_clip": 1.02093303, "balance_loss_mlp": 1.03793907, "epoch": 0.7817826544416053, "flos": 21250929749760.0, "grad_norm": 2.646705638287762, "language_loss": 0.73243982, "learning_rate": 4.5186898660664543e-07, "loss": 0.75395787, "num_input_tokens_seen": 280450785, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 13003, "time_per_iteration": 2.634472131729126 }, { "auxiliary_loss_clip": 0.011167, "auxiliary_loss_mlp": 0.0103409, "balance_loss_clip": 1.02186513, "balance_loss_mlp": 1.03791642, "epoch": 0.7818427776942732, "flos": 19932181822080.0, "grad_norm": 1.6998930459840975, "language_loss": 0.6235249, "learning_rate": 4.5162981283374346e-07, "loss": 0.64503282, "num_input_tokens_seen": 280468400, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 13004, "time_per_iteration": 2.5039706230163574 }, { "auxiliary_loss_clip": 0.01098951, "auxiliary_loss_mlp": 0.01031771, "balance_loss_clip": 1.02077937, "balance_loss_mlp": 1.03358936, "epoch": 0.7819029009469413, "flos": 11363753018880.0, "grad_norm": 1.9700541817776736, "language_loss": 0.83047915, "learning_rate": 4.513906943181902e-07, "loss": 0.85178638, "num_input_tokens_seen": 280483930, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.65625, "step": 13005, "time_per_iteration": 2.5216314792633057 }, { "auxiliary_loss_clip": 0.01130318, "auxiliary_loss_mlp": 0.01276959, "balance_loss_clip": 1.01707554, "balance_loss_mlp": 1.03488612, "epoch": 0.7819630241996092, "flos": 24279276470400.0, "grad_norm": 1.8333698037485107, "language_loss": 0.72307038, "learning_rate": 4.511516310685206e-07, "loss": 0.74714315, "num_input_tokens_seen": 280503465, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 13006, "time_per_iteration": 2.6500167846679688 }, { "auxiliary_loss_clip": 0.01114234, "auxiliary_loss_mlp": 0.01277942, "balance_loss_clip": 1.01805532, "balance_loss_mlp": 1.03468657, "epoch": 0.7820231474522772, "flos": 22708902792960.0, "grad_norm": 1.5851509721796953, "language_loss": 0.71752518, "learning_rate": 4.5091262309326404e-07, "loss": 0.74144697, "num_input_tokens_seen": 280523375, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 13007, "time_per_iteration": 2.645092010498047 }, { "auxiliary_loss_clip": 0.01106887, "auxiliary_loss_mlp": 0.01030612, "balance_loss_clip": 1.01728415, "balance_loss_mlp": 1.03443336, "epoch": 0.7820832707049451, "flos": 20047419630720.0, "grad_norm": 2.0660611496890935, "language_loss": 0.68828905, "learning_rate": 4.5067367040095196e-07, "loss": 0.70966405, "num_input_tokens_seen": 280542920, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 13008, "time_per_iteration": 2.5249428749084473 }, { "auxiliary_loss_clip": 0.011203, "auxiliary_loss_mlp": 0.01028596, "balance_loss_clip": 1.01535749, "balance_loss_mlp": 1.03289652, "epoch": 0.7821433939576131, "flos": 27162795553920.0, "grad_norm": 1.696760537379992, "language_loss": 0.6989283, "learning_rate": 4.50434773000111e-07, "loss": 0.72041726, "num_input_tokens_seen": 280561700, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69140625, "step": 13009, "time_per_iteration": 2.5749292373657227 }, { "auxiliary_loss_clip": 0.01128849, "auxiliary_loss_mlp": 0.01026832, "balance_loss_clip": 1.01454151, "balance_loss_mlp": 1.03377926, "epoch": 0.782203517210281, "flos": 22602068766720.0, "grad_norm": 1.715878925128016, "language_loss": 0.81670833, "learning_rate": 4.5019593089926735e-07, "loss": 0.83826512, "num_input_tokens_seen": 280580605, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 13010, "time_per_iteration": 2.56744122505188 }, { "auxiliary_loss_clip": 0.01111353, "auxiliary_loss_mlp": 0.01033257, "balance_loss_clip": 1.02199161, "balance_loss_mlp": 1.03486884, "epoch": 0.782263640462949, "flos": 29059812535680.0, "grad_norm": 1.600612984774932, "language_loss": 0.62416434, "learning_rate": 4.4995714410694405e-07, "loss": 0.64561045, "num_input_tokens_seen": 280601495, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.67578125, "step": 13011, "time_per_iteration": 2.6147968769073486 }, { "auxiliary_loss_clip": 0.01112297, "auxiliary_loss_mlp": 0.01027006, "balance_loss_clip": 1.01627135, "balance_loss_mlp": 1.03504038, "epoch": 0.782323763715617, "flos": 25299498464640.0, "grad_norm": 1.6476738445959453, "language_loss": 0.7007643, "learning_rate": 4.4971841263166263e-07, "loss": 0.72215736, "num_input_tokens_seen": 280622760, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.68359375, "step": 13012, "time_per_iteration": 2.5917956829071045 }, { "auxiliary_loss_clip": 0.01025036, "auxiliary_loss_mlp": 0.01001829, "balance_loss_clip": 1.00061345, "balance_loss_mlp": 1.0033828, "epoch": 0.782383886968285, "flos": 65194388668800.0, "grad_norm": 0.7122752708199497, "language_loss": 0.54946792, "learning_rate": 4.4947973648194446e-07, "loss": 0.5697366, "num_input_tokens_seen": 280687115, "router_z_loss_clip": 0.012146, "router_z_loss_mlp": 0.21679688, "step": 13013, "time_per_iteration": 3.218535900115967 }, { "auxiliary_loss_clip": 0.01128443, "auxiliary_loss_mlp": 0.01034927, "balance_loss_clip": 1.02274334, "balance_loss_mlp": 1.03907418, "epoch": 0.782444010220953, "flos": 18405440190720.0, "grad_norm": 1.9105633739052992, "language_loss": 0.65735388, "learning_rate": 4.4924111566630474e-07, "loss": 0.67898762, "num_input_tokens_seen": 280705000, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 13014, "time_per_iteration": 2.5307095050811768 }, { "auxiliary_loss_clip": 0.01042381, "auxiliary_loss_mlp": 0.01001002, "balance_loss_clip": 0.99983931, "balance_loss_mlp": 1.00300956, "epoch": 0.7825041334736209, "flos": 63955003841280.0, "grad_norm": 0.7255316531314775, "language_loss": 0.58515096, "learning_rate": 4.4900255019326126e-07, "loss": 0.60558474, "num_input_tokens_seen": 280773525, "router_z_loss_clip": 0.01159668, "router_z_loss_mlp": 0.21679688, "step": 13015, "time_per_iteration": 3.2092812061309814 }, { "auxiliary_loss_clip": 0.01120226, "auxiliary_loss_mlp": 0.01030085, "balance_loss_clip": 1.01855171, "balance_loss_mlp": 1.03389168, "epoch": 0.7825642567262889, "flos": 20339373375360.0, "grad_norm": 1.5065638312030536, "language_loss": 0.74316972, "learning_rate": 4.4876404007132663e-07, "loss": 0.76467282, "num_input_tokens_seen": 280791915, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 13016, "time_per_iteration": 2.5038983821868896 }, { "auxiliary_loss_clip": 0.01106047, "auxiliary_loss_mlp": 0.01032542, "balance_loss_clip": 1.01883876, "balance_loss_mlp": 1.03531456, "epoch": 0.7826243799789568, "flos": 20262955190400.0, "grad_norm": 1.6471638001195419, "language_loss": 0.75101662, "learning_rate": 4.4852558530901417e-07, "loss": 0.77240252, "num_input_tokens_seen": 280811460, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.70703125, "step": 13017, "time_per_iteration": 3.9618048667907715 }, { "auxiliary_loss_clip": 0.01123734, "auxiliary_loss_mlp": 0.01033615, "balance_loss_clip": 1.01989412, "balance_loss_mlp": 1.03452671, "epoch": 0.7826845032316249, "flos": 21132926593920.0, "grad_norm": 2.148664894171077, "language_loss": 0.7552675, "learning_rate": 4.4828718591483185e-07, "loss": 0.77684104, "num_input_tokens_seen": 280825415, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 13018, "time_per_iteration": 2.524472236633301 }, { "auxiliary_loss_clip": 0.01102263, "auxiliary_loss_mlp": 0.01029084, "balance_loss_clip": 1.01619148, "balance_loss_mlp": 1.03367925, "epoch": 0.7827446264842928, "flos": 22492253911680.0, "grad_norm": 1.4538524832654007, "language_loss": 0.77263618, "learning_rate": 4.4804884189728855e-07, "loss": 0.7939496, "num_input_tokens_seen": 280845335, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 13019, "time_per_iteration": 2.483318567276001 }, { "auxiliary_loss_clip": 0.01136655, "auxiliary_loss_mlp": 0.01027356, "balance_loss_clip": 1.0154767, "balance_loss_mlp": 1.03295195, "epoch": 0.7828047497369608, "flos": 28840649702400.0, "grad_norm": 1.5269546669336744, "language_loss": 0.67676717, "learning_rate": 4.4781055326489016e-07, "loss": 0.69840729, "num_input_tokens_seen": 280867145, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 13020, "time_per_iteration": 2.6218643188476562 }, { "auxiliary_loss_clip": 0.01115516, "auxiliary_loss_mlp": 0.01028956, "balance_loss_clip": 1.01659429, "balance_loss_mlp": 1.03703761, "epoch": 0.7828648729896287, "flos": 23257689759360.0, "grad_norm": 2.1518292785685884, "language_loss": 0.62249774, "learning_rate": 4.475723200261405e-07, "loss": 0.64394242, "num_input_tokens_seen": 280886185, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 13021, "time_per_iteration": 2.4855644702911377 }, { "auxiliary_loss_clip": 0.01101671, "auxiliary_loss_mlp": 0.01031046, "balance_loss_clip": 1.01962543, "balance_loss_mlp": 1.0338124, "epoch": 0.7829249962422967, "flos": 25265670831360.0, "grad_norm": 1.5708798428402486, "language_loss": 0.69147694, "learning_rate": 4.473341421895409e-07, "loss": 0.71280408, "num_input_tokens_seen": 280907665, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 13022, "time_per_iteration": 2.5620250701904297 }, { "auxiliary_loss_clip": 0.01060182, "auxiliary_loss_mlp": 0.00999063, "balance_loss_clip": 0.9978413, "balance_loss_mlp": 1.00257075, "epoch": 0.7829851194949646, "flos": 70722044645760.0, "grad_norm": 0.6400319585187182, "language_loss": 0.562096, "learning_rate": 4.4709601976359267e-07, "loss": 0.58268845, "num_input_tokens_seen": 280971405, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.21679688, "step": 13023, "time_per_iteration": 4.640643119812012 }, { "auxiliary_loss_clip": 0.01115221, "auxiliary_loss_mlp": 0.01028367, "balance_loss_clip": 1.01717281, "balance_loss_mlp": 1.03165638, "epoch": 0.7830452427476327, "flos": 25660795415040.0, "grad_norm": 1.7195871186742486, "language_loss": 0.67001945, "learning_rate": 4.468579527567922e-07, "loss": 0.6914553, "num_input_tokens_seen": 280989615, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.65625, "step": 13024, "time_per_iteration": 2.545987367630005 }, { "auxiliary_loss_clip": 0.01107834, "auxiliary_loss_mlp": 0.01028506, "balance_loss_clip": 1.01601887, "balance_loss_mlp": 1.03668034, "epoch": 0.7831053660003006, "flos": 22784315397120.0, "grad_norm": 1.7432581125240854, "language_loss": 0.77546895, "learning_rate": 4.466199411776366e-07, "loss": 0.79683232, "num_input_tokens_seen": 281009450, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 13025, "time_per_iteration": 2.5270283222198486 }, { "auxiliary_loss_clip": 0.01121705, "auxiliary_loss_mlp": 0.0103347, "balance_loss_clip": 1.02156711, "balance_loss_mlp": 1.03603733, "epoch": 0.7831654892529686, "flos": 25812267068160.0, "grad_norm": 1.8836372973562578, "language_loss": 0.78358459, "learning_rate": 4.463819850346193e-07, "loss": 0.80513632, "num_input_tokens_seen": 281028120, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 13026, "time_per_iteration": 2.559536933898926 }, { "auxiliary_loss_clip": 0.01114346, "auxiliary_loss_mlp": 0.01026807, "balance_loss_clip": 1.01470137, "balance_loss_mlp": 1.03735423, "epoch": 0.7832256125056366, "flos": 20771557816320.0, "grad_norm": 1.848421569889166, "language_loss": 0.75399214, "learning_rate": 4.4614408433623295e-07, "loss": 0.77540374, "num_input_tokens_seen": 281042130, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 13027, "time_per_iteration": 2.4833924770355225 }, { "auxiliary_loss_clip": 0.01113075, "auxiliary_loss_mlp": 0.01027361, "balance_loss_clip": 1.01530898, "balance_loss_mlp": 1.03593326, "epoch": 0.7832857357583045, "flos": 21506541909120.0, "grad_norm": 1.8104389618715004, "language_loss": 0.70526481, "learning_rate": 4.459062390909669e-07, "loss": 0.72666919, "num_input_tokens_seen": 281060945, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.68359375, "step": 13028, "time_per_iteration": 2.5146660804748535 }, { "auxiliary_loss_clip": 0.01143141, "auxiliary_loss_mlp": 0.01035261, "balance_loss_clip": 1.02247608, "balance_loss_mlp": 1.03545403, "epoch": 0.7833458590109725, "flos": 18077791305600.0, "grad_norm": 2.130870787033556, "language_loss": 0.69264442, "learning_rate": 4.456684493073093e-07, "loss": 0.71442842, "num_input_tokens_seen": 281079270, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 13029, "time_per_iteration": 2.6140971183776855 }, { "auxiliary_loss_clip": 0.01132239, "auxiliary_loss_mlp": 0.0103192, "balance_loss_clip": 1.01914716, "balance_loss_mlp": 1.03455639, "epoch": 0.7834059822636404, "flos": 28288738252800.0, "grad_norm": 1.7538310258882832, "language_loss": 0.81087655, "learning_rate": 4.454307149937475e-07, "loss": 0.8325181, "num_input_tokens_seen": 281099500, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 13030, "time_per_iteration": 4.003521919250488 }, { "auxiliary_loss_clip": 0.01115704, "auxiliary_loss_mlp": 0.01029397, "balance_loss_clip": 1.01701677, "balance_loss_mlp": 1.03444242, "epoch": 0.7834661055163085, "flos": 31686211088640.0, "grad_norm": 1.7694510733403126, "language_loss": 0.70582473, "learning_rate": 4.451930361587637e-07, "loss": 0.72727573, "num_input_tokens_seen": 281121250, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.72265625, "step": 13031, "time_per_iteration": 4.128575801849365 }, { "auxiliary_loss_clip": 0.01131804, "auxiliary_loss_mlp": 0.01031712, "balance_loss_clip": 1.01967192, "balance_loss_mlp": 1.03407097, "epoch": 0.7835262287689764, "flos": 12933192942720.0, "grad_norm": 1.7646156577879621, "language_loss": 0.78774714, "learning_rate": 4.4495541281084126e-07, "loss": 0.80938232, "num_input_tokens_seen": 281138760, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 13032, "time_per_iteration": 2.529792547225952 }, { "auxiliary_loss_clip": 0.01118423, "auxiliary_loss_mlp": 0.01041332, "balance_loss_clip": 1.02798676, "balance_loss_mlp": 1.03782558, "epoch": 0.7835863520216444, "flos": 16143211676160.0, "grad_norm": 2.0029548876379155, "language_loss": 0.6301403, "learning_rate": 4.4471784495845986e-07, "loss": 0.65173793, "num_input_tokens_seen": 281157420, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 13033, "time_per_iteration": 2.4798972606658936 }, { "auxiliary_loss_clip": 0.01113222, "auxiliary_loss_mlp": 0.01031446, "balance_loss_clip": 1.01866722, "balance_loss_mlp": 1.03546381, "epoch": 0.7836464752743123, "flos": 11509909459200.0, "grad_norm": 4.336362518411912, "language_loss": 0.71919507, "learning_rate": 4.444803326100988e-07, "loss": 0.74064183, "num_input_tokens_seen": 281174620, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 13034, "time_per_iteration": 2.521665096282959 }, { "auxiliary_loss_clip": 0.01107552, "auxiliary_loss_mlp": 0.01279784, "balance_loss_clip": 1.02010572, "balance_loss_mlp": 1.03714192, "epoch": 0.7837065985269803, "flos": 18223696350720.0, "grad_norm": 2.3428050528552204, "language_loss": 0.72136122, "learning_rate": 4.442428757742322e-07, "loss": 0.74523461, "num_input_tokens_seen": 281193865, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 13035, "time_per_iteration": 2.5041909217834473 }, { "auxiliary_loss_clip": 0.01106929, "auxiliary_loss_mlp": 0.01034228, "balance_loss_clip": 1.02131796, "balance_loss_mlp": 1.03672028, "epoch": 0.7837667217796482, "flos": 24754410599040.0, "grad_norm": 1.8410572902803726, "language_loss": 0.66224861, "learning_rate": 4.4400547445933624e-07, "loss": 0.68366021, "num_input_tokens_seen": 281212250, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 13036, "time_per_iteration": 2.5329947471618652 }, { "auxiliary_loss_clip": 0.01118466, "auxiliary_loss_mlp": 0.01036317, "balance_loss_clip": 1.02312088, "balance_loss_mlp": 1.03620553, "epoch": 0.7838268450323163, "flos": 22383121415040.0, "grad_norm": 3.0028563054989172, "language_loss": 0.72577167, "learning_rate": 4.4376812867388236e-07, "loss": 0.74731952, "num_input_tokens_seen": 281230850, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 13037, "time_per_iteration": 2.517233371734619 }, { "auxiliary_loss_clip": 0.01127739, "auxiliary_loss_mlp": 0.01034581, "balance_loss_clip": 1.02136087, "balance_loss_mlp": 1.03213418, "epoch": 0.7838869682849842, "flos": 19500284689920.0, "grad_norm": 2.2031054277582696, "language_loss": 0.60119939, "learning_rate": 4.4353083842634077e-07, "loss": 0.62282252, "num_input_tokens_seen": 281249810, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69140625, "step": 13038, "time_per_iteration": 2.55256986618042 }, { "auxiliary_loss_clip": 0.01119889, "auxiliary_loss_mlp": 0.01030217, "balance_loss_clip": 1.01830244, "balance_loss_mlp": 1.0343101, "epoch": 0.7839470915376522, "flos": 32892845690880.0, "grad_norm": 1.8808264431385129, "language_loss": 0.68448091, "learning_rate": 4.4329360372517957e-07, "loss": 0.70598191, "num_input_tokens_seen": 281273730, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 13039, "time_per_iteration": 2.6318764686584473 }, { "auxiliary_loss_clip": 0.01126104, "auxiliary_loss_mlp": 0.01022276, "balance_loss_clip": 1.01140952, "balance_loss_mlp": 1.03222179, "epoch": 0.7840072147903202, "flos": 29676003373440.0, "grad_norm": 2.2308898862484052, "language_loss": 0.69094515, "learning_rate": 4.430564245788662e-07, "loss": 0.71242893, "num_input_tokens_seen": 281293670, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.67578125, "step": 13040, "time_per_iteration": 2.601968288421631 }, { "auxiliary_loss_clip": 0.01139259, "auxiliary_loss_mlp": 0.01031714, "balance_loss_clip": 1.01866043, "balance_loss_mlp": 1.03387356, "epoch": 0.7840673380429881, "flos": 18186744234240.0, "grad_norm": 1.462380711779477, "language_loss": 0.67136049, "learning_rate": 4.428193009958634e-07, "loss": 0.69307017, "num_input_tokens_seen": 281313070, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 13041, "time_per_iteration": 2.500290632247925 }, { "auxiliary_loss_clip": 0.01033325, "auxiliary_loss_mlp": 0.01249902, "balance_loss_clip": 1.00294256, "balance_loss_mlp": 1.00306344, "epoch": 0.7841274612956561, "flos": 66346006613760.0, "grad_norm": 0.661192930905539, "language_loss": 0.57422107, "learning_rate": 4.425822329846338e-07, "loss": 0.59705335, "num_input_tokens_seen": 281374880, "router_z_loss_clip": 0.01171875, "router_z_loss_mlp": 0.21484375, "step": 13042, "time_per_iteration": 3.095259428024292 }, { "auxiliary_loss_clip": 0.01123911, "auxiliary_loss_mlp": 0.01029925, "balance_loss_clip": 1.01771832, "balance_loss_mlp": 1.03568554, "epoch": 0.784187584548324, "flos": 26648482665600.0, "grad_norm": 2.5749752319049763, "language_loss": 0.83862901, "learning_rate": 4.4234522055363885e-07, "loss": 0.86016738, "num_input_tokens_seen": 281392620, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 13043, "time_per_iteration": 2.584975004196167 }, { "auxiliary_loss_clip": 0.01111593, "auxiliary_loss_mlp": 0.01029012, "balance_loss_clip": 1.01737154, "balance_loss_mlp": 1.03511906, "epoch": 0.7842477078009921, "flos": 25740158515200.0, "grad_norm": 1.3830691600545957, "language_loss": 0.88357818, "learning_rate": 4.42108263711336e-07, "loss": 0.90498424, "num_input_tokens_seen": 281413140, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 13044, "time_per_iteration": 2.535266637802124 }, { "auxiliary_loss_clip": 0.01127579, "auxiliary_loss_mlp": 0.01030499, "balance_loss_clip": 1.01792884, "balance_loss_mlp": 1.03670216, "epoch": 0.78430783105366, "flos": 21980957765760.0, "grad_norm": 1.6513382121927054, "language_loss": 0.78890002, "learning_rate": 4.4187136246618183e-07, "loss": 0.81048083, "num_input_tokens_seen": 281430860, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 13045, "time_per_iteration": 2.5672988891601562 }, { "auxiliary_loss_clip": 0.01119853, "auxiliary_loss_mlp": 0.01030203, "balance_loss_clip": 1.01834142, "balance_loss_mlp": 1.03394389, "epoch": 0.784367954306328, "flos": 23842279607040.0, "grad_norm": 1.5415772580567708, "language_loss": 0.7227819, "learning_rate": 4.4163451682663045e-07, "loss": 0.74428248, "num_input_tokens_seen": 281451385, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 13046, "time_per_iteration": 2.5478057861328125 }, { "auxiliary_loss_clip": 0.01138093, "auxiliary_loss_mlp": 0.01036187, "balance_loss_clip": 1.02315712, "balance_loss_mlp": 1.03539741, "epoch": 0.7844280775589959, "flos": 24826662806400.0, "grad_norm": 3.2572409129041544, "language_loss": 0.63332242, "learning_rate": 4.413977268011355e-07, "loss": 0.65506518, "num_input_tokens_seen": 281472255, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.671875, "step": 13047, "time_per_iteration": 2.574883460998535 }, { "auxiliary_loss_clip": 0.01111321, "auxiliary_loss_mlp": 0.01028192, "balance_loss_clip": 1.01637208, "balance_loss_mlp": 1.03516972, "epoch": 0.7844882008116639, "flos": 22455660931200.0, "grad_norm": 1.5235058288428474, "language_loss": 0.73188722, "learning_rate": 4.411609923981454e-07, "loss": 0.75328231, "num_input_tokens_seen": 281492860, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 13048, "time_per_iteration": 2.4972972869873047 }, { "auxiliary_loss_clip": 0.01125223, "auxiliary_loss_mlp": 0.01032324, "balance_loss_clip": 1.01953268, "balance_loss_mlp": 1.03636289, "epoch": 0.7845483240643318, "flos": 26104041244800.0, "grad_norm": 2.1034523914965915, "language_loss": 0.74636042, "learning_rate": 4.4092431362611006e-07, "loss": 0.76793593, "num_input_tokens_seen": 281511815, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 13049, "time_per_iteration": 2.5260229110717773 }, { "auxiliary_loss_clip": 0.01108286, "auxiliary_loss_mlp": 0.01033232, "balance_loss_clip": 1.01946354, "balance_loss_mlp": 1.03503728, "epoch": 0.7846084473169999, "flos": 19354307817600.0, "grad_norm": 1.8266630720279617, "language_loss": 0.72770798, "learning_rate": 4.406876904934758e-07, "loss": 0.74912322, "num_input_tokens_seen": 281530090, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 13050, "time_per_iteration": 2.4353373050689697 }, { "auxiliary_loss_clip": 0.01112529, "auxiliary_loss_mlp": 0.01034736, "balance_loss_clip": 1.02252936, "balance_loss_mlp": 1.03530657, "epoch": 0.7846685705696678, "flos": 23325811902720.0, "grad_norm": 1.9439736750145802, "language_loss": 0.73286819, "learning_rate": 4.404511230086867e-07, "loss": 0.75434089, "num_input_tokens_seen": 281547075, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 13051, "time_per_iteration": 2.591200590133667 }, { "auxiliary_loss_clip": 0.01107251, "auxiliary_loss_mlp": 0.01034887, "balance_loss_clip": 1.02188754, "balance_loss_mlp": 1.03649807, "epoch": 0.7847286938223358, "flos": 35809545962880.0, "grad_norm": 1.7436775160827873, "language_loss": 0.72474837, "learning_rate": 4.4021461118018476e-07, "loss": 0.74616981, "num_input_tokens_seen": 281568080, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 13052, "time_per_iteration": 2.631026268005371 }, { "auxiliary_loss_clip": 0.01118492, "auxiliary_loss_mlp": 0.01033645, "balance_loss_clip": 1.02152109, "balance_loss_mlp": 1.03513408, "epoch": 0.7847888170750038, "flos": 18478159274880.0, "grad_norm": 1.761877986040295, "language_loss": 0.68434185, "learning_rate": 4.399781550164119e-07, "loss": 0.70586324, "num_input_tokens_seen": 281586925, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.65625, "step": 13053, "time_per_iteration": 2.5069267749786377 }, { "auxiliary_loss_clip": 0.01111837, "auxiliary_loss_mlp": 0.01028854, "balance_loss_clip": 1.01711762, "balance_loss_mlp": 1.03462648, "epoch": 0.7848489403276717, "flos": 25119155255040.0, "grad_norm": 2.1076879484726074, "language_loss": 0.69865024, "learning_rate": 4.3974175452580555e-07, "loss": 0.72005713, "num_input_tokens_seen": 281603915, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 13054, "time_per_iteration": 2.5509655475616455 }, { "auxiliary_loss_clip": 0.01115375, "auxiliary_loss_mlp": 0.01031867, "balance_loss_clip": 1.01926029, "balance_loss_mlp": 1.03537107, "epoch": 0.7849090635803397, "flos": 26502433966080.0, "grad_norm": 1.5652531141818993, "language_loss": 0.75932562, "learning_rate": 4.395054097168027e-07, "loss": 0.78079802, "num_input_tokens_seen": 281624220, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 13055, "time_per_iteration": 2.5513994693756104 }, { "auxiliary_loss_clip": 0.01109238, "auxiliary_loss_mlp": 0.01031933, "balance_loss_clip": 1.0189513, "balance_loss_mlp": 1.03680873, "epoch": 0.7849691868330076, "flos": 20959658363520.0, "grad_norm": 1.8017020206997496, "language_loss": 0.74756902, "learning_rate": 4.3926912059783763e-07, "loss": 0.76898074, "num_input_tokens_seen": 281642325, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 13056, "time_per_iteration": 2.468627452850342 }, { "auxiliary_loss_clip": 0.01148796, "auxiliary_loss_mlp": 0.01028044, "balance_loss_clip": 1.0156703, "balance_loss_mlp": 1.03500128, "epoch": 0.7850293100856757, "flos": 26067484177920.0, "grad_norm": 1.8783041766140134, "language_loss": 0.70052034, "learning_rate": 4.39032887177343e-07, "loss": 0.72228873, "num_input_tokens_seen": 281663065, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 13057, "time_per_iteration": 2.651061773300171 }, { "auxiliary_loss_clip": 0.01133551, "auxiliary_loss_mlp": 0.01030041, "balance_loss_clip": 1.01805997, "balance_loss_mlp": 1.03510129, "epoch": 0.7850894333383436, "flos": 22491894775680.0, "grad_norm": 2.355628317171486, "language_loss": 0.76457167, "learning_rate": 4.3879670946374923e-07, "loss": 0.78620756, "num_input_tokens_seen": 281681005, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 13058, "time_per_iteration": 3.9454479217529297 }, { "auxiliary_loss_clip": 0.01114648, "auxiliary_loss_mlp": 0.01031148, "balance_loss_clip": 1.01840413, "balance_loss_mlp": 1.03645062, "epoch": 0.7851495565910116, "flos": 20558643949440.0, "grad_norm": 1.988723005574535, "language_loss": 0.70601135, "learning_rate": 4.385605874654845e-07, "loss": 0.72746933, "num_input_tokens_seen": 281697965, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 13059, "time_per_iteration": 2.471653938293457 }, { "auxiliary_loss_clip": 0.0113673, "auxiliary_loss_mlp": 0.01036354, "balance_loss_clip": 1.02272797, "balance_loss_mlp": 1.03638959, "epoch": 0.7852096798436795, "flos": 15924838942080.0, "grad_norm": 1.8100822625604256, "language_loss": 0.76720798, "learning_rate": 4.383245211909765e-07, "loss": 0.78893876, "num_input_tokens_seen": 281716035, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 13060, "time_per_iteration": 2.4964981079101562 }, { "auxiliary_loss_clip": 0.01123643, "auxiliary_loss_mlp": 0.01031665, "balance_loss_clip": 1.01878476, "balance_loss_mlp": 1.03648043, "epoch": 0.7852698030963475, "flos": 19062282245760.0, "grad_norm": 2.0652904078748513, "language_loss": 0.77068079, "learning_rate": 4.380885106486494e-07, "loss": 0.79223388, "num_input_tokens_seen": 281732815, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 13061, "time_per_iteration": 2.486199378967285 }, { "auxiliary_loss_clip": 0.0113117, "auxiliary_loss_mlp": 0.01029441, "balance_loss_clip": 1.01640534, "balance_loss_mlp": 1.03431475, "epoch": 0.7853299263490154, "flos": 24644380262400.0, "grad_norm": 1.8516976718849836, "language_loss": 0.74248004, "learning_rate": 4.378525558469255e-07, "loss": 0.76408613, "num_input_tokens_seen": 281751980, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 13062, "time_per_iteration": 2.5295422077178955 }, { "auxiliary_loss_clip": 0.01110493, "auxiliary_loss_mlp": 0.01031719, "balance_loss_clip": 1.01973224, "balance_loss_mlp": 1.03373122, "epoch": 0.7853900496016835, "flos": 22017981709440.0, "grad_norm": 1.4953666657206133, "language_loss": 0.68400693, "learning_rate": 4.37616656794225e-07, "loss": 0.70542902, "num_input_tokens_seen": 281772670, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 13063, "time_per_iteration": 2.5114922523498535 }, { "auxiliary_loss_clip": 0.01114967, "auxiliary_loss_mlp": 0.01032367, "balance_loss_clip": 1.0201776, "balance_loss_mlp": 1.03852355, "epoch": 0.7854501728543514, "flos": 30227412032640.0, "grad_norm": 1.8458676191661663, "language_loss": 0.73163879, "learning_rate": 4.3738081349896805e-07, "loss": 0.75311214, "num_input_tokens_seen": 281792930, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 13064, "time_per_iteration": 2.583364486694336 }, { "auxiliary_loss_clip": 0.01133632, "auxiliary_loss_mlp": 0.01031884, "balance_loss_clip": 1.01947427, "balance_loss_mlp": 1.03558636, "epoch": 0.7855102961070194, "flos": 18843694030080.0, "grad_norm": 1.8398775113206662, "language_loss": 0.68268329, "learning_rate": 4.3714502596956926e-07, "loss": 0.70433843, "num_input_tokens_seen": 281811805, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 13065, "time_per_iteration": 3.8928613662719727 }, { "auxiliary_loss_clip": 0.01116323, "auxiliary_loss_mlp": 0.01033913, "balance_loss_clip": 1.02165842, "balance_loss_mlp": 1.03821278, "epoch": 0.7855704193596874, "flos": 22309971367680.0, "grad_norm": 1.919884683116487, "language_loss": 0.76289004, "learning_rate": 4.36909294214445e-07, "loss": 0.78439242, "num_input_tokens_seen": 281831885, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 13066, "time_per_iteration": 2.5417776107788086 }, { "auxiliary_loss_clip": 0.01024177, "auxiliary_loss_mlp": 0.01002662, "balance_loss_clip": 1.00148189, "balance_loss_mlp": 1.00270712, "epoch": 0.7856305426123553, "flos": 60004434407040.0, "grad_norm": 0.7020498514961334, "language_loss": 0.53390759, "learning_rate": 4.366736182420074e-07, "loss": 0.55417603, "num_input_tokens_seen": 281900310, "router_z_loss_clip": 0.01177979, "router_z_loss_mlp": 0.21484375, "step": 13067, "time_per_iteration": 3.169151782989502 }, { "auxiliary_loss_clip": 0.01139152, "auxiliary_loss_mlp": 0.01033213, "balance_loss_clip": 1.02004671, "balance_loss_mlp": 1.03381801, "epoch": 0.7856906658650233, "flos": 21868593045120.0, "grad_norm": 3.6446818704845474, "language_loss": 0.67427129, "learning_rate": 4.3643799806066693e-07, "loss": 0.69599497, "num_input_tokens_seen": 281918870, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 13068, "time_per_iteration": 2.57665753364563 }, { "auxiliary_loss_clip": 0.01128398, "auxiliary_loss_mlp": 0.01031142, "balance_loss_clip": 1.01886988, "balance_loss_mlp": 1.03500772, "epoch": 0.7857507891176913, "flos": 23622937205760.0, "grad_norm": 2.1452643175522863, "language_loss": 0.68234909, "learning_rate": 4.3620243367883167e-07, "loss": 0.70394444, "num_input_tokens_seen": 281936905, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.67578125, "step": 13069, "time_per_iteration": 2.577383041381836 }, { "auxiliary_loss_clip": 0.01115098, "auxiliary_loss_mlp": 0.010368, "balance_loss_clip": 1.02416921, "balance_loss_mlp": 1.03617942, "epoch": 0.7858109123703593, "flos": 25520061928320.0, "grad_norm": 1.596844469983988, "language_loss": 0.76946688, "learning_rate": 4.359669251049096e-07, "loss": 0.79098582, "num_input_tokens_seen": 281955625, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 13070, "time_per_iteration": 2.6447768211364746 }, { "auxiliary_loss_clip": 0.01138147, "auxiliary_loss_mlp": 0.0103009, "balance_loss_clip": 1.01822865, "balance_loss_mlp": 1.0335269, "epoch": 0.7858710356230272, "flos": 17457398576640.0, "grad_norm": 1.5861585576155959, "language_loss": 0.65820205, "learning_rate": 4.3573147234730536e-07, "loss": 0.67988443, "num_input_tokens_seen": 281973285, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 13071, "time_per_iteration": 2.521432638168335 }, { "auxiliary_loss_clip": 0.01139876, "auxiliary_loss_mlp": 0.01028294, "balance_loss_clip": 1.01656425, "balance_loss_mlp": 1.03340995, "epoch": 0.7859311588756952, "flos": 24679680353280.0, "grad_norm": 2.7472827716872406, "language_loss": 0.73932803, "learning_rate": 4.3549607541441993e-07, "loss": 0.76100969, "num_input_tokens_seen": 281991410, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.70703125, "step": 13072, "time_per_iteration": 4.037204027175903 }, { "auxiliary_loss_clip": 0.01116526, "auxiliary_loss_mlp": 0.01030609, "balance_loss_clip": 1.01747227, "balance_loss_mlp": 1.03675067, "epoch": 0.7859912821283631, "flos": 21799142098560.0, "grad_norm": 1.9299544729641431, "language_loss": 0.7151649, "learning_rate": 4.352607343146559e-07, "loss": 0.73663622, "num_input_tokens_seen": 282010845, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 13073, "time_per_iteration": 3.968552589416504 }, { "auxiliary_loss_clip": 0.01121581, "auxiliary_loss_mlp": 0.0103187, "balance_loss_clip": 1.0186913, "balance_loss_mlp": 1.03451276, "epoch": 0.7860514053810311, "flos": 20847293642880.0, "grad_norm": 1.5487316692906559, "language_loss": 0.76890171, "learning_rate": 4.3502544905641113e-07, "loss": 0.79043621, "num_input_tokens_seen": 282029635, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6875, "step": 13074, "time_per_iteration": 2.553494453430176 }, { "auxiliary_loss_clip": 0.01127616, "auxiliary_loss_mlp": 0.0102942, "balance_loss_clip": 1.01767778, "balance_loss_mlp": 1.03399074, "epoch": 0.786111528633699, "flos": 24315689882880.0, "grad_norm": 3.227679869203487, "language_loss": 0.75161719, "learning_rate": 4.347902196480826e-07, "loss": 0.77318752, "num_input_tokens_seen": 282050285, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 13075, "time_per_iteration": 2.5383460521698 }, { "auxiliary_loss_clip": 0.01130623, "auxiliary_loss_mlp": 0.01024413, "balance_loss_clip": 1.01299858, "balance_loss_mlp": 1.03497982, "epoch": 0.7861716518863671, "flos": 24353180703360.0, "grad_norm": 1.5063107536371834, "language_loss": 0.68690199, "learning_rate": 4.3455504609806426e-07, "loss": 0.70845234, "num_input_tokens_seen": 282071040, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6953125, "step": 13076, "time_per_iteration": 2.5624613761901855 }, { "auxiliary_loss_clip": 0.01110935, "auxiliary_loss_mlp": 0.01026456, "balance_loss_clip": 1.01432037, "balance_loss_mlp": 1.03479743, "epoch": 0.786231775139035, "flos": 14022399006720.0, "grad_norm": 2.1085201685919435, "language_loss": 0.79772317, "learning_rate": 4.3431992841475004e-07, "loss": 0.81909716, "num_input_tokens_seen": 282086610, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 13077, "time_per_iteration": 2.4828412532806396 }, { "auxiliary_loss_clip": 0.01113273, "auxiliary_loss_mlp": 0.01035472, "balance_loss_clip": 1.02265728, "balance_loss_mlp": 1.03557873, "epoch": 0.786291898391703, "flos": 33724248865920.0, "grad_norm": 1.6606783963421203, "language_loss": 0.70709074, "learning_rate": 4.340848666065302e-07, "loss": 0.72857815, "num_input_tokens_seen": 282107440, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 13078, "time_per_iteration": 2.609269142150879 }, { "auxiliary_loss_clip": 0.01112704, "auxiliary_loss_mlp": 0.01035156, "balance_loss_clip": 1.0234139, "balance_loss_mlp": 1.03583789, "epoch": 0.786352021644371, "flos": 25811476968960.0, "grad_norm": 1.4360864601138748, "language_loss": 0.81029248, "learning_rate": 4.338498606817935e-07, "loss": 0.83177102, "num_input_tokens_seen": 282127290, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.67578125, "step": 13079, "time_per_iteration": 2.5717732906341553 }, { "auxiliary_loss_clip": 0.01127782, "auxiliary_loss_mlp": 0.01028212, "balance_loss_clip": 1.01607621, "balance_loss_mlp": 1.0346992, "epoch": 0.7864121448970389, "flos": 28910818920960.0, "grad_norm": 1.482746303701884, "language_loss": 0.68368804, "learning_rate": 4.336149106489262e-07, "loss": 0.705248, "num_input_tokens_seen": 282147505, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.66015625, "step": 13080, "time_per_iteration": 2.613856554031372 }, { "auxiliary_loss_clip": 0.01112415, "auxiliary_loss_mlp": 0.01032997, "balance_loss_clip": 1.02005088, "balance_loss_mlp": 1.03493607, "epoch": 0.7864722681497069, "flos": 19208833735680.0, "grad_norm": 1.7071021728206854, "language_loss": 0.69402772, "learning_rate": 4.3338001651631464e-07, "loss": 0.71548182, "num_input_tokens_seen": 282166450, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 13081, "time_per_iteration": 2.5686025619506836 }, { "auxiliary_loss_clip": 0.01122398, "auxiliary_loss_mlp": 0.01034793, "balance_loss_clip": 1.02244306, "balance_loss_mlp": 1.03429306, "epoch": 0.7865323914023749, "flos": 21871573873920.0, "grad_norm": 1.5983548918364565, "language_loss": 0.68112296, "learning_rate": 4.331451782923392e-07, "loss": 0.70269489, "num_input_tokens_seen": 282186465, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 13082, "time_per_iteration": 2.5899651050567627 }, { "auxiliary_loss_clip": 0.01121318, "auxiliary_loss_mlp": 0.010307, "balance_loss_clip": 1.01864743, "balance_loss_mlp": 1.03442609, "epoch": 0.7865925146550429, "flos": 25520313323520.0, "grad_norm": 1.8658827946929082, "language_loss": 0.66159177, "learning_rate": 4.3291039598538237e-07, "loss": 0.68311191, "num_input_tokens_seen": 282207180, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 13083, "time_per_iteration": 2.5646753311157227 }, { "auxiliary_loss_clip": 0.0112575, "auxiliary_loss_mlp": 0.01033065, "balance_loss_clip": 1.0197016, "balance_loss_mlp": 1.03528857, "epoch": 0.7866526379077108, "flos": 19097366855040.0, "grad_norm": 2.304708039281244, "language_loss": 0.74830282, "learning_rate": 4.3267566960382273e-07, "loss": 0.76989096, "num_input_tokens_seen": 282225865, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 13084, "time_per_iteration": 2.530935764312744 }, { "auxiliary_loss_clip": 0.01121619, "auxiliary_loss_mlp": 0.01035159, "balance_loss_clip": 1.02332139, "balance_loss_mlp": 1.03622365, "epoch": 0.7867127611603788, "flos": 16173771171840.0, "grad_norm": 2.1094311955861156, "language_loss": 0.70093679, "learning_rate": 4.324409991560367e-07, "loss": 0.72250456, "num_input_tokens_seen": 282242895, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 13085, "time_per_iteration": 2.488834857940674 }, { "auxiliary_loss_clip": 0.01124744, "auxiliary_loss_mlp": 0.01028462, "balance_loss_clip": 1.01571822, "balance_loss_mlp": 1.03415942, "epoch": 0.7867728844130467, "flos": 20773640805120.0, "grad_norm": 1.7869421137058878, "language_loss": 0.72004795, "learning_rate": 4.3220638465039916e-07, "loss": 0.74158001, "num_input_tokens_seen": 282260425, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 13086, "time_per_iteration": 2.589829921722412 }, { "auxiliary_loss_clip": 0.01103471, "auxiliary_loss_mlp": 0.01028469, "balance_loss_clip": 1.01620817, "balance_loss_mlp": 1.0340414, "epoch": 0.7868330076657147, "flos": 21760106993280.0, "grad_norm": 2.007013740932568, "language_loss": 0.74548149, "learning_rate": 4.319718260952823e-07, "loss": 0.76680088, "num_input_tokens_seen": 282279335, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 13087, "time_per_iteration": 2.4672865867614746 }, { "auxiliary_loss_clip": 0.01118973, "auxiliary_loss_mlp": 0.01031954, "balance_loss_clip": 1.01986027, "balance_loss_mlp": 1.03425288, "epoch": 0.7868931309183826, "flos": 25700692446720.0, "grad_norm": 1.5979626510645455, "language_loss": 0.71253097, "learning_rate": 4.317373234990587e-07, "loss": 0.73404026, "num_input_tokens_seen": 282299905, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 13088, "time_per_iteration": 2.603807210922241 }, { "auxiliary_loss_clip": 0.01024075, "auxiliary_loss_mlp": 0.00998815, "balance_loss_clip": 0.99765825, "balance_loss_mlp": 1.00254393, "epoch": 0.7869532541710507, "flos": 64644883430400.0, "grad_norm": 0.6736944323195354, "language_loss": 0.55496776, "learning_rate": 4.3150287687009477e-07, "loss": 0.57519662, "num_input_tokens_seen": 282367620, "router_z_loss_clip": 0.01153564, "router_z_loss_mlp": 0.21484375, "step": 13089, "time_per_iteration": 3.149465322494507 }, { "auxiliary_loss_clip": 0.01033338, "auxiliary_loss_mlp": 0.01002032, "balance_loss_clip": 1.00087023, "balance_loss_mlp": 1.00324595, "epoch": 0.7870133774237186, "flos": 67453600440960.0, "grad_norm": 0.7219721354136924, "language_loss": 0.49968699, "learning_rate": 4.3126848621675905e-07, "loss": 0.52004063, "num_input_tokens_seen": 282435695, "router_z_loss_clip": 0.01159668, "router_z_loss_mlp": 0.21484375, "step": 13090, "time_per_iteration": 3.235854148864746 }, { "auxiliary_loss_clip": 0.01111555, "auxiliary_loss_mlp": 0.01030698, "balance_loss_clip": 1.01903319, "balance_loss_mlp": 1.03551435, "epoch": 0.7870735006763866, "flos": 26068310190720.0, "grad_norm": 1.794753388217414, "language_loss": 0.8374474, "learning_rate": 4.3103415154741583e-07, "loss": 0.85886997, "num_input_tokens_seen": 282456025, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 13091, "time_per_iteration": 2.5687057971954346 }, { "auxiliary_loss_clip": 0.01033555, "auxiliary_loss_mlp": 0.01002708, "balance_loss_clip": 1.00159323, "balance_loss_mlp": 1.00313163, "epoch": 0.7871336239290546, "flos": 70289572896000.0, "grad_norm": 0.7116422676019067, "language_loss": 0.63944399, "learning_rate": 4.307998728704281e-07, "loss": 0.65980661, "num_input_tokens_seen": 282520995, "router_z_loss_clip": 0.01116943, "router_z_loss_mlp": 0.21484375, "step": 13092, "time_per_iteration": 3.166750192642212 }, { "auxiliary_loss_clip": 0.01124447, "auxiliary_loss_mlp": 0.01035025, "balance_loss_clip": 1.02196002, "balance_loss_mlp": 1.03537452, "epoch": 0.7871937471817225, "flos": 15778574760960.0, "grad_norm": 2.008680429824938, "language_loss": 0.79512042, "learning_rate": 4.305656501941557e-07, "loss": 0.81671512, "num_input_tokens_seen": 282539355, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 13093, "time_per_iteration": 2.572338342666626 }, { "auxiliary_loss_clip": 0.01121113, "auxiliary_loss_mlp": 0.01026112, "balance_loss_clip": 1.01419675, "balance_loss_mlp": 1.03493786, "epoch": 0.7872538704343905, "flos": 20485242506880.0, "grad_norm": 1.7969619495736429, "language_loss": 0.75514591, "learning_rate": 4.3033148352695915e-07, "loss": 0.77661806, "num_input_tokens_seen": 282555735, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 13094, "time_per_iteration": 2.5039663314819336 }, { "auxiliary_loss_clip": 0.01141786, "auxiliary_loss_mlp": 0.01036699, "balance_loss_clip": 1.02484941, "balance_loss_mlp": 1.03576934, "epoch": 0.7873139936870585, "flos": 25082670015360.0, "grad_norm": 1.6701695622390174, "language_loss": 0.79539353, "learning_rate": 4.3009737287719327e-07, "loss": 0.81717837, "num_input_tokens_seen": 282574550, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.703125, "step": 13095, "time_per_iteration": 2.594085693359375 }, { "auxiliary_loss_clip": 0.01113557, "auxiliary_loss_mlp": 0.01035253, "balance_loss_clip": 1.02222943, "balance_loss_mlp": 1.03409564, "epoch": 0.7873741169397265, "flos": 30883176679680.0, "grad_norm": 1.588374415827945, "language_loss": 0.67870033, "learning_rate": 4.2986331825321455e-07, "loss": 0.7001884, "num_input_tokens_seen": 282596520, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 13096, "time_per_iteration": 2.588719129562378 }, { "auxiliary_loss_clip": 0.01115973, "auxiliary_loss_mlp": 0.01028798, "balance_loss_clip": 1.01817024, "balance_loss_mlp": 1.03209782, "epoch": 0.7874342401923944, "flos": 46791962242560.0, "grad_norm": 1.5253878619584014, "language_loss": 0.70634103, "learning_rate": 4.296293196633745e-07, "loss": 0.72778881, "num_input_tokens_seen": 282620560, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.66015625, "step": 13097, "time_per_iteration": 2.7595293521881104 }, { "auxiliary_loss_clip": 0.01125249, "auxiliary_loss_mlp": 0.01038801, "balance_loss_clip": 1.02617669, "balance_loss_mlp": 1.03662729, "epoch": 0.7874943634450624, "flos": 23584548545280.0, "grad_norm": 1.6714663933566487, "language_loss": 0.80622041, "learning_rate": 4.293953771160257e-07, "loss": 0.82786095, "num_input_tokens_seen": 282639830, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 13098, "time_per_iteration": 2.541374444961548 }, { "auxiliary_loss_clip": 0.01128929, "auxiliary_loss_mlp": 0.01030197, "balance_loss_clip": 1.01864576, "balance_loss_mlp": 1.03465092, "epoch": 0.7875544866977303, "flos": 20191169859840.0, "grad_norm": 1.871502427892839, "language_loss": 0.74494761, "learning_rate": 4.291614906195147e-07, "loss": 0.76653886, "num_input_tokens_seen": 282660130, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 13099, "time_per_iteration": 2.6297290325164795 }, { "auxiliary_loss_clip": 0.01121555, "auxiliary_loss_mlp": 0.01023715, "balance_loss_clip": 1.01165712, "balance_loss_mlp": 1.03521562, "epoch": 0.7876146099503983, "flos": 22602571557120.0, "grad_norm": 2.786330807289081, "language_loss": 0.78328389, "learning_rate": 4.2892766018218985e-07, "loss": 0.80473655, "num_input_tokens_seen": 282681125, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 13100, "time_per_iteration": 3.931926727294922 }, { "auxiliary_loss_clip": 0.01118666, "auxiliary_loss_mlp": 0.01279329, "balance_loss_clip": 1.01867795, "balance_loss_mlp": 1.03527784, "epoch": 0.7876747332030662, "flos": 10705833555840.0, "grad_norm": 2.363124338530355, "language_loss": 0.66367298, "learning_rate": 4.286938858123963e-07, "loss": 0.68765289, "num_input_tokens_seen": 282696690, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 13101, "time_per_iteration": 2.4582786560058594 }, { "auxiliary_loss_clip": 0.01120846, "auxiliary_loss_mlp": 0.01032962, "balance_loss_clip": 1.02126157, "balance_loss_mlp": 1.034253, "epoch": 0.7877348564557343, "flos": 38399315621760.0, "grad_norm": 1.415500476548467, "language_loss": 0.77531582, "learning_rate": 4.2846016751847494e-07, "loss": 0.7968539, "num_input_tokens_seen": 282721210, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 13102, "time_per_iteration": 2.7059898376464844 }, { "auxiliary_loss_clip": 0.01114415, "auxiliary_loss_mlp": 0.01039492, "balance_loss_clip": 1.02789927, "balance_loss_mlp": 1.03594518, "epoch": 0.7877949797084022, "flos": 18329524796160.0, "grad_norm": 2.0522476717852673, "language_loss": 0.82371426, "learning_rate": 4.282265053087681e-07, "loss": 0.84525335, "num_input_tokens_seen": 282738505, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 13103, "time_per_iteration": 2.474048137664795 }, { "auxiliary_loss_clip": 0.01116537, "auxiliary_loss_mlp": 0.01033163, "balance_loss_clip": 1.01984191, "balance_loss_mlp": 1.03685164, "epoch": 0.7878551029610702, "flos": 25806736373760.0, "grad_norm": 5.3343784253746005, "language_loss": 0.8037641, "learning_rate": 4.279928991916137e-07, "loss": 0.82526112, "num_input_tokens_seen": 282756895, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 13104, "time_per_iteration": 2.5302889347076416 }, { "auxiliary_loss_clip": 0.01105784, "auxiliary_loss_mlp": 0.01032431, "balance_loss_clip": 1.01987815, "balance_loss_mlp": 1.03529286, "epoch": 0.7879152262137382, "flos": 22342685679360.0, "grad_norm": 1.583515742293529, "language_loss": 0.73996168, "learning_rate": 4.2775934917535015e-07, "loss": 0.76134384, "num_input_tokens_seen": 282774955, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 13105, "time_per_iteration": 2.4892308712005615 }, { "auxiliary_loss_clip": 0.01151818, "auxiliary_loss_mlp": 0.01038589, "balance_loss_clip": 1.0257442, "balance_loss_mlp": 1.03539276, "epoch": 0.7879753494664061, "flos": 24785329230720.0, "grad_norm": 1.6189820880338759, "language_loss": 0.76066858, "learning_rate": 4.275258552683101e-07, "loss": 0.78257263, "num_input_tokens_seen": 282793165, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 13106, "time_per_iteration": 3.9978952407836914 }, { "auxiliary_loss_clip": 0.01140776, "auxiliary_loss_mlp": 0.01031363, "balance_loss_clip": 1.01888752, "balance_loss_mlp": 1.03508902, "epoch": 0.7880354727190741, "flos": 16909078487040.0, "grad_norm": 2.2743706745583965, "language_loss": 0.73288298, "learning_rate": 4.272924174788279e-07, "loss": 0.75460446, "num_input_tokens_seen": 282809820, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 13107, "time_per_iteration": 2.546753168106079 }, { "auxiliary_loss_clip": 0.01102125, "auxiliary_loss_mlp": 0.0102965, "balance_loss_clip": 1.01760364, "balance_loss_mlp": 1.03375793, "epoch": 0.7880955959717421, "flos": 22230500526720.0, "grad_norm": 1.6636336150248185, "language_loss": 0.73137921, "learning_rate": 4.2705903581523396e-07, "loss": 0.75269699, "num_input_tokens_seen": 282828600, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 13108, "time_per_iteration": 2.564169406890869 }, { "auxiliary_loss_clip": 0.01023678, "auxiliary_loss_mlp": 0.01000072, "balance_loss_clip": 0.99887389, "balance_loss_mlp": 1.00218463, "epoch": 0.7881557192244101, "flos": 69183200131200.0, "grad_norm": 0.885398994153934, "language_loss": 0.60364175, "learning_rate": 4.268257102858568e-07, "loss": 0.62387919, "num_input_tokens_seen": 282882775, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.21484375, "step": 13109, "time_per_iteration": 3.0986037254333496 }, { "auxiliary_loss_clip": 0.01129046, "auxiliary_loss_mlp": 0.0103019, "balance_loss_clip": 1.01817966, "balance_loss_mlp": 1.03512704, "epoch": 0.788215842477078, "flos": 24935436167040.0, "grad_norm": 1.6588591267044932, "language_loss": 0.7244041, "learning_rate": 4.265924408990227e-07, "loss": 0.74599648, "num_input_tokens_seen": 282902680, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6640625, "step": 13110, "time_per_iteration": 2.6040544509887695 }, { "auxiliary_loss_clip": 0.01137319, "auxiliary_loss_mlp": 0.01028869, "balance_loss_clip": 1.01715076, "balance_loss_mlp": 1.03509855, "epoch": 0.788275965729746, "flos": 26106483369600.0, "grad_norm": 1.3981988240632974, "language_loss": 0.74932468, "learning_rate": 4.263592276630583e-07, "loss": 0.77098656, "num_input_tokens_seen": 282923625, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66796875, "step": 13111, "time_per_iteration": 2.631730556488037 }, { "auxiliary_loss_clip": 0.01113604, "auxiliary_loss_mlp": 0.01035839, "balance_loss_clip": 1.02308393, "balance_loss_mlp": 1.03521287, "epoch": 0.7883360889824139, "flos": 21214803646080.0, "grad_norm": 1.8585997342025475, "language_loss": 0.61075795, "learning_rate": 4.2612607058628413e-07, "loss": 0.63225234, "num_input_tokens_seen": 282941955, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 13112, "time_per_iteration": 2.49511456489563 }, { "auxiliary_loss_clip": 0.01121382, "auxiliary_loss_mlp": 0.01032155, "balance_loss_clip": 1.01908386, "balance_loss_mlp": 1.03434753, "epoch": 0.7883962122350819, "flos": 21142551438720.0, "grad_norm": 1.8933825714191141, "language_loss": 0.67319751, "learning_rate": 4.258929696770226e-07, "loss": 0.6947329, "num_input_tokens_seen": 282961280, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 13113, "time_per_iteration": 2.5331828594207764 }, { "auxiliary_loss_clip": 0.01130992, "auxiliary_loss_mlp": 0.01026283, "balance_loss_clip": 1.01459432, "balance_loss_mlp": 1.03621221, "epoch": 0.7884563354877498, "flos": 15302901928320.0, "grad_norm": 2.1665184149692447, "language_loss": 0.573771, "learning_rate": 4.2565992494359127e-07, "loss": 0.59534371, "num_input_tokens_seen": 282978210, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 13114, "time_per_iteration": 5.578385829925537 }, { "auxiliary_loss_clip": 0.01141524, "auxiliary_loss_mlp": 0.01030344, "balance_loss_clip": 1.01746345, "balance_loss_mlp": 1.03512073, "epoch": 0.7885164587404179, "flos": 24388301226240.0, "grad_norm": 1.6309991631218927, "language_loss": 0.66650724, "learning_rate": 4.254269363943086e-07, "loss": 0.68822598, "num_input_tokens_seen": 282998845, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 13115, "time_per_iteration": 2.601189136505127 }, { "auxiliary_loss_clip": 0.01120671, "auxiliary_loss_mlp": 0.01028278, "balance_loss_clip": 1.01590967, "balance_loss_mlp": 1.03426659, "epoch": 0.7885765819930858, "flos": 14385886686720.0, "grad_norm": 1.891894506383719, "language_loss": 0.88874388, "learning_rate": 4.2519400403748796e-07, "loss": 0.91023338, "num_input_tokens_seen": 283015200, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 13116, "time_per_iteration": 2.524256944656372 }, { "auxiliary_loss_clip": 0.01129173, "auxiliary_loss_mlp": 0.01035423, "balance_loss_clip": 1.02167273, "balance_loss_mlp": 1.0359621, "epoch": 0.7886367052457538, "flos": 18259930195200.0, "grad_norm": 1.8868522925706515, "language_loss": 0.7246266, "learning_rate": 4.2496112788144157e-07, "loss": 0.74627256, "num_input_tokens_seen": 283033680, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75390625, "step": 13117, "time_per_iteration": 2.467620849609375 }, { "auxiliary_loss_clip": 0.01111199, "auxiliary_loss_mlp": 0.01025165, "balance_loss_clip": 1.01370287, "balance_loss_mlp": 1.03509498, "epoch": 0.7886968284984217, "flos": 15305092657920.0, "grad_norm": 1.5571937485623597, "language_loss": 0.80092502, "learning_rate": 4.2472830793448234e-07, "loss": 0.82228863, "num_input_tokens_seen": 283050620, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.67578125, "step": 13118, "time_per_iteration": 2.47375750541687 }, { "auxiliary_loss_clip": 0.01112068, "auxiliary_loss_mlp": 0.0102734, "balance_loss_clip": 1.01512146, "balance_loss_mlp": 1.03505731, "epoch": 0.7887569517510897, "flos": 21215450090880.0, "grad_norm": 1.7944636998468926, "language_loss": 0.72910833, "learning_rate": 4.244955442049165e-07, "loss": 0.75050247, "num_input_tokens_seen": 283070215, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 13119, "time_per_iteration": 2.553250312805176 }, { "auxiliary_loss_clip": 0.01113956, "auxiliary_loss_mlp": 0.01026568, "balance_loss_clip": 1.01435518, "balance_loss_mlp": 1.03454614, "epoch": 0.7888170750037578, "flos": 22711237176960.0, "grad_norm": 1.4210016730531179, "language_loss": 0.71816933, "learning_rate": 4.242628367010528e-07, "loss": 0.73957455, "num_input_tokens_seen": 283091485, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 13120, "time_per_iteration": 2.5489156246185303 }, { "auxiliary_loss_clip": 0.01122003, "auxiliary_loss_mlp": 0.0127793, "balance_loss_clip": 1.01863217, "balance_loss_mlp": 1.03527188, "epoch": 0.7888771982564257, "flos": 36429148592640.0, "grad_norm": 1.5867982161482328, "language_loss": 0.78982008, "learning_rate": 4.240301854311943e-07, "loss": 0.81381947, "num_input_tokens_seen": 283115040, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 13121, "time_per_iteration": 2.662452220916748 }, { "auxiliary_loss_clip": 0.01110788, "auxiliary_loss_mlp": 0.01028453, "balance_loss_clip": 1.01616216, "balance_loss_mlp": 1.03445184, "epoch": 0.7889373215090937, "flos": 27309993488640.0, "grad_norm": 1.4778794468328282, "language_loss": 0.80398905, "learning_rate": 4.2379759040364594e-07, "loss": 0.82538146, "num_input_tokens_seen": 283136925, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 13122, "time_per_iteration": 2.597099542617798 }, { "auxiliary_loss_clip": 0.01121904, "auxiliary_loss_mlp": 0.01024964, "balance_loss_clip": 1.01324618, "balance_loss_mlp": 1.03392506, "epoch": 0.7889974447617616, "flos": 19829010983040.0, "grad_norm": 1.9771833815927657, "language_loss": 0.78299797, "learning_rate": 4.235650516267058e-07, "loss": 0.80446666, "num_input_tokens_seen": 283155725, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69921875, "step": 13123, "time_per_iteration": 2.5268754959106445 }, { "auxiliary_loss_clip": 0.01134859, "auxiliary_loss_mlp": 0.01036221, "balance_loss_clip": 1.02371562, "balance_loss_mlp": 1.03602517, "epoch": 0.7890575680144296, "flos": 17271201450240.0, "grad_norm": 2.0679591038356917, "language_loss": 0.67388785, "learning_rate": 4.2333256910867467e-07, "loss": 0.69559872, "num_input_tokens_seen": 283173845, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 13124, "time_per_iteration": 2.5200698375701904 }, { "auxiliary_loss_clip": 0.01126049, "auxiliary_loss_mlp": 0.01027861, "balance_loss_clip": 1.01506972, "balance_loss_mlp": 1.03674698, "epoch": 0.7891176912670975, "flos": 27600151553280.0, "grad_norm": 2.1291894691562443, "language_loss": 0.72545886, "learning_rate": 4.2310014285784824e-07, "loss": 0.74699801, "num_input_tokens_seen": 283191985, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 13125, "time_per_iteration": 2.5760977268218994 }, { "auxiliary_loss_clip": 0.0112136, "auxiliary_loss_mlp": 0.01028799, "balance_loss_clip": 1.0159483, "balance_loss_mlp": 1.03178895, "epoch": 0.7891778145197655, "flos": 22711668140160.0, "grad_norm": 1.9399340943918006, "language_loss": 0.72464323, "learning_rate": 4.228677728825216e-07, "loss": 0.74614489, "num_input_tokens_seen": 283210855, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 13126, "time_per_iteration": 2.5192434787750244 }, { "auxiliary_loss_clip": 0.011238, "auxiliary_loss_mlp": 0.01028666, "balance_loss_clip": 1.01600611, "balance_loss_mlp": 1.03516924, "epoch": 0.7892379377724335, "flos": 17310775259520.0, "grad_norm": 2.0531721590307086, "language_loss": 0.76614439, "learning_rate": 4.2263545919098663e-07, "loss": 0.787669, "num_input_tokens_seen": 283229665, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 13127, "time_per_iteration": 2.497460126876831 }, { "auxiliary_loss_clip": 0.01111189, "auxiliary_loss_mlp": 0.01032944, "balance_loss_clip": 1.02101135, "balance_loss_mlp": 1.03573298, "epoch": 0.7892980610251015, "flos": 25775674087680.0, "grad_norm": 1.596034111778072, "language_loss": 0.85882485, "learning_rate": 4.2240320179153576e-07, "loss": 0.88026619, "num_input_tokens_seen": 283248615, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6640625, "step": 13128, "time_per_iteration": 2.546785593032837 }, { "auxiliary_loss_clip": 0.0111166, "auxiliary_loss_mlp": 0.01036085, "balance_loss_clip": 1.02458096, "balance_loss_mlp": 1.03452909, "epoch": 0.7893581842777694, "flos": 21579943351680.0, "grad_norm": 2.005200665282209, "language_loss": 0.68680334, "learning_rate": 4.221710006924557e-07, "loss": 0.7082808, "num_input_tokens_seen": 283267135, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.68359375, "step": 13129, "time_per_iteration": 2.7425153255462646 }, { "auxiliary_loss_clip": 0.0103318, "auxiliary_loss_mlp": 0.01001849, "balance_loss_clip": 1.00069845, "balance_loss_mlp": 1.00248885, "epoch": 0.7894183075304374, "flos": 69016468176000.0, "grad_norm": 0.7161906501564775, "language_loss": 0.61567688, "learning_rate": 4.2193885590203424e-07, "loss": 0.63602716, "num_input_tokens_seen": 283328940, "router_z_loss_clip": 0.01147461, "router_z_loss_mlp": 0.21582031, "step": 13130, "time_per_iteration": 3.196228504180908 }, { "auxiliary_loss_clip": 0.01116166, "auxiliary_loss_mlp": 0.01029071, "balance_loss_clip": 1.01718616, "balance_loss_mlp": 1.03798223, "epoch": 0.7894784307831053, "flos": 24243258107520.0, "grad_norm": 1.522015346781823, "language_loss": 0.7386983, "learning_rate": 4.217067674285557e-07, "loss": 0.76015067, "num_input_tokens_seen": 283350000, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 13131, "time_per_iteration": 2.572916269302368 }, { "auxiliary_loss_clip": 0.01131118, "auxiliary_loss_mlp": 0.01028819, "balance_loss_clip": 1.01456761, "balance_loss_mlp": 1.035851, "epoch": 0.7895385540357733, "flos": 20266546550400.0, "grad_norm": 2.429474687241914, "language_loss": 0.6912365, "learning_rate": 4.2147473528030295e-07, "loss": 0.71283585, "num_input_tokens_seen": 283368020, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 13132, "time_per_iteration": 2.533621311187744 }, { "auxiliary_loss_clip": 0.01115876, "auxiliary_loss_mlp": 0.01036823, "balance_loss_clip": 1.02363861, "balance_loss_mlp": 1.03611529, "epoch": 0.7895986772884414, "flos": 20996574566400.0, "grad_norm": 1.9432298574178168, "language_loss": 0.61935323, "learning_rate": 4.2124275946555655e-07, "loss": 0.64088023, "num_input_tokens_seen": 283387030, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 13133, "time_per_iteration": 2.5237696170806885 }, { "auxiliary_loss_clip": 0.01137237, "auxiliary_loss_mlp": 0.01036478, "balance_loss_clip": 1.02243495, "balance_loss_mlp": 1.03709555, "epoch": 0.7896588005411093, "flos": 18657999694080.0, "grad_norm": 2.220833702475338, "language_loss": 0.80185664, "learning_rate": 4.2101083999259424e-07, "loss": 0.82359374, "num_input_tokens_seen": 283402090, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73828125, "step": 13134, "time_per_iteration": 2.4906930923461914 }, { "auxiliary_loss_clip": 0.01114572, "auxiliary_loss_mlp": 0.01031751, "balance_loss_clip": 1.01944232, "balance_loss_mlp": 1.03555942, "epoch": 0.7897189237937773, "flos": 18405907067520.0, "grad_norm": 1.9678105869151243, "language_loss": 0.80091888, "learning_rate": 4.2077897686969455e-07, "loss": 0.82238209, "num_input_tokens_seen": 283421035, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 13135, "time_per_iteration": 2.5087428092956543 }, { "auxiliary_loss_clip": 0.01111509, "auxiliary_loss_mlp": 0.01031731, "balance_loss_clip": 1.01887465, "balance_loss_mlp": 1.03404713, "epoch": 0.7897790470464452, "flos": 23731602825600.0, "grad_norm": 1.5928473067894355, "language_loss": 0.72763336, "learning_rate": 4.2054717010512997e-07, "loss": 0.74906576, "num_input_tokens_seen": 283441830, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 13136, "time_per_iteration": 2.5425338745117188 }, { "auxiliary_loss_clip": 0.01111698, "auxiliary_loss_mlp": 0.01034272, "balance_loss_clip": 1.02108777, "balance_loss_mlp": 1.03646564, "epoch": 0.7898391702991132, "flos": 15918949111680.0, "grad_norm": 9.45700754436597, "language_loss": 0.71240628, "learning_rate": 4.203154197071745e-07, "loss": 0.73386598, "num_input_tokens_seen": 283459540, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75, "step": 13137, "time_per_iteration": 2.502636671066284 }, { "auxiliary_loss_clip": 0.01103227, "auxiliary_loss_mlp": 0.0103588, "balance_loss_clip": 1.02210522, "balance_loss_mlp": 1.03456378, "epoch": 0.7898992935517811, "flos": 19829046896640.0, "grad_norm": 1.7542763871585423, "language_loss": 0.73810506, "learning_rate": 4.200837256840981e-07, "loss": 0.75949615, "num_input_tokens_seen": 283478790, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.6875, "step": 13138, "time_per_iteration": 2.457023859024048 }, { "auxiliary_loss_clip": 0.01130861, "auxiliary_loss_mlp": 0.01034893, "balance_loss_clip": 1.02283454, "balance_loss_mlp": 1.03518414, "epoch": 0.7899594168044491, "flos": 16216253982720.0, "grad_norm": 2.123108994019264, "language_loss": 0.68574888, "learning_rate": 4.1985208804416985e-07, "loss": 0.7074064, "num_input_tokens_seen": 283495720, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 13139, "time_per_iteration": 2.4977824687957764 }, { "auxiliary_loss_clip": 0.01033639, "auxiliary_loss_mlp": 0.01002317, "balance_loss_clip": 1.00112522, "balance_loss_mlp": 1.00306821, "epoch": 0.790019540057117, "flos": 68331005959680.0, "grad_norm": 0.8494009779712153, "language_loss": 0.60218924, "learning_rate": 4.196205067956551e-07, "loss": 0.62254882, "num_input_tokens_seen": 283558795, "router_z_loss_clip": 0.01190186, "router_z_loss_mlp": 0.21679688, "step": 13140, "time_per_iteration": 3.1860265731811523 }, { "auxiliary_loss_clip": 0.01106498, "auxiliary_loss_mlp": 0.01036817, "balance_loss_clip": 1.02414501, "balance_loss_mlp": 1.03672099, "epoch": 0.7900796633097851, "flos": 30332773601280.0, "grad_norm": 1.7618692278762387, "language_loss": 0.76056308, "learning_rate": 4.1938898194681995e-07, "loss": 0.78199625, "num_input_tokens_seen": 283579305, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 13141, "time_per_iteration": 2.6074142456054688 }, { "auxiliary_loss_clip": 0.0111582, "auxiliary_loss_mlp": 0.01030576, "balance_loss_clip": 1.01776731, "balance_loss_mlp": 1.03553557, "epoch": 0.790139786562453, "flos": 22126790983680.0, "grad_norm": 2.117223448724954, "language_loss": 0.68550891, "learning_rate": 4.191575135059262e-07, "loss": 0.7069729, "num_input_tokens_seen": 283597840, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 13142, "time_per_iteration": 4.032540559768677 }, { "auxiliary_loss_clip": 0.0106931, "auxiliary_loss_mlp": 0.01004236, "balance_loss_clip": 1.00299668, "balance_loss_mlp": 1.00285411, "epoch": 0.790199909815121, "flos": 58207284213120.0, "grad_norm": 0.8303792758381784, "language_loss": 0.60063517, "learning_rate": 4.189261014812344e-07, "loss": 0.62137061, "num_input_tokens_seen": 283647950, "router_z_loss_clip": 0.01239014, "router_z_loss_mlp": 0.21582031, "step": 13143, "time_per_iteration": 2.940781593322754 }, { "auxiliary_loss_clip": 0.01133262, "auxiliary_loss_mlp": 0.01036257, "balance_loss_clip": 1.02344823, "balance_loss_mlp": 1.03594637, "epoch": 0.7902600330677889, "flos": 34533316759680.0, "grad_norm": 1.6502594145392426, "language_loss": 0.74309528, "learning_rate": 4.186947458810024e-07, "loss": 0.76479048, "num_input_tokens_seen": 283670645, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 13144, "time_per_iteration": 2.617405652999878 }, { "auxiliary_loss_clip": 0.01116675, "auxiliary_loss_mlp": 0.01033406, "balance_loss_clip": 1.0195899, "balance_loss_mlp": 1.03581405, "epoch": 0.7903201563204569, "flos": 22346384780160.0, "grad_norm": 1.9050129220306025, "language_loss": 0.82850933, "learning_rate": 4.184634467134884e-07, "loss": 0.85001016, "num_input_tokens_seen": 283688830, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.72265625, "step": 13145, "time_per_iteration": 2.5029289722442627 }, { "auxiliary_loss_clip": 0.01108442, "auxiliary_loss_mlp": 0.01031906, "balance_loss_clip": 1.02036047, "balance_loss_mlp": 1.03297186, "epoch": 0.790380279573125, "flos": 22053533195520.0, "grad_norm": 1.7575061349905865, "language_loss": 0.73215765, "learning_rate": 4.1823220398694527e-07, "loss": 0.75356114, "num_input_tokens_seen": 283708625, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 13146, "time_per_iteration": 2.5377352237701416 }, { "auxiliary_loss_clip": 0.01105269, "auxiliary_loss_mlp": 0.01027746, "balance_loss_clip": 1.01556253, "balance_loss_mlp": 1.03365541, "epoch": 0.7904404028257929, "flos": 20302600826880.0, "grad_norm": 5.49544741950973, "language_loss": 0.75975376, "learning_rate": 4.180010177096256e-07, "loss": 0.78108394, "num_input_tokens_seen": 283725710, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.71875, "step": 13147, "time_per_iteration": 2.4882428646087646 }, { "auxiliary_loss_clip": 0.01112076, "auxiliary_loss_mlp": 0.01035885, "balance_loss_clip": 1.02234864, "balance_loss_mlp": 1.03360271, "epoch": 0.7905005260784609, "flos": 20008923229440.0, "grad_norm": 1.7570374800681539, "language_loss": 0.72142011, "learning_rate": 4.177698878897806e-07, "loss": 0.74289972, "num_input_tokens_seen": 283744150, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6953125, "step": 13148, "time_per_iteration": 3.914022922515869 }, { "auxiliary_loss_clip": 0.01113416, "auxiliary_loss_mlp": 0.01030458, "balance_loss_clip": 1.01772606, "balance_loss_mlp": 1.03401506, "epoch": 0.7905606493311288, "flos": 26905926418560.0, "grad_norm": 1.8019269397070314, "language_loss": 0.71705103, "learning_rate": 4.175388145356584e-07, "loss": 0.73848975, "num_input_tokens_seen": 283764170, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 13149, "time_per_iteration": 2.5864920616149902 }, { "auxiliary_loss_clip": 0.0113486, "auxiliary_loss_mlp": 0.01029455, "balance_loss_clip": 1.01518583, "balance_loss_mlp": 1.03578091, "epoch": 0.7906207725837968, "flos": 23696230907520.0, "grad_norm": 2.794099303241026, "language_loss": 0.6537649, "learning_rate": 4.1730779765550527e-07, "loss": 0.67540801, "num_input_tokens_seen": 283784305, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.71875, "step": 13150, "time_per_iteration": 2.606760263442993 }, { "auxiliary_loss_clip": 0.01103215, "auxiliary_loss_mlp": 0.01029789, "balance_loss_clip": 1.01775527, "balance_loss_mlp": 1.03397703, "epoch": 0.7906808958364647, "flos": 20848837927680.0, "grad_norm": 1.8532725504962675, "language_loss": 0.70298016, "learning_rate": 4.17076837257565e-07, "loss": 0.72431022, "num_input_tokens_seen": 283804040, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 13151, "time_per_iteration": 2.510530948638916 }, { "auxiliary_loss_clip": 0.01125651, "auxiliary_loss_mlp": 0.01033774, "balance_loss_clip": 1.02073288, "balance_loss_mlp": 1.03663683, "epoch": 0.7907410190891327, "flos": 40735196974080.0, "grad_norm": 1.3329280611559464, "language_loss": 0.7011134, "learning_rate": 4.16845933350082e-07, "loss": 0.72270763, "num_input_tokens_seen": 283827120, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 13152, "time_per_iteration": 2.661972761154175 }, { "auxiliary_loss_clip": 0.01129023, "auxiliary_loss_mlp": 0.01277013, "balance_loss_clip": 1.01763928, "balance_loss_mlp": 1.03341746, "epoch": 0.7908011423418007, "flos": 13261165050240.0, "grad_norm": 2.7434767917547416, "language_loss": 0.73159236, "learning_rate": 4.16615085941294e-07, "loss": 0.75565273, "num_input_tokens_seen": 283844820, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 13153, "time_per_iteration": 2.532252550125122 }, { "auxiliary_loss_clip": 0.01103561, "auxiliary_loss_mlp": 0.010297, "balance_loss_clip": 1.01706398, "balance_loss_mlp": 1.03354883, "epoch": 0.7908612655944687, "flos": 19754747614080.0, "grad_norm": 1.460746821512939, "language_loss": 0.78734016, "learning_rate": 4.163842950394414e-07, "loss": 0.80867279, "num_input_tokens_seen": 283862870, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 13154, "time_per_iteration": 2.4743049144744873 }, { "auxiliary_loss_clip": 0.0111921, "auxiliary_loss_mlp": 0.0103045, "balance_loss_clip": 1.01660419, "balance_loss_mlp": 1.03516805, "epoch": 0.7909213888471366, "flos": 21287738211840.0, "grad_norm": 1.91184191418818, "language_loss": 0.70253575, "learning_rate": 4.161535606527595e-07, "loss": 0.72403228, "num_input_tokens_seen": 283882405, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 13155, "time_per_iteration": 3.895181179046631 }, { "auxiliary_loss_clip": 0.01106502, "auxiliary_loss_mlp": 0.01028886, "balance_loss_clip": 1.0158627, "balance_loss_mlp": 1.03566372, "epoch": 0.7909815120998046, "flos": 22528882805760.0, "grad_norm": 1.6758333088653394, "language_loss": 0.76986206, "learning_rate": 4.1592288278948294e-07, "loss": 0.79121602, "num_input_tokens_seen": 283902070, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 13156, "time_per_iteration": 4.001264810562134 }, { "auxiliary_loss_clip": 0.01129929, "auxiliary_loss_mlp": 0.01029261, "balance_loss_clip": 1.01670837, "balance_loss_mlp": 1.03396702, "epoch": 0.7910416353524725, "flos": 26727702111360.0, "grad_norm": 1.693819396816372, "language_loss": 0.6553008, "learning_rate": 4.156922614578435e-07, "loss": 0.67689276, "num_input_tokens_seen": 283924100, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 13157, "time_per_iteration": 2.6780271530151367 }, { "auxiliary_loss_clip": 0.011147, "auxiliary_loss_mlp": 0.01036052, "balance_loss_clip": 1.02244473, "balance_loss_mlp": 1.03450298, "epoch": 0.7911017586051405, "flos": 24644847139200.0, "grad_norm": 10.897223315007052, "language_loss": 0.73935425, "learning_rate": 4.1546169666607246e-07, "loss": 0.76086175, "num_input_tokens_seen": 283944955, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 13158, "time_per_iteration": 2.5813148021698 }, { "auxiliary_loss_clip": 0.01110583, "auxiliary_loss_mlp": 0.01033412, "balance_loss_clip": 1.02200949, "balance_loss_mlp": 1.03406191, "epoch": 0.7911618818578086, "flos": 17565489578880.0, "grad_norm": 2.2670061471531455, "language_loss": 0.67546809, "learning_rate": 4.1523118842239756e-07, "loss": 0.69690806, "num_input_tokens_seen": 283963125, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.67578125, "step": 13159, "time_per_iteration": 2.498002052307129 }, { "auxiliary_loss_clip": 0.01111773, "auxiliary_loss_mlp": 0.01031549, "balance_loss_clip": 1.01886463, "balance_loss_mlp": 1.03420138, "epoch": 0.7912220051104765, "flos": 16721660298240.0, "grad_norm": 1.7639931800306987, "language_loss": 0.6734882, "learning_rate": 4.15000736735045e-07, "loss": 0.69492137, "num_input_tokens_seen": 283982850, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 13160, "time_per_iteration": 2.558466911315918 }, { "auxiliary_loss_clip": 0.01115741, "auxiliary_loss_mlp": 0.01026534, "balance_loss_clip": 1.0159483, "balance_loss_mlp": 1.0327574, "epoch": 0.7912821283631445, "flos": 13698736531200.0, "grad_norm": 2.1325210559558845, "language_loss": 0.72754025, "learning_rate": 4.14770341612239e-07, "loss": 0.748963, "num_input_tokens_seen": 283998275, "router_z_loss_clip": 0.10546875, "router_z_loss_mlp": 0.65234375, "step": 13161, "time_per_iteration": 2.622528076171875 }, { "auxiliary_loss_clip": 0.01120733, "auxiliary_loss_mlp": 0.01030153, "balance_loss_clip": 1.0174036, "balance_loss_mlp": 1.0349617, "epoch": 0.7913422516158124, "flos": 23769021818880.0, "grad_norm": 1.9272290933567382, "language_loss": 0.73879027, "learning_rate": 4.1454000306220193e-07, "loss": 0.76029921, "num_input_tokens_seen": 284018750, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.68359375, "step": 13162, "time_per_iteration": 2.629906415939331 }, { "auxiliary_loss_clip": 0.01124719, "auxiliary_loss_mlp": 0.01030749, "balance_loss_clip": 1.01820219, "balance_loss_mlp": 1.03530204, "epoch": 0.7914023748684804, "flos": 19938251220480.0, "grad_norm": 1.7263097829635157, "language_loss": 0.72121531, "learning_rate": 4.1430972109315367e-07, "loss": 0.74276996, "num_input_tokens_seen": 284037850, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 13163, "time_per_iteration": 2.6902031898498535 }, { "auxiliary_loss_clip": 0.01131504, "auxiliary_loss_mlp": 0.0103161, "balance_loss_clip": 1.01830637, "balance_loss_mlp": 1.03537571, "epoch": 0.7914624981211483, "flos": 20594805966720.0, "grad_norm": 1.7595270483214096, "language_loss": 0.69926202, "learning_rate": 4.1407949571331226e-07, "loss": 0.72089314, "num_input_tokens_seen": 284056380, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 13164, "time_per_iteration": 2.651021957397461 }, { "auxiliary_loss_clip": 0.01129763, "auxiliary_loss_mlp": 0.01030971, "balance_loss_clip": 1.01850164, "balance_loss_mlp": 1.03356075, "epoch": 0.7915226213738163, "flos": 21799465320960.0, "grad_norm": 1.8172484858604905, "language_loss": 0.6646775, "learning_rate": 4.1384932693089446e-07, "loss": 0.6862849, "num_input_tokens_seen": 284074945, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 13165, "time_per_iteration": 2.6017496585845947 }, { "auxiliary_loss_clip": 0.01133992, "auxiliary_loss_mlp": 0.01025595, "balance_loss_clip": 1.01431203, "balance_loss_mlp": 1.03205156, "epoch": 0.7915827446264843, "flos": 16288362535680.0, "grad_norm": 1.8495352255580861, "language_loss": 0.72352427, "learning_rate": 4.136192147541142e-07, "loss": 0.74512017, "num_input_tokens_seen": 284092070, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6640625, "step": 13166, "time_per_iteration": 2.7385382652282715 }, { "auxiliary_loss_clip": 0.01103905, "auxiliary_loss_mlp": 0.01034117, "balance_loss_clip": 1.02015781, "balance_loss_mlp": 1.03363836, "epoch": 0.7916428678791523, "flos": 25702595867520.0, "grad_norm": 7.056075195946068, "language_loss": 0.77163219, "learning_rate": 4.1338915919118353e-07, "loss": 0.79301238, "num_input_tokens_seen": 284112255, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.703125, "step": 13167, "time_per_iteration": 2.5942399501800537 }, { "auxiliary_loss_clip": 0.01128752, "auxiliary_loss_mlp": 0.01032894, "balance_loss_clip": 1.02096152, "balance_loss_mlp": 1.03420234, "epoch": 0.7917029911318202, "flos": 23878513451520.0, "grad_norm": 1.6424085403754944, "language_loss": 0.84022295, "learning_rate": 4.1315916025031216e-07, "loss": 0.86183941, "num_input_tokens_seen": 284132330, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 13168, "time_per_iteration": 2.5359206199645996 }, { "auxiliary_loss_clip": 0.01105219, "auxiliary_loss_mlp": 0.01030749, "balance_loss_clip": 1.01799989, "balance_loss_mlp": 1.03505754, "epoch": 0.7917631143844882, "flos": 21646593037440.0, "grad_norm": 1.5314581967484675, "language_loss": 0.72823727, "learning_rate": 4.1292921793970947e-07, "loss": 0.74959695, "num_input_tokens_seen": 284150640, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 13169, "time_per_iteration": 2.4909777641296387 }, { "auxiliary_loss_clip": 0.01106198, "auxiliary_loss_mlp": 0.01032585, "balance_loss_clip": 1.01929355, "balance_loss_mlp": 1.03406572, "epoch": 0.7918232376371561, "flos": 38874198355200.0, "grad_norm": 2.7878866531883766, "language_loss": 0.67138565, "learning_rate": 4.1269933226757934e-07, "loss": 0.69277352, "num_input_tokens_seen": 284171910, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 13170, "time_per_iteration": 2.6259679794311523 }, { "auxiliary_loss_clip": 0.01112144, "auxiliary_loss_mlp": 0.0102473, "balance_loss_clip": 1.01301193, "balance_loss_mlp": 1.03342927, "epoch": 0.7918833608898241, "flos": 20775544225920.0, "grad_norm": 1.8367666218427292, "language_loss": 0.7096194, "learning_rate": 4.124695032421277e-07, "loss": 0.73098814, "num_input_tokens_seen": 284191340, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69921875, "step": 13171, "time_per_iteration": 2.514719009399414 }, { "auxiliary_loss_clip": 0.01113144, "auxiliary_loss_mlp": 0.01028674, "balance_loss_clip": 1.01638401, "balance_loss_mlp": 1.03481007, "epoch": 0.7919434841424922, "flos": 33910122769920.0, "grad_norm": 1.8701090721130686, "language_loss": 0.66965485, "learning_rate": 4.1223973087155594e-07, "loss": 0.69107306, "num_input_tokens_seen": 284212495, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 13172, "time_per_iteration": 2.605703592300415 }, { "auxiliary_loss_clip": 0.01123455, "auxiliary_loss_mlp": 0.01033792, "balance_loss_clip": 1.02087533, "balance_loss_mlp": 1.03572106, "epoch": 0.7920036073951601, "flos": 21064660796160.0, "grad_norm": 1.6334422611426243, "language_loss": 0.79117334, "learning_rate": 4.1201001516406377e-07, "loss": 0.81274581, "num_input_tokens_seen": 284230825, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 13173, "time_per_iteration": 2.5510003566741943 }, { "auxiliary_loss_clip": 0.01131231, "auxiliary_loss_mlp": 0.0102951, "balance_loss_clip": 1.01791108, "balance_loss_mlp": 1.03479719, "epoch": 0.7920637306478281, "flos": 23655974739840.0, "grad_norm": 1.8651011391411747, "language_loss": 0.76933503, "learning_rate": 4.11780356127849e-07, "loss": 0.79094243, "num_input_tokens_seen": 284250365, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 13174, "time_per_iteration": 2.597010374069214 }, { "auxiliary_loss_clip": 0.01119488, "auxiliary_loss_mlp": 0.01035186, "balance_loss_clip": 1.02309191, "balance_loss_mlp": 1.03405094, "epoch": 0.792123853900496, "flos": 27195438038400.0, "grad_norm": 2.077315365688508, "language_loss": 0.71322978, "learning_rate": 4.115507537711085e-07, "loss": 0.7347765, "num_input_tokens_seen": 284269635, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.67578125, "step": 13175, "time_per_iteration": 2.6012282371520996 }, { "auxiliary_loss_clip": 0.01112353, "auxiliary_loss_mlp": 0.01031611, "balance_loss_clip": 1.01855707, "balance_loss_mlp": 1.03409386, "epoch": 0.792183977153164, "flos": 19098659744640.0, "grad_norm": 1.8288092513932768, "language_loss": 0.59490728, "learning_rate": 4.1132120810203607e-07, "loss": 0.6163469, "num_input_tokens_seen": 284288380, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 13176, "time_per_iteration": 2.576306104660034 }, { "auxiliary_loss_clip": 0.0110682, "auxiliary_loss_mlp": 0.0103375, "balance_loss_clip": 1.02128029, "balance_loss_mlp": 1.0378238, "epoch": 0.7922441004058319, "flos": 17128851851520.0, "grad_norm": 1.884747758972978, "language_loss": 0.73455048, "learning_rate": 4.110917191288219e-07, "loss": 0.75595617, "num_input_tokens_seen": 284306920, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 13177, "time_per_iteration": 2.564952850341797 }, { "auxiliary_loss_clip": 0.01112365, "auxiliary_loss_mlp": 0.01031782, "balance_loss_clip": 1.0195812, "balance_loss_mlp": 1.03592277, "epoch": 0.7923042236585, "flos": 17821640442240.0, "grad_norm": 2.154766269273262, "language_loss": 0.63961512, "learning_rate": 4.1086228685965786e-07, "loss": 0.66105664, "num_input_tokens_seen": 284324700, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 13178, "time_per_iteration": 2.5563273429870605 }, { "auxiliary_loss_clip": 0.01118997, "auxiliary_loss_mlp": 0.01029392, "balance_loss_clip": 1.01843619, "balance_loss_mlp": 1.03278482, "epoch": 0.7923643469111679, "flos": 29935206892800.0, "grad_norm": 1.4898158382683386, "language_loss": 0.68591464, "learning_rate": 4.1063291130273115e-07, "loss": 0.70739853, "num_input_tokens_seen": 284345985, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6875, "step": 13179, "time_per_iteration": 2.646812677383423 }, { "auxiliary_loss_clip": 0.01102248, "auxiliary_loss_mlp": 0.01029573, "balance_loss_clip": 1.01681209, "balance_loss_mlp": 1.03400421, "epoch": 0.7924244701638359, "flos": 22674716023680.0, "grad_norm": 3.0538108529715444, "language_loss": 0.73939663, "learning_rate": 4.1040359246622724e-07, "loss": 0.76071483, "num_input_tokens_seen": 284364475, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.68359375, "step": 13180, "time_per_iteration": 2.57356858253479 }, { "auxiliary_loss_clip": 0.01116525, "auxiliary_loss_mlp": 0.01036088, "balance_loss_clip": 1.02233696, "balance_loss_mlp": 1.03556776, "epoch": 0.7924845934165038, "flos": 17968156018560.0, "grad_norm": 2.1656058592158423, "language_loss": 0.81718177, "learning_rate": 4.1017433035832983e-07, "loss": 0.83870792, "num_input_tokens_seen": 284382125, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 13181, "time_per_iteration": 2.5158252716064453 }, { "auxiliary_loss_clip": 0.0111975, "auxiliary_loss_mlp": 0.01034344, "balance_loss_clip": 1.02158272, "balance_loss_mlp": 1.03351855, "epoch": 0.7925447166691718, "flos": 23476960333440.0, "grad_norm": 2.002786116486202, "language_loss": 0.78084928, "learning_rate": 4.099451249872221e-07, "loss": 0.80239028, "num_input_tokens_seen": 284401585, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 13182, "time_per_iteration": 2.549844980239868 }, { "auxiliary_loss_clip": 0.01117512, "auxiliary_loss_mlp": 0.01035938, "balance_loss_clip": 1.02228284, "balance_loss_mlp": 1.03535259, "epoch": 0.7926048399218397, "flos": 20447572118400.0, "grad_norm": 2.204385403213722, "language_loss": 0.73924685, "learning_rate": 4.097159763610816e-07, "loss": 0.76078141, "num_input_tokens_seen": 284419125, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 13183, "time_per_iteration": 3.9614272117614746 }, { "auxiliary_loss_clip": 0.01121305, "auxiliary_loss_mlp": 0.01026525, "balance_loss_clip": 1.01420498, "balance_loss_mlp": 1.03410065, "epoch": 0.7926649631745077, "flos": 37160038535040.0, "grad_norm": 1.6560381772116177, "language_loss": 0.68288815, "learning_rate": 4.0948688448808767e-07, "loss": 0.70436645, "num_input_tokens_seen": 284440445, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 13184, "time_per_iteration": 2.6976089477539062 }, { "auxiliary_loss_clip": 0.01133651, "auxiliary_loss_mlp": 0.01037078, "balance_loss_clip": 1.02432895, "balance_loss_mlp": 1.03470051, "epoch": 0.7927250864271758, "flos": 17018606033280.0, "grad_norm": 1.8893223141981137, "language_loss": 0.70304978, "learning_rate": 4.092578493764152e-07, "loss": 0.72475713, "num_input_tokens_seen": 284459370, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 13185, "time_per_iteration": 2.6058173179626465 }, { "auxiliary_loss_clip": 0.0111557, "auxiliary_loss_mlp": 0.01031929, "balance_loss_clip": 1.01896548, "balance_loss_mlp": 1.03465044, "epoch": 0.7927852096798437, "flos": 17749208666880.0, "grad_norm": 2.123645435321438, "language_loss": 0.64821994, "learning_rate": 4.090288710342391e-07, "loss": 0.6696949, "num_input_tokens_seen": 284477525, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 13186, "time_per_iteration": 2.591806173324585 }, { "auxiliary_loss_clip": 0.01112958, "auxiliary_loss_mlp": 0.01031726, "balance_loss_clip": 1.0196383, "balance_loss_mlp": 1.03456438, "epoch": 0.7928453329325117, "flos": 23838436851840.0, "grad_norm": 1.6468061391488695, "language_loss": 0.76806909, "learning_rate": 4.087999494697292e-07, "loss": 0.78951591, "num_input_tokens_seen": 284496590, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 13187, "time_per_iteration": 2.5737552642822266 }, { "auxiliary_loss_clip": 0.01120946, "auxiliary_loss_mlp": 0.01028762, "balance_loss_clip": 1.01690054, "balance_loss_mlp": 1.03525972, "epoch": 0.7929054561851796, "flos": 17454920538240.0, "grad_norm": 2.154268462221467, "language_loss": 0.72446096, "learning_rate": 4.085710846910566e-07, "loss": 0.74595809, "num_input_tokens_seen": 284511470, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 13188, "time_per_iteration": 2.590492010116577 }, { "auxiliary_loss_clip": 0.0110626, "auxiliary_loss_mlp": 0.01289313, "balance_loss_clip": 1.02842128, "balance_loss_mlp": 1.03474915, "epoch": 0.7929655794378476, "flos": 21981280988160.0, "grad_norm": 4.572833793350871, "language_loss": 0.63121116, "learning_rate": 4.083422767063882e-07, "loss": 0.6551668, "num_input_tokens_seen": 284531125, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 13189, "time_per_iteration": 2.5358150005340576 }, { "auxiliary_loss_clip": 0.01138711, "auxiliary_loss_mlp": 0.0103216, "balance_loss_clip": 1.02016735, "balance_loss_mlp": 1.03419852, "epoch": 0.7930257026905155, "flos": 17273930883840.0, "grad_norm": 2.222461926236115, "language_loss": 0.72459352, "learning_rate": 4.0811352552388987e-07, "loss": 0.74630219, "num_input_tokens_seen": 284549340, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 13190, "time_per_iteration": 4.025338888168335 }, { "auxiliary_loss_clip": 0.01108624, "auxiliary_loss_mlp": 0.0103075, "balance_loss_clip": 1.02014089, "balance_loss_mlp": 1.03463066, "epoch": 0.7930858259431836, "flos": 27300584125440.0, "grad_norm": 1.8241613757635997, "language_loss": 0.73060429, "learning_rate": 4.078848311517249e-07, "loss": 0.75199801, "num_input_tokens_seen": 284567060, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.6484375, "step": 13191, "time_per_iteration": 2.656789541244507 }, { "auxiliary_loss_clip": 0.01112541, "auxiliary_loss_mlp": 0.01038408, "balance_loss_clip": 1.02543783, "balance_loss_mlp": 1.0344429, "epoch": 0.7931459491958515, "flos": 19863736456320.0, "grad_norm": 1.7150998960767514, "language_loss": 0.69151872, "learning_rate": 4.076561935980545e-07, "loss": 0.71302819, "num_input_tokens_seen": 284586600, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 13192, "time_per_iteration": 2.6653177738189697 }, { "auxiliary_loss_clip": 0.01130256, "auxiliary_loss_mlp": 0.01035262, "balance_loss_clip": 1.02312672, "balance_loss_mlp": 1.03337777, "epoch": 0.7932060724485195, "flos": 23147120718720.0, "grad_norm": 2.0014014582114648, "language_loss": 0.75023174, "learning_rate": 4.0742761287103946e-07, "loss": 0.77188694, "num_input_tokens_seen": 284605715, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 13193, "time_per_iteration": 2.621108055114746 }, { "auxiliary_loss_clip": 0.01114449, "auxiliary_loss_mlp": 0.01034459, "balance_loss_clip": 1.02054763, "balance_loss_mlp": 1.03645515, "epoch": 0.7932661957011874, "flos": 29934847756800.0, "grad_norm": 1.446995465722003, "language_loss": 0.71979147, "learning_rate": 4.0719908897883526e-07, "loss": 0.74128056, "num_input_tokens_seen": 284628540, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.69140625, "step": 13194, "time_per_iteration": 2.739185094833374 }, { "auxiliary_loss_clip": 0.01107374, "auxiliary_loss_mlp": 0.01035529, "balance_loss_clip": 1.02239799, "balance_loss_mlp": 1.03464937, "epoch": 0.7933263189538554, "flos": 22559119079040.0, "grad_norm": 2.3419352564471505, "language_loss": 0.69850314, "learning_rate": 4.06970621929599e-07, "loss": 0.7199322, "num_input_tokens_seen": 284646040, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 13195, "time_per_iteration": 2.6238651275634766 }, { "auxiliary_loss_clip": 0.01139835, "auxiliary_loss_mlp": 0.01029804, "balance_loss_clip": 1.01706004, "balance_loss_mlp": 1.03477621, "epoch": 0.7933864422065233, "flos": 25479051575040.0, "grad_norm": 1.7357265082263325, "language_loss": 0.77685678, "learning_rate": 4.067422117314834e-07, "loss": 0.79855317, "num_input_tokens_seen": 284665110, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 13196, "time_per_iteration": 4.051764011383057 }, { "auxiliary_loss_clip": 0.01132744, "auxiliary_loss_mlp": 0.01033885, "balance_loss_clip": 1.02108765, "balance_loss_mlp": 1.03536355, "epoch": 0.7934465654591913, "flos": 33583156243200.0, "grad_norm": 2.1468854347326296, "language_loss": 0.69301176, "learning_rate": 4.0651385839264e-07, "loss": 0.71467811, "num_input_tokens_seen": 284686515, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 13197, "time_per_iteration": 2.6906168460845947 }, { "auxiliary_loss_clip": 0.01111515, "auxiliary_loss_mlp": 0.01029937, "balance_loss_clip": 1.01825404, "balance_loss_mlp": 1.03430009, "epoch": 0.7935066887118594, "flos": 31432538263680.0, "grad_norm": 2.11203531095924, "language_loss": 0.64971215, "learning_rate": 4.0628556192121753e-07, "loss": 0.67112666, "num_input_tokens_seen": 284707300, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 13198, "time_per_iteration": 4.121496915817261 }, { "auxiliary_loss_clip": 0.01144592, "auxiliary_loss_mlp": 0.01031542, "balance_loss_clip": 1.01785707, "balance_loss_mlp": 1.03632069, "epoch": 0.7935668119645273, "flos": 14682616940160.0, "grad_norm": 2.165752866362289, "language_loss": 0.71969134, "learning_rate": 4.0605732232536494e-07, "loss": 0.74145269, "num_input_tokens_seen": 284723545, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 13199, "time_per_iteration": 2.5803377628326416 }, { "auxiliary_loss_clip": 0.01123318, "auxiliary_loss_mlp": 0.01029996, "balance_loss_clip": 1.01845706, "balance_loss_mlp": 1.0360055, "epoch": 0.7936269352171953, "flos": 18004246208640.0, "grad_norm": 2.6090675339044207, "language_loss": 0.80816483, "learning_rate": 4.058291396132252e-07, "loss": 0.82969803, "num_input_tokens_seen": 284742650, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.69140625, "step": 13200, "time_per_iteration": 2.560743570327759 }, { "auxiliary_loss_clip": 0.0111212, "auxiliary_loss_mlp": 0.01033348, "balance_loss_clip": 1.02132535, "balance_loss_mlp": 1.03670371, "epoch": 0.7936870584698632, "flos": 18880215183360.0, "grad_norm": 1.9675564956984959, "language_loss": 0.77234817, "learning_rate": 4.0560101379294333e-07, "loss": 0.79380274, "num_input_tokens_seen": 284760955, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6640625, "step": 13201, "time_per_iteration": 2.551121234893799 }, { "auxiliary_loss_clip": 0.01101163, "auxiliary_loss_mlp": 0.01030608, "balance_loss_clip": 1.0187763, "balance_loss_mlp": 1.03289104, "epoch": 0.7937471817225312, "flos": 23367001824000.0, "grad_norm": 1.4528921728202775, "language_loss": 0.67219579, "learning_rate": 4.053729448726595e-07, "loss": 0.69351351, "num_input_tokens_seen": 284780745, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 13202, "time_per_iteration": 2.5527522563934326 }, { "auxiliary_loss_clip": 0.01112627, "auxiliary_loss_mlp": 0.0103443, "balance_loss_clip": 1.02163887, "balance_loss_mlp": 1.03445685, "epoch": 0.7938073049751991, "flos": 22674428714880.0, "grad_norm": 3.6232125694679174, "language_loss": 0.74853635, "learning_rate": 4.051449328605145e-07, "loss": 0.77000695, "num_input_tokens_seen": 284799000, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 13203, "time_per_iteration": 2.572502374649048 }, { "auxiliary_loss_clip": 0.01146884, "auxiliary_loss_mlp": 0.01032029, "balance_loss_clip": 1.02017975, "balance_loss_mlp": 1.03377867, "epoch": 0.7938674282278672, "flos": 22851431959680.0, "grad_norm": 1.6787585826312117, "language_loss": 0.66168475, "learning_rate": 4.0491697776464326e-07, "loss": 0.68347389, "num_input_tokens_seen": 284817450, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 13204, "time_per_iteration": 2.6544992923736572 }, { "auxiliary_loss_clip": 0.01145813, "auxiliary_loss_mlp": 0.01030752, "balance_loss_clip": 1.01856303, "balance_loss_mlp": 1.03279841, "epoch": 0.7939275514805351, "flos": 27012509049600.0, "grad_norm": 1.4962987074062903, "language_loss": 0.79196924, "learning_rate": 4.0468907959318257e-07, "loss": 0.81373489, "num_input_tokens_seen": 284838865, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 13205, "time_per_iteration": 2.67082142829895 }, { "auxiliary_loss_clip": 0.0112717, "auxiliary_loss_mlp": 0.01029783, "balance_loss_clip": 1.0170995, "balance_loss_mlp": 1.03316307, "epoch": 0.7939876747332031, "flos": 21142838747520.0, "grad_norm": 2.162838538035924, "language_loss": 0.77654201, "learning_rate": 4.044612383542656e-07, "loss": 0.79811156, "num_input_tokens_seen": 284857975, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.671875, "step": 13206, "time_per_iteration": 2.6100409030914307 }, { "auxiliary_loss_clip": 0.01125989, "auxiliary_loss_mlp": 0.01031992, "balance_loss_clip": 1.01856303, "balance_loss_mlp": 1.03562498, "epoch": 0.794047797985871, "flos": 23289075267840.0, "grad_norm": 2.448148239869186, "language_loss": 0.79048139, "learning_rate": 4.042334540560217e-07, "loss": 0.81206119, "num_input_tokens_seen": 284877145, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 13207, "time_per_iteration": 2.6565072536468506 }, { "auxiliary_loss_clip": 0.01129248, "auxiliary_loss_mlp": 0.01035327, "balance_loss_clip": 1.02275014, "balance_loss_mlp": 1.03398514, "epoch": 0.794107921238539, "flos": 24608074590720.0, "grad_norm": 1.7577539817427847, "language_loss": 0.83902311, "learning_rate": 4.0400572670658174e-07, "loss": 0.8606689, "num_input_tokens_seen": 284895560, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 13208, "time_per_iteration": 2.665492296218872 }, { "auxiliary_loss_clip": 0.01059951, "auxiliary_loss_mlp": 0.01003593, "balance_loss_clip": 1.00252652, "balance_loss_mlp": 1.00260019, "epoch": 0.7941680444912069, "flos": 64093690252800.0, "grad_norm": 0.7243914555041715, "language_loss": 0.58374631, "learning_rate": 4.0377805631407116e-07, "loss": 0.60438174, "num_input_tokens_seen": 284963135, "router_z_loss_clip": 0.01068115, "router_z_loss_mlp": 0.21484375, "step": 13209, "time_per_iteration": 3.347954273223877 }, { "auxiliary_loss_clip": 0.01136446, "auxiliary_loss_mlp": 0.01030429, "balance_loss_clip": 1.01760769, "balance_loss_mlp": 1.03750134, "epoch": 0.794228167743875, "flos": 24388839930240.0, "grad_norm": 2.2357267226903006, "language_loss": 0.63187277, "learning_rate": 4.03550442886617e-07, "loss": 0.65354156, "num_input_tokens_seen": 284981755, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 13210, "time_per_iteration": 2.6034317016601562 }, { "auxiliary_loss_clip": 0.0111334, "auxiliary_loss_mlp": 0.01037757, "balance_loss_clip": 1.0255022, "balance_loss_mlp": 1.03375745, "epoch": 0.794288290996543, "flos": 28512498026880.0, "grad_norm": 2.9718697827811003, "language_loss": 0.68841398, "learning_rate": 4.0332288643233994e-07, "loss": 0.70992494, "num_input_tokens_seen": 285003060, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 13211, "time_per_iteration": 2.7094521522521973 }, { "auxiliary_loss_clip": 0.01116321, "auxiliary_loss_mlp": 0.01038139, "balance_loss_clip": 1.02491832, "balance_loss_mlp": 1.03547072, "epoch": 0.7943484142492109, "flos": 25922117836800.0, "grad_norm": 1.804314734645382, "language_loss": 0.72600234, "learning_rate": 4.0309538695936227e-07, "loss": 0.74754691, "num_input_tokens_seen": 285021640, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 13212, "time_per_iteration": 2.6208655834198 }, { "auxiliary_loss_clip": 0.01110187, "auxiliary_loss_mlp": 0.01027397, "balance_loss_clip": 1.01560736, "balance_loss_mlp": 1.03497052, "epoch": 0.7944085375018789, "flos": 23915286000000.0, "grad_norm": 2.0417687944672314, "language_loss": 0.80685592, "learning_rate": 4.0286794447580277e-07, "loss": 0.82823169, "num_input_tokens_seen": 285040490, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 13213, "time_per_iteration": 2.653231382369995 }, { "auxiliary_loss_clip": 0.01117737, "auxiliary_loss_mlp": 0.01031966, "balance_loss_clip": 1.0187043, "balance_loss_mlp": 1.03651237, "epoch": 0.7944686607545468, "flos": 20229953569920.0, "grad_norm": 2.184094494735635, "language_loss": 0.68330175, "learning_rate": 4.026405589897779e-07, "loss": 0.70479876, "num_input_tokens_seen": 285059270, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 13214, "time_per_iteration": 2.537762403488159 }, { "auxiliary_loss_clip": 0.01100609, "auxiliary_loss_mlp": 0.01028164, "balance_loss_clip": 1.01635587, "balance_loss_mlp": 1.03348744, "epoch": 0.7945287840072148, "flos": 21980993679360.0, "grad_norm": 1.8091555226570466, "language_loss": 0.72539842, "learning_rate": 4.024132305094021e-07, "loss": 0.7466861, "num_input_tokens_seen": 285075390, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 13215, "time_per_iteration": 2.5655629634857178 }, { "auxiliary_loss_clip": 0.01139733, "auxiliary_loss_mlp": 0.01026951, "balance_loss_clip": 1.01509571, "balance_loss_mlp": 1.03664625, "epoch": 0.7945889072598827, "flos": 26397718842240.0, "grad_norm": 1.8382201315445135, "language_loss": 0.7908271, "learning_rate": 4.021859590427896e-07, "loss": 0.81249392, "num_input_tokens_seen": 285096290, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 13216, "time_per_iteration": 2.6145784854888916 }, { "auxiliary_loss_clip": 0.01114511, "auxiliary_loss_mlp": 0.01033338, "balance_loss_clip": 1.02130425, "balance_loss_mlp": 1.03554821, "epoch": 0.7946490305125508, "flos": 25810255906560.0, "grad_norm": 2.7608580590016767, "language_loss": 0.73667705, "learning_rate": 4.0195874459804923e-07, "loss": 0.75815552, "num_input_tokens_seen": 285116020, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69921875, "step": 13217, "time_per_iteration": 2.613828182220459 }, { "auxiliary_loss_clip": 0.01118829, "auxiliary_loss_mlp": 0.01033117, "balance_loss_clip": 1.0213275, "balance_loss_mlp": 1.03459644, "epoch": 0.7947091537652187, "flos": 15960965045760.0, "grad_norm": 1.786992865746948, "language_loss": 0.74121201, "learning_rate": 4.017315871832909e-07, "loss": 0.76273143, "num_input_tokens_seen": 285133510, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 13218, "time_per_iteration": 2.483666181564331 }, { "auxiliary_loss_clip": 0.01114325, "auxiliary_loss_mlp": 0.0103863, "balance_loss_clip": 1.02505207, "balance_loss_mlp": 1.03392053, "epoch": 0.7947692770178867, "flos": 18587866389120.0, "grad_norm": 1.8482004334393327, "language_loss": 0.83147907, "learning_rate": 4.0150448680662064e-07, "loss": 0.85300863, "num_input_tokens_seen": 285151690, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 13219, "time_per_iteration": 2.5457942485809326 }, { "auxiliary_loss_clip": 0.011195, "auxiliary_loss_mlp": 0.01033528, "balance_loss_clip": 1.01990211, "balance_loss_mlp": 1.0372889, "epoch": 0.7948294002705546, "flos": 20442220992000.0, "grad_norm": 2.475103992236208, "language_loss": 0.75643659, "learning_rate": 4.012774434761443e-07, "loss": 0.77796686, "num_input_tokens_seen": 285170485, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 13220, "time_per_iteration": 2.5902562141418457 }, { "auxiliary_loss_clip": 0.01126904, "auxiliary_loss_mlp": 0.01035031, "balance_loss_clip": 1.0220139, "balance_loss_mlp": 1.03640246, "epoch": 0.7948895235232226, "flos": 38181194282880.0, "grad_norm": 1.9545904644705556, "language_loss": 0.7217052, "learning_rate": 4.0105045719996333e-07, "loss": 0.74332458, "num_input_tokens_seen": 285191050, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 13221, "time_per_iteration": 2.764552116394043 }, { "auxiliary_loss_clip": 0.01111288, "auxiliary_loss_mlp": 0.01028316, "balance_loss_clip": 1.01591778, "balance_loss_mlp": 1.03501129, "epoch": 0.7949496467758905, "flos": 15559806977280.0, "grad_norm": 1.8235484973839604, "language_loss": 0.75011694, "learning_rate": 4.008235279861778e-07, "loss": 0.77151293, "num_input_tokens_seen": 285208750, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.67578125, "step": 13222, "time_per_iteration": 2.5961766242980957 }, { "auxiliary_loss_clip": 0.01041313, "auxiliary_loss_mlp": 0.01004063, "balance_loss_clip": 1.0028832, "balance_loss_mlp": 1.00276995, "epoch": 0.7950097700285585, "flos": 70897036728960.0, "grad_norm": 0.7643294561218696, "language_loss": 0.67112827, "learning_rate": 4.0059665584288817e-07, "loss": 0.69158202, "num_input_tokens_seen": 285264605, "router_z_loss_clip": 0.01177979, "router_z_loss_mlp": 0.21289062, "step": 13223, "time_per_iteration": 3.0848307609558105 }, { "auxiliary_loss_clip": 0.01111385, "auxiliary_loss_mlp": 0.01031902, "balance_loss_clip": 1.01945066, "balance_loss_mlp": 1.03317928, "epoch": 0.7950698932812266, "flos": 23951627585280.0, "grad_norm": 1.7785499523515218, "language_loss": 0.71187961, "learning_rate": 4.003698407781888e-07, "loss": 0.73331249, "num_input_tokens_seen": 285283940, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 13224, "time_per_iteration": 2.6084396839141846 }, { "auxiliary_loss_clip": 0.01024084, "auxiliary_loss_mlp": 0.01000024, "balance_loss_clip": 0.99883223, "balance_loss_mlp": 1.00267768, "epoch": 0.7951300165338945, "flos": 60282561415680.0, "grad_norm": 0.6708814666121603, "language_loss": 0.5503453, "learning_rate": 4.001430828001753e-07, "loss": 0.57058632, "num_input_tokens_seen": 285349525, "router_z_loss_clip": 0.01190186, "router_z_loss_mlp": 0.21386719, "step": 13225, "time_per_iteration": 4.586049795150757 }, { "auxiliary_loss_clip": 0.01118451, "auxiliary_loss_mlp": 0.0102316, "balance_loss_clip": 1.01156735, "balance_loss_mlp": 1.0331068, "epoch": 0.7951901397865625, "flos": 22819004956800.0, "grad_norm": 1.9714722009991184, "language_loss": 0.64786106, "learning_rate": 3.9991638191694e-07, "loss": 0.66927719, "num_input_tokens_seen": 285367355, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 13226, "time_per_iteration": 2.586498975753784 }, { "auxiliary_loss_clip": 0.01140669, "auxiliary_loss_mlp": 0.01037958, "balance_loss_clip": 1.02544689, "balance_loss_mlp": 1.03538799, "epoch": 0.7952502630392304, "flos": 35695672871040.0, "grad_norm": 1.9639763206181255, "language_loss": 0.70315903, "learning_rate": 3.9968973813657316e-07, "loss": 0.72494531, "num_input_tokens_seen": 285386190, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 13227, "time_per_iteration": 2.762348175048828 }, { "auxiliary_loss_clip": 0.01127788, "auxiliary_loss_mlp": 0.01027973, "balance_loss_clip": 1.01717877, "balance_loss_mlp": 1.034302, "epoch": 0.7953103862918984, "flos": 25629840869760.0, "grad_norm": 1.678184461098094, "language_loss": 0.69169247, "learning_rate": 3.994631514671625e-07, "loss": 0.7132501, "num_input_tokens_seen": 285406150, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.6640625, "step": 13228, "time_per_iteration": 2.625406265258789 }, { "auxiliary_loss_clip": 0.01139661, "auxiliary_loss_mlp": 0.01034394, "balance_loss_clip": 1.02168679, "balance_loss_mlp": 1.03375363, "epoch": 0.7953705095445663, "flos": 40551980676480.0, "grad_norm": 1.439052077660114, "language_loss": 0.70599627, "learning_rate": 3.992366219167955e-07, "loss": 0.72773683, "num_input_tokens_seen": 285429900, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 13229, "time_per_iteration": 2.864305257797241 }, { "auxiliary_loss_clip": 0.01117356, "auxiliary_loss_mlp": 0.01032163, "balance_loss_clip": 1.01959825, "balance_loss_mlp": 1.03672433, "epoch": 0.7954306327972344, "flos": 27636672706560.0, "grad_norm": 1.8310077139046188, "language_loss": 0.71773899, "learning_rate": 3.990101494935558e-07, "loss": 0.73923421, "num_input_tokens_seen": 285452555, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 13230, "time_per_iteration": 2.593188524246216 }, { "auxiliary_loss_clip": 0.01041444, "auxiliary_loss_mlp": 0.01003907, "balance_loss_clip": 1.00269723, "balance_loss_mlp": 1.0029459, "epoch": 0.7954907560499023, "flos": 59504055995520.0, "grad_norm": 0.9310769119732851, "language_loss": 0.63562965, "learning_rate": 3.987837342055256e-07, "loss": 0.65608323, "num_input_tokens_seen": 285515700, "router_z_loss_clip": 0.01208496, "router_z_loss_mlp": 0.21484375, "step": 13231, "time_per_iteration": 4.624614715576172 }, { "auxiliary_loss_clip": 0.01121647, "auxiliary_loss_mlp": 0.01028425, "balance_loss_clip": 1.0166055, "balance_loss_mlp": 1.03470206, "epoch": 0.7955508793025703, "flos": 20120533764480.0, "grad_norm": 1.726577444716355, "language_loss": 0.69958866, "learning_rate": 3.9855737606078457e-07, "loss": 0.72108936, "num_input_tokens_seen": 285533910, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 13232, "time_per_iteration": 2.601057767868042 }, { "auxiliary_loss_clip": 0.01111763, "auxiliary_loss_mlp": 0.01026059, "balance_loss_clip": 1.01338744, "balance_loss_mlp": 1.03355622, "epoch": 0.7956110025552382, "flos": 26505378881280.0, "grad_norm": 1.5755093545082703, "language_loss": 0.78144693, "learning_rate": 3.9833107506741226e-07, "loss": 0.80282509, "num_input_tokens_seen": 285554080, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 13233, "time_per_iteration": 2.60516619682312 }, { "auxiliary_loss_clip": 0.01114124, "auxiliary_loss_mlp": 0.01031575, "balance_loss_clip": 1.01861131, "balance_loss_mlp": 1.03440499, "epoch": 0.7956711258079062, "flos": 22565475786240.0, "grad_norm": 2.1544465673940345, "language_loss": 0.78968453, "learning_rate": 3.9810483123348315e-07, "loss": 0.81114161, "num_input_tokens_seen": 285572325, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 13234, "time_per_iteration": 2.610109329223633 }, { "auxiliary_loss_clip": 0.01120761, "auxiliary_loss_mlp": 0.0127286, "balance_loss_clip": 1.01484466, "balance_loss_mlp": 1.0355221, "epoch": 0.7957312490605741, "flos": 17379005143680.0, "grad_norm": 1.6564777830223971, "language_loss": 0.70123732, "learning_rate": 3.978786445670723e-07, "loss": 0.72517347, "num_input_tokens_seen": 285589770, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.67578125, "step": 13235, "time_per_iteration": 2.553290843963623 }, { "auxiliary_loss_clip": 0.01131268, "auxiliary_loss_mlp": 0.01027671, "balance_loss_clip": 1.01457, "balance_loss_mlp": 1.03482533, "epoch": 0.7957913723132422, "flos": 22491427898880.0, "grad_norm": 1.5502588532803725, "language_loss": 0.67796767, "learning_rate": 3.9765251507625153e-07, "loss": 0.69955707, "num_input_tokens_seen": 285610065, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 13236, "time_per_iteration": 2.610820770263672 }, { "auxiliary_loss_clip": 0.01137758, "auxiliary_loss_mlp": 0.01026793, "balance_loss_clip": 1.01457429, "balance_loss_mlp": 1.03439522, "epoch": 0.7958514955659101, "flos": 22638087129600.0, "grad_norm": 1.6890469724181334, "language_loss": 0.75177592, "learning_rate": 3.97426442769091e-07, "loss": 0.77342141, "num_input_tokens_seen": 285628480, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 13237, "time_per_iteration": 2.626271963119507 }, { "auxiliary_loss_clip": 0.01115092, "auxiliary_loss_mlp": 0.01039304, "balance_loss_clip": 1.0259285, "balance_loss_mlp": 1.03653049, "epoch": 0.7959116188185781, "flos": 20704225772160.0, "grad_norm": 3.4345430276347786, "language_loss": 0.71550512, "learning_rate": 3.9720042765365823e-07, "loss": 0.7370491, "num_input_tokens_seen": 285647805, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.69921875, "step": 13238, "time_per_iteration": 4.148678541183472 }, { "auxiliary_loss_clip": 0.01112035, "auxiliary_loss_mlp": 0.01027797, "balance_loss_clip": 1.01548839, "balance_loss_mlp": 1.03278053, "epoch": 0.7959717420712461, "flos": 19024683684480.0, "grad_norm": 1.8027935175939054, "language_loss": 0.73880899, "learning_rate": 3.9697446973801885e-07, "loss": 0.7602073, "num_input_tokens_seen": 285665505, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 13239, "time_per_iteration": 4.105223655700684 }, { "auxiliary_loss_clip": 0.01108139, "auxiliary_loss_mlp": 0.01032429, "balance_loss_clip": 1.01960182, "balance_loss_mlp": 1.03533852, "epoch": 0.796031865323914, "flos": 26356636661760.0, "grad_norm": 1.7228021555793969, "language_loss": 0.6891948, "learning_rate": 3.967485690302381e-07, "loss": 0.71060038, "num_input_tokens_seen": 285685855, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 13240, "time_per_iteration": 2.702270030975342 }, { "auxiliary_loss_clip": 0.01033609, "auxiliary_loss_mlp": 0.01000318, "balance_loss_clip": 0.99909002, "balance_loss_mlp": 1.0031637, "epoch": 0.796091988576582, "flos": 62069440320000.0, "grad_norm": 0.8895232927946175, "language_loss": 0.58713734, "learning_rate": 3.9652272553837583e-07, "loss": 0.60747659, "num_input_tokens_seen": 285735710, "router_z_loss_clip": 0.01226807, "router_z_loss_mlp": 0.21484375, "step": 13241, "time_per_iteration": 3.0386886596679688 }, { "auxiliary_loss_clip": 0.01131766, "auxiliary_loss_mlp": 0.01031498, "balance_loss_clip": 1.02018499, "balance_loss_mlp": 1.03625035, "epoch": 0.7961521118292499, "flos": 39020103400320.0, "grad_norm": 1.8318433818376754, "language_loss": 0.64550948, "learning_rate": 3.9629693927049355e-07, "loss": 0.66714215, "num_input_tokens_seen": 285757045, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.69140625, "step": 13242, "time_per_iteration": 2.847909927368164 }, { "auxiliary_loss_clip": 0.01105183, "auxiliary_loss_mlp": 0.0103264, "balance_loss_clip": 1.01940203, "balance_loss_mlp": 1.03456497, "epoch": 0.796212235081918, "flos": 21762836426880.0, "grad_norm": 1.7578952141464326, "language_loss": 0.76212394, "learning_rate": 3.9607121023464816e-07, "loss": 0.78350222, "num_input_tokens_seen": 285776050, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 13243, "time_per_iteration": 2.5313053131103516 }, { "auxiliary_loss_clip": 0.01132022, "auxiliary_loss_mlp": 0.01031674, "balance_loss_clip": 1.01835191, "balance_loss_mlp": 1.03479815, "epoch": 0.7962723583345859, "flos": 21178857110400.0, "grad_norm": 1.6870141907291292, "language_loss": 0.79443532, "learning_rate": 3.9584553843889547e-07, "loss": 0.81607234, "num_input_tokens_seen": 285796830, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 13244, "time_per_iteration": 2.5615756511688232 }, { "auxiliary_loss_clip": 0.01116014, "auxiliary_loss_mlp": 0.01030993, "balance_loss_clip": 1.01783252, "balance_loss_mlp": 1.03471851, "epoch": 0.7963324815872539, "flos": 17128636369920.0, "grad_norm": 2.330211511696903, "language_loss": 0.68683255, "learning_rate": 3.9561992389128875e-07, "loss": 0.70830262, "num_input_tokens_seen": 285814755, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 13245, "time_per_iteration": 2.4759039878845215 }, { "auxiliary_loss_clip": 0.01121973, "auxiliary_loss_mlp": 0.01035075, "balance_loss_clip": 1.02275503, "balance_loss_mlp": 1.03466392, "epoch": 0.7963926048399218, "flos": 21397481239680.0, "grad_norm": 1.5386806893362122, "language_loss": 0.79149657, "learning_rate": 3.953943665998802e-07, "loss": 0.81306708, "num_input_tokens_seen": 285834255, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 13246, "time_per_iteration": 2.584716320037842 }, { "auxiliary_loss_clip": 0.01122908, "auxiliary_loss_mlp": 0.01027481, "balance_loss_clip": 1.01487505, "balance_loss_mlp": 1.03385282, "epoch": 0.7964527280925898, "flos": 25184188828800.0, "grad_norm": 2.977165061596545, "language_loss": 0.66151011, "learning_rate": 3.9516886657271955e-07, "loss": 0.68301404, "num_input_tokens_seen": 285853540, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 13247, "time_per_iteration": 2.5922939777374268 }, { "auxiliary_loss_clip": 0.0111813, "auxiliary_loss_mlp": 0.01032452, "balance_loss_clip": 1.02097249, "balance_loss_mlp": 1.0328846, "epoch": 0.7965128513452577, "flos": 27015884928000.0, "grad_norm": 1.7420778730662818, "language_loss": 0.7164197, "learning_rate": 3.949434238178537e-07, "loss": 0.73792553, "num_input_tokens_seen": 285872705, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 13248, "time_per_iteration": 2.5599610805511475 }, { "auxiliary_loss_clip": 0.01122937, "auxiliary_loss_mlp": 0.0102924, "balance_loss_clip": 1.0164789, "balance_loss_mlp": 1.03407633, "epoch": 0.7965729745979258, "flos": 24419578993920.0, "grad_norm": 3.130009970736379, "language_loss": 0.75970447, "learning_rate": 3.947180383433277e-07, "loss": 0.78122628, "num_input_tokens_seen": 285890290, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 13249, "time_per_iteration": 2.5533697605133057 }, { "auxiliary_loss_clip": 0.01120623, "auxiliary_loss_mlp": 0.01031694, "balance_loss_clip": 1.01936829, "balance_loss_mlp": 1.03441966, "epoch": 0.7966330978505937, "flos": 18840389978880.0, "grad_norm": 2.1355510178930004, "language_loss": 0.61420101, "learning_rate": 3.944927101571871e-07, "loss": 0.63572419, "num_input_tokens_seen": 285909190, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 13250, "time_per_iteration": 2.53391432762146 }, { "auxiliary_loss_clip": 0.01122582, "auxiliary_loss_mlp": 0.0102805, "balance_loss_clip": 1.01534855, "balance_loss_mlp": 1.03542805, "epoch": 0.7966932211032617, "flos": 13152319862400.0, "grad_norm": 2.5673790201201356, "language_loss": 0.72159541, "learning_rate": 3.9426743926747095e-07, "loss": 0.74310184, "num_input_tokens_seen": 285927570, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 13251, "time_per_iteration": 2.5171375274658203 }, { "auxiliary_loss_clip": 0.0110895, "auxiliary_loss_mlp": 0.0103019, "balance_loss_clip": 1.01653469, "balance_loss_mlp": 1.03661644, "epoch": 0.7967533443559297, "flos": 23949760078080.0, "grad_norm": 1.8389726780878746, "language_loss": 0.7281816, "learning_rate": 3.940422256822191e-07, "loss": 0.74957287, "num_input_tokens_seen": 285945810, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 13252, "time_per_iteration": 2.5170488357543945 }, { "auxiliary_loss_clip": 0.01102728, "auxiliary_loss_mlp": 0.01028517, "balance_loss_clip": 1.01586258, "balance_loss_mlp": 1.03564095, "epoch": 0.7968134676085976, "flos": 30368791964160.0, "grad_norm": 1.8393869493438768, "language_loss": 0.66193032, "learning_rate": 3.9381706940946957e-07, "loss": 0.6832428, "num_input_tokens_seen": 285964235, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.671875, "step": 13253, "time_per_iteration": 2.5610692501068115 }, { "auxiliary_loss_clip": 0.01134745, "auxiliary_loss_mlp": 0.01037157, "balance_loss_clip": 1.02398407, "balance_loss_mlp": 1.03689563, "epoch": 0.7968735908612656, "flos": 23075048079360.0, "grad_norm": 1.7033387688329242, "language_loss": 0.67934799, "learning_rate": 3.9359197045725747e-07, "loss": 0.70106703, "num_input_tokens_seen": 285983710, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 13254, "time_per_iteration": 2.650632858276367 }, { "auxiliary_loss_clip": 0.01111906, "auxiliary_loss_mlp": 0.01035998, "balance_loss_clip": 1.02384496, "balance_loss_mlp": 1.03469932, "epoch": 0.7969337141139335, "flos": 23582250074880.0, "grad_norm": 1.848171622451551, "language_loss": 0.69439471, "learning_rate": 3.933669288336154e-07, "loss": 0.71587372, "num_input_tokens_seen": 286003425, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 13255, "time_per_iteration": 2.5389723777770996 }, { "auxiliary_loss_clip": 0.01101969, "auxiliary_loss_mlp": 0.01030928, "balance_loss_clip": 1.01888227, "balance_loss_mlp": 1.03346598, "epoch": 0.7969938373666016, "flos": 19755860935680.0, "grad_norm": 1.9330139908923063, "language_loss": 0.79230922, "learning_rate": 3.931419445465747e-07, "loss": 0.81363821, "num_input_tokens_seen": 286020130, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 13256, "time_per_iteration": 2.5103328227996826 }, { "auxiliary_loss_clip": 0.01114089, "auxiliary_loss_mlp": 0.01028562, "balance_loss_clip": 1.01595592, "balance_loss_mlp": 1.03568578, "epoch": 0.7970539606192695, "flos": 24134089697280.0, "grad_norm": 2.280958621030601, "language_loss": 0.65689379, "learning_rate": 3.929170176041656e-07, "loss": 0.67832029, "num_input_tokens_seen": 286040230, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 13257, "time_per_iteration": 2.548889636993408 }, { "auxiliary_loss_clip": 0.01127902, "auxiliary_loss_mlp": 0.01033732, "balance_loss_clip": 1.02107835, "balance_loss_mlp": 1.03765845, "epoch": 0.7971140838719375, "flos": 17968622895360.0, "grad_norm": 2.4743109505041523, "language_loss": 0.71864754, "learning_rate": 3.926921480144132e-07, "loss": 0.74026388, "num_input_tokens_seen": 286059475, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 13258, "time_per_iteration": 2.5044379234313965 }, { "auxiliary_loss_clip": 0.01114494, "auxiliary_loss_mlp": 0.01031815, "balance_loss_clip": 1.01860642, "balance_loss_mlp": 1.03421307, "epoch": 0.7971742071246054, "flos": 19169547235200.0, "grad_norm": 2.128422562847979, "language_loss": 0.68759418, "learning_rate": 3.9246733578534405e-07, "loss": 0.70905721, "num_input_tokens_seen": 286077820, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 13259, "time_per_iteration": 2.534987449645996 }, { "auxiliary_loss_clip": 0.01109989, "auxiliary_loss_mlp": 0.01030758, "balance_loss_clip": 1.01933765, "balance_loss_mlp": 1.03414083, "epoch": 0.7972343303772734, "flos": 27125951178240.0, "grad_norm": 2.047523352103931, "language_loss": 0.73730284, "learning_rate": 3.9224258092498074e-07, "loss": 0.75871027, "num_input_tokens_seen": 286097285, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66796875, "step": 13260, "time_per_iteration": 2.609891891479492 }, { "auxiliary_loss_clip": 0.01120212, "auxiliary_loss_mlp": 0.01029084, "balance_loss_clip": 1.0174675, "balance_loss_mlp": 1.03396094, "epoch": 0.7972944536299413, "flos": 20996646393600.0, "grad_norm": 9.356559634659149, "language_loss": 0.78178728, "learning_rate": 3.920178834413439e-07, "loss": 0.80328023, "num_input_tokens_seen": 286116000, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 13261, "time_per_iteration": 2.5654501914978027 }, { "auxiliary_loss_clip": 0.01107948, "auxiliary_loss_mlp": 0.0102732, "balance_loss_clip": 1.01651335, "balance_loss_mlp": 1.03416479, "epoch": 0.7973545768826094, "flos": 21580015178880.0, "grad_norm": 1.4767519363578914, "language_loss": 0.76335704, "learning_rate": 3.91793243342452e-07, "loss": 0.78470975, "num_input_tokens_seen": 286135110, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.6484375, "step": 13262, "time_per_iteration": 2.5675208568573 }, { "auxiliary_loss_clip": 0.01136796, "auxiliary_loss_mlp": 0.01034017, "balance_loss_clip": 1.0200336, "balance_loss_mlp": 1.0373044, "epoch": 0.7974147001352773, "flos": 20558536208640.0, "grad_norm": 2.2006010680419896, "language_loss": 0.70569563, "learning_rate": 3.915686606363231e-07, "loss": 0.7274037, "num_input_tokens_seen": 286152835, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7265625, "step": 13263, "time_per_iteration": 2.5962862968444824 }, { "auxiliary_loss_clip": 0.01119822, "auxiliary_loss_mlp": 0.01034405, "balance_loss_clip": 1.02099442, "balance_loss_mlp": 1.03677917, "epoch": 0.7974748233879453, "flos": 20996790048000.0, "grad_norm": 1.624164533029211, "language_loss": 0.70872748, "learning_rate": 3.9134413533097143e-07, "loss": 0.73026979, "num_input_tokens_seen": 286171785, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 13264, "time_per_iteration": 2.5515859127044678 }, { "auxiliary_loss_clip": 0.01102658, "auxiliary_loss_mlp": 0.01030531, "balance_loss_clip": 1.0185864, "balance_loss_mlp": 1.03532839, "epoch": 0.7975349466406133, "flos": 22565188477440.0, "grad_norm": 1.725239136434542, "language_loss": 0.76875943, "learning_rate": 3.911196674344095e-07, "loss": 0.79009128, "num_input_tokens_seen": 286190420, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 13265, "time_per_iteration": 2.6059577465057373 }, { "auxiliary_loss_clip": 0.0113085, "auxiliary_loss_mlp": 0.01029612, "balance_loss_clip": 1.01760149, "balance_loss_mlp": 1.03380966, "epoch": 0.7975950698932812, "flos": 21689542725120.0, "grad_norm": 1.9893703136296117, "language_loss": 0.7540248, "learning_rate": 3.90895256954648e-07, "loss": 0.7756294, "num_input_tokens_seen": 286210105, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 13266, "time_per_iteration": 3.959554672241211 }, { "auxiliary_loss_clip": 0.01119326, "auxiliary_loss_mlp": 0.01027513, "balance_loss_clip": 1.0154314, "balance_loss_mlp": 1.03330302, "epoch": 0.7976551931459492, "flos": 19604568850560.0, "grad_norm": 1.7684348001601442, "language_loss": 0.84532344, "learning_rate": 3.9067090389969583e-07, "loss": 0.86679184, "num_input_tokens_seen": 286228180, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 13267, "time_per_iteration": 2.565495252609253 }, { "auxiliary_loss_clip": 0.01112755, "auxiliary_loss_mlp": 0.01033487, "balance_loss_clip": 1.02067852, "balance_loss_mlp": 1.03488803, "epoch": 0.7977153163986171, "flos": 21687603390720.0, "grad_norm": 1.5094643948764679, "language_loss": 0.76051664, "learning_rate": 3.904466082775593e-07, "loss": 0.78197908, "num_input_tokens_seen": 286247305, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 13268, "time_per_iteration": 2.5644123554229736 }, { "auxiliary_loss_clip": 0.01103478, "auxiliary_loss_mlp": 0.01028615, "balance_loss_clip": 1.01619351, "balance_loss_mlp": 1.03480458, "epoch": 0.7977754396512852, "flos": 23476780765440.0, "grad_norm": 3.2405848563283994, "language_loss": 0.77705383, "learning_rate": 3.902223700962426e-07, "loss": 0.79837465, "num_input_tokens_seen": 286268145, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 13269, "time_per_iteration": 2.546985149383545 }, { "auxiliary_loss_clip": 0.01108888, "auxiliary_loss_mlp": 0.01032031, "balance_loss_clip": 1.02050984, "balance_loss_mlp": 1.03313911, "epoch": 0.7978355629039531, "flos": 22382223575040.0, "grad_norm": 2.0741287866312916, "language_loss": 0.81910408, "learning_rate": 3.8999818936374964e-07, "loss": 0.84051335, "num_input_tokens_seen": 286286775, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 13270, "time_per_iteration": 2.492610454559326 }, { "auxiliary_loss_clip": 0.01130691, "auxiliary_loss_mlp": 0.01030723, "balance_loss_clip": 1.01818252, "balance_loss_mlp": 1.03524959, "epoch": 0.7978956861566211, "flos": 20266331068800.0, "grad_norm": 1.7181567280704906, "language_loss": 0.59408128, "learning_rate": 3.8977406608807883e-07, "loss": 0.61569548, "num_input_tokens_seen": 286305590, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 13271, "time_per_iteration": 2.5191447734832764 }, { "auxiliary_loss_clip": 0.01126813, "auxiliary_loss_mlp": 0.01033195, "balance_loss_clip": 1.02060056, "balance_loss_mlp": 1.03654325, "epoch": 0.797955809409289, "flos": 28112417366400.0, "grad_norm": 1.4585363353800982, "language_loss": 0.73297358, "learning_rate": 3.895500002772303e-07, "loss": 0.75457364, "num_input_tokens_seen": 286328050, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 13272, "time_per_iteration": 4.011946439743042 }, { "auxiliary_loss_clip": 0.01148825, "auxiliary_loss_mlp": 0.01027582, "balance_loss_clip": 1.01507711, "balance_loss_mlp": 1.03627348, "epoch": 0.798015932661957, "flos": 15559591495680.0, "grad_norm": 1.7214864211416134, "language_loss": 0.71655416, "learning_rate": 3.893259919391989e-07, "loss": 0.7383182, "num_input_tokens_seen": 286345265, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 13273, "time_per_iteration": 2.5303633213043213 }, { "auxiliary_loss_clip": 0.01125737, "auxiliary_loss_mlp": 0.01033681, "balance_loss_clip": 1.02099156, "balance_loss_mlp": 1.03592443, "epoch": 0.7980760559146249, "flos": 23951196622080.0, "grad_norm": 2.6249767613043478, "language_loss": 0.75974059, "learning_rate": 3.89102041081981e-07, "loss": 0.78133476, "num_input_tokens_seen": 286364465, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 13274, "time_per_iteration": 2.550273895263672 }, { "auxiliary_loss_clip": 0.01133863, "auxiliary_loss_mlp": 0.01030107, "balance_loss_clip": 1.0185256, "balance_loss_mlp": 1.03371489, "epoch": 0.798136179167293, "flos": 28038082170240.0, "grad_norm": 1.378804191631296, "language_loss": 0.77841127, "learning_rate": 3.888781477135663e-07, "loss": 0.80005085, "num_input_tokens_seen": 286385565, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6484375, "step": 13275, "time_per_iteration": 2.628521203994751 }, { "auxiliary_loss_clip": 0.011164, "auxiliary_loss_mlp": 0.01033184, "balance_loss_clip": 1.01964772, "balance_loss_mlp": 1.03548074, "epoch": 0.7981963024199609, "flos": 35984538046080.0, "grad_norm": 1.7588631080756134, "language_loss": 0.64133054, "learning_rate": 3.88654311841947e-07, "loss": 0.66282642, "num_input_tokens_seen": 286403950, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 13276, "time_per_iteration": 2.6828460693359375 }, { "auxiliary_loss_clip": 0.01128837, "auxiliary_loss_mlp": 0.01029122, "balance_loss_clip": 1.0171001, "balance_loss_mlp": 1.03466892, "epoch": 0.7982564256726289, "flos": 25884914325120.0, "grad_norm": 1.5707932817917885, "language_loss": 0.6062364, "learning_rate": 3.884305334751101e-07, "loss": 0.62781596, "num_input_tokens_seen": 286426160, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 13277, "time_per_iteration": 2.6000664234161377 }, { "auxiliary_loss_clip": 0.01126547, "auxiliary_loss_mlp": 0.01031582, "balance_loss_clip": 1.01941109, "balance_loss_mlp": 1.03718495, "epoch": 0.7983165489252969, "flos": 25739152934400.0, "grad_norm": 1.9724625078744749, "language_loss": 0.79782468, "learning_rate": 3.8820681262104226e-07, "loss": 0.81940597, "num_input_tokens_seen": 286446610, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.71484375, "step": 13278, "time_per_iteration": 2.580493450164795 }, { "auxiliary_loss_clip": 0.01124865, "auxiliary_loss_mlp": 0.01036048, "balance_loss_clip": 1.0222615, "balance_loss_mlp": 1.03517294, "epoch": 0.7983766721779648, "flos": 21908202768000.0, "grad_norm": 2.126914523520179, "language_loss": 0.63268411, "learning_rate": 3.8798314928772656e-07, "loss": 0.65429318, "num_input_tokens_seen": 286465460, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 13279, "time_per_iteration": 3.982475757598877 }, { "auxiliary_loss_clip": 0.01106195, "auxiliary_loss_mlp": 0.01028486, "balance_loss_clip": 1.0173099, "balance_loss_mlp": 1.03329241, "epoch": 0.7984367954306328, "flos": 29347420734720.0, "grad_norm": 1.6415206296941176, "language_loss": 0.70747113, "learning_rate": 3.877595434831462e-07, "loss": 0.72881794, "num_input_tokens_seen": 286485720, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.640625, "step": 13280, "time_per_iteration": 2.583488941192627 }, { "auxiliary_loss_clip": 0.01136152, "auxiliary_loss_mlp": 0.01029881, "balance_loss_clip": 1.01646423, "balance_loss_mlp": 1.03754783, "epoch": 0.7984969186833007, "flos": 31357772104320.0, "grad_norm": 1.8541201616400205, "language_loss": 0.62731302, "learning_rate": 3.875359952152812e-07, "loss": 0.64897341, "num_input_tokens_seen": 286507465, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 13281, "time_per_iteration": 4.151926755905151 }, { "auxiliary_loss_clip": 0.01116985, "auxiliary_loss_mlp": 0.01034086, "balance_loss_clip": 1.0216707, "balance_loss_mlp": 1.03472066, "epoch": 0.7985570419359688, "flos": 24312924535680.0, "grad_norm": 2.1484671531363935, "language_loss": 0.8020314, "learning_rate": 3.8731250449210753e-07, "loss": 0.82354212, "num_input_tokens_seen": 286526345, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.734375, "step": 13282, "time_per_iteration": 2.537076234817505 }, { "auxiliary_loss_clip": 0.01131552, "auxiliary_loss_mlp": 0.01276905, "balance_loss_clip": 1.01701117, "balance_loss_mlp": 1.03389072, "epoch": 0.7986171651886367, "flos": 15742233175680.0, "grad_norm": 1.9047422662322997, "language_loss": 0.71429873, "learning_rate": 3.870890713216031e-07, "loss": 0.73838329, "num_input_tokens_seen": 286544095, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 13283, "time_per_iteration": 2.6400110721588135 }, { "auxiliary_loss_clip": 0.01109056, "auxiliary_loss_mlp": 0.01026853, "balance_loss_clip": 1.01533747, "balance_loss_mlp": 1.03175092, "epoch": 0.7986772884413047, "flos": 11619401091840.0, "grad_norm": 1.9273955491461774, "language_loss": 0.73755121, "learning_rate": 3.868656957117404e-07, "loss": 0.7589103, "num_input_tokens_seen": 286560960, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.68359375, "step": 13284, "time_per_iteration": 2.540843963623047 }, { "auxiliary_loss_clip": 0.01121421, "auxiliary_loss_mlp": 0.01031839, "balance_loss_clip": 1.02085328, "balance_loss_mlp": 1.03509295, "epoch": 0.7987374116939726, "flos": 22091059929600.0, "grad_norm": 1.5005083193821451, "language_loss": 0.70233041, "learning_rate": 3.866423776704919e-07, "loss": 0.72386301, "num_input_tokens_seen": 286579865, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6796875, "step": 13285, "time_per_iteration": 2.6087684631347656 }, { "auxiliary_loss_clip": 0.01121195, "auxiliary_loss_mlp": 0.01026571, "balance_loss_clip": 1.01475167, "balance_loss_mlp": 1.03447711, "epoch": 0.7987975349466406, "flos": 17890696339200.0, "grad_norm": 1.6268321141884023, "language_loss": 0.73426068, "learning_rate": 3.864191172058262e-07, "loss": 0.75573838, "num_input_tokens_seen": 286597295, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 13286, "time_per_iteration": 2.5598747730255127 }, { "auxiliary_loss_clip": 0.01122274, "auxiliary_loss_mlp": 0.01032875, "balance_loss_clip": 1.01993477, "balance_loss_mlp": 1.03499305, "epoch": 0.7988576581993085, "flos": 19719232041600.0, "grad_norm": 1.7652060108971739, "language_loss": 0.75169414, "learning_rate": 3.8619591432571255e-07, "loss": 0.77324569, "num_input_tokens_seen": 286616270, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 13287, "time_per_iteration": 2.5895087718963623 }, { "auxiliary_loss_clip": 0.01121304, "auxiliary_loss_mlp": 0.01029062, "balance_loss_clip": 1.01703417, "balance_loss_mlp": 1.03549695, "epoch": 0.7989177814519766, "flos": 28036358317440.0, "grad_norm": 1.5587156198236742, "language_loss": 0.61225879, "learning_rate": 3.8597276903811446e-07, "loss": 0.63376248, "num_input_tokens_seen": 286638315, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 13288, "time_per_iteration": 2.6546239852905273 }, { "auxiliary_loss_clip": 0.01112674, "auxiliary_loss_mlp": 0.01033541, "balance_loss_clip": 1.02204967, "balance_loss_mlp": 1.03628898, "epoch": 0.7989779047046445, "flos": 28871029630080.0, "grad_norm": 2.3233314976247943, "language_loss": 0.70101058, "learning_rate": 3.857496813509973e-07, "loss": 0.72247267, "num_input_tokens_seen": 286658630, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.67578125, "step": 13289, "time_per_iteration": 2.6555700302124023 }, { "auxiliary_loss_clip": 0.01129346, "auxiliary_loss_mlp": 0.01034148, "balance_loss_clip": 1.021101, "balance_loss_mlp": 1.03544712, "epoch": 0.7990380279573125, "flos": 18186887888640.0, "grad_norm": 2.514430751319645, "language_loss": 0.63187969, "learning_rate": 3.8552665127232073e-07, "loss": 0.65351468, "num_input_tokens_seen": 286676870, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.67578125, "step": 13290, "time_per_iteration": 2.538767099380493 }, { "auxiliary_loss_clip": 0.01114744, "auxiliary_loss_mlp": 0.01030118, "balance_loss_clip": 1.01694524, "balance_loss_mlp": 1.03580976, "epoch": 0.7990981512099805, "flos": 20879936127360.0, "grad_norm": 2.0241029107703086, "language_loss": 0.71166778, "learning_rate": 3.8530367881004656e-07, "loss": 0.73311639, "num_input_tokens_seen": 286694300, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 13291, "time_per_iteration": 2.592515468597412 }, { "auxiliary_loss_clip": 0.01127615, "auxiliary_loss_mlp": 0.01030148, "balance_loss_clip": 1.01900768, "balance_loss_mlp": 1.0336628, "epoch": 0.7991582744626484, "flos": 26099911180800.0, "grad_norm": 1.53837270304919, "language_loss": 0.63737619, "learning_rate": 3.850807639721292e-07, "loss": 0.65895379, "num_input_tokens_seen": 286714545, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.671875, "step": 13292, "time_per_iteration": 2.6047959327697754 }, { "auxiliary_loss_clip": 0.01112304, "auxiliary_loss_mlp": 0.01030451, "balance_loss_clip": 1.01888156, "balance_loss_mlp": 1.03498888, "epoch": 0.7992183977153164, "flos": 35295843605760.0, "grad_norm": 1.789525609408765, "language_loss": 0.5619477, "learning_rate": 3.8485790676652585e-07, "loss": 0.58337522, "num_input_tokens_seen": 286734525, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.68359375, "step": 13293, "time_per_iteration": 2.6794774532318115 }, { "auxiliary_loss_clip": 0.01123538, "auxiliary_loss_mlp": 0.01033015, "balance_loss_clip": 1.02036691, "balance_loss_mlp": 1.03601861, "epoch": 0.7992785209679844, "flos": 51853426577280.0, "grad_norm": 2.823857726697475, "language_loss": 0.71133977, "learning_rate": 3.846351072011893e-07, "loss": 0.73290533, "num_input_tokens_seen": 286753430, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 13294, "time_per_iteration": 2.810851812362671 }, { "auxiliary_loss_clip": 0.01131776, "auxiliary_loss_mlp": 0.01031025, "balance_loss_clip": 1.0188179, "balance_loss_mlp": 1.03598917, "epoch": 0.7993386442206524, "flos": 22565116650240.0, "grad_norm": 1.993139540847925, "language_loss": 0.72216737, "learning_rate": 3.844123652840705e-07, "loss": 0.74379539, "num_input_tokens_seen": 286771915, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 13295, "time_per_iteration": 2.616208553314209 }, { "auxiliary_loss_clip": 0.01128481, "auxiliary_loss_mlp": 0.01033901, "balance_loss_clip": 1.02254629, "balance_loss_mlp": 1.03512549, "epoch": 0.7993987674733203, "flos": 18800277465600.0, "grad_norm": 1.7239936054145195, "language_loss": 0.7621578, "learning_rate": 3.8418968102311866e-07, "loss": 0.78378153, "num_input_tokens_seen": 286789835, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6640625, "step": 13296, "time_per_iteration": 2.5161545276641846 }, { "auxiliary_loss_clip": 0.01050356, "auxiliary_loss_mlp": 0.01001587, "balance_loss_clip": 1.00035274, "balance_loss_mlp": 1.00219977, "epoch": 0.7994588907259883, "flos": 69421720394880.0, "grad_norm": 0.7073153989031371, "language_loss": 0.60809147, "learning_rate": 3.839670544262801e-07, "loss": 0.62861085, "num_input_tokens_seen": 286855580, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.21386719, "step": 13297, "time_per_iteration": 3.2051172256469727 }, { "auxiliary_loss_clip": 0.0112158, "auxiliary_loss_mlp": 0.01031679, "balance_loss_clip": 1.01967514, "balance_loss_mlp": 1.03544259, "epoch": 0.7995190139786562, "flos": 13480327883520.0, "grad_norm": 2.0133115386294103, "language_loss": 0.70618695, "learning_rate": 3.837444855015015e-07, "loss": 0.72771955, "num_input_tokens_seen": 286874360, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 13298, "time_per_iteration": 2.519041061401367 }, { "auxiliary_loss_clip": 0.01134544, "auxiliary_loss_mlp": 0.01036194, "balance_loss_clip": 1.02268767, "balance_loss_mlp": 1.03524339, "epoch": 0.7995791372313242, "flos": 21652842003840.0, "grad_norm": 2.029321310021909, "language_loss": 0.7533645, "learning_rate": 3.835219742567237e-07, "loss": 0.77507186, "num_input_tokens_seen": 286891950, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 13299, "time_per_iteration": 2.582101821899414 }, { "auxiliary_loss_clip": 0.01118873, "auxiliary_loss_mlp": 0.01029947, "balance_loss_clip": 1.0186162, "balance_loss_mlp": 1.03456926, "epoch": 0.7996392604839921, "flos": 26068130622720.0, "grad_norm": 1.5985591811273272, "language_loss": 0.77284122, "learning_rate": 3.8329952069988925e-07, "loss": 0.7943294, "num_input_tokens_seen": 286911725, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6640625, "step": 13300, "time_per_iteration": 2.58050537109375 }, { "auxiliary_loss_clip": 0.01131433, "auxiliary_loss_mlp": 0.01027542, "balance_loss_clip": 1.01485193, "balance_loss_mlp": 1.03394437, "epoch": 0.7996993837366602, "flos": 24606889441920.0, "grad_norm": 1.7051603994067581, "language_loss": 0.63596702, "learning_rate": 3.8307712483893596e-07, "loss": 0.65755671, "num_input_tokens_seen": 286931400, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 13301, "time_per_iteration": 2.6072299480438232 }, { "auxiliary_loss_clip": 0.01110374, "auxiliary_loss_mlp": 0.01037181, "balance_loss_clip": 1.02426493, "balance_loss_mlp": 1.03471136, "epoch": 0.7997595069893281, "flos": 20992049452800.0, "grad_norm": 1.5862335763607358, "language_loss": 0.71677351, "learning_rate": 3.8285478668180103e-07, "loss": 0.73824906, "num_input_tokens_seen": 286949795, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.66796875, "step": 13302, "time_per_iteration": 2.584066390991211 }, { "auxiliary_loss_clip": 0.01105715, "auxiliary_loss_mlp": 0.01277466, "balance_loss_clip": 1.01796317, "balance_loss_mlp": 1.0347991, "epoch": 0.7998196302419961, "flos": 24426510318720.0, "grad_norm": 1.7812244214000674, "language_loss": 0.83634233, "learning_rate": 3.826325062364184e-07, "loss": 0.86017406, "num_input_tokens_seen": 286968805, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 13303, "time_per_iteration": 2.5528318881988525 }, { "auxiliary_loss_clip": 0.01100597, "auxiliary_loss_mlp": 0.01031583, "balance_loss_clip": 1.02004361, "balance_loss_mlp": 1.03398848, "epoch": 0.7998797534946641, "flos": 30264651457920.0, "grad_norm": 1.7620847129407091, "language_loss": 0.5877353, "learning_rate": 3.8241028351072234e-07, "loss": 0.60905707, "num_input_tokens_seen": 286990235, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 13304, "time_per_iteration": 2.6131432056427 }, { "auxiliary_loss_clip": 0.01105092, "auxiliary_loss_mlp": 0.01028377, "balance_loss_clip": 1.01591325, "balance_loss_mlp": 1.03334785, "epoch": 0.799939876747332, "flos": 23513984277120.0, "grad_norm": 1.4734854375411937, "language_loss": 0.69351995, "learning_rate": 3.821881185126412e-07, "loss": 0.7148546, "num_input_tokens_seen": 287011060, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.71875, "step": 13305, "time_per_iteration": 2.591355800628662 }, { "auxiliary_loss_clip": 0.01128572, "auxiliary_loss_mlp": 0.01029393, "balance_loss_clip": 1.01815104, "balance_loss_mlp": 1.03437257, "epoch": 0.8, "flos": 19318109886720.0, "grad_norm": 2.015624451688603, "language_loss": 0.69256586, "learning_rate": 3.819660112501053e-07, "loss": 0.71414554, "num_input_tokens_seen": 287029215, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.671875, "step": 13306, "time_per_iteration": 2.617152690887451 }, { "auxiliary_loss_clip": 0.01113896, "auxiliary_loss_mlp": 0.01032363, "balance_loss_clip": 1.02027559, "balance_loss_mlp": 1.03634477, "epoch": 0.800060123252668, "flos": 32412432263040.0, "grad_norm": 2.0085311918155933, "language_loss": 0.69784522, "learning_rate": 3.817439617310396e-07, "loss": 0.71930778, "num_input_tokens_seen": 287050855, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 13307, "time_per_iteration": 2.6921145915985107 }, { "auxiliary_loss_clip": 0.01122025, "auxiliary_loss_mlp": 0.0103189, "balance_loss_clip": 1.02055931, "balance_loss_mlp": 1.03424907, "epoch": 0.800120246505336, "flos": 20010611168640.0, "grad_norm": 1.744576814707919, "language_loss": 0.76758707, "learning_rate": 3.815219699633705e-07, "loss": 0.78912622, "num_input_tokens_seen": 287069915, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6953125, "step": 13308, "time_per_iteration": 3.9956889152526855 }, { "auxiliary_loss_clip": 0.01144655, "auxiliary_loss_mlp": 0.01034125, "balance_loss_clip": 1.02055883, "balance_loss_mlp": 1.03584003, "epoch": 0.8001803697580039, "flos": 31868278151040.0, "grad_norm": 1.7639319852888435, "language_loss": 0.78552282, "learning_rate": 3.8130003595501803e-07, "loss": 0.80731058, "num_input_tokens_seen": 287091450, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 13309, "time_per_iteration": 2.622227668762207 }, { "auxiliary_loss_clip": 0.01024064, "auxiliary_loss_mlp": 0.01000622, "balance_loss_clip": 0.9994238, "balance_loss_mlp": 1.00276005, "epoch": 0.8002404930106719, "flos": 63392066916480.0, "grad_norm": 0.8719837781898134, "language_loss": 0.64726734, "learning_rate": 3.810781597139039e-07, "loss": 0.6675142, "num_input_tokens_seen": 287148365, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.21289062, "step": 13310, "time_per_iteration": 3.0642964839935303 }, { "auxiliary_loss_clip": 0.01113586, "auxiliary_loss_mlp": 0.01032697, "balance_loss_clip": 1.02067518, "balance_loss_mlp": 1.03620028, "epoch": 0.8003006162633398, "flos": 27855476403840.0, "grad_norm": 1.8298266303199746, "language_loss": 0.82762134, "learning_rate": 3.808563412479464e-07, "loss": 0.84908426, "num_input_tokens_seen": 287168280, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 13311, "time_per_iteration": 2.5475659370422363 }, { "auxiliary_loss_clip": 0.01112222, "auxiliary_loss_mlp": 0.01033873, "balance_loss_clip": 1.02155256, "balance_loss_mlp": 1.03490305, "epoch": 0.8003607395160078, "flos": 18223337214720.0, "grad_norm": 2.862094341457588, "language_loss": 0.66337204, "learning_rate": 3.8063458056506016e-07, "loss": 0.68483293, "num_input_tokens_seen": 287185980, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 13312, "time_per_iteration": 2.5139949321746826 }, { "auxiliary_loss_clip": 0.01108863, "auxiliary_loss_mlp": 0.01034365, "balance_loss_clip": 1.02141309, "balance_loss_mlp": 1.03685558, "epoch": 0.8004208627686757, "flos": 20886975192960.0, "grad_norm": 1.9370714295222742, "language_loss": 0.75339305, "learning_rate": 3.8041287767316076e-07, "loss": 0.77482533, "num_input_tokens_seen": 287203875, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 13313, "time_per_iteration": 3.959953784942627 }, { "auxiliary_loss_clip": 0.01112671, "auxiliary_loss_mlp": 0.0103064, "balance_loss_clip": 1.01815867, "balance_loss_mlp": 1.03444147, "epoch": 0.8004809860213438, "flos": 26436143416320.0, "grad_norm": 1.3598337931012525, "language_loss": 0.7562238, "learning_rate": 3.8019123258015906e-07, "loss": 0.77765691, "num_input_tokens_seen": 287226445, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 13314, "time_per_iteration": 2.737546682357788 }, { "auxiliary_loss_clip": 0.01059184, "auxiliary_loss_mlp": 0.01004978, "balance_loss_clip": 1.00373256, "balance_loss_mlp": 1.0025847, "epoch": 0.8005411092740117, "flos": 63648612829440.0, "grad_norm": 0.801088394213269, "language_loss": 0.53820157, "learning_rate": 3.7996964529396693e-07, "loss": 0.55884326, "num_input_tokens_seen": 287286240, "router_z_loss_clip": 0.01245117, "router_z_loss_mlp": 0.21386719, "step": 13315, "time_per_iteration": 3.1257259845733643 }, { "auxiliary_loss_clip": 0.01121208, "auxiliary_loss_mlp": 0.01032259, "balance_loss_clip": 1.01987267, "balance_loss_mlp": 1.03482819, "epoch": 0.8006012325266797, "flos": 36138056774400.0, "grad_norm": 1.8539957720938625, "language_loss": 0.71345413, "learning_rate": 3.7974811582248957e-07, "loss": 0.73498881, "num_input_tokens_seen": 287310265, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 13316, "time_per_iteration": 2.666400194168091 }, { "auxiliary_loss_clip": 0.0112557, "auxiliary_loss_mlp": 0.01029653, "balance_loss_clip": 1.01752949, "balance_loss_mlp": 1.03605366, "epoch": 0.8006613557793477, "flos": 33838947970560.0, "grad_norm": 1.7104997709112395, "language_loss": 0.64512277, "learning_rate": 3.795266441736349e-07, "loss": 0.66667497, "num_input_tokens_seen": 287331610, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71875, "step": 13317, "time_per_iteration": 2.644238233566284 }, { "auxiliary_loss_clip": 0.0102429, "auxiliary_loss_mlp": 0.01000708, "balance_loss_clip": 0.99950963, "balance_loss_mlp": 1.0029546, "epoch": 0.8007214790320156, "flos": 67348310699520.0, "grad_norm": 0.7669765716247351, "language_loss": 0.59061849, "learning_rate": 3.7930523035530595e-07, "loss": 0.61086845, "num_input_tokens_seen": 287394795, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.21289062, "step": 13318, "time_per_iteration": 3.2080776691436768 }, { "auxiliary_loss_clip": 0.01127903, "auxiliary_loss_mlp": 0.01024861, "balance_loss_clip": 1.01270103, "balance_loss_mlp": 1.03382492, "epoch": 0.8007816022846836, "flos": 23185653033600.0, "grad_norm": 1.7379196570667481, "language_loss": 0.66443259, "learning_rate": 3.7908387437540437e-07, "loss": 0.68596023, "num_input_tokens_seen": 287414595, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.67578125, "step": 13319, "time_per_iteration": 2.6810648441314697 }, { "auxiliary_loss_clip": 0.01105946, "auxiliary_loss_mlp": 0.01282973, "balance_loss_clip": 1.02248156, "balance_loss_mlp": 1.03393662, "epoch": 0.8008417255373516, "flos": 21981388728960.0, "grad_norm": 1.7294393131320318, "language_loss": 0.7382952, "learning_rate": 3.7886257624182914e-07, "loss": 0.76218438, "num_input_tokens_seen": 287434395, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 13320, "time_per_iteration": 3.897864818572998 }, { "auxiliary_loss_clip": 0.01114104, "auxiliary_loss_mlp": 0.01030075, "balance_loss_clip": 1.01742077, "balance_loss_mlp": 1.03476882, "epoch": 0.8009018487900196, "flos": 16727334647040.0, "grad_norm": 1.8581544103923886, "language_loss": 0.80096161, "learning_rate": 3.786413359624796e-07, "loss": 0.82240343, "num_input_tokens_seen": 287450590, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 13321, "time_per_iteration": 2.487262725830078 }, { "auxiliary_loss_clip": 0.01143661, "auxiliary_loss_mlp": 0.01031123, "balance_loss_clip": 1.01953566, "balance_loss_mlp": 1.03183603, "epoch": 0.8009619720426875, "flos": 20813609664000.0, "grad_norm": 2.068046306707781, "language_loss": 0.66031611, "learning_rate": 3.7842015354524934e-07, "loss": 0.68206394, "num_input_tokens_seen": 287468455, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66796875, "step": 13322, "time_per_iteration": 4.084707736968994 }, { "auxiliary_loss_clip": 0.01111864, "auxiliary_loss_mlp": 0.0102797, "balance_loss_clip": 1.01594162, "balance_loss_mlp": 1.03405142, "epoch": 0.8010220952953555, "flos": 17931096161280.0, "grad_norm": 1.8422561102319692, "language_loss": 0.77768487, "learning_rate": 3.781990289980328e-07, "loss": 0.79908323, "num_input_tokens_seen": 287486485, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 13323, "time_per_iteration": 2.4758687019348145 }, { "auxiliary_loss_clip": 0.01106496, "auxiliary_loss_mlp": 0.01030493, "balance_loss_clip": 1.0181011, "balance_loss_mlp": 1.03542316, "epoch": 0.8010822185480234, "flos": 24572235795840.0, "grad_norm": 3.554812429913936, "language_loss": 0.7170819, "learning_rate": 3.779779623287209e-07, "loss": 0.73845178, "num_input_tokens_seen": 287503940, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 13324, "time_per_iteration": 2.548326253890991 }, { "auxiliary_loss_clip": 0.01133871, "auxiliary_loss_mlp": 0.01032965, "balance_loss_clip": 1.02001238, "balance_loss_mlp": 1.03472733, "epoch": 0.8011423418006914, "flos": 21829988903040.0, "grad_norm": 1.6469027412301491, "language_loss": 0.76402354, "learning_rate": 3.777569535452041e-07, "loss": 0.78569186, "num_input_tokens_seen": 287521660, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 13325, "time_per_iteration": 2.5561282634735107 }, { "auxiliary_loss_clip": 0.01106446, "auxiliary_loss_mlp": 0.01027862, "balance_loss_clip": 1.01694822, "balance_loss_mlp": 1.03230584, "epoch": 0.8012024650533593, "flos": 23915178259200.0, "grad_norm": 1.7044850211330642, "language_loss": 0.79486483, "learning_rate": 3.775360026553682e-07, "loss": 0.81620795, "num_input_tokens_seen": 287541505, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6484375, "step": 13326, "time_per_iteration": 2.58148455619812 }, { "auxiliary_loss_clip": 0.01109437, "auxiliary_loss_mlp": 0.01033629, "balance_loss_clip": 1.02224445, "balance_loss_mlp": 1.03376031, "epoch": 0.8012625883060274, "flos": 23587062497280.0, "grad_norm": 1.9270119503453684, "language_loss": 0.66000915, "learning_rate": 3.7731510966709836e-07, "loss": 0.68143982, "num_input_tokens_seen": 287560015, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.66796875, "step": 13327, "time_per_iteration": 2.53027081489563 }, { "auxiliary_loss_clip": 0.01107928, "auxiliary_loss_mlp": 0.01031082, "balance_loss_clip": 1.01942372, "balance_loss_mlp": 1.03393579, "epoch": 0.8013227115586953, "flos": 20813932886400.0, "grad_norm": 2.2581394867849793, "language_loss": 0.73588246, "learning_rate": 3.7709427458827925e-07, "loss": 0.75727254, "num_input_tokens_seen": 287579150, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.65234375, "step": 13328, "time_per_iteration": 2.5421242713928223 }, { "auxiliary_loss_clip": 0.01142285, "auxiliary_loss_mlp": 0.0103202, "balance_loss_clip": 1.01878202, "balance_loss_mlp": 1.03455114, "epoch": 0.8013828348113633, "flos": 15888317788800.0, "grad_norm": 1.9296157035491386, "language_loss": 0.73966384, "learning_rate": 3.7687349742678977e-07, "loss": 0.7614069, "num_input_tokens_seen": 287597420, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 13329, "time_per_iteration": 2.513298749923706 }, { "auxiliary_loss_clip": 0.01032535, "auxiliary_loss_mlp": 0.01001071, "balance_loss_clip": 0.99979508, "balance_loss_mlp": 1.00235963, "epoch": 0.8014429580640313, "flos": 71062981562880.0, "grad_norm": 0.6942849777940794, "language_loss": 0.5296731, "learning_rate": 3.7665277819051067e-07, "loss": 0.55000913, "num_input_tokens_seen": 287667280, "router_z_loss_clip": 0.01275635, "router_z_loss_mlp": 0.21289062, "step": 13330, "time_per_iteration": 3.1851134300231934 }, { "auxiliary_loss_clip": 0.0112404, "auxiliary_loss_mlp": 0.01034654, "balance_loss_clip": 1.02083719, "balance_loss_mlp": 1.03383076, "epoch": 0.8015030813166992, "flos": 23076340968960.0, "grad_norm": 1.770553584059561, "language_loss": 0.72847974, "learning_rate": 3.7643211688731814e-07, "loss": 0.75006676, "num_input_tokens_seen": 287687375, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 13331, "time_per_iteration": 2.5541319847106934 }, { "auxiliary_loss_clip": 0.01111069, "auxiliary_loss_mlp": 0.01028367, "balance_loss_clip": 1.0175426, "balance_loss_mlp": 1.03384352, "epoch": 0.8015632045693672, "flos": 21872328059520.0, "grad_norm": 1.7922005943741401, "language_loss": 0.76850021, "learning_rate": 3.7621151352508693e-07, "loss": 0.78989452, "num_input_tokens_seen": 287707895, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.6796875, "step": 13332, "time_per_iteration": 2.566391706466675 }, { "auxiliary_loss_clip": 0.01104484, "auxiliary_loss_mlp": 0.01028601, "balance_loss_clip": 1.01638222, "balance_loss_mlp": 1.03534627, "epoch": 0.8016233278220352, "flos": 23656728925440.0, "grad_norm": 1.7598613534177705, "language_loss": 0.8347137, "learning_rate": 3.759909681116895e-07, "loss": 0.85604465, "num_input_tokens_seen": 287723990, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 13333, "time_per_iteration": 2.5657804012298584 }, { "auxiliary_loss_clip": 0.01120875, "auxiliary_loss_mlp": 0.01029066, "balance_loss_clip": 1.01663876, "balance_loss_mlp": 1.03336716, "epoch": 0.8016834510747032, "flos": 16253170185600.0, "grad_norm": 1.6980792261649675, "language_loss": 0.73592579, "learning_rate": 3.7577048065499727e-07, "loss": 0.75742519, "num_input_tokens_seen": 287742380, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 13334, "time_per_iteration": 2.5705623626708984 }, { "auxiliary_loss_clip": 0.01128247, "auxiliary_loss_mlp": 0.01028086, "balance_loss_clip": 1.01580167, "balance_loss_mlp": 1.03516126, "epoch": 0.8017435743273711, "flos": 12276027665280.0, "grad_norm": 1.8201885674419027, "language_loss": 0.74483919, "learning_rate": 3.7555005116287885e-07, "loss": 0.76640254, "num_input_tokens_seen": 287760130, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6640625, "step": 13335, "time_per_iteration": 2.601091146469116 }, { "auxiliary_loss_clip": 0.0112722, "auxiliary_loss_mlp": 0.01027463, "balance_loss_clip": 1.01464796, "balance_loss_mlp": 1.03227293, "epoch": 0.8018036975800391, "flos": 17196112068480.0, "grad_norm": 1.7236536913439795, "language_loss": 0.75508207, "learning_rate": 3.753296796432004e-07, "loss": 0.77662885, "num_input_tokens_seen": 287777565, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 13336, "time_per_iteration": 2.55940318107605 }, { "auxiliary_loss_clip": 0.01137571, "auxiliary_loss_mlp": 0.01035678, "balance_loss_clip": 1.02230287, "balance_loss_mlp": 1.03724229, "epoch": 0.801863820832707, "flos": 20631865824000.0, "grad_norm": 2.174440193558865, "language_loss": 0.75340766, "learning_rate": 3.75109366103826e-07, "loss": 0.77514017, "num_input_tokens_seen": 287796310, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 13337, "time_per_iteration": 2.5941977500915527 }, { "auxiliary_loss_clip": 0.01152573, "auxiliary_loss_mlp": 0.01280681, "balance_loss_clip": 1.02064538, "balance_loss_mlp": 1.03629851, "epoch": 0.801923944085375, "flos": 30445569285120.0, "grad_norm": 2.553726565563549, "language_loss": 0.80174756, "learning_rate": 3.7488911055261974e-07, "loss": 0.82608008, "num_input_tokens_seen": 287817330, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 13338, "time_per_iteration": 2.637667179107666 }, { "auxiliary_loss_clip": 0.01130258, "auxiliary_loss_mlp": 0.01030109, "balance_loss_clip": 1.01698446, "balance_loss_mlp": 1.03293407, "epoch": 0.801984067338043, "flos": 20010575255040.0, "grad_norm": 1.7269889189952894, "language_loss": 0.74233377, "learning_rate": 3.746689129974396e-07, "loss": 0.76393741, "num_input_tokens_seen": 287835095, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 13339, "time_per_iteration": 2.5738747119903564 }, { "auxiliary_loss_clip": 0.01108745, "auxiliary_loss_mlp": 0.01028035, "balance_loss_clip": 1.01641822, "balance_loss_mlp": 1.03360963, "epoch": 0.802044190590711, "flos": 22784028088320.0, "grad_norm": 1.6227466121600396, "language_loss": 0.79038751, "learning_rate": 3.744487734461459e-07, "loss": 0.81175536, "num_input_tokens_seen": 287854595, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66015625, "step": 13340, "time_per_iteration": 2.617159843444824 }, { "auxiliary_loss_clip": 0.01129445, "auxiliary_loss_mlp": 0.01029668, "balance_loss_clip": 1.01797962, "balance_loss_mlp": 1.03388989, "epoch": 0.8021043138433789, "flos": 27600115639680.0, "grad_norm": 1.9104067937463176, "language_loss": 0.67808235, "learning_rate": 3.7422869190659376e-07, "loss": 0.69967341, "num_input_tokens_seen": 287876960, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.69140625, "step": 13341, "time_per_iteration": 2.611812114715576 }, { "auxiliary_loss_clip": 0.01041842, "auxiliary_loss_mlp": 0.01001325, "balance_loss_clip": 1.00007296, "balance_loss_mlp": 1.00229692, "epoch": 0.8021644370960469, "flos": 62063730057600.0, "grad_norm": 0.8106244010774617, "language_loss": 0.61679876, "learning_rate": 3.740086683866379e-07, "loss": 0.6372304, "num_input_tokens_seen": 287936530, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.21289062, "step": 13342, "time_per_iteration": 3.0799221992492676 }, { "auxiliary_loss_clip": 0.01124074, "auxiliary_loss_mlp": 0.01035592, "balance_loss_clip": 1.02269351, "balance_loss_mlp": 1.03672934, "epoch": 0.8022245603487148, "flos": 23361794352000.0, "grad_norm": 1.772883483523048, "language_loss": 0.63871408, "learning_rate": 3.7378870289413e-07, "loss": 0.66031069, "num_input_tokens_seen": 287954285, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 13343, "time_per_iteration": 2.535937786102295 }, { "auxiliary_loss_clip": 0.01116564, "auxiliary_loss_mlp": 0.01026192, "balance_loss_clip": 1.0148257, "balance_loss_mlp": 1.03320146, "epoch": 0.8022846836013828, "flos": 10853354712960.0, "grad_norm": 1.8356237074298678, "language_loss": 0.6926378, "learning_rate": 3.7356879543691955e-07, "loss": 0.71406537, "num_input_tokens_seen": 287971595, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.65625, "step": 13344, "time_per_iteration": 2.493748426437378 }, { "auxiliary_loss_clip": 0.01115264, "auxiliary_loss_mlp": 0.01029665, "balance_loss_clip": 1.01707077, "balance_loss_mlp": 1.0366993, "epoch": 0.8023448068540509, "flos": 29240443054080.0, "grad_norm": 1.5341366690146219, "language_loss": 0.70291603, "learning_rate": 3.7334894602285626e-07, "loss": 0.72436535, "num_input_tokens_seen": 287992540, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 13345, "time_per_iteration": 2.5804286003112793 }, { "auxiliary_loss_clip": 0.01041157, "auxiliary_loss_mlp": 0.00999656, "balance_loss_clip": 0.99845773, "balance_loss_mlp": 1.00186634, "epoch": 0.8024049301067188, "flos": 64153588181760.0, "grad_norm": 0.8542275277534369, "language_loss": 0.62887269, "learning_rate": 3.731291546597839e-07, "loss": 0.64928085, "num_input_tokens_seen": 288052810, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.21386719, "step": 13346, "time_per_iteration": 3.1729533672332764 }, { "auxiliary_loss_clip": 0.01120813, "auxiliary_loss_mlp": 0.01028446, "balance_loss_clip": 1.01625705, "balance_loss_mlp": 1.03591251, "epoch": 0.8024650533593868, "flos": 28585360765440.0, "grad_norm": 2.1879858529155873, "language_loss": 0.72865474, "learning_rate": 3.7290942135554747e-07, "loss": 0.75014734, "num_input_tokens_seen": 288073045, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.66796875, "step": 13347, "time_per_iteration": 2.6033754348754883 }, { "auxiliary_loss_clip": 0.01114464, "auxiliary_loss_mlp": 0.01031664, "balance_loss_clip": 1.01871181, "balance_loss_mlp": 1.03544736, "epoch": 0.8025251766120547, "flos": 16982264448000.0, "grad_norm": 1.7930633695829223, "language_loss": 0.72489357, "learning_rate": 3.7268974611798874e-07, "loss": 0.74635482, "num_input_tokens_seen": 288091165, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 13348, "time_per_iteration": 2.608886241912842 }, { "auxiliary_loss_clip": 0.01121564, "auxiliary_loss_mlp": 0.01028223, "balance_loss_clip": 1.0161829, "balance_loss_mlp": 1.03587568, "epoch": 0.8025852998647227, "flos": 22163671272960.0, "grad_norm": 2.0506450427335965, "language_loss": 0.75916749, "learning_rate": 3.724701289549468e-07, "loss": 0.7806654, "num_input_tokens_seen": 288110595, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.67578125, "step": 13349, "time_per_iteration": 3.906597137451172 }, { "auxiliary_loss_clip": 0.01106096, "auxiliary_loss_mlp": 0.01034373, "balance_loss_clip": 1.0228157, "balance_loss_mlp": 1.03711772, "epoch": 0.8026454231173906, "flos": 22017012042240.0, "grad_norm": 2.002331742184827, "language_loss": 0.83224452, "learning_rate": 3.722505698742588e-07, "loss": 0.85364914, "num_input_tokens_seen": 288128995, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.69140625, "step": 13350, "time_per_iteration": 2.543806552886963 }, { "auxiliary_loss_clip": 0.01114728, "auxiliary_loss_mlp": 0.01034757, "balance_loss_clip": 1.02291942, "balance_loss_mlp": 1.03645837, "epoch": 0.8027055463700586, "flos": 22491320158080.0, "grad_norm": 1.7757569878221162, "language_loss": 0.71229613, "learning_rate": 3.7203106888376176e-07, "loss": 0.73379099, "num_input_tokens_seen": 288149265, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 13351, "time_per_iteration": 2.5679471492767334 }, { "auxiliary_loss_clip": 0.01130472, "auxiliary_loss_mlp": 0.01026382, "balance_loss_clip": 1.0139302, "balance_loss_mlp": 1.03617024, "epoch": 0.8027656696227266, "flos": 28912901909760.0, "grad_norm": 1.5561922542511437, "language_loss": 0.6161294, "learning_rate": 3.71811625991288e-07, "loss": 0.63769794, "num_input_tokens_seen": 288170745, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.671875, "step": 13352, "time_per_iteration": 2.5782175064086914 }, { "auxiliary_loss_clip": 0.01112412, "auxiliary_loss_mlp": 0.01037161, "balance_loss_clip": 1.02478707, "balance_loss_mlp": 1.03564835, "epoch": 0.8028257928753946, "flos": 18589374760320.0, "grad_norm": 1.8539505860839514, "language_loss": 0.7679168, "learning_rate": 3.7159224120466924e-07, "loss": 0.78941256, "num_input_tokens_seen": 288189415, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.68359375, "step": 13353, "time_per_iteration": 2.4790778160095215 }, { "auxiliary_loss_clip": 0.01156725, "auxiliary_loss_mlp": 0.01029846, "balance_loss_clip": 1.01586866, "balance_loss_mlp": 1.03644085, "epoch": 0.8028859161280625, "flos": 19130009339520.0, "grad_norm": 2.4743277545452464, "language_loss": 0.73401654, "learning_rate": 3.7137291453173394e-07, "loss": 0.75588214, "num_input_tokens_seen": 288206900, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 13354, "time_per_iteration": 2.5653274059295654 }, { "auxiliary_loss_clip": 0.01131571, "auxiliary_loss_mlp": 0.01033447, "balance_loss_clip": 1.02164507, "balance_loss_mlp": 1.03396475, "epoch": 0.8029460393807305, "flos": 20229881742720.0, "grad_norm": 2.129817346242875, "language_loss": 0.65573674, "learning_rate": 3.711536459803113e-07, "loss": 0.67738694, "num_input_tokens_seen": 288224800, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.703125, "step": 13355, "time_per_iteration": 3.9514307975769043 }, { "auxiliary_loss_clip": 0.01135822, "auxiliary_loss_mlp": 0.01032341, "balance_loss_clip": 1.01870954, "balance_loss_mlp": 1.03596509, "epoch": 0.8030061626333984, "flos": 49783320933120.0, "grad_norm": 2.740554199849414, "language_loss": 0.68567502, "learning_rate": 3.7093443555822446e-07, "loss": 0.70735669, "num_input_tokens_seen": 288249400, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 13356, "time_per_iteration": 2.8195366859436035 }, { "auxiliary_loss_clip": 0.01139682, "auxiliary_loss_mlp": 0.01029135, "balance_loss_clip": 1.01678503, "balance_loss_mlp": 1.03521395, "epoch": 0.8030662858860664, "flos": 21615243442560.0, "grad_norm": 1.5200518881591278, "language_loss": 0.77552319, "learning_rate": 3.7071528327329647e-07, "loss": 0.79721129, "num_input_tokens_seen": 288268780, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 13357, "time_per_iteration": 2.594287633895874 }, { "auxiliary_loss_clip": 0.01132366, "auxiliary_loss_mlp": 0.01030079, "balance_loss_clip": 1.01663804, "balance_loss_mlp": 1.03354096, "epoch": 0.8031264091387345, "flos": 25630056351360.0, "grad_norm": 1.451777450240438, "language_loss": 0.7696811, "learning_rate": 3.704961891333498e-07, "loss": 0.79130554, "num_input_tokens_seen": 288290830, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 13358, "time_per_iteration": 2.5953526496887207 }, { "auxiliary_loss_clip": 0.01110111, "auxiliary_loss_mlp": 0.01029954, "balance_loss_clip": 1.01797998, "balance_loss_mlp": 1.03327203, "epoch": 0.8031865323914024, "flos": 19646225648640.0, "grad_norm": 1.5477123955057919, "language_loss": 0.84691751, "learning_rate": 3.7027715314620233e-07, "loss": 0.8683182, "num_input_tokens_seen": 288308865, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 13359, "time_per_iteration": 2.546957492828369 }, { "auxiliary_loss_clip": 0.01120698, "auxiliary_loss_mlp": 0.01028893, "balance_loss_clip": 1.01743102, "balance_loss_mlp": 1.03525591, "epoch": 0.8032466556440704, "flos": 26169110732160.0, "grad_norm": 3.172346527175333, "language_loss": 0.73689234, "learning_rate": 3.700581753196712e-07, "loss": 0.75838828, "num_input_tokens_seen": 288327325, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.671875, "step": 13360, "time_per_iteration": 2.537203550338745 }, { "auxiliary_loss_clip": 0.01104417, "auxiliary_loss_mlp": 0.01032673, "balance_loss_clip": 1.02025712, "balance_loss_mlp": 1.03420115, "epoch": 0.8033067788967383, "flos": 25520026014720.0, "grad_norm": 2.388891019763148, "language_loss": 0.69842249, "learning_rate": 3.698392556615706e-07, "loss": 0.71979338, "num_input_tokens_seen": 288347285, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 13361, "time_per_iteration": 2.5342037677764893 }, { "auxiliary_loss_clip": 0.01107251, "auxiliary_loss_mlp": 0.01034336, "balance_loss_clip": 1.0210855, "balance_loss_mlp": 1.03486133, "epoch": 0.8033669021494063, "flos": 24024274842240.0, "grad_norm": 1.455882181854585, "language_loss": 0.70215809, "learning_rate": 3.696203941797147e-07, "loss": 0.72357392, "num_input_tokens_seen": 288367785, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 13362, "time_per_iteration": 3.9647417068481445 }, { "auxiliary_loss_clip": 0.01111607, "auxiliary_loss_mlp": 0.01039326, "balance_loss_clip": 1.02491355, "balance_loss_mlp": 1.03671575, "epoch": 0.8034270254020742, "flos": 13588059749760.0, "grad_norm": 2.7819530620878035, "language_loss": 0.78377771, "learning_rate": 3.6940159088191213e-07, "loss": 0.805287, "num_input_tokens_seen": 288384135, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.74609375, "step": 13363, "time_per_iteration": 3.9356589317321777 }, { "auxiliary_loss_clip": 0.01112997, "auxiliary_loss_mlp": 0.01028679, "balance_loss_clip": 1.01558995, "balance_loss_mlp": 1.03451204, "epoch": 0.8034871486547422, "flos": 27412661537280.0, "grad_norm": 2.2915930437711114, "language_loss": 0.75000143, "learning_rate": 3.6918284577597294e-07, "loss": 0.77141821, "num_input_tokens_seen": 288403805, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 13364, "time_per_iteration": 2.576889991760254 }, { "auxiliary_loss_clip": 0.01115553, "auxiliary_loss_mlp": 0.01030535, "balance_loss_clip": 1.01877499, "balance_loss_mlp": 1.03778005, "epoch": 0.8035472719074102, "flos": 32598593475840.0, "grad_norm": 1.9379866549008078, "language_loss": 0.59816289, "learning_rate": 3.6896415886970276e-07, "loss": 0.61962378, "num_input_tokens_seen": 288424895, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 13365, "time_per_iteration": 2.577404737472534 }, { "auxiliary_loss_clip": 0.01125238, "auxiliary_loss_mlp": 0.01033709, "balance_loss_clip": 1.02067983, "balance_loss_mlp": 1.03624439, "epoch": 0.8036073951600782, "flos": 21287989607040.0, "grad_norm": 2.114511511600704, "language_loss": 0.66413879, "learning_rate": 3.6874553017090637e-07, "loss": 0.68572831, "num_input_tokens_seen": 288443865, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 13366, "time_per_iteration": 2.4950637817382812 }, { "auxiliary_loss_clip": 0.01102052, "auxiliary_loss_mlp": 0.01030322, "balance_loss_clip": 1.01840103, "balance_loss_mlp": 1.03404784, "epoch": 0.8036675184127461, "flos": 18113845582080.0, "grad_norm": 2.4061239038061353, "language_loss": 0.74689996, "learning_rate": 3.6852695968738546e-07, "loss": 0.76822376, "num_input_tokens_seen": 288461065, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 13367, "time_per_iteration": 2.4810233116149902 }, { "auxiliary_loss_clip": 0.01101929, "auxiliary_loss_mlp": 0.01025208, "balance_loss_clip": 1.01365674, "balance_loss_mlp": 1.03390098, "epoch": 0.8037276416654141, "flos": 26030280666240.0, "grad_norm": 2.1875510857916076, "language_loss": 0.74024886, "learning_rate": 3.683084474269411e-07, "loss": 0.76152027, "num_input_tokens_seen": 288481865, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 13368, "time_per_iteration": 2.555201768875122 }, { "auxiliary_loss_clip": 0.0110611, "auxiliary_loss_mlp": 0.01033209, "balance_loss_clip": 1.02103209, "balance_loss_mlp": 1.03484011, "epoch": 0.803787764918082, "flos": 18802180886400.0, "grad_norm": 1.9757406527145684, "language_loss": 0.70257926, "learning_rate": 3.6808999339737136e-07, "loss": 0.72397244, "num_input_tokens_seen": 288499345, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7109375, "step": 13369, "time_per_iteration": 2.4794254302978516 }, { "auxiliary_loss_clip": 0.01118574, "auxiliary_loss_mlp": 0.01030244, "balance_loss_clip": 1.01847267, "balance_loss_mlp": 1.03305387, "epoch": 0.80384788817075, "flos": 20225787592320.0, "grad_norm": 1.620185670435021, "language_loss": 0.73396415, "learning_rate": 3.6787159760647167e-07, "loss": 0.75545233, "num_input_tokens_seen": 288517660, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.67578125, "step": 13370, "time_per_iteration": 2.5864131450653076 }, { "auxiliary_loss_clip": 0.01124742, "auxiliary_loss_mlp": 0.0103457, "balance_loss_clip": 1.02086115, "balance_loss_mlp": 1.03519118, "epoch": 0.8039080114234181, "flos": 18515290959360.0, "grad_norm": 2.4783079809338413, "language_loss": 0.87484527, "learning_rate": 3.6765326006203633e-07, "loss": 0.89643836, "num_input_tokens_seen": 288534180, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 13371, "time_per_iteration": 2.4935574531555176 }, { "auxiliary_loss_clip": 0.01117068, "auxiliary_loss_mlp": 0.01036167, "balance_loss_clip": 1.02259541, "balance_loss_mlp": 1.03448224, "epoch": 0.803968134676086, "flos": 22382510883840.0, "grad_norm": 1.7477926054633304, "language_loss": 0.74663073, "learning_rate": 3.674349807718573e-07, "loss": 0.76816308, "num_input_tokens_seen": 288553350, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 13372, "time_per_iteration": 2.5196146965026855 }, { "auxiliary_loss_clip": 0.0113894, "auxiliary_loss_mlp": 0.01029935, "balance_loss_clip": 1.01683962, "balance_loss_mlp": 1.03422403, "epoch": 0.804028257928754, "flos": 23842566915840.0, "grad_norm": 2.1136724065454704, "language_loss": 0.79655528, "learning_rate": 3.6721675974372433e-07, "loss": 0.81824398, "num_input_tokens_seen": 288571325, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69140625, "step": 13373, "time_per_iteration": 2.6106505393981934 }, { "auxiliary_loss_clip": 0.01119164, "auxiliary_loss_mlp": 0.01034264, "balance_loss_clip": 1.02280164, "balance_loss_mlp": 1.03411508, "epoch": 0.8040883811814219, "flos": 23550720912000.0, "grad_norm": 1.6093414275682794, "language_loss": 0.74407971, "learning_rate": 3.6699859698542477e-07, "loss": 0.76561397, "num_input_tokens_seen": 288592100, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.671875, "step": 13374, "time_per_iteration": 2.5773329734802246 }, { "auxiliary_loss_clip": 0.01141019, "auxiliary_loss_mlp": 0.01033083, "balance_loss_clip": 1.02001786, "balance_loss_mlp": 1.03382432, "epoch": 0.8041485044340899, "flos": 19026263882880.0, "grad_norm": 2.5461338465333183, "language_loss": 0.68096304, "learning_rate": 3.6678049250474576e-07, "loss": 0.70270407, "num_input_tokens_seen": 288612305, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 13375, "time_per_iteration": 2.5419299602508545 }, { "auxiliary_loss_clip": 0.01049676, "auxiliary_loss_mlp": 0.01001886, "balance_loss_clip": 1.00061691, "balance_loss_mlp": 1.00196576, "epoch": 0.8042086276867578, "flos": 70005663797760.0, "grad_norm": 0.7226375271100812, "language_loss": 0.55660868, "learning_rate": 3.665624463094685e-07, "loss": 0.5771243, "num_input_tokens_seen": 288676015, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.21484375, "step": 13376, "time_per_iteration": 3.2269320487976074 }, { "auxiliary_loss_clip": 0.01142674, "auxiliary_loss_mlp": 0.01029697, "balance_loss_clip": 1.01710296, "balance_loss_mlp": 1.0353632, "epoch": 0.8042687509394258, "flos": 21872435800320.0, "grad_norm": 1.6602492675053422, "language_loss": 0.73167837, "learning_rate": 3.663444584073767e-07, "loss": 0.75340205, "num_input_tokens_seen": 288696455, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 13377, "time_per_iteration": 2.5446078777313232 }, { "auxiliary_loss_clip": 0.01112078, "auxiliary_loss_mlp": 0.01030463, "balance_loss_clip": 1.0179286, "balance_loss_mlp": 1.03277779, "epoch": 0.8043288741920938, "flos": 26614870513920.0, "grad_norm": 1.7422420409201096, "language_loss": 0.697586, "learning_rate": 3.6612652880624827e-07, "loss": 0.71901143, "num_input_tokens_seen": 288715560, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 13378, "time_per_iteration": 2.520247220993042 }, { "auxiliary_loss_clip": 0.01114648, "auxiliary_loss_mlp": 0.01027318, "balance_loss_clip": 1.01475894, "balance_loss_mlp": 1.03489542, "epoch": 0.8043889974447618, "flos": 33403387651200.0, "grad_norm": 1.4568012199365448, "language_loss": 0.69309223, "learning_rate": 3.6590865751386214e-07, "loss": 0.71451193, "num_input_tokens_seen": 288739485, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 13379, "time_per_iteration": 2.6794588565826416 }, { "auxiliary_loss_clip": 0.01108973, "auxiliary_loss_mlp": 0.01032374, "balance_loss_clip": 1.01866508, "balance_loss_mlp": 1.03659701, "epoch": 0.8044491206974297, "flos": 20375966355840.0, "grad_norm": 1.7467195858437872, "language_loss": 0.76593417, "learning_rate": 3.656908445379918e-07, "loss": 0.78734767, "num_input_tokens_seen": 288757420, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 13380, "time_per_iteration": 2.5161988735198975 }, { "auxiliary_loss_clip": 0.01137883, "auxiliary_loss_mlp": 0.01030013, "balance_loss_clip": 1.01801515, "balance_loss_mlp": 1.03335357, "epoch": 0.8045092439500977, "flos": 23403810286080.0, "grad_norm": 1.960547091372801, "language_loss": 0.6896072, "learning_rate": 3.6547308988641155e-07, "loss": 0.71128619, "num_input_tokens_seen": 288775535, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 13381, "time_per_iteration": 2.5667638778686523 }, { "auxiliary_loss_clip": 0.01115759, "auxiliary_loss_mlp": 0.01034094, "balance_loss_clip": 1.02166009, "balance_loss_mlp": 1.0376184, "epoch": 0.8045693672027656, "flos": 24097245321600.0, "grad_norm": 1.7800463643405096, "language_loss": 0.63509595, "learning_rate": 3.6525539356689225e-07, "loss": 0.65659451, "num_input_tokens_seen": 288795035, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 13382, "time_per_iteration": 2.5293943881988525 }, { "auxiliary_loss_clip": 0.01121823, "auxiliary_loss_mlp": 0.01036486, "balance_loss_clip": 1.02482772, "balance_loss_mlp": 1.03599429, "epoch": 0.8046294904554336, "flos": 27707165147520.0, "grad_norm": 3.998380457558698, "language_loss": 0.76154482, "learning_rate": 3.6503775558720305e-07, "loss": 0.7831279, "num_input_tokens_seen": 288816270, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.68359375, "step": 13383, "time_per_iteration": 2.5657289028167725 }, { "auxiliary_loss_clip": 0.01112182, "auxiliary_loss_mlp": 0.01034046, "balance_loss_clip": 1.02229142, "balance_loss_mlp": 1.03417706, "epoch": 0.8046896137081017, "flos": 24972998814720.0, "grad_norm": 1.7155744673498576, "language_loss": 0.69920558, "learning_rate": 3.648201759551102e-07, "loss": 0.72066784, "num_input_tokens_seen": 288836050, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69140625, "step": 13384, "time_per_iteration": 2.538160562515259 }, { "auxiliary_loss_clip": 0.01114085, "auxiliary_loss_mlp": 0.01034571, "balance_loss_clip": 1.02069473, "balance_loss_mlp": 1.03342688, "epoch": 0.8047497369607696, "flos": 17675484001920.0, "grad_norm": 2.5285975536378165, "language_loss": 0.79545921, "learning_rate": 3.646026546783796e-07, "loss": 0.81694579, "num_input_tokens_seen": 288852900, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 13385, "time_per_iteration": 2.4673619270324707 }, { "auxiliary_loss_clip": 0.01104281, "auxiliary_loss_mlp": 0.01033603, "balance_loss_clip": 1.02074051, "balance_loss_mlp": 1.03440666, "epoch": 0.8048098602134376, "flos": 22382079920640.0, "grad_norm": 2.5866754858625987, "language_loss": 0.72073692, "learning_rate": 3.64385191764774e-07, "loss": 0.74211574, "num_input_tokens_seen": 288872625, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 13386, "time_per_iteration": 2.481586456298828 }, { "auxiliary_loss_clip": 0.01040442, "auxiliary_loss_mlp": 0.00999916, "balance_loss_clip": 0.99866468, "balance_loss_mlp": 1.00145805, "epoch": 0.8048699834661055, "flos": 71200949702400.0, "grad_norm": 0.6756188150574042, "language_loss": 0.51258749, "learning_rate": 3.641677872220525e-07, "loss": 0.53299099, "num_input_tokens_seen": 288939180, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.21484375, "step": 13387, "time_per_iteration": 3.1591010093688965 }, { "auxiliary_loss_clip": 0.01103718, "auxiliary_loss_mlp": 0.01034647, "balance_loss_clip": 1.02120006, "balance_loss_mlp": 1.03389084, "epoch": 0.8049301067187735, "flos": 23660320285440.0, "grad_norm": 1.841782067487208, "language_loss": 0.73482329, "learning_rate": 3.639504410579752e-07, "loss": 0.75620699, "num_input_tokens_seen": 288958925, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 13388, "time_per_iteration": 2.5800395011901855 }, { "auxiliary_loss_clip": 0.01127802, "auxiliary_loss_mlp": 0.0102721, "balance_loss_clip": 1.01514566, "balance_loss_mlp": 1.03354311, "epoch": 0.8049902299714414, "flos": 24426330750720.0, "grad_norm": 2.364114129103909, "language_loss": 0.71550572, "learning_rate": 3.6373315328029855e-07, "loss": 0.73705578, "num_input_tokens_seen": 288980935, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 13389, "time_per_iteration": 2.5423295497894287 }, { "auxiliary_loss_clip": 0.01123695, "auxiliary_loss_mlp": 0.0103549, "balance_loss_clip": 1.02293122, "balance_loss_mlp": 1.03658783, "epoch": 0.8050503532241094, "flos": 17492626840320.0, "grad_norm": 2.0005422088161593, "language_loss": 0.82627118, "learning_rate": 3.6351592389677644e-07, "loss": 0.84786296, "num_input_tokens_seen": 288996780, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 13390, "time_per_iteration": 3.9339139461517334 }, { "auxiliary_loss_clip": 0.01032232, "auxiliary_loss_mlp": 0.01000921, "balance_loss_clip": 0.99960393, "balance_loss_mlp": 1.00178647, "epoch": 0.8051104764767774, "flos": 57658030369920.0, "grad_norm": 0.7744697873829437, "language_loss": 0.55534041, "learning_rate": 3.63298752915161e-07, "loss": 0.57567203, "num_input_tokens_seen": 289057590, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.21289062, "step": 13391, "time_per_iteration": 3.081707000732422 }, { "auxiliary_loss_clip": 0.01111187, "auxiliary_loss_mlp": 0.01029463, "balance_loss_clip": 1.01744092, "balance_loss_mlp": 1.03527284, "epoch": 0.8051705997294454, "flos": 18003456109440.0, "grad_norm": 1.554830174390552, "language_loss": 0.847628, "learning_rate": 3.630816403432042e-07, "loss": 0.86903453, "num_input_tokens_seen": 289076285, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 13392, "time_per_iteration": 2.4891698360443115 }, { "auxiliary_loss_clip": 0.01129527, "auxiliary_loss_mlp": 0.01029457, "balance_loss_clip": 1.01760149, "balance_loss_mlp": 1.03320456, "epoch": 0.8052307229821133, "flos": 26397754755840.0, "grad_norm": 1.6019279307121337, "language_loss": 0.70278525, "learning_rate": 3.628645861886517e-07, "loss": 0.72437501, "num_input_tokens_seen": 289097585, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 13393, "time_per_iteration": 2.6427907943725586 }, { "auxiliary_loss_clip": 0.01113959, "auxiliary_loss_mlp": 0.01030571, "balance_loss_clip": 1.01749992, "balance_loss_mlp": 1.03476119, "epoch": 0.8052908462347813, "flos": 21757018423680.0, "grad_norm": 1.8667584984895345, "language_loss": 0.76113778, "learning_rate": 3.6264759045925166e-07, "loss": 0.78258306, "num_input_tokens_seen": 289116890, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 13394, "time_per_iteration": 2.5504212379455566 }, { "auxiliary_loss_clip": 0.01106586, "auxiliary_loss_mlp": 0.01028538, "balance_loss_clip": 1.01636684, "balance_loss_mlp": 1.03619909, "epoch": 0.8053509694874492, "flos": 25442279026560.0, "grad_norm": 1.559648275783798, "language_loss": 0.6520673, "learning_rate": 3.6243065316274656e-07, "loss": 0.67341852, "num_input_tokens_seen": 289136670, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 13395, "time_per_iteration": 2.545804262161255 }, { "auxiliary_loss_clip": 0.01032237, "auxiliary_loss_mlp": 0.01000448, "balance_loss_clip": 0.99922037, "balance_loss_mlp": 1.00170612, "epoch": 0.8054110927401172, "flos": 57668122091520.0, "grad_norm": 0.7422929553438238, "language_loss": 0.57275748, "learning_rate": 3.622137743068803e-07, "loss": 0.59308434, "num_input_tokens_seen": 289200150, "router_z_loss_clip": 0.01226807, "router_z_loss_mlp": 0.21289062, "step": 13396, "time_per_iteration": 3.2510528564453125 }, { "auxiliary_loss_clip": 0.0111353, "auxiliary_loss_mlp": 0.01036037, "balance_loss_clip": 1.02312613, "balance_loss_mlp": 1.03509605, "epoch": 0.8054712159927853, "flos": 19276201693440.0, "grad_norm": 1.7103772938892703, "language_loss": 0.77216899, "learning_rate": 3.6199695389939054e-07, "loss": 0.79366469, "num_input_tokens_seen": 289218125, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 13397, "time_per_iteration": 3.927943229675293 }, { "auxiliary_loss_clip": 0.0111372, "auxiliary_loss_mlp": 0.01029438, "balance_loss_clip": 1.01723135, "balance_loss_mlp": 1.03553295, "epoch": 0.8055313392454532, "flos": 17967617314560.0, "grad_norm": 1.789712356788475, "language_loss": 0.70911217, "learning_rate": 3.617801919480166e-07, "loss": 0.73054373, "num_input_tokens_seen": 289237115, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 13398, "time_per_iteration": 2.5144906044006348 }, { "auxiliary_loss_clip": 0.01119598, "auxiliary_loss_mlp": 0.01028766, "balance_loss_clip": 1.01713109, "balance_loss_mlp": 1.03364229, "epoch": 0.8055914624981212, "flos": 13478352635520.0, "grad_norm": 1.9684556308106473, "language_loss": 0.69052738, "learning_rate": 3.6156348846049345e-07, "loss": 0.71201104, "num_input_tokens_seen": 289253635, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 13399, "time_per_iteration": 2.4626877307891846 }, { "auxiliary_loss_clip": 0.01104711, "auxiliary_loss_mlp": 0.01032723, "balance_loss_clip": 1.02133226, "balance_loss_mlp": 1.03587282, "epoch": 0.8056515857507891, "flos": 13224787551360.0, "grad_norm": 4.6222406343975715, "language_loss": 0.72543615, "learning_rate": 3.61346843444555e-07, "loss": 0.7468105, "num_input_tokens_seen": 289270085, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 13400, "time_per_iteration": 2.5147132873535156 }, { "auxiliary_loss_clip": 0.01109919, "auxiliary_loss_mlp": 0.0102958, "balance_loss_clip": 1.0170033, "balance_loss_mlp": 1.03255844, "epoch": 0.8057117090034571, "flos": 23878190229120.0, "grad_norm": 1.791903169376385, "language_loss": 0.64143836, "learning_rate": 3.611302569079324e-07, "loss": 0.66283339, "num_input_tokens_seen": 289289645, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 13401, "time_per_iteration": 2.5109739303588867 }, { "auxiliary_loss_clip": 0.01114324, "auxiliary_loss_mlp": 0.01030572, "balance_loss_clip": 1.01835334, "balance_loss_mlp": 1.03458595, "epoch": 0.805771832256125, "flos": 21214300855680.0, "grad_norm": 2.2858614319535238, "language_loss": 0.83699238, "learning_rate": 3.609137288583548e-07, "loss": 0.85844135, "num_input_tokens_seen": 289306630, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 13402, "time_per_iteration": 2.514110803604126 }, { "auxiliary_loss_clip": 0.01142703, "auxiliary_loss_mlp": 0.01032989, "balance_loss_clip": 1.02120519, "balance_loss_mlp": 1.03609443, "epoch": 0.805831955508793, "flos": 17566818382080.0, "grad_norm": 2.2715276028418034, "language_loss": 0.67456877, "learning_rate": 3.606972593035511e-07, "loss": 0.69632566, "num_input_tokens_seen": 289324960, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.703125, "step": 13403, "time_per_iteration": 2.4977715015411377 }, { "auxiliary_loss_clip": 0.01130705, "auxiliary_loss_mlp": 0.01280438, "balance_loss_clip": 1.02222109, "balance_loss_mlp": 1.03700542, "epoch": 0.805892078761461, "flos": 26907542530560.0, "grad_norm": 1.591968183797944, "language_loss": 0.84885615, "learning_rate": 3.604808482512445e-07, "loss": 0.8729676, "num_input_tokens_seen": 289344980, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 13404, "time_per_iteration": 4.0655837059021 }, { "auxiliary_loss_clip": 0.01125447, "auxiliary_loss_mlp": 0.0103117, "balance_loss_clip": 1.02004743, "balance_loss_mlp": 1.03244317, "epoch": 0.805952202014129, "flos": 31506442496640.0, "grad_norm": 1.5364762819724678, "language_loss": 0.70306367, "learning_rate": 3.602644957091594e-07, "loss": 0.72462988, "num_input_tokens_seen": 289367500, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6640625, "step": 13405, "time_per_iteration": 4.1753623485565186 }, { "auxiliary_loss_clip": 0.0111237, "auxiliary_loss_mlp": 0.01025205, "balance_loss_clip": 1.01357019, "balance_loss_mlp": 1.03493845, "epoch": 0.8060123252667969, "flos": 24389953251840.0, "grad_norm": 2.760984660572077, "language_loss": 0.75504959, "learning_rate": 3.600482016850166e-07, "loss": 0.77642536, "num_input_tokens_seen": 289385930, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 13406, "time_per_iteration": 2.517998218536377 }, { "auxiliary_loss_clip": 0.01113963, "auxiliary_loss_mlp": 0.01034071, "balance_loss_clip": 1.02102947, "balance_loss_mlp": 1.03521752, "epoch": 0.8060724485194649, "flos": 23479941162240.0, "grad_norm": 1.6842771217141441, "language_loss": 0.76354969, "learning_rate": 3.5983196618653497e-07, "loss": 0.78503007, "num_input_tokens_seen": 289408025, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 13407, "time_per_iteration": 2.560683488845825 }, { "auxiliary_loss_clip": 0.01120411, "auxiliary_loss_mlp": 0.01279536, "balance_loss_clip": 1.02023172, "balance_loss_mlp": 1.03365397, "epoch": 0.8061325717721328, "flos": 18624495283200.0, "grad_norm": 1.6124509410133845, "language_loss": 0.73492289, "learning_rate": 3.596157892214309e-07, "loss": 0.75892234, "num_input_tokens_seen": 289426575, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 13408, "time_per_iteration": 2.5039031505584717 }, { "auxiliary_loss_clip": 0.01121529, "auxiliary_loss_mlp": 0.01027856, "balance_loss_clip": 1.01555347, "balance_loss_mlp": 1.0345912, "epoch": 0.8061926950248008, "flos": 23582752865280.0, "grad_norm": 2.0153560115388305, "language_loss": 0.71184385, "learning_rate": 3.5939967079742093e-07, "loss": 0.7333377, "num_input_tokens_seen": 289447760, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 13409, "time_per_iteration": 2.5740818977355957 }, { "auxiliary_loss_clip": 0.01103385, "auxiliary_loss_mlp": 0.0103031, "balance_loss_clip": 1.01791263, "balance_loss_mlp": 1.03366899, "epoch": 0.8062528182774689, "flos": 11143333209600.0, "grad_norm": 17.691447767809503, "language_loss": 0.76800621, "learning_rate": 3.591836109222155e-07, "loss": 0.78934318, "num_input_tokens_seen": 289463920, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 13410, "time_per_iteration": 2.5186617374420166 }, { "auxiliary_loss_clip": 0.01111397, "auxiliary_loss_mlp": 0.01034598, "balance_loss_clip": 1.02246857, "balance_loss_mlp": 1.03478765, "epoch": 0.8063129415301368, "flos": 22346815743360.0, "grad_norm": 2.5705601628394716, "language_loss": 0.68511868, "learning_rate": 3.589676096035268e-07, "loss": 0.70657867, "num_input_tokens_seen": 289482635, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 13411, "time_per_iteration": 2.5217185020446777 }, { "auxiliary_loss_clip": 0.01119861, "auxiliary_loss_mlp": 0.01028654, "balance_loss_clip": 1.0170486, "balance_loss_mlp": 1.03430152, "epoch": 0.8063730647828048, "flos": 23988400133760.0, "grad_norm": 1.777822118121368, "language_loss": 0.67947489, "learning_rate": 3.5875166684906224e-07, "loss": 0.70096004, "num_input_tokens_seen": 289502040, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 13412, "time_per_iteration": 2.5696263313293457 }, { "auxiliary_loss_clip": 0.01117413, "auxiliary_loss_mlp": 0.01032587, "balance_loss_clip": 1.01879478, "balance_loss_mlp": 1.03671455, "epoch": 0.8064331880354727, "flos": 21321494017920.0, "grad_norm": 1.5979861847292836, "language_loss": 0.81438446, "learning_rate": 3.5853578266653005e-07, "loss": 0.83588451, "num_input_tokens_seen": 289520740, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 13413, "time_per_iteration": 2.493239641189575 }, { "auxiliary_loss_clip": 0.01111939, "auxiliary_loss_mlp": 0.01030861, "balance_loss_clip": 1.0188446, "balance_loss_mlp": 1.03449678, "epoch": 0.8064933112881407, "flos": 19682890456320.0, "grad_norm": 5.029865741975722, "language_loss": 0.84896189, "learning_rate": 3.583199570636324e-07, "loss": 0.87038982, "num_input_tokens_seen": 289535840, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 13414, "time_per_iteration": 2.4831395149230957 }, { "auxiliary_loss_clip": 0.01120862, "auxiliary_loss_mlp": 0.01029847, "balance_loss_clip": 1.01823592, "balance_loss_mlp": 1.03428113, "epoch": 0.8065534345408086, "flos": 19279721226240.0, "grad_norm": 1.8890609716005828, "language_loss": 0.66924179, "learning_rate": 3.58104190048073e-07, "loss": 0.69074881, "num_input_tokens_seen": 289555205, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 13415, "time_per_iteration": 2.549830198287964 }, { "auxiliary_loss_clip": 0.01126011, "auxiliary_loss_mlp": 0.01025384, "balance_loss_clip": 1.01395142, "balance_loss_mlp": 1.03315127, "epoch": 0.8066135577934767, "flos": 19677718897920.0, "grad_norm": 1.576581086447987, "language_loss": 0.7648772, "learning_rate": 3.5788848162755224e-07, "loss": 0.7863912, "num_input_tokens_seen": 289573000, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 13416, "time_per_iteration": 2.590446710586548 }, { "auxiliary_loss_clip": 0.01115653, "auxiliary_loss_mlp": 0.01033322, "balance_loss_clip": 1.0216279, "balance_loss_mlp": 1.03464532, "epoch": 0.8066736810461446, "flos": 21143592933120.0, "grad_norm": 1.5796177831705647, "language_loss": 0.65270984, "learning_rate": 3.576728318097666e-07, "loss": 0.67419958, "num_input_tokens_seen": 289592625, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.71875, "step": 13417, "time_per_iteration": 2.5370826721191406 }, { "auxiliary_loss_clip": 0.01133759, "auxiliary_loss_mlp": 0.01035704, "balance_loss_clip": 1.02365184, "balance_loss_mlp": 1.03605461, "epoch": 0.8067338042988126, "flos": 22598261925120.0, "grad_norm": 1.6502491680312406, "language_loss": 0.80963838, "learning_rate": 3.5745724060241345e-07, "loss": 0.83133298, "num_input_tokens_seen": 289610780, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 13418, "time_per_iteration": 2.558116912841797 }, { "auxiliary_loss_clip": 0.01123586, "auxiliary_loss_mlp": 0.01030445, "balance_loss_clip": 1.01733828, "balance_loss_mlp": 1.03497732, "epoch": 0.8067939275514805, "flos": 16508423208960.0, "grad_norm": 1.52300065037993, "language_loss": 0.84824824, "learning_rate": 3.5724170801318623e-07, "loss": 0.86978859, "num_input_tokens_seen": 289628890, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 13419, "time_per_iteration": -0.15505647659301758 }, { "auxiliary_loss_clip": 0.01132162, "auxiliary_loss_mlp": 0.01275019, "balance_loss_clip": 1.01577282, "balance_loss_mlp": 1.03448057, "epoch": 0.8068540508041485, "flos": 28541836460160.0, "grad_norm": 1.7044033264046583, "language_loss": 0.7590484, "learning_rate": 3.570262340497767e-07, "loss": 0.78312027, "num_input_tokens_seen": 289647220, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 13420, "time_per_iteration": 2.575012445449829 }, { "auxiliary_loss_clip": 0.01110563, "auxiliary_loss_mlp": 0.01271004, "balance_loss_clip": 1.01327801, "balance_loss_mlp": 1.03386545, "epoch": 0.8069141740568164, "flos": 21652482867840.0, "grad_norm": 2.2870390197368504, "language_loss": 0.78950393, "learning_rate": 3.5681081871987417e-07, "loss": 0.81331962, "num_input_tokens_seen": 289665800, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.67578125, "step": 13421, "time_per_iteration": 2.5354933738708496 }, { "auxiliary_loss_clip": 0.01132597, "auxiliary_loss_mlp": 0.01024489, "balance_loss_clip": 1.01236534, "balance_loss_mlp": 1.03626966, "epoch": 0.8069742973094844, "flos": 17529327561600.0, "grad_norm": 1.8512171738868124, "language_loss": 0.80027878, "learning_rate": 3.565954620311673e-07, "loss": 0.82184964, "num_input_tokens_seen": 289682705, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 13422, "time_per_iteration": 2.4766178131103516 }, { "auxiliary_loss_clip": 0.01102704, "auxiliary_loss_mlp": 0.01033009, "balance_loss_clip": 1.02087927, "balance_loss_mlp": 1.03362036, "epoch": 0.8070344205621525, "flos": 23367037737600.0, "grad_norm": 1.6753563224251125, "language_loss": 0.67879266, "learning_rate": 3.563801639913411e-07, "loss": 0.70014983, "num_input_tokens_seen": 289702920, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 13423, "time_per_iteration": 2.532877206802368 }, { "auxiliary_loss_clip": 0.01123635, "auxiliary_loss_mlp": 0.01036722, "balance_loss_clip": 1.02419949, "balance_loss_mlp": 1.03521419, "epoch": 0.8070945438148204, "flos": 22930184528640.0, "grad_norm": 1.9184479490613253, "language_loss": 0.79944062, "learning_rate": 3.561649246080789e-07, "loss": 0.82104421, "num_input_tokens_seen": 289723280, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 13424, "time_per_iteration": 2.530388832092285 }, { "auxiliary_loss_clip": 0.01113686, "auxiliary_loss_mlp": 0.01027035, "balance_loss_clip": 1.01493478, "balance_loss_mlp": 1.03320229, "epoch": 0.8071546670674884, "flos": 25300683613440.0, "grad_norm": 1.5848036433194215, "language_loss": 0.78971946, "learning_rate": 3.5594974388906153e-07, "loss": 0.81112671, "num_input_tokens_seen": 289743475, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71484375, "step": 13425, "time_per_iteration": 2.6021182537078857 }, { "auxiliary_loss_clip": 0.0111853, "auxiliary_loss_mlp": 0.01029035, "balance_loss_clip": 1.01765609, "balance_loss_mlp": 1.03282893, "epoch": 0.8072147903201563, "flos": 18113701927680.0, "grad_norm": 3.162557040769132, "language_loss": 0.74976093, "learning_rate": 3.5573462184196965e-07, "loss": 0.77123666, "num_input_tokens_seen": 289761400, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6796875, "step": 13426, "time_per_iteration": 2.529315710067749 }, { "auxiliary_loss_clip": 0.01112915, "auxiliary_loss_mlp": 0.01025019, "balance_loss_clip": 1.01345515, "balance_loss_mlp": 1.03480315, "epoch": 0.8072749135728243, "flos": 26688164215680.0, "grad_norm": 1.920225496890632, "language_loss": 0.73320895, "learning_rate": 3.555195584744788e-07, "loss": 0.75458825, "num_input_tokens_seen": 289781025, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.69140625, "step": 13427, "time_per_iteration": 2.5593390464782715 }, { "auxiliary_loss_clip": 0.01146719, "auxiliary_loss_mlp": 0.01040526, "balance_loss_clip": 1.02747273, "balance_loss_mlp": 1.03340244, "epoch": 0.8073350368254922, "flos": 19240291071360.0, "grad_norm": 1.7005765625490177, "language_loss": 0.70073664, "learning_rate": 3.553045537942654e-07, "loss": 0.72260904, "num_input_tokens_seen": 289798380, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 13428, "time_per_iteration": 2.590696334838867 }, { "auxiliary_loss_clip": 0.0112153, "auxiliary_loss_mlp": 0.01028882, "balance_loss_clip": 1.01753294, "balance_loss_mlp": 1.03420496, "epoch": 0.8073951600781603, "flos": 13334530579200.0, "grad_norm": 1.9498844125290806, "language_loss": 0.74860966, "learning_rate": 3.550896078090011e-07, "loss": 0.77011371, "num_input_tokens_seen": 289814515, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6953125, "step": 13429, "time_per_iteration": 2.521907091140747 }, { "auxiliary_loss_clip": 0.01110529, "auxiliary_loss_mlp": 0.01030881, "balance_loss_clip": 1.01935911, "balance_loss_mlp": 1.034917, "epoch": 0.8074552833308282, "flos": 22853191726080.0, "grad_norm": 1.706417441568488, "language_loss": 0.66811484, "learning_rate": 3.5487472052635847e-07, "loss": 0.68952894, "num_input_tokens_seen": 289834315, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 13430, "time_per_iteration": 2.5099549293518066 }, { "auxiliary_loss_clip": 0.01130016, "auxiliary_loss_mlp": 0.01029374, "balance_loss_clip": 1.01778054, "balance_loss_mlp": 1.03429818, "epoch": 0.8075154065834962, "flos": 20339409288960.0, "grad_norm": 1.7009110705228783, "language_loss": 0.8019228, "learning_rate": 3.546598919540049e-07, "loss": 0.82351667, "num_input_tokens_seen": 289853770, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 13431, "time_per_iteration": 3.9152069091796875 }, { "auxiliary_loss_clip": 0.01118743, "auxiliary_loss_mlp": 0.01028368, "balance_loss_clip": 1.01548696, "balance_loss_mlp": 1.03355694, "epoch": 0.8075755298361641, "flos": 21908059113600.0, "grad_norm": 1.8524778592213553, "language_loss": 0.80070055, "learning_rate": 3.54445122099607e-07, "loss": 0.82217169, "num_input_tokens_seen": 289870480, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.67578125, "step": 13432, "time_per_iteration": 2.5079853534698486 }, { "auxiliary_loss_clip": 0.01139122, "auxiliary_loss_mlp": 0.0103375, "balance_loss_clip": 1.02110171, "balance_loss_mlp": 1.03904438, "epoch": 0.8076356530888321, "flos": 15669298609920.0, "grad_norm": 1.8148787681481655, "language_loss": 0.70093012, "learning_rate": 3.5423041097083075e-07, "loss": 0.72265887, "num_input_tokens_seen": 289888275, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73828125, "step": 13433, "time_per_iteration": 2.511164665222168 }, { "auxiliary_loss_clip": 0.0112138, "auxiliary_loss_mlp": 0.01029196, "balance_loss_clip": 1.01736999, "balance_loss_mlp": 1.03455234, "epoch": 0.8076957763415, "flos": 37777414521600.0, "grad_norm": 1.526651234777473, "language_loss": 0.724877, "learning_rate": 3.540157585753367e-07, "loss": 0.74638283, "num_input_tokens_seen": 289911495, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 13434, "time_per_iteration": 2.6611294746398926 }, { "auxiliary_loss_clip": 0.01131294, "auxiliary_loss_mlp": 0.01027296, "balance_loss_clip": 1.01557803, "balance_loss_mlp": 1.03686571, "epoch": 0.807755899594168, "flos": 19610781903360.0, "grad_norm": 1.8200892004262066, "language_loss": 0.67921901, "learning_rate": 3.538011649207868e-07, "loss": 0.70080489, "num_input_tokens_seen": 289930045, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.67578125, "step": 13435, "time_per_iteration": 2.4956748485565186 }, { "auxiliary_loss_clip": 0.01141916, "auxiliary_loss_mlp": 0.01034636, "balance_loss_clip": 1.02057505, "balance_loss_mlp": 1.03639638, "epoch": 0.8078160228468361, "flos": 23294893271040.0, "grad_norm": 1.8277216781814987, "language_loss": 0.74818468, "learning_rate": 3.535866300148387e-07, "loss": 0.76995021, "num_input_tokens_seen": 289950815, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7109375, "step": 13436, "time_per_iteration": 2.6247718334198 }, { "auxiliary_loss_clip": 0.01098446, "auxiliary_loss_mlp": 0.01027665, "balance_loss_clip": 1.01536262, "balance_loss_mlp": 1.03147662, "epoch": 0.807876146099504, "flos": 27162651899520.0, "grad_norm": 1.5782570300598004, "language_loss": 0.7035135, "learning_rate": 3.5337215386514883e-07, "loss": 0.72477454, "num_input_tokens_seen": 289971730, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 13437, "time_per_iteration": 2.5301761627197266 }, { "auxiliary_loss_clip": 0.0112301, "auxiliary_loss_mlp": 0.01031741, "balance_loss_clip": 1.02028513, "balance_loss_mlp": 1.03744268, "epoch": 0.807936269352172, "flos": 27160030206720.0, "grad_norm": 1.6248436813377822, "language_loss": 0.725703, "learning_rate": 3.5315773647937053e-07, "loss": 0.74725044, "num_input_tokens_seen": 289992995, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.67578125, "step": 13438, "time_per_iteration": 4.08112359046936 }, { "auxiliary_loss_clip": 0.0112025, "auxiliary_loss_mlp": 0.01030346, "balance_loss_clip": 1.01835322, "balance_loss_mlp": 1.03458214, "epoch": 0.8079963926048399, "flos": 20740423703040.0, "grad_norm": 1.9104873605717283, "language_loss": 0.76978254, "learning_rate": 3.529433778651569e-07, "loss": 0.79128855, "num_input_tokens_seen": 290009405, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 13439, "time_per_iteration": 2.625822067260742 }, { "auxiliary_loss_clip": 0.01103727, "auxiliary_loss_mlp": 0.01031327, "balance_loss_clip": 1.01960874, "balance_loss_mlp": 1.03533924, "epoch": 0.8080565158575079, "flos": 25009663622400.0, "grad_norm": 1.433195560560569, "language_loss": 0.78442109, "learning_rate": 3.527290780301575e-07, "loss": 0.80577165, "num_input_tokens_seen": 290031085, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 13440, "time_per_iteration": 2.542982816696167 }, { "auxiliary_loss_clip": 0.01121858, "auxiliary_loss_mlp": 0.01273593, "balance_loss_clip": 1.01359963, "balance_loss_mlp": 1.03360355, "epoch": 0.8081166391101758, "flos": 18698076293760.0, "grad_norm": 1.859201876705966, "language_loss": 0.59231234, "learning_rate": 3.5251483698201987e-07, "loss": 0.61626685, "num_input_tokens_seen": 290048670, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 13441, "time_per_iteration": 2.568180799484253 }, { "auxiliary_loss_clip": 0.01112789, "auxiliary_loss_mlp": 0.01029582, "balance_loss_clip": 1.01634419, "balance_loss_mlp": 1.03535318, "epoch": 0.8081767623628439, "flos": 19828651847040.0, "grad_norm": 2.1002087300670054, "language_loss": 0.76207185, "learning_rate": 3.5230065472838956e-07, "loss": 0.78349555, "num_input_tokens_seen": 290064085, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6875, "step": 13442, "time_per_iteration": 2.516850709915161 }, { "auxiliary_loss_clip": 0.01120498, "auxiliary_loss_mlp": 0.01029007, "balance_loss_clip": 1.01693714, "balance_loss_mlp": 1.03419924, "epoch": 0.8082368856155118, "flos": 35772952982400.0, "grad_norm": 1.6149097701776836, "language_loss": 0.70691186, "learning_rate": 3.5208653127691147e-07, "loss": 0.72840691, "num_input_tokens_seen": 290086255, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 13443, "time_per_iteration": 2.6793372631073 }, { "auxiliary_loss_clip": 0.01129095, "auxiliary_loss_mlp": 0.01033588, "balance_loss_clip": 1.02201283, "balance_loss_mlp": 1.03472507, "epoch": 0.8082970088681798, "flos": 17198015489280.0, "grad_norm": 1.811739244833627, "language_loss": 0.82427108, "learning_rate": 3.518724666352251e-07, "loss": 0.84589791, "num_input_tokens_seen": 290103995, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 13444, "time_per_iteration": 2.794590950012207 }, { "auxiliary_loss_clip": 0.01050219, "auxiliary_loss_mlp": 0.01249425, "balance_loss_clip": 1.00236225, "balance_loss_mlp": 1.00187922, "epoch": 0.8083571321208477, "flos": 63555207511680.0, "grad_norm": 0.8152309035165038, "language_loss": 0.5362106, "learning_rate": 3.516584608109714e-07, "loss": 0.55920702, "num_input_tokens_seen": 290157245, "router_z_loss_clip": 0.01190186, "router_z_loss_mlp": 0.21484375, "step": 13445, "time_per_iteration": 4.451337814331055 }, { "auxiliary_loss_clip": 0.01129984, "auxiliary_loss_mlp": 0.0102922, "balance_loss_clip": 1.01637506, "balance_loss_mlp": 1.03510666, "epoch": 0.8084172553735157, "flos": 17930701111680.0, "grad_norm": 2.1350566402588584, "language_loss": 0.7203908, "learning_rate": 3.514445138117872e-07, "loss": 0.74198282, "num_input_tokens_seen": 290174970, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 13446, "time_per_iteration": 2.5741798877716064 }, { "auxiliary_loss_clip": 0.01126506, "auxiliary_loss_mlp": 0.01032575, "balance_loss_clip": 1.01969433, "balance_loss_mlp": 1.03700829, "epoch": 0.8084773786261836, "flos": 24097999507200.0, "grad_norm": 1.7782843223920397, "language_loss": 0.71141243, "learning_rate": 3.512306256453077e-07, "loss": 0.73300326, "num_input_tokens_seen": 290194395, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 13447, "time_per_iteration": 4.137669086456299 }, { "auxiliary_loss_clip": 0.01149708, "auxiliary_loss_mlp": 0.0103329, "balance_loss_clip": 1.01965857, "balance_loss_mlp": 1.03426504, "epoch": 0.8085375018788516, "flos": 15588211656960.0, "grad_norm": 1.9823589409742288, "language_loss": 0.75317997, "learning_rate": 3.5101679631916593e-07, "loss": 0.77500999, "num_input_tokens_seen": 290209200, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.70703125, "step": 13448, "time_per_iteration": 2.5399866104125977 }, { "auxiliary_loss_clip": 0.01022823, "auxiliary_loss_mlp": 0.01002888, "balance_loss_clip": 1.00164211, "balance_loss_mlp": 1.00149214, "epoch": 0.8085976251315197, "flos": 67561296393600.0, "grad_norm": 0.8666901151571665, "language_loss": 0.63872069, "learning_rate": 3.5080302584099266e-07, "loss": 0.65897781, "num_input_tokens_seen": 290274565, "router_z_loss_clip": 0.01245117, "router_z_loss_mlp": 0.21289062, "step": 13449, "time_per_iteration": 3.168654203414917 }, { "auxiliary_loss_clip": 0.01114382, "auxiliary_loss_mlp": 0.01027789, "balance_loss_clip": 1.01522422, "balance_loss_mlp": 1.03692222, "epoch": 0.8086577483841876, "flos": 22561453463040.0, "grad_norm": 1.9329759641405138, "language_loss": 0.73998559, "learning_rate": 3.50589314218418e-07, "loss": 0.76140732, "num_input_tokens_seen": 290293630, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 13450, "time_per_iteration": 2.5318758487701416 }, { "auxiliary_loss_clip": 0.0111953, "auxiliary_loss_mlp": 0.01275076, "balance_loss_clip": 1.01607871, "balance_loss_mlp": 1.03307307, "epoch": 0.8087178716368556, "flos": 17968084191360.0, "grad_norm": 1.5683952149993068, "language_loss": 0.73676169, "learning_rate": 3.5037566145906695e-07, "loss": 0.7607078, "num_input_tokens_seen": 290311450, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 13451, "time_per_iteration": 2.5397112369537354 }, { "auxiliary_loss_clip": 0.01125126, "auxiliary_loss_mlp": 0.01028118, "balance_loss_clip": 1.0145874, "balance_loss_mlp": 1.0344069, "epoch": 0.8087779948895235, "flos": 21719527603200.0, "grad_norm": 1.8990071746670711, "language_loss": 0.80161786, "learning_rate": 3.501620675705659e-07, "loss": 0.82315034, "num_input_tokens_seen": 290330165, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 13452, "time_per_iteration": 2.7632086277008057 }, { "auxiliary_loss_clip": 0.01113955, "auxiliary_loss_mlp": 0.01033917, "balance_loss_clip": 1.02145374, "balance_loss_mlp": 1.03518188, "epoch": 0.8088381181421915, "flos": 29092885983360.0, "grad_norm": 1.5692524851502703, "language_loss": 0.78160441, "learning_rate": 3.499485325605367e-07, "loss": 0.80308306, "num_input_tokens_seen": 290350815, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 13453, "time_per_iteration": 2.6547367572784424 }, { "auxiliary_loss_clip": 0.01137761, "auxiliary_loss_mlp": 0.01032186, "balance_loss_clip": 1.01938319, "balance_loss_mlp": 1.03251028, "epoch": 0.8088982413948594, "flos": 22198432659840.0, "grad_norm": 2.3576817091249, "language_loss": 0.77248096, "learning_rate": 3.497350564365997e-07, "loss": 0.79418051, "num_input_tokens_seen": 290367380, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 13454, "time_per_iteration": 2.589024066925049 }, { "auxiliary_loss_clip": 0.01128481, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.02176809, "balance_loss_mlp": 1.03414345, "epoch": 0.8089583646475275, "flos": 28036717453440.0, "grad_norm": 2.1304856236186254, "language_loss": 0.7667222, "learning_rate": 3.495216392063733e-07, "loss": 0.78835166, "num_input_tokens_seen": 290387965, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 13455, "time_per_iteration": 2.603052854537964 }, { "auxiliary_loss_clip": 0.01120663, "auxiliary_loss_mlp": 0.0103105, "balance_loss_clip": 1.01902139, "balance_loss_mlp": 1.03333914, "epoch": 0.8090184879001954, "flos": 16617735273600.0, "grad_norm": 1.7708841203721715, "language_loss": 0.78578258, "learning_rate": 3.493082808774746e-07, "loss": 0.80729973, "num_input_tokens_seen": 290404150, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 13456, "time_per_iteration": 2.5727503299713135 }, { "auxiliary_loss_clip": 0.01103072, "auxiliary_loss_mlp": 0.01034001, "balance_loss_clip": 1.02235389, "balance_loss_mlp": 1.03555691, "epoch": 0.8090786111528634, "flos": 27340804379520.0, "grad_norm": 5.498762754602754, "language_loss": 0.71573544, "learning_rate": 3.4909498145751704e-07, "loss": 0.7371062, "num_input_tokens_seen": 290422370, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 13457, "time_per_iteration": 2.5640225410461426 }, { "auxiliary_loss_clip": 0.01121011, "auxiliary_loss_mlp": 0.01029798, "balance_loss_clip": 1.01809168, "balance_loss_mlp": 1.03500271, "epoch": 0.8091387344055313, "flos": 21105742976640.0, "grad_norm": 1.817680261659581, "language_loss": 0.72952867, "learning_rate": 3.4888174095411316e-07, "loss": 0.75103676, "num_input_tokens_seen": 290442645, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 13458, "time_per_iteration": 2.5632095336914062 }, { "auxiliary_loss_clip": 0.01112597, "auxiliary_loss_mlp": 0.01035551, "balance_loss_clip": 1.02312326, "balance_loss_mlp": 1.03442597, "epoch": 0.8091988576581993, "flos": 11655060318720.0, "grad_norm": 2.539951388639218, "language_loss": 0.78886998, "learning_rate": 3.4866855937487214e-07, "loss": 0.81035149, "num_input_tokens_seen": 290458520, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 13459, "time_per_iteration": 2.5600905418395996 }, { "auxiliary_loss_clip": 0.01122227, "auxiliary_loss_mlp": 0.01028259, "balance_loss_clip": 1.01555097, "balance_loss_mlp": 1.03390276, "epoch": 0.8092589809108672, "flos": 22963329803520.0, "grad_norm": 1.888915814202184, "language_loss": 0.80042446, "learning_rate": 3.484554367274035e-07, "loss": 0.82192934, "num_input_tokens_seen": 290474465, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 13460, "time_per_iteration": 2.5905613899230957 }, { "auxiliary_loss_clip": 0.01121918, "auxiliary_loss_mlp": 0.01035488, "balance_loss_clip": 1.02245808, "balance_loss_mlp": 1.03354716, "epoch": 0.8093191041635353, "flos": 13260985482240.0, "grad_norm": 1.833748093821964, "language_loss": 0.84654415, "learning_rate": 3.482423730193116e-07, "loss": 0.86811817, "num_input_tokens_seen": 290492060, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 13461, "time_per_iteration": 2.5858609676361084 }, { "auxiliary_loss_clip": 0.01114014, "auxiliary_loss_mlp": 0.01035998, "balance_loss_clip": 1.02350545, "balance_loss_mlp": 1.03488827, "epoch": 0.8093792274162032, "flos": 25516003691520.0, "grad_norm": 1.7670211995597167, "language_loss": 0.76594365, "learning_rate": 3.4802936825820006e-07, "loss": 0.78744376, "num_input_tokens_seen": 290511510, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 13462, "time_per_iteration": 2.5346405506134033 }, { "auxiliary_loss_clip": 0.01124002, "auxiliary_loss_mlp": 0.01035612, "balance_loss_clip": 1.02217114, "balance_loss_mlp": 1.03559232, "epoch": 0.8094393506688712, "flos": 23546483107200.0, "grad_norm": 1.8209691599620261, "language_loss": 0.82927608, "learning_rate": 3.47816422451672e-07, "loss": 0.85087216, "num_input_tokens_seen": 290530035, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 13463, "time_per_iteration": 2.558389663696289 }, { "auxiliary_loss_clip": 0.01102705, "auxiliary_loss_mlp": 0.01035186, "balance_loss_clip": 1.02355111, "balance_loss_mlp": 1.03469229, "epoch": 0.8094994739215392, "flos": 17055917285760.0, "grad_norm": 1.980830449490046, "language_loss": 0.6214906, "learning_rate": 3.476035356073248e-07, "loss": 0.64286959, "num_input_tokens_seen": 290548245, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 13464, "time_per_iteration": 2.492621898651123 }, { "auxiliary_loss_clip": 0.01100214, "auxiliary_loss_mlp": 0.01027834, "balance_loss_clip": 1.01653862, "balance_loss_mlp": 1.0332284, "epoch": 0.8095595971742071, "flos": 23551223702400.0, "grad_norm": 1.570853049925731, "language_loss": 0.61835867, "learning_rate": 3.473907077327576e-07, "loss": 0.63963914, "num_input_tokens_seen": 290568625, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.671875, "step": 13465, "time_per_iteration": 2.6201467514038086 }, { "auxiliary_loss_clip": 0.01113033, "auxiliary_loss_mlp": 0.01036113, "balance_loss_clip": 1.02367377, "balance_loss_mlp": 1.03676975, "epoch": 0.8096197204268751, "flos": 20373201008640.0, "grad_norm": 1.7462259036045424, "language_loss": 0.82056785, "learning_rate": 3.4717793883556444e-07, "loss": 0.84205931, "num_input_tokens_seen": 290586575, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.67578125, "step": 13466, "time_per_iteration": 2.54441237449646 }, { "auxiliary_loss_clip": 0.01126071, "auxiliary_loss_mlp": 0.0128485, "balance_loss_clip": 1.02538407, "balance_loss_mlp": 1.03705978, "epoch": 0.809679843679543, "flos": 27818775682560.0, "grad_norm": 1.736771088601647, "language_loss": 0.75386018, "learning_rate": 3.4696522892334e-07, "loss": 0.77796936, "num_input_tokens_seen": 290606790, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 13467, "time_per_iteration": 2.683135509490967 }, { "auxiliary_loss_clip": 0.01127201, "auxiliary_loss_mlp": 0.01030787, "balance_loss_clip": 1.01846719, "balance_loss_mlp": 1.03133821, "epoch": 0.8097399669322111, "flos": 22014103040640.0, "grad_norm": 1.6893791559741043, "language_loss": 0.79344791, "learning_rate": 3.467525780036735e-07, "loss": 0.81502777, "num_input_tokens_seen": 290625525, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 13468, "time_per_iteration": 2.5826416015625 }, { "auxiliary_loss_clip": 0.01108249, "auxiliary_loss_mlp": 0.01028813, "balance_loss_clip": 1.01727986, "balance_loss_mlp": 1.03327346, "epoch": 0.809800090184879, "flos": 18988988544000.0, "grad_norm": 1.9188465862017299, "language_loss": 0.68055624, "learning_rate": 3.4653998608415535e-07, "loss": 0.70192689, "num_input_tokens_seen": 290644935, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66015625, "step": 13469, "time_per_iteration": 2.602205514907837 }, { "auxiliary_loss_clip": 0.01049646, "auxiliary_loss_mlp": 0.01001427, "balance_loss_clip": 1.00015724, "balance_loss_mlp": 1.00184989, "epoch": 0.809860213437547, "flos": 66099516508800.0, "grad_norm": 0.7032351805497099, "language_loss": 0.54674286, "learning_rate": 3.4632745317237187e-07, "loss": 0.56725353, "num_input_tokens_seen": 290710735, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.21386719, "step": 13470, "time_per_iteration": 3.399698257446289 }, { "auxiliary_loss_clip": 0.01102501, "auxiliary_loss_mlp": 0.01027884, "balance_loss_clip": 1.0164398, "balance_loss_mlp": 1.03405118, "epoch": 0.8099203366902149, "flos": 20882485992960.0, "grad_norm": 2.7060981909798913, "language_loss": 0.69243395, "learning_rate": 3.4611497927590794e-07, "loss": 0.71373779, "num_input_tokens_seen": 290729565, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.68359375, "step": 13471, "time_per_iteration": 2.6801862716674805 }, { "auxiliary_loss_clip": 0.01111744, "auxiliary_loss_mlp": 0.0102803, "balance_loss_clip": 1.01607966, "balance_loss_mlp": 1.03439415, "epoch": 0.8099804599428829, "flos": 26030927111040.0, "grad_norm": 1.7406335121884553, "language_loss": 0.79832906, "learning_rate": 3.4590256440234544e-07, "loss": 0.81972682, "num_input_tokens_seen": 290749360, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 13472, "time_per_iteration": 2.696291208267212 }, { "auxiliary_loss_clip": 0.01032011, "auxiliary_loss_mlp": 0.01004454, "balance_loss_clip": 1.00321984, "balance_loss_mlp": 1.0014745, "epoch": 0.8100405831955508, "flos": 69303573584640.0, "grad_norm": 0.7559702074116897, "language_loss": 0.57835728, "learning_rate": 3.4569020855926614e-07, "loss": 0.59872192, "num_input_tokens_seen": 290812145, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.21289062, "step": 13473, "time_per_iteration": 4.542537689208984 }, { "auxiliary_loss_clip": 0.01140617, "auxiliary_loss_mlp": 0.01029471, "balance_loss_clip": 1.01733589, "balance_loss_mlp": 1.03486228, "epoch": 0.8101007064482189, "flos": 15012492468480.0, "grad_norm": 8.688032524617302, "language_loss": 0.74087203, "learning_rate": 3.454779117542479e-07, "loss": 0.76257288, "num_input_tokens_seen": 290829845, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 13474, "time_per_iteration": 2.6533937454223633 }, { "auxiliary_loss_clip": 0.01116106, "auxiliary_loss_mlp": 0.01037369, "balance_loss_clip": 1.02534103, "balance_loss_mlp": 1.03588641, "epoch": 0.8101608297008868, "flos": 21067210661760.0, "grad_norm": 1.7280746098002955, "language_loss": 0.78909254, "learning_rate": 3.452656739948672e-07, "loss": 0.81062734, "num_input_tokens_seen": 290848815, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 13475, "time_per_iteration": 2.621188163757324 }, { "auxiliary_loss_clip": 0.01110669, "auxiliary_loss_mlp": 0.01039259, "balance_loss_clip": 1.02733803, "balance_loss_mlp": 1.03406024, "epoch": 0.8102209529535548, "flos": 23731279603200.0, "grad_norm": 1.5940968012476495, "language_loss": 0.75324321, "learning_rate": 3.4505349528869785e-07, "loss": 0.77474248, "num_input_tokens_seen": 290868580, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 13476, "time_per_iteration": 2.650301218032837 }, { "auxiliary_loss_clip": 0.01123833, "auxiliary_loss_mlp": 0.01036744, "balance_loss_clip": 1.02350628, "balance_loss_mlp": 1.03443551, "epoch": 0.8102810762062228, "flos": 10955879107200.0, "grad_norm": 2.4269456593765257, "language_loss": 0.734972, "learning_rate": 3.448413756433124e-07, "loss": 0.75657773, "num_input_tokens_seen": 290883540, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 13477, "time_per_iteration": 2.554809808731079 }, { "auxiliary_loss_clip": 0.01120174, "auxiliary_loss_mlp": 0.01033364, "balance_loss_clip": 1.02141345, "balance_loss_mlp": 1.03391111, "epoch": 0.8103411994588907, "flos": 14648825220480.0, "grad_norm": 1.891418632286608, "language_loss": 0.69898373, "learning_rate": 3.446293150662807e-07, "loss": 0.72051907, "num_input_tokens_seen": 290901560, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 13478, "time_per_iteration": 2.6999223232269287 }, { "auxiliary_loss_clip": 0.01022397, "auxiliary_loss_mlp": 0.01001767, "balance_loss_clip": 1.00054491, "balance_loss_mlp": 1.00109005, "epoch": 0.8104013227115587, "flos": 59153314665600.0, "grad_norm": 0.6856310337252797, "language_loss": 0.52169579, "learning_rate": 3.4441731356517003e-07, "loss": 0.54193747, "num_input_tokens_seen": 290959185, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.21289062, "step": 13479, "time_per_iteration": 4.454169750213623 }, { "auxiliary_loss_clip": 0.01139139, "auxiliary_loss_mlp": 0.01031643, "balance_loss_clip": 1.01903677, "balance_loss_mlp": 1.03512454, "epoch": 0.8104614459642266, "flos": 19828687760640.0, "grad_norm": 2.1007493265548653, "language_loss": 0.71071655, "learning_rate": 3.4420537114754766e-07, "loss": 0.73242438, "num_input_tokens_seen": 290979585, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 13480, "time_per_iteration": 2.6182055473327637 }, { "auxiliary_loss_clip": 0.01103659, "auxiliary_loss_mlp": 0.0102924, "balance_loss_clip": 1.0174861, "balance_loss_mlp": 1.03435194, "epoch": 0.8105215692168947, "flos": 25374264624000.0, "grad_norm": 1.8607364688082793, "language_loss": 0.7943275, "learning_rate": 3.439934878209756e-07, "loss": 0.81565654, "num_input_tokens_seen": 291000865, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 13481, "time_per_iteration": 2.743306875228882 }, { "auxiliary_loss_clip": 0.01120639, "auxiliary_loss_mlp": 0.01032916, "balance_loss_clip": 1.02062571, "balance_loss_mlp": 1.03388143, "epoch": 0.8105816924695626, "flos": 20481722974080.0, "grad_norm": 1.7539907903165783, "language_loss": 0.72248769, "learning_rate": 3.4378166359301663e-07, "loss": 0.7440232, "num_input_tokens_seen": 291018285, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 13482, "time_per_iteration": 2.673281669616699 }, { "auxiliary_loss_clip": 0.01147503, "auxiliary_loss_mlp": 0.01026496, "balance_loss_clip": 1.01372325, "balance_loss_mlp": 1.03221023, "epoch": 0.8106418157222306, "flos": 14538687143040.0, "grad_norm": 3.2330536110958934, "language_loss": 0.65625346, "learning_rate": 3.435698984712292e-07, "loss": 0.67799348, "num_input_tokens_seen": 291035745, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 13483, "time_per_iteration": 2.59946870803833 }, { "auxiliary_loss_clip": 0.01119999, "auxiliary_loss_mlp": 0.01028418, "balance_loss_clip": 1.01583004, "balance_loss_mlp": 1.03339803, "epoch": 0.8107019389748985, "flos": 22564470205440.0, "grad_norm": 2.3847728520442457, "language_loss": 0.76258045, "learning_rate": 3.433581924631723e-07, "loss": 0.78406465, "num_input_tokens_seen": 291053280, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 13484, "time_per_iteration": 2.6059062480926514 }, { "auxiliary_loss_clip": 0.01126031, "auxiliary_loss_mlp": 0.01032417, "balance_loss_clip": 1.01928568, "balance_loss_mlp": 1.03530347, "epoch": 0.8107620622275665, "flos": 19609560840960.0, "grad_norm": 1.468993909270639, "language_loss": 0.72207296, "learning_rate": 3.431465455763991e-07, "loss": 0.74365747, "num_input_tokens_seen": 291072855, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 13485, "time_per_iteration": 2.536604642868042 }, { "auxiliary_loss_clip": 0.01127592, "auxiliary_loss_mlp": 0.01024513, "balance_loss_clip": 1.01303875, "balance_loss_mlp": 1.03340495, "epoch": 0.8108221854802344, "flos": 16143498984960.0, "grad_norm": 1.9771178973099837, "language_loss": 0.7525574, "learning_rate": 3.429349578184644e-07, "loss": 0.77407849, "num_input_tokens_seen": 291090285, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.671875, "step": 13486, "time_per_iteration": 4.045328617095947 }, { "auxiliary_loss_clip": 0.01125415, "auxiliary_loss_mlp": 0.01030177, "balance_loss_clip": 1.01764262, "balance_loss_mlp": 1.03601599, "epoch": 0.8108823087329025, "flos": 21106209853440.0, "grad_norm": 1.9075388911295665, "language_loss": 0.72478807, "learning_rate": 3.427234291969183e-07, "loss": 0.74634397, "num_input_tokens_seen": 291107675, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 13487, "time_per_iteration": 2.5505828857421875 }, { "auxiliary_loss_clip": 0.01144727, "auxiliary_loss_mlp": 0.01032979, "balance_loss_clip": 1.02170217, "balance_loss_mlp": 1.0348171, "epoch": 0.8109424319855704, "flos": 29199648182400.0, "grad_norm": 1.691218748590456, "language_loss": 0.84144723, "learning_rate": 3.4251195971931025e-07, "loss": 0.86322427, "num_input_tokens_seen": 291126900, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.65625, "step": 13488, "time_per_iteration": 4.145839691162109 }, { "auxiliary_loss_clip": 0.01105101, "auxiliary_loss_mlp": 0.01030383, "balance_loss_clip": 1.01764011, "balance_loss_mlp": 1.03538859, "epoch": 0.8110025552382384, "flos": 23111856541440.0, "grad_norm": 1.8799916830980525, "language_loss": 0.73690915, "learning_rate": 3.423005493931861e-07, "loss": 0.75826395, "num_input_tokens_seen": 291145285, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 13489, "time_per_iteration": 2.5673255920410156 }, { "auxiliary_loss_clip": 0.01136428, "auxiliary_loss_mlp": 0.01276794, "balance_loss_clip": 1.01755536, "balance_loss_mlp": 1.03397393, "epoch": 0.8110626784909064, "flos": 22379961018240.0, "grad_norm": 1.564187728604566, "language_loss": 0.71587801, "learning_rate": 3.4208919822609185e-07, "loss": 0.74001014, "num_input_tokens_seen": 291163485, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.671875, "step": 13490, "time_per_iteration": 2.6917316913604736 }, { "auxiliary_loss_clip": 0.0113357, "auxiliary_loss_mlp": 0.01037598, "balance_loss_clip": 1.02397251, "balance_loss_mlp": 1.03620994, "epoch": 0.8111228017435743, "flos": 23368043318400.0, "grad_norm": 4.732250032030551, "language_loss": 0.71872407, "learning_rate": 3.4187790622556987e-07, "loss": 0.74043572, "num_input_tokens_seen": 291182215, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 13491, "time_per_iteration": 2.5641539096832275 }, { "auxiliary_loss_clip": 0.01103133, "auxiliary_loss_mlp": 0.01029924, "balance_loss_clip": 1.01784158, "balance_loss_mlp": 1.0331099, "epoch": 0.8111829249962423, "flos": 21286553063040.0, "grad_norm": 1.3667676024296527, "language_loss": 0.67697084, "learning_rate": 3.416666733991593e-07, "loss": 0.69830143, "num_input_tokens_seen": 291203145, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 13492, "time_per_iteration": 2.5530011653900146 }, { "auxiliary_loss_clip": 0.01142109, "auxiliary_loss_mlp": 0.01031393, "balance_loss_clip": 1.01890612, "balance_loss_mlp": 1.03541672, "epoch": 0.8112430482489102, "flos": 22345558767360.0, "grad_norm": 1.95463146870522, "language_loss": 0.72390652, "learning_rate": 3.414554997543999e-07, "loss": 0.74564159, "num_input_tokens_seen": 291220600, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 13493, "time_per_iteration": 2.649827241897583 }, { "auxiliary_loss_clip": 0.01113354, "auxiliary_loss_mlp": 0.01036844, "balance_loss_clip": 1.02391028, "balance_loss_mlp": 1.03450549, "epoch": 0.8113031715015783, "flos": 31138321962240.0, "grad_norm": 1.6584742527542926, "language_loss": 0.70774287, "learning_rate": 3.412443852988274e-07, "loss": 0.72924483, "num_input_tokens_seen": 291241195, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 13494, "time_per_iteration": 2.7092700004577637 }, { "auxiliary_loss_clip": 0.01115343, "auxiliary_loss_mlp": 0.01031371, "balance_loss_clip": 1.017555, "balance_loss_mlp": 1.03516078, "epoch": 0.8113632947542462, "flos": 25335445000320.0, "grad_norm": 2.0360753569819625, "language_loss": 0.7638334, "learning_rate": 3.410333300399759e-07, "loss": 0.78530055, "num_input_tokens_seen": 291258715, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 13495, "time_per_iteration": 2.562764883041382 }, { "auxiliary_loss_clip": 0.01122242, "auxiliary_loss_mlp": 0.01035483, "balance_loss_clip": 1.02300215, "balance_loss_mlp": 1.03504503, "epoch": 0.8114234180069142, "flos": 28439168411520.0, "grad_norm": 1.8528524463356812, "language_loss": 0.80107695, "learning_rate": 3.408223339853771e-07, "loss": 0.82265425, "num_input_tokens_seen": 291278030, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 13496, "time_per_iteration": 2.6269404888153076 }, { "auxiliary_loss_clip": 0.01111743, "auxiliary_loss_mlp": 0.01032485, "balance_loss_clip": 1.01932979, "balance_loss_mlp": 1.03374672, "epoch": 0.8114835412595821, "flos": 20338870584960.0, "grad_norm": 3.950747912091263, "language_loss": 0.71224433, "learning_rate": 3.4061139714256217e-07, "loss": 0.73368657, "num_input_tokens_seen": 291296740, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69140625, "step": 13497, "time_per_iteration": 2.5279064178466797 }, { "auxiliary_loss_clip": 0.01132091, "auxiliary_loss_mlp": 0.0102993, "balance_loss_clip": 1.01862836, "balance_loss_mlp": 1.03556025, "epoch": 0.8115436645122501, "flos": 22490889194880.0, "grad_norm": 1.760254725751619, "language_loss": 0.76778322, "learning_rate": 3.404005195190571e-07, "loss": 0.78940344, "num_input_tokens_seen": 291318730, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.69921875, "step": 13498, "time_per_iteration": 2.590355157852173 }, { "auxiliary_loss_clip": 0.01102573, "auxiliary_loss_mlp": 0.01035677, "balance_loss_clip": 1.02336311, "balance_loss_mlp": 1.03376329, "epoch": 0.811603787764918, "flos": 13845288021120.0, "grad_norm": 1.9512902555277123, "language_loss": 0.84134066, "learning_rate": 3.401897011223891e-07, "loss": 0.86272323, "num_input_tokens_seen": 291336755, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 13499, "time_per_iteration": 2.5672521591186523 }, { "auxiliary_loss_clip": 0.01129095, "auxiliary_loss_mlp": 0.01031148, "balance_loss_clip": 1.01776052, "balance_loss_mlp": 1.03416324, "epoch": 0.8116639110175861, "flos": 21614632911360.0, "grad_norm": 2.9360399448730234, "language_loss": 0.76298928, "learning_rate": 3.399789419600805e-07, "loss": 0.78459167, "num_input_tokens_seen": 291356795, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6796875, "step": 13500, "time_per_iteration": 2.644357919692993 }, { "auxiliary_loss_clip": 0.01040521, "auxiliary_loss_mlp": 0.01004571, "balance_loss_clip": 1.0032891, "balance_loss_mlp": 1.00147438, "epoch": 0.811724034270254, "flos": 64459799625600.0, "grad_norm": 0.8025664088558144, "language_loss": 0.59951186, "learning_rate": 3.397682420396544e-07, "loss": 0.61996275, "num_input_tokens_seen": 291416005, "router_z_loss_clip": 0.01281738, "router_z_loss_mlp": 0.21289062, "step": 13501, "time_per_iteration": 3.229804039001465 }, { "auxiliary_loss_clip": 0.01109715, "auxiliary_loss_mlp": 0.01024572, "balance_loss_clip": 1.01275826, "balance_loss_mlp": 1.03471458, "epoch": 0.811784157522922, "flos": 24754123290240.0, "grad_norm": 1.744766739211908, "language_loss": 0.78813523, "learning_rate": 3.395576013686281e-07, "loss": 0.80947816, "num_input_tokens_seen": 291434870, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 13502, "time_per_iteration": 2.6588854789733887 }, { "auxiliary_loss_clip": 0.01122215, "auxiliary_loss_mlp": 0.01032584, "balance_loss_clip": 1.02065134, "balance_loss_mlp": 1.0354166, "epoch": 0.81184428077559, "flos": 12167146563840.0, "grad_norm": 1.9481915725873753, "language_loss": 0.71026778, "learning_rate": 3.3934701995452033e-07, "loss": 0.73181581, "num_input_tokens_seen": 291452230, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 13503, "time_per_iteration": 2.660686492919922 }, { "auxiliary_loss_clip": 0.0112229, "auxiliary_loss_mlp": 0.01029937, "balance_loss_clip": 1.01688385, "balance_loss_mlp": 1.03508914, "epoch": 0.8119044040282579, "flos": 44422037775360.0, "grad_norm": 1.9529282526194536, "language_loss": 0.67593503, "learning_rate": 3.391364978048457e-07, "loss": 0.69745731, "num_input_tokens_seen": 291477425, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69140625, "step": 13504, "time_per_iteration": 2.904874086380005 }, { "auxiliary_loss_clip": 0.01121499, "auxiliary_loss_mlp": 0.01030451, "balance_loss_clip": 1.01699209, "balance_loss_mlp": 1.03269529, "epoch": 0.8119645272809259, "flos": 52155507957120.0, "grad_norm": 1.774792001751292, "language_loss": 0.74378073, "learning_rate": 3.3892603492711704e-07, "loss": 0.76530015, "num_input_tokens_seen": 291501070, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 13505, "time_per_iteration": 2.896496295928955 }, { "auxiliary_loss_clip": 0.01121732, "auxiliary_loss_mlp": 0.01027972, "balance_loss_clip": 1.01416731, "balance_loss_mlp": 1.03443813, "epoch": 0.8120246505335939, "flos": 30232978640640.0, "grad_norm": 2.013120602476509, "language_loss": 0.72710073, "learning_rate": 3.387156313288457e-07, "loss": 0.74859774, "num_input_tokens_seen": 291524945, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.703125, "step": 13506, "time_per_iteration": 2.757632255554199 }, { "auxiliary_loss_clip": 0.01108582, "auxiliary_loss_mlp": 0.01028255, "balance_loss_clip": 1.01572645, "balance_loss_mlp": 1.03561521, "epoch": 0.8120847737862619, "flos": 22127652910080.0, "grad_norm": 2.3614745609585164, "language_loss": 0.76317543, "learning_rate": 3.3850528701754e-07, "loss": 0.78454375, "num_input_tokens_seen": 291544605, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73046875, "step": 13507, "time_per_iteration": 2.6492245197296143 }, { "auxiliary_loss_clip": 0.01105575, "auxiliary_loss_mlp": 0.01027686, "balance_loss_clip": 1.01565218, "balance_loss_mlp": 1.03328085, "epoch": 0.8121448970389298, "flos": 23295180579840.0, "grad_norm": 1.6832365087544676, "language_loss": 0.70224327, "learning_rate": 3.382950020007067e-07, "loss": 0.72357589, "num_input_tokens_seen": 291563850, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.72265625, "step": 13508, "time_per_iteration": 2.6773955821990967 }, { "auxiliary_loss_clip": 0.01112727, "auxiliary_loss_mlp": 0.01035438, "balance_loss_clip": 1.02324259, "balance_loss_mlp": 1.03520095, "epoch": 0.8122050202915978, "flos": 22164138149760.0, "grad_norm": 1.789032319327011, "language_loss": 0.76167822, "learning_rate": 3.380847762858501e-07, "loss": 0.78315985, "num_input_tokens_seen": 291581730, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 13509, "time_per_iteration": 2.6137189865112305 }, { "auxiliary_loss_clip": 0.01104745, "auxiliary_loss_mlp": 0.01035183, "balance_loss_clip": 1.02261257, "balance_loss_mlp": 1.03419709, "epoch": 0.8122651435442657, "flos": 23258946735360.0, "grad_norm": 1.9160600512321435, "language_loss": 0.76980615, "learning_rate": 3.37874609880473e-07, "loss": 0.79120541, "num_input_tokens_seen": 291601225, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 13510, "time_per_iteration": 2.5747225284576416 }, { "auxiliary_loss_clip": 0.01115286, "auxiliary_loss_mlp": 0.0103552, "balance_loss_clip": 1.02244258, "balance_loss_mlp": 1.03479934, "epoch": 0.8123252667969337, "flos": 16140015365760.0, "grad_norm": 1.6532900236697186, "language_loss": 0.69774306, "learning_rate": 3.376645027920759e-07, "loss": 0.71925116, "num_input_tokens_seen": 291616995, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 13511, "time_per_iteration": 2.583381414413452 }, { "auxiliary_loss_clip": 0.01123934, "auxiliary_loss_mlp": 0.01034331, "balance_loss_clip": 1.02247524, "balance_loss_mlp": 1.03560269, "epoch": 0.8123853900496016, "flos": 21245399055360.0, "grad_norm": 1.6381094410182326, "language_loss": 0.79664522, "learning_rate": 3.374544550281564e-07, "loss": 0.81822789, "num_input_tokens_seen": 291636145, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.703125, "step": 13512, "time_per_iteration": 2.631725311279297 }, { "auxiliary_loss_clip": 0.01111928, "auxiliary_loss_mlp": 0.01032108, "balance_loss_clip": 1.01991832, "balance_loss_mlp": 1.03389716, "epoch": 0.8124455133022697, "flos": 64377596373120.0, "grad_norm": 1.6346388829980338, "language_loss": 0.63603461, "learning_rate": 3.3724446659621063e-07, "loss": 0.65747494, "num_input_tokens_seen": 291662440, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 13513, "time_per_iteration": 2.870697498321533 }, { "auxiliary_loss_clip": 0.01142359, "auxiliary_loss_mlp": 0.01034251, "balance_loss_clip": 1.02175164, "balance_loss_mlp": 1.03482389, "epoch": 0.8125056365549376, "flos": 23842207779840.0, "grad_norm": 1.678249626505243, "language_loss": 0.71390378, "learning_rate": 3.3703453750373355e-07, "loss": 0.73566985, "num_input_tokens_seen": 291680950, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 13514, "time_per_iteration": 2.572510004043579 }, { "auxiliary_loss_clip": 0.01127127, "auxiliary_loss_mlp": 0.01031858, "balance_loss_clip": 1.0190196, "balance_loss_mlp": 1.03277767, "epoch": 0.8125657598076056, "flos": 23550325862400.0, "grad_norm": 1.4988249591898908, "language_loss": 0.63031185, "learning_rate": 3.3682466775821515e-07, "loss": 0.65190178, "num_input_tokens_seen": 291702395, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 13515, "time_per_iteration": 4.004777669906616 }, { "auxiliary_loss_clip": 0.01097191, "auxiliary_loss_mlp": 0.01275108, "balance_loss_clip": 1.01637506, "balance_loss_mlp": 1.03262091, "epoch": 0.8126258830602736, "flos": 20704225772160.0, "grad_norm": 1.5029003752037882, "language_loss": 0.75348997, "learning_rate": 3.366148573671466e-07, "loss": 0.77721298, "num_input_tokens_seen": 291721135, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.64453125, "step": 13516, "time_per_iteration": 2.5380852222442627 }, { "auxiliary_loss_clip": 0.01141037, "auxiliary_loss_mlp": 0.01033144, "balance_loss_clip": 1.02096057, "balance_loss_mlp": 1.03457117, "epoch": 0.8126860063129415, "flos": 23618160696960.0, "grad_norm": 1.7116783033793057, "language_loss": 0.91348588, "learning_rate": 3.3640510633801465e-07, "loss": 0.93522763, "num_input_tokens_seen": 291741235, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7109375, "step": 13517, "time_per_iteration": 2.598564624786377 }, { "auxiliary_loss_clip": 0.01115283, "auxiliary_loss_mlp": 0.01277644, "balance_loss_clip": 1.01899743, "balance_loss_mlp": 1.03716159, "epoch": 0.8127461295656095, "flos": 25007149670400.0, "grad_norm": 1.5118559636889757, "language_loss": 0.78739667, "learning_rate": 3.3619541467830615e-07, "loss": 0.81132591, "num_input_tokens_seen": 291761430, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6953125, "step": 13518, "time_per_iteration": 2.566389799118042 }, { "auxiliary_loss_clip": 0.011141, "auxiliary_loss_mlp": 0.01030657, "balance_loss_clip": 1.01954627, "balance_loss_mlp": 1.03667068, "epoch": 0.8128062528182775, "flos": 27342169096320.0, "grad_norm": 1.6751562476097157, "language_loss": 0.7889601, "learning_rate": 3.359857823955026e-07, "loss": 0.8104077, "num_input_tokens_seen": 291781755, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.68359375, "step": 13519, "time_per_iteration": 2.5816166400909424 }, { "auxiliary_loss_clip": 0.01104489, "auxiliary_loss_mlp": 0.01036505, "balance_loss_clip": 1.02392817, "balance_loss_mlp": 1.03407943, "epoch": 0.8128663760709455, "flos": 26506312634880.0, "grad_norm": 1.7226618076787366, "language_loss": 0.70415932, "learning_rate": 3.357762094970864e-07, "loss": 0.72556925, "num_input_tokens_seen": 291804410, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 13520, "time_per_iteration": 2.543165922164917 }, { "auxiliary_loss_clip": 0.01123144, "auxiliary_loss_mlp": 0.01030532, "balance_loss_clip": 1.01790786, "balance_loss_mlp": 1.03522992, "epoch": 0.8129264993236134, "flos": 17931239815680.0, "grad_norm": 1.5834969351006707, "language_loss": 0.72875345, "learning_rate": 3.3556669599053654e-07, "loss": 0.75029016, "num_input_tokens_seen": 291823285, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 13521, "time_per_iteration": 3.9185447692871094 }, { "auxiliary_loss_clip": 0.01113969, "auxiliary_loss_mlp": 0.01033195, "balance_loss_clip": 1.02082145, "balance_loss_mlp": 1.03451514, "epoch": 0.8129866225762814, "flos": 26177694082560.0, "grad_norm": 1.920172756013311, "language_loss": 0.70239592, "learning_rate": 3.353572418833299e-07, "loss": 0.72386754, "num_input_tokens_seen": 291845305, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 13522, "time_per_iteration": 2.6421103477478027 }, { "auxiliary_loss_clip": 0.01110113, "auxiliary_loss_mlp": 0.01032004, "balance_loss_clip": 1.02069652, "balance_loss_mlp": 1.0353024, "epoch": 0.8130467458289493, "flos": 21032197879680.0, "grad_norm": 1.7521844029449458, "language_loss": 0.70307577, "learning_rate": 3.351478471829412e-07, "loss": 0.72449696, "num_input_tokens_seen": 291863715, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66015625, "step": 13523, "time_per_iteration": 2.5652365684509277 }, { "auxiliary_loss_clip": 0.01111078, "auxiliary_loss_mlp": 0.0102947, "balance_loss_clip": 1.01651752, "balance_loss_mlp": 1.03360319, "epoch": 0.8131068690816173, "flos": 15487051979520.0, "grad_norm": 2.0126162857227046, "language_loss": 0.70859003, "learning_rate": 3.349385118968435e-07, "loss": 0.72999549, "num_input_tokens_seen": 291880735, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 13524, "time_per_iteration": 2.567235231399536 }, { "auxiliary_loss_clip": 0.01112057, "auxiliary_loss_mlp": 0.01029267, "balance_loss_clip": 1.01716697, "balance_loss_mlp": 1.03399575, "epoch": 0.8131669923342852, "flos": 29351227576320.0, "grad_norm": 1.910520085040919, "language_loss": 0.62462813, "learning_rate": 3.3472923603250713e-07, "loss": 0.64604133, "num_input_tokens_seen": 291900535, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 13525, "time_per_iteration": 2.5766499042510986 }, { "auxiliary_loss_clip": 0.01120881, "auxiliary_loss_mlp": 0.01033338, "balance_loss_clip": 1.02126813, "balance_loss_mlp": 1.03407586, "epoch": 0.8132271155869533, "flos": 35256162055680.0, "grad_norm": 1.6213876252285984, "language_loss": 0.65649295, "learning_rate": 3.345200195974003e-07, "loss": 0.67803508, "num_input_tokens_seen": 291919760, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 13526, "time_per_iteration": 2.6423449516296387 }, { "auxiliary_loss_clip": 0.01118774, "auxiliary_loss_mlp": 0.01028163, "balance_loss_clip": 1.01620626, "balance_loss_mlp": 1.03314304, "epoch": 0.8132872388396212, "flos": 27781895393280.0, "grad_norm": 1.5354830539509334, "language_loss": 0.75146085, "learning_rate": 3.3431086259899013e-07, "loss": 0.77293015, "num_input_tokens_seen": 291938915, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.68359375, "step": 13527, "time_per_iteration": 2.594057321548462 }, { "auxiliary_loss_clip": 0.01101284, "auxiliary_loss_mlp": 0.01028583, "balance_loss_clip": 1.01661992, "balance_loss_mlp": 1.03335452, "epoch": 0.8133473620922892, "flos": 27819601695360.0, "grad_norm": 1.7200519705740998, "language_loss": 0.70825464, "learning_rate": 3.3410176504474087e-07, "loss": 0.72955328, "num_input_tokens_seen": 291958145, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 13528, "time_per_iteration": 3.9867024421691895 }, { "auxiliary_loss_clip": 0.01104702, "auxiliary_loss_mlp": 0.01033469, "balance_loss_clip": 1.02122664, "balance_loss_mlp": 1.03444171, "epoch": 0.8134074853449572, "flos": 18989527248000.0, "grad_norm": 1.8316247486053698, "language_loss": 0.68938565, "learning_rate": 3.338927269421143e-07, "loss": 0.71076739, "num_input_tokens_seen": 291976860, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 13529, "time_per_iteration": 2.4777214527130127 }, { "auxiliary_loss_clip": 0.01110861, "auxiliary_loss_mlp": 0.01030781, "balance_loss_clip": 1.01884222, "balance_loss_mlp": 1.03374982, "epoch": 0.8134676085976251, "flos": 24242863057920.0, "grad_norm": 1.3551959338471813, "language_loss": 0.77669352, "learning_rate": 3.3368374829857016e-07, "loss": 0.79810989, "num_input_tokens_seen": 291998085, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 13530, "time_per_iteration": 4.0585997104644775 }, { "auxiliary_loss_clip": 0.01111894, "auxiliary_loss_mlp": 0.0103499, "balance_loss_clip": 1.02192426, "balance_loss_mlp": 1.03437495, "epoch": 0.8135277318502931, "flos": 19062389986560.0, "grad_norm": 1.7495589850711577, "language_loss": 0.81895959, "learning_rate": 3.334748291215677e-07, "loss": 0.84042847, "num_input_tokens_seen": 292016585, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 13531, "time_per_iteration": 2.504432201385498 }, { "auxiliary_loss_clip": 0.01122393, "auxiliary_loss_mlp": 0.01030644, "balance_loss_clip": 1.01734018, "balance_loss_mlp": 1.03403568, "epoch": 0.813587855102961, "flos": 17269728992640.0, "grad_norm": 1.9853947716051066, "language_loss": 0.71554631, "learning_rate": 3.3326596941856065e-07, "loss": 0.7370767, "num_input_tokens_seen": 292033255, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 13532, "time_per_iteration": 2.4650802612304688 }, { "auxiliary_loss_clip": 0.01129395, "auxiliary_loss_mlp": 0.0103082, "balance_loss_clip": 1.01889944, "balance_loss_mlp": 1.034356, "epoch": 0.8136479783556291, "flos": 20157593621760.0, "grad_norm": 2.111710952743999, "language_loss": 0.76254302, "learning_rate": 3.3305716919700456e-07, "loss": 0.78414512, "num_input_tokens_seen": 292051800, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 13533, "time_per_iteration": 2.4895405769348145 }, { "auxiliary_loss_clip": 0.01107055, "auxiliary_loss_mlp": 0.01283664, "balance_loss_clip": 1.02359033, "balance_loss_mlp": 1.0342257, "epoch": 0.813708101608297, "flos": 22052348046720.0, "grad_norm": 1.699148630275631, "language_loss": 0.76413625, "learning_rate": 3.328484284643496e-07, "loss": 0.7880435, "num_input_tokens_seen": 292072215, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 13534, "time_per_iteration": 2.5203397274017334 }, { "auxiliary_loss_clip": 0.01112266, "auxiliary_loss_mlp": 0.01027306, "balance_loss_clip": 1.01646352, "balance_loss_mlp": 1.03450334, "epoch": 0.813768224860965, "flos": 16173412035840.0, "grad_norm": 2.1838016538701712, "language_loss": 0.93024212, "learning_rate": 3.3263974722804666e-07, "loss": 0.95163786, "num_input_tokens_seen": 292088830, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.68359375, "step": 13535, "time_per_iteration": 2.509406089782715 }, { "auxiliary_loss_clip": 0.01129007, "auxiliary_loss_mlp": 0.01029608, "balance_loss_clip": 1.01743674, "balance_loss_mlp": 1.03557062, "epoch": 0.8138283481136329, "flos": 24352318776960.0, "grad_norm": 1.575878159073258, "language_loss": 0.70291066, "learning_rate": 3.324311254955419e-07, "loss": 0.72449678, "num_input_tokens_seen": 292109225, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 13536, "time_per_iteration": 2.5709476470947266 }, { "auxiliary_loss_clip": 0.01128535, "auxiliary_loss_mlp": 0.0102989, "balance_loss_clip": 1.01752257, "balance_loss_mlp": 1.03335357, "epoch": 0.8138884713663009, "flos": 25516362827520.0, "grad_norm": 1.7006765277168812, "language_loss": 0.75320351, "learning_rate": 3.322225632742801e-07, "loss": 0.77478778, "num_input_tokens_seen": 292129660, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 13537, "time_per_iteration": 2.548281192779541 }, { "auxiliary_loss_clip": 0.01120453, "auxiliary_loss_mlp": 0.01028425, "balance_loss_clip": 1.01645088, "balance_loss_mlp": 1.03555226, "epoch": 0.8139485946189688, "flos": 22454368041600.0, "grad_norm": 1.6194214753809955, "language_loss": 0.76237118, "learning_rate": 3.3201406057170587e-07, "loss": 0.78386003, "num_input_tokens_seen": 292149090, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.66796875, "step": 13538, "time_per_iteration": 2.5464088916778564 }, { "auxiliary_loss_clip": 0.01109844, "auxiliary_loss_mlp": 0.01028593, "balance_loss_clip": 1.01677966, "balance_loss_mlp": 1.03204513, "epoch": 0.8140087178716369, "flos": 21250391045760.0, "grad_norm": 1.5989941105189187, "language_loss": 0.78090513, "learning_rate": 3.318056173952586e-07, "loss": 0.80228949, "num_input_tokens_seen": 292169260, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 13539, "time_per_iteration": 2.72995662689209 }, { "auxiliary_loss_clip": 0.01138814, "auxiliary_loss_mlp": 0.01030615, "balance_loss_clip": 1.01880097, "balance_loss_mlp": 1.0358963, "epoch": 0.8140688411243048, "flos": 39415730774400.0, "grad_norm": 2.50208052031171, "language_loss": 0.65914726, "learning_rate": 3.31597233752378e-07, "loss": 0.68084157, "num_input_tokens_seen": 292188145, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66796875, "step": 13540, "time_per_iteration": 2.7102537155151367 }, { "auxiliary_loss_clip": 0.01114542, "auxiliary_loss_mlp": 0.01031094, "balance_loss_clip": 1.01948917, "balance_loss_mlp": 1.03671384, "epoch": 0.8141289643769728, "flos": 25415885508480.0, "grad_norm": 1.3998810418911456, "language_loss": 0.67512619, "learning_rate": 3.3138890965050046e-07, "loss": 0.69658256, "num_input_tokens_seen": 292212135, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 13541, "time_per_iteration": 2.5790064334869385 }, { "auxiliary_loss_clip": 0.01101974, "auxiliary_loss_mlp": 0.01028747, "balance_loss_clip": 1.0164392, "balance_loss_mlp": 1.03372693, "epoch": 0.8141890876296408, "flos": 12568053237120.0, "grad_norm": 1.940061177438044, "language_loss": 0.69076681, "learning_rate": 3.3118064509706065e-07, "loss": 0.71207404, "num_input_tokens_seen": 292230645, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 13542, "time_per_iteration": 2.5242841243743896 }, { "auxiliary_loss_clip": 0.01107755, "auxiliary_loss_mlp": 0.01030381, "balance_loss_clip": 1.01745892, "balance_loss_mlp": 1.03481126, "epoch": 0.8142492108823087, "flos": 14967172483200.0, "grad_norm": 2.744954756801328, "language_loss": 0.79395068, "learning_rate": 3.3097244009949044e-07, "loss": 0.815332, "num_input_tokens_seen": 292243540, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 13543, "time_per_iteration": 2.439436197280884 }, { "auxiliary_loss_clip": 0.01133688, "auxiliary_loss_mlp": 0.01038422, "balance_loss_clip": 1.02473736, "balance_loss_mlp": 1.03565145, "epoch": 0.8143093341349767, "flos": 12422004537600.0, "grad_norm": 1.9096491502175066, "language_loss": 0.77266192, "learning_rate": 3.307642946652209e-07, "loss": 0.79438299, "num_input_tokens_seen": 292261715, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 13544, "time_per_iteration": 2.5641238689422607 }, { "auxiliary_loss_clip": 0.01111135, "auxiliary_loss_mlp": 0.01033973, "balance_loss_clip": 1.01935744, "balance_loss_mlp": 1.03635836, "epoch": 0.8143694573876447, "flos": 11910564737280.0, "grad_norm": 2.8589079101815975, "language_loss": 0.73291451, "learning_rate": 3.3055620880168023e-07, "loss": 0.75436556, "num_input_tokens_seen": 292275080, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75, "step": 13545, "time_per_iteration": 2.466754198074341 }, { "auxiliary_loss_clip": 0.01110526, "auxiliary_loss_mlp": 0.01029365, "balance_loss_clip": 1.01764047, "balance_loss_mlp": 1.03449059, "epoch": 0.8144295806403127, "flos": 21212900225280.0, "grad_norm": 1.737307018731399, "language_loss": 0.76873147, "learning_rate": 3.303481825162939e-07, "loss": 0.79013038, "num_input_tokens_seen": 292294635, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 13546, "time_per_iteration": 2.578491449356079 }, { "auxiliary_loss_clip": 0.01115222, "auxiliary_loss_mlp": 0.01028386, "balance_loss_clip": 1.0168885, "balance_loss_mlp": 1.03630185, "epoch": 0.8144897038929806, "flos": 19865280741120.0, "grad_norm": 2.2802956000469887, "language_loss": 0.70354676, "learning_rate": 3.3014021581648566e-07, "loss": 0.72498286, "num_input_tokens_seen": 292312695, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.703125, "step": 13547, "time_per_iteration": 2.599280595779419 }, { "auxiliary_loss_clip": 0.01106927, "auxiliary_loss_mlp": 0.0103733, "balance_loss_clip": 1.02372777, "balance_loss_mlp": 1.03518963, "epoch": 0.8145498271456486, "flos": 24571733005440.0, "grad_norm": 1.9670726344750709, "language_loss": 0.70168769, "learning_rate": 3.299323087096786e-07, "loss": 0.72313029, "num_input_tokens_seen": 292332005, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 13548, "time_per_iteration": 2.6239590644836426 }, { "auxiliary_loss_clip": 0.0111801, "auxiliary_loss_mlp": 0.01032646, "balance_loss_clip": 1.01936615, "balance_loss_mlp": 1.03767502, "epoch": 0.8146099503983165, "flos": 20193037367040.0, "grad_norm": 2.0173857098793997, "language_loss": 0.76824677, "learning_rate": 3.2972446120329055e-07, "loss": 0.78975332, "num_input_tokens_seen": 292348365, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 13549, "time_per_iteration": 2.5216894149780273 }, { "auxiliary_loss_clip": 0.01106033, "auxiliary_loss_mlp": 0.01030659, "balance_loss_clip": 1.01845193, "balance_loss_mlp": 1.03657818, "epoch": 0.8146700736509845, "flos": 19536949497600.0, "grad_norm": 1.8520964367356483, "language_loss": 0.70765543, "learning_rate": 3.2951667330474075e-07, "loss": 0.72902238, "num_input_tokens_seen": 292368050, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 13550, "time_per_iteration": 2.5347046852111816 }, { "auxiliary_loss_clip": 0.0110839, "auxiliary_loss_mlp": 0.01025838, "balance_loss_clip": 1.01373243, "balance_loss_mlp": 1.03353, "epoch": 0.8147301969036524, "flos": 18041341979520.0, "grad_norm": 4.553888759701478, "language_loss": 0.71880496, "learning_rate": 3.293089450214437e-07, "loss": 0.74014717, "num_input_tokens_seen": 292385315, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.66015625, "step": 13551, "time_per_iteration": 2.572643995285034 }, { "auxiliary_loss_clip": 0.01146196, "auxiliary_loss_mlp": 0.01035375, "balance_loss_clip": 1.0215888, "balance_loss_mlp": 1.03745031, "epoch": 0.8147903201563205, "flos": 18004713085440.0, "grad_norm": 1.845180144407759, "language_loss": 0.68621701, "learning_rate": 3.2910127636081277e-07, "loss": 0.70803267, "num_input_tokens_seen": 292403375, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73828125, "step": 13552, "time_per_iteration": 2.5992507934570312 }, { "auxiliary_loss_clip": 0.01103334, "auxiliary_loss_mlp": 0.01041334, "balance_loss_clip": 1.02900767, "balance_loss_mlp": 1.03379607, "epoch": 0.8148504434089884, "flos": 20259327916800.0, "grad_norm": 2.168032371958883, "language_loss": 0.82240212, "learning_rate": 3.2889366733025935e-07, "loss": 0.84384882, "num_input_tokens_seen": 292419260, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 13553, "time_per_iteration": 2.5081701278686523 }, { "auxiliary_loss_clip": 0.01112923, "auxiliary_loss_mlp": 0.01026954, "balance_loss_clip": 1.01439524, "balance_loss_mlp": 1.03480947, "epoch": 0.8149105666616564, "flos": 12494723621760.0, "grad_norm": 1.7625793337401472, "language_loss": 0.68048382, "learning_rate": 3.2868611793719183e-07, "loss": 0.70188254, "num_input_tokens_seen": 292436095, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 13554, "time_per_iteration": 2.579425811767578 }, { "auxiliary_loss_clip": 0.01125107, "auxiliary_loss_mlp": 0.01036005, "balance_loss_clip": 1.0228442, "balance_loss_mlp": 1.03698111, "epoch": 0.8149706899143244, "flos": 32523683662080.0, "grad_norm": 1.9410337339032269, "language_loss": 0.66584027, "learning_rate": 3.284786281890186e-07, "loss": 0.68745136, "num_input_tokens_seen": 292457190, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 13555, "time_per_iteration": 2.6139001846313477 }, { "auxiliary_loss_clip": 0.01115413, "auxiliary_loss_mlp": 0.01034592, "balance_loss_clip": 1.02133644, "balance_loss_mlp": 1.03422654, "epoch": 0.8150308131669923, "flos": 19386088375680.0, "grad_norm": 2.0676517705778137, "language_loss": 0.73730856, "learning_rate": 3.282711980931425e-07, "loss": 0.75880861, "num_input_tokens_seen": 292474300, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 13556, "time_per_iteration": 3.8669960498809814 }, { "auxiliary_loss_clip": 0.01113001, "auxiliary_loss_mlp": 0.01028137, "balance_loss_clip": 1.01702118, "balance_loss_mlp": 1.03632319, "epoch": 0.8150909364196604, "flos": 17421380213760.0, "grad_norm": 1.6359251956465952, "language_loss": 0.803123, "learning_rate": 3.280638276569676e-07, "loss": 0.82453436, "num_input_tokens_seen": 292492420, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6796875, "step": 13557, "time_per_iteration": 2.527691602706909 }, { "auxiliary_loss_clip": 0.01125842, "auxiliary_loss_mlp": 0.01032737, "balance_loss_clip": 1.01980925, "balance_loss_mlp": 1.03584671, "epoch": 0.8151510596723283, "flos": 27162795553920.0, "grad_norm": 1.740794720600771, "language_loss": 0.65988576, "learning_rate": 3.278565168878942e-07, "loss": 0.68147159, "num_input_tokens_seen": 292512895, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 13558, "time_per_iteration": 2.609787702560425 }, { "auxiliary_loss_clip": 0.01022631, "auxiliary_loss_mlp": 0.01002976, "balance_loss_clip": 1.00189161, "balance_loss_mlp": 1.00127971, "epoch": 0.8152111829249963, "flos": 64219052718720.0, "grad_norm": 0.7945097958152143, "language_loss": 0.56969231, "learning_rate": 3.276492657933201e-07, "loss": 0.58994842, "num_input_tokens_seen": 292566580, "router_z_loss_clip": 0.01086426, "router_z_loss_mlp": 0.21289062, "step": 13559, "time_per_iteration": 3.0182454586029053 }, { "auxiliary_loss_clip": 0.01040953, "auxiliary_loss_mlp": 0.01000844, "balance_loss_clip": 0.99968195, "balance_loss_mlp": 1.00172663, "epoch": 0.8152713061776642, "flos": 67072012306560.0, "grad_norm": 0.6774789340980026, "language_loss": 0.55298197, "learning_rate": 3.2744207438064166e-07, "loss": 0.57339996, "num_input_tokens_seen": 292621490, "router_z_loss_clip": 0.01159668, "router_z_loss_mlp": 0.21289062, "step": 13560, "time_per_iteration": 2.996877908706665 }, { "auxiliary_loss_clip": 0.01118595, "auxiliary_loss_mlp": 0.01028416, "balance_loss_clip": 1.01661444, "balance_loss_mlp": 1.03312492, "epoch": 0.8153314294303322, "flos": 14391130072320.0, "grad_norm": 1.6212150244451082, "language_loss": 0.67316109, "learning_rate": 3.272349426572536e-07, "loss": 0.69463116, "num_input_tokens_seen": 292638660, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 13561, "time_per_iteration": 2.517144203186035 }, { "auxiliary_loss_clip": 0.0111637, "auxiliary_loss_mlp": 0.01032184, "balance_loss_clip": 1.01957703, "balance_loss_mlp": 1.03494334, "epoch": 0.8153915526830001, "flos": 25623520076160.0, "grad_norm": 1.5472343977035448, "language_loss": 0.81400275, "learning_rate": 3.270278706305476e-07, "loss": 0.83548832, "num_input_tokens_seen": 292658545, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 13562, "time_per_iteration": 3.986663341522217 }, { "auxiliary_loss_clip": 0.01122712, "auxiliary_loss_mlp": 0.01032083, "balance_loss_clip": 1.01978111, "balance_loss_mlp": 1.03497314, "epoch": 0.8154516759356681, "flos": 23369156640000.0, "grad_norm": 2.0585143172443647, "language_loss": 0.71825361, "learning_rate": 3.268208583079135e-07, "loss": 0.73980153, "num_input_tokens_seen": 292678460, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 13563, "time_per_iteration": 2.5354678630828857 }, { "auxiliary_loss_clip": 0.01165986, "auxiliary_loss_mlp": 0.01031039, "balance_loss_clip": 1.01759243, "balance_loss_mlp": 1.03312707, "epoch": 0.815511799188336, "flos": 28149189914880.0, "grad_norm": 1.6095132965465115, "language_loss": 0.70124125, "learning_rate": 3.266139056967385e-07, "loss": 0.72321153, "num_input_tokens_seen": 292699815, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69921875, "step": 13564, "time_per_iteration": 2.6477138996124268 }, { "auxiliary_loss_clip": 0.01121411, "auxiliary_loss_mlp": 0.01029489, "balance_loss_clip": 1.01692438, "balance_loss_mlp": 1.03393769, "epoch": 0.8155719224410041, "flos": 16983413683200.0, "grad_norm": 2.007838532140089, "language_loss": 0.70417255, "learning_rate": 3.2640701280440986e-07, "loss": 0.72568154, "num_input_tokens_seen": 292717370, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 13565, "time_per_iteration": 2.5212762355804443 }, { "auxiliary_loss_clip": 0.01111575, "auxiliary_loss_mlp": 0.01035081, "balance_loss_clip": 1.02272534, "balance_loss_mlp": 1.03448248, "epoch": 0.815632045693672, "flos": 24681727428480.0, "grad_norm": 2.0904895451408607, "language_loss": 0.79129195, "learning_rate": 3.262001796383087e-07, "loss": 0.81275856, "num_input_tokens_seen": 292737110, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 13566, "time_per_iteration": 2.603332281112671 }, { "auxiliary_loss_clip": 0.01120614, "auxiliary_loss_mlp": 0.01029611, "balance_loss_clip": 1.01836932, "balance_loss_mlp": 1.03591585, "epoch": 0.81569216894634, "flos": 19938323047680.0, "grad_norm": 2.085545670929843, "language_loss": 0.82178879, "learning_rate": 3.259934062058183e-07, "loss": 0.84329098, "num_input_tokens_seen": 292756510, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.66796875, "step": 13567, "time_per_iteration": 2.5406601428985596 }, { "auxiliary_loss_clip": 0.01100624, "auxiliary_loss_mlp": 0.01026372, "balance_loss_clip": 1.01560712, "balance_loss_mlp": 1.03392434, "epoch": 0.8157522921990079, "flos": 21799393493760.0, "grad_norm": 2.106229225725329, "language_loss": 0.7941094, "learning_rate": 3.257866925143176e-07, "loss": 0.81537932, "num_input_tokens_seen": 292776710, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.66796875, "step": 13568, "time_per_iteration": 2.541170835494995 }, { "auxiliary_loss_clip": 0.01116272, "auxiliary_loss_mlp": 0.01029421, "balance_loss_clip": 1.01811445, "balance_loss_mlp": 1.03249764, "epoch": 0.8158124154516759, "flos": 18508323720960.0, "grad_norm": 2.3436882757283346, "language_loss": 0.77044511, "learning_rate": 3.2558003857118244e-07, "loss": 0.79190201, "num_input_tokens_seen": 292794350, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66015625, "step": 13569, "time_per_iteration": 3.860727071762085 }, { "auxiliary_loss_clip": 0.01101491, "auxiliary_loss_mlp": 0.01034174, "balance_loss_clip": 1.02324903, "balance_loss_mlp": 1.03432918, "epoch": 0.815872538704344, "flos": 26830801123200.0, "grad_norm": 2.358121335742407, "language_loss": 0.58337426, "learning_rate": 3.253734443837888e-07, "loss": 0.60473096, "num_input_tokens_seen": 292814005, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.671875, "step": 13570, "time_per_iteration": 2.567249059677124 }, { "auxiliary_loss_clip": 0.01117039, "auxiliary_loss_mlp": 0.01034739, "balance_loss_clip": 1.02201378, "balance_loss_mlp": 1.03622067, "epoch": 0.8159326619570119, "flos": 21725704742400.0, "grad_norm": 2.0098844905661206, "language_loss": 0.8269875, "learning_rate": 3.251669099595089e-07, "loss": 0.84850526, "num_input_tokens_seen": 292833485, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 13571, "time_per_iteration": 4.042137861251831 }, { "auxiliary_loss_clip": 0.01122286, "auxiliary_loss_mlp": 0.01039799, "balance_loss_clip": 1.02754998, "balance_loss_mlp": 1.03562009, "epoch": 0.8159927852096799, "flos": 13840726993920.0, "grad_norm": 3.080268037276913, "language_loss": 0.78578627, "learning_rate": 3.2496043530571495e-07, "loss": 0.80740714, "num_input_tokens_seen": 292848045, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 13572, "time_per_iteration": 2.5999326705932617 }, { "auxiliary_loss_clip": 0.01104791, "auxiliary_loss_mlp": 0.01032825, "balance_loss_clip": 1.02080226, "balance_loss_mlp": 1.03393054, "epoch": 0.8160529084623478, "flos": 24499516711680.0, "grad_norm": 2.082475498823696, "language_loss": 0.65204144, "learning_rate": 3.2475402042977315e-07, "loss": 0.67341757, "num_input_tokens_seen": 292869965, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.70703125, "step": 13573, "time_per_iteration": 2.5573604106903076 }, { "auxiliary_loss_clip": 0.01128833, "auxiliary_loss_mlp": 0.01029804, "balance_loss_clip": 1.01853919, "balance_loss_mlp": 1.03411078, "epoch": 0.8161130317150158, "flos": 24826339584000.0, "grad_norm": 1.458740921534322, "language_loss": 0.75313479, "learning_rate": 3.2454766533905176e-07, "loss": 0.77472115, "num_input_tokens_seen": 292889680, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6796875, "step": 13574, "time_per_iteration": 2.5431740283966064 }, { "auxiliary_loss_clip": 0.01113774, "auxiliary_loss_mlp": 0.01034857, "balance_loss_clip": 1.02270377, "balance_loss_mlp": 1.03447258, "epoch": 0.8161731549676837, "flos": 30956542208640.0, "grad_norm": 1.7598158656282366, "language_loss": 0.59423721, "learning_rate": 3.243413700409141e-07, "loss": 0.61572349, "num_input_tokens_seen": 292912360, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 13575, "time_per_iteration": 2.5777719020843506 }, { "auxiliary_loss_clip": 0.01155178, "auxiliary_loss_mlp": 0.01033582, "balance_loss_clip": 1.02085638, "balance_loss_mlp": 1.03307176, "epoch": 0.8162332782203517, "flos": 18551991680640.0, "grad_norm": 1.8157965479280058, "language_loss": 0.74421924, "learning_rate": 3.24135134542723e-07, "loss": 0.76610684, "num_input_tokens_seen": 292928325, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 13576, "time_per_iteration": 2.5315470695495605 }, { "auxiliary_loss_clip": 0.01040075, "auxiliary_loss_mlp": 0.0100051, "balance_loss_clip": 0.99931806, "balance_loss_mlp": 1.00130451, "epoch": 0.8162934014730197, "flos": 70386853904640.0, "grad_norm": 0.8267901640705995, "language_loss": 0.58660334, "learning_rate": 3.2392895885183747e-07, "loss": 0.60700923, "num_input_tokens_seen": 292992795, "router_z_loss_clip": 0.01190186, "router_z_loss_mlp": 0.21289062, "step": 13577, "time_per_iteration": 3.155442714691162 }, { "auxiliary_loss_clip": 0.01136516, "auxiliary_loss_mlp": 0.01029506, "balance_loss_clip": 1.01767445, "balance_loss_mlp": 1.03434587, "epoch": 0.8163535247256877, "flos": 21214839559680.0, "grad_norm": 1.882026705340829, "language_loss": 0.71098506, "learning_rate": 3.237228429756165e-07, "loss": 0.73264533, "num_input_tokens_seen": 293011950, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.66015625, "step": 13578, "time_per_iteration": 2.596518039703369 }, { "auxiliary_loss_clip": 0.01130468, "auxiliary_loss_mlp": 0.01031094, "balance_loss_clip": 1.0194, "balance_loss_mlp": 1.03490531, "epoch": 0.8164136479783556, "flos": 21098847565440.0, "grad_norm": 1.7642845042081634, "language_loss": 0.73414725, "learning_rate": 3.235167869214155e-07, "loss": 0.75576293, "num_input_tokens_seen": 293030175, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 13579, "time_per_iteration": 2.588320255279541 }, { "auxiliary_loss_clip": 0.01104306, "auxiliary_loss_mlp": 0.01029177, "balance_loss_clip": 1.01733971, "balance_loss_mlp": 1.0338583, "epoch": 0.8164737712310236, "flos": 21720640924800.0, "grad_norm": 1.7154389927817062, "language_loss": 0.79260159, "learning_rate": 3.2331079069658773e-07, "loss": 0.81393635, "num_input_tokens_seen": 293047980, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.703125, "step": 13580, "time_per_iteration": 2.5469794273376465 }, { "auxiliary_loss_clip": 0.01126654, "auxiliary_loss_mlp": 0.01033217, "balance_loss_clip": 1.02080131, "balance_loss_mlp": 1.03385115, "epoch": 0.8165338944836915, "flos": 19536805843200.0, "grad_norm": 1.7172768280306214, "language_loss": 0.68559778, "learning_rate": 3.231048543084849e-07, "loss": 0.70719647, "num_input_tokens_seen": 293067030, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6640625, "step": 13581, "time_per_iteration": 2.6091747283935547 }, { "auxiliary_loss_clip": 0.01102844, "auxiliary_loss_mlp": 0.01027788, "balance_loss_clip": 1.01599181, "balance_loss_mlp": 1.0340054, "epoch": 0.8165940177363595, "flos": 22928568416640.0, "grad_norm": 1.7367839912768577, "language_loss": 0.60242808, "learning_rate": 3.228989777644564e-07, "loss": 0.62373441, "num_input_tokens_seen": 293085575, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 13582, "time_per_iteration": 2.545382022857666 }, { "auxiliary_loss_clip": 0.01125014, "auxiliary_loss_mlp": 0.01277082, "balance_loss_clip": 1.01910806, "balance_loss_mlp": 1.0336113, "epoch": 0.8166541409890276, "flos": 23370377702400.0, "grad_norm": 1.4410518537197923, "language_loss": 0.82403678, "learning_rate": 3.2269316107184953e-07, "loss": 0.84805775, "num_input_tokens_seen": 293108200, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6484375, "step": 13583, "time_per_iteration": 2.6046080589294434 }, { "auxiliary_loss_clip": 0.01120085, "auxiliary_loss_mlp": 0.01028412, "balance_loss_clip": 1.01690221, "balance_loss_mlp": 1.0337882, "epoch": 0.8167142642416955, "flos": 18441997257600.0, "grad_norm": 1.6176880203904012, "language_loss": 0.74442774, "learning_rate": 3.2248740423800856e-07, "loss": 0.76591271, "num_input_tokens_seen": 293126020, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.68359375, "step": 13584, "time_per_iteration": 2.5318660736083984 }, { "auxiliary_loss_clip": 0.01113802, "auxiliary_loss_mlp": 0.01029343, "balance_loss_clip": 1.01656413, "balance_loss_mlp": 1.03629804, "epoch": 0.8167743874943635, "flos": 21214983214080.0, "grad_norm": 1.8056772651567068, "language_loss": 0.74639684, "learning_rate": 3.2228170727027794e-07, "loss": 0.76782835, "num_input_tokens_seen": 293144620, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 13585, "time_per_iteration": 2.495309591293335 }, { "auxiliary_loss_clip": 0.01122154, "auxiliary_loss_mlp": 0.01033335, "balance_loss_clip": 1.02105093, "balance_loss_mlp": 1.03528321, "epoch": 0.8168345107470314, "flos": 18697681244160.0, "grad_norm": 1.8171676285819731, "language_loss": 0.69480658, "learning_rate": 3.220760701759968e-07, "loss": 0.71636152, "num_input_tokens_seen": 293162850, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 13586, "time_per_iteration": 2.4935319423675537 }, { "auxiliary_loss_clip": 0.01147947, "auxiliary_loss_mlp": 0.01031635, "balance_loss_clip": 1.02063227, "balance_loss_mlp": 1.0359242, "epoch": 0.8168946339996994, "flos": 16253098358400.0, "grad_norm": 1.6021120195527063, "language_loss": 0.60890543, "learning_rate": 3.218704929625049e-07, "loss": 0.63070124, "num_input_tokens_seen": 293181620, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6796875, "step": 13587, "time_per_iteration": 2.521397352218628 }, { "auxiliary_loss_clip": 0.01115361, "auxiliary_loss_mlp": 0.01033232, "balance_loss_clip": 1.02090621, "balance_loss_mlp": 1.03461599, "epoch": 0.8169547572523673, "flos": 26941585645440.0, "grad_norm": 2.602802363032666, "language_loss": 0.6935935, "learning_rate": 3.216649756371379e-07, "loss": 0.71507943, "num_input_tokens_seen": 293200270, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71875, "step": 13588, "time_per_iteration": 2.5300724506378174 }, { "auxiliary_loss_clip": 0.01124769, "auxiliary_loss_mlp": 0.01034607, "balance_loss_clip": 1.02096927, "balance_loss_mlp": 1.03621161, "epoch": 0.8170148805050353, "flos": 18952323736320.0, "grad_norm": 1.5310169540948508, "language_loss": 0.72481048, "learning_rate": 3.214595182072319e-07, "loss": 0.74640429, "num_input_tokens_seen": 293218960, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.70703125, "step": 13589, "time_per_iteration": 2.51297926902771 }, { "auxiliary_loss_clip": 0.01110201, "auxiliary_loss_mlp": 0.01033031, "balance_loss_clip": 1.02098465, "balance_loss_mlp": 1.0337441, "epoch": 0.8170750037577033, "flos": 21834909066240.0, "grad_norm": 1.9408394752044142, "language_loss": 0.73472422, "learning_rate": 3.212541206801169e-07, "loss": 0.75615656, "num_input_tokens_seen": 293236450, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.67578125, "step": 13590, "time_per_iteration": 2.5064759254455566 }, { "auxiliary_loss_clip": 0.01107479, "auxiliary_loss_mlp": 0.01036257, "balance_loss_clip": 1.02247679, "balance_loss_mlp": 1.03527761, "epoch": 0.8171351270103713, "flos": 33507169021440.0, "grad_norm": 5.263652335686441, "language_loss": 0.65320861, "learning_rate": 3.2104878306312456e-07, "loss": 0.6746459, "num_input_tokens_seen": 293256480, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.72265625, "step": 13591, "time_per_iteration": 2.5903232097625732 }, { "auxiliary_loss_clip": 0.01105225, "auxiliary_loss_mlp": 0.01033433, "balance_loss_clip": 1.02077293, "balance_loss_mlp": 1.0335325, "epoch": 0.8171952502630392, "flos": 22708184520960.0, "grad_norm": 1.9712175213212917, "language_loss": 0.67620265, "learning_rate": 3.208435053635825e-07, "loss": 0.69758922, "num_input_tokens_seen": 293274960, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 13592, "time_per_iteration": 2.5383429527282715 }, { "auxiliary_loss_clip": 0.0112143, "auxiliary_loss_mlp": 0.01028167, "balance_loss_clip": 1.01576924, "balance_loss_mlp": 1.03419411, "epoch": 0.8172553735157072, "flos": 26723715701760.0, "grad_norm": 1.8074669783037132, "language_loss": 0.66040325, "learning_rate": 3.206382875888163e-07, "loss": 0.68189919, "num_input_tokens_seen": 293295945, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 13593, "time_per_iteration": 2.5874428749084473 }, { "auxiliary_loss_clip": 0.01111014, "auxiliary_loss_mlp": 0.01032176, "balance_loss_clip": 1.0206306, "balance_loss_mlp": 1.03372836, "epoch": 0.8173154967683751, "flos": 15961072786560.0, "grad_norm": 1.9601960201112856, "language_loss": 0.6946848, "learning_rate": 3.2043312974614935e-07, "loss": 0.71611667, "num_input_tokens_seen": 293313300, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.68359375, "step": 13594, "time_per_iteration": 2.525987148284912 }, { "auxiliary_loss_clip": 0.01104443, "auxiliary_loss_mlp": 0.01030253, "balance_loss_clip": 1.01860023, "balance_loss_mlp": 1.03363693, "epoch": 0.8173756200210431, "flos": 25986720447360.0, "grad_norm": 1.801413080269846, "language_loss": 0.66147292, "learning_rate": 3.2022803184290446e-07, "loss": 0.6828199, "num_input_tokens_seen": 293333085, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.70703125, "step": 13595, "time_per_iteration": 2.556229591369629 }, { "auxiliary_loss_clip": 0.01133674, "auxiliary_loss_mlp": 0.01027993, "balance_loss_clip": 1.01510012, "balance_loss_mlp": 1.03403115, "epoch": 0.8174357432737112, "flos": 25664422688640.0, "grad_norm": 1.8498674767949053, "language_loss": 0.78553236, "learning_rate": 3.200229938863994e-07, "loss": 0.80714905, "num_input_tokens_seen": 293351895, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 13596, "time_per_iteration": 2.7370493412017822 }, { "auxiliary_loss_clip": 0.01118144, "auxiliary_loss_mlp": 0.01028893, "balance_loss_clip": 1.01648331, "balance_loss_mlp": 1.03278685, "epoch": 0.8174958665263791, "flos": 21835088634240.0, "grad_norm": 12.040647088816323, "language_loss": 0.58321214, "learning_rate": 3.1981801588395274e-07, "loss": 0.60468251, "num_input_tokens_seen": 293371165, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.671875, "step": 13597, "time_per_iteration": 2.584923505783081 }, { "auxiliary_loss_clip": 0.01135441, "auxiliary_loss_mlp": 0.01030972, "balance_loss_clip": 1.01868176, "balance_loss_mlp": 1.03334928, "epoch": 0.8175559897790471, "flos": 22455517276800.0, "grad_norm": 1.6115867855679211, "language_loss": 0.82645941, "learning_rate": 3.1961309784287926e-07, "loss": 0.84812355, "num_input_tokens_seen": 293391150, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.66796875, "step": 13598, "time_per_iteration": 3.940540313720703 }, { "auxiliary_loss_clip": 0.01158215, "auxiliary_loss_mlp": 0.01031503, "balance_loss_clip": 1.01987433, "balance_loss_mlp": 1.03470862, "epoch": 0.817616113031715, "flos": 23615790399360.0, "grad_norm": 1.6945962647579484, "language_loss": 0.82169282, "learning_rate": 3.194082397704916e-07, "loss": 0.84359002, "num_input_tokens_seen": 293409440, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.70703125, "step": 13599, "time_per_iteration": 2.620863437652588 }, { "auxiliary_loss_clip": 0.0113793, "auxiliary_loss_mlp": 0.01030777, "balance_loss_clip": 1.01871324, "balance_loss_mlp": 1.03437185, "epoch": 0.817676236284383, "flos": 27672260106240.0, "grad_norm": 1.7002413226210604, "language_loss": 0.83833522, "learning_rate": 3.1920344167410094e-07, "loss": 0.86002231, "num_input_tokens_seen": 293428995, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.67578125, "step": 13600, "time_per_iteration": 2.56724214553833 }, { "auxiliary_loss_clip": 0.01129783, "auxiliary_loss_mlp": 0.01029205, "balance_loss_clip": 1.01692688, "balance_loss_mlp": 1.03377795, "epoch": 0.8177363595370509, "flos": 24681009156480.0, "grad_norm": 1.7356434073521285, "language_loss": 0.74586904, "learning_rate": 3.1899870356101553e-07, "loss": 0.76745892, "num_input_tokens_seen": 293449155, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69921875, "step": 13601, "time_per_iteration": 2.565906286239624 }, { "auxiliary_loss_clip": 0.01133788, "auxiliary_loss_mlp": 0.01031623, "balance_loss_clip": 1.01894557, "balance_loss_mlp": 1.03535724, "epoch": 0.817796482789719, "flos": 17346326745600.0, "grad_norm": 2.692312742816097, "language_loss": 0.67952228, "learning_rate": 3.1879402543854326e-07, "loss": 0.7011764, "num_input_tokens_seen": 293466125, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 13602, "time_per_iteration": 2.5376808643341064 }, { "auxiliary_loss_clip": 0.01128223, "auxiliary_loss_mlp": 0.01028034, "balance_loss_clip": 1.01653028, "balance_loss_mlp": 1.03443599, "epoch": 0.8178566060423869, "flos": 17778475272960.0, "grad_norm": 2.093949480494991, "language_loss": 0.83603382, "learning_rate": 3.1858940731398676e-07, "loss": 0.8575964, "num_input_tokens_seen": 293481345, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 13603, "time_per_iteration": 2.5150601863861084 }, { "auxiliary_loss_clip": 0.01138062, "auxiliary_loss_mlp": 0.01028681, "balance_loss_clip": 1.01680136, "balance_loss_mlp": 1.03368175, "epoch": 0.8179167292950549, "flos": 24973250209920.0, "grad_norm": 1.919628207098486, "language_loss": 0.69252384, "learning_rate": 3.183848491946497e-07, "loss": 0.7141912, "num_input_tokens_seen": 293502330, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 13604, "time_per_iteration": 3.9631240367889404 }, { "auxiliary_loss_clip": 0.0112246, "auxiliary_loss_mlp": 0.01033447, "balance_loss_clip": 1.02086461, "balance_loss_mlp": 1.03601766, "epoch": 0.8179768525477228, "flos": 22856783086080.0, "grad_norm": 1.6129443938377777, "language_loss": 0.74159312, "learning_rate": 3.1818035108783113e-07, "loss": 0.76315224, "num_input_tokens_seen": 293521415, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 13605, "time_per_iteration": 2.563110828399658 }, { "auxiliary_loss_clip": 0.01129561, "auxiliary_loss_mlp": 0.01039456, "balance_loss_clip": 1.0270524, "balance_loss_mlp": 1.03580236, "epoch": 0.8180369758003908, "flos": 18515147304960.0, "grad_norm": 2.228308131360761, "language_loss": 0.74256063, "learning_rate": 3.179759130008306e-07, "loss": 0.76425076, "num_input_tokens_seen": 293539245, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.66796875, "step": 13606, "time_per_iteration": 2.4973185062408447 }, { "auxiliary_loss_clip": 0.01102556, "auxiliary_loss_mlp": 0.01028871, "balance_loss_clip": 1.01711679, "balance_loss_mlp": 1.03478265, "epoch": 0.8180970990530587, "flos": 33182105915520.0, "grad_norm": 1.7308083716511502, "language_loss": 0.65576792, "learning_rate": 3.1777153494094224e-07, "loss": 0.67708218, "num_input_tokens_seen": 293560640, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 13607, "time_per_iteration": 2.6435372829437256 }, { "auxiliary_loss_clip": 0.01122168, "auxiliary_loss_mlp": 0.01030555, "balance_loss_clip": 1.01841378, "balance_loss_mlp": 1.03532958, "epoch": 0.8181572223057267, "flos": 25010022758400.0, "grad_norm": 1.661410963293236, "language_loss": 0.70184267, "learning_rate": 3.1756721691546105e-07, "loss": 0.7233699, "num_input_tokens_seen": 293579465, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 13608, "time_per_iteration": 2.550861120223999 }, { "auxiliary_loss_clip": 0.01110941, "auxiliary_loss_mlp": 0.01033225, "balance_loss_clip": 1.02143502, "balance_loss_mlp": 1.03379774, "epoch": 0.8182173455583948, "flos": 28548731871360.0, "grad_norm": 1.6490316019260807, "language_loss": 0.79739964, "learning_rate": 3.1736295893167797e-07, "loss": 0.81884122, "num_input_tokens_seen": 293600540, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 13609, "time_per_iteration": 2.582002878189087 }, { "auxiliary_loss_clip": 0.01048779, "auxiliary_loss_mlp": 0.01002338, "balance_loss_clip": 1.00113988, "balance_loss_mlp": 1.00113022, "epoch": 0.8182774688110627, "flos": 72028043245440.0, "grad_norm": 0.7435122621945782, "language_loss": 0.5587033, "learning_rate": 3.171587609968829e-07, "loss": 0.57921445, "num_input_tokens_seen": 293665160, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.21191406, "step": 13610, "time_per_iteration": 3.193124771118164 }, { "auxiliary_loss_clip": 0.01143077, "auxiliary_loss_mlp": 0.01274123, "balance_loss_clip": 1.01577401, "balance_loss_mlp": 1.03348029, "epoch": 0.8183375920637307, "flos": 19755358145280.0, "grad_norm": 1.8541174444483908, "language_loss": 0.77453637, "learning_rate": 3.169546231183622e-07, "loss": 0.79870832, "num_input_tokens_seen": 293683995, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.65234375, "step": 13611, "time_per_iteration": 4.028745412826538 }, { "auxiliary_loss_clip": 0.01134151, "auxiliary_loss_mlp": 0.01035086, "balance_loss_clip": 1.02142477, "balance_loss_mlp": 1.0336324, "epoch": 0.8183977153163986, "flos": 22341895580160.0, "grad_norm": 1.9995062737932854, "language_loss": 0.7709533, "learning_rate": 3.1675054530340275e-07, "loss": 0.79264569, "num_input_tokens_seen": 293704115, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 13612, "time_per_iteration": 2.582103729248047 }, { "auxiliary_loss_clip": 0.0112454, "auxiliary_loss_mlp": 0.01026116, "balance_loss_clip": 1.01460624, "balance_loss_mlp": 1.03226972, "epoch": 0.8184578385690666, "flos": 17712472032000.0, "grad_norm": 1.8963138070764296, "language_loss": 0.86406237, "learning_rate": 3.165465275592858e-07, "loss": 0.88556892, "num_input_tokens_seen": 293722225, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 13613, "time_per_iteration": 4.053312540054321 }, { "auxiliary_loss_clip": 0.01110469, "auxiliary_loss_mlp": 0.01041438, "balance_loss_clip": 1.02991629, "balance_loss_mlp": 1.03508186, "epoch": 0.8185179618217345, "flos": 25701159323520.0, "grad_norm": 1.5614378544476295, "language_loss": 0.72659254, "learning_rate": 3.163425698932927e-07, "loss": 0.74811161, "num_input_tokens_seen": 293743995, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 13614, "time_per_iteration": 2.5556654930114746 }, { "auxiliary_loss_clip": 0.01122978, "auxiliary_loss_mlp": 0.01035511, "balance_loss_clip": 1.02270794, "balance_loss_mlp": 1.03495049, "epoch": 0.8185780850744026, "flos": 25960326929280.0, "grad_norm": 1.7137433943776563, "language_loss": 0.81223458, "learning_rate": 3.161386723127029e-07, "loss": 0.83381951, "num_input_tokens_seen": 293764935, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 13615, "time_per_iteration": 2.5852928161621094 }, { "auxiliary_loss_clip": 0.01111559, "auxiliary_loss_mlp": 0.01031545, "balance_loss_clip": 1.01894462, "balance_loss_mlp": 1.03400254, "epoch": 0.8186382083270705, "flos": 25228431406080.0, "grad_norm": 1.7107026485381747, "language_loss": 0.7590102, "learning_rate": 3.159348348247923e-07, "loss": 0.78044128, "num_input_tokens_seen": 293784035, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 13616, "time_per_iteration": 2.5065395832061768 }, { "auxiliary_loss_clip": 0.01126107, "auxiliary_loss_mlp": 0.01029893, "balance_loss_clip": 1.01725078, "balance_loss_mlp": 1.03223562, "epoch": 0.8186983315797385, "flos": 22415009713920.0, "grad_norm": 1.7197443906151186, "language_loss": 0.75282943, "learning_rate": 3.157310574368355e-07, "loss": 0.77438951, "num_input_tokens_seen": 293803360, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.671875, "step": 13617, "time_per_iteration": 2.547292470932007 }, { "auxiliary_loss_clip": 0.01114414, "auxiliary_loss_mlp": 0.01029652, "balance_loss_clip": 1.01872039, "balance_loss_mlp": 1.03032088, "epoch": 0.8187584548324064, "flos": 22018017623040.0, "grad_norm": 2.0371595337892145, "language_loss": 0.68384326, "learning_rate": 3.1552734015610447e-07, "loss": 0.70528394, "num_input_tokens_seen": 293821325, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6640625, "step": 13618, "time_per_iteration": 2.544964551925659 }, { "auxiliary_loss_clip": 0.01125272, "auxiliary_loss_mlp": 0.01031005, "balance_loss_clip": 1.0183332, "balance_loss_mlp": 1.03501058, "epoch": 0.8188185780850744, "flos": 29241664116480.0, "grad_norm": 1.8899136414136288, "language_loss": 0.70207441, "learning_rate": 3.1532368298987066e-07, "loss": 0.72363722, "num_input_tokens_seen": 293840315, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 13619, "time_per_iteration": 2.6042990684509277 }, { "auxiliary_loss_clip": 0.01150235, "auxiliary_loss_mlp": 0.01032119, "balance_loss_clip": 1.01964974, "balance_loss_mlp": 1.0360719, "epoch": 0.8188787013377423, "flos": 20696504348160.0, "grad_norm": 1.9039035951622834, "language_loss": 0.73751903, "learning_rate": 3.1512008594539996e-07, "loss": 0.75934249, "num_input_tokens_seen": 293855685, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 13620, "time_per_iteration": 2.542691469192505 }, { "auxiliary_loss_clip": 0.01118211, "auxiliary_loss_mlp": 0.01275919, "balance_loss_clip": 1.01749849, "balance_loss_mlp": 1.03292143, "epoch": 0.8189388245904103, "flos": 23732967542400.0, "grad_norm": 1.634479020215103, "language_loss": 0.75951958, "learning_rate": 3.149165490299599e-07, "loss": 0.78346086, "num_input_tokens_seen": 293875540, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 13621, "time_per_iteration": 2.58931040763855 }, { "auxiliary_loss_clip": 0.01119648, "auxiliary_loss_mlp": 0.01025265, "balance_loss_clip": 1.01288557, "balance_loss_mlp": 1.03378642, "epoch": 0.8189989478430784, "flos": 28255090187520.0, "grad_norm": 3.5380899632812226, "language_loss": 0.65854943, "learning_rate": 3.1471307225081335e-07, "loss": 0.67999852, "num_input_tokens_seen": 293896570, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.67578125, "step": 13622, "time_per_iteration": 2.569500684738159 }, { "auxiliary_loss_clip": 0.01105773, "auxiliary_loss_mlp": 0.01028852, "balance_loss_clip": 1.01529813, "balance_loss_mlp": 1.03600526, "epoch": 0.8190590710957463, "flos": 21397696721280.0, "grad_norm": 1.8372347619860943, "language_loss": 0.75128531, "learning_rate": 3.145096556152229e-07, "loss": 0.77263153, "num_input_tokens_seen": 293914680, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6953125, "step": 13623, "time_per_iteration": 2.528935194015503 }, { "auxiliary_loss_clip": 0.01121675, "auxiliary_loss_mlp": 0.01034736, "balance_loss_clip": 1.02196908, "balance_loss_mlp": 1.03403854, "epoch": 0.8191191943484143, "flos": 38796451367040.0, "grad_norm": 2.000599882958361, "language_loss": 0.62992448, "learning_rate": 3.143062991304464e-07, "loss": 0.65148854, "num_input_tokens_seen": 293936480, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 13624, "time_per_iteration": 2.6629974842071533 }, { "auxiliary_loss_clip": 0.01100268, "auxiliary_loss_mlp": 0.01034561, "balance_loss_clip": 1.02299786, "balance_loss_mlp": 1.033476, "epoch": 0.8191793176010822, "flos": 25446516831360.0, "grad_norm": 1.5985798570269465, "language_loss": 0.78197634, "learning_rate": 3.1410300280374234e-07, "loss": 0.8033247, "num_input_tokens_seen": 293957815, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66796875, "step": 13625, "time_per_iteration": 2.5457382202148438 }, { "auxiliary_loss_clip": 0.01117138, "auxiliary_loss_mlp": 0.01277055, "balance_loss_clip": 1.01704788, "balance_loss_mlp": 1.03434169, "epoch": 0.8192394408537502, "flos": 25083029151360.0, "grad_norm": 2.14303241057923, "language_loss": 0.75409079, "learning_rate": 3.138997666423657e-07, "loss": 0.77803272, "num_input_tokens_seen": 293975440, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 13626, "time_per_iteration": 2.566612482070923 }, { "auxiliary_loss_clip": 0.01111811, "auxiliary_loss_mlp": 0.01033094, "balance_loss_clip": 1.02158475, "balance_loss_mlp": 1.03532147, "epoch": 0.8192995641064181, "flos": 27673732563840.0, "grad_norm": 1.4962986657408297, "language_loss": 0.73568976, "learning_rate": 3.136965906535691e-07, "loss": 0.75713879, "num_input_tokens_seen": 293997540, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.67578125, "step": 13627, "time_per_iteration": 2.5878353118896484 }, { "auxiliary_loss_clip": 0.0110379, "auxiliary_loss_mlp": 0.01030798, "balance_loss_clip": 1.01848996, "balance_loss_mlp": 1.03473449, "epoch": 0.8193596873590862, "flos": 21288492397440.0, "grad_norm": 1.5420430022057894, "language_loss": 0.69097543, "learning_rate": 3.1349347484460343e-07, "loss": 0.71232128, "num_input_tokens_seen": 294017030, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 13628, "time_per_iteration": 2.51507568359375 }, { "auxiliary_loss_clip": 0.01126943, "auxiliary_loss_mlp": 0.01030242, "balance_loss_clip": 1.01736188, "balance_loss_mlp": 1.03665495, "epoch": 0.8194198106117541, "flos": 17492626840320.0, "grad_norm": 1.961604833903226, "language_loss": 0.85206932, "learning_rate": 3.1329041922271747e-07, "loss": 0.87364113, "num_input_tokens_seen": 294035700, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 13629, "time_per_iteration": 2.518662214279175 }, { "auxiliary_loss_clip": 0.01101812, "auxiliary_loss_mlp": 0.01024663, "balance_loss_clip": 1.01274848, "balance_loss_mlp": 1.03419292, "epoch": 0.8194799338644221, "flos": 15267925059840.0, "grad_norm": 2.0327955938131432, "language_loss": 0.73637235, "learning_rate": 3.130874237951577e-07, "loss": 0.75763714, "num_input_tokens_seen": 294049730, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 13630, "time_per_iteration": 2.476325035095215 }, { "auxiliary_loss_clip": 0.01113863, "auxiliary_loss_mlp": 0.01030582, "balance_loss_clip": 1.01861382, "balance_loss_mlp": 1.03507686, "epoch": 0.81954005711709, "flos": 14718814871040.0, "grad_norm": 1.989321010020634, "language_loss": 0.72644639, "learning_rate": 3.128844885691679e-07, "loss": 0.74789083, "num_input_tokens_seen": 294066545, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 13631, "time_per_iteration": 2.490722894668579 }, { "auxiliary_loss_clip": 0.01102887, "auxiliary_loss_mlp": 0.01036329, "balance_loss_clip": 1.02458072, "balance_loss_mlp": 1.03343725, "epoch": 0.819600180369758, "flos": 23074042498560.0, "grad_norm": 1.566878297922198, "language_loss": 0.76836646, "learning_rate": 3.126816135519916e-07, "loss": 0.78975862, "num_input_tokens_seen": 294087455, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 13632, "time_per_iteration": 2.47928786277771 }, { "auxiliary_loss_clip": 0.01118611, "auxiliary_loss_mlp": 0.01025474, "balance_loss_clip": 1.01469123, "balance_loss_mlp": 1.03244352, "epoch": 0.8196603036224259, "flos": 27599792417280.0, "grad_norm": 1.665586833037025, "language_loss": 0.65770948, "learning_rate": 3.124787987508684e-07, "loss": 0.67915034, "num_input_tokens_seen": 294107480, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.6796875, "step": 13633, "time_per_iteration": 2.564368486404419 }, { "auxiliary_loss_clip": 0.01108219, "auxiliary_loss_mlp": 0.01033968, "balance_loss_clip": 1.02298319, "balance_loss_mlp": 1.03330564, "epoch": 0.8197204268750939, "flos": 28582020800640.0, "grad_norm": 1.6323585748716594, "language_loss": 0.75377607, "learning_rate": 3.1227604417303563e-07, "loss": 0.77519798, "num_input_tokens_seen": 294130115, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.65625, "step": 13634, "time_per_iteration": 2.561213970184326 }, { "auxiliary_loss_clip": 0.01144244, "auxiliary_loss_mlp": 0.01275221, "balance_loss_clip": 1.01689124, "balance_loss_mlp": 1.03347659, "epoch": 0.819780550127762, "flos": 23258300290560.0, "grad_norm": 2.805331490665847, "language_loss": 0.81936473, "learning_rate": 3.120733498257293e-07, "loss": 0.84355938, "num_input_tokens_seen": 294148495, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66015625, "step": 13635, "time_per_iteration": 2.5779218673706055 }, { "auxiliary_loss_clip": 0.01111886, "auxiliary_loss_mlp": 0.01030177, "balance_loss_clip": 1.01725447, "balance_loss_mlp": 1.0334568, "epoch": 0.8198406733804299, "flos": 26685255214080.0, "grad_norm": 1.8109050418734498, "language_loss": 0.76188159, "learning_rate": 3.1187071571618393e-07, "loss": 0.78330225, "num_input_tokens_seen": 294169595, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 13636, "time_per_iteration": 2.534735918045044 }, { "auxiliary_loss_clip": 0.01126312, "auxiliary_loss_mlp": 0.01035521, "balance_loss_clip": 1.02327824, "balance_loss_mlp": 1.03211856, "epoch": 0.8199007966330979, "flos": 20084084438400.0, "grad_norm": 2.1739483667415085, "language_loss": 0.81083941, "learning_rate": 3.116681418516296e-07, "loss": 0.83245772, "num_input_tokens_seen": 294183885, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 13637, "time_per_iteration": 2.5188121795654297 }, { "auxiliary_loss_clip": 0.01111709, "auxiliary_loss_mlp": 0.01031765, "balance_loss_clip": 1.01981997, "balance_loss_mlp": 1.03517199, "epoch": 0.8199609198857658, "flos": 31902788142720.0, "grad_norm": 2.5676857834673337, "language_loss": 0.7131021, "learning_rate": 3.114656282392969e-07, "loss": 0.73453689, "num_input_tokens_seen": 294200150, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 13638, "time_per_iteration": 2.5381758213043213 }, { "auxiliary_loss_clip": 0.01142238, "auxiliary_loss_mlp": 0.01033965, "balance_loss_clip": 1.01985621, "balance_loss_mlp": 1.03420508, "epoch": 0.8200210431384338, "flos": 26470150617600.0, "grad_norm": 2.051165968762665, "language_loss": 0.79563522, "learning_rate": 3.1126317488641227e-07, "loss": 0.81739724, "num_input_tokens_seen": 294220385, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.72265625, "step": 13639, "time_per_iteration": 3.959136962890625 }, { "auxiliary_loss_clip": 0.01128575, "auxiliary_loss_mlp": 0.01029549, "balance_loss_clip": 1.01747298, "balance_loss_mlp": 1.033324, "epoch": 0.8200811663911017, "flos": 22091454979200.0, "grad_norm": 1.5999861155300772, "language_loss": 0.78919548, "learning_rate": 3.1106078180020135e-07, "loss": 0.81077677, "num_input_tokens_seen": 294239355, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 13640, "time_per_iteration": 2.5234107971191406 }, { "auxiliary_loss_clip": 0.01133889, "auxiliary_loss_mlp": 0.01029696, "balance_loss_clip": 1.01704144, "balance_loss_mlp": 1.0347507, "epoch": 0.8201412896437698, "flos": 37593659520000.0, "grad_norm": 1.731537381440658, "language_loss": 0.63301808, "learning_rate": 3.1085844898788584e-07, "loss": 0.65465391, "num_input_tokens_seen": 294259395, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 13641, "time_per_iteration": 2.6628379821777344 }, { "auxiliary_loss_clip": 0.01137867, "auxiliary_loss_mlp": 0.01029791, "balance_loss_clip": 1.01750612, "balance_loss_mlp": 1.03379226, "epoch": 0.8202014128964377, "flos": 19646333389440.0, "grad_norm": 1.827292833962653, "language_loss": 0.7335186, "learning_rate": 3.10656176456688e-07, "loss": 0.75519514, "num_input_tokens_seen": 294277365, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 13642, "time_per_iteration": 2.550311326980591 }, { "auxiliary_loss_clip": 0.01121348, "auxiliary_loss_mlp": 0.01031474, "balance_loss_clip": 1.01957059, "balance_loss_mlp": 1.03553891, "epoch": 0.8202615361491057, "flos": 31467335564160.0, "grad_norm": 1.6860131178393745, "language_loss": 0.70142162, "learning_rate": 3.1045396421382596e-07, "loss": 0.7229498, "num_input_tokens_seen": 294297555, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 13643, "time_per_iteration": 2.5808651447296143 }, { "auxiliary_loss_clip": 0.01109971, "auxiliary_loss_mlp": 0.0103237, "balance_loss_clip": 1.0209018, "balance_loss_mlp": 1.03405607, "epoch": 0.8203216594017736, "flos": 24715555061760.0, "grad_norm": 1.7700289389765487, "language_loss": 0.6574291, "learning_rate": 3.1025181226651497e-07, "loss": 0.6788525, "num_input_tokens_seen": 294317600, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.671875, "step": 13644, "time_per_iteration": 2.5638179779052734 }, { "auxiliary_loss_clip": 0.01132587, "auxiliary_loss_mlp": 0.01033203, "balance_loss_clip": 1.02095437, "balance_loss_mlp": 1.03494883, "epoch": 0.8203817826544416, "flos": 26031824951040.0, "grad_norm": 1.4127731781100867, "language_loss": 0.70886707, "learning_rate": 3.10049720621971e-07, "loss": 0.73052502, "num_input_tokens_seen": 294340215, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 13645, "time_per_iteration": 4.023597478866577 }, { "auxiliary_loss_clip": 0.01118199, "auxiliary_loss_mlp": 0.01027367, "balance_loss_clip": 1.01572037, "balance_loss_mlp": 1.03317153, "epoch": 0.8204419059071095, "flos": 25954544839680.0, "grad_norm": 1.7617669020172537, "language_loss": 0.71359676, "learning_rate": 3.0984768928740513e-07, "loss": 0.73505247, "num_input_tokens_seen": 294358590, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 13646, "time_per_iteration": 2.568648099899292 }, { "auxiliary_loss_clip": 0.01123335, "auxiliary_loss_mlp": 0.01032102, "balance_loss_clip": 1.01960337, "balance_loss_mlp": 1.03518391, "epoch": 0.8205020291597775, "flos": 23580059345280.0, "grad_norm": 2.4340597449754817, "language_loss": 0.78805494, "learning_rate": 3.0964571827002786e-07, "loss": 0.80960935, "num_input_tokens_seen": 294375825, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 13647, "time_per_iteration": 2.5383665561676025 }, { "auxiliary_loss_clip": 0.01099546, "auxiliary_loss_mlp": 0.0103302, "balance_loss_clip": 1.02159369, "balance_loss_mlp": 1.03470325, "epoch": 0.8205621524124456, "flos": 26799164219520.0, "grad_norm": 1.3711058361477209, "language_loss": 0.67753279, "learning_rate": 3.09443807577046e-07, "loss": 0.69885844, "num_input_tokens_seen": 294398500, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6484375, "step": 13648, "time_per_iteration": 2.5470190048217773 }, { "auxiliary_loss_clip": 0.01119569, "auxiliary_loss_mlp": 0.01028332, "balance_loss_clip": 1.01591027, "balance_loss_mlp": 1.0322473, "epoch": 0.8206222756651135, "flos": 27527863432320.0, "grad_norm": 1.77083643955493, "language_loss": 0.7983675, "learning_rate": 3.092419572156668e-07, "loss": 0.81984651, "num_input_tokens_seen": 294418840, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 13649, "time_per_iteration": 2.814640522003174 }, { "auxiliary_loss_clip": 0.01113814, "auxiliary_loss_mlp": 0.01036122, "balance_loss_clip": 1.02220404, "balance_loss_mlp": 1.03577971, "epoch": 0.8206823989177815, "flos": 21178605715200.0, "grad_norm": 1.6667635563377017, "language_loss": 0.68731999, "learning_rate": 3.090401671930929e-07, "loss": 0.70881933, "num_input_tokens_seen": 294438215, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.6875, "step": 13650, "time_per_iteration": 2.627939224243164 }, { "auxiliary_loss_clip": 0.0112401, "auxiliary_loss_mlp": 0.01033735, "balance_loss_clip": 1.02069998, "balance_loss_mlp": 1.03577662, "epoch": 0.8207425221704494, "flos": 11509622150400.0, "grad_norm": 4.263814801675647, "language_loss": 0.60617125, "learning_rate": 3.088384375165258e-07, "loss": 0.62774873, "num_input_tokens_seen": 294455260, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 13651, "time_per_iteration": 2.646050214767456 }, { "auxiliary_loss_clip": 0.01135681, "auxiliary_loss_mlp": 0.01283528, "balance_loss_clip": 1.02409792, "balance_loss_mlp": 1.03721285, "epoch": 0.8208026454231174, "flos": 19791987039360.0, "grad_norm": 2.2298209887994633, "language_loss": 0.72257787, "learning_rate": 3.086367681931641e-07, "loss": 0.74676996, "num_input_tokens_seen": 294473205, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 13652, "time_per_iteration": 4.036789894104004 }, { "auxiliary_loss_clip": 0.01116271, "auxiliary_loss_mlp": 0.01031644, "balance_loss_clip": 1.02067041, "balance_loss_mlp": 1.03313696, "epoch": 0.8208627686757853, "flos": 15667538843520.0, "grad_norm": 1.8778873294732379, "language_loss": 0.73190701, "learning_rate": 3.0843515923020655e-07, "loss": 0.75338614, "num_input_tokens_seen": 294490645, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.65234375, "step": 13653, "time_per_iteration": 2.5652809143066406 }, { "auxiliary_loss_clip": 0.01143691, "auxiliary_loss_mlp": 0.0103205, "balance_loss_clip": 1.01855576, "balance_loss_mlp": 1.03579867, "epoch": 0.8209228919284534, "flos": 37482659516160.0, "grad_norm": 2.197527940882293, "language_loss": 0.63101244, "learning_rate": 3.0823361063484576e-07, "loss": 0.65276986, "num_input_tokens_seen": 294513500, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 13654, "time_per_iteration": 2.760277509689331 }, { "auxiliary_loss_clip": 0.01130452, "auxiliary_loss_mlp": 0.01029259, "balance_loss_clip": 1.0170995, "balance_loss_mlp": 1.03273368, "epoch": 0.8209830151811213, "flos": 23112969863040.0, "grad_norm": 1.6334199727312662, "language_loss": 0.7021066, "learning_rate": 3.0803212241427636e-07, "loss": 0.72370374, "num_input_tokens_seen": 294535710, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7109375, "step": 13655, "time_per_iteration": 4.053625822067261 }, { "auxiliary_loss_clip": 0.01119541, "auxiliary_loss_mlp": 0.01034874, "balance_loss_clip": 1.02235115, "balance_loss_mlp": 1.03407764, "epoch": 0.8210431384337893, "flos": 21288169175040.0, "grad_norm": 2.0048014277204445, "language_loss": 0.81842691, "learning_rate": 3.078306945756881e-07, "loss": 0.83997107, "num_input_tokens_seen": 294554055, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 13656, "time_per_iteration": 2.642165184020996 }, { "auxiliary_loss_clip": 0.01118574, "auxiliary_loss_mlp": 0.0103654, "balance_loss_clip": 1.02410698, "balance_loss_mlp": 1.03481126, "epoch": 0.8211032616864572, "flos": 11502403516800.0, "grad_norm": 2.123219584363535, "language_loss": 0.73914194, "learning_rate": 3.0762932712626975e-07, "loss": 0.76069307, "num_input_tokens_seen": 294570390, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.66015625, "step": 13657, "time_per_iteration": 2.504094362258911 }, { "auxiliary_loss_clip": 0.01129175, "auxiliary_loss_mlp": 0.01030543, "balance_loss_clip": 1.01824045, "balance_loss_mlp": 1.0338341, "epoch": 0.8211633849391252, "flos": 29821477455360.0, "grad_norm": 1.7929032296932657, "language_loss": 0.55812275, "learning_rate": 3.074280200732073e-07, "loss": 0.57971996, "num_input_tokens_seen": 294593050, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 13658, "time_per_iteration": 2.673478603363037 }, { "auxiliary_loss_clip": 0.01148344, "auxiliary_loss_mlp": 0.01033683, "balance_loss_clip": 1.0212791, "balance_loss_mlp": 1.0336262, "epoch": 0.8212235081917931, "flos": 13115439573120.0, "grad_norm": 1.9884243655336968, "language_loss": 0.78831184, "learning_rate": 3.0722677342368464e-07, "loss": 0.81013203, "num_input_tokens_seen": 294608550, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 13659, "time_per_iteration": 2.6402533054351807 }, { "auxiliary_loss_clip": 0.01104769, "auxiliary_loss_mlp": 0.01026404, "balance_loss_clip": 1.01572275, "balance_loss_mlp": 1.03097868, "epoch": 0.8212836314444611, "flos": 40515351782400.0, "grad_norm": 2.3302317960981314, "language_loss": 0.5979116, "learning_rate": 3.070255871848848e-07, "loss": 0.61922336, "num_input_tokens_seen": 294630380, "router_z_loss_clip": 0.10693359, "router_z_loss_mlp": 0.6484375, "step": 13660, "time_per_iteration": 2.6984784603118896 }, { "auxiliary_loss_clip": 0.01113471, "auxiliary_loss_mlp": 0.0103708, "balance_loss_clip": 1.02361548, "balance_loss_mlp": 1.03377223, "epoch": 0.8213437546971292, "flos": 18770543982720.0, "grad_norm": 2.098774171095997, "language_loss": 0.72253633, "learning_rate": 3.068244613639861e-07, "loss": 0.7440418, "num_input_tokens_seen": 294648655, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 13661, "time_per_iteration": 2.546082019805908 }, { "auxiliary_loss_clip": 0.01113136, "auxiliary_loss_mlp": 0.01033262, "balance_loss_clip": 1.0205369, "balance_loss_mlp": 1.0318538, "epoch": 0.8214038779497971, "flos": 19682279925120.0, "grad_norm": 2.0411405868613857, "language_loss": 0.74885988, "learning_rate": 3.0662339596816746e-07, "loss": 0.77032387, "num_input_tokens_seen": 294666915, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 13662, "time_per_iteration": 2.513211727142334 }, { "auxiliary_loss_clip": 0.01022241, "auxiliary_loss_mlp": 0.00999504, "balance_loss_clip": 0.9983477, "balance_loss_mlp": 1.00101721, "epoch": 0.8214640012024651, "flos": 71602969697280.0, "grad_norm": 0.7416293552195261, "language_loss": 0.53979385, "learning_rate": 3.06422391004604e-07, "loss": 0.56001127, "num_input_tokens_seen": 294731545, "router_z_loss_clip": 0.01153564, "router_z_loss_mlp": 0.21289062, "step": 13663, "time_per_iteration": 3.29788875579834 }, { "auxiliary_loss_clip": 0.01137668, "auxiliary_loss_mlp": 0.01031626, "balance_loss_clip": 1.01907325, "balance_loss_mlp": 1.03286386, "epoch": 0.821524124455133, "flos": 14757203531520.0, "grad_norm": 2.57143926560336, "language_loss": 0.65509939, "learning_rate": 3.062214464804691e-07, "loss": 0.67679232, "num_input_tokens_seen": 294748745, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 13664, "time_per_iteration": 2.5425946712493896 }, { "auxiliary_loss_clip": 0.0111866, "auxiliary_loss_mlp": 0.01029362, "balance_loss_clip": 1.01790619, "balance_loss_mlp": 1.03492761, "epoch": 0.821584247707801, "flos": 25082274965760.0, "grad_norm": 1.4156042790486714, "language_loss": 0.75245011, "learning_rate": 3.0602056240293305e-07, "loss": 0.77393043, "num_input_tokens_seen": 294768955, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 13665, "time_per_iteration": 2.6426947116851807 }, { "auxiliary_loss_clip": 0.01113489, "auxiliary_loss_mlp": 0.01278788, "balance_loss_clip": 1.01995909, "balance_loss_mlp": 1.03487241, "epoch": 0.8216443709604689, "flos": 36830701710720.0, "grad_norm": 2.0635401161704583, "language_loss": 0.65308094, "learning_rate": 3.058197387791666e-07, "loss": 0.67700374, "num_input_tokens_seen": 294789250, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 13666, "time_per_iteration": 2.6504008769989014 }, { "auxiliary_loss_clip": 0.01138026, "auxiliary_loss_mlp": 0.01033307, "balance_loss_clip": 1.02106416, "balance_loss_mlp": 1.03461134, "epoch": 0.821704494213137, "flos": 25081808088960.0, "grad_norm": 1.64003308966461, "language_loss": 0.76914716, "learning_rate": 3.056189756163354e-07, "loss": 0.79086041, "num_input_tokens_seen": 294809760, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 13667, "time_per_iteration": 2.63313364982605 }, { "auxiliary_loss_clip": 0.01132498, "auxiliary_loss_mlp": 0.01033098, "balance_loss_clip": 1.02036023, "balance_loss_mlp": 1.03495431, "epoch": 0.8217646174658049, "flos": 14356117290240.0, "grad_norm": 2.353116583590173, "language_loss": 0.76920986, "learning_rate": 3.054182729216046e-07, "loss": 0.79086578, "num_input_tokens_seen": 294826495, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 13668, "time_per_iteration": 2.486677408218384 }, { "auxiliary_loss_clip": 0.01135541, "auxiliary_loss_mlp": 0.01032973, "balance_loss_clip": 1.02074194, "balance_loss_mlp": 1.03289366, "epoch": 0.8218247407184729, "flos": 22090557139200.0, "grad_norm": 2.2038887483648417, "language_loss": 0.73162031, "learning_rate": 3.052176307021361e-07, "loss": 0.75330544, "num_input_tokens_seen": 294845370, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 13669, "time_per_iteration": 2.6486427783966064 }, { "auxiliary_loss_clip": 0.01111563, "auxiliary_loss_mlp": 0.01027038, "balance_loss_clip": 1.01492679, "balance_loss_mlp": 1.03440487, "epoch": 0.8218848639711408, "flos": 16764035368320.0, "grad_norm": 1.7222854745723344, "language_loss": 0.78769112, "learning_rate": 3.050170489650918e-07, "loss": 0.80907714, "num_input_tokens_seen": 294863740, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 13670, "time_per_iteration": 2.5782501697540283 }, { "auxiliary_loss_clip": 0.01116295, "auxiliary_loss_mlp": 0.01037436, "balance_loss_clip": 1.02417397, "balance_loss_mlp": 1.03612792, "epoch": 0.8219449872238088, "flos": 25994801007360.0, "grad_norm": 1.9979815900627238, "language_loss": 0.75136304, "learning_rate": 3.0481652771762823e-07, "loss": 0.77290034, "num_input_tokens_seen": 294882815, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 13671, "time_per_iteration": 2.602933645248413 }, { "auxiliary_loss_clip": 0.01118184, "auxiliary_loss_mlp": 0.01032766, "balance_loss_clip": 1.01973605, "balance_loss_mlp": 1.03554177, "epoch": 0.8220051104764767, "flos": 20778094091520.0, "grad_norm": 2.2413138875277694, "language_loss": 0.76436979, "learning_rate": 3.0461606696690246e-07, "loss": 0.78587925, "num_input_tokens_seen": 294901985, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7421875, "step": 13672, "time_per_iteration": 2.5693352222442627 }, { "auxiliary_loss_clip": 0.01140477, "auxiliary_loss_mlp": 0.01036534, "balance_loss_clip": 1.02290797, "balance_loss_mlp": 1.03475785, "epoch": 0.8220652337291448, "flos": 14574849160320.0, "grad_norm": 1.9982599585510716, "language_loss": 0.74504745, "learning_rate": 3.0441566672006903e-07, "loss": 0.76681757, "num_input_tokens_seen": 294919705, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.69921875, "step": 13673, "time_per_iteration": 2.5611674785614014 }, { "auxiliary_loss_clip": 0.01123667, "auxiliary_loss_mlp": 0.01031853, "balance_loss_clip": 1.0193541, "balance_loss_mlp": 1.0350337, "epoch": 0.8221253569818128, "flos": 23805866194560.0, "grad_norm": 1.7173005608879661, "language_loss": 0.79373395, "learning_rate": 3.0421532698427753e-07, "loss": 0.81528914, "num_input_tokens_seen": 294939900, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 13674, "time_per_iteration": 2.5718836784362793 }, { "auxiliary_loss_clip": 0.011205, "auxiliary_loss_mlp": 0.01033466, "balance_loss_clip": 1.02123523, "balance_loss_mlp": 1.03357601, "epoch": 0.8221854802344807, "flos": 21288241002240.0, "grad_norm": 1.856549517246756, "language_loss": 0.6993959, "learning_rate": 3.040150477666799e-07, "loss": 0.72093558, "num_input_tokens_seen": 294959110, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 13675, "time_per_iteration": 2.655400037765503 }, { "auxiliary_loss_clip": 0.01110392, "auxiliary_loss_mlp": 0.01038401, "balance_loss_clip": 1.02591419, "balance_loss_mlp": 1.03446507, "epoch": 0.8222456034871487, "flos": 20956785275520.0, "grad_norm": 1.558359003226421, "language_loss": 0.74474174, "learning_rate": 3.038148290744218e-07, "loss": 0.76622969, "num_input_tokens_seen": 294978660, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.66796875, "step": 13676, "time_per_iteration": 2.5526576042175293 }, { "auxiliary_loss_clip": 0.0111989, "auxiliary_loss_mlp": 0.01025986, "balance_loss_clip": 1.01416039, "balance_loss_mlp": 1.03328133, "epoch": 0.8223057267398166, "flos": 21397517153280.0, "grad_norm": 2.611609290387512, "language_loss": 0.80186021, "learning_rate": 3.036146709146503e-07, "loss": 0.82331896, "num_input_tokens_seen": 294998075, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 13677, "time_per_iteration": 2.6073150634765625 }, { "auxiliary_loss_clip": 0.01114925, "auxiliary_loss_mlp": 0.01033514, "balance_loss_clip": 1.02127111, "balance_loss_mlp": 1.03569102, "epoch": 0.8223658499924846, "flos": 15268212368640.0, "grad_norm": 1.7928840294666424, "language_loss": 0.70301652, "learning_rate": 3.034145732945066e-07, "loss": 0.72450089, "num_input_tokens_seen": 295015950, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 13678, "time_per_iteration": 2.58536958694458 }, { "auxiliary_loss_clip": 0.01101646, "auxiliary_loss_mlp": 0.01032692, "balance_loss_clip": 1.02033639, "balance_loss_mlp": 1.03316188, "epoch": 0.8224259732451525, "flos": 31249537447680.0, "grad_norm": 1.5472990152024166, "language_loss": 0.71295756, "learning_rate": 3.0321453622113337e-07, "loss": 0.73430097, "num_input_tokens_seen": 295036800, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 13679, "time_per_iteration": 2.659940481185913 }, { "auxiliary_loss_clip": 0.01118453, "auxiliary_loss_mlp": 0.01027105, "balance_loss_clip": 1.01470709, "balance_loss_mlp": 1.03283441, "epoch": 0.8224860964978206, "flos": 21574628138880.0, "grad_norm": 2.0337683839323977, "language_loss": 0.69909841, "learning_rate": 3.0301455970166847e-07, "loss": 0.72055399, "num_input_tokens_seen": 295055300, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 13680, "time_per_iteration": 2.590291976928711 }, { "auxiliary_loss_clip": 0.01130108, "auxiliary_loss_mlp": 0.01031429, "balance_loss_clip": 1.01994872, "balance_loss_mlp": 1.03545702, "epoch": 0.8225462197504885, "flos": 25483217552640.0, "grad_norm": 2.1293015451276722, "language_loss": 0.59678012, "learning_rate": 3.028146437432488e-07, "loss": 0.61839545, "num_input_tokens_seen": 295076420, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.68359375, "step": 13681, "time_per_iteration": 3.9621734619140625 }, { "auxiliary_loss_clip": 0.01124398, "auxiliary_loss_mlp": 0.01037406, "balance_loss_clip": 1.02512717, "balance_loss_mlp": 1.03474784, "epoch": 0.8226063430031565, "flos": 39385458587520.0, "grad_norm": 1.5162137612572475, "language_loss": 0.69400686, "learning_rate": 3.026147883530084e-07, "loss": 0.71562493, "num_input_tokens_seen": 295100540, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71875, "step": 13682, "time_per_iteration": 2.7596023082733154 }, { "auxiliary_loss_clip": 0.01104389, "auxiliary_loss_mlp": 0.01033359, "balance_loss_clip": 1.01976895, "balance_loss_mlp": 1.03411007, "epoch": 0.8226664662558244, "flos": 22815269942400.0, "grad_norm": 1.8265286904572342, "language_loss": 0.79620266, "learning_rate": 3.024149935380809e-07, "loss": 0.81758016, "num_input_tokens_seen": 295120180, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 13683, "time_per_iteration": 2.57999849319458 }, { "auxiliary_loss_clip": 0.01099003, "auxiliary_loss_mlp": 0.01029786, "balance_loss_clip": 1.01898003, "balance_loss_mlp": 1.03351307, "epoch": 0.8227265895084924, "flos": 25665607837440.0, "grad_norm": 1.8925157956492427, "language_loss": 0.86397797, "learning_rate": 3.022152593055947e-07, "loss": 0.88526583, "num_input_tokens_seen": 295138530, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.65625, "step": 13684, "time_per_iteration": 2.5253407955169678 }, { "auxiliary_loss_clip": 0.01100689, "auxiliary_loss_mlp": 0.01029436, "balance_loss_clip": 1.01774144, "balance_loss_mlp": 1.03421545, "epoch": 0.8227867127611603, "flos": 26179274280960.0, "grad_norm": 1.4857241688022942, "language_loss": 0.79858875, "learning_rate": 3.0201558566267895e-07, "loss": 0.81989002, "num_input_tokens_seen": 295160260, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 13685, "time_per_iteration": 2.590756416320801 }, { "auxiliary_loss_clip": 0.0112924, "auxiliary_loss_mlp": 0.01030534, "balance_loss_clip": 1.01829147, "balance_loss_mlp": 1.03404379, "epoch": 0.8228468360138284, "flos": 22018053536640.0, "grad_norm": 1.526672251497046, "language_loss": 0.68854368, "learning_rate": 3.0181597261645886e-07, "loss": 0.71014142, "num_input_tokens_seen": 295177055, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 13686, "time_per_iteration": 2.5401551723480225 }, { "auxiliary_loss_clip": 0.01129526, "auxiliary_loss_mlp": 0.01037381, "balance_loss_clip": 1.02474451, "balance_loss_mlp": 1.03275299, "epoch": 0.8229069592664963, "flos": 14903359971840.0, "grad_norm": 3.7683478757529074, "language_loss": 0.78252852, "learning_rate": 3.016164201740594e-07, "loss": 0.80419761, "num_input_tokens_seen": 295193870, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 13687, "time_per_iteration": 3.89567232131958 }, { "auxiliary_loss_clip": 0.01112044, "auxiliary_loss_mlp": 0.0103209, "balance_loss_clip": 1.01976371, "balance_loss_mlp": 1.03361249, "epoch": 0.8229670825191643, "flos": 15669478177920.0, "grad_norm": 1.7207974463915212, "language_loss": 0.72740614, "learning_rate": 3.0141692834260067e-07, "loss": 0.74884748, "num_input_tokens_seen": 295211040, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69921875, "step": 13688, "time_per_iteration": 2.489675998687744 }, { "auxiliary_loss_clip": 0.01103128, "auxiliary_loss_mlp": 0.01279272, "balance_loss_clip": 1.0203253, "balance_loss_mlp": 1.03391623, "epoch": 0.8230272057718323, "flos": 23183498217600.0, "grad_norm": 2.085836261318969, "language_loss": 0.73324251, "learning_rate": 3.012174971292021e-07, "loss": 0.75706649, "num_input_tokens_seen": 295231300, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 13689, "time_per_iteration": 2.5474696159362793 }, { "auxiliary_loss_clip": 0.01113838, "auxiliary_loss_mlp": 0.01035437, "balance_loss_clip": 1.02313423, "balance_loss_mlp": 1.034266, "epoch": 0.8230873290245002, "flos": 21032413361280.0, "grad_norm": 1.890678418114159, "language_loss": 0.68807554, "learning_rate": 3.010181265409821e-07, "loss": 0.70956826, "num_input_tokens_seen": 295251045, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 13690, "time_per_iteration": 2.5433549880981445 }, { "auxiliary_loss_clip": 0.01138731, "auxiliary_loss_mlp": 0.01033485, "balance_loss_clip": 1.02147436, "balance_loss_mlp": 1.03413332, "epoch": 0.8231474522771682, "flos": 21250139650560.0, "grad_norm": 1.7761824901187706, "language_loss": 0.8543117, "learning_rate": 3.0081881658505403e-07, "loss": 0.87603384, "num_input_tokens_seen": 295270225, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 13691, "time_per_iteration": 2.640198230743408 }, { "auxiliary_loss_clip": 0.01113147, "auxiliary_loss_mlp": 0.01033315, "balance_loss_clip": 1.0210309, "balance_loss_mlp": 1.03413606, "epoch": 0.8232075755298361, "flos": 23842028211840.0, "grad_norm": 2.0165893226792813, "language_loss": 0.7718935, "learning_rate": 3.0061956726853186e-07, "loss": 0.79335809, "num_input_tokens_seen": 295288950, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 13692, "time_per_iteration": 2.583770751953125 }, { "auxiliary_loss_clip": 0.011108, "auxiliary_loss_mlp": 0.01026502, "balance_loss_clip": 1.01428294, "balance_loss_mlp": 1.0337708, "epoch": 0.8232676987825042, "flos": 21653955325440.0, "grad_norm": 1.5179937616157075, "language_loss": 0.71437156, "learning_rate": 3.0042037859852574e-07, "loss": 0.73574454, "num_input_tokens_seen": 295309405, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 13693, "time_per_iteration": 2.553490400314331 }, { "auxiliary_loss_clip": 0.01125228, "auxiliary_loss_mlp": 0.01034113, "balance_loss_clip": 1.02329469, "balance_loss_mlp": 1.03260469, "epoch": 0.8233278220351721, "flos": 26322701287680.0, "grad_norm": 1.4037277482667343, "language_loss": 0.83815414, "learning_rate": 3.002212505821453e-07, "loss": 0.85974753, "num_input_tokens_seen": 295331115, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.65625, "step": 13694, "time_per_iteration": 4.057621479034424 }, { "auxiliary_loss_clip": 0.01147986, "auxiliary_loss_mlp": 0.01032585, "balance_loss_clip": 1.01981163, "balance_loss_mlp": 1.03360677, "epoch": 0.8233879452878401, "flos": 21725812483200.0, "grad_norm": 1.5233059447834603, "language_loss": 0.76799035, "learning_rate": 3.0002218322649533e-07, "loss": 0.78979605, "num_input_tokens_seen": 295350495, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 13695, "time_per_iteration": 2.5282886028289795 }, { "auxiliary_loss_clip": 0.01121461, "auxiliary_loss_mlp": 0.01032271, "balance_loss_clip": 1.02016521, "balance_loss_mlp": 1.03556538, "epoch": 0.823448068540508, "flos": 20557746109440.0, "grad_norm": 1.8955912045284222, "language_loss": 0.81004846, "learning_rate": 2.9982317653868095e-07, "loss": 0.83158576, "num_input_tokens_seen": 295368225, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 13696, "time_per_iteration": 3.9816412925720215 }, { "auxiliary_loss_clip": 0.01031897, "auxiliary_loss_mlp": 0.01002078, "balance_loss_clip": 1.00080836, "balance_loss_mlp": 1.00180483, "epoch": 0.823508191793176, "flos": 67273688194560.0, "grad_norm": 0.7329907170494189, "language_loss": 0.63867521, "learning_rate": 2.996242305258041e-07, "loss": 0.659015, "num_input_tokens_seen": 295430035, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.21289062, "step": 13697, "time_per_iteration": 3.206627607345581 }, { "auxiliary_loss_clip": 0.01124771, "auxiliary_loss_mlp": 0.0102684, "balance_loss_clip": 1.01528287, "balance_loss_mlp": 1.03288651, "epoch": 0.8235683150458439, "flos": 17928402641280.0, "grad_norm": 1.7559203147344649, "language_loss": 0.72958642, "learning_rate": 2.994253451949647e-07, "loss": 0.75110257, "num_input_tokens_seen": 295447765, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.65625, "step": 13698, "time_per_iteration": 2.5435755252838135 }, { "auxiliary_loss_clip": 0.01131013, "auxiliary_loss_mlp": 0.01027098, "balance_loss_clip": 1.01374066, "balance_loss_mlp": 1.03411889, "epoch": 0.823628438298512, "flos": 18916089891840.0, "grad_norm": 1.903839388683042, "language_loss": 0.7179209, "learning_rate": 2.9922652055325983e-07, "loss": 0.73950201, "num_input_tokens_seen": 295464810, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 13699, "time_per_iteration": 2.564452648162842 }, { "auxiliary_loss_clip": 0.01118934, "auxiliary_loss_mlp": 0.01030993, "balance_loss_clip": 1.01898313, "balance_loss_mlp": 1.03368664, "epoch": 0.8236885615511799, "flos": 18696460181760.0, "grad_norm": 2.3429368229771734, "language_loss": 0.81553423, "learning_rate": 2.9902775660778633e-07, "loss": 0.83703351, "num_input_tokens_seen": 295482605, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.67578125, "step": 13700, "time_per_iteration": 2.552372455596924 }, { "auxiliary_loss_clip": 0.01121007, "auxiliary_loss_mlp": 0.01033707, "balance_loss_clip": 1.02206028, "balance_loss_mlp": 1.03339362, "epoch": 0.8237486848038479, "flos": 23695009845120.0, "grad_norm": 1.6353113624605544, "language_loss": 0.7284435, "learning_rate": 2.988290533656359e-07, "loss": 0.74999064, "num_input_tokens_seen": 295503780, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.703125, "step": 13701, "time_per_iteration": 2.6136207580566406 }, { "auxiliary_loss_clip": 0.01113359, "auxiliary_loss_mlp": 0.01038219, "balance_loss_clip": 1.02531433, "balance_loss_mlp": 1.0332135, "epoch": 0.8238088080565159, "flos": 23441301106560.0, "grad_norm": 1.8806837888117078, "language_loss": 0.69237375, "learning_rate": 2.9863041083390105e-07, "loss": 0.7138896, "num_input_tokens_seen": 295522035, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 13702, "time_per_iteration": 2.575115442276001 }, { "auxiliary_loss_clip": 0.01139386, "auxiliary_loss_mlp": 0.01034119, "balance_loss_clip": 1.02169728, "balance_loss_mlp": 1.03459454, "epoch": 0.8238689313091838, "flos": 22746537267840.0, "grad_norm": 1.8840951479811037, "language_loss": 0.7476781, "learning_rate": 2.9843182901967056e-07, "loss": 0.76941317, "num_input_tokens_seen": 295541190, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 13703, "time_per_iteration": 2.5771305561065674 }, { "auxiliary_loss_clip": 0.01107168, "auxiliary_loss_mlp": 0.01038979, "balance_loss_clip": 1.025437, "balance_loss_mlp": 1.03414583, "epoch": 0.8239290545618518, "flos": 25630092264960.0, "grad_norm": 2.688158971386157, "language_loss": 0.69534826, "learning_rate": 2.98233307930031e-07, "loss": 0.71680975, "num_input_tokens_seen": 295558860, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 13704, "time_per_iteration": 2.535135507583618 }, { "auxiliary_loss_clip": 0.01099952, "auxiliary_loss_mlp": 0.01031006, "balance_loss_clip": 1.01872134, "balance_loss_mlp": 1.03236747, "epoch": 0.8239891778145197, "flos": 26026473824640.0, "grad_norm": 1.797168648911867, "language_loss": 0.64382142, "learning_rate": 2.980348475720673e-07, "loss": 0.66513097, "num_input_tokens_seen": 295578155, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 13705, "time_per_iteration": 2.6031665802001953 }, { "auxiliary_loss_clip": 0.01103922, "auxiliary_loss_mlp": 0.01028618, "balance_loss_clip": 1.01587415, "balance_loss_mlp": 1.03320301, "epoch": 0.8240493010671878, "flos": 21833257040640.0, "grad_norm": 1.70068847189943, "language_loss": 0.69417775, "learning_rate": 2.9783644795286146e-07, "loss": 0.71550316, "num_input_tokens_seen": 295599170, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 13706, "time_per_iteration": 2.5480761528015137 }, { "auxiliary_loss_clip": 0.01124202, "auxiliary_loss_mlp": 0.01031487, "balance_loss_clip": 1.01873755, "balance_loss_mlp": 1.0368402, "epoch": 0.8241094243198557, "flos": 18551919853440.0, "grad_norm": 2.280126259181788, "language_loss": 0.6933831, "learning_rate": 2.9763810907949503e-07, "loss": 0.71493995, "num_input_tokens_seen": 295617465, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 13707, "time_per_iteration": 2.5624425411224365 }, { "auxiliary_loss_clip": 0.01129473, "auxiliary_loss_mlp": 0.01031174, "balance_loss_clip": 1.01838899, "balance_loss_mlp": 1.03303695, "epoch": 0.8241695475725237, "flos": 25447163276160.0, "grad_norm": 1.8359398437141243, "language_loss": 0.79175568, "learning_rate": 2.974398309590447e-07, "loss": 0.81336212, "num_input_tokens_seen": 295634960, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 13708, "time_per_iteration": 2.6836442947387695 }, { "auxiliary_loss_clip": 0.01112238, "auxiliary_loss_mlp": 0.01029565, "balance_loss_clip": 1.01651752, "balance_loss_mlp": 1.03315091, "epoch": 0.8242296708251916, "flos": 18989670902400.0, "grad_norm": 3.4261063674989507, "language_loss": 0.68987608, "learning_rate": 2.972416135985878e-07, "loss": 0.71129405, "num_input_tokens_seen": 295652725, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 13709, "time_per_iteration": 2.7241060733795166 }, { "auxiliary_loss_clip": 0.0113985, "auxiliary_loss_mlp": 0.01035536, "balance_loss_clip": 1.02392542, "balance_loss_mlp": 1.03440571, "epoch": 0.8242897940778596, "flos": 22600883617920.0, "grad_norm": 1.7328399809710995, "language_loss": 0.82200831, "learning_rate": 2.970434570051972e-07, "loss": 0.84376216, "num_input_tokens_seen": 295671195, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 13710, "time_per_iteration": 2.642932415008545 }, { "auxiliary_loss_clip": 0.01111025, "auxiliary_loss_mlp": 0.01035479, "balance_loss_clip": 1.02274108, "balance_loss_mlp": 1.03306854, "epoch": 0.8243499173305275, "flos": 21468153248640.0, "grad_norm": 3.0825341544553106, "language_loss": 0.78065073, "learning_rate": 2.9684536118594604e-07, "loss": 0.8021158, "num_input_tokens_seen": 295689130, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 13711, "time_per_iteration": 2.557671070098877 }, { "auxiliary_loss_clip": 0.0112949, "auxiliary_loss_mlp": 0.01028683, "balance_loss_clip": 1.01633883, "balance_loss_mlp": 1.03410769, "epoch": 0.8244100405831956, "flos": 20010359773440.0, "grad_norm": 1.8491568108171843, "language_loss": 0.6544112, "learning_rate": 2.966473261479019e-07, "loss": 0.67599297, "num_input_tokens_seen": 295706385, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 13712, "time_per_iteration": 2.5323269367218018 }, { "auxiliary_loss_clip": 0.0111265, "auxiliary_loss_mlp": 0.01030859, "balance_loss_clip": 1.01864564, "balance_loss_mlp": 1.0342958, "epoch": 0.8244701638358635, "flos": 26430684549120.0, "grad_norm": 1.6267024467002167, "language_loss": 0.74011028, "learning_rate": 2.964493518981337e-07, "loss": 0.7615453, "num_input_tokens_seen": 295727925, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 13713, "time_per_iteration": 2.6569151878356934 }, { "auxiliary_loss_clip": 0.01111704, "auxiliary_loss_mlp": 0.0102748, "balance_loss_clip": 1.01506436, "balance_loss_mlp": 1.03311968, "epoch": 0.8245302870885315, "flos": 17640004343040.0, "grad_norm": 1.9467204504791962, "language_loss": 0.81143737, "learning_rate": 2.9625143844370603e-07, "loss": 0.83282924, "num_input_tokens_seen": 295744420, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 13714, "time_per_iteration": 2.484917640686035 }, { "auxiliary_loss_clip": 0.01113411, "auxiliary_loss_mlp": 0.01035021, "balance_loss_clip": 1.0216217, "balance_loss_mlp": 1.03556371, "epoch": 0.8245904103411995, "flos": 26209510554240.0, "grad_norm": 2.187406246528711, "language_loss": 0.66230929, "learning_rate": 2.9605358579168193e-07, "loss": 0.6837936, "num_input_tokens_seen": 295765105, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6875, "step": 13715, "time_per_iteration": 2.634934425354004 }, { "auxiliary_loss_clip": 0.01131603, "auxiliary_loss_mlp": 0.01031079, "balance_loss_clip": 1.01872289, "balance_loss_mlp": 1.03489923, "epoch": 0.8246505335938674, "flos": 24205084928640.0, "grad_norm": 1.7972914382354146, "language_loss": 0.74902862, "learning_rate": 2.9585579394912174e-07, "loss": 0.77065539, "num_input_tokens_seen": 295784200, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 13716, "time_per_iteration": 2.5631659030914307 }, { "auxiliary_loss_clip": 0.01049863, "auxiliary_loss_mlp": 0.00999122, "balance_loss_clip": 0.99793571, "balance_loss_mlp": 1.00158107, "epoch": 0.8247106568465354, "flos": 67092195749760.0, "grad_norm": 0.7150453371496793, "language_loss": 0.58998632, "learning_rate": 2.956580629230858e-07, "loss": 0.61047626, "num_input_tokens_seen": 295846555, "router_z_loss_clip": 0.01184082, "router_z_loss_mlp": 0.2109375, "step": 13717, "time_per_iteration": 3.305166006088257 }, { "auxiliary_loss_clip": 0.0111192, "auxiliary_loss_mlp": 0.0127525, "balance_loss_clip": 1.0173744, "balance_loss_mlp": 1.03522229, "epoch": 0.8247707800992033, "flos": 12568232805120.0, "grad_norm": 1.9207848505811758, "language_loss": 0.79295653, "learning_rate": 2.9546039272062897e-07, "loss": 0.81682825, "num_input_tokens_seen": 295863425, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.67578125, "step": 13718, "time_per_iteration": 2.481168508529663 }, { "auxiliary_loss_clip": 0.01109661, "auxiliary_loss_mlp": 0.01037418, "balance_loss_clip": 1.02528298, "balance_loss_mlp": 1.0340482, "epoch": 0.8248309033518714, "flos": 15923617879680.0, "grad_norm": 1.5184130852519988, "language_loss": 0.68531442, "learning_rate": 2.952627833488055e-07, "loss": 0.7067852, "num_input_tokens_seen": 295880925, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6640625, "step": 13719, "time_per_iteration": 2.5349793434143066 }, { "auxiliary_loss_clip": 0.01121978, "auxiliary_loss_mlp": 0.01031351, "balance_loss_clip": 1.01790476, "balance_loss_mlp": 1.0327369, "epoch": 0.8248910266045393, "flos": 17564735393280.0, "grad_norm": 2.312661292951242, "language_loss": 0.69350123, "learning_rate": 2.9506523481466896e-07, "loss": 0.71503448, "num_input_tokens_seen": 295898205, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 13720, "time_per_iteration": 2.545029878616333 }, { "auxiliary_loss_clip": 0.01159019, "auxiliary_loss_mlp": 0.01029151, "balance_loss_clip": 1.01667547, "balance_loss_mlp": 1.03484523, "epoch": 0.8249511498572073, "flos": 28619655275520.0, "grad_norm": 1.9386463662454336, "language_loss": 0.76110673, "learning_rate": 2.948677471252683e-07, "loss": 0.78298837, "num_input_tokens_seen": 295918130, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 13721, "time_per_iteration": 2.707771062850952 }, { "auxiliary_loss_clip": 0.01145122, "auxiliary_loss_mlp": 0.0102762, "balance_loss_clip": 1.01570535, "balance_loss_mlp": 1.03293192, "epoch": 0.8250112731098752, "flos": 25556583081600.0, "grad_norm": 1.9469038361348308, "language_loss": 0.78051198, "learning_rate": 2.9467032028765173e-07, "loss": 0.80223942, "num_input_tokens_seen": 295937760, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 13722, "time_per_iteration": 2.6802268028259277 }, { "auxiliary_loss_clip": 0.01101913, "auxiliary_loss_mlp": 0.01030945, "balance_loss_clip": 1.01899433, "balance_loss_mlp": 1.03483629, "epoch": 0.8250713963625432, "flos": 27746164339200.0, "grad_norm": 2.37308135171142, "language_loss": 0.6273253, "learning_rate": 2.944729543088642e-07, "loss": 0.64865386, "num_input_tokens_seen": 295957585, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 13723, "time_per_iteration": 4.05610728263855 }, { "auxiliary_loss_clip": 0.01104244, "auxiliary_loss_mlp": 0.01034972, "balance_loss_clip": 1.02246106, "balance_loss_mlp": 1.03470588, "epoch": 0.8251315196152111, "flos": 21610610588160.0, "grad_norm": 1.8749277361783627, "language_loss": 0.74386156, "learning_rate": 2.9427564919595107e-07, "loss": 0.76525366, "num_input_tokens_seen": 295977135, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 13724, "time_per_iteration": 2.551490068435669 }, { "auxiliary_loss_clip": 0.01132784, "auxiliary_loss_mlp": 0.01033935, "balance_loss_clip": 1.02111375, "balance_loss_mlp": 1.0357374, "epoch": 0.8251916428678792, "flos": 28579363194240.0, "grad_norm": 2.3387588596191917, "language_loss": 0.6414603, "learning_rate": 2.9407840495595104e-07, "loss": 0.66312754, "num_input_tokens_seen": 295996265, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 13725, "time_per_iteration": 2.570162057876587 }, { "auxiliary_loss_clip": 0.01118276, "auxiliary_loss_mlp": 0.01029576, "balance_loss_clip": 1.01764297, "balance_loss_mlp": 1.03276896, "epoch": 0.8252517661205471, "flos": 23075191733760.0, "grad_norm": 1.6455151727981343, "language_loss": 0.81772161, "learning_rate": 2.938812215959052e-07, "loss": 0.83920014, "num_input_tokens_seen": 296014745, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 13726, "time_per_iteration": 2.5215508937835693 }, { "auxiliary_loss_clip": 0.01104672, "auxiliary_loss_mlp": 0.01032513, "balance_loss_clip": 1.02089632, "balance_loss_mlp": 1.03016365, "epoch": 0.8253118893732151, "flos": 31759576617600.0, "grad_norm": 2.3218741687418722, "language_loss": 0.70382524, "learning_rate": 2.9368409912284997e-07, "loss": 0.72519708, "num_input_tokens_seen": 296036960, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.65625, "step": 13727, "time_per_iteration": 2.562730550765991 }, { "auxiliary_loss_clip": 0.01114401, "auxiliary_loss_mlp": 0.01030279, "balance_loss_clip": 1.0171001, "balance_loss_mlp": 1.03532541, "epoch": 0.8253720126258831, "flos": 18296415434880.0, "grad_norm": 1.935826559608398, "language_loss": 0.62693071, "learning_rate": 2.934870375438201e-07, "loss": 0.64837742, "num_input_tokens_seen": 296056540, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 13728, "time_per_iteration": 3.8993170261383057 }, { "auxiliary_loss_clip": 0.01124816, "auxiliary_loss_mlp": 0.01031178, "balance_loss_clip": 1.01864326, "balance_loss_mlp": 1.03534579, "epoch": 0.825432135878551, "flos": 26797332625920.0, "grad_norm": 1.9580141504694504, "language_loss": 0.71584058, "learning_rate": 2.9329003686584775e-07, "loss": 0.73740053, "num_input_tokens_seen": 296077950, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 13729, "time_per_iteration": 2.6242260932922363 }, { "auxiliary_loss_clip": 0.01040186, "auxiliary_loss_mlp": 0.01000159, "balance_loss_clip": 0.99887723, "balance_loss_mlp": 1.00118816, "epoch": 0.825492259131219, "flos": 54219116217600.0, "grad_norm": 0.8784626496426932, "language_loss": 0.62727362, "learning_rate": 2.9309309709596396e-07, "loss": 0.64767706, "num_input_tokens_seen": 296127060, "router_z_loss_clip": 0.01281738, "router_z_loss_mlp": 0.21191406, "step": 13730, "time_per_iteration": 2.97711443901062 }, { "auxiliary_loss_clip": 0.01137922, "auxiliary_loss_mlp": 0.01032635, "balance_loss_clip": 1.02026105, "balance_loss_mlp": 1.03428054, "epoch": 0.825552382383887, "flos": 43756145493120.0, "grad_norm": 2.217108770560487, "language_loss": 0.63135684, "learning_rate": 2.9289621824119716e-07, "loss": 0.6530624, "num_input_tokens_seen": 296147775, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 13731, "time_per_iteration": 2.79964280128479 }, { "auxiliary_loss_clip": 0.01133909, "auxiliary_loss_mlp": 0.01277773, "balance_loss_clip": 1.01856756, "balance_loss_mlp": 1.03352857, "epoch": 0.825612505636555, "flos": 12602814624000.0, "grad_norm": 2.020949923155693, "language_loss": 0.69702113, "learning_rate": 2.92699400308573e-07, "loss": 0.72113788, "num_input_tokens_seen": 296163560, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.73046875, "step": 13732, "time_per_iteration": 2.5057709217071533 }, { "auxiliary_loss_clip": 0.01113629, "auxiliary_loss_mlp": 0.01037684, "balance_loss_clip": 1.02511311, "balance_loss_mlp": 1.03315091, "epoch": 0.8256726288892229, "flos": 17595618111360.0, "grad_norm": 2.224324511560802, "language_loss": 0.70853865, "learning_rate": 2.925026433051156e-07, "loss": 0.73005182, "num_input_tokens_seen": 296178730, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 13733, "time_per_iteration": 2.5242156982421875 }, { "auxiliary_loss_clip": 0.01099479, "auxiliary_loss_mlp": 0.0103169, "balance_loss_clip": 1.01925659, "balance_loss_mlp": 1.03186464, "epoch": 0.8257327521418909, "flos": 23805794367360.0, "grad_norm": 1.4283232471458285, "language_loss": 0.82554537, "learning_rate": 2.923059472378466e-07, "loss": 0.84685701, "num_input_tokens_seen": 296200175, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 13734, "time_per_iteration": 2.5882833003997803 }, { "auxiliary_loss_clip": 0.01116973, "auxiliary_loss_mlp": 0.01029255, "balance_loss_clip": 1.01683354, "balance_loss_mlp": 1.03260243, "epoch": 0.8257928753945588, "flos": 19281121856640.0, "grad_norm": 1.7562238597888822, "language_loss": 0.82921469, "learning_rate": 2.921093121137856e-07, "loss": 0.85067689, "num_input_tokens_seen": 296219305, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6640625, "step": 13735, "time_per_iteration": 2.5486295223236084 }, { "auxiliary_loss_clip": 0.01115368, "auxiliary_loss_mlp": 0.01030451, "balance_loss_clip": 1.0191679, "balance_loss_mlp": 1.03665018, "epoch": 0.8258529986472268, "flos": 18478841633280.0, "grad_norm": 1.714045260391489, "language_loss": 0.7098853, "learning_rate": 2.9191273793994976e-07, "loss": 0.73134351, "num_input_tokens_seen": 296236945, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6953125, "step": 13736, "time_per_iteration": 4.011132717132568 }, { "auxiliary_loss_clip": 0.01030713, "auxiliary_loss_mlp": 0.01000296, "balance_loss_clip": 0.99915773, "balance_loss_mlp": 1.00085235, "epoch": 0.8259131218998947, "flos": 62137957512960.0, "grad_norm": 0.8534637442511477, "language_loss": 0.67957568, "learning_rate": 2.917162247233549e-07, "loss": 0.69988573, "num_input_tokens_seen": 296294685, "router_z_loss_clip": 0.01141357, "router_z_loss_mlp": 0.2109375, "step": 13737, "time_per_iteration": 4.671741247177124 }, { "auxiliary_loss_clip": 0.01121947, "auxiliary_loss_mlp": 0.01031119, "balance_loss_clip": 1.01933551, "balance_loss_mlp": 1.03455412, "epoch": 0.8259732451525628, "flos": 22159038418560.0, "grad_norm": 1.91011878959777, "language_loss": 0.69488084, "learning_rate": 2.9151977247101366e-07, "loss": 0.71641147, "num_input_tokens_seen": 296314790, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 13738, "time_per_iteration": 2.5785999298095703 }, { "auxiliary_loss_clip": 0.011249, "auxiliary_loss_mlp": 0.01033484, "balance_loss_clip": 1.01945949, "balance_loss_mlp": 1.03661466, "epoch": 0.8260333684052307, "flos": 38361645233280.0, "grad_norm": 2.3363165922837674, "language_loss": 0.63145959, "learning_rate": 2.9132338118993716e-07, "loss": 0.65304351, "num_input_tokens_seen": 296335355, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.70703125, "step": 13739, "time_per_iteration": 2.68532395362854 }, { "auxiliary_loss_clip": 0.01039823, "auxiliary_loss_mlp": 0.01001678, "balance_loss_clip": 1.00047958, "balance_loss_mlp": 1.0008359, "epoch": 0.8260934916578987, "flos": 62185611882240.0, "grad_norm": 0.8363272118625317, "language_loss": 0.59410739, "learning_rate": 2.911270508871333e-07, "loss": 0.61452246, "num_input_tokens_seen": 296399885, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.21289062, "step": 13740, "time_per_iteration": 3.2032575607299805 }, { "auxiliary_loss_clip": 0.01108819, "auxiliary_loss_mlp": 0.01033519, "balance_loss_clip": 1.02025664, "balance_loss_mlp": 1.03568935, "epoch": 0.8261536149105667, "flos": 22565475786240.0, "grad_norm": 1.899044328048422, "language_loss": 0.75504494, "learning_rate": 2.909307815696105e-07, "loss": 0.77646828, "num_input_tokens_seen": 296417660, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 13741, "time_per_iteration": 2.5755815505981445 }, { "auxiliary_loss_clip": 0.01111116, "auxiliary_loss_mlp": 0.01034162, "balance_loss_clip": 1.02137101, "balance_loss_mlp": 1.03335071, "epoch": 0.8262137381632346, "flos": 32525479342080.0, "grad_norm": 1.908949922910124, "language_loss": 0.62639076, "learning_rate": 2.907345732443707e-07, "loss": 0.64784348, "num_input_tokens_seen": 296438255, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 13742, "time_per_iteration": 2.613391399383545 }, { "auxiliary_loss_clip": 0.01144188, "auxiliary_loss_mlp": 0.01038343, "balance_loss_clip": 1.02370369, "balance_loss_mlp": 1.03494692, "epoch": 0.8262738614159026, "flos": 14136451666560.0, "grad_norm": 4.216911137671753, "language_loss": 0.66073585, "learning_rate": 2.905384259184176e-07, "loss": 0.68256116, "num_input_tokens_seen": 296454485, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.734375, "step": 13743, "time_per_iteration": 2.60788893699646 }, { "auxiliary_loss_clip": 0.01123053, "auxiliary_loss_mlp": 0.0103413, "balance_loss_clip": 1.02149963, "balance_loss_mlp": 1.03551018, "epoch": 0.8263339846685706, "flos": 19825347795840.0, "grad_norm": 1.778106496260356, "language_loss": 0.66744125, "learning_rate": 2.9034233959875097e-07, "loss": 0.68901306, "num_input_tokens_seen": 296473740, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 13744, "time_per_iteration": 2.532461166381836 }, { "auxiliary_loss_clip": 0.01130485, "auxiliary_loss_mlp": 0.01029295, "balance_loss_clip": 1.01744008, "balance_loss_mlp": 1.03531671, "epoch": 0.8263941079212386, "flos": 22745962650240.0, "grad_norm": 1.9612811192609942, "language_loss": 0.75397754, "learning_rate": 2.9014631429236836e-07, "loss": 0.77557534, "num_input_tokens_seen": 296493355, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 13745, "time_per_iteration": 2.5798823833465576 }, { "auxiliary_loss_clip": 0.01118716, "auxiliary_loss_mlp": 0.01278303, "balance_loss_clip": 1.02035487, "balance_loss_mlp": 1.03388166, "epoch": 0.8264542311739065, "flos": 20120641505280.0, "grad_norm": 1.80963504047039, "language_loss": 0.78620511, "learning_rate": 2.899503500062652e-07, "loss": 0.8101753, "num_input_tokens_seen": 296510520, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66796875, "step": 13746, "time_per_iteration": 2.581465721130371 }, { "auxiliary_loss_clip": 0.01117763, "auxiliary_loss_mlp": 0.01036059, "balance_loss_clip": 1.0229578, "balance_loss_mlp": 1.03697991, "epoch": 0.8265143544265745, "flos": 16837149502080.0, "grad_norm": 2.2533499920833946, "language_loss": 0.67893481, "learning_rate": 2.897544467474358e-07, "loss": 0.70047301, "num_input_tokens_seen": 296528265, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 13747, "time_per_iteration": 2.6571574211120605 }, { "auxiliary_loss_clip": 0.01113923, "auxiliary_loss_mlp": 0.01032277, "balance_loss_clip": 1.01922321, "balance_loss_mlp": 1.0343262, "epoch": 0.8265744776792424, "flos": 22018592240640.0, "grad_norm": 2.180467950504755, "language_loss": 0.75494772, "learning_rate": 2.895586045228711e-07, "loss": 0.77640975, "num_input_tokens_seen": 296547810, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 13748, "time_per_iteration": 2.5437371730804443 }, { "auxiliary_loss_clip": 0.01126235, "auxiliary_loss_mlp": 0.01032984, "balance_loss_clip": 1.02211261, "balance_loss_mlp": 1.03440619, "epoch": 0.8266346009319104, "flos": 19244852098560.0, "grad_norm": 1.7943570344263293, "language_loss": 0.63364106, "learning_rate": 2.893628233395593e-07, "loss": 0.65523326, "num_input_tokens_seen": 296565940, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.65234375, "step": 13749, "time_per_iteration": 2.5772321224212646 }, { "auxiliary_loss_clip": 0.01118152, "auxiliary_loss_mlp": 0.01027699, "balance_loss_clip": 1.01606977, "balance_loss_mlp": 1.03254199, "epoch": 0.8266947241845783, "flos": 24166768095360.0, "grad_norm": 1.6870920926894835, "language_loss": 0.73684871, "learning_rate": 2.8916710320448843e-07, "loss": 0.75830722, "num_input_tokens_seen": 296585090, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 13750, "time_per_iteration": 2.5566036701202393 }, { "auxiliary_loss_clip": 0.01113886, "auxiliary_loss_mlp": 0.01033626, "balance_loss_clip": 1.02112699, "balance_loss_mlp": 1.03534031, "epoch": 0.8267548474372464, "flos": 21105814803840.0, "grad_norm": 4.698940073638504, "language_loss": 0.66108704, "learning_rate": 2.8897144412464293e-07, "loss": 0.68256211, "num_input_tokens_seen": 296604950, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 13751, "time_per_iteration": 2.6811790466308594 }, { "auxiliary_loss_clip": 0.01116868, "auxiliary_loss_mlp": 0.0102875, "balance_loss_clip": 1.01704955, "balance_loss_mlp": 1.03184354, "epoch": 0.8268149706899143, "flos": 39968288668800.0, "grad_norm": 1.5111539942017385, "language_loss": 0.60567391, "learning_rate": 2.8877584610700534e-07, "loss": 0.62713009, "num_input_tokens_seen": 296627780, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 13752, "time_per_iteration": 2.7217161655426025 }, { "auxiliary_loss_clip": 0.01031023, "auxiliary_loss_mlp": 0.00999535, "balance_loss_clip": 0.99836713, "balance_loss_mlp": 1.00122368, "epoch": 0.8268750939425823, "flos": 65782423244160.0, "grad_norm": 0.7724542997790987, "language_loss": 0.57514745, "learning_rate": 2.885803091585555e-07, "loss": 0.59545302, "num_input_tokens_seen": 296683850, "router_z_loss_clip": 0.01165771, "router_z_loss_mlp": 0.2109375, "step": 13753, "time_per_iteration": 3.1886417865753174 }, { "auxiliary_loss_clip": 0.01058141, "auxiliary_loss_mlp": 0.01247068, "balance_loss_clip": 1.0, "balance_loss_mlp": 1.00118399, "epoch": 0.8269352171952503, "flos": 52981455242880.0, "grad_norm": 0.6993794626130088, "language_loss": 0.54974484, "learning_rate": 2.883848332862724e-07, "loss": 0.57279694, "num_input_tokens_seen": 296741420, "router_z_loss_clip": 0.01153564, "router_z_loss_mlp": 0.21289062, "step": 13754, "time_per_iteration": 3.2583913803100586 }, { "auxiliary_loss_clip": 0.01125403, "auxiliary_loss_mlp": 0.01028671, "balance_loss_clip": 1.01817453, "balance_loss_mlp": 1.03164756, "epoch": 0.8269953404479182, "flos": 23076125487360.0, "grad_norm": 2.027704576607424, "language_loss": 0.69241261, "learning_rate": 2.8818941849713205e-07, "loss": 0.71395326, "num_input_tokens_seen": 296759620, "router_z_loss_clip": 0.10498047, "router_z_loss_mlp": 0.66796875, "step": 13755, "time_per_iteration": 2.585956573486328 }, { "auxiliary_loss_clip": 0.01136803, "auxiliary_loss_mlp": 0.01029596, "balance_loss_clip": 1.01845598, "balance_loss_mlp": 1.03398013, "epoch": 0.8270554637005862, "flos": 14209996763520.0, "grad_norm": 1.906488940542906, "language_loss": 0.69710779, "learning_rate": 2.8799406479810785e-07, "loss": 0.71877176, "num_input_tokens_seen": 296777275, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.67578125, "step": 13756, "time_per_iteration": 2.5691332817077637 }, { "auxiliary_loss_clip": 0.01114323, "auxiliary_loss_mlp": 0.01031539, "balance_loss_clip": 1.01908743, "balance_loss_mlp": 1.03618348, "epoch": 0.8271155869532542, "flos": 21762046327680.0, "grad_norm": 1.890545984830979, "language_loss": 0.72246277, "learning_rate": 2.877987721961714e-07, "loss": 0.7439214, "num_input_tokens_seen": 296796655, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 13757, "time_per_iteration": 2.5288825035095215 }, { "auxiliary_loss_clip": 0.01115724, "auxiliary_loss_mlp": 0.01040437, "balance_loss_clip": 1.02591765, "balance_loss_mlp": 1.03506613, "epoch": 0.8271757102059222, "flos": 15120475729920.0, "grad_norm": 2.2364611277388327, "language_loss": 0.69226611, "learning_rate": 2.876035406982933e-07, "loss": 0.71382773, "num_input_tokens_seen": 296813705, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.71875, "step": 13758, "time_per_iteration": 2.4740684032440186 }, { "auxiliary_loss_clip": 0.01115633, "auxiliary_loss_mlp": 0.01029652, "balance_loss_clip": 1.01778436, "balance_loss_mlp": 1.03227222, "epoch": 0.8272358334585901, "flos": 24133730561280.0, "grad_norm": 1.94302456369319, "language_loss": 0.69461536, "learning_rate": 2.8740837031143916e-07, "loss": 0.71606821, "num_input_tokens_seen": 296833985, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.66015625, "step": 13759, "time_per_iteration": 2.6018741130828857 }, { "auxiliary_loss_clip": 0.01117963, "auxiliary_loss_mlp": 0.01030633, "balance_loss_clip": 1.01825285, "balance_loss_mlp": 1.03284526, "epoch": 0.8272959567112581, "flos": 18990712396800.0, "grad_norm": 1.6306379837200753, "language_loss": 0.71099555, "learning_rate": 2.872132610425753e-07, "loss": 0.73248148, "num_input_tokens_seen": 296850150, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.67578125, "step": 13760, "time_per_iteration": 2.5266499519348145 }, { "auxiliary_loss_clip": 0.01128576, "auxiliary_loss_mlp": 0.01030678, "balance_loss_clip": 1.01899004, "balance_loss_mlp": 1.03565311, "epoch": 0.827356079963926, "flos": 20631614428800.0, "grad_norm": 1.8633194637796668, "language_loss": 0.77577484, "learning_rate": 2.8701821289866443e-07, "loss": 0.79736745, "num_input_tokens_seen": 296869585, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 13761, "time_per_iteration": 2.608602285385132 }, { "auxiliary_loss_clip": 0.01126347, "auxiliary_loss_mlp": 0.01031365, "balance_loss_clip": 1.02002215, "balance_loss_mlp": 1.03371644, "epoch": 0.827416203216594, "flos": 22416625825920.0, "grad_norm": 1.7784487908239468, "language_loss": 0.70157206, "learning_rate": 2.868232258866672e-07, "loss": 0.72314918, "num_input_tokens_seen": 296887710, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6640625, "step": 13762, "time_per_iteration": 2.603119134902954 }, { "auxiliary_loss_clip": 0.01136123, "auxiliary_loss_mlp": 0.01023878, "balance_loss_clip": 1.01219583, "balance_loss_mlp": 1.03214073, "epoch": 0.827476326469262, "flos": 19026192055680.0, "grad_norm": 2.1678750157115485, "language_loss": 0.70164633, "learning_rate": 2.866283000135423e-07, "loss": 0.72324634, "num_input_tokens_seen": 296906265, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 13763, "time_per_iteration": 2.5883867740631104 }, { "auxiliary_loss_clip": 0.01049073, "auxiliary_loss_mlp": 0.00999788, "balance_loss_clip": 0.99847049, "balance_loss_mlp": 1.00114131, "epoch": 0.82753644972193, "flos": 68500575089280.0, "grad_norm": 0.8411375830886865, "language_loss": 0.65145653, "learning_rate": 2.8643343528624565e-07, "loss": 0.67194515, "num_input_tokens_seen": 296971290, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.21191406, "step": 13764, "time_per_iteration": 4.665590524673462 }, { "auxiliary_loss_clip": 0.01113497, "auxiliary_loss_mlp": 0.01034469, "balance_loss_clip": 1.02141583, "balance_loss_mlp": 1.0349946, "epoch": 0.8275965729745979, "flos": 18405404277120.0, "grad_norm": 2.1752009674766897, "language_loss": 0.77880865, "learning_rate": 2.862386317117327e-07, "loss": 0.80028832, "num_input_tokens_seen": 296989060, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 13765, "time_per_iteration": 2.5282485485076904 }, { "auxiliary_loss_clip": 0.01151057, "auxiliary_loss_mlp": 0.01032346, "balance_loss_clip": 1.01931703, "balance_loss_mlp": 1.03505206, "epoch": 0.8276566962272659, "flos": 28512067063680.0, "grad_norm": 2.382856543742129, "language_loss": 0.62472153, "learning_rate": 2.8604388929695367e-07, "loss": 0.64655566, "num_input_tokens_seen": 297011300, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 13766, "time_per_iteration": 2.6318297386169434 }, { "auxiliary_loss_clip": 0.01129462, "auxiliary_loss_mlp": 0.01032847, "balance_loss_clip": 1.01991224, "balance_loss_mlp": 1.03368187, "epoch": 0.8277168194799339, "flos": 20230240878720.0, "grad_norm": 1.6938176815536916, "language_loss": 0.82397926, "learning_rate": 2.858492080488599e-07, "loss": 0.84560233, "num_input_tokens_seen": 297030350, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 13767, "time_per_iteration": 2.5598888397216797 }, { "auxiliary_loss_clip": 0.01123789, "auxiliary_loss_mlp": 0.01030128, "balance_loss_clip": 1.01753354, "balance_loss_mlp": 1.03573012, "epoch": 0.8277769427326018, "flos": 28476623318400.0, "grad_norm": 1.7607406827930479, "language_loss": 0.69086742, "learning_rate": 2.856545879743986e-07, "loss": 0.71240664, "num_input_tokens_seen": 297049710, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 13768, "time_per_iteration": 2.623090982437134 }, { "auxiliary_loss_clip": 0.01144119, "auxiliary_loss_mlp": 0.01028389, "balance_loss_clip": 1.01677203, "balance_loss_mlp": 1.03196836, "epoch": 0.8278370659852698, "flos": 27197628768000.0, "grad_norm": 2.1155138195968854, "language_loss": 0.7457279, "learning_rate": 2.8546002908051516e-07, "loss": 0.76745296, "num_input_tokens_seen": 297070510, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 13769, "time_per_iteration": 2.724835157394409 }, { "auxiliary_loss_clip": 0.01119524, "auxiliary_loss_mlp": 0.01029328, "balance_loss_clip": 1.01694846, "balance_loss_mlp": 1.03268373, "epoch": 0.8278971892379378, "flos": 37816126404480.0, "grad_norm": 1.9645035862996805, "language_loss": 0.7418555, "learning_rate": 2.8526553137415256e-07, "loss": 0.76334405, "num_input_tokens_seen": 297092585, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 13770, "time_per_iteration": 4.088689088821411 }, { "auxiliary_loss_clip": 0.01130662, "auxiliary_loss_mlp": 0.01033066, "balance_loss_clip": 1.0202632, "balance_loss_mlp": 1.03476334, "epoch": 0.8279573124906058, "flos": 22560160573440.0, "grad_norm": 1.5955213995702766, "language_loss": 0.75786304, "learning_rate": 2.850710948622532e-07, "loss": 0.77950025, "num_input_tokens_seen": 297110055, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 13771, "time_per_iteration": 2.559666395187378 }, { "auxiliary_loss_clip": 0.01101861, "auxiliary_loss_mlp": 0.01031785, "balance_loss_clip": 1.0193516, "balance_loss_mlp": 1.03359306, "epoch": 0.8280174357432737, "flos": 36064619418240.0, "grad_norm": 1.556934482598552, "language_loss": 0.72757632, "learning_rate": 2.848767195517543e-07, "loss": 0.74891275, "num_input_tokens_seen": 297132170, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 13772, "time_per_iteration": 2.6345174312591553 }, { "auxiliary_loss_clip": 0.0111887, "auxiliary_loss_mlp": 0.01029283, "balance_loss_clip": 1.01631904, "balance_loss_mlp": 1.03395057, "epoch": 0.8280775589959417, "flos": 22961067246720.0, "grad_norm": 2.2815505884486598, "language_loss": 0.74516809, "learning_rate": 2.84682405449594e-07, "loss": 0.7666496, "num_input_tokens_seen": 297149515, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6796875, "step": 13773, "time_per_iteration": 2.57177472114563 }, { "auxiliary_loss_clip": 0.01137514, "auxiliary_loss_mlp": 0.01035779, "balance_loss_clip": 1.02397776, "balance_loss_mlp": 1.03474879, "epoch": 0.8281376822486096, "flos": 26063282286720.0, "grad_norm": 1.7075000319216316, "language_loss": 0.75844395, "learning_rate": 2.8448815256270563e-07, "loss": 0.78017688, "num_input_tokens_seen": 297170320, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 13774, "time_per_iteration": 2.6376965045928955 }, { "auxiliary_loss_clip": 0.01133905, "auxiliary_loss_mlp": 0.01273203, "balance_loss_clip": 1.01413536, "balance_loss_mlp": 1.03599644, "epoch": 0.8281978055012776, "flos": 20667776446080.0, "grad_norm": 1.670143766526899, "language_loss": 0.75234109, "learning_rate": 2.8429396089802346e-07, "loss": 0.77641213, "num_input_tokens_seen": 297189935, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71875, "step": 13775, "time_per_iteration": 2.6285674571990967 }, { "auxiliary_loss_clip": 0.01106415, "auxiliary_loss_mlp": 0.01029649, "balance_loss_clip": 1.01644027, "balance_loss_mlp": 1.03400624, "epoch": 0.8282579287539455, "flos": 29315281040640.0, "grad_norm": 1.878766874272837, "language_loss": 0.73390633, "learning_rate": 2.840998304624753e-07, "loss": 0.75526696, "num_input_tokens_seen": 297210885, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 13776, "time_per_iteration": 2.6105706691741943 }, { "auxiliary_loss_clip": 0.01139204, "auxiliary_loss_mlp": 0.01027668, "balance_loss_clip": 1.01566398, "balance_loss_mlp": 1.03291667, "epoch": 0.8283180520066136, "flos": 15706178899200.0, "grad_norm": 1.9916822550359918, "language_loss": 0.77527106, "learning_rate": 2.8390576126299095e-07, "loss": 0.79693979, "num_input_tokens_seen": 297228500, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 13777, "time_per_iteration": 4.003324508666992 }, { "auxiliary_loss_clip": 0.01109317, "auxiliary_loss_mlp": 0.01028806, "balance_loss_clip": 1.01767778, "balance_loss_mlp": 1.03455544, "epoch": 0.8283781752592815, "flos": 24791470456320.0, "grad_norm": 1.671363655842356, "language_loss": 0.82582122, "learning_rate": 2.8371175330649633e-07, "loss": 0.84720248, "num_input_tokens_seen": 297249470, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.66015625, "step": 13778, "time_per_iteration": 2.5300095081329346 }, { "auxiliary_loss_clip": 0.01107609, "auxiliary_loss_mlp": 0.01025821, "balance_loss_clip": 1.01368535, "balance_loss_mlp": 1.03279066, "epoch": 0.8284382985119495, "flos": 18982811404800.0, "grad_norm": 2.0600489771498594, "language_loss": 0.74536461, "learning_rate": 2.8351780659991307e-07, "loss": 0.7666989, "num_input_tokens_seen": 297265970, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.66015625, "step": 13779, "time_per_iteration": 4.03974723815918 }, { "auxiliary_loss_clip": 0.01049061, "auxiliary_loss_mlp": 0.01001584, "balance_loss_clip": 1.00045764, "balance_loss_mlp": 1.00116694, "epoch": 0.8284984217646175, "flos": 61034460814080.0, "grad_norm": 0.9765204529626837, "language_loss": 0.56787521, "learning_rate": 2.833239211501648e-07, "loss": 0.58838171, "num_input_tokens_seen": 297325525, "router_z_loss_clip": 0.0112915, "router_z_loss_mlp": 0.21289062, "step": 13780, "time_per_iteration": 3.179823637008667 }, { "auxiliary_loss_clip": 0.01129542, "auxiliary_loss_mlp": 0.01025017, "balance_loss_clip": 1.01223731, "balance_loss_mlp": 1.03298688, "epoch": 0.8285585450172854, "flos": 19714635100800.0, "grad_norm": 1.764757454044531, "language_loss": 0.79830849, "learning_rate": 2.8313009696416945e-07, "loss": 0.81985408, "num_input_tokens_seen": 297345025, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 13781, "time_per_iteration": 2.593015432357788 }, { "auxiliary_loss_clip": 0.01141179, "auxiliary_loss_mlp": 0.01032202, "balance_loss_clip": 1.01908922, "balance_loss_mlp": 1.03585386, "epoch": 0.8286186682699535, "flos": 21688896280320.0, "grad_norm": 2.327420766681422, "language_loss": 0.75853777, "learning_rate": 2.8293633404884533e-07, "loss": 0.78027159, "num_input_tokens_seen": 297363570, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 13782, "time_per_iteration": 2.622987747192383 }, { "auxiliary_loss_clip": 0.01109077, "auxiliary_loss_mlp": 0.01029861, "balance_loss_clip": 1.01746905, "balance_loss_mlp": 1.03120089, "epoch": 0.8286787915226214, "flos": 25775566346880.0, "grad_norm": 1.840305882034367, "language_loss": 0.76172435, "learning_rate": 2.82742632411106e-07, "loss": 0.78311372, "num_input_tokens_seen": 297385385, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 13783, "time_per_iteration": 2.576157569885254 }, { "auxiliary_loss_clip": 0.01102537, "auxiliary_loss_mlp": 0.01027047, "balance_loss_clip": 1.01469731, "balance_loss_mlp": 1.03347957, "epoch": 0.8287389147752894, "flos": 21288348743040.0, "grad_norm": 1.6828464015174842, "language_loss": 0.73561943, "learning_rate": 2.82548992057865e-07, "loss": 0.75691521, "num_input_tokens_seen": 297403950, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 13784, "time_per_iteration": 2.5370352268218994 }, { "auxiliary_loss_clip": 0.01114908, "auxiliary_loss_mlp": 0.01035471, "balance_loss_clip": 1.02219653, "balance_loss_mlp": 1.03361535, "epoch": 0.8287990380279573, "flos": 33544875323520.0, "grad_norm": 1.5929529411151002, "language_loss": 0.69910514, "learning_rate": 2.8235541299603285e-07, "loss": 0.72060895, "num_input_tokens_seen": 297424565, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 13785, "time_per_iteration": 2.7115252017974854 }, { "auxiliary_loss_clip": 0.01122648, "auxiliary_loss_mlp": 0.01033216, "balance_loss_clip": 1.02088976, "balance_loss_mlp": 1.0357486, "epoch": 0.8288591612806253, "flos": 22966346545920.0, "grad_norm": 1.411369491942644, "language_loss": 0.6832093, "learning_rate": 2.8216189523251756e-07, "loss": 0.70476794, "num_input_tokens_seen": 297445180, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 13786, "time_per_iteration": 2.7891361713409424 }, { "auxiliary_loss_clip": 0.01142687, "auxiliary_loss_mlp": 0.01035105, "balance_loss_clip": 1.02258778, "balance_loss_mlp": 1.03498507, "epoch": 0.8289192845332932, "flos": 18588979710720.0, "grad_norm": 1.7740963549858333, "language_loss": 0.77043343, "learning_rate": 2.8196843877422493e-07, "loss": 0.79221135, "num_input_tokens_seen": 297463790, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 13787, "time_per_iteration": 2.6143758296966553 }, { "auxiliary_loss_clip": 0.01111293, "auxiliary_loss_mlp": 0.01034145, "balance_loss_clip": 1.02219439, "balance_loss_mlp": 1.03429985, "epoch": 0.8289794077859612, "flos": 15450423085440.0, "grad_norm": 3.355325927061236, "language_loss": 0.80354309, "learning_rate": 2.817750436280606e-07, "loss": 0.82499743, "num_input_tokens_seen": 297480100, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 13788, "time_per_iteration": 2.485490322113037 }, { "auxiliary_loss_clip": 0.01119518, "auxiliary_loss_mlp": 0.01032558, "balance_loss_clip": 1.0206666, "balance_loss_mlp": 1.03423476, "epoch": 0.8290395310386292, "flos": 28877853214080.0, "grad_norm": 1.9515753873934307, "language_loss": 0.71048194, "learning_rate": 2.815817098009239e-07, "loss": 0.73200274, "num_input_tokens_seen": 297499890, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 13789, "time_per_iteration": 2.636727809906006 }, { "auxiliary_loss_clip": 0.01130796, "auxiliary_loss_mlp": 0.0102926, "balance_loss_clip": 1.0164752, "balance_loss_mlp": 1.03268886, "epoch": 0.8290996542912972, "flos": 36576274700160.0, "grad_norm": 1.919524607866055, "language_loss": 0.68218935, "learning_rate": 2.813884372997164e-07, "loss": 0.70378983, "num_input_tokens_seen": 297521440, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 13790, "time_per_iteration": 2.663106679916382 }, { "auxiliary_loss_clip": 0.01112472, "auxiliary_loss_mlp": 0.0103176, "balance_loss_clip": 1.01777112, "balance_loss_mlp": 1.03316545, "epoch": 0.8291597775439651, "flos": 23623009032960.0, "grad_norm": 1.8354816631092385, "language_loss": 0.77617884, "learning_rate": 2.8119522613133415e-07, "loss": 0.79762119, "num_input_tokens_seen": 297539920, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.703125, "step": 13791, "time_per_iteration": 2.5486032962799072 }, { "auxiliary_loss_clip": 0.01119284, "auxiliary_loss_mlp": 0.01026637, "balance_loss_clip": 1.01350057, "balance_loss_mlp": 1.03446651, "epoch": 0.8292199007966331, "flos": 21397481239680.0, "grad_norm": 1.3647302958589698, "language_loss": 0.69939733, "learning_rate": 2.8100207630267415e-07, "loss": 0.72085655, "num_input_tokens_seen": 297560000, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.671875, "step": 13792, "time_per_iteration": 2.5524942874908447 }, { "auxiliary_loss_clip": 0.01120483, "auxiliary_loss_mlp": 0.01032124, "balance_loss_clip": 1.01936913, "balance_loss_mlp": 1.03229642, "epoch": 0.829280024049301, "flos": 28767607395840.0, "grad_norm": 1.8094024033889804, "language_loss": 0.65115553, "learning_rate": 2.8080898782062745e-07, "loss": 0.67268163, "num_input_tokens_seen": 297579300, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 13793, "time_per_iteration": 2.5949878692626953 }, { "auxiliary_loss_clip": 0.01149551, "auxiliary_loss_mlp": 0.01031681, "balance_loss_clip": 1.01874685, "balance_loss_mlp": 1.03404975, "epoch": 0.829340147301969, "flos": 19938071652480.0, "grad_norm": 2.9315900377018527, "language_loss": 0.66486561, "learning_rate": 2.8061596069208547e-07, "loss": 0.68667793, "num_input_tokens_seen": 297598095, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 13794, "time_per_iteration": 2.6505181789398193 }, { "auxiliary_loss_clip": 0.01102163, "auxiliary_loss_mlp": 0.01036529, "balance_loss_clip": 1.02360654, "balance_loss_mlp": 1.03208506, "epoch": 0.829400270554637, "flos": 25228575060480.0, "grad_norm": 2.5348823920376353, "language_loss": 0.66275781, "learning_rate": 2.8042299492393785e-07, "loss": 0.68414468, "num_input_tokens_seen": 297615955, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 13795, "time_per_iteration": 2.593151807785034 }, { "auxiliary_loss_clip": 0.01160533, "auxiliary_loss_mlp": 0.01031139, "balance_loss_clip": 1.01918197, "balance_loss_mlp": 1.03376329, "epoch": 0.829460393807305, "flos": 24463570176000.0, "grad_norm": 2.1912195233498797, "language_loss": 0.66573834, "learning_rate": 2.8023009052306944e-07, "loss": 0.68765509, "num_input_tokens_seen": 297636285, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.64453125, "step": 13796, "time_per_iteration": 2.648585081100464 }, { "auxiliary_loss_clip": 0.0113778, "auxiliary_loss_mlp": 0.0103218, "balance_loss_clip": 1.01962137, "balance_loss_mlp": 1.03423953, "epoch": 0.829520517059973, "flos": 15122486891520.0, "grad_norm": 1.6821432569312778, "language_loss": 0.71440846, "learning_rate": 2.8003724749636594e-07, "loss": 0.73610806, "num_input_tokens_seen": 297653315, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 13797, "time_per_iteration": 2.553527355194092 }, { "auxiliary_loss_clip": 0.01157578, "auxiliary_loss_mlp": 0.01032854, "balance_loss_clip": 1.02120709, "balance_loss_mlp": 1.03482735, "epoch": 0.8295806403126409, "flos": 21579979265280.0, "grad_norm": 1.8095561563528253, "language_loss": 0.71784383, "learning_rate": 2.7984446585070795e-07, "loss": 0.73974812, "num_input_tokens_seen": 297673480, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 13798, "time_per_iteration": 2.6835057735443115 }, { "auxiliary_loss_clip": 0.01125609, "auxiliary_loss_mlp": 0.01030397, "balance_loss_clip": 1.0188098, "balance_loss_mlp": 1.03301048, "epoch": 0.8296407635653089, "flos": 21726566668800.0, "grad_norm": 1.7694943884098038, "language_loss": 0.76265275, "learning_rate": 2.7965174559297745e-07, "loss": 0.78421283, "num_input_tokens_seen": 297693250, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.65625, "step": 13799, "time_per_iteration": 2.6082804203033447 }, { "auxiliary_loss_clip": 0.01102412, "auxiliary_loss_mlp": 0.01036385, "balance_loss_clip": 1.02407122, "balance_loss_mlp": 1.03441954, "epoch": 0.8297008868179768, "flos": 11181147252480.0, "grad_norm": 1.8955668186148753, "language_loss": 0.67382336, "learning_rate": 2.794590867300499e-07, "loss": 0.69521129, "num_input_tokens_seen": 297710975, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 13800, "time_per_iteration": 2.505080461502075 }, { "auxiliary_loss_clip": 0.01143667, "auxiliary_loss_mlp": 0.01032811, "balance_loss_clip": 1.01910162, "balance_loss_mlp": 1.03649282, "epoch": 0.8297610100706448, "flos": 20664041431680.0, "grad_norm": 1.6803346644435337, "language_loss": 0.74296969, "learning_rate": 2.7926648926880215e-07, "loss": 0.76473445, "num_input_tokens_seen": 297730860, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 13801, "time_per_iteration": 2.61856746673584 }, { "auxiliary_loss_clip": 0.01139611, "auxiliary_loss_mlp": 0.01031113, "balance_loss_clip": 1.01976407, "balance_loss_mlp": 1.0346266, "epoch": 0.8298211333233128, "flos": 20376325491840.0, "grad_norm": 1.5331536311346246, "language_loss": 0.73630655, "learning_rate": 2.790739532161073e-07, "loss": 0.75801384, "num_input_tokens_seen": 297749765, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6953125, "step": 13802, "time_per_iteration": 2.5941388607025146 }, { "auxiliary_loss_clip": 0.01121726, "auxiliary_loss_mlp": 0.01033727, "balance_loss_clip": 1.02166319, "balance_loss_mlp": 1.03426456, "epoch": 0.8298812565759808, "flos": 21508696725120.0, "grad_norm": 1.5925295636517423, "language_loss": 0.74695766, "learning_rate": 2.7888147857883604e-07, "loss": 0.76851219, "num_input_tokens_seen": 297770380, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6953125, "step": 13803, "time_per_iteration": 2.675528049468994 }, { "auxiliary_loss_clip": 0.01113199, "auxiliary_loss_mlp": 0.01033974, "balance_loss_clip": 1.02211833, "balance_loss_mlp": 1.03565836, "epoch": 0.8299413798286487, "flos": 22818681734400.0, "grad_norm": 1.6202983872825858, "language_loss": 0.79462528, "learning_rate": 2.786890653638574e-07, "loss": 0.81609702, "num_input_tokens_seen": 297789440, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 13804, "time_per_iteration": 2.531538724899292 }, { "auxiliary_loss_clip": 0.01101511, "auxiliary_loss_mlp": 0.01027428, "balance_loss_clip": 1.01606107, "balance_loss_mlp": 1.03287661, "epoch": 0.8300015030813167, "flos": 29679199683840.0, "grad_norm": 1.7841025364742984, "language_loss": 0.73074067, "learning_rate": 2.7849671357803895e-07, "loss": 0.75203001, "num_input_tokens_seen": 297810425, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6875, "step": 13805, "time_per_iteration": 2.688516616821289 }, { "auxiliary_loss_clip": 0.01113803, "auxiliary_loss_mlp": 0.01275646, "balance_loss_clip": 1.01641166, "balance_loss_mlp": 1.03448439, "epoch": 0.8300616263339846, "flos": 19719483436800.0, "grad_norm": 2.4478407532515787, "language_loss": 0.77441669, "learning_rate": 2.783044232282439e-07, "loss": 0.79831117, "num_input_tokens_seen": 297827680, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 13806, "time_per_iteration": 4.003866672515869 }, { "auxiliary_loss_clip": 0.01114614, "auxiliary_loss_mlp": 0.01033171, "balance_loss_clip": 1.01946819, "balance_loss_mlp": 1.03428185, "epoch": 0.8301217495866526, "flos": 19901945548800.0, "grad_norm": 1.8220317098070002, "language_loss": 0.63655078, "learning_rate": 2.7811219432133535e-07, "loss": 0.6580286, "num_input_tokens_seen": 297848005, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 13807, "time_per_iteration": 2.603423595428467 }, { "auxiliary_loss_clip": 0.01114618, "auxiliary_loss_mlp": 0.01028976, "balance_loss_clip": 1.01617289, "balance_loss_mlp": 1.03699756, "epoch": 0.8301818728393207, "flos": 19715784336000.0, "grad_norm": 2.556545873261531, "language_loss": 0.73036766, "learning_rate": 2.779200268641737e-07, "loss": 0.75180364, "num_input_tokens_seen": 297866730, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 13808, "time_per_iteration": 2.575106620788574 }, { "auxiliary_loss_clip": 0.01128764, "auxiliary_loss_mlp": 0.01044011, "balance_loss_clip": 1.03061771, "balance_loss_mlp": 1.03727794, "epoch": 0.8302419960919886, "flos": 28293658416000.0, "grad_norm": 1.8364124190225057, "language_loss": 0.66502017, "learning_rate": 2.7772792086361654e-07, "loss": 0.68674791, "num_input_tokens_seen": 297886390, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73828125, "step": 13809, "time_per_iteration": 2.7055394649505615 }, { "auxiliary_loss_clip": 0.01104589, "auxiliary_loss_mlp": 0.01026161, "balance_loss_clip": 1.01383519, "balance_loss_mlp": 1.0337739, "epoch": 0.8303021193446566, "flos": 18223444955520.0, "grad_norm": 2.050570811857135, "language_loss": 0.73823178, "learning_rate": 2.775358763265194e-07, "loss": 0.75953925, "num_input_tokens_seen": 297905110, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 13810, "time_per_iteration": 2.630434513092041 }, { "auxiliary_loss_clip": 0.01127983, "auxiliary_loss_mlp": 0.01034359, "balance_loss_clip": 1.02318859, "balance_loss_mlp": 1.03535187, "epoch": 0.8303622425973245, "flos": 23111425578240.0, "grad_norm": 2.1855248994591783, "language_loss": 0.81057525, "learning_rate": 2.7734389325973583e-07, "loss": 0.83219862, "num_input_tokens_seen": 297925460, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.66015625, "step": 13811, "time_per_iteration": 2.601919651031494 }, { "auxiliary_loss_clip": 0.01133043, "auxiliary_loss_mlp": 0.0103789, "balance_loss_clip": 1.0258261, "balance_loss_mlp": 1.03623199, "epoch": 0.8304223658499925, "flos": 19572860119680.0, "grad_norm": 1.7894769244633475, "language_loss": 0.73681706, "learning_rate": 2.771519716701185e-07, "loss": 0.75852638, "num_input_tokens_seen": 297941760, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69921875, "step": 13812, "time_per_iteration": 4.009759426116943 }, { "auxiliary_loss_clip": 0.01118914, "auxiliary_loss_mlp": 0.01030001, "balance_loss_clip": 1.01795506, "balance_loss_mlp": 1.03461432, "epoch": 0.8304824891026604, "flos": 24426115269120.0, "grad_norm": 1.9270503569625153, "language_loss": 0.78435028, "learning_rate": 2.769601115645146e-07, "loss": 0.80583942, "num_input_tokens_seen": 297959745, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6640625, "step": 13813, "time_per_iteration": 2.593327760696411 }, { "auxiliary_loss_clip": 0.01148206, "auxiliary_loss_mlp": 0.01274306, "balance_loss_clip": 1.01419306, "balance_loss_mlp": 1.03288746, "epoch": 0.8305426123553284, "flos": 22381792611840.0, "grad_norm": 1.880439297543539, "language_loss": 0.70796549, "learning_rate": 2.7676831294977267e-07, "loss": 0.73219061, "num_input_tokens_seen": 297977665, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 13814, "time_per_iteration": 2.661564826965332 }, { "auxiliary_loss_clip": 0.01110662, "auxiliary_loss_mlp": 0.01040212, "balance_loss_clip": 1.02576423, "balance_loss_mlp": 1.03499615, "epoch": 0.8306027356079964, "flos": 14903575453440.0, "grad_norm": 2.15023086233673, "language_loss": 0.68024343, "learning_rate": 2.7657657583273696e-07, "loss": 0.70175219, "num_input_tokens_seen": 297993525, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 13815, "time_per_iteration": 2.511991500854492 }, { "auxiliary_loss_clip": 0.01112392, "auxiliary_loss_mlp": 0.01032361, "balance_loss_clip": 1.01944447, "balance_loss_mlp": 1.03458095, "epoch": 0.8306628588606644, "flos": 19644573623040.0, "grad_norm": 2.6628941638634167, "language_loss": 0.75681019, "learning_rate": 2.763849002202501e-07, "loss": 0.77825773, "num_input_tokens_seen": 298012920, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 13816, "time_per_iteration": 2.604511260986328 }, { "auxiliary_loss_clip": 0.01128722, "auxiliary_loss_mlp": 0.01034713, "balance_loss_clip": 1.02286339, "balance_loss_mlp": 1.03365779, "epoch": 0.8307229821133323, "flos": 24389737770240.0, "grad_norm": 1.723007494384073, "language_loss": 0.8146559, "learning_rate": 2.7619328611915207e-07, "loss": 0.83629024, "num_input_tokens_seen": 298033310, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 13817, "time_per_iteration": 2.578551769256592 }, { "auxiliary_loss_clip": 0.01114333, "auxiliary_loss_mlp": 0.01034096, "balance_loss_clip": 1.02053547, "balance_loss_mlp": 1.03444457, "epoch": 0.8307831053660003, "flos": 20996933702400.0, "grad_norm": 1.959474438486644, "language_loss": 0.77960062, "learning_rate": 2.760017335362816e-07, "loss": 0.80108488, "num_input_tokens_seen": 298053530, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 13818, "time_per_iteration": 2.5558462142944336 }, { "auxiliary_loss_clip": 0.01123514, "auxiliary_loss_mlp": 0.01039065, "balance_loss_clip": 1.02623856, "balance_loss_mlp": 1.03553307, "epoch": 0.8308432286186682, "flos": 21397301671680.0, "grad_norm": 4.15057129428772, "language_loss": 0.81767243, "learning_rate": 2.7581024247847495e-07, "loss": 0.83929819, "num_input_tokens_seen": 298069305, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 13819, "time_per_iteration": 3.899796962738037 }, { "auxiliary_loss_clip": 0.01031346, "auxiliary_loss_mlp": 0.01002568, "balance_loss_clip": 1.00135219, "balance_loss_mlp": 1.00150633, "epoch": 0.8309033518713362, "flos": 62533656714240.0, "grad_norm": 0.7570690903140105, "language_loss": 0.56277376, "learning_rate": 2.7561881295256564e-07, "loss": 0.5831129, "num_input_tokens_seen": 298125830, "router_z_loss_clip": 0.012146, "router_z_loss_mlp": 0.2109375, "step": 13820, "time_per_iteration": 3.407353162765503 }, { "auxiliary_loss_clip": 0.01109173, "auxiliary_loss_mlp": 0.01026595, "balance_loss_clip": 1.0141319, "balance_loss_mlp": 1.03309464, "epoch": 0.8309634751240043, "flos": 16979104051200.0, "grad_norm": 1.9484186460829303, "language_loss": 0.68565875, "learning_rate": 2.754274449653846e-07, "loss": 0.70701647, "num_input_tokens_seen": 298142320, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.671875, "step": 13821, "time_per_iteration": 4.174016714096069 }, { "auxiliary_loss_clip": 0.01159779, "auxiliary_loss_mlp": 0.01031989, "balance_loss_clip": 1.01876855, "balance_loss_mlp": 1.03431869, "epoch": 0.8310235983766722, "flos": 22674464628480.0, "grad_norm": 1.7187100106781634, "language_loss": 0.69105828, "learning_rate": 2.752361385237629e-07, "loss": 0.71297598, "num_input_tokens_seen": 298161845, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 13822, "time_per_iteration": 2.7152602672576904 }, { "auxiliary_loss_clip": 0.01137572, "auxiliary_loss_mlp": 0.01035435, "balance_loss_clip": 1.02375817, "balance_loss_mlp": 1.0349915, "epoch": 0.8310837216293402, "flos": 30811463176320.0, "grad_norm": 2.4903077824400093, "language_loss": 0.62066513, "learning_rate": 2.7504489363452643e-07, "loss": 0.6423952, "num_input_tokens_seen": 298184165, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 13823, "time_per_iteration": 2.7439193725585938 }, { "auxiliary_loss_clip": 0.01040376, "auxiliary_loss_mlp": 0.01001287, "balance_loss_clip": 1.0, "balance_loss_mlp": 1.00141656, "epoch": 0.8311438448820081, "flos": 71276074997760.0, "grad_norm": 0.8563455394499125, "language_loss": 0.61937612, "learning_rate": 2.748537103045001e-07, "loss": 0.63979274, "num_input_tokens_seen": 298251720, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.21191406, "step": 13824, "time_per_iteration": 3.277235269546509 }, { "auxiliary_loss_clip": 0.01102997, "auxiliary_loss_mlp": 0.01030953, "balance_loss_clip": 1.01876974, "balance_loss_mlp": 1.03416073, "epoch": 0.8312039681346761, "flos": 20887082933760.0, "grad_norm": 1.5153580864249532, "language_loss": 0.74119377, "learning_rate": 2.746625885405076e-07, "loss": 0.76253331, "num_input_tokens_seen": 298271910, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 13825, "time_per_iteration": 2.5950229167938232 }, { "auxiliary_loss_clip": 0.01141185, "auxiliary_loss_mlp": 0.01278621, "balance_loss_clip": 1.0190742, "balance_loss_mlp": 1.03398669, "epoch": 0.831264091387344, "flos": 17017528625280.0, "grad_norm": 1.8636629534117142, "language_loss": 0.80326712, "learning_rate": 2.7447152834936925e-07, "loss": 0.82746518, "num_input_tokens_seen": 298288105, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 13826, "time_per_iteration": 2.6365272998809814 }, { "auxiliary_loss_clip": 0.01102194, "auxiliary_loss_mlp": 0.01031827, "balance_loss_clip": 1.01907158, "balance_loss_mlp": 1.03323841, "epoch": 0.831324214640012, "flos": 24419578993920.0, "grad_norm": 1.868294574822419, "language_loss": 0.68074369, "learning_rate": 2.742805297379034e-07, "loss": 0.70208395, "num_input_tokens_seen": 298307600, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 13827, "time_per_iteration": 2.677715539932251 }, { "auxiliary_loss_clip": 0.01110395, "auxiliary_loss_mlp": 0.01029976, "balance_loss_clip": 1.0185318, "balance_loss_mlp": 1.0327282, "epoch": 0.83138433789268, "flos": 13545576938880.0, "grad_norm": 1.9681396663604718, "language_loss": 0.74174196, "learning_rate": 2.740895927129261e-07, "loss": 0.76314569, "num_input_tokens_seen": 298323055, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 13828, "time_per_iteration": 2.500692844390869 }, { "auxiliary_loss_clip": 0.01129963, "auxiliary_loss_mlp": 0.01034231, "balance_loss_clip": 1.02190518, "balance_loss_mlp": 1.03289843, "epoch": 0.831444461145348, "flos": 44492386561920.0, "grad_norm": 1.5785634106712867, "language_loss": 0.67467105, "learning_rate": 2.7389871728125236e-07, "loss": 0.69631296, "num_input_tokens_seen": 298346950, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 13829, "time_per_iteration": 2.8472070693969727 }, { "auxiliary_loss_clip": 0.01123151, "auxiliary_loss_mlp": 0.01028456, "balance_loss_clip": 1.01553965, "balance_loss_mlp": 1.03335023, "epoch": 0.8315045843980159, "flos": 22705024124160.0, "grad_norm": 2.5640130196723017, "language_loss": 0.82613373, "learning_rate": 2.7370790344969254e-07, "loss": 0.84764981, "num_input_tokens_seen": 298366315, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 13830, "time_per_iteration": 2.6453819274902344 }, { "auxiliary_loss_clip": 0.01111541, "auxiliary_loss_mlp": 0.01029624, "balance_loss_clip": 1.01748228, "balance_loss_mlp": 1.03376174, "epoch": 0.8315647076506839, "flos": 16873491087360.0, "grad_norm": 2.3608124885440307, "language_loss": 0.74113995, "learning_rate": 2.7351715122505757e-07, "loss": 0.76255155, "num_input_tokens_seen": 298385185, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 13831, "time_per_iteration": 2.5990560054779053 }, { "auxiliary_loss_clip": 0.01105665, "auxiliary_loss_mlp": 0.01035132, "balance_loss_clip": 1.0217272, "balance_loss_mlp": 1.03580558, "epoch": 0.8316248309033518, "flos": 13808730954240.0, "grad_norm": 1.6479435716195716, "language_loss": 0.71548325, "learning_rate": 2.733264606141545e-07, "loss": 0.73689121, "num_input_tokens_seen": 298402335, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 13832, "time_per_iteration": 2.656522035598755 }, { "auxiliary_loss_clip": 0.01119584, "auxiliary_loss_mlp": 0.01034538, "balance_loss_clip": 1.0218364, "balance_loss_mlp": 1.03316617, "epoch": 0.8316849541560198, "flos": 23512511819520.0, "grad_norm": 1.6157775696172383, "language_loss": 0.8455615, "learning_rate": 2.731358316237884e-07, "loss": 0.86710274, "num_input_tokens_seen": 298423370, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 13833, "time_per_iteration": 2.669795274734497 }, { "auxiliary_loss_clip": 0.01122787, "auxiliary_loss_mlp": 0.01037196, "balance_loss_clip": 1.02423191, "balance_loss_mlp": 1.03426671, "epoch": 0.8317450774086879, "flos": 23771356202880.0, "grad_norm": 1.6440420370396427, "language_loss": 0.75957084, "learning_rate": 2.7294526426076194e-07, "loss": 0.78117073, "num_input_tokens_seen": 298444835, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 13834, "time_per_iteration": 2.61137056350708 }, { "auxiliary_loss_clip": 0.0110037, "auxiliary_loss_mlp": 0.01277583, "balance_loss_clip": 1.01879573, "balance_loss_mlp": 1.03339148, "epoch": 0.8318052006613558, "flos": 16215535710720.0, "grad_norm": 1.909336514651199, "language_loss": 0.79403317, "learning_rate": 2.7275475853187703e-07, "loss": 0.81781274, "num_input_tokens_seen": 298461845, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 13835, "time_per_iteration": 2.5383963584899902 }, { "auxiliary_loss_clip": 0.01117031, "auxiliary_loss_mlp": 0.01037074, "balance_loss_clip": 1.02304852, "balance_loss_mlp": 1.0361861, "epoch": 0.8318653239140238, "flos": 19974556892160.0, "grad_norm": 2.079536839263398, "language_loss": 0.80911994, "learning_rate": 2.725643144439318e-07, "loss": 0.83066106, "num_input_tokens_seen": 298479095, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71875, "step": 13836, "time_per_iteration": 2.6013498306274414 }, { "auxiliary_loss_clip": 0.01119676, "auxiliary_loss_mlp": 0.01033582, "balance_loss_clip": 1.02139878, "balance_loss_mlp": 1.03392553, "epoch": 0.8319254471666917, "flos": 17704714694400.0, "grad_norm": 1.9333686637675995, "language_loss": 0.77345926, "learning_rate": 2.723739320037226e-07, "loss": 0.79499185, "num_input_tokens_seen": 298494475, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 13837, "time_per_iteration": 2.529597282409668 }, { "auxiliary_loss_clip": 0.01108219, "auxiliary_loss_mlp": 0.01029542, "balance_loss_clip": 1.01524854, "balance_loss_mlp": 1.03431499, "epoch": 0.8319855704193597, "flos": 19536554448000.0, "grad_norm": 1.9178214936097018, "language_loss": 0.83288801, "learning_rate": 2.7218361121804377e-07, "loss": 0.85426569, "num_input_tokens_seen": 298513185, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.73828125, "step": 13838, "time_per_iteration": 2.5307157039642334 }, { "auxiliary_loss_clip": 0.01121406, "auxiliary_loss_mlp": 0.01030702, "balance_loss_clip": 1.01785111, "balance_loss_mlp": 1.03445268, "epoch": 0.8320456936720276, "flos": 21178067011200.0, "grad_norm": 2.078392405349835, "language_loss": 0.74702418, "learning_rate": 2.719933520936875e-07, "loss": 0.76854527, "num_input_tokens_seen": 298531885, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 13839, "time_per_iteration": 2.5725257396698 }, { "auxiliary_loss_clip": 0.0111463, "auxiliary_loss_mlp": 0.01035156, "balance_loss_clip": 1.02263331, "balance_loss_mlp": 1.03591561, "epoch": 0.8321058169246957, "flos": 33250874503680.0, "grad_norm": 1.413525130693402, "language_loss": 0.67633432, "learning_rate": 2.7180315463744353e-07, "loss": 0.69783217, "num_input_tokens_seen": 298554905, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 13840, "time_per_iteration": 2.6780343055725098 }, { "auxiliary_loss_clip": 0.01138403, "auxiliary_loss_mlp": 0.0103006, "balance_loss_clip": 1.0174005, "balance_loss_mlp": 1.03272057, "epoch": 0.8321659401773636, "flos": 14208129256320.0, "grad_norm": 1.9299171559833532, "language_loss": 0.79696238, "learning_rate": 2.71613018856099e-07, "loss": 0.81864703, "num_input_tokens_seen": 298571185, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 13841, "time_per_iteration": 2.512120485305786 }, { "auxiliary_loss_clip": 0.01129928, "auxiliary_loss_mlp": 0.01031016, "balance_loss_clip": 1.01861262, "balance_loss_mlp": 1.03432679, "epoch": 0.8322260634300316, "flos": 15158253859200.0, "grad_norm": 1.7175059753535806, "language_loss": 0.68010235, "learning_rate": 2.7142294475644046e-07, "loss": 0.70171183, "num_input_tokens_seen": 298588505, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 13842, "time_per_iteration": 2.5659401416778564 }, { "auxiliary_loss_clip": 0.011211, "auxiliary_loss_mlp": 0.01031446, "balance_loss_clip": 1.0184226, "balance_loss_mlp": 1.0346359, "epoch": 0.8322861866826995, "flos": 25300827267840.0, "grad_norm": 1.6223052036823915, "language_loss": 0.73140979, "learning_rate": 2.7123293234525044e-07, "loss": 0.75293529, "num_input_tokens_seen": 298609295, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 13843, "time_per_iteration": 2.604468822479248 }, { "auxiliary_loss_clip": 0.01129807, "auxiliary_loss_mlp": 0.01031598, "balance_loss_clip": 1.01896739, "balance_loss_mlp": 1.03633225, "epoch": 0.8323463099353675, "flos": 17019360218880.0, "grad_norm": 1.919696798378959, "language_loss": 0.77567345, "learning_rate": 2.7104298162931005e-07, "loss": 0.79728746, "num_input_tokens_seen": 298625765, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.67578125, "step": 13844, "time_per_iteration": 2.645587921142578 }, { "auxiliary_loss_clip": 0.0103993, "auxiliary_loss_mlp": 0.01001642, "balance_loss_clip": 1.00052094, "balance_loss_mlp": 1.00148737, "epoch": 0.8324064331880354, "flos": 55607889709440.0, "grad_norm": 0.7820153364799097, "language_loss": 0.55326217, "learning_rate": 2.7085309261539804e-07, "loss": 0.5736779, "num_input_tokens_seen": 298683005, "router_z_loss_clip": 0.01123047, "router_z_loss_mlp": 0.2109375, "step": 13845, "time_per_iteration": 3.115665912628174 }, { "auxiliary_loss_clip": 0.01115605, "auxiliary_loss_mlp": 0.01044445, "balance_loss_clip": 1.03153515, "balance_loss_mlp": 1.03613138, "epoch": 0.8324665564407034, "flos": 26138623063680.0, "grad_norm": 1.5134323085732637, "language_loss": 0.753299, "learning_rate": 2.7066326531029205e-07, "loss": 0.77489948, "num_input_tokens_seen": 298703060, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 13846, "time_per_iteration": 2.6624655723571777 }, { "auxiliary_loss_clip": 0.01140586, "auxiliary_loss_mlp": 0.01030439, "balance_loss_clip": 1.01668787, "balance_loss_mlp": 1.03270006, "epoch": 0.8325266796933715, "flos": 22049187649920.0, "grad_norm": 1.6439496861266878, "language_loss": 0.78722614, "learning_rate": 2.7047349972076474e-07, "loss": 0.80893636, "num_input_tokens_seen": 298721765, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.72265625, "step": 13847, "time_per_iteration": 4.029558181762695 }, { "auxiliary_loss_clip": 0.01147371, "auxiliary_loss_mlp": 0.01028237, "balance_loss_clip": 1.01655483, "balance_loss_mlp": 1.03454256, "epoch": 0.8325868029460394, "flos": 22816634659200.0, "grad_norm": 1.6308629350644535, "language_loss": 0.74388307, "learning_rate": 2.7028379585358995e-07, "loss": 0.76563919, "num_input_tokens_seen": 298740825, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 13848, "time_per_iteration": 2.5767555236816406 }, { "auxiliary_loss_clip": 0.01112723, "auxiliary_loss_mlp": 0.01028485, "balance_loss_clip": 1.01583695, "balance_loss_mlp": 1.03473401, "epoch": 0.8326469261987074, "flos": 14757454926720.0, "grad_norm": 3.158858674310133, "language_loss": 0.63323689, "learning_rate": 2.70094153715537e-07, "loss": 0.6546489, "num_input_tokens_seen": 298758515, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 13849, "time_per_iteration": 2.5339725017547607 }, { "auxiliary_loss_clip": 0.01123301, "auxiliary_loss_mlp": 0.01031377, "balance_loss_clip": 1.01881838, "balance_loss_mlp": 1.03383255, "epoch": 0.8327070494513753, "flos": 22926126291840.0, "grad_norm": 2.639808302705031, "language_loss": 0.795798, "learning_rate": 2.69904573313374e-07, "loss": 0.81734478, "num_input_tokens_seen": 298776375, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 13850, "time_per_iteration": 2.5837481021881104 }, { "auxiliary_loss_clip": 0.01126472, "auxiliary_loss_mlp": 0.01030449, "balance_loss_clip": 1.01944053, "balance_loss_mlp": 1.03368759, "epoch": 0.8327671727040433, "flos": 20665334321280.0, "grad_norm": 1.5942538280438103, "language_loss": 0.78215814, "learning_rate": 2.697150546538658e-07, "loss": 0.80372727, "num_input_tokens_seen": 298795135, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66015625, "step": 13851, "time_per_iteration": 2.6632893085479736 }, { "auxiliary_loss_clip": 0.01031696, "auxiliary_loss_mlp": 0.01003457, "balance_loss_clip": 1.00227678, "balance_loss_mlp": 1.00178826, "epoch": 0.8328272959567112, "flos": 56060760384000.0, "grad_norm": 0.7683746605778201, "language_loss": 0.55836082, "learning_rate": 2.6952559774377716e-07, "loss": 0.57871234, "num_input_tokens_seen": 298855475, "router_z_loss_clip": 0.01177979, "router_z_loss_mlp": 0.2109375, "step": 13852, "time_per_iteration": 3.1421828269958496 }, { "auxiliary_loss_clip": 0.01110807, "auxiliary_loss_mlp": 0.01030097, "balance_loss_clip": 1.01774669, "balance_loss_mlp": 1.03503227, "epoch": 0.8328874192093793, "flos": 32303084284800.0, "grad_norm": 1.7233340953912084, "language_loss": 0.66990572, "learning_rate": 2.6933620258986886e-07, "loss": 0.69131476, "num_input_tokens_seen": 298875875, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.66796875, "step": 13853, "time_per_iteration": 2.754894495010376 }, { "auxiliary_loss_clip": 0.01124406, "auxiliary_loss_mlp": 0.010327, "balance_loss_clip": 1.02022505, "balance_loss_mlp": 1.03723681, "epoch": 0.8329475424620472, "flos": 23512691387520.0, "grad_norm": 1.574701033332836, "language_loss": 0.784621, "learning_rate": 2.69146869198899e-07, "loss": 0.8061921, "num_input_tokens_seen": 298895950, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 13854, "time_per_iteration": 4.217481374740601 }, { "auxiliary_loss_clip": 0.01023027, "auxiliary_loss_mlp": 0.01002971, "balance_loss_clip": 1.00168955, "balance_loss_mlp": 1.00195646, "epoch": 0.8330076657147152, "flos": 67840680378240.0, "grad_norm": 0.7644868105783839, "language_loss": 0.58688134, "learning_rate": 2.6895759757762524e-07, "loss": 0.60714132, "num_input_tokens_seen": 298955770, "router_z_loss_clip": 0.01281738, "router_z_loss_mlp": 0.2109375, "step": 13855, "time_per_iteration": 3.094306707382202 }, { "auxiliary_loss_clip": 0.01111925, "auxiliary_loss_mlp": 0.01026103, "balance_loss_clip": 1.01384234, "balance_loss_mlp": 1.03486085, "epoch": 0.8330677889673831, "flos": 22892801448960.0, "grad_norm": 1.7025199358659544, "language_loss": 0.71608239, "learning_rate": 2.687683877328024e-07, "loss": 0.7374627, "num_input_tokens_seen": 298976545, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 13856, "time_per_iteration": 2.632591485977173 }, { "auxiliary_loss_clip": 0.01111882, "auxiliary_loss_mlp": 0.01030624, "balance_loss_clip": 1.01859617, "balance_loss_mlp": 1.03438985, "epoch": 0.8331279122200511, "flos": 27345042184320.0, "grad_norm": 1.926646887702722, "language_loss": 0.75349265, "learning_rate": 2.685792396711823e-07, "loss": 0.77491772, "num_input_tokens_seen": 298996750, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 13857, "time_per_iteration": 2.7135684490203857 }, { "auxiliary_loss_clip": 0.0113606, "auxiliary_loss_mlp": 0.0102816, "balance_loss_clip": 1.01707375, "balance_loss_mlp": 1.03558636, "epoch": 0.833188035472719, "flos": 19938179393280.0, "grad_norm": 1.718760068513306, "language_loss": 0.73306757, "learning_rate": 2.68390153399515e-07, "loss": 0.75470978, "num_input_tokens_seen": 299014895, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.65234375, "step": 13858, "time_per_iteration": 2.6253364086151123 }, { "auxiliary_loss_clip": 0.01107162, "auxiliary_loss_mlp": 0.01032514, "balance_loss_clip": 1.01926398, "balance_loss_mlp": 1.03383613, "epoch": 0.833248158725387, "flos": 15232624968960.0, "grad_norm": 1.8904601624185393, "language_loss": 0.728351, "learning_rate": 2.682011289245494e-07, "loss": 0.74974781, "num_input_tokens_seen": 299032855, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 13859, "time_per_iteration": 2.6138346195220947 }, { "auxiliary_loss_clip": 0.01107698, "auxiliary_loss_mlp": 0.01025585, "balance_loss_clip": 1.01409292, "balance_loss_mlp": 1.03321934, "epoch": 0.8333082819780551, "flos": 24535535074560.0, "grad_norm": 1.8232237240214464, "language_loss": 0.79347563, "learning_rate": 2.680121662530308e-07, "loss": 0.81480843, "num_input_tokens_seen": 299052055, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.65625, "step": 13860, "time_per_iteration": 2.6169888973236084 }, { "auxiliary_loss_clip": 0.011548, "auxiliary_loss_mlp": 0.01030792, "balance_loss_clip": 1.01861429, "balance_loss_mlp": 1.03414774, "epoch": 0.833368405230723, "flos": 31467407391360.0, "grad_norm": 1.488120264707672, "language_loss": 0.8214258, "learning_rate": 2.678232653917027e-07, "loss": 0.84328169, "num_input_tokens_seen": 299075285, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 13861, "time_per_iteration": 4.161766052246094 }, { "auxiliary_loss_clip": 0.01104863, "auxiliary_loss_mlp": 0.01033421, "balance_loss_clip": 1.02095807, "balance_loss_mlp": 1.0356003, "epoch": 0.833428528483391, "flos": 14902713527040.0, "grad_norm": 1.7780319658672086, "language_loss": 0.78824431, "learning_rate": 2.6763442634730604e-07, "loss": 0.80962718, "num_input_tokens_seen": 299092520, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 13862, "time_per_iteration": 2.5546066761016846 }, { "auxiliary_loss_clip": 0.01112015, "auxiliary_loss_mlp": 0.01035753, "balance_loss_clip": 1.02306283, "balance_loss_mlp": 1.03273165, "epoch": 0.8334886517360589, "flos": 22199833290240.0, "grad_norm": 1.8078946930733155, "language_loss": 0.75067526, "learning_rate": 2.674456491265815e-07, "loss": 0.7721529, "num_input_tokens_seen": 299109450, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 13863, "time_per_iteration": 4.14808464050293 }, { "auxiliary_loss_clip": 0.01137666, "auxiliary_loss_mlp": 0.01275136, "balance_loss_clip": 1.01696396, "balance_loss_mlp": 1.0358305, "epoch": 0.8335487749887269, "flos": 30372562892160.0, "grad_norm": 2.2398626085167805, "language_loss": 0.75275344, "learning_rate": 2.6725693373626424e-07, "loss": 0.77688146, "num_input_tokens_seen": 299129540, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.65625, "step": 13864, "time_per_iteration": 2.6515860557556152 }, { "auxiliary_loss_clip": 0.01121049, "auxiliary_loss_mlp": 0.01034758, "balance_loss_clip": 1.02197921, "balance_loss_mlp": 1.03419054, "epoch": 0.8336088982413948, "flos": 24681152810880.0, "grad_norm": 1.7409915188301512, "language_loss": 0.69094038, "learning_rate": 2.6706828018309033e-07, "loss": 0.71249843, "num_input_tokens_seen": 299148670, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 13865, "time_per_iteration": 2.5885651111602783 }, { "auxiliary_loss_clip": 0.01112756, "auxiliary_loss_mlp": 0.01031932, "balance_loss_clip": 1.01950455, "balance_loss_mlp": 1.03490806, "epoch": 0.8336690214940629, "flos": 38177207873280.0, "grad_norm": 1.6227314730565863, "language_loss": 0.75471067, "learning_rate": 2.6687968847379185e-07, "loss": 0.77615762, "num_input_tokens_seen": 299169330, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 13866, "time_per_iteration": 2.678953170776367 }, { "auxiliary_loss_clip": 0.01040881, "auxiliary_loss_mlp": 0.00999865, "balance_loss_clip": 0.99871492, "balance_loss_mlp": 1.00183213, "epoch": 0.8337291447467308, "flos": 55565119589760.0, "grad_norm": 0.9158513703841686, "language_loss": 0.63061786, "learning_rate": 2.6669115861509927e-07, "loss": 0.65102541, "num_input_tokens_seen": 299220980, "router_z_loss_clip": 0.01147461, "router_z_loss_mlp": 0.2109375, "step": 13867, "time_per_iteration": 3.0579488277435303 }, { "auxiliary_loss_clip": 0.01118382, "auxiliary_loss_mlp": 0.01030709, "balance_loss_clip": 1.01935387, "balance_loss_mlp": 1.0338589, "epoch": 0.8337892679993988, "flos": 24133550993280.0, "grad_norm": 1.8633072451146702, "language_loss": 0.72088265, "learning_rate": 2.665026906137404e-07, "loss": 0.74237347, "num_input_tokens_seen": 299240130, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66796875, "step": 13868, "time_per_iteration": 2.5893068313598633 }, { "auxiliary_loss_clip": 0.01123921, "auxiliary_loss_mlp": 0.01032193, "balance_loss_clip": 1.01933599, "balance_loss_mlp": 1.03716326, "epoch": 0.8338493912520667, "flos": 28183915388160.0, "grad_norm": 1.7821925121516922, "language_loss": 0.80344844, "learning_rate": 2.6631428447644125e-07, "loss": 0.82500958, "num_input_tokens_seen": 299260705, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 13869, "time_per_iteration": 2.5750343799591064 }, { "auxiliary_loss_clip": 0.01114985, "auxiliary_loss_mlp": 0.01035999, "balance_loss_clip": 1.02334464, "balance_loss_mlp": 1.03674901, "epoch": 0.8339095145047347, "flos": 22158356060160.0, "grad_norm": 1.5191318694700846, "language_loss": 0.82703483, "learning_rate": 2.6612594020992627e-07, "loss": 0.84854466, "num_input_tokens_seen": 299278925, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 13870, "time_per_iteration": 2.6011340618133545 }, { "auxiliary_loss_clip": 0.01122001, "auxiliary_loss_mlp": 0.01028913, "balance_loss_clip": 1.01581192, "balance_loss_mlp": 1.03198838, "epoch": 0.8339696377574026, "flos": 17307112072320.0, "grad_norm": 2.522354912940266, "language_loss": 0.70934516, "learning_rate": 2.6593765782091583e-07, "loss": 0.73085427, "num_input_tokens_seen": 299291580, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 13871, "time_per_iteration": 2.507664680480957 }, { "auxiliary_loss_clip": 0.01111888, "auxiliary_loss_mlp": 0.01030644, "balance_loss_clip": 1.01839554, "balance_loss_mlp": 1.03553939, "epoch": 0.8340297610100706, "flos": 20668351063680.0, "grad_norm": 1.8247325427840346, "language_loss": 0.69219047, "learning_rate": 2.657494373161302e-07, "loss": 0.71361578, "num_input_tokens_seen": 299310385, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 13872, "time_per_iteration": 2.638692855834961 }, { "auxiliary_loss_clip": 0.01121664, "auxiliary_loss_mlp": 0.01027156, "balance_loss_clip": 1.01511621, "balance_loss_mlp": 1.03515244, "epoch": 0.8340898842627387, "flos": 20515442866560.0, "grad_norm": 1.8275368362903843, "language_loss": 0.73434961, "learning_rate": 2.6556127870228606e-07, "loss": 0.75583792, "num_input_tokens_seen": 299327660, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 13873, "time_per_iteration": 2.6161463260650635 }, { "auxiliary_loss_clip": 0.01122753, "auxiliary_loss_mlp": 0.01036902, "balance_loss_clip": 1.02258468, "balance_loss_mlp": 1.03386426, "epoch": 0.8341500075154066, "flos": 21425850005760.0, "grad_norm": 2.1751496700235617, "language_loss": 0.75251895, "learning_rate": 2.653731819860983e-07, "loss": 0.77411556, "num_input_tokens_seen": 299343685, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.71484375, "step": 13874, "time_per_iteration": 2.5906944274902344 }, { "auxiliary_loss_clip": 0.01119081, "auxiliary_loss_mlp": 0.01025645, "balance_loss_clip": 1.01407599, "balance_loss_mlp": 1.03396451, "epoch": 0.8342101307680746, "flos": 22090988102400.0, "grad_norm": 2.603851858976116, "language_loss": 0.66253901, "learning_rate": 2.6518514717427944e-07, "loss": 0.68398625, "num_input_tokens_seen": 299363305, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 13875, "time_per_iteration": 2.702860116958618 }, { "auxiliary_loss_clip": 0.01140553, "auxiliary_loss_mlp": 0.01034116, "balance_loss_clip": 1.02176023, "balance_loss_mlp": 1.03635216, "epoch": 0.8342702540207425, "flos": 21871466133120.0, "grad_norm": 1.962501965702608, "language_loss": 0.79709482, "learning_rate": 2.6499717427354084e-07, "loss": 0.81884146, "num_input_tokens_seen": 299382630, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 13876, "time_per_iteration": 2.608039617538452 }, { "auxiliary_loss_clip": 0.01111341, "auxiliary_loss_mlp": 0.01030355, "balance_loss_clip": 1.01807666, "balance_loss_mlp": 1.03439832, "epoch": 0.8343303772734105, "flos": 22528487756160.0, "grad_norm": 2.0067734912186594, "language_loss": 0.87297636, "learning_rate": 2.648092632905894e-07, "loss": 0.89439332, "num_input_tokens_seen": 299402385, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 13877, "time_per_iteration": 2.5540289878845215 }, { "auxiliary_loss_clip": 0.01129945, "auxiliary_loss_mlp": 0.01025634, "balance_loss_clip": 1.01427948, "balance_loss_mlp": 1.03398848, "epoch": 0.8343905005260784, "flos": 14939773384320.0, "grad_norm": 2.4862391314071735, "language_loss": 0.69520998, "learning_rate": 2.6462141423213233e-07, "loss": 0.71676582, "num_input_tokens_seen": 299419820, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.68359375, "step": 13878, "time_per_iteration": 2.5632376670837402 }, { "auxiliary_loss_clip": 0.01118752, "auxiliary_loss_mlp": 0.01032056, "balance_loss_clip": 1.02030814, "balance_loss_mlp": 1.03377175, "epoch": 0.8344506237787465, "flos": 15012456554880.0, "grad_norm": 2.1229641562451347, "language_loss": 0.79715002, "learning_rate": 2.644336271048728e-07, "loss": 0.81865811, "num_input_tokens_seen": 299436265, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 13879, "time_per_iteration": 2.6559948921203613 }, { "auxiliary_loss_clip": 0.0113809, "auxiliary_loss_mlp": 0.01031145, "balance_loss_clip": 1.01976693, "balance_loss_mlp": 1.03400421, "epoch": 0.8345107470314144, "flos": 17560389847680.0, "grad_norm": 2.181650392285162, "language_loss": 0.83183467, "learning_rate": 2.6424590191551345e-07, "loss": 0.85352707, "num_input_tokens_seen": 299451660, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.68359375, "step": 13880, "time_per_iteration": 2.6627073287963867 }, { "auxiliary_loss_clip": 0.01123735, "auxiliary_loss_mlp": 0.01029665, "balance_loss_clip": 1.01773858, "balance_loss_mlp": 1.03530526, "epoch": 0.8345708702840824, "flos": 25187277398400.0, "grad_norm": 1.6799953320976502, "language_loss": 0.7792263, "learning_rate": 2.6405823867075196e-07, "loss": 0.80076027, "num_input_tokens_seen": 299472070, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 13881, "time_per_iteration": 2.635124921798706 }, { "auxiliary_loss_clip": 0.01125119, "auxiliary_loss_mlp": 0.01023757, "balance_loss_clip": 1.01271772, "balance_loss_mlp": 1.03337836, "epoch": 0.8346309935367503, "flos": 15083559527040.0, "grad_norm": 2.6752078586058103, "language_loss": 0.78033525, "learning_rate": 2.638706373772872e-07, "loss": 0.80182397, "num_input_tokens_seen": 299486725, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6484375, "step": 13882, "time_per_iteration": 2.5216519832611084 }, { "auxiliary_loss_clip": 0.011305, "auxiliary_loss_mlp": 0.01043593, "balance_loss_clip": 1.03124917, "balance_loss_mlp": 1.03500676, "epoch": 0.8346911167894183, "flos": 22930615491840.0, "grad_norm": 1.9567428553412236, "language_loss": 0.80536842, "learning_rate": 2.636830980418139e-07, "loss": 0.82710934, "num_input_tokens_seen": 299505435, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 13883, "time_per_iteration": 2.7109718322753906 }, { "auxiliary_loss_clip": 0.01112317, "auxiliary_loss_mlp": 0.01029781, "balance_loss_clip": 1.01727629, "balance_loss_mlp": 1.03256655, "epoch": 0.8347512400420862, "flos": 20193037367040.0, "grad_norm": 2.255738055762503, "language_loss": 0.74241674, "learning_rate": 2.634956206710235e-07, "loss": 0.76383775, "num_input_tokens_seen": 299523555, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 13884, "time_per_iteration": 2.5477027893066406 }, { "auxiliary_loss_clip": 0.01129607, "auxiliary_loss_mlp": 0.01035611, "balance_loss_clip": 1.021276, "balance_loss_mlp": 1.03666413, "epoch": 0.8348113632947542, "flos": 25954832148480.0, "grad_norm": 1.6713729141619982, "language_loss": 0.70037389, "learning_rate": 2.6330820527160804e-07, "loss": 0.72202605, "num_input_tokens_seen": 299541660, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75390625, "step": 13885, "time_per_iteration": 2.6065595149993896 }, { "auxiliary_loss_clip": 0.01130513, "auxiliary_loss_mlp": 0.01033115, "balance_loss_clip": 1.02032411, "balance_loss_mlp": 1.03417134, "epoch": 0.8348714865474223, "flos": 34204554552960.0, "grad_norm": 1.5867868935978313, "language_loss": 0.69964099, "learning_rate": 2.631208518502552e-07, "loss": 0.72127724, "num_input_tokens_seen": 299562465, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 13886, "time_per_iteration": 2.741953134536743 }, { "auxiliary_loss_clip": 0.01128593, "auxiliary_loss_mlp": 0.01030638, "balance_loss_clip": 1.0183897, "balance_loss_mlp": 1.03372812, "epoch": 0.8349316098000902, "flos": 24390132819840.0, "grad_norm": 1.5135935117360702, "language_loss": 0.79158759, "learning_rate": 2.62933560413652e-07, "loss": 0.81317991, "num_input_tokens_seen": 299582700, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 13887, "time_per_iteration": 2.6594350337982178 }, { "auxiliary_loss_clip": 0.01126802, "auxiliary_loss_mlp": 0.01280678, "balance_loss_clip": 1.0201726, "balance_loss_mlp": 1.03691065, "epoch": 0.8349917330527582, "flos": 23032744836480.0, "grad_norm": 2.6394514592783413, "language_loss": 0.64054096, "learning_rate": 2.62746330968481e-07, "loss": 0.66461575, "num_input_tokens_seen": 299600310, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.72265625, "step": 13888, "time_per_iteration": 2.569305419921875 }, { "auxiliary_loss_clip": 0.0112255, "auxiliary_loss_mlp": 0.01026791, "balance_loss_clip": 1.01455986, "balance_loss_mlp": 1.033548, "epoch": 0.8350518563054261, "flos": 13625873792640.0, "grad_norm": 2.712935957335997, "language_loss": 0.66750991, "learning_rate": 2.6255916352142525e-07, "loss": 0.68900335, "num_input_tokens_seen": 299617025, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71484375, "step": 13889, "time_per_iteration": 3.9910151958465576 }, { "auxiliary_loss_clip": 0.01109873, "auxiliary_loss_mlp": 0.0102787, "balance_loss_clip": 1.01625884, "balance_loss_mlp": 1.03253961, "epoch": 0.8351119795580941, "flos": 21579799697280.0, "grad_norm": 1.907746063520589, "language_loss": 0.68619424, "learning_rate": 2.6237205807916353e-07, "loss": 0.70757163, "num_input_tokens_seen": 299633050, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 13890, "time_per_iteration": 2.6227903366088867 }, { "auxiliary_loss_clip": 0.01121181, "auxiliary_loss_mlp": 0.01034732, "balance_loss_clip": 1.02220321, "balance_loss_mlp": 1.03449774, "epoch": 0.835172102810762, "flos": 20038297576320.0, "grad_norm": 2.992270425927383, "language_loss": 0.59233081, "learning_rate": 2.621850146483733e-07, "loss": 0.61388993, "num_input_tokens_seen": 299646445, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 13891, "time_per_iteration": 2.5566418170928955 }, { "auxiliary_loss_clip": 0.01112864, "auxiliary_loss_mlp": 0.01029377, "balance_loss_clip": 1.01709843, "balance_loss_mlp": 1.034688, "epoch": 0.8352322260634301, "flos": 25111577485440.0, "grad_norm": 1.5915570561187613, "language_loss": 0.71714658, "learning_rate": 2.6199803323572943e-07, "loss": 0.7385689, "num_input_tokens_seen": 299662665, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 13892, "time_per_iteration": 2.648961305618286 }, { "auxiliary_loss_clip": 0.01099971, "auxiliary_loss_mlp": 0.01030618, "balance_loss_clip": 1.01804757, "balance_loss_mlp": 1.03284836, "epoch": 0.835292349316098, "flos": 24863758577280.0, "grad_norm": 1.603253868873745, "language_loss": 0.66028571, "learning_rate": 2.61811113847906e-07, "loss": 0.68159163, "num_input_tokens_seen": 299683585, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.671875, "step": 13893, "time_per_iteration": 2.570345163345337 }, { "auxiliary_loss_clip": 0.0111871, "auxiliary_loss_mlp": 0.01026986, "balance_loss_clip": 1.01476693, "balance_loss_mlp": 1.03122854, "epoch": 0.835352472568766, "flos": 19865568049920.0, "grad_norm": 2.0345787389532277, "language_loss": 0.7840957, "learning_rate": 2.6162425649157183e-07, "loss": 0.80555266, "num_input_tokens_seen": 299702680, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 13894, "time_per_iteration": 2.6390888690948486 }, { "auxiliary_loss_clip": 0.01109693, "auxiliary_loss_mlp": 0.0102859, "balance_loss_clip": 1.01641893, "balance_loss_mlp": 1.03538442, "epoch": 0.8354125958214339, "flos": 22054754257920.0, "grad_norm": 1.9767033156074176, "language_loss": 0.72452092, "learning_rate": 2.614374611733965e-07, "loss": 0.74590373, "num_input_tokens_seen": 299721050, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.65234375, "step": 13895, "time_per_iteration": 4.009970426559448 }, { "auxiliary_loss_clip": 0.01113397, "auxiliary_loss_mlp": 0.01039226, "balance_loss_clip": 1.02555251, "balance_loss_mlp": 1.03397703, "epoch": 0.8354727190741019, "flos": 21397804462080.0, "grad_norm": 1.927597309884712, "language_loss": 0.71715629, "learning_rate": 2.6125072790004596e-07, "loss": 0.73868251, "num_input_tokens_seen": 299738255, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 13896, "time_per_iteration": 2.5525319576263428 }, { "auxiliary_loss_clip": 0.01110945, "auxiliary_loss_mlp": 0.01028594, "balance_loss_clip": 1.01650608, "balance_loss_mlp": 1.03457761, "epoch": 0.8355328423267698, "flos": 50840997834240.0, "grad_norm": 2.0757845711998617, "language_loss": 0.58605319, "learning_rate": 2.61064056678185e-07, "loss": 0.60744858, "num_input_tokens_seen": 299761315, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.67578125, "step": 13897, "time_per_iteration": 2.839538812637329 }, { "auxiliary_loss_clip": 0.01022274, "auxiliary_loss_mlp": 0.01249511, "balance_loss_clip": 1.00229955, "balance_loss_mlp": 1.0011487, "epoch": 0.8355929655794379, "flos": 65551052764800.0, "grad_norm": 0.7027080300744143, "language_loss": 0.57695329, "learning_rate": 2.6087744751447436e-07, "loss": 0.59967113, "num_input_tokens_seen": 299828735, "router_z_loss_clip": 0.01275635, "router_z_loss_mlp": 0.2109375, "step": 13898, "time_per_iteration": 3.2032759189605713 }, { "auxiliary_loss_clip": 0.01114626, "auxiliary_loss_mlp": 0.01035428, "balance_loss_clip": 1.02180815, "balance_loss_mlp": 1.03456998, "epoch": 0.8356530888321058, "flos": 19170516902400.0, "grad_norm": 2.2091144407222814, "language_loss": 0.80054575, "learning_rate": 2.606909004155737e-07, "loss": 0.82204634, "num_input_tokens_seen": 299848395, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.70703125, "step": 13899, "time_per_iteration": 2.5943472385406494 }, { "auxiliary_loss_clip": 0.01103778, "auxiliary_loss_mlp": 0.01031653, "balance_loss_clip": 1.02029252, "balance_loss_mlp": 1.03398371, "epoch": 0.8357132120847738, "flos": 44126672238720.0, "grad_norm": 1.6271707169088963, "language_loss": 0.68859625, "learning_rate": 2.6050441538814174e-07, "loss": 0.70995057, "num_input_tokens_seen": 299871665, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6953125, "step": 13900, "time_per_iteration": 2.7138733863830566 }, { "auxiliary_loss_clip": 0.01121975, "auxiliary_loss_mlp": 0.01030307, "balance_loss_clip": 1.01836812, "balance_loss_mlp": 1.03611708, "epoch": 0.8357733353374418, "flos": 24389701856640.0, "grad_norm": 1.4436543790400247, "language_loss": 0.71326518, "learning_rate": 2.603179924388317e-07, "loss": 0.73478806, "num_input_tokens_seen": 299891960, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 13901, "time_per_iteration": 2.6132009029388428 }, { "auxiliary_loss_clip": 0.011413, "auxiliary_loss_mlp": 0.01040484, "balance_loss_clip": 1.02695334, "balance_loss_mlp": 1.03387666, "epoch": 0.8358334585901097, "flos": 20916313626240.0, "grad_norm": 2.1864820104093927, "language_loss": 0.80161655, "learning_rate": 2.6013163157429805e-07, "loss": 0.82343447, "num_input_tokens_seen": 299905070, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 13902, "time_per_iteration": 4.105058908462524 }, { "auxiliary_loss_clip": 0.01109859, "auxiliary_loss_mlp": 0.0103386, "balance_loss_clip": 1.02200508, "balance_loss_mlp": 1.03454661, "epoch": 0.8358935818427777, "flos": 19244169740160.0, "grad_norm": 1.4915246703348926, "language_loss": 0.6272229, "learning_rate": 2.5994533280119047e-07, "loss": 0.64866006, "num_input_tokens_seen": 299925130, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 13903, "time_per_iteration": 2.518096446990967 }, { "auxiliary_loss_clip": 0.01120942, "auxiliary_loss_mlp": 0.0102931, "balance_loss_clip": 1.01733506, "balance_loss_mlp": 1.03448892, "epoch": 0.8359537050954456, "flos": 24134053783680.0, "grad_norm": 2.1081583910700226, "language_loss": 0.74532568, "learning_rate": 2.597590961261589e-07, "loss": 0.76682824, "num_input_tokens_seen": 299943845, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 13904, "time_per_iteration": 4.115883111953735 }, { "auxiliary_loss_clip": 0.01116302, "auxiliary_loss_mlp": 0.01030912, "balance_loss_clip": 1.01862192, "balance_loss_mlp": 1.03237116, "epoch": 0.8360138283481137, "flos": 16180415187840.0, "grad_norm": 1.6176311138468034, "language_loss": 0.73023868, "learning_rate": 2.5957292155584774e-07, "loss": 0.75171077, "num_input_tokens_seen": 299961620, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.66015625, "step": 13905, "time_per_iteration": 2.6050961017608643 }, { "auxiliary_loss_clip": 0.01112957, "auxiliary_loss_mlp": 0.01037654, "balance_loss_clip": 1.02673459, "balance_loss_mlp": 1.03684604, "epoch": 0.8360739516007816, "flos": 22198899536640.0, "grad_norm": 1.7133884625596048, "language_loss": 0.66110563, "learning_rate": 2.593868090969027e-07, "loss": 0.6826117, "num_input_tokens_seen": 299982170, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.671875, "step": 13906, "time_per_iteration": 2.597900629043579 }, { "auxiliary_loss_clip": 0.01110857, "auxiliary_loss_mlp": 0.010264, "balance_loss_clip": 1.01398993, "balance_loss_mlp": 1.03350663, "epoch": 0.8361340748534496, "flos": 20923137210240.0, "grad_norm": 1.490142402844977, "language_loss": 0.74209046, "learning_rate": 2.5920075875596504e-07, "loss": 0.76346296, "num_input_tokens_seen": 300001330, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 13907, "time_per_iteration": 2.572648286819458 }, { "auxiliary_loss_clip": 0.01128938, "auxiliary_loss_mlp": 0.01033157, "balance_loss_clip": 1.02031255, "balance_loss_mlp": 1.03178608, "epoch": 0.8361941981061175, "flos": 26173599932160.0, "grad_norm": 1.7890065037786362, "language_loss": 0.75399303, "learning_rate": 2.5901477053967437e-07, "loss": 0.77561402, "num_input_tokens_seen": 300020645, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 13908, "time_per_iteration": 2.6524829864501953 }, { "auxiliary_loss_clip": 0.01113767, "auxiliary_loss_mlp": 0.01032483, "balance_loss_clip": 1.01894689, "balance_loss_mlp": 1.03475606, "epoch": 0.8362543213587855, "flos": 24419363512320.0, "grad_norm": 2.0167958344816066, "language_loss": 0.71681446, "learning_rate": 2.588288444546678e-07, "loss": 0.73827696, "num_input_tokens_seen": 300039945, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 13909, "time_per_iteration": 2.661376714706421 }, { "auxiliary_loss_clip": 0.01104957, "auxiliary_loss_mlp": 0.01284648, "balance_loss_clip": 1.02521372, "balance_loss_mlp": 1.03368425, "epoch": 0.8363144446114534, "flos": 17202396948480.0, "grad_norm": 1.9032626695499306, "language_loss": 0.73456967, "learning_rate": 2.586429805075816e-07, "loss": 0.75846571, "num_input_tokens_seen": 300058260, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 13910, "time_per_iteration": 2.5026652812957764 }, { "auxiliary_loss_clip": 0.01113279, "auxiliary_loss_mlp": 0.01276282, "balance_loss_clip": 1.01750469, "balance_loss_mlp": 1.03490317, "epoch": 0.8363745678641215, "flos": 19279398003840.0, "grad_norm": 2.055508133962697, "language_loss": 0.73158145, "learning_rate": 2.584571787050476e-07, "loss": 0.75547707, "num_input_tokens_seen": 300076720, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 13911, "time_per_iteration": 2.6626391410827637 }, { "auxiliary_loss_clip": 0.01124285, "auxiliary_loss_mlp": 0.01036221, "balance_loss_clip": 1.02375722, "balance_loss_mlp": 1.03523517, "epoch": 0.8364346911167894, "flos": 11874869596800.0, "grad_norm": 1.9200730061466516, "language_loss": 0.79024637, "learning_rate": 2.582714390536973e-07, "loss": 0.81185138, "num_input_tokens_seen": 300092950, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 13912, "time_per_iteration": 2.563727617263794 }, { "auxiliary_loss_clip": 0.01112785, "auxiliary_loss_mlp": 0.01033113, "balance_loss_clip": 1.02086401, "balance_loss_mlp": 1.0346117, "epoch": 0.8364948143694574, "flos": 20225212974720.0, "grad_norm": 2.029233682462267, "language_loss": 0.78659046, "learning_rate": 2.580857615601593e-07, "loss": 0.80804944, "num_input_tokens_seen": 300110950, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 13913, "time_per_iteration": 2.6201632022857666 }, { "auxiliary_loss_clip": 0.01110614, "auxiliary_loss_mlp": 0.01033051, "balance_loss_clip": 1.02044439, "balance_loss_mlp": 1.03267336, "epoch": 0.8365549376221254, "flos": 21612909058560.0, "grad_norm": 1.8334449441985137, "language_loss": 0.73351038, "learning_rate": 2.5790014623105947e-07, "loss": 0.75494707, "num_input_tokens_seen": 300128705, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 13914, "time_per_iteration": 2.624760627746582 }, { "auxiliary_loss_clip": 0.01138284, "auxiliary_loss_mlp": 0.01035974, "balance_loss_clip": 1.02331972, "balance_loss_mlp": 1.03451097, "epoch": 0.8366150608747933, "flos": 23294210912640.0, "grad_norm": 1.6452622936740724, "language_loss": 0.70945036, "learning_rate": 2.577145930730222e-07, "loss": 0.73119295, "num_input_tokens_seen": 300148635, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 13915, "time_per_iteration": 2.7076799869537354 }, { "auxiliary_loss_clip": 0.0113092, "auxiliary_loss_mlp": 0.01031454, "balance_loss_clip": 1.01930666, "balance_loss_mlp": 1.03489375, "epoch": 0.8366751841274613, "flos": 15267673664640.0, "grad_norm": 1.7570643023734407, "language_loss": 0.72102916, "learning_rate": 2.575291020926693e-07, "loss": 0.74265301, "num_input_tokens_seen": 300165490, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 13916, "time_per_iteration": 2.5905051231384277 }, { "auxiliary_loss_clip": 0.01130772, "auxiliary_loss_mlp": 0.01277625, "balance_loss_clip": 1.01811445, "balance_loss_mlp": 1.03496981, "epoch": 0.8367353073801292, "flos": 13224931205760.0, "grad_norm": 1.9651392490795763, "language_loss": 0.74778622, "learning_rate": 2.5734367329662123e-07, "loss": 0.7718702, "num_input_tokens_seen": 300182130, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 13917, "time_per_iteration": 2.6233482360839844 }, { "auxiliary_loss_clip": 0.01100104, "auxiliary_loss_mlp": 0.01031663, "balance_loss_clip": 1.02009964, "balance_loss_mlp": 1.03491092, "epoch": 0.8367954306327973, "flos": 24205084928640.0, "grad_norm": 1.6631507578105353, "language_loss": 0.79326105, "learning_rate": 2.57158306691494e-07, "loss": 0.81457877, "num_input_tokens_seen": 300203050, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.65234375, "step": 13918, "time_per_iteration": 2.6045913696289062 }, { "auxiliary_loss_clip": 0.0114054, "auxiliary_loss_mlp": 0.01034415, "balance_loss_clip": 1.02180302, "balance_loss_mlp": 1.03579378, "epoch": 0.8368555538854652, "flos": 24534744975360.0, "grad_norm": 1.5026498355969249, "language_loss": 0.67785025, "learning_rate": 2.5697300228390404e-07, "loss": 0.69959974, "num_input_tokens_seen": 300224380, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 13919, "time_per_iteration": 2.592290163040161 }, { "auxiliary_loss_clip": 0.01110434, "auxiliary_loss_mlp": 0.01027859, "balance_loss_clip": 1.01611066, "balance_loss_mlp": 1.03331113, "epoch": 0.8369156771381332, "flos": 20259363830400.0, "grad_norm": 1.7320484149161002, "language_loss": 0.73827636, "learning_rate": 2.56787760080464e-07, "loss": 0.75965929, "num_input_tokens_seen": 300242915, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 13920, "time_per_iteration": 2.5334506034851074 }, { "auxiliary_loss_clip": 0.011216, "auxiliary_loss_mlp": 0.01032889, "balance_loss_clip": 1.0204854, "balance_loss_mlp": 1.03404951, "epoch": 0.8369758003908011, "flos": 21835555511040.0, "grad_norm": 1.7798732552189669, "language_loss": 0.6876896, "learning_rate": 2.566025800877849e-07, "loss": 0.70923448, "num_input_tokens_seen": 300261905, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 13921, "time_per_iteration": 2.5263068675994873 }, { "auxiliary_loss_clip": 0.01114878, "auxiliary_loss_mlp": 0.01032586, "balance_loss_clip": 1.02070665, "balance_loss_mlp": 1.03528285, "epoch": 0.8370359236434691, "flos": 21719312121600.0, "grad_norm": 1.9974054546653142, "language_loss": 0.85920715, "learning_rate": 2.564174623124744e-07, "loss": 0.88068181, "num_input_tokens_seen": 300281145, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 13922, "time_per_iteration": 2.5217714309692383 }, { "auxiliary_loss_clip": 0.01125345, "auxiliary_loss_mlp": 0.01278362, "balance_loss_clip": 1.01994777, "balance_loss_mlp": 1.03172398, "epoch": 0.837096046896137, "flos": 23148880485120.0, "grad_norm": 1.5803311997675062, "language_loss": 0.71768862, "learning_rate": 2.562324067611403e-07, "loss": 0.74172568, "num_input_tokens_seen": 300301610, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 13923, "time_per_iteration": 2.593566656112671 }, { "auxiliary_loss_clip": 0.01119541, "auxiliary_loss_mlp": 0.01027586, "balance_loss_clip": 1.0144136, "balance_loss_mlp": 1.03433228, "epoch": 0.8371561701488051, "flos": 24492872695680.0, "grad_norm": 1.7359141152362043, "language_loss": 0.76058245, "learning_rate": 2.5604741344038625e-07, "loss": 0.78205371, "num_input_tokens_seen": 300319420, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.671875, "step": 13924, "time_per_iteration": 2.5706632137298584 }, { "auxiliary_loss_clip": 0.01118974, "auxiliary_loss_mlp": 0.01028237, "balance_loss_clip": 1.01604831, "balance_loss_mlp": 1.03394198, "epoch": 0.837216293401473, "flos": 29206723161600.0, "grad_norm": 1.5769988764296905, "language_loss": 0.6461789, "learning_rate": 2.5586248235681387e-07, "loss": 0.66765106, "num_input_tokens_seen": 300341325, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 13925, "time_per_iteration": 2.6088218688964844 }, { "auxiliary_loss_clip": 0.01122921, "auxiliary_loss_mlp": 0.01030634, "balance_loss_clip": 1.01844454, "balance_loss_mlp": 1.03487611, "epoch": 0.837276416654141, "flos": 25265275781760.0, "grad_norm": 1.648819967756604, "language_loss": 0.74243677, "learning_rate": 2.5567761351702266e-07, "loss": 0.76397228, "num_input_tokens_seen": 300361620, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 13926, "time_per_iteration": 2.6044304370880127 }, { "auxiliary_loss_clip": 0.01113643, "auxiliary_loss_mlp": 0.01034323, "balance_loss_clip": 1.02112699, "balance_loss_mlp": 1.03618968, "epoch": 0.837336539906809, "flos": 13882024656000.0, "grad_norm": 1.655791541183687, "language_loss": 0.7099635, "learning_rate": 2.554928069276112e-07, "loss": 0.73144317, "num_input_tokens_seen": 300378675, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6875, "step": 13927, "time_per_iteration": 2.5962183475494385 }, { "auxiliary_loss_clip": 0.01040785, "auxiliary_loss_mlp": 0.01002216, "balance_loss_clip": 1.0008986, "balance_loss_mlp": 1.0015378, "epoch": 0.8373966631594769, "flos": 68870599044480.0, "grad_norm": 0.712264503658512, "language_loss": 0.58817565, "learning_rate": 2.553080625951738e-07, "loss": 0.60860574, "num_input_tokens_seen": 300449740, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.21289062, "step": 13928, "time_per_iteration": 3.3061835765838623 }, { "auxiliary_loss_clip": 0.01132767, "auxiliary_loss_mlp": 0.01033581, "balance_loss_clip": 1.02071214, "balance_loss_mlp": 1.03735876, "epoch": 0.8374567864121449, "flos": 20448972748800.0, "grad_norm": 1.8552772209883082, "language_loss": 0.69463301, "learning_rate": 2.5512338052630313e-07, "loss": 0.71629649, "num_input_tokens_seen": 300470000, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 13929, "time_per_iteration": 2.5549941062927246 }, { "auxiliary_loss_clip": 0.0111389, "auxiliary_loss_mlp": 0.01029683, "balance_loss_clip": 1.01645041, "balance_loss_mlp": 1.0339942, "epoch": 0.8375169096648128, "flos": 41904197101440.0, "grad_norm": 1.9404636084492652, "language_loss": 0.66744137, "learning_rate": 2.549387607275912e-07, "loss": 0.68887711, "num_input_tokens_seen": 300494975, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 13930, "time_per_iteration": 4.2601683139801025 }, { "auxiliary_loss_clip": 0.01139041, "auxiliary_loss_mlp": 0.01028379, "balance_loss_clip": 1.01636899, "balance_loss_mlp": 1.03677452, "epoch": 0.8375770329174809, "flos": 20009354192640.0, "grad_norm": 1.6663970937371362, "language_loss": 0.71497154, "learning_rate": 2.547542032056262e-07, "loss": 0.7366457, "num_input_tokens_seen": 300513175, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.67578125, "step": 13931, "time_per_iteration": 2.6149942874908447 }, { "auxiliary_loss_clip": 0.0112801, "auxiliary_loss_mlp": 0.0103144, "balance_loss_clip": 1.01952457, "balance_loss_mlp": 1.03385508, "epoch": 0.8376371561701488, "flos": 22783597125120.0, "grad_norm": 1.8403445796774047, "language_loss": 0.71598387, "learning_rate": 2.545697079669942e-07, "loss": 0.73757839, "num_input_tokens_seen": 300533770, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 13932, "time_per_iteration": 2.5404770374298096 }, { "auxiliary_loss_clip": 0.01153832, "auxiliary_loss_mlp": 0.01030412, "balance_loss_clip": 1.01930189, "balance_loss_mlp": 1.03327727, "epoch": 0.8376972794228168, "flos": 23914459987200.0, "grad_norm": 2.162406039940842, "language_loss": 0.66582727, "learning_rate": 2.5438527501827913e-07, "loss": 0.68766975, "num_input_tokens_seen": 300552995, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6796875, "step": 13933, "time_per_iteration": 2.636810779571533 }, { "auxiliary_loss_clip": 0.01119693, "auxiliary_loss_mlp": 0.01040209, "balance_loss_clip": 1.02799606, "balance_loss_mlp": 1.0352968, "epoch": 0.8377574026754847, "flos": 13734718980480.0, "grad_norm": 2.0762968712019307, "language_loss": 0.76353812, "learning_rate": 2.542009043660642e-07, "loss": 0.78513718, "num_input_tokens_seen": 300570275, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 13934, "time_per_iteration": 2.5197737216949463 }, { "auxiliary_loss_clip": 0.011141, "auxiliary_loss_mlp": 0.01031689, "balance_loss_clip": 1.01935124, "balance_loss_mlp": 1.03775811, "epoch": 0.8378175259281527, "flos": 21651333632640.0, "grad_norm": 1.5270246092914517, "language_loss": 0.77294087, "learning_rate": 2.540165960169276e-07, "loss": 0.79439878, "num_input_tokens_seen": 300590875, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 13935, "time_per_iteration": 2.604780435562134 }, { "auxiliary_loss_clip": 0.01129463, "auxiliary_loss_mlp": 0.01028692, "balance_loss_clip": 1.01612735, "balance_loss_mlp": 1.03341842, "epoch": 0.8378776491808206, "flos": 15448806973440.0, "grad_norm": 3.4281852878858703, "language_loss": 0.56332982, "learning_rate": 2.5383234997744774e-07, "loss": 0.58491135, "num_input_tokens_seen": 300607490, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 13936, "time_per_iteration": 4.054915428161621 }, { "auxiliary_loss_clip": 0.01108777, "auxiliary_loss_mlp": 0.01026861, "balance_loss_clip": 1.01552415, "balance_loss_mlp": 1.03232431, "epoch": 0.8379377724334887, "flos": 14720395069440.0, "grad_norm": 1.7812863303558735, "language_loss": 0.89160639, "learning_rate": 2.536481662541996e-07, "loss": 0.91296279, "num_input_tokens_seen": 300623635, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.67578125, "step": 13937, "time_per_iteration": 2.5755767822265625 }, { "auxiliary_loss_clip": 0.01112418, "auxiliary_loss_mlp": 0.01030419, "balance_loss_clip": 1.01874876, "balance_loss_mlp": 1.03464079, "epoch": 0.8379978956861566, "flos": 24535247765760.0, "grad_norm": 1.990632560376512, "language_loss": 0.8172313, "learning_rate": 2.5346404485375617e-07, "loss": 0.83865964, "num_input_tokens_seen": 300643835, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 13938, "time_per_iteration": 2.589905261993408 }, { "auxiliary_loss_clip": 0.01114426, "auxiliary_loss_mlp": 0.01032225, "balance_loss_clip": 1.01911807, "balance_loss_mlp": 1.0354054, "epoch": 0.8380580189388246, "flos": 18952611045120.0, "grad_norm": 2.139940325903529, "language_loss": 0.70379645, "learning_rate": 2.5327998578268814e-07, "loss": 0.725263, "num_input_tokens_seen": 300662500, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 13939, "time_per_iteration": 2.525155782699585 }, { "auxiliary_loss_clip": 0.01133281, "auxiliary_loss_mlp": 0.01034947, "balance_loss_clip": 1.02187622, "balance_loss_mlp": 1.03632808, "epoch": 0.8381181421914926, "flos": 26540283922560.0, "grad_norm": 1.6330836334962753, "language_loss": 0.76020515, "learning_rate": 2.5309598904756457e-07, "loss": 0.78188741, "num_input_tokens_seen": 300681480, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 13940, "time_per_iteration": 2.619371175765991 }, { "auxiliary_loss_clip": 0.01111888, "auxiliary_loss_mlp": 0.01033617, "balance_loss_clip": 1.02147603, "balance_loss_mlp": 1.03578043, "epoch": 0.8381782654441605, "flos": 23291481479040.0, "grad_norm": 1.7139593977869116, "language_loss": 0.76390719, "learning_rate": 2.529120546549517e-07, "loss": 0.78536224, "num_input_tokens_seen": 300699165, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 13941, "time_per_iteration": 2.5533483028411865 }, { "auxiliary_loss_clip": 0.01140352, "auxiliary_loss_mlp": 0.01030411, "balance_loss_clip": 1.01727998, "balance_loss_mlp": 1.03368235, "epoch": 0.8382383886968285, "flos": 26758800311040.0, "grad_norm": 2.057742230502502, "language_loss": 0.73034656, "learning_rate": 2.527281826114136e-07, "loss": 0.75205415, "num_input_tokens_seen": 300714615, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 13942, "time_per_iteration": 2.7087838649749756 }, { "auxiliary_loss_clip": 0.01146191, "auxiliary_loss_mlp": 0.01035767, "balance_loss_clip": 1.02249861, "balance_loss_mlp": 1.03592038, "epoch": 0.8382985119494964, "flos": 26104544035200.0, "grad_norm": 1.5255970504535932, "language_loss": 0.79396522, "learning_rate": 2.5254437292351196e-07, "loss": 0.81578481, "num_input_tokens_seen": 300734860, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.74609375, "step": 13943, "time_per_iteration": 4.105563402175903 }, { "auxiliary_loss_clip": 0.01137428, "auxiliary_loss_mlp": 0.01029018, "balance_loss_clip": 1.01741326, "balance_loss_mlp": 1.03350055, "epoch": 0.8383586352021645, "flos": 16435129507200.0, "grad_norm": 2.1808993619809596, "language_loss": 0.85193086, "learning_rate": 2.523606255978068e-07, "loss": 0.8735953, "num_input_tokens_seen": 300752735, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 13944, "time_per_iteration": 2.6101133823394775 }, { "auxiliary_loss_clip": 0.01031971, "auxiliary_loss_mlp": 0.0124891, "balance_loss_clip": 1.00175142, "balance_loss_mlp": 1.00143826, "epoch": 0.8384187584548324, "flos": 64195532288640.0, "grad_norm": 0.6706544510190652, "language_loss": 0.50267208, "learning_rate": 2.521769406408554e-07, "loss": 0.52548087, "num_input_tokens_seen": 300820760, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.21289062, "step": 13945, "time_per_iteration": 3.2310330867767334 }, { "auxiliary_loss_clip": 0.01137225, "auxiliary_loss_mlp": 0.01029884, "balance_loss_clip": 1.01819515, "balance_loss_mlp": 1.03388047, "epoch": 0.8384788817075004, "flos": 22382905933440.0, "grad_norm": 1.9356171217195324, "language_loss": 0.65219355, "learning_rate": 2.519933180592126e-07, "loss": 0.6738646, "num_input_tokens_seen": 300840025, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.67578125, "step": 13946, "time_per_iteration": 4.111257791519165 }, { "auxiliary_loss_clip": 0.01137647, "auxiliary_loss_mlp": 0.01030442, "balance_loss_clip": 1.01750171, "balance_loss_mlp": 1.03274345, "epoch": 0.8385390049601683, "flos": 29496845312640.0, "grad_norm": 1.629863976750364, "language_loss": 0.67815083, "learning_rate": 2.5180975785943223e-07, "loss": 0.69983172, "num_input_tokens_seen": 300860380, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 13947, "time_per_iteration": 2.6725828647613525 }, { "auxiliary_loss_clip": 0.01123607, "auxiliary_loss_mlp": 0.01032753, "balance_loss_clip": 1.0197649, "balance_loss_mlp": 1.03545117, "epoch": 0.8385991282128363, "flos": 32707797799680.0, "grad_norm": 1.601878190073065, "language_loss": 0.69846624, "learning_rate": 2.5162626004806475e-07, "loss": 0.72002983, "num_input_tokens_seen": 300881895, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 13948, "time_per_iteration": 2.60265851020813 }, { "auxiliary_loss_clip": 0.01117905, "auxiliary_loss_mlp": 0.01030487, "balance_loss_clip": 1.01979399, "balance_loss_mlp": 1.03376198, "epoch": 0.8386592514655042, "flos": 25441022050560.0, "grad_norm": 2.3327832667842254, "language_loss": 0.85047865, "learning_rate": 2.514428246316589e-07, "loss": 0.87196255, "num_input_tokens_seen": 300901575, "router_z_loss_clip": 0.10693359, "router_z_loss_mlp": 0.6640625, "step": 13949, "time_per_iteration": 2.6247398853302 }, { "auxiliary_loss_clip": 0.01121616, "auxiliary_loss_mlp": 0.01028417, "balance_loss_clip": 1.0159775, "balance_loss_mlp": 1.03595209, "epoch": 0.8387193747181723, "flos": 22015898720640.0, "grad_norm": 1.6647423899895144, "language_loss": 0.70424616, "learning_rate": 2.512594516167601e-07, "loss": 0.72574651, "num_input_tokens_seen": 300919735, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.67578125, "step": 13950, "time_per_iteration": 2.5949082374572754 }, { "auxiliary_loss_clip": 0.01111938, "auxiliary_loss_mlp": 0.01030938, "balance_loss_clip": 1.01807547, "balance_loss_mlp": 1.03457296, "epoch": 0.8387794979708402, "flos": 18150223080960.0, "grad_norm": 1.6186639724078442, "language_loss": 0.6470058, "learning_rate": 2.5107614100991423e-07, "loss": 0.66843456, "num_input_tokens_seen": 300939150, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 13951, "time_per_iteration": 2.515387535095215 }, { "auxiliary_loss_clip": 0.01113646, "auxiliary_loss_mlp": 0.01027463, "balance_loss_clip": 1.01550007, "balance_loss_mlp": 1.03603339, "epoch": 0.8388396212235082, "flos": 25411216740480.0, "grad_norm": 1.7061144593766255, "language_loss": 0.69955027, "learning_rate": 2.5089289281766123e-07, "loss": 0.72096139, "num_input_tokens_seen": 300959730, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 13952, "time_per_iteration": 2.6027212142944336 }, { "auxiliary_loss_clip": 0.01109358, "auxiliary_loss_mlp": 0.01028044, "balance_loss_clip": 1.01621854, "balance_loss_mlp": 1.03346312, "epoch": 0.8388997444761762, "flos": 22273055164800.0, "grad_norm": 2.500423101392387, "language_loss": 0.72912776, "learning_rate": 2.50709707046542e-07, "loss": 0.75050181, "num_input_tokens_seen": 300976120, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 13953, "time_per_iteration": 2.5673446655273438 }, { "auxiliary_loss_clip": 0.01145655, "auxiliary_loss_mlp": 0.01027322, "balance_loss_clip": 1.01504326, "balance_loss_mlp": 1.03386664, "epoch": 0.8389598677288441, "flos": 19573219255680.0, "grad_norm": 1.996859292641369, "language_loss": 0.68078864, "learning_rate": 2.505265837030937e-07, "loss": 0.70251846, "num_input_tokens_seen": 300995080, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.671875, "step": 13954, "time_per_iteration": 2.6297171115875244 }, { "auxiliary_loss_clip": 0.01121693, "auxiliary_loss_mlp": 0.01033903, "balance_loss_clip": 1.022596, "balance_loss_mlp": 1.03530371, "epoch": 0.8390199909815121, "flos": 17384715406080.0, "grad_norm": 1.8642848480702874, "language_loss": 0.73207164, "learning_rate": 2.503435227938513e-07, "loss": 0.75362754, "num_input_tokens_seen": 301012920, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.69140625, "step": 13955, "time_per_iteration": 2.5687549114227295 }, { "auxiliary_loss_clip": 0.01148147, "auxiliary_loss_mlp": 0.01028392, "balance_loss_clip": 1.01585126, "balance_loss_mlp": 1.0341748, "epoch": 0.83908011423418, "flos": 24639639667200.0, "grad_norm": 2.4773656923406833, "language_loss": 0.66495258, "learning_rate": 2.5016052432534774e-07, "loss": 0.68671793, "num_input_tokens_seen": 301028875, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 13956, "time_per_iteration": 2.6935482025146484 }, { "auxiliary_loss_clip": 0.01115838, "auxiliary_loss_mlp": 0.0103502, "balance_loss_clip": 1.02197289, "balance_loss_mlp": 1.03537846, "epoch": 0.8391402374868481, "flos": 24718356322560.0, "grad_norm": 1.9865376866334903, "language_loss": 0.79852784, "learning_rate": 2.499775883041142e-07, "loss": 0.82003641, "num_input_tokens_seen": 301050115, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 13957, "time_per_iteration": 2.5866689682006836 }, { "auxiliary_loss_clip": 0.01068138, "auxiliary_loss_mlp": 0.01000519, "balance_loss_clip": 0.99930286, "balance_loss_mlp": 1.00174594, "epoch": 0.839200360739516, "flos": 56871695784960.0, "grad_norm": 0.7521185322036417, "language_loss": 0.53334308, "learning_rate": 2.4979471473667945e-07, "loss": 0.5540297, "num_input_tokens_seen": 301114155, "router_z_loss_clip": 0.012146, "router_z_loss_mlp": 0.21191406, "step": 13958, "time_per_iteration": 3.309299945831299 }, { "auxiliary_loss_clip": 0.01111635, "auxiliary_loss_mlp": 0.01035312, "balance_loss_clip": 1.02221143, "balance_loss_mlp": 1.03439629, "epoch": 0.839260483992184, "flos": 18332792933760.0, "grad_norm": 1.982526593612152, "language_loss": 0.73289168, "learning_rate": 2.496119036295683e-07, "loss": 0.75436115, "num_input_tokens_seen": 301133150, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.68359375, "step": 13959, "time_per_iteration": 2.5569663047790527 }, { "auxiliary_loss_clip": 0.01128154, "auxiliary_loss_mlp": 0.01026446, "balance_loss_clip": 1.01516294, "balance_loss_mlp": 1.03555739, "epoch": 0.8393206072448519, "flos": 27087921653760.0, "grad_norm": 1.7565953397001306, "language_loss": 0.55588549, "learning_rate": 2.494291549893062e-07, "loss": 0.57743144, "num_input_tokens_seen": 301153600, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.66015625, "step": 13960, "time_per_iteration": 2.5962462425231934 }, { "auxiliary_loss_clip": 0.0104984, "auxiliary_loss_mlp": 0.01001021, "balance_loss_clip": 0.99979287, "balance_loss_mlp": 1.00145721, "epoch": 0.8393807304975199, "flos": 61521192057600.0, "grad_norm": 0.8001042122767253, "language_loss": 0.60723984, "learning_rate": 2.492464688224145e-07, "loss": 0.62774843, "num_input_tokens_seen": 301214335, "router_z_loss_clip": 0.01226807, "router_z_loss_mlp": 0.21289062, "step": 13961, "time_per_iteration": 3.247877597808838 }, { "auxiliary_loss_clip": 0.01137709, "auxiliary_loss_mlp": 0.01034522, "balance_loss_clip": 1.02260113, "balance_loss_mlp": 1.03311825, "epoch": 0.8394408537501878, "flos": 14894848448640.0, "grad_norm": 4.419839103447787, "language_loss": 0.68499386, "learning_rate": 2.49063845135413e-07, "loss": 0.70671618, "num_input_tokens_seen": 301228960, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 13962, "time_per_iteration": 2.5609896183013916 }, { "auxiliary_loss_clip": 0.01130058, "auxiliary_loss_mlp": 0.01029285, "balance_loss_clip": 1.01687503, "balance_loss_mlp": 1.0328275, "epoch": 0.8395009770028559, "flos": 17412186332160.0, "grad_norm": 2.2700995692980643, "language_loss": 0.72881436, "learning_rate": 2.488812839348184e-07, "loss": 0.75040781, "num_input_tokens_seen": 301245875, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 13963, "time_per_iteration": 2.54884672164917 }, { "auxiliary_loss_clip": 0.01121044, "auxiliary_loss_mlp": 0.01036699, "balance_loss_clip": 1.02456963, "balance_loss_mlp": 1.03391802, "epoch": 0.8395611002555238, "flos": 27924747782400.0, "grad_norm": 1.7373347406528292, "language_loss": 0.76344156, "learning_rate": 2.486987852271474e-07, "loss": 0.78501904, "num_input_tokens_seen": 301265550, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 13964, "time_per_iteration": 2.6764345169067383 }, { "auxiliary_loss_clip": 0.01124934, "auxiliary_loss_mlp": 0.01037078, "balance_loss_clip": 1.02354145, "balance_loss_mlp": 1.03612232, "epoch": 0.8396212235081918, "flos": 11100922225920.0, "grad_norm": 1.9290340831235282, "language_loss": 0.78299701, "learning_rate": 2.4851634901891085e-07, "loss": 0.80461717, "num_input_tokens_seen": 301282035, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 13965, "time_per_iteration": 2.5730459690093994 }, { "auxiliary_loss_clip": 0.01110622, "auxiliary_loss_mlp": 0.01032562, "balance_loss_clip": 1.02090335, "balance_loss_mlp": 1.03360832, "epoch": 0.8396813467608598, "flos": 35735641729920.0, "grad_norm": 1.5121249501249148, "language_loss": 0.65490746, "learning_rate": 2.4833397531662094e-07, "loss": 0.67633933, "num_input_tokens_seen": 301305210, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 13966, "time_per_iteration": 2.714322805404663 }, { "auxiliary_loss_clip": 0.01120012, "auxiliary_loss_mlp": 0.01031807, "balance_loss_clip": 1.01937902, "balance_loss_mlp": 1.03398156, "epoch": 0.8397414700135277, "flos": 26176724415360.0, "grad_norm": 1.5885807668622183, "language_loss": 0.74439877, "learning_rate": 2.481516641267851e-07, "loss": 0.76591694, "num_input_tokens_seen": 301324885, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 13967, "time_per_iteration": 2.6469409465789795 }, { "auxiliary_loss_clip": 0.01120074, "auxiliary_loss_mlp": 0.01034119, "balance_loss_clip": 1.02207899, "balance_loss_mlp": 1.03529632, "epoch": 0.8398015932661957, "flos": 18333116156160.0, "grad_norm": 4.25563818598177, "language_loss": 0.83132505, "learning_rate": 2.479694154559111e-07, "loss": 0.85286695, "num_input_tokens_seen": 301343070, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 13968, "time_per_iteration": 2.594425678253174 }, { "auxiliary_loss_clip": 0.01122927, "auxiliary_loss_mlp": 0.01031568, "balance_loss_clip": 1.01959944, "balance_loss_mlp": 1.03487682, "epoch": 0.8398617165188637, "flos": 17379507934080.0, "grad_norm": 1.8930073847961406, "language_loss": 0.7737875, "learning_rate": 2.4778722931050076e-07, "loss": 0.79533243, "num_input_tokens_seen": 301359280, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69921875, "step": 13969, "time_per_iteration": 2.574758291244507 }, { "auxiliary_loss_clip": 0.011228, "auxiliary_loss_mlp": 0.01027973, "balance_loss_clip": 1.01491332, "balance_loss_mlp": 1.03376818, "epoch": 0.8399218397715317, "flos": 22929681738240.0, "grad_norm": 3.2192789549219376, "language_loss": 0.77302164, "learning_rate": 2.476051056970576e-07, "loss": 0.79452938, "num_input_tokens_seen": 301376465, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 13970, "time_per_iteration": 2.621274471282959 }, { "auxiliary_loss_clip": 0.01121257, "auxiliary_loss_mlp": 0.0103767, "balance_loss_clip": 1.02551055, "balance_loss_mlp": 1.03411829, "epoch": 0.8399819630241996, "flos": 23149562843520.0, "grad_norm": 2.5422850669195873, "language_loss": 0.72161919, "learning_rate": 2.4742304462208064e-07, "loss": 0.74320847, "num_input_tokens_seen": 301396000, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 13971, "time_per_iteration": 2.665480375289917 }, { "auxiliary_loss_clip": 0.0112063, "auxiliary_loss_mlp": 0.01030144, "balance_loss_clip": 1.01684666, "balance_loss_mlp": 1.03368759, "epoch": 0.8400420862768676, "flos": 16397423205120.0, "grad_norm": 1.8780744282230621, "language_loss": 0.77214646, "learning_rate": 2.472410460920669e-07, "loss": 0.7936542, "num_input_tokens_seen": 301413160, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 13972, "time_per_iteration": 4.035035610198975 }, { "auxiliary_loss_clip": 0.01145902, "auxiliary_loss_mlp": 0.01035722, "balance_loss_clip": 1.02378917, "balance_loss_mlp": 1.0343411, "epoch": 0.8401022095295355, "flos": 21287486816640.0, "grad_norm": 1.265566990943354, "language_loss": 0.68242121, "learning_rate": 2.4705911011351156e-07, "loss": 0.70423746, "num_input_tokens_seen": 301433325, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 13973, "time_per_iteration": 2.662341594696045 }, { "auxiliary_loss_clip": 0.01122649, "auxiliary_loss_mlp": 0.01025739, "balance_loss_clip": 1.0133177, "balance_loss_mlp": 1.03499258, "epoch": 0.8401623327822035, "flos": 17311313963520.0, "grad_norm": 1.9400048842444035, "language_loss": 0.78060877, "learning_rate": 2.468772366929071e-07, "loss": 0.80209261, "num_input_tokens_seen": 301450265, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 13974, "time_per_iteration": 2.5777316093444824 }, { "auxiliary_loss_clip": 0.01111549, "auxiliary_loss_mlp": 0.01030077, "balance_loss_clip": 1.01808476, "balance_loss_mlp": 1.03296566, "epoch": 0.8402224560348714, "flos": 22236677665920.0, "grad_norm": 1.5891818793362191, "language_loss": 0.72543216, "learning_rate": 2.466954258367453e-07, "loss": 0.74684846, "num_input_tokens_seen": 301470760, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 13975, "time_per_iteration": 2.5557520389556885 }, { "auxiliary_loss_clip": 0.01119969, "auxiliary_loss_mlp": 0.01029044, "balance_loss_clip": 1.01636028, "balance_loss_mlp": 1.03452098, "epoch": 0.8402825792875395, "flos": 20229953569920.0, "grad_norm": 1.6028818695633353, "language_loss": 0.72666812, "learning_rate": 2.465136775515131e-07, "loss": 0.74815822, "num_input_tokens_seen": 301489425, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.67578125, "step": 13976, "time_per_iteration": 2.559870481491089 }, { "auxiliary_loss_clip": 0.01100288, "auxiliary_loss_mlp": 0.01278151, "balance_loss_clip": 1.02004385, "balance_loss_mlp": 1.03297925, "epoch": 0.8403427025402074, "flos": 23289973107840.0, "grad_norm": 2.0544588533838763, "language_loss": 0.7189641, "learning_rate": 2.4633199184369747e-07, "loss": 0.7427485, "num_input_tokens_seen": 301508885, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 13977, "time_per_iteration": 2.511507034301758 }, { "auxiliary_loss_clip": 0.01110859, "auxiliary_loss_mlp": 0.01030095, "balance_loss_clip": 1.01812637, "balance_loss_mlp": 1.0322597, "epoch": 0.8404028257928754, "flos": 32675586278400.0, "grad_norm": 1.5054857387365705, "language_loss": 0.68414778, "learning_rate": 2.461503687197819e-07, "loss": 0.70555735, "num_input_tokens_seen": 301533780, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 13978, "time_per_iteration": 4.059051036834717 }, { "auxiliary_loss_clip": 0.01126187, "auxiliary_loss_mlp": 0.01030848, "balance_loss_clip": 1.01880229, "balance_loss_mlp": 1.0336951, "epoch": 0.8404629490455434, "flos": 16180522928640.0, "grad_norm": 1.84002242938537, "language_loss": 0.77924496, "learning_rate": 2.4596880818624856e-07, "loss": 0.80081534, "num_input_tokens_seen": 301551775, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.65625, "step": 13979, "time_per_iteration": 2.5191235542297363 }, { "auxiliary_loss_clip": 0.01113312, "auxiliary_loss_mlp": 0.0102692, "balance_loss_clip": 1.01487994, "balance_loss_mlp": 1.03544164, "epoch": 0.8405230722982113, "flos": 15194451790080.0, "grad_norm": 1.7678626380985558, "language_loss": 0.77973008, "learning_rate": 2.45787310249576e-07, "loss": 0.80113244, "num_input_tokens_seen": 301570495, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 13980, "time_per_iteration": 2.626404047012329 }, { "auxiliary_loss_clip": 0.0112957, "auxiliary_loss_mlp": 0.01028186, "balance_loss_clip": 1.01666999, "balance_loss_mlp": 1.03578842, "epoch": 0.8405831955508793, "flos": 27812418975360.0, "grad_norm": 1.4733623800694504, "language_loss": 0.86503327, "learning_rate": 2.4560587491624263e-07, "loss": 0.88661081, "num_input_tokens_seen": 301591705, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 13981, "time_per_iteration": 2.6584677696228027 }, { "auxiliary_loss_clip": 0.01111406, "auxiliary_loss_mlp": 0.01031571, "balance_loss_clip": 1.01910782, "balance_loss_mlp": 1.03482533, "epoch": 0.8406433188035473, "flos": 23769452782080.0, "grad_norm": 2.2717311249772902, "language_loss": 0.67520785, "learning_rate": 2.4542450219272213e-07, "loss": 0.69663757, "num_input_tokens_seen": 301611670, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 13982, "time_per_iteration": 2.615817070007324 }, { "auxiliary_loss_clip": 0.01140117, "auxiliary_loss_mlp": 0.01036301, "balance_loss_clip": 1.0224545, "balance_loss_mlp": 1.03551865, "epoch": 0.8407034420562153, "flos": 29205681667200.0, "grad_norm": 1.5496956094853809, "language_loss": 0.67872864, "learning_rate": 2.4524319208548826e-07, "loss": 0.7004928, "num_input_tokens_seen": 301632540, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.6875, "step": 13983, "time_per_iteration": 2.708226203918457 }, { "auxiliary_loss_clip": 0.0112949, "auxiliary_loss_mlp": 0.01031773, "balance_loss_clip": 1.01932168, "balance_loss_mlp": 1.03241968, "epoch": 0.8407635653088832, "flos": 26360084367360.0, "grad_norm": 1.835909839054482, "language_loss": 0.78487021, "learning_rate": 2.450619446010105e-07, "loss": 0.80648291, "num_input_tokens_seen": 301651480, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 13984, "time_per_iteration": 2.6288387775421143 }, { "auxiliary_loss_clip": 0.0110831, "auxiliary_loss_mlp": 0.01031375, "balance_loss_clip": 1.01947808, "balance_loss_mlp": 1.03147674, "epoch": 0.8408236885615512, "flos": 21468799693440.0, "grad_norm": 1.8307945376887744, "language_loss": 0.60339147, "learning_rate": 2.4488075974575873e-07, "loss": 0.62478834, "num_input_tokens_seen": 301670010, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 13985, "time_per_iteration": 3.983168601989746 }, { "auxiliary_loss_clip": 0.01141992, "auxiliary_loss_mlp": 0.01031185, "balance_loss_clip": 1.01859045, "balance_loss_mlp": 1.03549385, "epoch": 0.8408838118142191, "flos": 22963724853120.0, "grad_norm": 1.5846374046231757, "language_loss": 0.81764579, "learning_rate": 2.4469963752619695e-07, "loss": 0.83937752, "num_input_tokens_seen": 301689785, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 13986, "time_per_iteration": 2.62856125831604 }, { "auxiliary_loss_clip": 0.01122073, "auxiliary_loss_mlp": 0.01277331, "balance_loss_clip": 1.01832795, "balance_loss_mlp": 1.03624058, "epoch": 0.8409439350668871, "flos": 26800026145920.0, "grad_norm": 1.481700193159358, "language_loss": 0.65841615, "learning_rate": 2.445185779487904e-07, "loss": 0.68241018, "num_input_tokens_seen": 301712225, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 13987, "time_per_iteration": 2.653670310974121 }, { "auxiliary_loss_clip": 0.01102499, "auxiliary_loss_mlp": 0.01284362, "balance_loss_clip": 1.02525198, "balance_loss_mlp": 1.03393936, "epoch": 0.841004058319555, "flos": 26578672583040.0, "grad_norm": 1.829674334153605, "language_loss": 0.67488521, "learning_rate": 2.4433758102000055e-07, "loss": 0.69875383, "num_input_tokens_seen": 301730955, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 13988, "time_per_iteration": 4.138489723205566 }, { "auxiliary_loss_clip": 0.01118856, "auxiliary_loss_mlp": 0.01039418, "balance_loss_clip": 1.02609634, "balance_loss_mlp": 1.03744495, "epoch": 0.8410641815722231, "flos": 14501878680960.0, "grad_norm": 2.02614684711111, "language_loss": 0.8101961, "learning_rate": 2.441566467462857e-07, "loss": 0.83177882, "num_input_tokens_seen": 301746930, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 13989, "time_per_iteration": 2.5103137493133545 }, { "auxiliary_loss_clip": 0.01112883, "auxiliary_loss_mlp": 0.01031609, "balance_loss_clip": 1.01912165, "balance_loss_mlp": 1.03444004, "epoch": 0.841124304824891, "flos": 29166682475520.0, "grad_norm": 1.7237039524978148, "language_loss": 0.75679177, "learning_rate": 2.439757751341038e-07, "loss": 0.77823663, "num_input_tokens_seen": 301766945, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 13990, "time_per_iteration": 2.664679765701294 }, { "auxiliary_loss_clip": 0.01106143, "auxiliary_loss_mlp": 0.01033868, "balance_loss_clip": 1.02227509, "balance_loss_mlp": 1.03211224, "epoch": 0.841184428077559, "flos": 22412028885120.0, "grad_norm": 1.6033027296785485, "language_loss": 0.80805755, "learning_rate": 2.43794966189909e-07, "loss": 0.82945764, "num_input_tokens_seen": 301785460, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.65234375, "step": 13991, "time_per_iteration": 2.5299911499023438 }, { "auxiliary_loss_clip": 0.01127613, "auxiliary_loss_mlp": 0.01031793, "balance_loss_clip": 1.01986074, "balance_loss_mlp": 1.03376007, "epoch": 0.841244551330227, "flos": 22962791099520.0, "grad_norm": 2.2670587457868856, "language_loss": 0.70837426, "learning_rate": 2.436142199201552e-07, "loss": 0.72996831, "num_input_tokens_seen": 301804180, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 13992, "time_per_iteration": 2.6018269062042236 }, { "auxiliary_loss_clip": 0.01130258, "auxiliary_loss_mlp": 0.01028004, "balance_loss_clip": 1.01616037, "balance_loss_mlp": 1.03414476, "epoch": 0.8413046745828949, "flos": 21032736583680.0, "grad_norm": 1.6556392220475713, "language_loss": 0.76474279, "learning_rate": 2.434335363312912e-07, "loss": 0.7863254, "num_input_tokens_seen": 301823670, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 13993, "time_per_iteration": 2.572840690612793 }, { "auxiliary_loss_clip": 0.01117701, "auxiliary_loss_mlp": 0.01034237, "balance_loss_clip": 1.02017033, "balance_loss_mlp": 1.03478694, "epoch": 0.841364797835563, "flos": 27052082858880.0, "grad_norm": 1.7826886387140128, "language_loss": 0.74229097, "learning_rate": 2.4325291542976625e-07, "loss": 0.76381034, "num_input_tokens_seen": 301845890, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 13994, "time_per_iteration": 2.616128921508789 }, { "auxiliary_loss_clip": 0.011098, "auxiliary_loss_mlp": 0.01031393, "balance_loss_clip": 1.02010357, "balance_loss_mlp": 1.03444195, "epoch": 0.8414249210882309, "flos": 17895688329600.0, "grad_norm": 1.9032400283515059, "language_loss": 0.59478176, "learning_rate": 2.430723572220257e-07, "loss": 0.61619365, "num_input_tokens_seen": 301863985, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6640625, "step": 13995, "time_per_iteration": 2.549888849258423 }, { "auxiliary_loss_clip": 0.01110776, "auxiliary_loss_mlp": 0.01033394, "balance_loss_clip": 1.02069807, "balance_loss_mlp": 1.03405952, "epoch": 0.8414850443408989, "flos": 25441201618560.0, "grad_norm": 1.667328063136771, "language_loss": 0.71707809, "learning_rate": 2.428918617145135e-07, "loss": 0.73851973, "num_input_tokens_seen": 301882765, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 13996, "time_per_iteration": 2.602912664413452 }, { "auxiliary_loss_clip": 0.01119352, "auxiliary_loss_mlp": 0.01030933, "balance_loss_clip": 1.0192976, "balance_loss_mlp": 1.03262973, "epoch": 0.8415451675935668, "flos": 23220055284480.0, "grad_norm": 2.0415669297651156, "language_loss": 0.6427139, "learning_rate": 2.427114289136705e-07, "loss": 0.66421676, "num_input_tokens_seen": 301902720, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 13997, "time_per_iteration": 2.5947353839874268 }, { "auxiliary_loss_clip": 0.01125033, "auxiliary_loss_mlp": 0.01035628, "balance_loss_clip": 1.02210915, "balance_loss_mlp": 1.03492165, "epoch": 0.8416052908462348, "flos": 18546496899840.0, "grad_norm": 2.1569458523430347, "language_loss": 0.82556808, "learning_rate": 2.425310588259373e-07, "loss": 0.8471747, "num_input_tokens_seen": 301921245, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.72265625, "step": 13998, "time_per_iteration": 2.586123466491699 }, { "auxiliary_loss_clip": 0.01113757, "auxiliary_loss_mlp": 0.01282051, "balance_loss_clip": 1.02254891, "balance_loss_mlp": 1.0356642, "epoch": 0.8416654140989027, "flos": 26105190480000.0, "grad_norm": 1.521462013962135, "language_loss": 0.80434817, "learning_rate": 2.4235075145774875e-07, "loss": 0.82830626, "num_input_tokens_seen": 301942320, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 13999, "time_per_iteration": 2.578220844268799 }, { "auxiliary_loss_clip": 0.01115602, "auxiliary_loss_mlp": 0.01033992, "balance_loss_clip": 1.02086103, "balance_loss_mlp": 1.03558123, "epoch": 0.8417255373515707, "flos": 26433270328320.0, "grad_norm": 2.2973396996382647, "language_loss": 0.66695577, "learning_rate": 2.421705068155413e-07, "loss": 0.68845177, "num_input_tokens_seen": 301963110, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 14000, "time_per_iteration": 2.580517053604126 }, { "auxiliary_loss_clip": 0.01116565, "auxiliary_loss_mlp": 0.01028471, "balance_loss_clip": 1.01541162, "balance_loss_mlp": 1.03585279, "epoch": 0.8417856606042387, "flos": 24717745791360.0, "grad_norm": 1.4728547304149098, "language_loss": 0.79544693, "learning_rate": 2.419903249057462e-07, "loss": 0.81689727, "num_input_tokens_seen": 301984915, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 14001, "time_per_iteration": 2.6231839656829834 }, { "auxiliary_loss_clip": 0.01146156, "auxiliary_loss_mlp": 0.01028481, "balance_loss_clip": 1.0168221, "balance_loss_mlp": 1.03308153, "epoch": 0.8418457838569067, "flos": 20850849089280.0, "grad_norm": 1.7481832888470739, "language_loss": 0.78554296, "learning_rate": 2.418102057347953e-07, "loss": 0.80728936, "num_input_tokens_seen": 302004095, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.68359375, "step": 14002, "time_per_iteration": 2.679147481918335 }, { "auxiliary_loss_clip": 0.01136579, "auxiliary_loss_mlp": 0.01276809, "balance_loss_clip": 1.01754534, "balance_loss_mlp": 1.03296065, "epoch": 0.8419059071095746, "flos": 28660629715200.0, "grad_norm": 1.4931820039644577, "language_loss": 0.7779578, "learning_rate": 2.4163014930911487e-07, "loss": 0.80209172, "num_input_tokens_seen": 302027250, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 14003, "time_per_iteration": 2.675367832183838 }, { "auxiliary_loss_clip": 0.01120654, "auxiliary_loss_mlp": 0.01028799, "balance_loss_clip": 1.01639557, "balance_loss_mlp": 1.03403997, "epoch": 0.8419660303622426, "flos": 21653596189440.0, "grad_norm": 1.788686297300219, "language_loss": 0.65619314, "learning_rate": 2.4145015563513115e-07, "loss": 0.67768764, "num_input_tokens_seen": 302046950, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 14004, "time_per_iteration": 2.5856032371520996 }, { "auxiliary_loss_clip": 0.01114271, "auxiliary_loss_mlp": 0.01032954, "balance_loss_clip": 1.01974583, "balance_loss_mlp": 1.03495455, "epoch": 0.8420261536149106, "flos": 25301114576640.0, "grad_norm": 2.2750661152258265, "language_loss": 0.76009631, "learning_rate": 2.412702247192686e-07, "loss": 0.78156853, "num_input_tokens_seen": 302065470, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 14005, "time_per_iteration": 2.556056499481201 }, { "auxiliary_loss_clip": 0.01129904, "auxiliary_loss_mlp": 0.01276059, "balance_loss_clip": 1.01647806, "balance_loss_mlp": 1.0328939, "epoch": 0.8420862768675785, "flos": 18763397176320.0, "grad_norm": 1.8944003719442901, "language_loss": 0.77390492, "learning_rate": 2.4109035656794694e-07, "loss": 0.79796457, "num_input_tokens_seen": 302083190, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 14006, "time_per_iteration": 2.557676315307617 }, { "auxiliary_loss_clip": 0.01112811, "auxiliary_loss_mlp": 0.01035977, "balance_loss_clip": 1.0235728, "balance_loss_mlp": 1.0355891, "epoch": 0.8421464001202466, "flos": 18328052338560.0, "grad_norm": 1.7921902832275431, "language_loss": 0.76954347, "learning_rate": 2.409105511875864e-07, "loss": 0.79103136, "num_input_tokens_seen": 302098820, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 14007, "time_per_iteration": 2.5269763469696045 }, { "auxiliary_loss_clip": 0.01121157, "auxiliary_loss_mlp": 0.01032946, "balance_loss_clip": 1.01980352, "balance_loss_mlp": 1.03282118, "epoch": 0.8422065233729145, "flos": 31537181560320.0, "grad_norm": 2.082387565912849, "language_loss": 0.65836155, "learning_rate": 2.407308085846034e-07, "loss": 0.67990255, "num_input_tokens_seen": 302117075, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 14008, "time_per_iteration": 2.7523884773254395 }, { "auxiliary_loss_clip": 0.01031741, "auxiliary_loss_mlp": 0.01002407, "balance_loss_clip": 1.0011735, "balance_loss_mlp": 1.00170767, "epoch": 0.8422666466255825, "flos": 64298128510080.0, "grad_norm": 0.7137904350020116, "language_loss": 0.56939101, "learning_rate": 2.405511287654123e-07, "loss": 0.58973241, "num_input_tokens_seen": 302179735, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.21289062, "step": 14009, "time_per_iteration": 3.253872871398926 }, { "auxiliary_loss_clip": 0.01144648, "auxiliary_loss_mlp": 0.01028969, "balance_loss_clip": 1.01751852, "balance_loss_mlp": 1.03270721, "epoch": 0.8423267698782504, "flos": 24316731377280.0, "grad_norm": 1.318465216209572, "language_loss": 0.78043807, "learning_rate": 2.403715117364253e-07, "loss": 0.80217421, "num_input_tokens_seen": 302202055, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 14010, "time_per_iteration": 2.6703906059265137 }, { "auxiliary_loss_clip": 0.01107536, "auxiliary_loss_mlp": 0.01039966, "balance_loss_clip": 1.02635789, "balance_loss_mlp": 1.03590608, "epoch": 0.8423868931309184, "flos": 18296092212480.0, "grad_norm": 11.123696311026256, "language_loss": 0.72376525, "learning_rate": 2.401919575040532e-07, "loss": 0.74524027, "num_input_tokens_seen": 302221360, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 14011, "time_per_iteration": 2.6164474487304688 }, { "auxiliary_loss_clip": 0.01111345, "auxiliary_loss_mlp": 0.01036507, "balance_loss_clip": 1.02455044, "balance_loss_mlp": 1.03283763, "epoch": 0.8424470163835863, "flos": 23550218121600.0, "grad_norm": 2.640285687861024, "language_loss": 0.84610629, "learning_rate": 2.4001246607470316e-07, "loss": 0.86758482, "num_input_tokens_seen": 302240715, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69140625, "step": 14012, "time_per_iteration": 2.5416839122772217 }, { "auxiliary_loss_clip": 0.01094618, "auxiliary_loss_mlp": 0.01028177, "balance_loss_clip": 1.01753116, "balance_loss_mlp": 1.03164852, "epoch": 0.8425071396362543, "flos": 23769488695680.0, "grad_norm": 1.5506011750774542, "language_loss": 0.67577076, "learning_rate": 2.398330374547812e-07, "loss": 0.69699872, "num_input_tokens_seen": 302260950, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.62890625, "step": 14013, "time_per_iteration": 4.01661491394043 }, { "auxiliary_loss_clip": 0.01106844, "auxiliary_loss_mlp": 0.01032857, "balance_loss_clip": 1.02008963, "balance_loss_mlp": 1.0343895, "epoch": 0.8425672628889223, "flos": 16178906816640.0, "grad_norm": 2.436167466124245, "language_loss": 0.78452373, "learning_rate": 2.3965367165068984e-07, "loss": 0.80592072, "num_input_tokens_seen": 302277500, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 14014, "time_per_iteration": 2.516674757003784 }, { "auxiliary_loss_clip": 0.01129889, "auxiliary_loss_mlp": 0.0127689, "balance_loss_clip": 1.01825166, "balance_loss_mlp": 1.03470314, "epoch": 0.8426273861415903, "flos": 23149131880320.0, "grad_norm": 1.8023591361643425, "language_loss": 0.67622089, "learning_rate": 2.3947436866883186e-07, "loss": 0.70028871, "num_input_tokens_seen": 302297930, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 14015, "time_per_iteration": 2.5701100826263428 }, { "auxiliary_loss_clip": 0.01103637, "auxiliary_loss_mlp": 0.01030169, "balance_loss_clip": 1.01699018, "balance_loss_mlp": 1.03531981, "epoch": 0.8426875093942582, "flos": 17457757712640.0, "grad_norm": 2.0724122916347167, "language_loss": 0.75777721, "learning_rate": 2.39295128515604e-07, "loss": 0.77911526, "num_input_tokens_seen": 302315735, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.68359375, "step": 14016, "time_per_iteration": 2.578273057937622 }, { "auxiliary_loss_clip": 0.01122689, "auxiliary_loss_mlp": 0.01037941, "balance_loss_clip": 1.02502441, "balance_loss_mlp": 1.0360229, "epoch": 0.8427476326469262, "flos": 19640551299840.0, "grad_norm": 2.0187346845282064, "language_loss": 0.79204798, "learning_rate": 2.3911595119740433e-07, "loss": 0.8136543, "num_input_tokens_seen": 302332790, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 14017, "time_per_iteration": 2.519913911819458 }, { "auxiliary_loss_clip": 0.01119556, "auxiliary_loss_mlp": 0.01030388, "balance_loss_clip": 1.01814556, "balance_loss_mlp": 1.03419852, "epoch": 0.8428077558995941, "flos": 11941160146560.0, "grad_norm": 3.619985836154574, "language_loss": 0.6281234, "learning_rate": 2.3893683672062705e-07, "loss": 0.6496228, "num_input_tokens_seen": 302346490, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 14018, "time_per_iteration": 2.5487706661224365 }, { "auxiliary_loss_clip": 0.01113479, "auxiliary_loss_mlp": 0.0103289, "balance_loss_clip": 1.02113008, "balance_loss_mlp": 1.03472352, "epoch": 0.8428678791522621, "flos": 10451729767680.0, "grad_norm": 1.8143130551147157, "language_loss": 0.79202247, "learning_rate": 2.38757785091664e-07, "loss": 0.81348622, "num_input_tokens_seen": 302363235, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69921875, "step": 14019, "time_per_iteration": 3.9795703887939453 }, { "auxiliary_loss_clip": 0.01131352, "auxiliary_loss_mlp": 0.01032384, "balance_loss_clip": 1.01980758, "balance_loss_mlp": 1.03458691, "epoch": 0.8429280024049302, "flos": 28767248259840.0, "grad_norm": 1.7537974151225997, "language_loss": 0.78454453, "learning_rate": 2.38578796316905e-07, "loss": 0.80618191, "num_input_tokens_seen": 302383270, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 14020, "time_per_iteration": 2.631622791290283 }, { "auxiliary_loss_clip": 0.01112258, "auxiliary_loss_mlp": 0.01027174, "balance_loss_clip": 1.01509762, "balance_loss_mlp": 1.03446305, "epoch": 0.8429881256575981, "flos": 19537093152000.0, "grad_norm": 2.693382330862378, "language_loss": 0.71319103, "learning_rate": 2.3839987040273745e-07, "loss": 0.7345854, "num_input_tokens_seen": 302401355, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 14021, "time_per_iteration": 2.6263697147369385 }, { "auxiliary_loss_clip": 0.01122645, "auxiliary_loss_mlp": 0.01039864, "balance_loss_clip": 1.02732372, "balance_loss_mlp": 1.03451955, "epoch": 0.8430482489102661, "flos": 24790931752320.0, "grad_norm": 1.716064541440861, "language_loss": 0.69478166, "learning_rate": 2.3822100735554817e-07, "loss": 0.7164067, "num_input_tokens_seen": 302419515, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 14022, "time_per_iteration": 2.5470969676971436 }, { "auxiliary_loss_clip": 0.01152933, "auxiliary_loss_mlp": 0.01035108, "balance_loss_clip": 1.02169704, "balance_loss_mlp": 1.03642678, "epoch": 0.843108372162934, "flos": 21544248211200.0, "grad_norm": 2.113363029577765, "language_loss": 0.71779084, "learning_rate": 2.380422071817183e-07, "loss": 0.73967123, "num_input_tokens_seen": 302438280, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 14023, "time_per_iteration": 2.73026442527771 }, { "auxiliary_loss_clip": 0.01113088, "auxiliary_loss_mlp": 0.01035527, "balance_loss_clip": 1.02255702, "balance_loss_mlp": 1.03593576, "epoch": 0.843168495415602, "flos": 24608792862720.0, "grad_norm": 1.4261886623410982, "language_loss": 0.66915262, "learning_rate": 2.3786346988763007e-07, "loss": 0.69063872, "num_input_tokens_seen": 302460860, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 14024, "time_per_iteration": 2.610707998275757 }, { "auxiliary_loss_clip": 0.01110692, "auxiliary_loss_mlp": 0.01032248, "balance_loss_clip": 1.01967716, "balance_loss_mlp": 1.03431451, "epoch": 0.8432286186682699, "flos": 15122738286720.0, "grad_norm": 1.8853231599226132, "language_loss": 0.80931056, "learning_rate": 2.3768479547966213e-07, "loss": 0.83073997, "num_input_tokens_seen": 302476980, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.671875, "step": 14025, "time_per_iteration": 2.543720245361328 }, { "auxiliary_loss_clip": 0.01107626, "auxiliary_loss_mlp": 0.01029575, "balance_loss_clip": 1.01900136, "balance_loss_mlp": 1.03281748, "epoch": 0.843288741920938, "flos": 20301882554880.0, "grad_norm": 1.4871935871430964, "language_loss": 0.77406198, "learning_rate": 2.3750618396419053e-07, "loss": 0.795434, "num_input_tokens_seen": 302496380, "router_z_loss_clip": 0.10546875, "router_z_loss_mlp": 0.6640625, "step": 14026, "time_per_iteration": 3.9721720218658447 }, { "auxiliary_loss_clip": 0.01121074, "auxiliary_loss_mlp": 0.0103534, "balance_loss_clip": 1.02307296, "balance_loss_mlp": 1.03524828, "epoch": 0.8433488651736059, "flos": 23332096782720.0, "grad_norm": 1.4416342330065746, "language_loss": 0.82835627, "learning_rate": 2.3732763534758904e-07, "loss": 0.84992039, "num_input_tokens_seen": 302516845, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 14027, "time_per_iteration": 2.5657124519348145 }, { "auxiliary_loss_clip": 0.01119331, "auxiliary_loss_mlp": 0.01032979, "balance_loss_clip": 1.02104652, "balance_loss_mlp": 1.03347278, "epoch": 0.8434089884262739, "flos": 39458105844480.0, "grad_norm": 1.4148019390337012, "language_loss": 0.56672394, "learning_rate": 2.3714914963623057e-07, "loss": 0.58824712, "num_input_tokens_seen": 302538865, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 14028, "time_per_iteration": 2.729485034942627 }, { "auxiliary_loss_clip": 0.01120146, "auxiliary_loss_mlp": 0.0102527, "balance_loss_clip": 1.01400435, "balance_loss_mlp": 1.03409958, "epoch": 0.8434691116789418, "flos": 23768842250880.0, "grad_norm": 1.932857329126615, "language_loss": 0.63722062, "learning_rate": 2.3697072683648423e-07, "loss": 0.65867484, "num_input_tokens_seen": 302557970, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6796875, "step": 14029, "time_per_iteration": 4.097900867462158 }, { "auxiliary_loss_clip": 0.01099673, "auxiliary_loss_mlp": 0.0102896, "balance_loss_clip": 1.01786792, "balance_loss_mlp": 1.03422809, "epoch": 0.8435292349316098, "flos": 22671411972480.0, "grad_norm": 1.439530525736346, "language_loss": 0.75036258, "learning_rate": 2.3679236695471783e-07, "loss": 0.77164888, "num_input_tokens_seen": 302578915, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.65625, "step": 14030, "time_per_iteration": 2.60080885887146 }, { "auxiliary_loss_clip": 0.01120026, "auxiliary_loss_mlp": 0.01030697, "balance_loss_clip": 1.01840079, "balance_loss_mlp": 1.034477, "epoch": 0.8435893581842777, "flos": 18843622202880.0, "grad_norm": 2.2831384404034107, "language_loss": 0.83349723, "learning_rate": 2.3661406999729584e-07, "loss": 0.85500449, "num_input_tokens_seen": 302596300, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 14031, "time_per_iteration": 2.556424856185913 }, { "auxiliary_loss_clip": 0.01117213, "auxiliary_loss_mlp": 0.01030768, "balance_loss_clip": 1.01910353, "balance_loss_mlp": 1.03292346, "epoch": 0.8436494814369457, "flos": 20704225772160.0, "grad_norm": 2.1815826679473576, "language_loss": 0.80212933, "learning_rate": 2.3643583597058247e-07, "loss": 0.82360911, "num_input_tokens_seen": 302614975, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 14032, "time_per_iteration": 2.596372365951538 }, { "auxiliary_loss_clip": 0.01129476, "auxiliary_loss_mlp": 0.01034393, "balance_loss_clip": 1.02026057, "balance_loss_mlp": 1.03240681, "epoch": 0.8437096046896138, "flos": 22674177319680.0, "grad_norm": 1.4207421730881473, "language_loss": 0.75534117, "learning_rate": 2.3625766488093734e-07, "loss": 0.77697986, "num_input_tokens_seen": 302636415, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.703125, "step": 14033, "time_per_iteration": 2.6107702255249023 }, { "auxiliary_loss_clip": 0.01128371, "auxiliary_loss_mlp": 0.01034351, "balance_loss_clip": 1.02322268, "balance_loss_mlp": 1.03320146, "epoch": 0.8437697279422817, "flos": 16180127879040.0, "grad_norm": 1.791099515443747, "language_loss": 0.83388245, "learning_rate": 2.360795567347189e-07, "loss": 0.8555097, "num_input_tokens_seen": 302653605, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6875, "step": 14034, "time_per_iteration": 2.647657871246338 }, { "auxiliary_loss_clip": 0.01130859, "auxiliary_loss_mlp": 0.01031216, "balance_loss_clip": 1.01877046, "balance_loss_mlp": 1.03492212, "epoch": 0.8438298511949497, "flos": 28765847629440.0, "grad_norm": 1.4765217998231923, "language_loss": 0.78237307, "learning_rate": 2.3590151153828408e-07, "loss": 0.80399382, "num_input_tokens_seen": 302673965, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 14035, "time_per_iteration": 2.6461992263793945 }, { "auxiliary_loss_clip": 0.01110807, "auxiliary_loss_mlp": 0.01033003, "balance_loss_clip": 1.02095103, "balance_loss_mlp": 1.03401506, "epoch": 0.8438899744476176, "flos": 33724284779520.0, "grad_norm": 1.3176662346091186, "language_loss": 0.72291994, "learning_rate": 2.3572352929798644e-07, "loss": 0.74435806, "num_input_tokens_seen": 302695560, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 14036, "time_per_iteration": 2.9157752990722656 }, { "auxiliary_loss_clip": 0.01097142, "auxiliary_loss_mlp": 0.01024895, "balance_loss_clip": 1.01441073, "balance_loss_mlp": 1.03304291, "epoch": 0.8439500977002856, "flos": 25110787386240.0, "grad_norm": 1.6436861777131526, "language_loss": 0.69782257, "learning_rate": 2.3554561002017804e-07, "loss": 0.7190429, "num_input_tokens_seen": 302713480, "router_z_loss_clip": 0.10498047, "router_z_loss_mlp": 0.640625, "step": 14037, "time_per_iteration": 2.5856966972351074 }, { "auxiliary_loss_clip": 0.01100054, "auxiliary_loss_mlp": 0.01026099, "balance_loss_clip": 1.01411235, "balance_loss_mlp": 1.03268051, "epoch": 0.8440102209529535, "flos": 32850362880000.0, "grad_norm": 1.9206013407179232, "language_loss": 0.69056737, "learning_rate": 2.3536775371120732e-07, "loss": 0.71182895, "num_input_tokens_seen": 302736860, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 14038, "time_per_iteration": 2.6860764026641846 }, { "auxiliary_loss_clip": 0.01140129, "auxiliary_loss_mlp": 0.01034707, "balance_loss_clip": 1.02202952, "balance_loss_mlp": 1.03466499, "epoch": 0.8440703442056215, "flos": 23730202195200.0, "grad_norm": 1.558309941595967, "language_loss": 0.76225936, "learning_rate": 2.3518996037742345e-07, "loss": 0.78400767, "num_input_tokens_seen": 302757745, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 14039, "time_per_iteration": 2.646082639694214 }, { "auxiliary_loss_clip": 0.01109643, "auxiliary_loss_mlp": 0.01030134, "balance_loss_clip": 1.01757586, "balance_loss_mlp": 1.03685594, "epoch": 0.8441304674582895, "flos": 20193719725440.0, "grad_norm": 2.0631491770879786, "language_loss": 0.79412884, "learning_rate": 2.3501223002516935e-07, "loss": 0.8155266, "num_input_tokens_seen": 302774885, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 14040, "time_per_iteration": 2.52694034576416 }, { "auxiliary_loss_clip": 0.01131722, "auxiliary_loss_mlp": 0.01032382, "balance_loss_clip": 1.01998448, "balance_loss_mlp": 1.03635323, "epoch": 0.8441905907109575, "flos": 20219897761920.0, "grad_norm": 2.7289534506716118, "language_loss": 0.69154155, "learning_rate": 2.3483456266078882e-07, "loss": 0.71318257, "num_input_tokens_seen": 302791035, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 14041, "time_per_iteration": 2.600123405456543 }, { "auxiliary_loss_clip": 0.01123711, "auxiliary_loss_mlp": 0.01033507, "balance_loss_clip": 1.02176476, "balance_loss_mlp": 1.03469706, "epoch": 0.8442507139636254, "flos": 13516453987200.0, "grad_norm": 1.8371868587967275, "language_loss": 0.68660593, "learning_rate": 2.346569582906226e-07, "loss": 0.70817804, "num_input_tokens_seen": 302808650, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.7109375, "step": 14042, "time_per_iteration": 2.604004144668579 }, { "auxiliary_loss_clip": 0.01117701, "auxiliary_loss_mlp": 0.01030686, "balance_loss_clip": 1.01913512, "balance_loss_mlp": 1.03332031, "epoch": 0.8443108372162934, "flos": 18220212731520.0, "grad_norm": 1.6374614612627179, "language_loss": 0.74953175, "learning_rate": 2.3447941692100824e-07, "loss": 0.77101558, "num_input_tokens_seen": 302824605, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66796875, "step": 14043, "time_per_iteration": 2.5361874103546143 }, { "auxiliary_loss_clip": 0.01101462, "auxiliary_loss_mlp": 0.0127644, "balance_loss_clip": 1.01776373, "balance_loss_mlp": 1.03367209, "epoch": 0.8443709604689613, "flos": 16105110324480.0, "grad_norm": 3.9951685389318516, "language_loss": 0.71850049, "learning_rate": 2.3430193855828184e-07, "loss": 0.74227959, "num_input_tokens_seen": 302840170, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 14044, "time_per_iteration": 2.4729690551757812 }, { "auxiliary_loss_clip": 0.01116991, "auxiliary_loss_mlp": 0.01027797, "balance_loss_clip": 1.01539946, "balance_loss_mlp": 1.03269291, "epoch": 0.8444310837216293, "flos": 18512130562560.0, "grad_norm": 1.538442043613496, "language_loss": 0.74857342, "learning_rate": 2.3412452320877764e-07, "loss": 0.77002144, "num_input_tokens_seen": 302858320, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.66796875, "step": 14045, "time_per_iteration": 2.5764877796173096 }, { "auxiliary_loss_clip": 0.01118946, "auxiliary_loss_mlp": 0.0127786, "balance_loss_clip": 1.0188303, "balance_loss_mlp": 1.03259981, "epoch": 0.8444912069742974, "flos": 14939845211520.0, "grad_norm": 1.6922391696292136, "language_loss": 0.78735089, "learning_rate": 2.3394717087882676e-07, "loss": 0.81131899, "num_input_tokens_seen": 302875255, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 14046, "time_per_iteration": 2.4964723587036133 }, { "auxiliary_loss_clip": 0.01122337, "auxiliary_loss_mlp": 0.01029433, "balance_loss_clip": 1.01659429, "balance_loss_mlp": 1.03408432, "epoch": 0.8445513302269653, "flos": 20120318282880.0, "grad_norm": 1.769492801552691, "language_loss": 0.7814272, "learning_rate": 2.3376988157475863e-07, "loss": 0.8029449, "num_input_tokens_seen": 302894690, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 14047, "time_per_iteration": 2.5626938343048096 }, { "auxiliary_loss_clip": 0.01121741, "auxiliary_loss_mlp": 0.01030557, "balance_loss_clip": 1.01839817, "balance_loss_mlp": 1.03608048, "epoch": 0.8446114534796333, "flos": 31170928533120.0, "grad_norm": 1.7659560538099994, "language_loss": 0.71941364, "learning_rate": 2.3359265530289995e-07, "loss": 0.7409367, "num_input_tokens_seen": 302912405, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.67578125, "step": 14048, "time_per_iteration": 2.6106643676757812 }, { "auxiliary_loss_clip": 0.01165743, "auxiliary_loss_mlp": 0.01036928, "balance_loss_clip": 1.0246433, "balance_loss_mlp": 1.03530765, "epoch": 0.8446715767323012, "flos": 23948323534080.0, "grad_norm": 1.466687541225671, "language_loss": 0.73443925, "learning_rate": 2.3341549206957588e-07, "loss": 0.75646597, "num_input_tokens_seen": 302932525, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 14049, "time_per_iteration": 2.658621072769165 }, { "auxiliary_loss_clip": 0.01030976, "auxiliary_loss_mlp": 0.01248132, "balance_loss_clip": 1.00085378, "balance_loss_mlp": 1.00086832, "epoch": 0.8447316999849692, "flos": 67984897484160.0, "grad_norm": 0.8034648895519002, "language_loss": 0.60691607, "learning_rate": 2.332383918811085e-07, "loss": 0.62970716, "num_input_tokens_seen": 302991285, "router_z_loss_clip": 0.0135498, "router_z_loss_mlp": 0.21289062, "step": 14050, "time_per_iteration": 3.0504448413848877 }, { "auxiliary_loss_clip": 0.01119194, "auxiliary_loss_mlp": 0.01276306, "balance_loss_clip": 1.01648688, "balance_loss_mlp": 1.03285217, "epoch": 0.8447918232376371, "flos": 22418924296320.0, "grad_norm": 2.5037588515089513, "language_loss": 0.72455865, "learning_rate": 2.3306135474381805e-07, "loss": 0.7485137, "num_input_tokens_seen": 303009515, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 14051, "time_per_iteration": 2.6432464122772217 }, { "auxiliary_loss_clip": 0.01110194, "auxiliary_loss_mlp": 0.01028027, "balance_loss_clip": 1.01620746, "balance_loss_mlp": 1.03409696, "epoch": 0.8448519464903052, "flos": 23694147918720.0, "grad_norm": 1.8511833233691697, "language_loss": 0.75038028, "learning_rate": 2.328843806640235e-07, "loss": 0.77176249, "num_input_tokens_seen": 303026905, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 14052, "time_per_iteration": 2.520761489868164 }, { "auxiliary_loss_clip": 0.01115224, "auxiliary_loss_mlp": 0.01026911, "balance_loss_clip": 1.01611018, "balance_loss_mlp": 1.03339505, "epoch": 0.8449120697429731, "flos": 13735904129280.0, "grad_norm": 1.6975360504557768, "language_loss": 0.73663914, "learning_rate": 2.327074696480391e-07, "loss": 0.75806046, "num_input_tokens_seen": 303045245, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.63671875, "step": 14053, "time_per_iteration": 2.5133585929870605 }, { "auxiliary_loss_clip": 0.01110742, "auxiliary_loss_mlp": 0.01029991, "balance_loss_clip": 1.01773, "balance_loss_mlp": 1.03426313, "epoch": 0.8449721929956411, "flos": 20886795624960.0, "grad_norm": 1.7981245644865818, "language_loss": 0.74009943, "learning_rate": 2.3253062170217942e-07, "loss": 0.76150668, "num_input_tokens_seen": 303065205, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.671875, "step": 14054, "time_per_iteration": 2.5408987998962402 }, { "auxiliary_loss_clip": 0.01116022, "auxiliary_loss_mlp": 0.01029086, "balance_loss_clip": 1.01720071, "balance_loss_mlp": 1.03588974, "epoch": 0.845032316248309, "flos": 33216939129600.0, "grad_norm": 2.092852254924968, "language_loss": 0.78232872, "learning_rate": 2.323538368327549e-07, "loss": 0.80377984, "num_input_tokens_seen": 303088250, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.71484375, "step": 14055, "time_per_iteration": 4.078706502914429 }, { "auxiliary_loss_clip": 0.01110442, "auxiliary_loss_mlp": 0.01031073, "balance_loss_clip": 1.01962852, "balance_loss_mlp": 1.03439367, "epoch": 0.845092439500977, "flos": 23585230903680.0, "grad_norm": 1.6640277372664858, "language_loss": 0.72784907, "learning_rate": 2.3217711504607583e-07, "loss": 0.74926424, "num_input_tokens_seen": 303109280, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 14056, "time_per_iteration": 2.5459232330322266 }, { "auxiliary_loss_clip": 0.0111747, "auxiliary_loss_mlp": 0.01033833, "balance_loss_clip": 1.02107787, "balance_loss_mlp": 1.03658509, "epoch": 0.8451525627536449, "flos": 13333920048000.0, "grad_norm": 2.5599375917580374, "language_loss": 0.67344606, "learning_rate": 2.3200045634844744e-07, "loss": 0.69495904, "num_input_tokens_seen": 303126075, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 14057, "time_per_iteration": 2.486328601837158 }, { "auxiliary_loss_clip": 0.01141446, "auxiliary_loss_mlp": 0.01029648, "balance_loss_clip": 1.01760793, "balance_loss_mlp": 1.03542686, "epoch": 0.8452126860063129, "flos": 27817985583360.0, "grad_norm": 1.8588936689091269, "language_loss": 0.77967525, "learning_rate": 2.318238607461751e-07, "loss": 0.80138618, "num_input_tokens_seen": 303146920, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 14058, "time_per_iteration": 2.6174018383026123 }, { "auxiliary_loss_clip": 0.01104145, "auxiliary_loss_mlp": 0.01032877, "balance_loss_clip": 1.02078891, "balance_loss_mlp": 1.03460014, "epoch": 0.845272809258981, "flos": 27124694202240.0, "grad_norm": 1.4353898628575796, "language_loss": 0.69879335, "learning_rate": 2.316473282455611e-07, "loss": 0.72016358, "num_input_tokens_seen": 303167885, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 14059, "time_per_iteration": 2.561098098754883 }, { "auxiliary_loss_clip": 0.01107297, "auxiliary_loss_mlp": 0.01034518, "balance_loss_clip": 1.0215838, "balance_loss_mlp": 1.03497267, "epoch": 0.8453329325116489, "flos": 18332577452160.0, "grad_norm": 1.9368448051980354, "language_loss": 0.57514602, "learning_rate": 2.314708588529053e-07, "loss": 0.59656417, "num_input_tokens_seen": 303185000, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 14060, "time_per_iteration": 2.47891902923584 }, { "auxiliary_loss_clip": 0.0112318, "auxiliary_loss_mlp": 0.01034476, "balance_loss_clip": 1.02073157, "balance_loss_mlp": 1.03313112, "epoch": 0.8453930557643169, "flos": 22675254727680.0, "grad_norm": 1.6200391531895062, "language_loss": 0.75559455, "learning_rate": 2.3129445257450463e-07, "loss": 0.77717113, "num_input_tokens_seen": 303205210, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.72265625, "step": 14061, "time_per_iteration": 3.995718002319336 }, { "auxiliary_loss_clip": 0.01124629, "auxiliary_loss_mlp": 0.0102685, "balance_loss_clip": 1.01420152, "balance_loss_mlp": 1.03527749, "epoch": 0.8454531790169848, "flos": 22487261921280.0, "grad_norm": 2.1044400028620123, "language_loss": 0.71070766, "learning_rate": 2.311181094166559e-07, "loss": 0.73222244, "num_input_tokens_seen": 303224655, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 14062, "time_per_iteration": 2.565539836883545 }, { "auxiliary_loss_clip": 0.01058323, "auxiliary_loss_mlp": 0.01002114, "balance_loss_clip": 1.00073099, "balance_loss_mlp": 1.0014596, "epoch": 0.8455133022696528, "flos": 58277848481280.0, "grad_norm": 0.6334286001024957, "language_loss": 0.52643526, "learning_rate": 2.3094182938565221e-07, "loss": 0.54703963, "num_input_tokens_seen": 303289645, "router_z_loss_clip": 0.01385498, "router_z_loss_mlp": 0.21289062, "step": 14063, "time_per_iteration": 3.2312819957733154 }, { "auxiliary_loss_clip": 0.01126827, "auxiliary_loss_mlp": 0.01026152, "balance_loss_clip": 1.01531553, "balance_loss_mlp": 1.03520739, "epoch": 0.8455734255223207, "flos": 21361283308800.0, "grad_norm": 2.2919058788912148, "language_loss": 0.81883848, "learning_rate": 2.3076561248778305e-07, "loss": 0.84036827, "num_input_tokens_seen": 303308350, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.64453125, "step": 14064, "time_per_iteration": 2.603459358215332 }, { "auxiliary_loss_clip": 0.01115992, "auxiliary_loss_mlp": 0.01034684, "balance_loss_clip": 1.02126718, "balance_loss_mlp": 1.03663027, "epoch": 0.8456335487749888, "flos": 20449260057600.0, "grad_norm": 2.9889539030133823, "language_loss": 0.72592044, "learning_rate": 2.3058945872933867e-07, "loss": 0.74742723, "num_input_tokens_seen": 303325230, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 14065, "time_per_iteration": 2.5683236122131348 }, { "auxiliary_loss_clip": 0.01129046, "auxiliary_loss_mlp": 0.01029128, "balance_loss_clip": 1.01734447, "balance_loss_mlp": 1.03581953, "epoch": 0.8456936720276567, "flos": 28840901097600.0, "grad_norm": 1.4884051498531288, "language_loss": 0.77324879, "learning_rate": 2.3041336811660517e-07, "loss": 0.7948305, "num_input_tokens_seen": 303345810, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.66796875, "step": 14066, "time_per_iteration": 2.63199520111084 }, { "auxiliary_loss_clip": 0.01131908, "auxiliary_loss_mlp": 0.01029549, "balance_loss_clip": 1.01687121, "balance_loss_mlp": 1.03653789, "epoch": 0.8457537952803247, "flos": 22672884430080.0, "grad_norm": 1.5109242255913269, "language_loss": 0.69846725, "learning_rate": 2.3023734065586641e-07, "loss": 0.72008187, "num_input_tokens_seen": 303365140, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 14067, "time_per_iteration": 2.6761085987091064 }, { "auxiliary_loss_clip": 0.01126575, "auxiliary_loss_mlp": 0.01029816, "balance_loss_clip": 1.017519, "balance_loss_mlp": 1.03226733, "epoch": 0.8458139185329926, "flos": 33802929607680.0, "grad_norm": 4.057256639185153, "language_loss": 0.71231997, "learning_rate": 2.3006137635340427e-07, "loss": 0.73388392, "num_input_tokens_seen": 303386150, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.67578125, "step": 14068, "time_per_iteration": 4.08352255821228 }, { "auxiliary_loss_clip": 0.0105033, "auxiliary_loss_mlp": 0.00999539, "balance_loss_clip": 0.99817371, "balance_loss_mlp": 1.00192595, "epoch": 0.8458740417856606, "flos": 70295929603200.0, "grad_norm": 0.6971082386594885, "language_loss": 0.6050247, "learning_rate": 2.298854752154997e-07, "loss": 0.62552333, "num_input_tokens_seen": 303453770, "router_z_loss_clip": 0.01367188, "router_z_loss_mlp": 0.21289062, "step": 14069, "time_per_iteration": 3.2365148067474365 }, { "auxiliary_loss_clip": 0.01122196, "auxiliary_loss_mlp": 0.01029509, "balance_loss_clip": 1.01768303, "balance_loss_mlp": 1.03417253, "epoch": 0.8459341650383285, "flos": 24170862245760.0, "grad_norm": 4.124629652612424, "language_loss": 0.74243009, "learning_rate": 2.297096372484284e-07, "loss": 0.76394719, "num_input_tokens_seen": 303474520, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 14070, "time_per_iteration": 4.1358606815338135 }, { "auxiliary_loss_clip": 0.01137113, "auxiliary_loss_mlp": 0.01032656, "balance_loss_clip": 1.02010322, "balance_loss_mlp": 1.03358173, "epoch": 0.8459942882909965, "flos": 38181158369280.0, "grad_norm": 2.020638330127647, "language_loss": 0.67140543, "learning_rate": 2.295338624584666e-07, "loss": 0.69310319, "num_input_tokens_seen": 303497345, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 14071, "time_per_iteration": 2.7630436420440674 }, { "auxiliary_loss_clip": 0.01031447, "auxiliary_loss_mlp": 0.01248318, "balance_loss_clip": 1.00097394, "balance_loss_mlp": 1.00160348, "epoch": 0.8460544115436646, "flos": 64118252177280.0, "grad_norm": 0.7343443370652557, "language_loss": 0.6111877, "learning_rate": 2.2935815085188692e-07, "loss": 0.63398528, "num_input_tokens_seen": 303554890, "router_z_loss_clip": 0.01403809, "router_z_loss_mlp": 0.21289062, "step": 14072, "time_per_iteration": 2.9905319213867188 }, { "auxiliary_loss_clip": 0.01132766, "auxiliary_loss_mlp": 0.01033522, "balance_loss_clip": 1.020069, "balance_loss_mlp": 1.03471923, "epoch": 0.8461145347963325, "flos": 24170826332160.0, "grad_norm": 2.4287022927241324, "language_loss": 0.72748852, "learning_rate": 2.2918250243496096e-07, "loss": 0.74915141, "num_input_tokens_seen": 303574380, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 14073, "time_per_iteration": 2.594733238220215 }, { "auxiliary_loss_clip": 0.0112779, "auxiliary_loss_mlp": 0.01032048, "balance_loss_clip": 1.01824379, "balance_loss_mlp": 1.0363847, "epoch": 0.8461746580490005, "flos": 34893787697280.0, "grad_norm": 2.4103360043010134, "language_loss": 0.78235197, "learning_rate": 2.290069172139555e-07, "loss": 0.80395031, "num_input_tokens_seen": 303594910, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 14074, "time_per_iteration": 2.6457479000091553 }, { "auxiliary_loss_clip": 0.01112798, "auxiliary_loss_mlp": 0.01030575, "balance_loss_clip": 1.01803434, "balance_loss_mlp": 1.03504562, "epoch": 0.8462347813016684, "flos": 25557014044800.0, "grad_norm": 1.973610307819154, "language_loss": 0.75593841, "learning_rate": 2.2883139519513818e-07, "loss": 0.77737212, "num_input_tokens_seen": 303613520, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 14075, "time_per_iteration": 2.600132465362549 }, { "auxiliary_loss_clip": 0.01118624, "auxiliary_loss_mlp": 0.01026673, "balance_loss_clip": 1.01412058, "balance_loss_mlp": 1.03383136, "epoch": 0.8462949045543364, "flos": 21325336773120.0, "grad_norm": 3.288324173394173, "language_loss": 0.72903764, "learning_rate": 2.2865593638477222e-07, "loss": 0.75049061, "num_input_tokens_seen": 303631225, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.671875, "step": 14076, "time_per_iteration": 2.5022640228271484 }, { "auxiliary_loss_clip": 0.0112446, "auxiliary_loss_mlp": 0.01030617, "balance_loss_clip": 1.01822519, "balance_loss_mlp": 1.03465223, "epoch": 0.8463550278070043, "flos": 22637440684800.0, "grad_norm": 2.085933708028278, "language_loss": 0.78175849, "learning_rate": 2.284805407891195e-07, "loss": 0.80330932, "num_input_tokens_seen": 303649175, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 14077, "time_per_iteration": 2.5909175872802734 }, { "auxiliary_loss_clip": 0.01134285, "auxiliary_loss_mlp": 0.01033896, "balance_loss_clip": 1.02109861, "balance_loss_mlp": 1.0363276, "epoch": 0.8464151510596724, "flos": 13005588804480.0, "grad_norm": 2.2870964552004938, "language_loss": 0.75212538, "learning_rate": 2.2830520841443945e-07, "loss": 0.77380717, "num_input_tokens_seen": 303665915, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 14078, "time_per_iteration": 2.522294044494629 }, { "auxiliary_loss_clip": 0.01136203, "auxiliary_loss_mlp": 0.01024652, "balance_loss_clip": 1.01200986, "balance_loss_mlp": 1.03324604, "epoch": 0.8464752743123403, "flos": 15704921923200.0, "grad_norm": 2.7565293970409503, "language_loss": 0.85559219, "learning_rate": 2.2812993926698866e-07, "loss": 0.87720072, "num_input_tokens_seen": 303679985, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.67578125, "step": 14079, "time_per_iteration": 2.5325827598571777 }, { "auxiliary_loss_clip": 0.01139247, "auxiliary_loss_mlp": 0.01037036, "balance_loss_clip": 1.02325511, "balance_loss_mlp": 1.03682423, "epoch": 0.8465353975650083, "flos": 21653955325440.0, "grad_norm": 2.3810807224547257, "language_loss": 0.58671784, "learning_rate": 2.279547333530234e-07, "loss": 0.60848075, "num_input_tokens_seen": 303698470, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 14080, "time_per_iteration": 2.627779245376587 }, { "auxiliary_loss_clip": 0.01123092, "auxiliary_loss_mlp": 0.01030624, "balance_loss_clip": 1.01791656, "balance_loss_mlp": 1.03353405, "epoch": 0.8465955208176762, "flos": 18515650095360.0, "grad_norm": 2.2747269951817217, "language_loss": 0.66188836, "learning_rate": 2.2777959067879472e-07, "loss": 0.68342555, "num_input_tokens_seen": 303716415, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 14081, "time_per_iteration": 2.48579740524292 }, { "auxiliary_loss_clip": 0.01114501, "auxiliary_loss_mlp": 0.0103576, "balance_loss_clip": 1.02427411, "balance_loss_mlp": 1.03441417, "epoch": 0.8466556440703442, "flos": 24200559815040.0, "grad_norm": 1.639898775125471, "language_loss": 0.73147023, "learning_rate": 2.2760451125055402e-07, "loss": 0.75297284, "num_input_tokens_seen": 303734490, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.7109375, "step": 14082, "time_per_iteration": 2.5219709873199463 }, { "auxiliary_loss_clip": 0.01140028, "auxiliary_loss_mlp": 0.01026637, "balance_loss_clip": 1.01525235, "balance_loss_mlp": 1.03345561, "epoch": 0.8467157673230121, "flos": 20375894528640.0, "grad_norm": 2.025445707489203, "language_loss": 0.76041746, "learning_rate": 2.2742949507454924e-07, "loss": 0.78208411, "num_input_tokens_seen": 303752310, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.703125, "step": 14083, "time_per_iteration": 2.536954402923584 }, { "auxiliary_loss_clip": 0.01143925, "auxiliary_loss_mlp": 0.01032853, "balance_loss_clip": 1.01900649, "balance_loss_mlp": 1.03483319, "epoch": 0.8467758905756801, "flos": 28473642489600.0, "grad_norm": 2.1594934091061613, "language_loss": 0.65936172, "learning_rate": 2.272545421570262e-07, "loss": 0.68112946, "num_input_tokens_seen": 303776065, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 14084, "time_per_iteration": 2.615561008453369 }, { "auxiliary_loss_clip": 0.01140601, "auxiliary_loss_mlp": 0.01031149, "balance_loss_clip": 1.01941299, "balance_loss_mlp": 1.03677928, "epoch": 0.8468360138283482, "flos": 11692551139200.0, "grad_norm": 2.4604821016453027, "language_loss": 0.69883299, "learning_rate": 2.2707965250422823e-07, "loss": 0.72055054, "num_input_tokens_seen": 303793500, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 14085, "time_per_iteration": 2.5486013889312744 }, { "auxiliary_loss_clip": 0.01138668, "auxiliary_loss_mlp": 0.01031206, "balance_loss_clip": 1.01957154, "balance_loss_mlp": 1.0338769, "epoch": 0.8468961370810161, "flos": 24607859109120.0, "grad_norm": 1.5605107342114002, "language_loss": 0.71021014, "learning_rate": 2.269048261223976e-07, "loss": 0.73190892, "num_input_tokens_seen": 303814835, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 14086, "time_per_iteration": 2.5681064128875732 }, { "auxiliary_loss_clip": 0.01110568, "auxiliary_loss_mlp": 0.01030382, "balance_loss_clip": 1.01842523, "balance_loss_mlp": 1.03370488, "epoch": 0.8469562603336841, "flos": 19609812236160.0, "grad_norm": 2.238883940160836, "language_loss": 0.74405342, "learning_rate": 2.2673006301777198e-07, "loss": 0.76546293, "num_input_tokens_seen": 303834505, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 14087, "time_per_iteration": 2.520009994506836 }, { "auxiliary_loss_clip": 0.01121303, "auxiliary_loss_mlp": 0.01024503, "balance_loss_clip": 1.01253438, "balance_loss_mlp": 1.03491664, "epoch": 0.847016383586352, "flos": 22638949056000.0, "grad_norm": 1.3670754603760669, "language_loss": 0.74027312, "learning_rate": 2.2655536319658952e-07, "loss": 0.76173121, "num_input_tokens_seen": 303855050, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 14088, "time_per_iteration": 2.5602242946624756 }, { "auxiliary_loss_clip": 0.01113947, "auxiliary_loss_mlp": 0.01035858, "balance_loss_clip": 1.02336478, "balance_loss_mlp": 1.03436565, "epoch": 0.84707650683902, "flos": 20960161153920.0, "grad_norm": 1.8985452137707324, "language_loss": 0.72182852, "learning_rate": 2.2638072666508369e-07, "loss": 0.74332654, "num_input_tokens_seen": 303875635, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 14089, "time_per_iteration": 2.6019742488861084 }, { "auxiliary_loss_clip": 0.01099784, "auxiliary_loss_mlp": 0.01027174, "balance_loss_clip": 1.0153718, "balance_loss_mlp": 1.03432059, "epoch": 0.8471366300916879, "flos": 24093007516800.0, "grad_norm": 1.6343127548779612, "language_loss": 0.79340726, "learning_rate": 2.2620615342948857e-07, "loss": 0.81467688, "num_input_tokens_seen": 303896750, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.65625, "step": 14090, "time_per_iteration": 2.5942766666412354 }, { "auxiliary_loss_clip": 0.01149996, "auxiliary_loss_mlp": 0.01038897, "balance_loss_clip": 1.02495527, "balance_loss_mlp": 1.0349164, "epoch": 0.847196753344356, "flos": 18332900674560.0, "grad_norm": 2.2781896517676636, "language_loss": 0.77332795, "learning_rate": 2.260316434960319e-07, "loss": 0.79521692, "num_input_tokens_seen": 303915435, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.70703125, "step": 14091, "time_per_iteration": 2.6260366439819336 }, { "auxiliary_loss_clip": 0.0102308, "auxiliary_loss_mlp": 0.0099939, "balance_loss_clip": 0.9980728, "balance_loss_mlp": 1.00177479, "epoch": 0.8472568765970239, "flos": 49567536956160.0, "grad_norm": 0.8239604231912134, "language_loss": 0.59479356, "learning_rate": 2.258571968709433e-07, "loss": 0.61501825, "num_input_tokens_seen": 303977245, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.21289062, "step": 14092, "time_per_iteration": 3.199922561645508 }, { "auxiliary_loss_clip": 0.01119869, "auxiliary_loss_mlp": 0.01035742, "balance_loss_clip": 1.02218175, "balance_loss_mlp": 1.03241158, "epoch": 0.8473169998496919, "flos": 19279074781440.0, "grad_norm": 1.71638197523708, "language_loss": 0.70587206, "learning_rate": 2.2568281356044783e-07, "loss": 0.72742814, "num_input_tokens_seen": 303996055, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6953125, "step": 14093, "time_per_iteration": 2.588085651397705 }, { "auxiliary_loss_clip": 0.01099561, "auxiliary_loss_mlp": 0.01027913, "balance_loss_clip": 1.01633811, "balance_loss_mlp": 1.03304505, "epoch": 0.8473771231023598, "flos": 17675555829120.0, "grad_norm": 1.918941270307523, "language_loss": 0.83294189, "learning_rate": 2.2550849357076852e-07, "loss": 0.85421664, "num_input_tokens_seen": 304012205, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6640625, "step": 14094, "time_per_iteration": 2.5429489612579346 }, { "auxiliary_loss_clip": 0.01109245, "auxiliary_loss_mlp": 0.01030585, "balance_loss_clip": 1.0188781, "balance_loss_mlp": 1.03429031, "epoch": 0.8474372463550278, "flos": 26359761144960.0, "grad_norm": 1.6258969920055193, "language_loss": 0.71303737, "learning_rate": 2.2533423690812637e-07, "loss": 0.73443568, "num_input_tokens_seen": 304033475, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.65625, "step": 14095, "time_per_iteration": 2.5750670433044434 }, { "auxiliary_loss_clip": 0.01109593, "auxiliary_loss_mlp": 0.01281843, "balance_loss_clip": 1.02359259, "balance_loss_mlp": 1.03385842, "epoch": 0.8474973696076957, "flos": 23402050519680.0, "grad_norm": 1.686595141787818, "language_loss": 0.80748498, "learning_rate": 2.2516004357874064e-07, "loss": 0.83139932, "num_input_tokens_seen": 304051845, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66796875, "step": 14096, "time_per_iteration": 2.6301965713500977 }, { "auxiliary_loss_clip": 0.01113693, "auxiliary_loss_mlp": 0.01031513, "balance_loss_clip": 1.01838255, "balance_loss_mlp": 1.0341047, "epoch": 0.8475574928603637, "flos": 25075666863360.0, "grad_norm": 1.7161641102439396, "language_loss": 0.77493429, "learning_rate": 2.2498591358882724e-07, "loss": 0.79638642, "num_input_tokens_seen": 304069965, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 14097, "time_per_iteration": 3.9827706813812256 }, { "auxiliary_loss_clip": 0.01129822, "auxiliary_loss_mlp": 0.0102876, "balance_loss_clip": 1.01695836, "balance_loss_mlp": 1.03595972, "epoch": 0.8476176161130318, "flos": 19966691813760.0, "grad_norm": 1.6370135547653353, "language_loss": 0.80065787, "learning_rate": 2.248118469446003e-07, "loss": 0.82224369, "num_input_tokens_seen": 304086805, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 14098, "time_per_iteration": 2.5581939220428467 }, { "auxiliary_loss_clip": 0.01110507, "auxiliary_loss_mlp": 0.01281411, "balance_loss_clip": 1.02177429, "balance_loss_mlp": 1.03400159, "epoch": 0.8476777393656997, "flos": 21285834791040.0, "grad_norm": 1.8678408890473945, "language_loss": 0.71473587, "learning_rate": 2.246378436522729e-07, "loss": 0.73865497, "num_input_tokens_seen": 304105865, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.671875, "step": 14099, "time_per_iteration": 2.6100921630859375 }, { "auxiliary_loss_clip": 0.01111171, "auxiliary_loss_mlp": 0.01031264, "balance_loss_clip": 1.01769197, "balance_loss_mlp": 1.03323114, "epoch": 0.8477378626183677, "flos": 29971476650880.0, "grad_norm": 2.4748614156287516, "language_loss": 0.634606, "learning_rate": 2.244639037180538e-07, "loss": 0.65603036, "num_input_tokens_seen": 304128300, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.69140625, "step": 14100, "time_per_iteration": 2.574843406677246 }, { "auxiliary_loss_clip": 0.011138, "auxiliary_loss_mlp": 0.01030109, "balance_loss_clip": 1.0169301, "balance_loss_mlp": 1.03408921, "epoch": 0.8477979858710356, "flos": 24237727413120.0, "grad_norm": 2.4493444738813706, "language_loss": 0.73560524, "learning_rate": 2.2429002714815093e-07, "loss": 0.75704432, "num_input_tokens_seen": 304143695, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 14101, "time_per_iteration": 2.5130155086517334 }, { "auxiliary_loss_clip": 0.01112316, "auxiliary_loss_mlp": 0.01029741, "balance_loss_clip": 1.01803493, "balance_loss_mlp": 1.03416538, "epoch": 0.8478581091237036, "flos": 25593678852480.0, "grad_norm": 1.6829982247743198, "language_loss": 0.7125622, "learning_rate": 2.241162139487689e-07, "loss": 0.7339828, "num_input_tokens_seen": 304165800, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 14102, "time_per_iteration": 3.9540224075317383 }, { "auxiliary_loss_clip": 0.01125161, "auxiliary_loss_mlp": 0.01031934, "balance_loss_clip": 1.01864266, "balance_loss_mlp": 1.03522277, "epoch": 0.8479182323763715, "flos": 12057116227200.0, "grad_norm": 2.034986488657207, "language_loss": 0.81510198, "learning_rate": 2.2394246412611185e-07, "loss": 0.83667296, "num_input_tokens_seen": 304182910, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 14103, "time_per_iteration": 2.484467029571533 }, { "auxiliary_loss_clip": 0.01102423, "auxiliary_loss_mlp": 0.01029546, "balance_loss_clip": 1.01747584, "balance_loss_mlp": 1.03501201, "epoch": 0.8479783556290396, "flos": 21433391861760.0, "grad_norm": 2.033889769926057, "language_loss": 0.78668213, "learning_rate": 2.237687776863788e-07, "loss": 0.80800176, "num_input_tokens_seen": 304200175, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.671875, "step": 14104, "time_per_iteration": 2.4710772037506104 }, { "auxiliary_loss_clip": 0.0111578, "auxiliary_loss_mlp": 0.01034414, "balance_loss_clip": 1.02139688, "balance_loss_mlp": 1.03634715, "epoch": 0.8480384788817075, "flos": 19642634288640.0, "grad_norm": 1.9385476748019068, "language_loss": 0.78907299, "learning_rate": 2.235951546357695e-07, "loss": 0.81057495, "num_input_tokens_seen": 304217775, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 14105, "time_per_iteration": 2.5353641510009766 }, { "auxiliary_loss_clip": 0.01126313, "auxiliary_loss_mlp": 0.01032766, "balance_loss_clip": 1.01950955, "balance_loss_mlp": 1.03475976, "epoch": 0.8480986021343755, "flos": 22489201255680.0, "grad_norm": 4.0374646361831426, "language_loss": 0.50146371, "learning_rate": 2.234215949804794e-07, "loss": 0.52305448, "num_input_tokens_seen": 304235760, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 14106, "time_per_iteration": 2.5532891750335693 }, { "auxiliary_loss_clip": 0.01129337, "auxiliary_loss_mlp": 0.01029912, "balance_loss_clip": 1.01821208, "balance_loss_mlp": 1.03440738, "epoch": 0.8481587253870434, "flos": 22090557139200.0, "grad_norm": 1.7953700365912257, "language_loss": 0.75879639, "learning_rate": 2.2324809872670337e-07, "loss": 0.78038883, "num_input_tokens_seen": 304253985, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 14107, "time_per_iteration": 2.633519411087036 }, { "auxiliary_loss_clip": 0.01130465, "auxiliary_loss_mlp": 0.01028058, "balance_loss_clip": 1.01567197, "balance_loss_mlp": 1.03511405, "epoch": 0.8482188486397114, "flos": 33582689366400.0, "grad_norm": 1.6042649161169025, "language_loss": 0.73374259, "learning_rate": 2.2307466588063194e-07, "loss": 0.75532782, "num_input_tokens_seen": 304276785, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 14108, "time_per_iteration": 2.820600748062134 }, { "auxiliary_loss_clip": 0.0109954, "auxiliary_loss_mlp": 0.01026631, "balance_loss_clip": 1.01336288, "balance_loss_mlp": 1.0333941, "epoch": 0.8482789718923793, "flos": 20919402195840.0, "grad_norm": 1.761987889069983, "language_loss": 0.72678602, "learning_rate": 2.2290129644845445e-07, "loss": 0.74804771, "num_input_tokens_seen": 304296310, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.66015625, "step": 14109, "time_per_iteration": 2.585155487060547 }, { "auxiliary_loss_clip": 0.01145103, "auxiliary_loss_mlp": 0.01031992, "balance_loss_clip": 1.01954079, "balance_loss_mlp": 1.03328967, "epoch": 0.8483390951450474, "flos": 12896204912640.0, "grad_norm": 1.9027522533494063, "language_loss": 0.7380898, "learning_rate": 2.2272799043635926e-07, "loss": 0.75986075, "num_input_tokens_seen": 304311715, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.671875, "step": 14110, "time_per_iteration": 4.876477003097534 }, { "auxiliary_loss_clip": 0.01147709, "auxiliary_loss_mlp": 0.01032981, "balance_loss_clip": 1.02097702, "balance_loss_mlp": 1.03474176, "epoch": 0.8483992183977154, "flos": 25081628520960.0, "grad_norm": 1.6772463858609157, "language_loss": 0.76013839, "learning_rate": 2.2255474785052964e-07, "loss": 0.78194523, "num_input_tokens_seen": 304331910, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 14111, "time_per_iteration": 2.7432138919830322 }, { "auxiliary_loss_clip": 0.01102126, "auxiliary_loss_mlp": 0.01028357, "balance_loss_clip": 1.01702583, "balance_loss_mlp": 1.03486872, "epoch": 0.8484593416503833, "flos": 25557445008000.0, "grad_norm": 1.8268539656162497, "language_loss": 0.67437565, "learning_rate": 2.2238156869714952e-07, "loss": 0.6956805, "num_input_tokens_seen": 304351405, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 14112, "time_per_iteration": 4.257203102111816 }, { "auxiliary_loss_clip": 0.01111856, "auxiliary_loss_mlp": 0.01028961, "balance_loss_clip": 1.01725483, "balance_loss_mlp": 1.03306115, "epoch": 0.8485194649030513, "flos": 27198454780800.0, "grad_norm": 1.3397499585545634, "language_loss": 0.73652828, "learning_rate": 2.2220845298239842e-07, "loss": 0.75793636, "num_input_tokens_seen": 304372935, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 14113, "time_per_iteration": 2.6328015327453613 }, { "auxiliary_loss_clip": 0.01128101, "auxiliary_loss_mlp": 0.01034044, "balance_loss_clip": 1.02183676, "balance_loss_mlp": 1.03342652, "epoch": 0.8485795881557192, "flos": 24205910941440.0, "grad_norm": 2.2937533780912656, "language_loss": 0.66671669, "learning_rate": 2.220354007124545e-07, "loss": 0.6883381, "num_input_tokens_seen": 304393070, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 14114, "time_per_iteration": 2.598862886428833 }, { "auxiliary_loss_clip": 0.01106528, "auxiliary_loss_mlp": 0.01030361, "balance_loss_clip": 1.01722956, "balance_loss_mlp": 1.03604257, "epoch": 0.8486397114083872, "flos": 21141653598720.0, "grad_norm": 1.617966905619044, "language_loss": 0.78973806, "learning_rate": 2.2186241189349354e-07, "loss": 0.81110692, "num_input_tokens_seen": 304411195, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 14115, "time_per_iteration": 2.5851948261260986 }, { "auxiliary_loss_clip": 0.01102899, "auxiliary_loss_mlp": 0.01032262, "balance_loss_clip": 1.02006662, "balance_loss_mlp": 1.03377151, "epoch": 0.8486998346610551, "flos": 19974772373760.0, "grad_norm": 2.1016025523591453, "language_loss": 0.79446065, "learning_rate": 2.216894865316894e-07, "loss": 0.81581223, "num_input_tokens_seen": 304429425, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 14116, "time_per_iteration": 2.6208183765411377 }, { "auxiliary_loss_clip": 0.01114636, "auxiliary_loss_mlp": 0.0103135, "balance_loss_clip": 1.01843333, "balance_loss_mlp": 1.03373241, "epoch": 0.8487599579137232, "flos": 19792310261760.0, "grad_norm": 2.2901746645864454, "language_loss": 0.68444002, "learning_rate": 2.215166246332132e-07, "loss": 0.70589989, "num_input_tokens_seen": 304447460, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 14117, "time_per_iteration": 2.519754409790039 }, { "auxiliary_loss_clip": 0.01122296, "auxiliary_loss_mlp": 0.01027516, "balance_loss_clip": 1.01501083, "balance_loss_mlp": 1.03313911, "epoch": 0.8488200811663911, "flos": 26396030903040.0, "grad_norm": 1.965237099288409, "language_loss": 0.65624964, "learning_rate": 2.2134382620423376e-07, "loss": 0.67774785, "num_input_tokens_seen": 304468230, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 14118, "time_per_iteration": 2.6408793926239014 }, { "auxiliary_loss_clip": 0.01127317, "auxiliary_loss_mlp": 0.01030615, "balance_loss_clip": 1.019135, "balance_loss_mlp": 1.03317118, "epoch": 0.8488802044190591, "flos": 16359285939840.0, "grad_norm": 1.6687672232461679, "language_loss": 0.73310912, "learning_rate": 2.211710912509175e-07, "loss": 0.75468844, "num_input_tokens_seen": 304484860, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 14119, "time_per_iteration": 2.538064956665039 }, { "auxiliary_loss_clip": 0.01131223, "auxiliary_loss_mlp": 0.01032119, "balance_loss_clip": 1.01858246, "balance_loss_mlp": 1.03437912, "epoch": 0.848940327671727, "flos": 19208869649280.0, "grad_norm": 1.6683001646351154, "language_loss": 0.77745211, "learning_rate": 2.2099841977943013e-07, "loss": 0.7990855, "num_input_tokens_seen": 304503575, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 14120, "time_per_iteration": 2.6414437294006348 }, { "auxiliary_loss_clip": 0.0111427, "auxiliary_loss_mlp": 0.01032532, "balance_loss_clip": 1.0198245, "balance_loss_mlp": 1.0349803, "epoch": 0.849000450924395, "flos": 21871178824320.0, "grad_norm": 2.151019551187839, "language_loss": 0.75974321, "learning_rate": 2.208258117959323e-07, "loss": 0.78121114, "num_input_tokens_seen": 304525005, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 14121, "time_per_iteration": 2.5603554248809814 }, { "auxiliary_loss_clip": 0.01049284, "auxiliary_loss_mlp": 0.01000437, "balance_loss_clip": 0.99889892, "balance_loss_mlp": 1.00151777, "epoch": 0.8490605741770629, "flos": 71166475624320.0, "grad_norm": 0.8124202226278165, "language_loss": 0.6015141, "learning_rate": 2.2065326730658506e-07, "loss": 0.6220113, "num_input_tokens_seen": 304585220, "router_z_loss_clip": 0.01538086, "router_z_loss_mlp": 0.21289062, "step": 14122, "time_per_iteration": 3.167018413543701 }, { "auxiliary_loss_clip": 0.01112524, "auxiliary_loss_mlp": 0.01029125, "balance_loss_clip": 1.01718628, "balance_loss_mlp": 1.03491354, "epoch": 0.849120697429731, "flos": 24973357950720.0, "grad_norm": 1.7066660313221746, "language_loss": 0.79632628, "learning_rate": 2.2048078631754574e-07, "loss": 0.81774276, "num_input_tokens_seen": 304604665, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 14123, "time_per_iteration": 2.5245773792266846 }, { "auxiliary_loss_clip": 0.01110375, "auxiliary_loss_mlp": 0.01030495, "balance_loss_clip": 1.01810956, "balance_loss_mlp": 1.0352844, "epoch": 0.8491808206823989, "flos": 23032277959680.0, "grad_norm": 2.362247681870599, "language_loss": 0.8298139, "learning_rate": 2.203083688349696e-07, "loss": 0.85122257, "num_input_tokens_seen": 304620600, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6640625, "step": 14124, "time_per_iteration": 2.4943056106567383 }, { "auxiliary_loss_clip": 0.01141579, "auxiliary_loss_mlp": 0.01031138, "balance_loss_clip": 1.01785183, "balance_loss_mlp": 1.0367763, "epoch": 0.8492409439350669, "flos": 23878549365120.0, "grad_norm": 1.8621154915205789, "language_loss": 0.71349365, "learning_rate": 2.2013601486500977e-07, "loss": 0.73522079, "num_input_tokens_seen": 304639540, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 14125, "time_per_iteration": 2.545820713043213 }, { "auxiliary_loss_clip": 0.01107966, "auxiliary_loss_mlp": 0.01036075, "balance_loss_clip": 1.02258682, "balance_loss_mlp": 1.03559017, "epoch": 0.8493010671877349, "flos": 22419893963520.0, "grad_norm": 1.672627883051265, "language_loss": 0.73863852, "learning_rate": 2.1996372441381684e-07, "loss": 0.76007891, "num_input_tokens_seen": 304660595, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 14126, "time_per_iteration": 2.499002456665039 }, { "auxiliary_loss_clip": 0.01136922, "auxiliary_loss_mlp": 0.01030835, "balance_loss_clip": 1.01894391, "balance_loss_mlp": 1.03388882, "epoch": 0.8493611904404028, "flos": 17529435302400.0, "grad_norm": 1.4571204828662738, "language_loss": 0.67901522, "learning_rate": 2.1979149748754077e-07, "loss": 0.70069277, "num_input_tokens_seen": 304679580, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.67578125, "step": 14127, "time_per_iteration": 2.535472869873047 }, { "auxiliary_loss_clip": 0.01132434, "auxiliary_loss_mlp": 0.01275618, "balance_loss_clip": 1.01583433, "balance_loss_mlp": 1.03509676, "epoch": 0.8494213136930708, "flos": 19462937523840.0, "grad_norm": 2.1646588962633673, "language_loss": 0.69447577, "learning_rate": 2.1961933409232603e-07, "loss": 0.71855628, "num_input_tokens_seen": 304698385, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 14128, "time_per_iteration": 2.5305426120758057 }, { "auxiliary_loss_clip": 0.01122183, "auxiliary_loss_mlp": 0.01035633, "balance_loss_clip": 1.02280569, "balance_loss_mlp": 1.03599453, "epoch": 0.8494814369457387, "flos": 18770292587520.0, "grad_norm": 1.7592567060118005, "language_loss": 0.77944803, "learning_rate": 2.1944723423431787e-07, "loss": 0.80102623, "num_input_tokens_seen": 304715430, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 14129, "time_per_iteration": 2.496560573577881 }, { "auxiliary_loss_clip": 0.01148924, "auxiliary_loss_mlp": 0.01034082, "balance_loss_clip": 1.02186298, "balance_loss_mlp": 1.03487539, "epoch": 0.8495415601984068, "flos": 23331486251520.0, "grad_norm": 1.8950980483186124, "language_loss": 0.68023139, "learning_rate": 2.1927519791965788e-07, "loss": 0.70206147, "num_input_tokens_seen": 304734345, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 14130, "time_per_iteration": 2.5766987800598145 }, { "auxiliary_loss_clip": 0.0111282, "auxiliary_loss_mlp": 0.01030286, "balance_loss_clip": 1.01756072, "balance_loss_mlp": 1.03444219, "epoch": 0.8496016834510747, "flos": 26612859352320.0, "grad_norm": 2.045477803611183, "language_loss": 0.71000642, "learning_rate": 2.1910322515448554e-07, "loss": 0.7314375, "num_input_tokens_seen": 304755030, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 14131, "time_per_iteration": 2.5321812629699707 }, { "auxiliary_loss_clip": 0.0112807, "auxiliary_loss_mlp": 0.01028701, "balance_loss_clip": 1.01636863, "balance_loss_mlp": 1.03139997, "epoch": 0.8496618067037427, "flos": 25480380378240.0, "grad_norm": 1.932181625133736, "language_loss": 0.68692386, "learning_rate": 2.189313159449375e-07, "loss": 0.70849162, "num_input_tokens_seen": 304774320, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 14132, "time_per_iteration": 2.6648545265197754 }, { "auxiliary_loss_clip": 0.01099931, "auxiliary_loss_mlp": 0.01036086, "balance_loss_clip": 1.02446342, "balance_loss_mlp": 1.03213954, "epoch": 0.8497219299564106, "flos": 25374587846400.0, "grad_norm": 1.6337692837776225, "language_loss": 0.70306277, "learning_rate": 2.1875947029714982e-07, "loss": 0.72442293, "num_input_tokens_seen": 304795355, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 14133, "time_per_iteration": 2.5113179683685303 }, { "auxiliary_loss_clip": 0.01059013, "auxiliary_loss_mlp": 0.01246049, "balance_loss_clip": 0.99865592, "balance_loss_mlp": 1.00195646, "epoch": 0.8497820532090786, "flos": 67780279658880.0, "grad_norm": 0.730543284885463, "language_loss": 0.57558393, "learning_rate": 2.1858768821725482e-07, "loss": 0.59863454, "num_input_tokens_seen": 304863915, "router_z_loss_clip": 0.01470947, "router_z_loss_mlp": 0.21484375, "step": 14134, "time_per_iteration": 3.245671272277832 }, { "auxiliary_loss_clip": 0.01131066, "auxiliary_loss_mlp": 0.01030229, "balance_loss_clip": 1.01739001, "balance_loss_mlp": 1.03379989, "epoch": 0.8498421764617465, "flos": 23440546920960.0, "grad_norm": 1.9720236387814998, "language_loss": 0.78586888, "learning_rate": 2.1841596971138299e-07, "loss": 0.80748183, "num_input_tokens_seen": 304881555, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 14135, "time_per_iteration": 2.577579975128174 }, { "auxiliary_loss_clip": 0.01115328, "auxiliary_loss_mlp": 0.01030931, "balance_loss_clip": 1.01811028, "balance_loss_mlp": 1.03497386, "epoch": 0.8499022997144146, "flos": 17712615686400.0, "grad_norm": 1.9008975054269757, "language_loss": 0.63705605, "learning_rate": 2.1824431478566184e-07, "loss": 0.65851867, "num_input_tokens_seen": 304898760, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 14136, "time_per_iteration": 2.5886173248291016 }, { "auxiliary_loss_clip": 0.01115879, "auxiliary_loss_mlp": 0.01033908, "balance_loss_clip": 1.02009141, "balance_loss_mlp": 1.03576708, "epoch": 0.8499624229670825, "flos": 18588512833920.0, "grad_norm": 2.293401158741901, "language_loss": 0.83973175, "learning_rate": 2.1807272344621875e-07, "loss": 0.86122966, "num_input_tokens_seen": 304915465, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7109375, "step": 14137, "time_per_iteration": 2.47161602973938 }, { "auxiliary_loss_clip": 0.0112754, "auxiliary_loss_mlp": 0.01027888, "balance_loss_clip": 1.01682496, "balance_loss_mlp": 1.03407431, "epoch": 0.8500225462197505, "flos": 24345854328960.0, "grad_norm": 2.2105572459286176, "language_loss": 0.78555965, "learning_rate": 2.1790119569917565e-07, "loss": 0.80711389, "num_input_tokens_seen": 304933190, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6640625, "step": 14138, "time_per_iteration": 3.958698272705078 }, { "auxiliary_loss_clip": 0.01113218, "auxiliary_loss_mlp": 0.01029834, "balance_loss_clip": 1.01783562, "balance_loss_mlp": 1.03549945, "epoch": 0.8500826694724185, "flos": 16545518979840.0, "grad_norm": 1.8192027292473323, "language_loss": 0.64533508, "learning_rate": 2.17729731550655e-07, "loss": 0.66676557, "num_input_tokens_seen": 304951110, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 14139, "time_per_iteration": 2.4940173625946045 }, { "auxiliary_loss_clip": 0.01128192, "auxiliary_loss_mlp": 0.01032193, "balance_loss_clip": 1.02031374, "balance_loss_mlp": 1.03340626, "epoch": 0.8501427927250864, "flos": 16289404030080.0, "grad_norm": 1.5846139652401021, "language_loss": 0.70690435, "learning_rate": 2.1755833100677635e-07, "loss": 0.72850823, "num_input_tokens_seen": 304969095, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.67578125, "step": 14140, "time_per_iteration": 2.4841654300689697 }, { "auxiliary_loss_clip": 0.01133544, "auxiliary_loss_mlp": 0.01032084, "balance_loss_clip": 1.01943564, "balance_loss_mlp": 1.03468418, "epoch": 0.8502029159777544, "flos": 26687912820480.0, "grad_norm": 2.0442180864587636, "language_loss": 0.64215684, "learning_rate": 2.17386994073655e-07, "loss": 0.66381311, "num_input_tokens_seen": 304989315, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 14141, "time_per_iteration": 2.5532548427581787 }, { "auxiliary_loss_clip": 0.01131309, "auxiliary_loss_mlp": 0.01034265, "balance_loss_clip": 1.02159309, "balance_loss_mlp": 1.03556061, "epoch": 0.8502630392304223, "flos": 35590778179200.0, "grad_norm": 1.5182632006365755, "language_loss": 0.70901233, "learning_rate": 2.172157207574068e-07, "loss": 0.73066807, "num_input_tokens_seen": 305011020, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 14142, "time_per_iteration": 2.6642234325408936 }, { "auxiliary_loss_clip": 0.01138867, "auxiliary_loss_mlp": 0.01028153, "balance_loss_clip": 1.01628006, "balance_loss_mlp": 1.03377295, "epoch": 0.8503231624830904, "flos": 21649466125440.0, "grad_norm": 1.484485544592968, "language_loss": 0.65363276, "learning_rate": 2.1704451106414323e-07, "loss": 0.67530298, "num_input_tokens_seen": 305033550, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 14143, "time_per_iteration": 2.603776454925537 }, { "auxiliary_loss_clip": 0.01133133, "auxiliary_loss_mlp": 0.01279157, "balance_loss_clip": 1.01895559, "balance_loss_mlp": 1.03398645, "epoch": 0.8503832857357583, "flos": 22417451838720.0, "grad_norm": 2.10130795131138, "language_loss": 0.67784393, "learning_rate": 2.1687336499997566e-07, "loss": 0.70196688, "num_input_tokens_seen": 305052885, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 14144, "time_per_iteration": 3.9976370334625244 }, { "auxiliary_loss_clip": 0.01144516, "auxiliary_loss_mlp": 0.01029809, "balance_loss_clip": 1.01726246, "balance_loss_mlp": 1.03343725, "epoch": 0.8504434089884263, "flos": 18697968552960.0, "grad_norm": 2.060780236179624, "language_loss": 0.64926815, "learning_rate": 2.1670228257101008e-07, "loss": 0.67101139, "num_input_tokens_seen": 305071995, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.66796875, "step": 14145, "time_per_iteration": 2.5410501956939697 }, { "auxiliary_loss_clip": 0.01120663, "auxiliary_loss_mlp": 0.01034077, "balance_loss_clip": 1.02160764, "balance_loss_mlp": 1.03421545, "epoch": 0.8505035322410942, "flos": 20007989475840.0, "grad_norm": 1.592169158346823, "language_loss": 0.86037093, "learning_rate": 2.165312637833532e-07, "loss": 0.88191831, "num_input_tokens_seen": 305090190, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 14146, "time_per_iteration": 2.6265251636505127 }, { "auxiliary_loss_clip": 0.0110049, "auxiliary_loss_mlp": 0.01028371, "balance_loss_clip": 1.01624727, "balance_loss_mlp": 1.03227246, "epoch": 0.8505636554937622, "flos": 20812173120000.0, "grad_norm": 1.6827934106540492, "language_loss": 0.83759397, "learning_rate": 2.1636030864310783e-07, "loss": 0.85888261, "num_input_tokens_seen": 305109355, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 14147, "time_per_iteration": 2.52079701423645 }, { "auxiliary_loss_clip": 0.01115175, "auxiliary_loss_mlp": 0.01028679, "balance_loss_clip": 1.01581621, "balance_loss_mlp": 1.03404164, "epoch": 0.8506237787464301, "flos": 14174445277440.0, "grad_norm": 3.023209296006245, "language_loss": 0.85701597, "learning_rate": 2.1618941715637518e-07, "loss": 0.87845445, "num_input_tokens_seen": 305124165, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 14148, "time_per_iteration": 2.5511276721954346 }, { "auxiliary_loss_clip": 0.01120188, "auxiliary_loss_mlp": 0.01030578, "balance_loss_clip": 1.01788783, "balance_loss_mlp": 1.03446722, "epoch": 0.8506839019990982, "flos": 23258372117760.0, "grad_norm": 1.549205134339711, "language_loss": 0.71860951, "learning_rate": 2.160185893292532e-07, "loss": 0.74011719, "num_input_tokens_seen": 305143940, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 14149, "time_per_iteration": 2.6031572818756104 }, { "auxiliary_loss_clip": 0.0112202, "auxiliary_loss_mlp": 0.01027616, "balance_loss_clip": 1.01520658, "balance_loss_mlp": 1.03544283, "epoch": 0.8507440252517661, "flos": 23659206963840.0, "grad_norm": 1.908939219428779, "language_loss": 0.75741208, "learning_rate": 2.1584782516783906e-07, "loss": 0.77890849, "num_input_tokens_seen": 305163505, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 14150, "time_per_iteration": 2.5879948139190674 }, { "auxiliary_loss_clip": 0.0113007, "auxiliary_loss_mlp": 0.01032294, "balance_loss_clip": 1.02041459, "balance_loss_mlp": 1.03625715, "epoch": 0.8508041485044341, "flos": 18661339658880.0, "grad_norm": 1.6881169594906948, "language_loss": 0.72494578, "learning_rate": 2.1567712467822696e-07, "loss": 0.7465694, "num_input_tokens_seen": 305182325, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.66796875, "step": 14151, "time_per_iteration": 4.348740100860596 }, { "auxiliary_loss_clip": 0.01068297, "auxiliary_loss_mlp": 0.00999181, "balance_loss_clip": 0.99776286, "balance_loss_mlp": 1.00277972, "epoch": 0.8508642717571021, "flos": 52510918055040.0, "grad_norm": 0.9511866026598291, "language_loss": 0.59675837, "learning_rate": 2.1550648786650826e-07, "loss": 0.61743319, "num_input_tokens_seen": 305230775, "router_z_loss_clip": 0.01416016, "router_z_loss_mlp": 0.21289062, "step": 14152, "time_per_iteration": 2.9361259937286377 }, { "auxiliary_loss_clip": 0.0104113, "auxiliary_loss_mlp": 0.01000334, "balance_loss_clip": 0.9989754, "balance_loss_mlp": 1.00222087, "epoch": 0.85092439500977, "flos": 69297145050240.0, "grad_norm": 0.6872460170062601, "language_loss": 0.61190975, "learning_rate": 2.1533591473877234e-07, "loss": 0.6323244, "num_input_tokens_seen": 305296000, "router_z_loss_clip": 0.01361084, "router_z_loss_mlp": 0.21484375, "step": 14153, "time_per_iteration": 4.909361839294434 }, { "auxiliary_loss_clip": 0.01112463, "auxiliary_loss_mlp": 0.01029642, "balance_loss_clip": 1.01695168, "balance_loss_mlp": 1.03525078, "epoch": 0.850984518262438, "flos": 24389737770240.0, "grad_norm": 1.5940405044132624, "language_loss": 0.80590796, "learning_rate": 2.1516540530110782e-07, "loss": 0.82732904, "num_input_tokens_seen": 305314705, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 14154, "time_per_iteration": 2.68501615524292 }, { "auxiliary_loss_clip": 0.01122837, "auxiliary_loss_mlp": 0.01032904, "balance_loss_clip": 1.02158451, "balance_loss_mlp": 1.03153157, "epoch": 0.851044641515106, "flos": 23294821443840.0, "grad_norm": 1.693783740297139, "language_loss": 0.79681367, "learning_rate": 2.1499495955959834e-07, "loss": 0.81837106, "num_input_tokens_seen": 305333870, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.65234375, "step": 14155, "time_per_iteration": 2.583726406097412 }, { "auxiliary_loss_clip": 0.01130559, "auxiliary_loss_mlp": 0.01029665, "balance_loss_clip": 1.01657557, "balance_loss_mlp": 1.03270769, "epoch": 0.851104764767774, "flos": 22050085489920.0, "grad_norm": 1.868730714182194, "language_loss": 0.70479351, "learning_rate": 2.148245775203268e-07, "loss": 0.72639573, "num_input_tokens_seen": 305352780, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 14156, "time_per_iteration": 2.508704900741577 }, { "auxiliary_loss_clip": 0.01126592, "auxiliary_loss_mlp": 0.01030288, "balance_loss_clip": 1.01818871, "balance_loss_mlp": 1.03369117, "epoch": 0.8511648880204419, "flos": 20704728562560.0, "grad_norm": 3.979877916087967, "language_loss": 0.73496157, "learning_rate": 2.1465425918937497e-07, "loss": 0.7565304, "num_input_tokens_seen": 305371370, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6640625, "step": 14157, "time_per_iteration": 2.4969258308410645 }, { "auxiliary_loss_clip": 0.01152968, "auxiliary_loss_mlp": 0.01025376, "balance_loss_clip": 1.01398599, "balance_loss_mlp": 1.03407812, "epoch": 0.8512250112731099, "flos": 24024669891840.0, "grad_norm": 2.0421699488215395, "language_loss": 0.79023194, "learning_rate": 2.1448400457281934e-07, "loss": 0.81201541, "num_input_tokens_seen": 305387955, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.65625, "step": 14158, "time_per_iteration": 2.6357741355895996 }, { "auxiliary_loss_clip": 0.01113174, "auxiliary_loss_mlp": 0.01033207, "balance_loss_clip": 1.02043366, "balance_loss_mlp": 1.03504682, "epoch": 0.8512851345257778, "flos": 22015467757440.0, "grad_norm": 2.0546453986088387, "language_loss": 0.79637337, "learning_rate": 2.1431381367673706e-07, "loss": 0.81783724, "num_input_tokens_seen": 305406285, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 14159, "time_per_iteration": 2.5490453243255615 }, { "auxiliary_loss_clip": 0.01123608, "auxiliary_loss_mlp": 0.01033412, "balance_loss_clip": 1.02113962, "balance_loss_mlp": 1.03643799, "epoch": 0.8513452577784458, "flos": 14830209924480.0, "grad_norm": 1.6245396077158438, "language_loss": 0.71111894, "learning_rate": 2.1414368650720104e-07, "loss": 0.73268914, "num_input_tokens_seen": 305424500, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 14160, "time_per_iteration": 2.5047755241394043 }, { "auxiliary_loss_clip": 0.0111876, "auxiliary_loss_mlp": 0.01032417, "balance_loss_clip": 1.018821, "balance_loss_mlp": 1.03658962, "epoch": 0.8514053810311137, "flos": 33035662166400.0, "grad_norm": 1.9787361352436326, "language_loss": 0.70066845, "learning_rate": 2.1397362307028377e-07, "loss": 0.72218013, "num_input_tokens_seen": 305442990, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 14161, "time_per_iteration": 2.6062891483306885 }, { "auxiliary_loss_clip": 0.01112693, "auxiliary_loss_mlp": 0.01033193, "balance_loss_clip": 1.02111697, "balance_loss_mlp": 1.03217316, "epoch": 0.8514655042837818, "flos": 27564456412800.0, "grad_norm": 2.2081557349547474, "language_loss": 0.78347468, "learning_rate": 2.1380362337205304e-07, "loss": 0.80493355, "num_input_tokens_seen": 305463065, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.71875, "step": 14162, "time_per_iteration": 2.6091196537017822 }, { "auxiliary_loss_clip": 0.01113563, "auxiliary_loss_mlp": 0.01034477, "balance_loss_clip": 1.02184653, "balance_loss_mlp": 1.035465, "epoch": 0.8515256275364497, "flos": 35556052705920.0, "grad_norm": 1.9745774638168425, "language_loss": 0.76809496, "learning_rate": 2.1363368741857668e-07, "loss": 0.78957534, "num_input_tokens_seen": 305489070, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 14163, "time_per_iteration": 2.662126064300537 }, { "auxiliary_loss_clip": 0.01040486, "auxiliary_loss_mlp": 0.01000013, "balance_loss_clip": 0.99867171, "balance_loss_mlp": 1.00204408, "epoch": 0.8515857507891177, "flos": 70207372621440.0, "grad_norm": 0.6997987812851484, "language_loss": 0.55057621, "learning_rate": 2.1346381521591894e-07, "loss": 0.57098126, "num_input_tokens_seen": 305551490, "router_z_loss_clip": 0.01342773, "router_z_loss_mlp": 0.21289062, "step": 14164, "time_per_iteration": 3.213981866836548 }, { "auxiliary_loss_clip": 0.01112265, "auxiliary_loss_mlp": 0.01033321, "balance_loss_clip": 1.02120924, "balance_loss_mlp": 1.03440261, "epoch": 0.8516458740417857, "flos": 22675290641280.0, "grad_norm": 1.9984019117009375, "language_loss": 0.72429228, "learning_rate": 2.1329400677014207e-07, "loss": 0.74574816, "num_input_tokens_seen": 305570535, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 14165, "time_per_iteration": 2.6258389949798584 }, { "auxiliary_loss_clip": 0.01127165, "auxiliary_loss_mlp": 0.01032221, "balance_loss_clip": 1.01999032, "balance_loss_mlp": 1.03351808, "epoch": 0.8517059972944536, "flos": 22747435107840.0, "grad_norm": 1.677200874729296, "language_loss": 0.67615724, "learning_rate": 2.1312426208730572e-07, "loss": 0.6977511, "num_input_tokens_seen": 305590800, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 14166, "time_per_iteration": 2.60670804977417 }, { "auxiliary_loss_clip": 0.01120854, "auxiliary_loss_mlp": 0.01034824, "balance_loss_clip": 1.02295709, "balance_loss_mlp": 1.03394032, "epoch": 0.8517661205471216, "flos": 21689147675520.0, "grad_norm": 4.269928682259888, "language_loss": 0.73611581, "learning_rate": 2.1295458117346854e-07, "loss": 0.75767261, "num_input_tokens_seen": 305609495, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69140625, "step": 14167, "time_per_iteration": 2.592885971069336 }, { "auxiliary_loss_clip": 0.01103055, "auxiliary_loss_mlp": 0.01030249, "balance_loss_clip": 1.01844764, "balance_loss_mlp": 1.0340414, "epoch": 0.8518262437997896, "flos": 25374839241600.0, "grad_norm": 1.8279968386683156, "language_loss": 0.80000567, "learning_rate": 2.1278496403468548e-07, "loss": 0.82133877, "num_input_tokens_seen": 305629420, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69140625, "step": 14168, "time_per_iteration": 2.577014207839966 }, { "auxiliary_loss_clip": 0.01101977, "auxiliary_loss_mlp": 0.01029437, "balance_loss_clip": 1.01749218, "balance_loss_mlp": 1.03315783, "epoch": 0.8518863670524576, "flos": 27235406897280.0, "grad_norm": 1.7557386057853102, "language_loss": 0.76549828, "learning_rate": 2.1261541067700994e-07, "loss": 0.78681242, "num_input_tokens_seen": 305649835, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 14169, "time_per_iteration": 2.6048412322998047 }, { "auxiliary_loss_clip": 0.01106708, "auxiliary_loss_mlp": 0.01029592, "balance_loss_clip": 1.01696181, "balance_loss_mlp": 1.03538108, "epoch": 0.8519464903051255, "flos": 20959514709120.0, "grad_norm": 1.6527925926685172, "language_loss": 0.63573867, "learning_rate": 2.124459211064926e-07, "loss": 0.65710169, "num_input_tokens_seen": 305668840, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 14170, "time_per_iteration": 2.5752670764923096 }, { "auxiliary_loss_clip": 0.01109849, "auxiliary_loss_mlp": 0.0102645, "balance_loss_clip": 1.01452327, "balance_loss_mlp": 1.03180146, "epoch": 0.8520066135577935, "flos": 18441745862400.0, "grad_norm": 2.3522608080191105, "language_loss": 0.86797941, "learning_rate": 2.1227649532918203e-07, "loss": 0.88934237, "num_input_tokens_seen": 305686955, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 14171, "time_per_iteration": 2.531853675842285 }, { "auxiliary_loss_clip": 0.01114635, "auxiliary_loss_mlp": 0.01035261, "balance_loss_clip": 1.02330387, "balance_loss_mlp": 1.03599918, "epoch": 0.8520667368104614, "flos": 29130233149440.0, "grad_norm": 2.711519122982577, "language_loss": 0.54528171, "learning_rate": 2.1210713335112505e-07, "loss": 0.56678063, "num_input_tokens_seen": 305706290, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 14172, "time_per_iteration": 2.5953264236450195 }, { "auxiliary_loss_clip": 0.01118584, "auxiliary_loss_mlp": 0.01028905, "balance_loss_clip": 1.01846218, "balance_loss_mlp": 1.03448319, "epoch": 0.8521268600631294, "flos": 16034366488320.0, "grad_norm": 2.0086738353030484, "language_loss": 0.696823, "learning_rate": 2.1193783517836495e-07, "loss": 0.7182979, "num_input_tokens_seen": 305723835, "router_z_loss_clip": 0.10449219, "router_z_loss_mlp": 0.6640625, "step": 14173, "time_per_iteration": 2.53236985206604 }, { "auxiliary_loss_clip": 0.0112322, "auxiliary_loss_mlp": 0.01274815, "balance_loss_clip": 1.01462865, "balance_loss_mlp": 1.0349555, "epoch": 0.8521869833157973, "flos": 22454870832000.0, "grad_norm": 2.2623715906636894, "language_loss": 0.76760161, "learning_rate": 2.117686008169448e-07, "loss": 0.79158193, "num_input_tokens_seen": 305741655, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 14174, "time_per_iteration": 2.533156394958496 }, { "auxiliary_loss_clip": 0.01131389, "auxiliary_loss_mlp": 0.01276085, "balance_loss_clip": 1.01742423, "balance_loss_mlp": 1.03514528, "epoch": 0.8522471065684654, "flos": 24972029147520.0, "grad_norm": 1.8204363022403267, "language_loss": 0.8206802, "learning_rate": 2.1159943027290273e-07, "loss": 0.84475493, "num_input_tokens_seen": 305761890, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 14175, "time_per_iteration": 2.5917856693267822 }, { "auxiliary_loss_clip": 0.01106884, "auxiliary_loss_mlp": 0.01030186, "balance_loss_clip": 1.01746047, "balance_loss_mlp": 1.03600907, "epoch": 0.8523072298211333, "flos": 17710604524800.0, "grad_norm": 2.287108138627002, "language_loss": 0.65768504, "learning_rate": 2.1143032355227696e-07, "loss": 0.67905569, "num_input_tokens_seen": 305779190, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 14176, "time_per_iteration": 2.4697341918945312 }, { "auxiliary_loss_clip": 0.01120613, "auxiliary_loss_mlp": 0.01277519, "balance_loss_clip": 1.01766372, "balance_loss_mlp": 1.03525877, "epoch": 0.8523673530738013, "flos": 25446193608960.0, "grad_norm": 1.6783000968762718, "language_loss": 0.78500807, "learning_rate": 2.1126128066110182e-07, "loss": 0.80898941, "num_input_tokens_seen": 305799870, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.67578125, "step": 14177, "time_per_iteration": 2.6297757625579834 }, { "auxiliary_loss_clip": 0.01121645, "auxiliary_loss_mlp": 0.01031275, "balance_loss_clip": 1.01836455, "balance_loss_mlp": 1.03502226, "epoch": 0.8524274763264693, "flos": 31429593348480.0, "grad_norm": 1.5797501432102914, "language_loss": 0.73447382, "learning_rate": 2.1109230160541114e-07, "loss": 0.75600302, "num_input_tokens_seen": 305819695, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 14178, "time_per_iteration": 2.6001014709472656 }, { "auxiliary_loss_clip": 0.01131624, "auxiliary_loss_mlp": 0.01036348, "balance_loss_clip": 1.02389693, "balance_loss_mlp": 1.03512907, "epoch": 0.8524875995791372, "flos": 17712651600000.0, "grad_norm": 1.5708287793338758, "language_loss": 0.74521428, "learning_rate": 2.1092338639123386e-07, "loss": 0.76689398, "num_input_tokens_seen": 305837270, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 14179, "time_per_iteration": 2.548654317855835 }, { "auxiliary_loss_clip": 0.01100978, "auxiliary_loss_mlp": 0.01027517, "balance_loss_clip": 1.01563787, "balance_loss_mlp": 1.03389525, "epoch": 0.8525477228318052, "flos": 21687316081920.0, "grad_norm": 2.744918538937984, "language_loss": 0.80996776, "learning_rate": 2.107545350245994e-07, "loss": 0.83125269, "num_input_tokens_seen": 305855250, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 14180, "time_per_iteration": 3.921265125274658 }, { "auxiliary_loss_clip": 0.01139279, "auxiliary_loss_mlp": 0.01032701, "balance_loss_clip": 1.02079797, "balance_loss_mlp": 1.03352356, "epoch": 0.8526078460844732, "flos": 24827057856000.0, "grad_norm": 2.012063979066467, "language_loss": 0.60989642, "learning_rate": 2.105857475115329e-07, "loss": 0.63161618, "num_input_tokens_seen": 305875660, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 14181, "time_per_iteration": 2.6336750984191895 }, { "auxiliary_loss_clip": 0.01101601, "auxiliary_loss_mlp": 0.01032365, "balance_loss_clip": 1.02121878, "balance_loss_mlp": 1.03477573, "epoch": 0.8526679693371412, "flos": 13516418073600.0, "grad_norm": 2.299615084315698, "language_loss": 0.72375256, "learning_rate": 2.1041702385805827e-07, "loss": 0.74509227, "num_input_tokens_seen": 305892415, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.66796875, "step": 14182, "time_per_iteration": 2.4962689876556396 }, { "auxiliary_loss_clip": 0.01139009, "auxiliary_loss_mlp": 0.01032706, "balance_loss_clip": 1.02051127, "balance_loss_mlp": 1.03597891, "epoch": 0.8527280925898091, "flos": 23514092017920.0, "grad_norm": 1.4677761846803434, "language_loss": 0.70507145, "learning_rate": 2.102483640701962e-07, "loss": 0.72678858, "num_input_tokens_seen": 305912665, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 14183, "time_per_iteration": 2.61128568649292 }, { "auxiliary_loss_clip": 0.01121179, "auxiliary_loss_mlp": 0.01031112, "balance_loss_clip": 1.01958466, "balance_loss_mlp": 1.03698158, "epoch": 0.8527882158424771, "flos": 20303031790080.0, "grad_norm": 1.8020219951746752, "language_loss": 0.72784936, "learning_rate": 2.10079768153967e-07, "loss": 0.7493723, "num_input_tokens_seen": 305931515, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 14184, "time_per_iteration": 2.6021101474761963 }, { "auxiliary_loss_clip": 0.01103955, "auxiliary_loss_mlp": 0.01031217, "balance_loss_clip": 1.01894414, "balance_loss_mlp": 1.03532624, "epoch": 0.852848339095145, "flos": 17202504689280.0, "grad_norm": 1.5696010937644527, "language_loss": 0.76869428, "learning_rate": 2.099112361153863e-07, "loss": 0.79004598, "num_input_tokens_seen": 305949965, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 14185, "time_per_iteration": 2.53246808052063 }, { "auxiliary_loss_clip": 0.01130401, "auxiliary_loss_mlp": 0.01027622, "balance_loss_clip": 1.01484239, "balance_loss_mlp": 1.03385854, "epoch": 0.852908462347813, "flos": 18368990864640.0, "grad_norm": 1.9335861164617303, "language_loss": 0.80276394, "learning_rate": 2.0974276796046863e-07, "loss": 0.82434416, "num_input_tokens_seen": 305967820, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 14186, "time_per_iteration": 3.96358323097229 }, { "auxiliary_loss_clip": 0.01119608, "auxiliary_loss_mlp": 0.01030241, "balance_loss_clip": 1.01687133, "balance_loss_mlp": 1.03261256, "epoch": 0.8529685856004809, "flos": 18624890332800.0, "grad_norm": 1.7647081388670163, "language_loss": 0.62606448, "learning_rate": 2.0957436369522674e-07, "loss": 0.64756298, "num_input_tokens_seen": 305985505, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 14187, "time_per_iteration": 2.5459463596343994 }, { "auxiliary_loss_clip": 0.01142229, "auxiliary_loss_mlp": 0.01030148, "balance_loss_clip": 1.01679683, "balance_loss_mlp": 1.03301775, "epoch": 0.853028708853149, "flos": 29607665748480.0, "grad_norm": 1.5735307648069807, "language_loss": 0.76837289, "learning_rate": 2.0940602332567026e-07, "loss": 0.79009664, "num_input_tokens_seen": 306005220, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 14188, "time_per_iteration": 2.703456163406372 }, { "auxiliary_loss_clip": 0.01114656, "auxiliary_loss_mlp": 0.01031382, "balance_loss_clip": 1.01777434, "balance_loss_mlp": 1.0349158, "epoch": 0.8530888321058169, "flos": 26353153042560.0, "grad_norm": 1.8007882728745015, "language_loss": 0.78369504, "learning_rate": 2.0923774685780704e-07, "loss": 0.8051554, "num_input_tokens_seen": 306023785, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 14189, "time_per_iteration": 2.551266670227051 }, { "auxiliary_loss_clip": 0.01122026, "auxiliary_loss_mlp": 0.01032043, "balance_loss_clip": 1.01947272, "balance_loss_mlp": 1.03438783, "epoch": 0.8531489553584849, "flos": 20521979141760.0, "grad_norm": 1.8264883995362327, "language_loss": 0.79268408, "learning_rate": 2.0906953429764162e-07, "loss": 0.81422478, "num_input_tokens_seen": 306041600, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 14190, "time_per_iteration": 2.5758914947509766 }, { "auxiliary_loss_clip": 0.01140355, "auxiliary_loss_mlp": 0.01029826, "balance_loss_clip": 1.01714206, "balance_loss_mlp": 1.03618348, "epoch": 0.8532090786111529, "flos": 20704297599360.0, "grad_norm": 1.8735494332321945, "language_loss": 0.76243663, "learning_rate": 2.0890138565117875e-07, "loss": 0.78413844, "num_input_tokens_seen": 306060345, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 14191, "time_per_iteration": 2.59081768989563 }, { "auxiliary_loss_clip": 0.01112784, "auxiliary_loss_mlp": 0.0103098, "balance_loss_clip": 1.0182128, "balance_loss_mlp": 1.03535151, "epoch": 0.8532692018638208, "flos": 19828903242240.0, "grad_norm": 1.8138070158880015, "language_loss": 0.68768013, "learning_rate": 2.087333009244172e-07, "loss": 0.70911771, "num_input_tokens_seen": 306078285, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 14192, "time_per_iteration": 2.4934680461883545 }, { "auxiliary_loss_clip": 0.01117671, "auxiliary_loss_mlp": 0.0103799, "balance_loss_clip": 1.02353632, "balance_loss_mlp": 1.034163, "epoch": 0.8533293251164888, "flos": 20996790048000.0, "grad_norm": 2.687061966721417, "language_loss": 0.63029373, "learning_rate": 2.0856528012335685e-07, "loss": 0.6518504, "num_input_tokens_seen": 306093760, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.74609375, "step": 14193, "time_per_iteration": 3.9863224029541016 }, { "auxiliary_loss_clip": 0.01121559, "auxiliary_loss_mlp": 0.01029479, "balance_loss_clip": 1.0173614, "balance_loss_mlp": 1.03459787, "epoch": 0.8533894483691568, "flos": 22419606654720.0, "grad_norm": 1.7712771576211768, "language_loss": 0.76491749, "learning_rate": 2.0839732325399306e-07, "loss": 0.7864278, "num_input_tokens_seen": 306112595, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 14194, "time_per_iteration": 2.5375375747680664 }, { "auxiliary_loss_clip": 0.01102968, "auxiliary_loss_mlp": 0.01028614, "balance_loss_clip": 1.01696134, "balance_loss_mlp": 1.03351855, "epoch": 0.8534495716218248, "flos": 21616536332160.0, "grad_norm": 1.9933778302876788, "language_loss": 0.69488657, "learning_rate": 2.0822943032232132e-07, "loss": 0.71620238, "num_input_tokens_seen": 306131800, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6953125, "step": 14195, "time_per_iteration": 4.0070717334747314 }, { "auxiliary_loss_clip": 0.01121887, "auxiliary_loss_mlp": 0.01032105, "balance_loss_clip": 1.01954031, "balance_loss_mlp": 1.0331291, "epoch": 0.8535096948744927, "flos": 22346277039360.0, "grad_norm": 2.1153506074831894, "language_loss": 0.85830343, "learning_rate": 2.0806160133433127e-07, "loss": 0.87984341, "num_input_tokens_seen": 306150590, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 14196, "time_per_iteration": 2.5743513107299805 }, { "auxiliary_loss_clip": 0.0104154, "auxiliary_loss_mlp": 0.01003769, "balance_loss_clip": 1.00229108, "balance_loss_mlp": 1.00252497, "epoch": 0.8535698181271607, "flos": 52762507891200.0, "grad_norm": 0.7554706579701311, "language_loss": 0.5511772, "learning_rate": 2.0789383629601365e-07, "loss": 0.5716303, "num_input_tokens_seen": 306205850, "router_z_loss_clip": 0.01477051, "router_z_loss_mlp": 0.21484375, "step": 14197, "time_per_iteration": 3.1474571228027344 }, { "auxiliary_loss_clip": 0.01126778, "auxiliary_loss_mlp": 0.01278603, "balance_loss_clip": 1.02007079, "balance_loss_mlp": 1.03339064, "epoch": 0.8536299413798286, "flos": 19062892776960.0, "grad_norm": 1.5222089911442298, "language_loss": 0.81445467, "learning_rate": 2.0772613521335503e-07, "loss": 0.83850849, "num_input_tokens_seen": 306225220, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 14198, "time_per_iteration": 2.5842392444610596 }, { "auxiliary_loss_clip": 0.01102868, "auxiliary_loss_mlp": 0.01030272, "balance_loss_clip": 1.01901293, "balance_loss_mlp": 1.0346415, "epoch": 0.8536900646324966, "flos": 49344743871360.0, "grad_norm": 1.4516730986657094, "language_loss": 0.6846056, "learning_rate": 2.075584980923406e-07, "loss": 0.70593703, "num_input_tokens_seen": 306249865, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6796875, "step": 14199, "time_per_iteration": 2.760650873184204 }, { "auxiliary_loss_clip": 0.01123803, "auxiliary_loss_mlp": 0.01028833, "balance_loss_clip": 1.01737714, "balance_loss_mlp": 1.0319494, "epoch": 0.8537501878851645, "flos": 21762333636480.0, "grad_norm": 1.6155876736142656, "language_loss": 0.8641991, "learning_rate": 2.0739092493895248e-07, "loss": 0.8857255, "num_input_tokens_seen": 306270215, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.65234375, "step": 14200, "time_per_iteration": 2.784663200378418 }, { "auxiliary_loss_clip": 0.01128744, "auxiliary_loss_mlp": 0.01027565, "balance_loss_clip": 1.01633501, "balance_loss_mlp": 1.03309906, "epoch": 0.8538103111378326, "flos": 22269176496000.0, "grad_norm": 1.6841198667093826, "language_loss": 0.77771783, "learning_rate": 2.0722341575917125e-07, "loss": 0.79928094, "num_input_tokens_seen": 306288960, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.69140625, "step": 14201, "time_per_iteration": 2.5967400074005127 }, { "auxiliary_loss_clip": 0.01119304, "auxiliary_loss_mlp": 0.01027776, "balance_loss_clip": 1.01622486, "balance_loss_mlp": 1.03405714, "epoch": 0.8538704343905005, "flos": 20303929630080.0, "grad_norm": 2.0874964474500235, "language_loss": 0.68503767, "learning_rate": 2.0705597055897472e-07, "loss": 0.70650852, "num_input_tokens_seen": 306308735, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 14202, "time_per_iteration": 2.6103413105010986 }, { "auxiliary_loss_clip": 0.01112198, "auxiliary_loss_mlp": 0.01034617, "balance_loss_clip": 1.02242804, "balance_loss_mlp": 1.03417945, "epoch": 0.8539305576431685, "flos": 24755164784640.0, "grad_norm": 1.806897540537672, "language_loss": 0.80314618, "learning_rate": 2.0688858934433794e-07, "loss": 0.82461435, "num_input_tokens_seen": 306329015, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 14203, "time_per_iteration": 2.9717884063720703 }, { "auxiliary_loss_clip": 0.01124178, "auxiliary_loss_mlp": 0.01033505, "balance_loss_clip": 1.0206182, "balance_loss_mlp": 1.03539681, "epoch": 0.8539906808958365, "flos": 26687625511680.0, "grad_norm": 1.8628405232824803, "language_loss": 0.65853035, "learning_rate": 2.067212721212357e-07, "loss": 0.68010712, "num_input_tokens_seen": 306349085, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 14204, "time_per_iteration": 3.4450652599334717 }, { "auxiliary_loss_clip": 0.0110543, "auxiliary_loss_mlp": 0.01031848, "balance_loss_clip": 1.01883686, "balance_loss_mlp": 1.0346396, "epoch": 0.8540508041485044, "flos": 13365521038080.0, "grad_norm": 2.679459994490425, "language_loss": 0.60094386, "learning_rate": 2.065540188956383e-07, "loss": 0.62231666, "num_input_tokens_seen": 306365385, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 14205, "time_per_iteration": 3.330199956893921 }, { "auxiliary_loss_clip": 0.01119539, "auxiliary_loss_mlp": 0.01026895, "balance_loss_clip": 1.01464641, "balance_loss_mlp": 1.03395128, "epoch": 0.8541109274011724, "flos": 32780876019840.0, "grad_norm": 2.028394857314235, "language_loss": 0.72473007, "learning_rate": 2.0638682967351472e-07, "loss": 0.74619442, "num_input_tokens_seen": 306384585, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 14206, "time_per_iteration": 3.3548195362091064 }, { "auxiliary_loss_clip": 0.01113767, "auxiliary_loss_mlp": 0.0102677, "balance_loss_clip": 1.01455128, "balance_loss_mlp": 1.03495431, "epoch": 0.8541710506538404, "flos": 17639286071040.0, "grad_norm": 2.5385107784906786, "language_loss": 0.76444364, "learning_rate": 2.0621970446083094e-07, "loss": 0.78584898, "num_input_tokens_seen": 306401565, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 14207, "time_per_iteration": 2.7592384815216064 }, { "auxiliary_loss_clip": 0.01105604, "auxiliary_loss_mlp": 0.01029917, "balance_loss_clip": 1.01658344, "balance_loss_mlp": 1.03525305, "epoch": 0.8542311739065084, "flos": 20263062931200.0, "grad_norm": 1.709910367238343, "language_loss": 0.85108387, "learning_rate": 2.0605264326355253e-07, "loss": 0.87243903, "num_input_tokens_seen": 306419995, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 14208, "time_per_iteration": 2.4712624549865723 }, { "auxiliary_loss_clip": 0.01121256, "auxiliary_loss_mlp": 0.01033433, "balance_loss_clip": 1.0214467, "balance_loss_mlp": 1.03479862, "epoch": 0.8542912971591763, "flos": 17785657992960.0, "grad_norm": 1.4673496287833552, "language_loss": 0.66250002, "learning_rate": 2.0588564608764015e-07, "loss": 0.68404698, "num_input_tokens_seen": 306439240, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 14209, "time_per_iteration": 2.533271551132202 }, { "auxiliary_loss_clip": 0.01137301, "auxiliary_loss_mlp": 0.01028027, "balance_loss_clip": 1.01639199, "balance_loss_mlp": 1.03449202, "epoch": 0.8543514204118443, "flos": 26979507429120.0, "grad_norm": 2.0614520655508817, "language_loss": 0.70480824, "learning_rate": 2.0571871293905408e-07, "loss": 0.72646153, "num_input_tokens_seen": 306458425, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 14210, "time_per_iteration": 2.650763988494873 }, { "auxiliary_loss_clip": 0.01119801, "auxiliary_loss_mlp": 0.01028296, "balance_loss_clip": 1.0160358, "balance_loss_mlp": 1.033638, "epoch": 0.8544115436645122, "flos": 24024598064640.0, "grad_norm": 1.3259740743148136, "language_loss": 0.69832194, "learning_rate": 2.0555184382375156e-07, "loss": 0.71980286, "num_input_tokens_seen": 306477210, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 14211, "time_per_iteration": 2.5248918533325195 }, { "auxiliary_loss_clip": 0.01121855, "auxiliary_loss_mlp": 0.01032817, "balance_loss_clip": 1.02043152, "balance_loss_mlp": 1.03400898, "epoch": 0.8544716669171802, "flos": 16617986668800.0, "grad_norm": 2.1829421959637045, "language_loss": 0.81210989, "learning_rate": 2.0538503874768854e-07, "loss": 0.83365655, "num_input_tokens_seen": 306495820, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69921875, "step": 14212, "time_per_iteration": 2.469804525375366 }, { "auxiliary_loss_clip": 0.01130778, "auxiliary_loss_mlp": 0.01034071, "balance_loss_clip": 1.02051139, "balance_loss_mlp": 1.03437912, "epoch": 0.8545317901698481, "flos": 40005779489280.0, "grad_norm": 1.8889160474029176, "language_loss": 0.66378707, "learning_rate": 2.0521829771681664e-07, "loss": 0.68543559, "num_input_tokens_seen": 306516420, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.69921875, "step": 14213, "time_per_iteration": 2.6567928791046143 }, { "auxiliary_loss_clip": 0.01100635, "auxiliary_loss_mlp": 0.01026333, "balance_loss_clip": 1.01395345, "balance_loss_mlp": 1.03256714, "epoch": 0.8545919134225162, "flos": 19902520166400.0, "grad_norm": 2.2233211194206484, "language_loss": 0.78191745, "learning_rate": 2.0505162073708714e-07, "loss": 0.80318713, "num_input_tokens_seen": 306534785, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 14214, "time_per_iteration": 2.450028657913208 }, { "auxiliary_loss_clip": 0.01124357, "auxiliary_loss_mlp": 0.01029497, "balance_loss_clip": 1.01590109, "balance_loss_mlp": 1.03424454, "epoch": 0.8546520366751841, "flos": 18952970181120.0, "grad_norm": 8.440513075668639, "language_loss": 0.6715098, "learning_rate": 2.0488500781444862e-07, "loss": 0.69304836, "num_input_tokens_seen": 306552440, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 14215, "time_per_iteration": 2.4976930618286133 }, { "auxiliary_loss_clip": 0.01103471, "auxiliary_loss_mlp": 0.0103021, "balance_loss_clip": 1.01792002, "balance_loss_mlp": 1.03521824, "epoch": 0.8547121599278521, "flos": 35621445415680.0, "grad_norm": 2.1341839442944477, "language_loss": 0.62808198, "learning_rate": 2.0471845895484562e-07, "loss": 0.64941883, "num_input_tokens_seen": 306573600, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 14216, "time_per_iteration": 2.6075685024261475 }, { "auxiliary_loss_clip": 0.01130906, "auxiliary_loss_mlp": 0.01030705, "balance_loss_clip": 1.0184387, "balance_loss_mlp": 1.03365445, "epoch": 0.8547722831805201, "flos": 16910048154240.0, "grad_norm": 1.7467918044570356, "language_loss": 0.65473878, "learning_rate": 2.0455197416422344e-07, "loss": 0.67635489, "num_input_tokens_seen": 306592840, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 14217, "time_per_iteration": 2.4910809993743896 }, { "auxiliary_loss_clip": 0.01110257, "auxiliary_loss_mlp": 0.01031721, "balance_loss_clip": 1.02020001, "balance_loss_mlp": 1.03527939, "epoch": 0.854832406433188, "flos": 23002616304000.0, "grad_norm": 1.7226521921960063, "language_loss": 0.64957213, "learning_rate": 2.043855534485226e-07, "loss": 0.67099196, "num_input_tokens_seen": 306613210, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 14218, "time_per_iteration": 2.563013792037964 }, { "auxiliary_loss_clip": 0.01113722, "auxiliary_loss_mlp": 0.01029283, "balance_loss_clip": 1.01728415, "balance_loss_mlp": 1.03342891, "epoch": 0.854892529685856, "flos": 20412595249920.0, "grad_norm": 2.6258434141325053, "language_loss": 0.70249462, "learning_rate": 2.042191968136826e-07, "loss": 0.72392464, "num_input_tokens_seen": 306631620, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 14219, "time_per_iteration": 2.506502151489258 }, { "auxiliary_loss_clip": 0.01123127, "auxiliary_loss_mlp": 0.01035176, "balance_loss_clip": 1.02276039, "balance_loss_mlp": 1.03305423, "epoch": 0.854952652938524, "flos": 16398716094720.0, "grad_norm": 2.8652949245382886, "language_loss": 0.67428327, "learning_rate": 2.040529042656398e-07, "loss": 0.69586629, "num_input_tokens_seen": 306646695, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.72265625, "step": 14220, "time_per_iteration": 2.464643955230713 }, { "auxiliary_loss_clip": 0.0110068, "auxiliary_loss_mlp": 0.01026419, "balance_loss_clip": 1.01476598, "balance_loss_mlp": 1.03420639, "epoch": 0.855012776191192, "flos": 21178677542400.0, "grad_norm": 2.0437746364874965, "language_loss": 0.71910417, "learning_rate": 2.038866758103295e-07, "loss": 0.74037522, "num_input_tokens_seen": 306665465, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 14221, "time_per_iteration": 3.951673984527588 }, { "auxiliary_loss_clip": 0.01131117, "auxiliary_loss_mlp": 0.01035921, "balance_loss_clip": 1.0233146, "balance_loss_mlp": 1.03464675, "epoch": 0.8550728994438599, "flos": 26140993361280.0, "grad_norm": 1.8777821507168686, "language_loss": 0.60045224, "learning_rate": 2.0372051145368374e-07, "loss": 0.62212265, "num_input_tokens_seen": 306685950, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 14222, "time_per_iteration": 2.568751096725464 }, { "auxiliary_loss_clip": 0.01119844, "auxiliary_loss_mlp": 0.01032023, "balance_loss_clip": 1.01982176, "balance_loss_mlp": 1.03412306, "epoch": 0.8551330226965279, "flos": 22786793435520.0, "grad_norm": 1.545127796115659, "language_loss": 0.84060091, "learning_rate": 2.0355441120163207e-07, "loss": 0.86211956, "num_input_tokens_seen": 306705740, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 14223, "time_per_iteration": 2.554997205734253 }, { "auxiliary_loss_clip": 0.01112068, "auxiliary_loss_mlp": 0.01030976, "balance_loss_clip": 1.01878047, "balance_loss_mlp": 1.03547204, "epoch": 0.8551931459491958, "flos": 22419032037120.0, "grad_norm": 1.5529749104929855, "language_loss": 0.74183846, "learning_rate": 2.033883750601022e-07, "loss": 0.76326889, "num_input_tokens_seen": 306725065, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 14224, "time_per_iteration": 2.4903626441955566 }, { "auxiliary_loss_clip": 0.01120194, "auxiliary_loss_mlp": 0.01275537, "balance_loss_clip": 1.01634145, "balance_loss_mlp": 1.03400505, "epoch": 0.8552532692018638, "flos": 19573183342080.0, "grad_norm": 2.467633824629997, "language_loss": 0.75703412, "learning_rate": 2.032224030350207e-07, "loss": 0.78099144, "num_input_tokens_seen": 306743630, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 14225, "time_per_iteration": 2.5400655269622803 }, { "auxiliary_loss_clip": 0.01146959, "auxiliary_loss_mlp": 0.01037968, "balance_loss_clip": 1.02623224, "balance_loss_mlp": 1.03530121, "epoch": 0.8553133924545318, "flos": 26432767537920.0, "grad_norm": 1.6159580528302402, "language_loss": 0.77241194, "learning_rate": 2.0305649513230883e-07, "loss": 0.79426128, "num_input_tokens_seen": 306763105, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 14226, "time_per_iteration": 2.608292579650879 }, { "auxiliary_loss_clip": 0.01105712, "auxiliary_loss_mlp": 0.01034503, "balance_loss_clip": 1.02178907, "balance_loss_mlp": 1.03417265, "epoch": 0.8553735157071998, "flos": 16362446336640.0, "grad_norm": 2.207915012136845, "language_loss": 0.55168331, "learning_rate": 2.0289065135788875e-07, "loss": 0.57308543, "num_input_tokens_seen": 306779875, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 14227, "time_per_iteration": 4.13147497177124 }, { "auxiliary_loss_clip": 0.01121036, "auxiliary_loss_mlp": 0.01275868, "balance_loss_clip": 1.01687658, "balance_loss_mlp": 1.03508794, "epoch": 0.8554336389598677, "flos": 20887334328960.0, "grad_norm": 2.1486878427679854, "language_loss": 0.65798354, "learning_rate": 2.027248717176786e-07, "loss": 0.68195266, "num_input_tokens_seen": 306800015, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.67578125, "step": 14228, "time_per_iteration": 2.557451009750366 }, { "auxiliary_loss_clip": 0.01131295, "auxiliary_loss_mlp": 0.01036163, "balance_loss_clip": 1.02333641, "balance_loss_mlp": 1.03506303, "epoch": 0.8554937622125357, "flos": 21284721469440.0, "grad_norm": 2.17702225928885, "language_loss": 0.74021554, "learning_rate": 2.0255915621759455e-07, "loss": 0.76189017, "num_input_tokens_seen": 306814160, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 14229, "time_per_iteration": 2.659687042236328 }, { "auxiliary_loss_clip": 0.01111142, "auxiliary_loss_mlp": 0.01028453, "balance_loss_clip": 1.01698482, "balance_loss_mlp": 1.03452086, "epoch": 0.8555538854652037, "flos": 22413178120320.0, "grad_norm": 2.706802311438786, "language_loss": 0.72794133, "learning_rate": 2.0239350486355078e-07, "loss": 0.74933732, "num_input_tokens_seen": 306833310, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 14230, "time_per_iteration": 2.8230979442596436 }, { "auxiliary_loss_clip": 0.01112235, "auxiliary_loss_mlp": 0.01030322, "balance_loss_clip": 1.01788211, "balance_loss_mlp": 1.03491735, "epoch": 0.8556140087178716, "flos": 20193719725440.0, "grad_norm": 1.8810737914963567, "language_loss": 0.82801116, "learning_rate": 2.0222791766145853e-07, "loss": 0.84943676, "num_input_tokens_seen": 306851345, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 14231, "time_per_iteration": 3.1715104579925537 }, { "auxiliary_loss_clip": 0.01114742, "auxiliary_loss_mlp": 0.01033768, "balance_loss_clip": 1.02016628, "balance_loss_mlp": 1.03658116, "epoch": 0.8556741319705397, "flos": 22638123043200.0, "grad_norm": 2.075034988578091, "language_loss": 0.68065417, "learning_rate": 2.0206239461722817e-07, "loss": 0.70213926, "num_input_tokens_seen": 306871040, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6953125, "step": 14232, "time_per_iteration": 3.2637109756469727 }, { "auxiliary_loss_clip": 0.01123621, "auxiliary_loss_mlp": 0.01031351, "balance_loss_clip": 1.01802969, "balance_loss_mlp": 1.03510416, "epoch": 0.8557342552232076, "flos": 23549320281600.0, "grad_norm": 3.514308171903001, "language_loss": 0.67052847, "learning_rate": 2.0189693573676525e-07, "loss": 0.69207823, "num_input_tokens_seen": 306891625, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 14233, "time_per_iteration": 3.411311626434326 }, { "auxiliary_loss_clip": 0.01109723, "auxiliary_loss_mlp": 0.01032237, "balance_loss_clip": 1.02020276, "balance_loss_mlp": 1.03288007, "epoch": 0.8557943784758756, "flos": 19609884063360.0, "grad_norm": 3.211158174349134, "language_loss": 0.76955366, "learning_rate": 2.017315410259759e-07, "loss": 0.79097325, "num_input_tokens_seen": 306910020, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 14234, "time_per_iteration": 5.845862150192261 }, { "auxiliary_loss_clip": 0.01051113, "auxiliary_loss_mlp": 0.01002227, "balance_loss_clip": 1.00084448, "balance_loss_mlp": 1.00266385, "epoch": 0.8558545017285435, "flos": 70641891446400.0, "grad_norm": 0.7699798620093162, "language_loss": 0.58015996, "learning_rate": 2.0156621049076206e-07, "loss": 0.6006934, "num_input_tokens_seen": 306969505, "router_z_loss_clip": 0.01385498, "router_z_loss_mlp": 0.21289062, "step": 14235, "time_per_iteration": 3.8558590412139893 }, { "auxiliary_loss_clip": 0.01041929, "auxiliary_loss_mlp": 0.01002665, "balance_loss_clip": 1.0013361, "balance_loss_mlp": 1.00286603, "epoch": 0.8559146249812115, "flos": 56649983086080.0, "grad_norm": 0.7825156052769013, "language_loss": 0.56678349, "learning_rate": 2.0140094413702412e-07, "loss": 0.58722943, "num_input_tokens_seen": 307027710, "router_z_loss_clip": 0.01330566, "router_z_loss_mlp": 0.21289062, "step": 14236, "time_per_iteration": 3.6113221645355225 }, { "auxiliary_loss_clip": 0.01120219, "auxiliary_loss_mlp": 0.0103462, "balance_loss_clip": 1.02208567, "balance_loss_mlp": 1.03417182, "epoch": 0.8559747482338794, "flos": 22888240421760.0, "grad_norm": 1.9657255780723615, "language_loss": 0.69993049, "learning_rate": 2.012357419706594e-07, "loss": 0.72147888, "num_input_tokens_seen": 307045515, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 14237, "time_per_iteration": 5.559738874435425 }, { "auxiliary_loss_clip": 0.01118185, "auxiliary_loss_mlp": 0.01028333, "balance_loss_clip": 1.01615036, "balance_loss_mlp": 1.03180337, "epoch": 0.8560348714865474, "flos": 22601925112320.0, "grad_norm": 2.1569023538165757, "language_loss": 0.63357842, "learning_rate": 2.010706039975647e-07, "loss": 0.6550436, "num_input_tokens_seen": 307064470, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 14238, "time_per_iteration": 3.391570806503296 }, { "auxiliary_loss_clip": 0.01098129, "auxiliary_loss_mlp": 0.01032161, "balance_loss_clip": 1.02146769, "balance_loss_mlp": 1.0330708, "epoch": 0.8560949947392154, "flos": 17931455297280.0, "grad_norm": 1.6252930185844012, "language_loss": 0.69239479, "learning_rate": 2.0090553022363266e-07, "loss": 0.71369767, "num_input_tokens_seen": 307083900, "router_z_loss_clip": 0.10693359, "router_z_loss_mlp": 0.65234375, "step": 14239, "time_per_iteration": 3.726893663406372 }, { "auxiliary_loss_clip": 0.01105325, "auxiliary_loss_mlp": 0.01032166, "balance_loss_clip": 1.01948833, "balance_loss_mlp": 1.03413689, "epoch": 0.8561551179918834, "flos": 31026208636800.0, "grad_norm": 2.2889157682672847, "language_loss": 0.66595936, "learning_rate": 2.0074052065475434e-07, "loss": 0.68733424, "num_input_tokens_seen": 307104590, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 14240, "time_per_iteration": 3.1968860626220703 }, { "auxiliary_loss_clip": 0.01131469, "auxiliary_loss_mlp": 0.01033295, "balance_loss_clip": 1.02050948, "balance_loss_mlp": 1.03419614, "epoch": 0.8562152412445513, "flos": 30665198995200.0, "grad_norm": 1.5557695223136938, "language_loss": 0.61762547, "learning_rate": 2.0057557529681813e-07, "loss": 0.63927311, "num_input_tokens_seen": 307125580, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 14241, "time_per_iteration": 3.4616599082946777 }, { "auxiliary_loss_clip": 0.01107695, "auxiliary_loss_mlp": 0.01036497, "balance_loss_clip": 1.02334201, "balance_loss_mlp": 1.03448141, "epoch": 0.8562753644972193, "flos": 31576144838400.0, "grad_norm": 1.6447668969674516, "language_loss": 0.74752319, "learning_rate": 2.0041069415571198e-07, "loss": 0.76896507, "num_input_tokens_seen": 307147625, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73046875, "step": 14242, "time_per_iteration": 3.447462558746338 }, { "auxiliary_loss_clip": 0.01162064, "auxiliary_loss_mlp": 0.01040311, "balance_loss_clip": 1.02774024, "balance_loss_mlp": 1.0364877, "epoch": 0.8563354877498872, "flos": 26213640618240.0, "grad_norm": 2.0609803677644707, "language_loss": 0.76516426, "learning_rate": 2.0024587723731813e-07, "loss": 0.78718793, "num_input_tokens_seen": 307164665, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 14243, "time_per_iteration": 3.4309520721435547 }, { "auxiliary_loss_clip": 0.01121404, "auxiliary_loss_mlp": 0.01278384, "balance_loss_clip": 1.01883566, "balance_loss_mlp": 1.03372931, "epoch": 0.8563956110025552, "flos": 24134341092480.0, "grad_norm": 1.9836007775976903, "language_loss": 0.68471718, "learning_rate": 2.000811245475198e-07, "loss": 0.70871508, "num_input_tokens_seen": 307182530, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 14244, "time_per_iteration": 3.4135496616363525 }, { "auxiliary_loss_clip": 0.01032714, "auxiliary_loss_mlp": 0.01002773, "balance_loss_clip": 1.00150955, "balance_loss_mlp": 1.00216579, "epoch": 0.8564557342552233, "flos": 56271986311680.0, "grad_norm": 0.7513312969949268, "language_loss": 0.58506489, "learning_rate": 1.9991643609219654e-07, "loss": 0.60541975, "num_input_tokens_seen": 307241240, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.21386719, "step": 14245, "time_per_iteration": 3.650892496109009 }, { "auxiliary_loss_clip": 0.01115639, "auxiliary_loss_mlp": 0.01028958, "balance_loss_clip": 1.01563025, "balance_loss_mlp": 1.03463662, "epoch": 0.8565158575078912, "flos": 23440618748160.0, "grad_norm": 1.74563208560547, "language_loss": 0.77276564, "learning_rate": 1.9975181187722457e-07, "loss": 0.79421163, "num_input_tokens_seen": 307261485, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 14246, "time_per_iteration": 3.4395203590393066 }, { "auxiliary_loss_clip": 0.01117338, "auxiliary_loss_mlp": 0.01026574, "balance_loss_clip": 1.0142597, "balance_loss_mlp": 1.0308919, "epoch": 0.8565759807605592, "flos": 20375930442240.0, "grad_norm": 2.275481330450674, "language_loss": 0.81476343, "learning_rate": 1.9958725190848004e-07, "loss": 0.83620256, "num_input_tokens_seen": 307279160, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 14247, "time_per_iteration": 3.44175386428833 }, { "auxiliary_loss_clip": 0.01122755, "auxiliary_loss_mlp": 0.01030924, "balance_loss_clip": 1.0185262, "balance_loss_mlp": 1.03567481, "epoch": 0.8566361040132271, "flos": 18807101049600.0, "grad_norm": 1.6543142682083152, "language_loss": 0.77174509, "learning_rate": 1.994227561918349e-07, "loss": 0.79328191, "num_input_tokens_seen": 307297920, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 14248, "time_per_iteration": 3.3493685722351074 }, { "auxiliary_loss_clip": 0.01139377, "auxiliary_loss_mlp": 0.01030979, "balance_loss_clip": 1.01800919, "balance_loss_mlp": 1.03363919, "epoch": 0.8566962272658951, "flos": 24535355506560.0, "grad_norm": 1.8906812120784184, "language_loss": 0.77574217, "learning_rate": 1.9925832473316073e-07, "loss": 0.79744577, "num_input_tokens_seen": 307318320, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 14249, "time_per_iteration": 3.3728606700897217 }, { "auxiliary_loss_clip": 0.01118952, "auxiliary_loss_mlp": 0.01034828, "balance_loss_clip": 1.02212596, "balance_loss_mlp": 1.03369927, "epoch": 0.856756350518563, "flos": 23178506227200.0, "grad_norm": 1.847123975908251, "language_loss": 0.78381169, "learning_rate": 1.9909395753832414e-07, "loss": 0.80534953, "num_input_tokens_seen": 307336720, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.67578125, "step": 14250, "time_per_iteration": 3.2303929328918457 }, { "auxiliary_loss_clip": 0.01117567, "auxiliary_loss_mlp": 0.01029761, "balance_loss_clip": 1.01851392, "balance_loss_mlp": 1.03364587, "epoch": 0.856816473771231, "flos": 20808581760000.0, "grad_norm": 1.6851041532842075, "language_loss": 0.7978847, "learning_rate": 1.9892965461319223e-07, "loss": 0.81935799, "num_input_tokens_seen": 307354120, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.66015625, "step": 14251, "time_per_iteration": 3.573415517807007 }, { "auxiliary_loss_clip": 0.01128419, "auxiliary_loss_mlp": 0.01027064, "balance_loss_clip": 1.01498199, "balance_loss_mlp": 1.03347635, "epoch": 0.856876597023899, "flos": 20228157889920.0, "grad_norm": 1.980683127913227, "language_loss": 0.6178931, "learning_rate": 1.9876541596362784e-07, "loss": 0.63944787, "num_input_tokens_seen": 307373165, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 14252, "time_per_iteration": 3.2128565311431885 }, { "auxiliary_loss_clip": 0.01125319, "auxiliary_loss_mlp": 0.01035014, "balance_loss_clip": 1.02131069, "balance_loss_mlp": 1.03676844, "epoch": 0.856936720276567, "flos": 18296128126080.0, "grad_norm": 2.3375437012296967, "language_loss": 0.69695199, "learning_rate": 1.9860124159549253e-07, "loss": 0.71855533, "num_input_tokens_seen": 307391000, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.70703125, "step": 14253, "time_per_iteration": 3.371732711791992 }, { "auxiliary_loss_clip": 0.01040926, "auxiliary_loss_mlp": 0.01000235, "balance_loss_clip": 0.99905491, "balance_loss_mlp": 1.00210786, "epoch": 0.8569968435292349, "flos": 69878394933120.0, "grad_norm": 0.7877849516597214, "language_loss": 0.59307051, "learning_rate": 1.984371315146447e-07, "loss": 0.61348212, "num_input_tokens_seen": 307452865, "router_z_loss_clip": 0.01177979, "router_z_loss_mlp": 0.21289062, "step": 14254, "time_per_iteration": 3.8345158100128174 }, { "auxiliary_loss_clip": 0.01118965, "auxiliary_loss_mlp": 0.01035653, "balance_loss_clip": 1.02213502, "balance_loss_mlp": 1.03309441, "epoch": 0.8570569667819029, "flos": 25848572739840.0, "grad_norm": 2.2735925842710913, "language_loss": 0.80859035, "learning_rate": 1.9827308572694168e-07, "loss": 0.83013654, "num_input_tokens_seen": 307471940, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.68359375, "step": 14255, "time_per_iteration": 3.216458797454834 }, { "auxiliary_loss_clip": 0.01100942, "auxiliary_loss_mlp": 0.01025593, "balance_loss_clip": 1.01305234, "balance_loss_mlp": 1.03373766, "epoch": 0.8571170900345708, "flos": 23257115141760.0, "grad_norm": 2.3913861859504233, "language_loss": 0.67719275, "learning_rate": 1.9810910423823791e-07, "loss": 0.69845808, "num_input_tokens_seen": 307488745, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.671875, "step": 14256, "time_per_iteration": 3.201871395111084 }, { "auxiliary_loss_clip": 0.01112604, "auxiliary_loss_mlp": 0.01030785, "balance_loss_clip": 1.01878071, "balance_loss_mlp": 1.03490162, "epoch": 0.8571772132872388, "flos": 18917670090240.0, "grad_norm": 1.7218528256914667, "language_loss": 0.70192134, "learning_rate": 1.9794518705438513e-07, "loss": 0.72335523, "num_input_tokens_seen": 307506855, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 14257, "time_per_iteration": 3.329902172088623 }, { "auxiliary_loss_clip": 0.01128248, "auxiliary_loss_mlp": 0.0103087, "balance_loss_clip": 1.01834738, "balance_loss_mlp": 1.03407085, "epoch": 0.8572373365399069, "flos": 33250120318080.0, "grad_norm": 1.8407938553374625, "language_loss": 0.77198982, "learning_rate": 1.9778133418123287e-07, "loss": 0.79358101, "num_input_tokens_seen": 307526115, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 14258, "time_per_iteration": 3.4940643310546875 }, { "auxiliary_loss_clip": 0.01111311, "auxiliary_loss_mlp": 0.01279831, "balance_loss_clip": 1.0209682, "balance_loss_mlp": 1.0349313, "epoch": 0.8572974597925748, "flos": 23327535755520.0, "grad_norm": 1.6600964996845213, "language_loss": 0.67989129, "learning_rate": 1.9761754562462963e-07, "loss": 0.70380276, "num_input_tokens_seen": 307545230, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 14259, "time_per_iteration": 3.7636454105377197 }, { "auxiliary_loss_clip": 0.0111058, "auxiliary_loss_mlp": 0.01028103, "balance_loss_clip": 1.01563346, "balance_loss_mlp": 1.03269315, "epoch": 0.8573575830452428, "flos": 24535858296960.0, "grad_norm": 1.6633636444619189, "language_loss": 0.76946992, "learning_rate": 1.9745382139041932e-07, "loss": 0.79085678, "num_input_tokens_seen": 307564900, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 14260, "time_per_iteration": 3.4450273513793945 }, { "auxiliary_loss_clip": 0.01122672, "auxiliary_loss_mlp": 0.0102847, "balance_loss_clip": 1.01582146, "balance_loss_mlp": 1.03473234, "epoch": 0.8574177062979107, "flos": 24165403378560.0, "grad_norm": 1.8652626057959794, "language_loss": 0.74257636, "learning_rate": 1.9729016148444532e-07, "loss": 0.7640878, "num_input_tokens_seen": 307583500, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 14261, "time_per_iteration": 3.423755168914795 }, { "auxiliary_loss_clip": 0.01116561, "auxiliary_loss_mlp": 0.01034246, "balance_loss_clip": 1.02024436, "balance_loss_mlp": 1.03542399, "epoch": 0.8574778295505787, "flos": 17930737025280.0, "grad_norm": 1.8913351718467146, "language_loss": 0.78836542, "learning_rate": 1.9712656591254896e-07, "loss": 0.80987346, "num_input_tokens_seen": 307601430, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 14262, "time_per_iteration": 3.4018383026123047 }, { "auxiliary_loss_clip": 0.01126909, "auxiliary_loss_mlp": 0.01031866, "balance_loss_clip": 1.02052379, "balance_loss_mlp": 1.03454673, "epoch": 0.8575379528032466, "flos": 21580697537280.0, "grad_norm": 1.6234592055186101, "language_loss": 0.68150318, "learning_rate": 1.969630346805673e-07, "loss": 0.70309091, "num_input_tokens_seen": 307621495, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66015625, "step": 14263, "time_per_iteration": 4.75161075592041 }, { "auxiliary_loss_clip": 0.0112886, "auxiliary_loss_mlp": 0.0127924, "balance_loss_clip": 1.01970816, "balance_loss_mlp": 1.03357255, "epoch": 0.8575980760559146, "flos": 21761579450880.0, "grad_norm": 1.571053431915799, "language_loss": 0.79665452, "learning_rate": 1.9679956779433726e-07, "loss": 0.82073557, "num_input_tokens_seen": 307640840, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 14264, "time_per_iteration": 3.264058828353882 }, { "auxiliary_loss_clip": 0.011308, "auxiliary_loss_mlp": 0.01029607, "balance_loss_clip": 1.01762629, "balance_loss_mlp": 1.03529215, "epoch": 0.8576581993085826, "flos": 34386442047360.0, "grad_norm": 1.2443368479380736, "language_loss": 0.69917333, "learning_rate": 1.9663616525969173e-07, "loss": 0.72077733, "num_input_tokens_seen": 307663820, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 14265, "time_per_iteration": 3.4682466983795166 }, { "auxiliary_loss_clip": 0.01098586, "auxiliary_loss_mlp": 0.01026122, "balance_loss_clip": 1.01492238, "balance_loss_mlp": 1.03312254, "epoch": 0.8577183225612506, "flos": 23222497409280.0, "grad_norm": 1.665724106428348, "language_loss": 0.65799081, "learning_rate": 1.964728270824636e-07, "loss": 0.67923796, "num_input_tokens_seen": 307682385, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.65625, "step": 14266, "time_per_iteration": 3.429805278778076 }, { "auxiliary_loss_clip": 0.01120116, "auxiliary_loss_mlp": 0.01034837, "balance_loss_clip": 1.02243936, "balance_loss_mlp": 1.03466702, "epoch": 0.8577784458139185, "flos": 28804164462720.0, "grad_norm": 1.853513172690939, "language_loss": 0.75442523, "learning_rate": 1.963095532684802e-07, "loss": 0.77597475, "num_input_tokens_seen": 307704680, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 14267, "time_per_iteration": 3.7148287296295166 }, { "auxiliary_loss_clip": 0.01109711, "auxiliary_loss_mlp": 0.01030637, "balance_loss_clip": 1.01892495, "balance_loss_mlp": 1.0338273, "epoch": 0.8578385690665865, "flos": 19755573626880.0, "grad_norm": 1.7162551685441534, "language_loss": 0.87990546, "learning_rate": 1.9614634382356975e-07, "loss": 0.90130889, "num_input_tokens_seen": 307723245, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 14268, "time_per_iteration": 3.2821009159088135 }, { "auxiliary_loss_clip": 0.01113623, "auxiliary_loss_mlp": 0.01034883, "balance_loss_clip": 1.02221131, "balance_loss_mlp": 1.03415585, "epoch": 0.8578986923192544, "flos": 20704082117760.0, "grad_norm": 1.762153832697314, "language_loss": 0.72756016, "learning_rate": 1.9598319875355608e-07, "loss": 0.74904525, "num_input_tokens_seen": 307742510, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 14269, "time_per_iteration": 4.999084949493408 }, { "auxiliary_loss_clip": 0.01112994, "auxiliary_loss_mlp": 0.01030928, "balance_loss_clip": 1.01865005, "balance_loss_mlp": 1.03523624, "epoch": 0.8579588155719224, "flos": 36101715189120.0, "grad_norm": 1.5229044945816579, "language_loss": 0.66528904, "learning_rate": 1.9582011806426158e-07, "loss": 0.6867283, "num_input_tokens_seen": 307766030, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 14270, "time_per_iteration": 3.3090312480926514 }, { "auxiliary_loss_clip": 0.01118304, "auxiliary_loss_mlp": 0.0102607, "balance_loss_clip": 1.01367867, "balance_loss_mlp": 1.03258491, "epoch": 0.8580189388245905, "flos": 22853479034880.0, "grad_norm": 1.599097192336923, "language_loss": 0.73835832, "learning_rate": 1.9565710176150585e-07, "loss": 0.7598021, "num_input_tokens_seen": 307785800, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.67578125, "step": 14271, "time_per_iteration": 3.402916669845581 }, { "auxiliary_loss_clip": 0.01057739, "auxiliary_loss_mlp": 0.01247759, "balance_loss_clip": 1.00051081, "balance_loss_mlp": 1.00175619, "epoch": 0.8580790620772584, "flos": 69642104290560.0, "grad_norm": 0.8039685906189074, "language_loss": 0.59455204, "learning_rate": 1.9549414985110734e-07, "loss": 0.617607, "num_input_tokens_seen": 307850995, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.21289062, "step": 14272, "time_per_iteration": 3.267935037612915 }, { "auxiliary_loss_clip": 0.01125643, "auxiliary_loss_mlp": 0.01032406, "balance_loss_clip": 1.01960921, "balance_loss_mlp": 1.03545046, "epoch": 0.8581391853299264, "flos": 13334243270400.0, "grad_norm": 1.6923859304472781, "language_loss": 0.75121677, "learning_rate": 1.9533126233888032e-07, "loss": 0.77279723, "num_input_tokens_seen": 307868585, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 14273, "time_per_iteration": 2.6230738162994385 }, { "auxiliary_loss_clip": 0.01111316, "auxiliary_loss_mlp": 0.01030848, "balance_loss_clip": 1.01843238, "balance_loss_mlp": 1.03403974, "epoch": 0.8581993085825943, "flos": 19645651031040.0, "grad_norm": 2.458743587263827, "language_loss": 0.82370639, "learning_rate": 1.9516843923063876e-07, "loss": 0.845128, "num_input_tokens_seen": 307886820, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.68359375, "step": 14274, "time_per_iteration": 2.5801870822906494 }, { "auxiliary_loss_clip": 0.01031842, "auxiliary_loss_mlp": 0.01000087, "balance_loss_clip": 0.99872798, "balance_loss_mlp": 1.00162697, "epoch": 0.8582594318352623, "flos": 55825077294720.0, "grad_norm": 0.8140639658980878, "language_loss": 0.60856158, "learning_rate": 1.9500568053219313e-07, "loss": 0.62888086, "num_input_tokens_seen": 307944020, "router_z_loss_clip": 0.01361084, "router_z_loss_mlp": 0.21289062, "step": 14275, "time_per_iteration": 3.2741658687591553 }, { "auxiliary_loss_clip": 0.01113489, "auxiliary_loss_mlp": 0.01029851, "balance_loss_clip": 1.0163027, "balance_loss_mlp": 1.03403771, "epoch": 0.8583195550879302, "flos": 24279563779200.0, "grad_norm": 2.2014916363551738, "language_loss": 0.59253436, "learning_rate": 1.948429862493517e-07, "loss": 0.61396784, "num_input_tokens_seen": 307961055, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 14276, "time_per_iteration": 4.510971784591675 }, { "auxiliary_loss_clip": 0.01121599, "auxiliary_loss_mlp": 0.01035055, "balance_loss_clip": 1.0229193, "balance_loss_mlp": 1.03429794, "epoch": 0.8583796783405983, "flos": 13444129952640.0, "grad_norm": 7.438967681954091, "language_loss": 0.7631309, "learning_rate": 1.9468035638792046e-07, "loss": 0.78469753, "num_input_tokens_seen": 307978690, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 14277, "time_per_iteration": 2.539851188659668 }, { "auxiliary_loss_clip": 0.0111976, "auxiliary_loss_mlp": 0.01030561, "balance_loss_clip": 1.01741171, "balance_loss_mlp": 1.03298843, "epoch": 0.8584398015932662, "flos": 16180271533440.0, "grad_norm": 1.6331117180713157, "language_loss": 0.83323777, "learning_rate": 1.9451779095370325e-07, "loss": 0.85474098, "num_input_tokens_seen": 307995870, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69140625, "step": 14278, "time_per_iteration": 4.129496812820435 }, { "auxiliary_loss_clip": 0.01124171, "auxiliary_loss_mlp": 0.01031606, "balance_loss_clip": 1.01816487, "balance_loss_mlp": 1.03506446, "epoch": 0.8584999248459342, "flos": 17450431338240.0, "grad_norm": 1.9340447785385189, "language_loss": 0.7444194, "learning_rate": 1.9435528995250272e-07, "loss": 0.76597714, "num_input_tokens_seen": 308013645, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 14279, "time_per_iteration": 2.5180742740631104 }, { "auxiliary_loss_clip": 0.01102836, "auxiliary_loss_mlp": 0.01032458, "balance_loss_clip": 1.02051282, "balance_loss_mlp": 1.03469265, "epoch": 0.8585600480986021, "flos": 23441013797760.0, "grad_norm": 2.3807456722085605, "language_loss": 0.66023511, "learning_rate": 1.9419285339011626e-07, "loss": 0.68158805, "num_input_tokens_seen": 308032490, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 14280, "time_per_iteration": 2.5385541915893555 }, { "auxiliary_loss_clip": 0.01108517, "auxiliary_loss_mlp": 0.01029369, "balance_loss_clip": 1.01775217, "balance_loss_mlp": 1.03252125, "epoch": 0.8586201713512701, "flos": 19937927998080.0, "grad_norm": 1.6583949529852315, "language_loss": 0.62782967, "learning_rate": 1.940304812723421e-07, "loss": 0.64920855, "num_input_tokens_seen": 308052110, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 14281, "time_per_iteration": 2.4963104724884033 }, { "auxiliary_loss_clip": 0.01120362, "auxiliary_loss_mlp": 0.01279386, "balance_loss_clip": 1.01966631, "balance_loss_mlp": 1.03344011, "epoch": 0.858680294603938, "flos": 15304769435520.0, "grad_norm": 1.80140917363239, "language_loss": 0.73512405, "learning_rate": 1.9386817360497432e-07, "loss": 0.75912148, "num_input_tokens_seen": 308070660, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 14282, "time_per_iteration": 2.5116472244262695 }, { "auxiliary_loss_clip": 0.01117471, "auxiliary_loss_mlp": 0.0102605, "balance_loss_clip": 1.01458263, "balance_loss_mlp": 1.03156471, "epoch": 0.858740417856606, "flos": 22711237176960.0, "grad_norm": 2.0434633193753435, "language_loss": 0.75740391, "learning_rate": 1.9370593039380622e-07, "loss": 0.77883911, "num_input_tokens_seen": 308089520, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 14283, "time_per_iteration": 2.5211360454559326 }, { "auxiliary_loss_clip": 0.01112352, "auxiliary_loss_mlp": 0.01031999, "balance_loss_clip": 1.01995313, "balance_loss_mlp": 1.03333807, "epoch": 0.8588005411092741, "flos": 34054303962240.0, "grad_norm": 1.826315466350251, "language_loss": 0.60032308, "learning_rate": 1.9354375164462632e-07, "loss": 0.62176657, "num_input_tokens_seen": 308111545, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69921875, "step": 14284, "time_per_iteration": 2.605154037475586 }, { "auxiliary_loss_clip": 0.01118738, "auxiliary_loss_mlp": 0.01029811, "balance_loss_clip": 1.01748514, "balance_loss_mlp": 1.03434038, "epoch": 0.858860664361942, "flos": 28913584268160.0, "grad_norm": 1.9905973401364674, "language_loss": 0.75554335, "learning_rate": 1.9338163736322354e-07, "loss": 0.7770288, "num_input_tokens_seen": 308129690, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6640625, "step": 14285, "time_per_iteration": 2.5440776348114014 }, { "auxiliary_loss_clip": 0.01116372, "auxiliary_loss_mlp": 0.0103498, "balance_loss_clip": 1.02235603, "balance_loss_mlp": 1.0369786, "epoch": 0.85892078761461, "flos": 19792525743360.0, "grad_norm": 2.012190938960332, "language_loss": 0.74689293, "learning_rate": 1.9321958755538303e-07, "loss": 0.76840645, "num_input_tokens_seen": 308147410, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 14286, "time_per_iteration": 2.5289785861968994 }, { "auxiliary_loss_clip": 0.01039938, "auxiliary_loss_mlp": 0.01250379, "balance_loss_clip": 1.00306344, "balance_loss_mlp": 1.00157237, "epoch": 0.8589809108672779, "flos": 67106630039040.0, "grad_norm": 0.8255377320838017, "language_loss": 0.49495423, "learning_rate": 1.9305760222688793e-07, "loss": 0.51785743, "num_input_tokens_seen": 308204875, "router_z_loss_clip": 0.01330566, "router_z_loss_mlp": 0.21289062, "step": 14287, "time_per_iteration": 3.118678331375122 }, { "auxiliary_loss_clip": 0.01114168, "auxiliary_loss_mlp": 0.0103006, "balance_loss_clip": 1.01768053, "balance_loss_mlp": 1.03565609, "epoch": 0.8590410341199459, "flos": 29716259541120.0, "grad_norm": 2.048369591434287, "language_loss": 0.79083341, "learning_rate": 1.9289568138351854e-07, "loss": 0.81227571, "num_input_tokens_seen": 308225690, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 14288, "time_per_iteration": 2.6201508045196533 }, { "auxiliary_loss_clip": 0.01132107, "auxiliary_loss_mlp": 0.01034605, "balance_loss_clip": 1.02090192, "balance_loss_mlp": 1.0343523, "epoch": 0.8591011573726138, "flos": 23987430466560.0, "grad_norm": 2.1424687830005325, "language_loss": 0.80768371, "learning_rate": 1.927338250310544e-07, "loss": 0.82935083, "num_input_tokens_seen": 308245255, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.70703125, "step": 14289, "time_per_iteration": 2.53117036819458 }, { "auxiliary_loss_clip": 0.01114688, "auxiliary_loss_mlp": 0.01028472, "balance_loss_clip": 1.01625288, "balance_loss_mlp": 1.03644836, "epoch": 0.8591612806252819, "flos": 14428656806400.0, "grad_norm": 1.7458530864107975, "language_loss": 0.65356624, "learning_rate": 1.9257203317527115e-07, "loss": 0.67499787, "num_input_tokens_seen": 308261755, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 14290, "time_per_iteration": 2.5131561756134033 }, { "auxiliary_loss_clip": 0.0112349, "auxiliary_loss_mlp": 0.01030302, "balance_loss_clip": 1.01721883, "balance_loss_mlp": 1.0348531, "epoch": 0.8592214038779498, "flos": 31577150419200.0, "grad_norm": 2.3576945343728153, "language_loss": 0.54987442, "learning_rate": 1.9241030582194217e-07, "loss": 0.57141238, "num_input_tokens_seen": 308285145, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 14291, "time_per_iteration": 2.6280572414398193 }, { "auxiliary_loss_clip": 0.01113982, "auxiliary_loss_mlp": 0.01033426, "balance_loss_clip": 1.02107596, "balance_loss_mlp": 1.03551173, "epoch": 0.8592815271306178, "flos": 17457290835840.0, "grad_norm": 2.0825268016112983, "language_loss": 0.71711004, "learning_rate": 1.9224864297684018e-07, "loss": 0.7385841, "num_input_tokens_seen": 308304130, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 14292, "time_per_iteration": 2.6402666568756104 }, { "auxiliary_loss_clip": 0.01110364, "auxiliary_loss_mlp": 0.01029885, "balance_loss_clip": 1.0175705, "balance_loss_mlp": 1.03290057, "epoch": 0.8593416503832857, "flos": 25411360394880.0, "grad_norm": 1.6906915760796926, "language_loss": 0.71476614, "learning_rate": 1.920870446457339e-07, "loss": 0.73616862, "num_input_tokens_seen": 308324670, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 14293, "time_per_iteration": 2.541346311569214 }, { "auxiliary_loss_clip": 0.01118454, "auxiliary_loss_mlp": 0.01034105, "balance_loss_clip": 1.02196312, "balance_loss_mlp": 1.03295064, "epoch": 0.8594017736359537, "flos": 20996646393600.0, "grad_norm": 1.6126983766857046, "language_loss": 0.68260235, "learning_rate": 1.9192551083439068e-07, "loss": 0.70412791, "num_input_tokens_seen": 308344215, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.67578125, "step": 14294, "time_per_iteration": 2.5867276191711426 }, { "auxiliary_loss_clip": 0.01099351, "auxiliary_loss_mlp": 0.01030392, "balance_loss_clip": 1.01832771, "balance_loss_mlp": 1.03214586, "epoch": 0.8594618968886216, "flos": 22091059929600.0, "grad_norm": 1.8754583512586647, "language_loss": 0.780541, "learning_rate": 1.917640415485744e-07, "loss": 0.8018384, "num_input_tokens_seen": 308360520, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.671875, "step": 14295, "time_per_iteration": 2.4774057865142822 }, { "auxiliary_loss_clip": 0.01126917, "auxiliary_loss_mlp": 0.01039226, "balance_loss_clip": 1.02479541, "balance_loss_mlp": 1.03596401, "epoch": 0.8595220201412896, "flos": 11656245467520.0, "grad_norm": 5.088321740528312, "language_loss": 0.69499862, "learning_rate": 1.9160263679404908e-07, "loss": 0.71666002, "num_input_tokens_seen": 308376865, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.73046875, "step": 14296, "time_per_iteration": 2.512213706970215 }, { "auxiliary_loss_clip": 0.01121085, "auxiliary_loss_mlp": 0.01028808, "balance_loss_clip": 1.01608801, "balance_loss_mlp": 1.03394675, "epoch": 0.8595821433939577, "flos": 25040366772480.0, "grad_norm": 2.006642940114648, "language_loss": 0.78978622, "learning_rate": 1.9144129657657325e-07, "loss": 0.81128514, "num_input_tokens_seen": 308395870, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 14297, "time_per_iteration": 2.543534517288208 }, { "auxiliary_loss_clip": 0.01107871, "auxiliary_loss_mlp": 0.01027315, "balance_loss_clip": 1.01632965, "balance_loss_mlp": 1.0325793, "epoch": 0.8596422666466256, "flos": 28511528359680.0, "grad_norm": 1.7761384542338976, "language_loss": 0.67973608, "learning_rate": 1.9128002090190564e-07, "loss": 0.70108795, "num_input_tokens_seen": 308417250, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.6640625, "step": 14298, "time_per_iteration": 2.637230634689331 }, { "auxiliary_loss_clip": 0.01120102, "auxiliary_loss_mlp": 0.01033406, "balance_loss_clip": 1.02116323, "balance_loss_mlp": 1.03566384, "epoch": 0.8597023898992936, "flos": 13589137157760.0, "grad_norm": 2.0438243232042197, "language_loss": 0.68385363, "learning_rate": 1.9111880977580118e-07, "loss": 0.70538878, "num_input_tokens_seen": 308434565, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6640625, "step": 14299, "time_per_iteration": 2.4504902362823486 }, { "auxiliary_loss_clip": 0.01132979, "auxiliary_loss_mlp": 0.01034977, "balance_loss_clip": 1.022192, "balance_loss_mlp": 1.03553843, "epoch": 0.8597625131519615, "flos": 32300821728000.0, "grad_norm": 2.264016795163564, "language_loss": 0.72018731, "learning_rate": 1.9095766320401395e-07, "loss": 0.74186689, "num_input_tokens_seen": 308450040, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 14300, "time_per_iteration": 2.59419584274292 }, { "auxiliary_loss_clip": 0.01119852, "auxiliary_loss_mlp": 0.01029518, "balance_loss_clip": 1.01751387, "balance_loss_mlp": 1.03334355, "epoch": 0.8598226364046295, "flos": 28730367970560.0, "grad_norm": 1.7305535890548187, "language_loss": 0.68835551, "learning_rate": 1.9079658119229382e-07, "loss": 0.70984924, "num_input_tokens_seen": 308470545, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 14301, "time_per_iteration": 2.6078341007232666 }, { "auxiliary_loss_clip": 0.01140471, "auxiliary_loss_mlp": 0.01031054, "balance_loss_clip": 1.01890695, "balance_loss_mlp": 1.03533649, "epoch": 0.8598827596572974, "flos": 21871825269120.0, "grad_norm": 1.7052799049600957, "language_loss": 0.74520612, "learning_rate": 1.9063556374639012e-07, "loss": 0.76692128, "num_input_tokens_seen": 308490020, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 14302, "time_per_iteration": 2.6122169494628906 }, { "auxiliary_loss_clip": 0.01106813, "auxiliary_loss_mlp": 0.01029715, "balance_loss_clip": 1.01856303, "balance_loss_mlp": 1.03218377, "epoch": 0.8599428829099655, "flos": 23767297966080.0, "grad_norm": 1.914280959654461, "language_loss": 0.83757645, "learning_rate": 1.9047461087204896e-07, "loss": 0.85894173, "num_input_tokens_seen": 308509065, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.65625, "step": 14303, "time_per_iteration": 2.569415807723999 }, { "auxiliary_loss_clip": 0.01111513, "auxiliary_loss_mlp": 0.01033829, "balance_loss_clip": 1.02097249, "balance_loss_mlp": 1.03353405, "epoch": 0.8600030061626334, "flos": 16212770363520.0, "grad_norm": 2.013365717840036, "language_loss": 0.77500129, "learning_rate": 1.903137225750142e-07, "loss": 0.79645467, "num_input_tokens_seen": 308524725, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 14304, "time_per_iteration": 3.973996162414551 }, { "auxiliary_loss_clip": 0.01104584, "auxiliary_loss_mlp": 0.01036077, "balance_loss_clip": 1.02245784, "balance_loss_mlp": 1.0346998, "epoch": 0.8600631294153014, "flos": 15669370437120.0, "grad_norm": 2.0734123750853053, "language_loss": 0.53473419, "learning_rate": 1.901528988610277e-07, "loss": 0.55614078, "num_input_tokens_seen": 308543525, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.69921875, "step": 14305, "time_per_iteration": 2.500077486038208 }, { "auxiliary_loss_clip": 0.01103025, "auxiliary_loss_mlp": 0.01027402, "balance_loss_clip": 1.01495647, "balance_loss_mlp": 1.03508067, "epoch": 0.8601232526679693, "flos": 17493093717120.0, "grad_norm": 2.958473685139883, "language_loss": 0.8370356, "learning_rate": 1.8999213973582885e-07, "loss": 0.85833985, "num_input_tokens_seen": 308557995, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 14306, "time_per_iteration": 2.58581280708313 }, { "auxiliary_loss_clip": 0.01108853, "auxiliary_loss_mlp": 0.01275165, "balance_loss_clip": 1.01662111, "balance_loss_mlp": 1.03307211, "epoch": 0.8601833759206373, "flos": 26985935963520.0, "grad_norm": 1.3688334872855947, "language_loss": 0.7118814, "learning_rate": 1.8983144520515437e-07, "loss": 0.73572159, "num_input_tokens_seen": 308582750, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 14307, "time_per_iteration": 2.6445810794830322 }, { "auxiliary_loss_clip": 0.01114962, "auxiliary_loss_mlp": 0.01284676, "balance_loss_clip": 1.02417839, "balance_loss_mlp": 1.03439593, "epoch": 0.8602434991733052, "flos": 25229760209280.0, "grad_norm": 1.7735516741583919, "language_loss": 0.63832313, "learning_rate": 1.8967081527473905e-07, "loss": 0.66231948, "num_input_tokens_seen": 308603770, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 14308, "time_per_iteration": 2.570542573928833 }, { "auxiliary_loss_clip": 0.01040214, "auxiliary_loss_mlp": 0.01002034, "balance_loss_clip": 1.00072908, "balance_loss_mlp": 1.00122857, "epoch": 0.8603036224259732, "flos": 71015363107200.0, "grad_norm": 0.6307898491711674, "language_loss": 0.48528108, "learning_rate": 1.8951024995031605e-07, "loss": 0.50570357, "num_input_tokens_seen": 308667735, "router_z_loss_clip": 0.01306152, "router_z_loss_mlp": 0.21289062, "step": 14309, "time_per_iteration": 3.249134063720703 }, { "auxiliary_loss_clip": 0.01150097, "auxiliary_loss_mlp": 0.01035021, "balance_loss_clip": 1.0213536, "balance_loss_mlp": 1.03520334, "epoch": 0.8603637456786413, "flos": 20300625578880.0, "grad_norm": 3.4320168704114704, "language_loss": 0.67534608, "learning_rate": 1.8934974923761503e-07, "loss": 0.69719732, "num_input_tokens_seen": 308686300, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.70703125, "step": 14310, "time_per_iteration": 4.092551231384277 }, { "auxiliary_loss_clip": 0.01150752, "auxiliary_loss_mlp": 0.01032246, "balance_loss_clip": 1.02045619, "balance_loss_mlp": 1.03461981, "epoch": 0.8604238689313092, "flos": 21835842819840.0, "grad_norm": 1.6727324713752503, "language_loss": 0.78265166, "learning_rate": 1.8918931314236386e-07, "loss": 0.80448169, "num_input_tokens_seen": 308705825, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.7109375, "step": 14311, "time_per_iteration": 2.574784994125366 }, { "auxiliary_loss_clip": 0.01110582, "auxiliary_loss_mlp": 0.01029485, "balance_loss_clip": 1.01774251, "balance_loss_mlp": 1.03408003, "epoch": 0.8604839921839772, "flos": 18004210295040.0, "grad_norm": 2.2142935747536314, "language_loss": 0.72040844, "learning_rate": 1.8902894167028794e-07, "loss": 0.74180913, "num_input_tokens_seen": 308723340, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.67578125, "step": 14312, "time_per_iteration": 2.719695806503296 }, { "auxiliary_loss_clip": 0.01119234, "auxiliary_loss_mlp": 0.01030781, "balance_loss_clip": 1.01851487, "balance_loss_mlp": 1.03214848, "epoch": 0.8605441154366451, "flos": 16252164604800.0, "grad_norm": 1.9767324795848327, "language_loss": 0.77785921, "learning_rate": 1.8886863482711136e-07, "loss": 0.79935932, "num_input_tokens_seen": 308741280, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 14313, "time_per_iteration": 2.531721591949463 }, { "auxiliary_loss_clip": 0.01121085, "auxiliary_loss_mlp": 0.01031874, "balance_loss_clip": 1.01857615, "balance_loss_mlp": 1.03571415, "epoch": 0.8606042386893131, "flos": 32267065921920.0, "grad_norm": 1.6346239935139946, "language_loss": 0.72621012, "learning_rate": 1.8870839261855377e-07, "loss": 0.74773967, "num_input_tokens_seen": 308762875, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.671875, "step": 14314, "time_per_iteration": 2.725614547729492 }, { "auxiliary_loss_clip": 0.01118752, "auxiliary_loss_mlp": 0.01032571, "balance_loss_clip": 1.01995277, "balance_loss_mlp": 1.03244615, "epoch": 0.860664361941981, "flos": 20229774001920.0, "grad_norm": 1.9399932692654955, "language_loss": 0.68992889, "learning_rate": 1.8854821505033458e-07, "loss": 0.71144217, "num_input_tokens_seen": 308780315, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 14315, "time_per_iteration": 2.6893908977508545 }, { "auxiliary_loss_clip": 0.01112472, "auxiliary_loss_mlp": 0.01035587, "balance_loss_clip": 1.02329612, "balance_loss_mlp": 1.03531694, "epoch": 0.8607244851946491, "flos": 25191622944000.0, "grad_norm": 1.77781578465512, "language_loss": 0.72447681, "learning_rate": 1.8838810212817015e-07, "loss": 0.74595737, "num_input_tokens_seen": 308799435, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 14316, "time_per_iteration": 3.5586421489715576 }, { "auxiliary_loss_clip": 0.01125424, "auxiliary_loss_mlp": 0.0102795, "balance_loss_clip": 1.01578498, "balance_loss_mlp": 1.03764474, "epoch": 0.860784608447317, "flos": 20482082110080.0, "grad_norm": 5.393598948682033, "language_loss": 0.82736522, "learning_rate": 1.882280538577743e-07, "loss": 0.84889901, "num_input_tokens_seen": 308817730, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 14317, "time_per_iteration": 3.262140989303589 }, { "auxiliary_loss_clip": 0.01119248, "auxiliary_loss_mlp": 0.01031596, "balance_loss_clip": 1.01918077, "balance_loss_mlp": 1.03470683, "epoch": 0.860844731699985, "flos": 50476037696640.0, "grad_norm": 1.8492516159007153, "language_loss": 0.66922981, "learning_rate": 1.880680702448585e-07, "loss": 0.6907382, "num_input_tokens_seen": 308841735, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.66796875, "step": 14318, "time_per_iteration": 5.700196981430054 }, { "auxiliary_loss_clip": 0.01108411, "auxiliary_loss_mlp": 0.01027675, "balance_loss_clip": 1.01614118, "balance_loss_mlp": 1.03319097, "epoch": 0.8609048549526529, "flos": 41172768455040.0, "grad_norm": 1.4873684154770412, "language_loss": 0.71297199, "learning_rate": 1.879081512951326e-07, "loss": 0.7343328, "num_input_tokens_seen": 308865050, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 14319, "time_per_iteration": 4.960399150848389 }, { "auxiliary_loss_clip": 0.01126388, "auxiliary_loss_mlp": 0.01280962, "balance_loss_clip": 1.02142763, "balance_loss_mlp": 1.037974, "epoch": 0.8609649782053209, "flos": 23951196622080.0, "grad_norm": 1.9182858729412442, "language_loss": 0.67301261, "learning_rate": 1.8774829701430407e-07, "loss": 0.6970861, "num_input_tokens_seen": 308885375, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 14320, "time_per_iteration": 3.061715602874756 }, { "auxiliary_loss_clip": 0.01125435, "auxiliary_loss_mlp": 0.01034171, "balance_loss_clip": 1.02124238, "balance_loss_mlp": 1.03522348, "epoch": 0.8610251014579888, "flos": 23112574813440.0, "grad_norm": 1.9548446082939066, "language_loss": 0.79730153, "learning_rate": 1.875885074080763e-07, "loss": 0.81889766, "num_input_tokens_seen": 308904700, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 14321, "time_per_iteration": 2.9932289123535156 }, { "auxiliary_loss_clip": 0.01109362, "auxiliary_loss_mlp": 0.01276728, "balance_loss_clip": 1.0168767, "balance_loss_mlp": 1.03408194, "epoch": 0.8610852247106568, "flos": 19426811420160.0, "grad_norm": 1.8050096441839527, "language_loss": 0.70877576, "learning_rate": 1.87428782482153e-07, "loss": 0.73263663, "num_input_tokens_seen": 308922985, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6640625, "step": 14322, "time_per_iteration": 3.1098973751068115 }, { "auxiliary_loss_clip": 0.01125266, "auxiliary_loss_mlp": 0.01038141, "balance_loss_clip": 1.02473032, "balance_loss_mlp": 1.03477407, "epoch": 0.8611453479633249, "flos": 19312076401920.0, "grad_norm": 1.991668225241887, "language_loss": 0.55932844, "learning_rate": 1.87269122242234e-07, "loss": 0.58096248, "num_input_tokens_seen": 308940765, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 14323, "time_per_iteration": 3.079379081726074 }, { "auxiliary_loss_clip": 0.01127274, "auxiliary_loss_mlp": 0.01027315, "balance_loss_clip": 1.01524496, "balance_loss_mlp": 1.03370953, "epoch": 0.8612054712159928, "flos": 23253667436160.0, "grad_norm": 1.8127590832933602, "language_loss": 0.75594628, "learning_rate": 1.8710952669401725e-07, "loss": 0.77749217, "num_input_tokens_seen": 308960110, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.671875, "step": 14324, "time_per_iteration": 3.218891143798828 }, { "auxiliary_loss_clip": 0.0113997, "auxiliary_loss_mlp": 0.01031003, "balance_loss_clip": 1.01778293, "balance_loss_mlp": 1.03524375, "epoch": 0.8612655944686608, "flos": 16028440744320.0, "grad_norm": 2.438859462108209, "language_loss": 0.66477585, "learning_rate": 1.869499958431977e-07, "loss": 0.68648565, "num_input_tokens_seen": 308976665, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69140625, "step": 14325, "time_per_iteration": 3.089118719100952 }, { "auxiliary_loss_clip": 0.01113896, "auxiliary_loss_mlp": 0.01034561, "balance_loss_clip": 1.0214721, "balance_loss_mlp": 1.03442585, "epoch": 0.8613257177213287, "flos": 22492720788480.0, "grad_norm": 2.486294911137563, "language_loss": 0.6499567, "learning_rate": 1.8679052969546948e-07, "loss": 0.67144132, "num_input_tokens_seen": 308997015, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 14326, "time_per_iteration": 3.1815409660339355 }, { "auxiliary_loss_clip": 0.01121484, "auxiliary_loss_mlp": 0.01031895, "balance_loss_clip": 1.01899672, "balance_loss_mlp": 1.03412247, "epoch": 0.8613858409739967, "flos": 17238056175360.0, "grad_norm": 2.1300214891875866, "language_loss": 0.84437513, "learning_rate": 1.8663112825652294e-07, "loss": 0.86590892, "num_input_tokens_seen": 309015250, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 14327, "time_per_iteration": 2.954130172729492 }, { "auxiliary_loss_clip": 0.01112145, "auxiliary_loss_mlp": 0.01032551, "balance_loss_clip": 1.02005827, "balance_loss_mlp": 1.03409767, "epoch": 0.8614459642266646, "flos": 22821123859200.0, "grad_norm": 1.8786033412157022, "language_loss": 0.75409317, "learning_rate": 1.8647179153204706e-07, "loss": 0.77554017, "num_input_tokens_seen": 309034140, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 14328, "time_per_iteration": 3.061985969543457 }, { "auxiliary_loss_clip": 0.01110587, "auxiliary_loss_mlp": 0.01028786, "balance_loss_clip": 1.0164237, "balance_loss_mlp": 1.03325129, "epoch": 0.8615060874793327, "flos": 26504301473280.0, "grad_norm": 1.6106672049891178, "language_loss": 0.8014667, "learning_rate": 1.8631251952772774e-07, "loss": 0.82286042, "num_input_tokens_seen": 309055075, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.68359375, "step": 14329, "time_per_iteration": 3.143353223800659 }, { "auxiliary_loss_clip": 0.01121658, "auxiliary_loss_mlp": 0.01028727, "balance_loss_clip": 1.01613832, "balance_loss_mlp": 1.03412211, "epoch": 0.8615662107320006, "flos": 24061011477120.0, "grad_norm": 2.14428652717372, "language_loss": 0.77033669, "learning_rate": 1.861533122492498e-07, "loss": 0.79184055, "num_input_tokens_seen": 309074650, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 14330, "time_per_iteration": 3.063910722732544 }, { "auxiliary_loss_clip": 0.01112012, "auxiliary_loss_mlp": 0.01029928, "balance_loss_clip": 1.01721406, "balance_loss_mlp": 1.03384686, "epoch": 0.8616263339846686, "flos": 24165044242560.0, "grad_norm": 2.2631098879749785, "language_loss": 0.65032542, "learning_rate": 1.8599416970229354e-07, "loss": 0.67174482, "num_input_tokens_seen": 309094385, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 14331, "time_per_iteration": 3.242399215698242 }, { "auxiliary_loss_clip": 0.01120654, "auxiliary_loss_mlp": 0.01036076, "balance_loss_clip": 1.0248822, "balance_loss_mlp": 1.03506005, "epoch": 0.8616864572373365, "flos": 21724340025600.0, "grad_norm": 1.6001983391684633, "language_loss": 0.76196313, "learning_rate": 1.8583509189253977e-07, "loss": 0.78353047, "num_input_tokens_seen": 309111815, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6796875, "step": 14332, "time_per_iteration": 3.1805617809295654 }, { "auxiliary_loss_clip": 0.01114581, "auxiliary_loss_mlp": 0.01030191, "balance_loss_clip": 1.01648784, "balance_loss_mlp": 1.03329992, "epoch": 0.8617465804900045, "flos": 23766651521280.0, "grad_norm": 1.9161087081945354, "language_loss": 0.67113775, "learning_rate": 1.8567607882566484e-07, "loss": 0.69258547, "num_input_tokens_seen": 309131385, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 14333, "time_per_iteration": 3.147280693054199 }, { "auxiliary_loss_clip": 0.01127275, "auxiliary_loss_mlp": 0.01036962, "balance_loss_clip": 1.02269304, "balance_loss_mlp": 1.03543448, "epoch": 0.8618067037426724, "flos": 37703941251840.0, "grad_norm": 2.5280641632678384, "language_loss": 0.62390429, "learning_rate": 1.8551713050734375e-07, "loss": 0.64554662, "num_input_tokens_seen": 309155020, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.73828125, "step": 14334, "time_per_iteration": 3.1970221996307373 }, { "auxiliary_loss_clip": 0.01135078, "auxiliary_loss_mlp": 0.01039726, "balance_loss_clip": 1.0269711, "balance_loss_mlp": 1.03750682, "epoch": 0.8618668269953405, "flos": 21471026336640.0, "grad_norm": 2.5870042183892803, "language_loss": 0.69152546, "learning_rate": 1.8535824694324885e-07, "loss": 0.71327353, "num_input_tokens_seen": 309172865, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 14335, "time_per_iteration": 2.7658519744873047 }, { "auxiliary_loss_clip": 0.01115485, "auxiliary_loss_mlp": 0.01032613, "balance_loss_clip": 1.01986396, "balance_loss_mlp": 1.03623128, "epoch": 0.8619269502480085, "flos": 22232691256320.0, "grad_norm": 1.4801428536004206, "language_loss": 0.82844937, "learning_rate": 1.8519942813904986e-07, "loss": 0.84993035, "num_input_tokens_seen": 309193575, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 14336, "time_per_iteration": 2.588883876800537 }, { "auxiliary_loss_clip": 0.01106478, "auxiliary_loss_mlp": 0.01034345, "balance_loss_clip": 1.02123165, "balance_loss_mlp": 1.03430915, "epoch": 0.8619870735006764, "flos": 22710626645760.0, "grad_norm": 2.006572967975887, "language_loss": 0.67604792, "learning_rate": 1.8504067410041579e-07, "loss": 0.69745612, "num_input_tokens_seen": 309212680, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 14337, "time_per_iteration": 2.5482017993927 }, { "auxiliary_loss_clip": 0.0112941, "auxiliary_loss_mlp": 0.0103096, "balance_loss_clip": 1.01847923, "balance_loss_mlp": 1.03413534, "epoch": 0.8620471967533444, "flos": 37520293991040.0, "grad_norm": 1.4853967879009193, "language_loss": 0.66997242, "learning_rate": 1.8488198483301076e-07, "loss": 0.69157612, "num_input_tokens_seen": 309234485, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 14338, "time_per_iteration": 2.7576985359191895 }, { "auxiliary_loss_clip": 0.0112337, "auxiliary_loss_mlp": 0.01028744, "balance_loss_clip": 1.01721025, "balance_loss_mlp": 1.03590405, "epoch": 0.8621073200060123, "flos": 19682459493120.0, "grad_norm": 1.890932983596505, "language_loss": 0.62490785, "learning_rate": 1.8472336034249914e-07, "loss": 0.64642906, "num_input_tokens_seen": 309253630, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.703125, "step": 14339, "time_per_iteration": 2.5617356300354004 }, { "auxiliary_loss_clip": 0.01119656, "auxiliary_loss_mlp": 0.01278639, "balance_loss_clip": 1.01956117, "balance_loss_mlp": 1.03337717, "epoch": 0.8621674432586803, "flos": 14536855549440.0, "grad_norm": 2.223940055494413, "language_loss": 0.62318677, "learning_rate": 1.8456480063454105e-07, "loss": 0.64716971, "num_input_tokens_seen": 309270950, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 14340, "time_per_iteration": 2.606227159500122 }, { "auxiliary_loss_clip": 0.01133141, "auxiliary_loss_mlp": 0.01024249, "balance_loss_clip": 1.01305556, "balance_loss_mlp": 1.03397667, "epoch": 0.8622275665113482, "flos": 20740100480640.0, "grad_norm": 1.6677205377228903, "language_loss": 0.8018086, "learning_rate": 1.8440630571479555e-07, "loss": 0.8233825, "num_input_tokens_seen": 309288780, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.63671875, "step": 14341, "time_per_iteration": 2.5678532123565674 }, { "auxiliary_loss_clip": 0.01110899, "auxiliary_loss_mlp": 0.01032296, "balance_loss_clip": 1.02037454, "balance_loss_mlp": 1.03436446, "epoch": 0.8622876897640163, "flos": 24715914197760.0, "grad_norm": 1.7073833864698873, "language_loss": 0.74568725, "learning_rate": 1.842478755889183e-07, "loss": 0.76711917, "num_input_tokens_seen": 309310875, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 14342, "time_per_iteration": 2.5392937660217285 }, { "auxiliary_loss_clip": 0.01130825, "auxiliary_loss_mlp": 0.01028883, "balance_loss_clip": 1.01590657, "balance_loss_mlp": 1.03435159, "epoch": 0.8623478130166842, "flos": 17457362663040.0, "grad_norm": 1.694821267093568, "language_loss": 0.68636304, "learning_rate": 1.8408951026256415e-07, "loss": 0.70796013, "num_input_tokens_seen": 309329900, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 14343, "time_per_iteration": 2.5992684364318848 }, { "auxiliary_loss_clip": 0.01112022, "auxiliary_loss_mlp": 0.01045227, "balance_loss_clip": 1.03127348, "balance_loss_mlp": 1.0332787, "epoch": 0.8624079362693522, "flos": 18109176814080.0, "grad_norm": 1.7967473629716506, "language_loss": 0.67922008, "learning_rate": 1.8393120974138433e-07, "loss": 0.70079255, "num_input_tokens_seen": 309347870, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.69921875, "step": 14344, "time_per_iteration": 2.49241304397583 }, { "auxiliary_loss_clip": 0.01133512, "auxiliary_loss_mlp": 0.01040851, "balance_loss_clip": 1.02793479, "balance_loss_mlp": 1.03508425, "epoch": 0.8624680595220201, "flos": 30666455971200.0, "grad_norm": 1.383285644910612, "language_loss": 0.81420672, "learning_rate": 1.8377297403102809e-07, "loss": 0.83595037, "num_input_tokens_seen": 309371695, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 14345, "time_per_iteration": 2.634725332260132 }, { "auxiliary_loss_clip": 0.01116739, "auxiliary_loss_mlp": 0.01035631, "balance_loss_clip": 1.02241683, "balance_loss_mlp": 1.03641903, "epoch": 0.8625281827746881, "flos": 37998588516480.0, "grad_norm": 2.5466264771822917, "language_loss": 0.50173116, "learning_rate": 1.8361480313714207e-07, "loss": 0.52325487, "num_input_tokens_seen": 309394645, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 14346, "time_per_iteration": 4.042406797409058 }, { "auxiliary_loss_clip": 0.0115589, "auxiliary_loss_mlp": 0.01033215, "balance_loss_clip": 1.02053738, "balance_loss_mlp": 1.03323281, "epoch": 0.862588306027356, "flos": 26249730808320.0, "grad_norm": 1.461932479369142, "language_loss": 0.74790221, "learning_rate": 1.8345669706537214e-07, "loss": 0.76979327, "num_input_tokens_seen": 309413170, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 14347, "time_per_iteration": 2.6421725749969482 }, { "auxiliary_loss_clip": 0.01124028, "auxiliary_loss_mlp": 0.01028573, "balance_loss_clip": 1.01547766, "balance_loss_mlp": 1.03687549, "epoch": 0.862648429280024, "flos": 25878809013120.0, "grad_norm": 1.9071947101953222, "language_loss": 0.80722803, "learning_rate": 1.832986558213594e-07, "loss": 0.82875401, "num_input_tokens_seen": 309431315, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 14348, "time_per_iteration": 2.5746428966522217 }, { "auxiliary_loss_clip": 0.01106919, "auxiliary_loss_mlp": 0.01028941, "balance_loss_clip": 1.01701403, "balance_loss_mlp": 1.03176498, "epoch": 0.862708552532692, "flos": 37816413713280.0, "grad_norm": 1.6088266368989346, "language_loss": 0.6604135, "learning_rate": 1.8314067941074484e-07, "loss": 0.68177211, "num_input_tokens_seen": 309453020, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6640625, "step": 14349, "time_per_iteration": 2.6947507858276367 }, { "auxiliary_loss_clip": 0.01111894, "auxiliary_loss_mlp": 0.01036611, "balance_loss_clip": 1.02409387, "balance_loss_mlp": 1.03440499, "epoch": 0.86276867578536, "flos": 19091800247040.0, "grad_norm": 4.518871315544904, "language_loss": 0.69257337, "learning_rate": 1.8298276783916623e-07, "loss": 0.7140584, "num_input_tokens_seen": 309469780, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 14350, "time_per_iteration": 2.533257484436035 }, { "auxiliary_loss_clip": 0.01120617, "auxiliary_loss_mlp": 0.01033756, "balance_loss_clip": 1.021209, "balance_loss_mlp": 1.03378606, "epoch": 0.862828799038028, "flos": 22164281804160.0, "grad_norm": 2.208507535207266, "language_loss": 0.76791644, "learning_rate": 1.828249211122579e-07, "loss": 0.78946018, "num_input_tokens_seen": 309489610, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 14351, "time_per_iteration": 2.868802309036255 }, { "auxiliary_loss_clip": 0.01111203, "auxiliary_loss_mlp": 0.01026647, "balance_loss_clip": 1.0152266, "balance_loss_mlp": 1.03370595, "epoch": 0.8628889222906959, "flos": 23145576433920.0, "grad_norm": 2.0484930434171833, "language_loss": 0.84652305, "learning_rate": 1.8266713923565423e-07, "loss": 0.86790156, "num_input_tokens_seen": 309508295, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 14352, "time_per_iteration": 4.0513670444488525 }, { "auxiliary_loss_clip": 0.01032082, "auxiliary_loss_mlp": 0.01002221, "balance_loss_clip": 1.00093961, "balance_loss_mlp": 1.00186634, "epoch": 0.8629490455433639, "flos": 57817762151040.0, "grad_norm": 0.790709853084732, "language_loss": 0.61978149, "learning_rate": 1.8250942221498544e-07, "loss": 0.64012456, "num_input_tokens_seen": 309567960, "router_z_loss_clip": 0.01281738, "router_z_loss_mlp": 0.21289062, "step": 14353, "time_per_iteration": 3.179760694503784 }, { "auxiliary_loss_clip": 0.01102876, "auxiliary_loss_mlp": 0.01028347, "balance_loss_clip": 1.01676631, "balance_loss_mlp": 1.03563237, "epoch": 0.8630091687960318, "flos": 15919667383680.0, "grad_norm": 1.6837904437093805, "language_loss": 0.82054889, "learning_rate": 1.8235177005588075e-07, "loss": 0.84186113, "num_input_tokens_seen": 309586050, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 14354, "time_per_iteration": 3.2623960971832275 }, { "auxiliary_loss_clip": 0.01121211, "auxiliary_loss_mlp": 0.01027608, "balance_loss_clip": 1.0148766, "balance_loss_mlp": 1.03401375, "epoch": 0.8630692920486999, "flos": 17961691570560.0, "grad_norm": 2.443574667832704, "language_loss": 0.85541606, "learning_rate": 1.8219418276396524e-07, "loss": 0.87690431, "num_input_tokens_seen": 309602910, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 14355, "time_per_iteration": 3.3588733673095703 }, { "auxiliary_loss_clip": 0.01130728, "auxiliary_loss_mlp": 0.01030343, "balance_loss_clip": 1.01721251, "balance_loss_mlp": 1.03301728, "epoch": 0.8631294153013678, "flos": 22455158140800.0, "grad_norm": 1.9285647247064002, "language_loss": 0.58709574, "learning_rate": 1.8203666034486352e-07, "loss": 0.60870647, "num_input_tokens_seen": 309621175, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 14356, "time_per_iteration": 3.3004839420318604 }, { "auxiliary_loss_clip": 0.0114106, "auxiliary_loss_mlp": 0.01034836, "balance_loss_clip": 1.02077496, "balance_loss_mlp": 1.03457224, "epoch": 0.8631895385540358, "flos": 16837005847680.0, "grad_norm": 1.6127211117249864, "language_loss": 0.77168381, "learning_rate": 1.8187920280419733e-07, "loss": 0.79344273, "num_input_tokens_seen": 309639395, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71484375, "step": 14357, "time_per_iteration": 3.2462875843048096 }, { "auxiliary_loss_clip": 0.01114228, "auxiliary_loss_mlp": 0.01031477, "balance_loss_clip": 1.01857281, "balance_loss_mlp": 1.03537154, "epoch": 0.8632496618067037, "flos": 18697214367360.0, "grad_norm": 2.2702999273830184, "language_loss": 0.7800045, "learning_rate": 1.8172181014758548e-07, "loss": 0.80146158, "num_input_tokens_seen": 309657265, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 14358, "time_per_iteration": 3.2132081985473633 }, { "auxiliary_loss_clip": 0.01131522, "auxiliary_loss_mlp": 0.01027795, "balance_loss_clip": 1.01486707, "balance_loss_mlp": 1.03419363, "epoch": 0.8633097850593717, "flos": 24279922915200.0, "grad_norm": 1.73541488402772, "language_loss": 0.75351298, "learning_rate": 1.815644823806448e-07, "loss": 0.77510613, "num_input_tokens_seen": 309678610, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 14359, "time_per_iteration": 5.524081230163574 }, { "auxiliary_loss_clip": 0.01142371, "auxiliary_loss_mlp": 0.01028279, "balance_loss_clip": 1.01581538, "balance_loss_mlp": 1.03189993, "epoch": 0.8633699083120396, "flos": 21178569801600.0, "grad_norm": 1.921463032248089, "language_loss": 0.70440018, "learning_rate": 1.8140721950899041e-07, "loss": 0.7261067, "num_input_tokens_seen": 309697710, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6640625, "step": 14360, "time_per_iteration": 3.022082567214966 }, { "auxiliary_loss_clip": 0.01133534, "auxiliary_loss_mlp": 0.01033193, "balance_loss_clip": 1.01957953, "balance_loss_mlp": 1.03604913, "epoch": 0.8634300315647077, "flos": 19244888012160.0, "grad_norm": 1.9713117387094417, "language_loss": 0.76398987, "learning_rate": 1.8125002153823444e-07, "loss": 0.78565717, "num_input_tokens_seen": 309715985, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 14361, "time_per_iteration": 5.236161708831787 }, { "auxiliary_loss_clip": 0.0102372, "auxiliary_loss_mlp": 0.01000528, "balance_loss_clip": 0.99931824, "balance_loss_mlp": 1.0024364, "epoch": 0.8634901548173756, "flos": 66195648282240.0, "grad_norm": 0.9496195485762184, "language_loss": 0.58934748, "learning_rate": 1.8109288847398662e-07, "loss": 0.60958993, "num_input_tokens_seen": 309779930, "router_z_loss_clip": 0.01208496, "router_z_loss_mlp": 0.21289062, "step": 14362, "time_per_iteration": 3.6366071701049805 }, { "auxiliary_loss_clip": 0.0115939, "auxiliary_loss_mlp": 0.01031437, "balance_loss_clip": 1.01781762, "balance_loss_mlp": 1.03513479, "epoch": 0.8635502780700436, "flos": 24789531121920.0, "grad_norm": 1.6108125373560669, "language_loss": 0.80672896, "learning_rate": 1.8093582032185471e-07, "loss": 0.8286373, "num_input_tokens_seen": 309800580, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 14363, "time_per_iteration": 3.3300888538360596 }, { "auxiliary_loss_clip": 0.01123307, "auxiliary_loss_mlp": 0.01037248, "balance_loss_clip": 1.02385521, "balance_loss_mlp": 1.03419387, "epoch": 0.8636104013227116, "flos": 25189970918400.0, "grad_norm": 2.270792109647762, "language_loss": 0.72489583, "learning_rate": 1.807788170874449e-07, "loss": 0.74650139, "num_input_tokens_seen": 309821725, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 14364, "time_per_iteration": 2.961488723754883 }, { "auxiliary_loss_clip": 0.01123554, "auxiliary_loss_mlp": 0.0103048, "balance_loss_clip": 1.01874948, "balance_loss_mlp": 1.03452873, "epoch": 0.8636705245753795, "flos": 23878441624320.0, "grad_norm": 2.398281189032434, "language_loss": 0.71707165, "learning_rate": 1.8062187877635892e-07, "loss": 0.73861206, "num_input_tokens_seen": 309841565, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.7109375, "step": 14365, "time_per_iteration": 3.029702663421631 }, { "auxiliary_loss_clip": 0.01126753, "auxiliary_loss_mlp": 0.01035903, "balance_loss_clip": 1.02266467, "balance_loss_mlp": 1.03758955, "epoch": 0.8637306478280475, "flos": 23110455911040.0, "grad_norm": 2.176258553311648, "language_loss": 0.71312195, "learning_rate": 1.8046500539419784e-07, "loss": 0.73474848, "num_input_tokens_seen": 309858635, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 14366, "time_per_iteration": 3.079782247543335 }, { "auxiliary_loss_clip": 0.01119542, "auxiliary_loss_mlp": 0.01028229, "balance_loss_clip": 1.01695228, "balance_loss_mlp": 1.03486836, "epoch": 0.8637907710807154, "flos": 21906802137600.0, "grad_norm": 1.8216782232744957, "language_loss": 0.82020617, "learning_rate": 1.803081969465612e-07, "loss": 0.84168386, "num_input_tokens_seen": 309877885, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.66796875, "step": 14367, "time_per_iteration": 3.0946943759918213 }, { "auxiliary_loss_clip": 0.01123129, "auxiliary_loss_mlp": 0.01028221, "balance_loss_clip": 1.01436877, "balance_loss_mlp": 1.03478479, "epoch": 0.8638508943333835, "flos": 23580526222080.0, "grad_norm": 1.9910038191228203, "language_loss": 0.61665869, "learning_rate": 1.8015145343904337e-07, "loss": 0.63817215, "num_input_tokens_seen": 309893140, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.703125, "step": 14368, "time_per_iteration": 3.3223602771759033 }, { "auxiliary_loss_clip": 0.0111483, "auxiliary_loss_mlp": 0.01029442, "balance_loss_clip": 1.01712823, "balance_loss_mlp": 1.03734183, "epoch": 0.8639110175860514, "flos": 25775853655680.0, "grad_norm": 2.2344753093504903, "language_loss": 0.76273668, "learning_rate": 1.7999477487723925e-07, "loss": 0.78417933, "num_input_tokens_seen": 309914175, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 14369, "time_per_iteration": 3.0986104011535645 }, { "auxiliary_loss_clip": 0.01123683, "auxiliary_loss_mlp": 0.01034973, "balance_loss_clip": 1.02154458, "balance_loss_mlp": 1.03409958, "epoch": 0.8639711408387194, "flos": 23369443948800.0, "grad_norm": 1.7078239402755173, "language_loss": 0.6410054, "learning_rate": 1.7983816126673968e-07, "loss": 0.66259193, "num_input_tokens_seen": 309932395, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 14370, "time_per_iteration": 3.089662551879883 }, { "auxiliary_loss_clip": 0.01149635, "auxiliary_loss_mlp": 0.01031978, "balance_loss_clip": 1.01929998, "balance_loss_mlp": 1.03622937, "epoch": 0.8640312640913873, "flos": 22127221946880.0, "grad_norm": 1.7617017952238936, "language_loss": 0.66637194, "learning_rate": 1.7968161261313486e-07, "loss": 0.68818808, "num_input_tokens_seen": 309951720, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 14371, "time_per_iteration": 3.2336156368255615 }, { "auxiliary_loss_clip": 0.01040877, "auxiliary_loss_mlp": 0.01003592, "balance_loss_clip": 1.00232875, "balance_loss_mlp": 1.00221741, "epoch": 0.8640913873440553, "flos": 57571735944960.0, "grad_norm": 0.8606899560034846, "language_loss": 0.5690155, "learning_rate": 1.795251289220101e-07, "loss": 0.58946025, "num_input_tokens_seen": 310006120, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.21484375, "step": 14372, "time_per_iteration": 3.2559192180633545 }, { "auxiliary_loss_clip": 0.01102107, "auxiliary_loss_mlp": 0.0103123, "balance_loss_clip": 1.01843262, "balance_loss_mlp": 1.03562009, "epoch": 0.8641515105967232, "flos": 23987430466560.0, "grad_norm": 1.7206542059385814, "language_loss": 0.79409134, "learning_rate": 1.7936871019895115e-07, "loss": 0.81542468, "num_input_tokens_seen": 310026740, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6640625, "step": 14373, "time_per_iteration": 2.498201370239258 }, { "auxiliary_loss_clip": 0.01104072, "auxiliary_loss_mlp": 0.010279, "balance_loss_clip": 1.01563919, "balance_loss_mlp": 1.03438795, "epoch": 0.8642116338493913, "flos": 19062749122560.0, "grad_norm": 1.947282662156976, "language_loss": 0.63592309, "learning_rate": 1.7921235644953957e-07, "loss": 0.65724277, "num_input_tokens_seen": 310044135, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 14374, "time_per_iteration": 2.446321487426758 }, { "auxiliary_loss_clip": 0.01120288, "auxiliary_loss_mlp": 0.01035829, "balance_loss_clip": 1.02364039, "balance_loss_mlp": 1.03456855, "epoch": 0.8642717571020592, "flos": 18254148105600.0, "grad_norm": 1.698967429078856, "language_loss": 0.77625084, "learning_rate": 1.7905606767935554e-07, "loss": 0.79781204, "num_input_tokens_seen": 310061560, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 14375, "time_per_iteration": 2.488859176635742 }, { "auxiliary_loss_clip": 0.01139369, "auxiliary_loss_mlp": 0.01281679, "balance_loss_clip": 1.02199197, "balance_loss_mlp": 1.03480518, "epoch": 0.8643318803547272, "flos": 57663270777600.0, "grad_norm": 2.6006925791443787, "language_loss": 0.60698545, "learning_rate": 1.7889984389397594e-07, "loss": 0.63119596, "num_input_tokens_seen": 310087310, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 14376, "time_per_iteration": 2.896735429763794 }, { "auxiliary_loss_clip": 0.01128599, "auxiliary_loss_mlp": 0.01033713, "balance_loss_clip": 1.02220345, "balance_loss_mlp": 1.03400266, "epoch": 0.8643920036073952, "flos": 19609524927360.0, "grad_norm": 1.4810591914356506, "language_loss": 0.66529, "learning_rate": 1.787436850989772e-07, "loss": 0.68691313, "num_input_tokens_seen": 310106260, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.67578125, "step": 14377, "time_per_iteration": 2.537412643432617 }, { "auxiliary_loss_clip": 0.01122086, "auxiliary_loss_mlp": 0.01033032, "balance_loss_clip": 1.02043748, "balance_loss_mlp": 1.03355217, "epoch": 0.8644521268600631, "flos": 20850346298880.0, "grad_norm": 2.048859442899959, "language_loss": 0.7002297, "learning_rate": 1.7858759129993083e-07, "loss": 0.7217809, "num_input_tokens_seen": 310125305, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 14378, "time_per_iteration": 2.635589122772217 }, { "auxiliary_loss_clip": 0.01032042, "auxiliary_loss_mlp": 0.01002108, "balance_loss_clip": 1.00083244, "balance_loss_mlp": 1.00197256, "epoch": 0.8645122501127311, "flos": 66719550101760.0, "grad_norm": 0.8017931479747935, "language_loss": 0.6035217, "learning_rate": 1.7843156250240843e-07, "loss": 0.62386322, "num_input_tokens_seen": 310189270, "router_z_loss_clip": 0.01275635, "router_z_loss_mlp": 0.21386719, "step": 14379, "time_per_iteration": 3.1540334224700928 }, { "auxiliary_loss_clip": 0.01119886, "auxiliary_loss_mlp": 0.0103144, "balance_loss_clip": 1.01950717, "balance_loss_mlp": 1.03413296, "epoch": 0.864572373365399, "flos": 21690009601920.0, "grad_norm": 1.7121403803362854, "language_loss": 0.74747789, "learning_rate": 1.7827559871197794e-07, "loss": 0.76899117, "num_input_tokens_seen": 310208395, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 14380, "time_per_iteration": 2.5849287509918213 }, { "auxiliary_loss_clip": 0.01123386, "auxiliary_loss_mlp": 0.01032108, "balance_loss_clip": 1.01971042, "balance_loss_mlp": 1.03381586, "epoch": 0.8646324966180671, "flos": 20266402896000.0, "grad_norm": 2.062752946974774, "language_loss": 0.75239402, "learning_rate": 1.7811969993420516e-07, "loss": 0.77394897, "num_input_tokens_seen": 310227415, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 14381, "time_per_iteration": 2.5690531730651855 }, { "auxiliary_loss_clip": 0.01119464, "auxiliary_loss_mlp": 0.01033605, "balance_loss_clip": 1.02166617, "balance_loss_mlp": 1.03438032, "epoch": 0.864692619870735, "flos": 11946188050560.0, "grad_norm": 2.3595379545417163, "language_loss": 0.84277034, "learning_rate": 1.779638661746541e-07, "loss": 0.86430103, "num_input_tokens_seen": 310242625, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 14382, "time_per_iteration": 2.5057249069213867 }, { "auxiliary_loss_clip": 0.01114772, "auxiliary_loss_mlp": 0.01034625, "balance_loss_clip": 1.02138674, "balance_loss_mlp": 1.03557062, "epoch": 0.864752743123403, "flos": 21470703114240.0, "grad_norm": 1.903677738327644, "language_loss": 0.75943291, "learning_rate": 1.7780809743888536e-07, "loss": 0.78092688, "num_input_tokens_seen": 310260585, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 14383, "time_per_iteration": 2.5571281909942627 }, { "auxiliary_loss_clip": 0.01121097, "auxiliary_loss_mlp": 0.01031482, "balance_loss_clip": 1.01944804, "balance_loss_mlp": 1.0364747, "epoch": 0.8648128663760709, "flos": 19530018172800.0, "grad_norm": 1.6505984909302558, "language_loss": 0.84947395, "learning_rate": 1.77652393732459e-07, "loss": 0.87099975, "num_input_tokens_seen": 310277210, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66796875, "step": 14384, "time_per_iteration": 2.4759232997894287 }, { "auxiliary_loss_clip": 0.01108195, "auxiliary_loss_mlp": 0.01029743, "balance_loss_clip": 1.01756597, "balance_loss_mlp": 1.0322938, "epoch": 0.8648729896287389, "flos": 21287953693440.0, "grad_norm": 1.6272146780468533, "language_loss": 0.80864489, "learning_rate": 1.7749675506093052e-07, "loss": 0.83002424, "num_input_tokens_seen": 310296610, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.67578125, "step": 14385, "time_per_iteration": 2.5033085346221924 }, { "auxiliary_loss_clip": 0.01126522, "auxiliary_loss_mlp": 0.01034274, "balance_loss_clip": 1.02137566, "balance_loss_mlp": 1.03732789, "epoch": 0.8649331128814068, "flos": 24604483230720.0, "grad_norm": 2.661373111479312, "language_loss": 0.72431481, "learning_rate": 1.773411814298551e-07, "loss": 0.74592268, "num_input_tokens_seen": 310316830, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 14386, "time_per_iteration": 2.5280752182006836 }, { "auxiliary_loss_clip": 0.01112127, "auxiliary_loss_mlp": 0.01035345, "balance_loss_clip": 1.023072, "balance_loss_mlp": 1.0348022, "epoch": 0.8649932361340749, "flos": 15377811742080.0, "grad_norm": 2.0233048201597787, "language_loss": 0.82140929, "learning_rate": 1.7718567284478403e-07, "loss": 0.842884, "num_input_tokens_seen": 310334355, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 14387, "time_per_iteration": 2.472724199295044 }, { "auxiliary_loss_clip": 0.01099669, "auxiliary_loss_mlp": 0.01027251, "balance_loss_clip": 1.01577091, "balance_loss_mlp": 1.03383636, "epoch": 0.8650533593867428, "flos": 19901227276800.0, "grad_norm": 2.275786393065253, "language_loss": 0.68405426, "learning_rate": 1.770302293112682e-07, "loss": 0.70532346, "num_input_tokens_seen": 310352900, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66015625, "step": 14388, "time_per_iteration": 3.8630518913269043 }, { "auxiliary_loss_clip": 0.01125505, "auxiliary_loss_mlp": 0.01033605, "balance_loss_clip": 1.02243471, "balance_loss_mlp": 1.03398538, "epoch": 0.8651134826394108, "flos": 25626931868160.0, "grad_norm": 1.5735326836010572, "language_loss": 0.90521741, "learning_rate": 1.7687485083485366e-07, "loss": 0.92680848, "num_input_tokens_seen": 310372855, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.65625, "step": 14389, "time_per_iteration": 2.5804495811462402 }, { "auxiliary_loss_clip": 0.01137497, "auxiliary_loss_mlp": 0.01276084, "balance_loss_clip": 1.01762223, "balance_loss_mlp": 1.03502536, "epoch": 0.8651736058920788, "flos": 18734525619840.0, "grad_norm": 1.5343455101534311, "language_loss": 0.70702112, "learning_rate": 1.7671953742108636e-07, "loss": 0.73115695, "num_input_tokens_seen": 310391595, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.67578125, "step": 14390, "time_per_iteration": 2.5997726917266846 }, { "auxiliary_loss_clip": 0.01104528, "auxiliary_loss_mlp": 0.01037718, "balance_loss_clip": 1.02490938, "balance_loss_mlp": 1.03421474, "epoch": 0.8652337291447467, "flos": 20776765288320.0, "grad_norm": 1.5512918751501736, "language_loss": 0.8226282, "learning_rate": 1.7656428907550879e-07, "loss": 0.84405065, "num_input_tokens_seen": 310410090, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 14391, "time_per_iteration": 2.64277982711792 }, { "auxiliary_loss_clip": 0.01050278, "auxiliary_loss_mlp": 0.01003416, "balance_loss_clip": 1.0020746, "balance_loss_mlp": 1.00208783, "epoch": 0.8652938523974147, "flos": 64227887464320.0, "grad_norm": 0.7935089554599576, "language_loss": 0.68078429, "learning_rate": 1.764091058036612e-07, "loss": 0.70132118, "num_input_tokens_seen": 310470055, "router_z_loss_clip": 0.01342773, "router_z_loss_mlp": 0.21386719, "step": 14392, "time_per_iteration": 3.106154680252075 }, { "auxiliary_loss_clip": 0.01116348, "auxiliary_loss_mlp": 0.010307, "balance_loss_clip": 1.01738465, "balance_loss_mlp": 1.03496218, "epoch": 0.8653539756500827, "flos": 18040587793920.0, "grad_norm": 2.091756531579889, "language_loss": 0.75965178, "learning_rate": 1.7625398761108135e-07, "loss": 0.78112227, "num_input_tokens_seen": 310487665, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 14393, "time_per_iteration": 3.895092725753784 }, { "auxiliary_loss_clip": 0.01121461, "auxiliary_loss_mlp": 0.01030906, "balance_loss_clip": 1.01858568, "balance_loss_mlp": 1.0328126, "epoch": 0.8654140989027507, "flos": 17382416935680.0, "grad_norm": 1.7854943117767026, "language_loss": 0.73666131, "learning_rate": 1.7609893450330636e-07, "loss": 0.75818497, "num_input_tokens_seen": 310506130, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 14394, "time_per_iteration": 2.7949626445770264 }, { "auxiliary_loss_clip": 0.01114482, "auxiliary_loss_mlp": 0.01032522, "balance_loss_clip": 1.01954627, "balance_loss_mlp": 1.03474212, "epoch": 0.8654742221554186, "flos": 53284862448000.0, "grad_norm": 2.081198551283567, "language_loss": 0.65029073, "learning_rate": 1.759439464858683e-07, "loss": 0.6717608, "num_input_tokens_seen": 310532445, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 14395, "time_per_iteration": 2.824122428894043 }, { "auxiliary_loss_clip": 0.01111276, "auxiliary_loss_mlp": 0.01033694, "balance_loss_clip": 1.02173162, "balance_loss_mlp": 1.03471351, "epoch": 0.8655343454080866, "flos": 23914711382400.0, "grad_norm": 1.8124859114597343, "language_loss": 0.68341315, "learning_rate": 1.7578902356429848e-07, "loss": 0.70486283, "num_input_tokens_seen": 310552300, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 14396, "time_per_iteration": 2.536745309829712 }, { "auxiliary_loss_clip": 0.01116359, "auxiliary_loss_mlp": 0.01035873, "balance_loss_clip": 1.02302217, "balance_loss_mlp": 1.03634191, "epoch": 0.8655944686607545, "flos": 24097209408000.0, "grad_norm": 2.4328031887864454, "language_loss": 0.69557965, "learning_rate": 1.7563416574412627e-07, "loss": 0.71710193, "num_input_tokens_seen": 310572710, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 14397, "time_per_iteration": 2.761536121368408 }, { "auxiliary_loss_clip": 0.01119379, "auxiliary_loss_mlp": 0.01029138, "balance_loss_clip": 1.01700258, "balance_loss_mlp": 1.03239512, "epoch": 0.8656545919134225, "flos": 23112718467840.0, "grad_norm": 2.0753190713822196, "language_loss": 0.63317454, "learning_rate": 1.754793730308779e-07, "loss": 0.65465963, "num_input_tokens_seen": 310592460, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 14398, "time_per_iteration": 2.7460317611694336 }, { "auxiliary_loss_clip": 0.01131786, "auxiliary_loss_mlp": 0.01030199, "balance_loss_clip": 1.01812279, "balance_loss_mlp": 1.03591359, "epoch": 0.8657147151660904, "flos": 21141761339520.0, "grad_norm": 1.934448491713469, "language_loss": 0.76129597, "learning_rate": 1.753246454300774e-07, "loss": 0.78291583, "num_input_tokens_seen": 310609375, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 14399, "time_per_iteration": 2.6424317359924316 }, { "auxiliary_loss_clip": 0.01110846, "auxiliary_loss_mlp": 0.01028204, "balance_loss_clip": 1.01657557, "balance_loss_mlp": 1.03382707, "epoch": 0.8657748384187585, "flos": 16289439943680.0, "grad_norm": 12.098384520934786, "language_loss": 0.93203753, "learning_rate": 1.7516998294724638e-07, "loss": 0.95342803, "num_input_tokens_seen": 310627405, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 14400, "time_per_iteration": 4.046866416931152 }, { "auxiliary_loss_clip": 0.01150322, "auxiliary_loss_mlp": 0.01035968, "balance_loss_clip": 1.02379632, "balance_loss_mlp": 1.03566003, "epoch": 0.8658349616714264, "flos": 30843890179200.0, "grad_norm": 1.8683189500754247, "language_loss": 0.67485267, "learning_rate": 1.750153855879053e-07, "loss": 0.69671559, "num_input_tokens_seen": 310649945, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 14401, "time_per_iteration": 2.6745150089263916 }, { "auxiliary_loss_clip": 0.01113275, "auxiliary_loss_mlp": 0.01027015, "balance_loss_clip": 1.01442695, "balance_loss_mlp": 1.03352022, "epoch": 0.8658950849240944, "flos": 18952862440320.0, "grad_norm": 1.8617212753495866, "language_loss": 0.73780894, "learning_rate": 1.7486085335756995e-07, "loss": 0.75921184, "num_input_tokens_seen": 310668285, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 14402, "time_per_iteration": 4.244922399520874 }, { "auxiliary_loss_clip": 0.01154198, "auxiliary_loss_mlp": 0.0103335, "balance_loss_clip": 1.02175713, "balance_loss_mlp": 1.03336358, "epoch": 0.8659552081767624, "flos": 23364344217600.0, "grad_norm": 1.6355547793466332, "language_loss": 0.74974096, "learning_rate": 1.7470638626175637e-07, "loss": 0.77161646, "num_input_tokens_seen": 310687015, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 14403, "time_per_iteration": 2.629912853240967 }, { "auxiliary_loss_clip": 0.01124078, "auxiliary_loss_mlp": 0.01032045, "balance_loss_clip": 1.0191642, "balance_loss_mlp": 1.03615952, "epoch": 0.8660153314294303, "flos": 23841992298240.0, "grad_norm": 1.9104327413109765, "language_loss": 0.73212212, "learning_rate": 1.7455198430597618e-07, "loss": 0.75368333, "num_input_tokens_seen": 310707580, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 14404, "time_per_iteration": 2.629403591156006 }, { "auxiliary_loss_clip": 0.01129726, "auxiliary_loss_mlp": 0.01034492, "balance_loss_clip": 1.02136123, "balance_loss_mlp": 1.03573036, "epoch": 0.8660754546820983, "flos": 23112467072640.0, "grad_norm": 1.6033376678902906, "language_loss": 0.70478296, "learning_rate": 1.7439764749574092e-07, "loss": 0.72642517, "num_input_tokens_seen": 310727300, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.671875, "step": 14405, "time_per_iteration": 2.6104233264923096 }, { "auxiliary_loss_clip": 0.01115059, "auxiliary_loss_mlp": 0.01281025, "balance_loss_clip": 1.0212338, "balance_loss_mlp": 1.03461087, "epoch": 0.8661355779347663, "flos": 14391991998720.0, "grad_norm": 3.1824020724444364, "language_loss": 0.66441357, "learning_rate": 1.7424337583655691e-07, "loss": 0.6883744, "num_input_tokens_seen": 310744935, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 14406, "time_per_iteration": 2.6221179962158203 }, { "auxiliary_loss_clip": 0.01119995, "auxiliary_loss_mlp": 0.01025238, "balance_loss_clip": 1.01390696, "balance_loss_mlp": 1.03407335, "epoch": 0.8661957011874343, "flos": 21870137329920.0, "grad_norm": 1.5212928698860426, "language_loss": 0.83166218, "learning_rate": 1.7408916933393059e-07, "loss": 0.85311449, "num_input_tokens_seen": 310765085, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.68359375, "step": 14407, "time_per_iteration": 2.656374454498291 }, { "auxiliary_loss_clip": 0.01112771, "auxiliary_loss_mlp": 0.01035334, "balance_loss_clip": 1.02276933, "balance_loss_mlp": 1.03607035, "epoch": 0.8662558244401022, "flos": 21835160461440.0, "grad_norm": 1.7810301048184085, "language_loss": 0.70001632, "learning_rate": 1.7393502799336514e-07, "loss": 0.72149736, "num_input_tokens_seen": 310783260, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 14408, "time_per_iteration": 2.5716986656188965 }, { "auxiliary_loss_clip": 0.01106937, "auxiliary_loss_mlp": 0.01031116, "balance_loss_clip": 1.01993418, "balance_loss_mlp": 1.03231168, "epoch": 0.8663159476927702, "flos": 17384104874880.0, "grad_norm": 2.053317026356081, "language_loss": 0.7728405, "learning_rate": 1.7378095182036146e-07, "loss": 0.79422104, "num_input_tokens_seen": 310801970, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.65625, "step": 14409, "time_per_iteration": 2.598571300506592 }, { "auxiliary_loss_clip": 0.01117566, "auxiliary_loss_mlp": 0.01031745, "balance_loss_clip": 1.01772571, "balance_loss_mlp": 1.03701198, "epoch": 0.8663760709454381, "flos": 22747722416640.0, "grad_norm": 1.7799210035175248, "language_loss": 0.77021945, "learning_rate": 1.7362694082041784e-07, "loss": 0.79171252, "num_input_tokens_seen": 310822070, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71875, "step": 14410, "time_per_iteration": 2.5752182006835938 }, { "auxiliary_loss_clip": 0.01129897, "auxiliary_loss_mlp": 0.01033189, "balance_loss_clip": 1.02071381, "balance_loss_mlp": 1.03402948, "epoch": 0.8664361941981061, "flos": 17376850327680.0, "grad_norm": 1.9315818581107074, "language_loss": 0.77624267, "learning_rate": 1.734729949990308e-07, "loss": 0.79787362, "num_input_tokens_seen": 310838355, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 14411, "time_per_iteration": 2.54723858833313 }, { "auxiliary_loss_clip": 0.01118168, "auxiliary_loss_mlp": 0.01032968, "balance_loss_clip": 1.02110684, "balance_loss_mlp": 1.03186607, "epoch": 0.866496317450774, "flos": 16034438315520.0, "grad_norm": 1.9378515391782407, "language_loss": 0.7403127, "learning_rate": 1.7331911436169411e-07, "loss": 0.76182407, "num_input_tokens_seen": 310856055, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 14412, "time_per_iteration": 2.7227258682250977 }, { "auxiliary_loss_clip": 0.01108809, "auxiliary_loss_mlp": 0.01274171, "balance_loss_clip": 1.01672626, "balance_loss_mlp": 1.03398192, "epoch": 0.8665564407034421, "flos": 20814830726400.0, "grad_norm": 1.8105184494419697, "language_loss": 0.69682193, "learning_rate": 1.7316529891389897e-07, "loss": 0.72065169, "num_input_tokens_seen": 310876695, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.66015625, "step": 14413, "time_per_iteration": 2.843313694000244 }, { "auxiliary_loss_clip": 0.01112916, "auxiliary_loss_mlp": 0.01030663, "balance_loss_clip": 1.01785445, "balance_loss_mlp": 1.0356698, "epoch": 0.86661656395611, "flos": 15815167741440.0, "grad_norm": 2.0472586612521786, "language_loss": 0.62828815, "learning_rate": 1.7301154866113565e-07, "loss": 0.64972401, "num_input_tokens_seen": 310893880, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 14414, "time_per_iteration": 2.6231162548065186 }, { "auxiliary_loss_clip": 0.01131292, "auxiliary_loss_mlp": 0.01277138, "balance_loss_clip": 1.01817083, "balance_loss_mlp": 1.03590107, "epoch": 0.866676687208778, "flos": 23036910814080.0, "grad_norm": 1.655585048438213, "language_loss": 0.63731647, "learning_rate": 1.7285786360889064e-07, "loss": 0.66140074, "num_input_tokens_seen": 310914145, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 14415, "time_per_iteration": 2.650773525238037 }, { "auxiliary_loss_clip": 0.01049301, "auxiliary_loss_mlp": 0.01001372, "balance_loss_clip": 1.00010824, "balance_loss_mlp": 1.00174594, "epoch": 0.866736810461446, "flos": 63802275212160.0, "grad_norm": 0.9368815792688514, "language_loss": 0.60345984, "learning_rate": 1.7270424376264826e-07, "loss": 0.62396657, "num_input_tokens_seen": 310972825, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.21289062, "step": 14416, "time_per_iteration": 3.2839157581329346 }, { "auxiliary_loss_clip": 0.01114733, "auxiliary_loss_mlp": 0.01284256, "balance_loss_clip": 1.02409792, "balance_loss_mlp": 1.03387666, "epoch": 0.8667969337141139, "flos": 20449367798400.0, "grad_norm": 1.820508013753933, "language_loss": 0.74411857, "learning_rate": 1.7255068912789094e-07, "loss": 0.76810849, "num_input_tokens_seen": 310992050, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 14417, "time_per_iteration": 2.653139114379883 }, { "auxiliary_loss_clip": 0.01118966, "auxiliary_loss_mlp": 0.01034519, "balance_loss_clip": 1.02232361, "balance_loss_mlp": 1.03342116, "epoch": 0.866857056966782, "flos": 21653703930240.0, "grad_norm": 1.6582515943976395, "language_loss": 0.7480756, "learning_rate": 1.7239719971009947e-07, "loss": 0.7696104, "num_input_tokens_seen": 311011105, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 14418, "time_per_iteration": 2.67588210105896 }, { "auxiliary_loss_clip": 0.01111062, "auxiliary_loss_mlp": 0.01033167, "balance_loss_clip": 1.02117407, "balance_loss_mlp": 1.03399289, "epoch": 0.8669171802194499, "flos": 22852832590080.0, "grad_norm": 1.642632157693041, "language_loss": 0.68271828, "learning_rate": 1.7224377551474988e-07, "loss": 0.70416057, "num_input_tokens_seen": 311032080, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 14419, "time_per_iteration": 2.6518049240112305 }, { "auxiliary_loss_clip": 0.01111885, "auxiliary_loss_mlp": 0.0103328, "balance_loss_clip": 1.02176404, "balance_loss_mlp": 1.03384614, "epoch": 0.8669773034721179, "flos": 18734166483840.0, "grad_norm": 2.770137257437777, "language_loss": 0.78680193, "learning_rate": 1.7209041654731892e-07, "loss": 0.80825365, "num_input_tokens_seen": 311049735, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.69140625, "step": 14420, "time_per_iteration": 2.6355791091918945 }, { "auxiliary_loss_clip": 0.01111958, "auxiliary_loss_mlp": 0.01029057, "balance_loss_clip": 1.01723146, "balance_loss_mlp": 1.03500199, "epoch": 0.8670374267247858, "flos": 18916018064640.0, "grad_norm": 2.259920392774669, "language_loss": 0.837152, "learning_rate": 1.7193712281327888e-07, "loss": 0.85856211, "num_input_tokens_seen": 311067675, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 14421, "time_per_iteration": 2.649064779281616 }, { "auxiliary_loss_clip": 0.01041272, "auxiliary_loss_mlp": 0.01000053, "balance_loss_clip": 0.998676, "balance_loss_mlp": 1.00200737, "epoch": 0.8670975499774538, "flos": 48814527214080.0, "grad_norm": 0.7034621965397186, "language_loss": 0.49138898, "learning_rate": 1.717838943181007e-07, "loss": 0.5118022, "num_input_tokens_seen": 311126605, "router_z_loss_clip": 0.01379395, "router_z_loss_mlp": 0.21289062, "step": 14422, "time_per_iteration": 3.2559118270874023 }, { "auxiliary_loss_clip": 0.01120397, "auxiliary_loss_mlp": 0.01276696, "balance_loss_clip": 1.01812315, "balance_loss_mlp": 1.03448141, "epoch": 0.8671576732301217, "flos": 26136145025280.0, "grad_norm": 1.6259368864971968, "language_loss": 0.73389077, "learning_rate": 1.7163073106725223e-07, "loss": 0.75786173, "num_input_tokens_seen": 311147325, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.68359375, "step": 14423, "time_per_iteration": 2.7238080501556396 }, { "auxiliary_loss_clip": 0.01124878, "auxiliary_loss_mlp": 0.01025122, "balance_loss_clip": 1.01404727, "balance_loss_mlp": 1.03294623, "epoch": 0.8672177964827897, "flos": 18367446579840.0, "grad_norm": 1.7752527907756825, "language_loss": 0.76167059, "learning_rate": 1.7147763306620022e-07, "loss": 0.78317058, "num_input_tokens_seen": 311165385, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.65234375, "step": 14424, "time_per_iteration": 2.687671661376953 }, { "auxiliary_loss_clip": 0.01124119, "auxiliary_loss_mlp": 0.01031617, "balance_loss_clip": 1.02040577, "balance_loss_mlp": 1.03287148, "epoch": 0.8672779197354576, "flos": 28545355992960.0, "grad_norm": 1.829646031057663, "language_loss": 0.71313632, "learning_rate": 1.7132460032040851e-07, "loss": 0.73469365, "num_input_tokens_seen": 311185860, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.64453125, "step": 14425, "time_per_iteration": 2.783975601196289 }, { "auxiliary_loss_clip": 0.01113021, "auxiliary_loss_mlp": 0.0103174, "balance_loss_clip": 1.0184598, "balance_loss_mlp": 1.03375196, "epoch": 0.8673380429881257, "flos": 22382474970240.0, "grad_norm": 1.7875019737745688, "language_loss": 0.68116671, "learning_rate": 1.7117163283533697e-07, "loss": 0.70261431, "num_input_tokens_seen": 311205810, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 14426, "time_per_iteration": 2.6577842235565186 }, { "auxiliary_loss_clip": 0.01130456, "auxiliary_loss_mlp": 0.01028358, "balance_loss_clip": 1.01552486, "balance_loss_mlp": 1.03406894, "epoch": 0.8673981662407936, "flos": 20996430912000.0, "grad_norm": 1.823561561371371, "language_loss": 0.70975, "learning_rate": 1.710187306164459e-07, "loss": 0.73133814, "num_input_tokens_seen": 311226080, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 14427, "time_per_iteration": 2.6844804286956787 }, { "auxiliary_loss_clip": 0.01131269, "auxiliary_loss_mlp": 0.01030685, "balance_loss_clip": 1.0185256, "balance_loss_mlp": 1.0358696, "epoch": 0.8674582894934616, "flos": 24426797627520.0, "grad_norm": 1.6845728942440932, "language_loss": 0.67906916, "learning_rate": 1.7086589366919158e-07, "loss": 0.70068866, "num_input_tokens_seen": 311246380, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 14428, "time_per_iteration": 2.7971439361572266 }, { "auxiliary_loss_clip": 0.01120966, "auxiliary_loss_mlp": 0.01026735, "balance_loss_clip": 1.014516, "balance_loss_mlp": 1.03496265, "epoch": 0.8675184127461296, "flos": 20737514701440.0, "grad_norm": 2.4963805675066126, "language_loss": 0.70475167, "learning_rate": 1.7071312199902855e-07, "loss": 0.72622865, "num_input_tokens_seen": 311266465, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 14429, "time_per_iteration": 4.145447731018066 }, { "auxiliary_loss_clip": 0.01119952, "auxiliary_loss_mlp": 0.01025871, "balance_loss_clip": 1.01379538, "balance_loss_mlp": 1.0342623, "epoch": 0.8675785359987975, "flos": 19135647774720.0, "grad_norm": 1.942406240118088, "language_loss": 0.66462231, "learning_rate": 1.7056041561140822e-07, "loss": 0.68608057, "num_input_tokens_seen": 311285075, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 14430, "time_per_iteration": 2.7042462825775146 }, { "auxiliary_loss_clip": 0.01141955, "auxiliary_loss_mlp": 0.01039969, "balance_loss_clip": 1.02633119, "balance_loss_mlp": 1.03558218, "epoch": 0.8676386592514655, "flos": 22710662559360.0, "grad_norm": 1.824099407049013, "language_loss": 0.68903691, "learning_rate": 1.7040777451178133e-07, "loss": 0.71085614, "num_input_tokens_seen": 311303230, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 14431, "time_per_iteration": 2.673896312713623 }, { "auxiliary_loss_clip": 0.01112674, "auxiliary_loss_mlp": 0.01036196, "balance_loss_clip": 1.02356625, "balance_loss_mlp": 1.03413248, "epoch": 0.8676987825041335, "flos": 14209853109120.0, "grad_norm": 2.1038623997135817, "language_loss": 0.63446879, "learning_rate": 1.7025519870559445e-07, "loss": 0.65595752, "num_input_tokens_seen": 311318070, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 14432, "time_per_iteration": 2.612060070037842 }, { "auxiliary_loss_clip": 0.01112703, "auxiliary_loss_mlp": 0.01040582, "balance_loss_clip": 1.02909589, "balance_loss_mlp": 1.03511965, "epoch": 0.8677589057568015, "flos": 13589927256960.0, "grad_norm": 2.388331761701806, "language_loss": 0.78445733, "learning_rate": 1.7010268819829298e-07, "loss": 0.80599022, "num_input_tokens_seen": 311334885, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6875, "step": 14433, "time_per_iteration": 2.6772217750549316 }, { "auxiliary_loss_clip": 0.01039677, "auxiliary_loss_mlp": 0.01000987, "balance_loss_clip": 0.99960405, "balance_loss_mlp": 1.00110149, "epoch": 0.8678190290094694, "flos": 68933657370240.0, "grad_norm": 0.7058135730952021, "language_loss": 0.58476347, "learning_rate": 1.6995024299531923e-07, "loss": 0.60517007, "num_input_tokens_seen": 311399780, "router_z_loss_clip": 0.01385498, "router_z_loss_mlp": 0.21289062, "step": 14434, "time_per_iteration": 3.458150625228882 }, { "auxiliary_loss_clip": 0.01117169, "auxiliary_loss_mlp": 0.01030444, "balance_loss_clip": 1.01930356, "balance_loss_mlp": 1.0329628, "epoch": 0.8678791522621374, "flos": 32557726776960.0, "grad_norm": 1.7352384793145652, "language_loss": 0.79880488, "learning_rate": 1.697978631021144e-07, "loss": 0.82028097, "num_input_tokens_seen": 311419610, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6640625, "step": 14435, "time_per_iteration": 4.174589157104492 }, { "auxiliary_loss_clip": 0.01120601, "auxiliary_loss_mlp": 0.01274553, "balance_loss_clip": 1.01548946, "balance_loss_mlp": 1.03210187, "epoch": 0.8679392755148053, "flos": 35042637657600.0, "grad_norm": 2.6950372485066874, "language_loss": 0.62427586, "learning_rate": 1.696455485241155e-07, "loss": 0.64822745, "num_input_tokens_seen": 311440045, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 14436, "time_per_iteration": 2.8688464164733887 }, { "auxiliary_loss_clip": 0.01122884, "auxiliary_loss_mlp": 0.01033621, "balance_loss_clip": 1.02019787, "balance_loss_mlp": 1.03582823, "epoch": 0.8679993987674733, "flos": 18552494471040.0, "grad_norm": 2.4538790041250578, "language_loss": 0.70728707, "learning_rate": 1.6949329926675904e-07, "loss": 0.72885215, "num_input_tokens_seen": 311456660, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 14437, "time_per_iteration": 2.689751386642456 }, { "auxiliary_loss_clip": 0.01129428, "auxiliary_loss_mlp": 0.01028529, "balance_loss_clip": 1.01596987, "balance_loss_mlp": 1.03480387, "epoch": 0.8680595220201412, "flos": 27454390162560.0, "grad_norm": 2.6786558949297805, "language_loss": 0.80484891, "learning_rate": 1.6934111533547801e-07, "loss": 0.82642847, "num_input_tokens_seen": 311475460, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 14438, "time_per_iteration": 2.6881816387176514 }, { "auxiliary_loss_clip": 0.01106476, "auxiliary_loss_mlp": 0.0102593, "balance_loss_clip": 1.01436698, "balance_loss_mlp": 1.03173232, "epoch": 0.8681196452728093, "flos": 19208797822080.0, "grad_norm": 1.7702618737236617, "language_loss": 0.67449057, "learning_rate": 1.6918899673570363e-07, "loss": 0.69581461, "num_input_tokens_seen": 311494575, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.65625, "step": 14439, "time_per_iteration": 2.5989766120910645 }, { "auxiliary_loss_clip": 0.01109617, "auxiliary_loss_mlp": 0.01034195, "balance_loss_clip": 1.02261996, "balance_loss_mlp": 1.03285003, "epoch": 0.8681797685254772, "flos": 37560442417920.0, "grad_norm": 1.6067179200361337, "language_loss": 0.63708329, "learning_rate": 1.6903694347286447e-07, "loss": 0.65852135, "num_input_tokens_seen": 311515805, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 14440, "time_per_iteration": 2.80387806892395 }, { "auxiliary_loss_clip": 0.01109663, "auxiliary_loss_mlp": 0.01033809, "balance_loss_clip": 1.02154219, "balance_loss_mlp": 1.03471804, "epoch": 0.8682398917781452, "flos": 23289937194240.0, "grad_norm": 1.611561096261957, "language_loss": 0.6535902, "learning_rate": 1.6888495555238657e-07, "loss": 0.67502487, "num_input_tokens_seen": 311536000, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.66015625, "step": 14441, "time_per_iteration": 2.6155648231506348 }, { "auxiliary_loss_clip": 0.01110381, "auxiliary_loss_mlp": 0.01026698, "balance_loss_clip": 1.01530778, "balance_loss_mlp": 1.03504932, "epoch": 0.8683000150308132, "flos": 21872794936320.0, "grad_norm": 1.5640788564755737, "language_loss": 0.66244692, "learning_rate": 1.687330329796952e-07, "loss": 0.68381768, "num_input_tokens_seen": 311556220, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.66015625, "step": 14442, "time_per_iteration": 4.910135269165039 }, { "auxiliary_loss_clip": 0.01123335, "auxiliary_loss_mlp": 0.01030055, "balance_loss_clip": 1.01691794, "balance_loss_mlp": 1.03416872, "epoch": 0.8683601382834811, "flos": 14647209108480.0, "grad_norm": 1.768771882961923, "language_loss": 0.72262102, "learning_rate": 1.6858117576021069e-07, "loss": 0.74415493, "num_input_tokens_seen": 311572530, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 14443, "time_per_iteration": 2.7635726928710938 }, { "auxiliary_loss_clip": 0.01104215, "auxiliary_loss_mlp": 0.01029941, "balance_loss_clip": 1.01754344, "balance_loss_mlp": 1.03441691, "epoch": 0.8684202615361492, "flos": 26359904799360.0, "grad_norm": 1.7416615473184687, "language_loss": 0.83143389, "learning_rate": 1.6842938389935334e-07, "loss": 0.85277551, "num_input_tokens_seen": 311591105, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 14444, "time_per_iteration": 4.183194637298584 }, { "auxiliary_loss_clip": 0.01130245, "auxiliary_loss_mlp": 0.01033022, "balance_loss_clip": 1.02057648, "balance_loss_mlp": 1.03453553, "epoch": 0.8684803847888171, "flos": 20704010290560.0, "grad_norm": 1.813244666386564, "language_loss": 0.77565259, "learning_rate": 1.6827765740253996e-07, "loss": 0.7972852, "num_input_tokens_seen": 311608350, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 14445, "time_per_iteration": 2.659285545349121 }, { "auxiliary_loss_clip": 0.01112719, "auxiliary_loss_mlp": 0.01033391, "balance_loss_clip": 1.02113032, "balance_loss_mlp": 1.03472865, "epoch": 0.8685405080414851, "flos": 22638123043200.0, "grad_norm": 2.1671548462067984, "language_loss": 0.67643297, "learning_rate": 1.681259962751851e-07, "loss": 0.6978941, "num_input_tokens_seen": 311626380, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 14446, "time_per_iteration": 2.6945559978485107 }, { "auxiliary_loss_clip": 0.01111442, "auxiliary_loss_mlp": 0.01034545, "balance_loss_clip": 1.0219326, "balance_loss_mlp": 1.03495741, "epoch": 0.868600631294153, "flos": 24822065865600.0, "grad_norm": 1.7720815996440065, "language_loss": 0.83046138, "learning_rate": 1.6797440052270105e-07, "loss": 0.85192126, "num_input_tokens_seen": 311644345, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.671875, "step": 14447, "time_per_iteration": 2.7877233028411865 }, { "auxiliary_loss_clip": 0.01155322, "auxiliary_loss_mlp": 0.01030532, "balance_loss_clip": 1.01839042, "balance_loss_mlp": 1.03355479, "epoch": 0.868660754546821, "flos": 25113983696640.0, "grad_norm": 1.8665865836227298, "language_loss": 0.73778331, "learning_rate": 1.6782287015049844e-07, "loss": 0.75964189, "num_input_tokens_seen": 311663340, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 14448, "time_per_iteration": 2.7448506355285645 }, { "auxiliary_loss_clip": 0.01121211, "auxiliary_loss_mlp": 0.01030493, "balance_loss_clip": 1.01793456, "balance_loss_mlp": 1.03447735, "epoch": 0.8687208777994889, "flos": 12677832178560.0, "grad_norm": 2.0019513508416265, "language_loss": 0.80938256, "learning_rate": 1.676714051639847e-07, "loss": 0.8308996, "num_input_tokens_seen": 311679860, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 14449, "time_per_iteration": 2.635883331298828 }, { "auxiliary_loss_clip": 0.01048827, "auxiliary_loss_mlp": 0.01001747, "balance_loss_clip": 1.00039434, "balance_loss_mlp": 1.00130808, "epoch": 0.8687810010521569, "flos": 65617235573760.0, "grad_norm": 0.8024767243823215, "language_loss": 0.60580444, "learning_rate": 1.6752000556856528e-07, "loss": 0.62631011, "num_input_tokens_seen": 311738135, "router_z_loss_clip": 0.0135498, "router_z_loss_mlp": 0.21289062, "step": 14450, "time_per_iteration": 3.162000894546509 }, { "auxiliary_loss_clip": 0.01132151, "auxiliary_loss_mlp": 0.01030411, "balance_loss_clip": 1.01797199, "balance_loss_mlp": 1.0350287, "epoch": 0.8688411243048249, "flos": 24244012293120.0, "grad_norm": 6.661372539565555, "language_loss": 0.75815523, "learning_rate": 1.673686713696427e-07, "loss": 0.77978081, "num_input_tokens_seen": 311756975, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 14451, "time_per_iteration": 2.7250444889068604 }, { "auxiliary_loss_clip": 0.01109289, "auxiliary_loss_mlp": 0.01028351, "balance_loss_clip": 1.01619816, "balance_loss_mlp": 1.03303492, "epoch": 0.8689012475574929, "flos": 18221828843520.0, "grad_norm": 2.437832660383471, "language_loss": 0.71882498, "learning_rate": 1.6721740257261874e-07, "loss": 0.74020147, "num_input_tokens_seen": 311771830, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 14452, "time_per_iteration": 2.609156608581543 }, { "auxiliary_loss_clip": 0.01125065, "auxiliary_loss_mlp": 0.01282206, "balance_loss_clip": 1.02206159, "balance_loss_mlp": 1.03596616, "epoch": 0.8689613708101608, "flos": 19646728439040.0, "grad_norm": 3.399293617096038, "language_loss": 0.72172397, "learning_rate": 1.6706619918289077e-07, "loss": 0.74579668, "num_input_tokens_seen": 311790130, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 14453, "time_per_iteration": 2.6993231773376465 }, { "auxiliary_loss_clip": 0.01112947, "auxiliary_loss_mlp": 0.01035603, "balance_loss_clip": 1.0232712, "balance_loss_mlp": 1.03594649, "epoch": 0.8690214940628288, "flos": 11728749070080.0, "grad_norm": 7.273225198315945, "language_loss": 0.72934878, "learning_rate": 1.6691506120585542e-07, "loss": 0.75083435, "num_input_tokens_seen": 311808360, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 14454, "time_per_iteration": 2.780256509780884 }, { "auxiliary_loss_clip": 0.01103701, "auxiliary_loss_mlp": 0.01031049, "balance_loss_clip": 1.01914585, "balance_loss_mlp": 1.03556967, "epoch": 0.8690816173154968, "flos": 34936450076160.0, "grad_norm": 1.990038257753998, "language_loss": 0.59418523, "learning_rate": 1.6676398864690678e-07, "loss": 0.6155327, "num_input_tokens_seen": 311831325, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 14455, "time_per_iteration": 2.7776060104370117 }, { "auxiliary_loss_clip": 0.01121562, "auxiliary_loss_mlp": 0.01029293, "balance_loss_clip": 1.01665723, "balance_loss_mlp": 1.03329039, "epoch": 0.8691417405681647, "flos": 11614804151040.0, "grad_norm": 2.1584982755000097, "language_loss": 0.80014074, "learning_rate": 1.6661298151143476e-07, "loss": 0.82164925, "num_input_tokens_seen": 311848090, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 14456, "time_per_iteration": 2.5946855545043945 }, { "auxiliary_loss_clip": 0.01152327, "auxiliary_loss_mlp": 0.01036126, "balance_loss_clip": 1.02195239, "balance_loss_mlp": 1.03558278, "epoch": 0.8692018638208328, "flos": 24608038677120.0, "grad_norm": 2.4431180275807485, "language_loss": 0.74580729, "learning_rate": 1.6646203980483e-07, "loss": 0.76769179, "num_input_tokens_seen": 311867855, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7265625, "step": 14457, "time_per_iteration": 2.726901054382324 }, { "auxiliary_loss_clip": 0.01136938, "auxiliary_loss_mlp": 0.01028383, "balance_loss_clip": 1.01687324, "balance_loss_mlp": 1.03466249, "epoch": 0.8692619870735007, "flos": 25995124229760.0, "grad_norm": 1.5243287498531213, "language_loss": 0.78755701, "learning_rate": 1.663111635324783e-07, "loss": 0.80921018, "num_input_tokens_seen": 311888675, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 14458, "time_per_iteration": 2.7660586833953857 }, { "auxiliary_loss_clip": 0.0111142, "auxiliary_loss_mlp": 0.01032491, "balance_loss_clip": 1.01937759, "balance_loss_mlp": 1.03436375, "epoch": 0.8693221103261687, "flos": 18041808856320.0, "grad_norm": 1.7365026216808004, "language_loss": 0.70432341, "learning_rate": 1.6616035269976481e-07, "loss": 0.72576249, "num_input_tokens_seen": 311907310, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.68359375, "step": 14459, "time_per_iteration": 2.7367935180664062 }, { "auxiliary_loss_clip": 0.01115382, "auxiliary_loss_mlp": 0.01031413, "balance_loss_clip": 1.01984954, "balance_loss_mlp": 1.03357577, "epoch": 0.8693822335788366, "flos": 27492347859840.0, "grad_norm": 2.027407575645448, "language_loss": 0.73781657, "learning_rate": 1.660096073120707e-07, "loss": 0.75928456, "num_input_tokens_seen": 311929635, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.640625, "step": 14460, "time_per_iteration": 2.732903003692627 }, { "auxiliary_loss_clip": 0.01125444, "auxiliary_loss_mlp": 0.01033014, "balance_loss_clip": 1.02024722, "balance_loss_mlp": 1.03589702, "epoch": 0.8694423568315046, "flos": 24097712198400.0, "grad_norm": 4.468856124264706, "language_loss": 0.64946151, "learning_rate": 1.6585892737477635e-07, "loss": 0.67104608, "num_input_tokens_seen": 311948800, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 14461, "time_per_iteration": 2.702948570251465 }, { "auxiliary_loss_clip": 0.01110653, "auxiliary_loss_mlp": 0.01031909, "balance_loss_clip": 1.01923752, "balance_loss_mlp": 1.03253818, "epoch": 0.8695024800841725, "flos": 18362131367040.0, "grad_norm": 2.200191048947905, "language_loss": 0.82610118, "learning_rate": 1.6570831289325883e-07, "loss": 0.84752673, "num_input_tokens_seen": 311964090, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 14462, "time_per_iteration": 2.773202657699585 }, { "auxiliary_loss_clip": 0.01111744, "auxiliary_loss_mlp": 0.01035804, "balance_loss_clip": 1.02269733, "balance_loss_mlp": 1.0334059, "epoch": 0.8695626033368405, "flos": 14027750133120.0, "grad_norm": 2.030292586649091, "language_loss": 0.64720541, "learning_rate": 1.6555776387289333e-07, "loss": 0.66868091, "num_input_tokens_seen": 311981460, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 14463, "time_per_iteration": 2.5822322368621826 }, { "auxiliary_loss_clip": 0.01125632, "auxiliary_loss_mlp": 0.01036392, "balance_loss_clip": 1.02321315, "balance_loss_mlp": 1.03726137, "epoch": 0.8696227265895085, "flos": 16836862193280.0, "grad_norm": 2.14015391613299, "language_loss": 0.66574305, "learning_rate": 1.6540728031905227e-07, "loss": 0.68736327, "num_input_tokens_seen": 312000115, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 14464, "time_per_iteration": 2.592125654220581 }, { "auxiliary_loss_clip": 0.01123506, "auxiliary_loss_mlp": 0.01035361, "balance_loss_clip": 1.0228796, "balance_loss_mlp": 1.03432703, "epoch": 0.8696828498421765, "flos": 21799070271360.0, "grad_norm": 2.3050316772452173, "language_loss": 0.62633824, "learning_rate": 1.6525686223710666e-07, "loss": 0.64792687, "num_input_tokens_seen": 312020770, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 14465, "time_per_iteration": 2.667421579360962 }, { "auxiliary_loss_clip": 0.01106989, "auxiliary_loss_mlp": 0.01035402, "balance_loss_clip": 1.02433348, "balance_loss_mlp": 1.03342795, "epoch": 0.8697429730948444, "flos": 22894812610560.0, "grad_norm": 1.4259311650816178, "language_loss": 0.84732217, "learning_rate": 1.6510650963242356e-07, "loss": 0.86874604, "num_input_tokens_seen": 312041870, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6484375, "step": 14466, "time_per_iteration": 2.8123064041137695 }, { "auxiliary_loss_clip": 0.01126928, "auxiliary_loss_mlp": 0.01040839, "balance_loss_clip": 1.02690387, "balance_loss_mlp": 1.03692949, "epoch": 0.8698030963475124, "flos": 24717458482560.0, "grad_norm": 2.4634242603953167, "language_loss": 0.61689854, "learning_rate": 1.6495622251036977e-07, "loss": 0.63857627, "num_input_tokens_seen": 312058210, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 14467, "time_per_iteration": 2.7032241821289062 }, { "auxiliary_loss_clip": 0.01097639, "auxiliary_loss_mlp": 0.01028302, "balance_loss_clip": 1.0171144, "balance_loss_mlp": 1.03267789, "epoch": 0.8698632196001803, "flos": 28442221067520.0, "grad_norm": 1.398858896579045, "language_loss": 0.68921292, "learning_rate": 1.6480600087630746e-07, "loss": 0.71047235, "num_input_tokens_seen": 312082665, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6484375, "step": 14468, "time_per_iteration": 2.8000457286834717 }, { "auxiliary_loss_clip": 0.01116364, "auxiliary_loss_mlp": 0.01030275, "balance_loss_clip": 1.0184257, "balance_loss_mlp": 1.03234792, "epoch": 0.8699233428528483, "flos": 27636457224960.0, "grad_norm": 1.5145244195174756, "language_loss": 0.70612234, "learning_rate": 1.6465584473559923e-07, "loss": 0.72758871, "num_input_tokens_seen": 312101960, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 14469, "time_per_iteration": 2.7877860069274902 }, { "auxiliary_loss_clip": 0.01106794, "auxiliary_loss_mlp": 0.01028659, "balance_loss_clip": 1.0173223, "balance_loss_mlp": 1.0336473, "epoch": 0.8699834661055164, "flos": 20045659864320.0, "grad_norm": 2.5198717951880325, "language_loss": 0.83963734, "learning_rate": 1.6450575409360234e-07, "loss": 0.86099184, "num_input_tokens_seen": 312117125, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6484375, "step": 14470, "time_per_iteration": 2.760754108428955 }, { "auxiliary_loss_clip": 0.01121695, "auxiliary_loss_mlp": 0.01031986, "balance_loss_clip": 1.01988077, "balance_loss_mlp": 1.03586149, "epoch": 0.8700435893581843, "flos": 23732787974400.0, "grad_norm": 1.6892904144717589, "language_loss": 0.73090041, "learning_rate": 1.6435572895567317e-07, "loss": 0.75243717, "num_input_tokens_seen": 312135775, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 14471, "time_per_iteration": 4.122209548950195 }, { "auxiliary_loss_clip": 0.01102309, "auxiliary_loss_mlp": 0.01028793, "balance_loss_clip": 1.01762962, "balance_loss_mlp": 1.0343256, "epoch": 0.8701037126108523, "flos": 23548422441600.0, "grad_norm": 1.347360165926027, "language_loss": 0.78946269, "learning_rate": 1.6420576932716702e-07, "loss": 0.81077373, "num_input_tokens_seen": 312156070, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6796875, "step": 14472, "time_per_iteration": 2.644765853881836 }, { "auxiliary_loss_clip": 0.01111059, "auxiliary_loss_mlp": 0.01274168, "balance_loss_clip": 1.01527119, "balance_loss_mlp": 1.03316736, "epoch": 0.8701638358635202, "flos": 18843442634880.0, "grad_norm": 1.882827006836691, "language_loss": 0.72434598, "learning_rate": 1.64055875213434e-07, "loss": 0.74819827, "num_input_tokens_seen": 312174380, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 14473, "time_per_iteration": 2.668773889541626 }, { "auxiliary_loss_clip": 0.01105338, "auxiliary_loss_mlp": 0.01029784, "balance_loss_clip": 1.01757061, "balance_loss_mlp": 1.03476381, "epoch": 0.8702239591161882, "flos": 27928339142400.0, "grad_norm": 1.6256784914856137, "language_loss": 0.72210932, "learning_rate": 1.6390604661982477e-07, "loss": 0.7434606, "num_input_tokens_seen": 312195130, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 14474, "time_per_iteration": 2.698241710662842 }, { "auxiliary_loss_clip": 0.01119321, "auxiliary_loss_mlp": 0.01035533, "balance_loss_clip": 1.02228332, "balance_loss_mlp": 1.03296053, "epoch": 0.8702840823688561, "flos": 17233997938560.0, "grad_norm": 1.9758352794978769, "language_loss": 0.6708734, "learning_rate": 1.6375628355168502e-07, "loss": 0.69242197, "num_input_tokens_seen": 312212300, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6875, "step": 14475, "time_per_iteration": 2.575253486633301 }, { "auxiliary_loss_clip": 0.01101186, "auxiliary_loss_mlp": 0.01025864, "balance_loss_clip": 1.01337719, "balance_loss_mlp": 1.03218186, "epoch": 0.8703442056215241, "flos": 19427565605760.0, "grad_norm": 1.6601986226147434, "language_loss": 0.78089017, "learning_rate": 1.6360658601436118e-07, "loss": 0.80216068, "num_input_tokens_seen": 312231735, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 14476, "time_per_iteration": 2.7126286029815674 }, { "auxiliary_loss_clip": 0.01108211, "auxiliary_loss_mlp": 0.0102742, "balance_loss_clip": 1.01577902, "balance_loss_mlp": 1.03332055, "epoch": 0.8704043288741921, "flos": 22273845264000.0, "grad_norm": 1.5291254109478931, "language_loss": 0.72202086, "learning_rate": 1.634569540131936e-07, "loss": 0.74337715, "num_input_tokens_seen": 312253060, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.65625, "step": 14477, "time_per_iteration": 4.204022407531738 }, { "auxiliary_loss_clip": 0.01112033, "auxiliary_loss_mlp": 0.01027833, "balance_loss_clip": 1.01612687, "balance_loss_mlp": 1.03418517, "epoch": 0.8704644521268601, "flos": 16648725732480.0, "grad_norm": 1.7555802821248545, "language_loss": 0.59678286, "learning_rate": 1.6330738755352359e-07, "loss": 0.61818153, "num_input_tokens_seen": 312269460, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 14478, "time_per_iteration": 2.5146145820617676 }, { "auxiliary_loss_clip": 0.01111184, "auxiliary_loss_mlp": 0.01027727, "balance_loss_clip": 1.01690888, "balance_loss_mlp": 1.03716576, "epoch": 0.870524575379528, "flos": 24280210224000.0, "grad_norm": 1.4389506166252457, "language_loss": 0.71303439, "learning_rate": 1.631578866406882e-07, "loss": 0.73442352, "num_input_tokens_seen": 312289830, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.6484375, "step": 14479, "time_per_iteration": 2.639181137084961 }, { "auxiliary_loss_clip": 0.01120368, "auxiliary_loss_mlp": 0.01030719, "balance_loss_clip": 1.01792204, "balance_loss_mlp": 1.03423595, "epoch": 0.870584698632196, "flos": 28768684803840.0, "grad_norm": 1.4207248267585257, "language_loss": 0.70692736, "learning_rate": 1.6300845128002316e-07, "loss": 0.72843826, "num_input_tokens_seen": 312311320, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 14480, "time_per_iteration": 2.6793572902679443 }, { "auxiliary_loss_clip": 0.01139159, "auxiliary_loss_mlp": 0.01030361, "balance_loss_clip": 1.01851726, "balance_loss_mlp": 1.03337753, "epoch": 0.8706448218848639, "flos": 32449635774720.0, "grad_norm": 1.986935280123558, "language_loss": 0.70125866, "learning_rate": 1.6285908147686067e-07, "loss": 0.72295386, "num_input_tokens_seen": 312332095, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 14481, "time_per_iteration": 2.7178499698638916 }, { "auxiliary_loss_clip": 0.01109626, "auxiliary_loss_mlp": 0.0102952, "balance_loss_clip": 1.01759326, "balance_loss_mlp": 1.03346336, "epoch": 0.8707049451375319, "flos": 22748009725440.0, "grad_norm": 1.7694957094305193, "language_loss": 0.77085525, "learning_rate": 1.6270977723653267e-07, "loss": 0.7922467, "num_input_tokens_seen": 312351225, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 14482, "time_per_iteration": 2.717366933822632 }, { "auxiliary_loss_clip": 0.01120111, "auxiliary_loss_mlp": 0.01030838, "balance_loss_clip": 1.01836872, "balance_loss_mlp": 1.03557253, "epoch": 0.8707650683902, "flos": 15851976203520.0, "grad_norm": 3.085586922315482, "language_loss": 0.76540649, "learning_rate": 1.6256053856436602e-07, "loss": 0.78691602, "num_input_tokens_seen": 312369730, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.671875, "step": 14483, "time_per_iteration": 4.773298978805542 }, { "auxiliary_loss_clip": 0.01117616, "auxiliary_loss_mlp": 0.01039761, "balance_loss_clip": 1.02589107, "balance_loss_mlp": 1.03609872, "epoch": 0.8708251916428679, "flos": 16468131127680.0, "grad_norm": 2.1260454982686103, "language_loss": 0.62109971, "learning_rate": 1.6241136546568756e-07, "loss": 0.64267349, "num_input_tokens_seen": 312386780, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 14484, "time_per_iteration": 2.672680139541626 }, { "auxiliary_loss_clip": 0.01103791, "auxiliary_loss_mlp": 0.01033663, "balance_loss_clip": 1.02165294, "balance_loss_mlp": 1.03593946, "epoch": 0.8708853148955359, "flos": 15377847655680.0, "grad_norm": 1.7243030409301225, "language_loss": 0.6813882, "learning_rate": 1.622622579458206e-07, "loss": 0.70276278, "num_input_tokens_seen": 312404875, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 14485, "time_per_iteration": 4.265707731246948 }, { "auxiliary_loss_clip": 0.01145167, "auxiliary_loss_mlp": 0.01279618, "balance_loss_clip": 1.02014685, "balance_loss_mlp": 1.03317797, "epoch": 0.8709454381482038, "flos": 30551325903360.0, "grad_norm": 1.7396896267256623, "language_loss": 0.62822545, "learning_rate": 1.6211321601008642e-07, "loss": 0.65247333, "num_input_tokens_seen": 312425280, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 14486, "time_per_iteration": 2.8334877490997314 }, { "auxiliary_loss_clip": 0.01105227, "auxiliary_loss_mlp": 0.01033679, "balance_loss_clip": 1.02019691, "balance_loss_mlp": 1.033566, "epoch": 0.8710055614008718, "flos": 22601422321920.0, "grad_norm": 2.1775585907729176, "language_loss": 0.6167165, "learning_rate": 1.619642396638039e-07, "loss": 0.63810563, "num_input_tokens_seen": 312443835, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 14487, "time_per_iteration": 2.6561903953552246 }, { "auxiliary_loss_clip": 0.0111878, "auxiliary_loss_mlp": 0.01271524, "balance_loss_clip": 1.013942, "balance_loss_mlp": 1.03354979, "epoch": 0.8710656846535397, "flos": 18443146492800.0, "grad_norm": 1.9490551229014526, "language_loss": 0.67899477, "learning_rate": 1.6181532891228945e-07, "loss": 0.70289785, "num_input_tokens_seen": 312460830, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.66796875, "step": 14488, "time_per_iteration": 2.6854617595672607 }, { "auxiliary_loss_clip": 0.01122694, "auxiliary_loss_mlp": 0.01280368, "balance_loss_clip": 1.0214653, "balance_loss_mlp": 1.03572726, "epoch": 0.8711258079062077, "flos": 16503862181760.0, "grad_norm": 1.7565723996691722, "language_loss": 0.85503507, "learning_rate": 1.616664837608581e-07, "loss": 0.87906575, "num_input_tokens_seen": 312477575, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 14489, "time_per_iteration": 2.5418052673339844 }, { "auxiliary_loss_clip": 0.01117639, "auxiliary_loss_mlp": 0.01030197, "balance_loss_clip": 1.01741135, "balance_loss_mlp": 1.03564644, "epoch": 0.8711859311588757, "flos": 15663336952320.0, "grad_norm": 2.1863519462357552, "language_loss": 0.75929576, "learning_rate": 1.615177042148206e-07, "loss": 0.78077412, "num_input_tokens_seen": 312492140, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 14490, "time_per_iteration": 2.738563060760498 }, { "auxiliary_loss_clip": 0.01114354, "auxiliary_loss_mlp": 0.01032304, "balance_loss_clip": 1.01944733, "balance_loss_mlp": 1.03495002, "epoch": 0.8712460544115437, "flos": 15557544420480.0, "grad_norm": 2.7036513745975297, "language_loss": 0.76442409, "learning_rate": 1.6136899027948748e-07, "loss": 0.78589064, "num_input_tokens_seen": 312508400, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 14491, "time_per_iteration": 2.6883389949798584 }, { "auxiliary_loss_clip": 0.0113203, "auxiliary_loss_mlp": 0.01025692, "balance_loss_clip": 1.01377702, "balance_loss_mlp": 1.03795969, "epoch": 0.8713061776642116, "flos": 16763568491520.0, "grad_norm": 1.5089867710959517, "language_loss": 0.66864944, "learning_rate": 1.6122034196016497e-07, "loss": 0.69022667, "num_input_tokens_seen": 312525915, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 14492, "time_per_iteration": 2.7446537017822266 }, { "auxiliary_loss_clip": 0.0110895, "auxiliary_loss_mlp": 0.01028926, "balance_loss_clip": 1.01785183, "balance_loss_mlp": 1.0340426, "epoch": 0.8713663009168796, "flos": 24279887001600.0, "grad_norm": 1.817435108166236, "language_loss": 0.6988433, "learning_rate": 1.6107175926215956e-07, "loss": 0.72022206, "num_input_tokens_seen": 312544735, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6640625, "step": 14493, "time_per_iteration": 2.6060805320739746 }, { "auxiliary_loss_clip": 0.0110654, "auxiliary_loss_mlp": 0.01034533, "balance_loss_clip": 1.02129483, "balance_loss_mlp": 1.03692555, "epoch": 0.8714264241695475, "flos": 23795594904960.0, "grad_norm": 2.5432320942021285, "language_loss": 0.89285946, "learning_rate": 1.609232421907718e-07, "loss": 0.91427016, "num_input_tokens_seen": 312557910, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 14494, "time_per_iteration": 2.655775308609009 }, { "auxiliary_loss_clip": 0.01130136, "auxiliary_loss_mlp": 0.01031254, "balance_loss_clip": 1.01940441, "balance_loss_mlp": 1.03632939, "epoch": 0.8714865474222155, "flos": 37997942071680.0, "grad_norm": 1.756541525489348, "language_loss": 0.59226334, "learning_rate": 1.6077479075130352e-07, "loss": 0.6138773, "num_input_tokens_seen": 312580360, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 14495, "time_per_iteration": 2.770237922668457 }, { "auxiliary_loss_clip": 0.0112961, "auxiliary_loss_mlp": 0.01036464, "balance_loss_clip": 1.02386975, "balance_loss_mlp": 1.03390503, "epoch": 0.8715466706748836, "flos": 22455696844800.0, "grad_norm": 1.909164615634246, "language_loss": 0.80250299, "learning_rate": 1.6062640494905177e-07, "loss": 0.82416379, "num_input_tokens_seen": 312597550, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 14496, "time_per_iteration": 2.748258590698242 }, { "auxiliary_loss_clip": 0.0112244, "auxiliary_loss_mlp": 0.01034697, "balance_loss_clip": 1.02082109, "balance_loss_mlp": 1.03489316, "epoch": 0.8716067939275515, "flos": 21215126868480.0, "grad_norm": 2.185771873543432, "language_loss": 0.78901708, "learning_rate": 1.6047808478931213e-07, "loss": 0.81058836, "num_input_tokens_seen": 312616435, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.6953125, "step": 14497, "time_per_iteration": 2.610468626022339 }, { "auxiliary_loss_clip": 0.01105523, "auxiliary_loss_mlp": 0.01031361, "balance_loss_clip": 1.01885581, "balance_loss_mlp": 1.03540659, "epoch": 0.8716669171802195, "flos": 22997732054400.0, "grad_norm": 2.0091874493068693, "language_loss": 0.66943252, "learning_rate": 1.6032983027737745e-07, "loss": 0.69080144, "num_input_tokens_seen": 312632770, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 14498, "time_per_iteration": 2.7176060676574707 }, { "auxiliary_loss_clip": 0.01126828, "auxiliary_loss_mlp": 0.01031902, "balance_loss_clip": 1.02007604, "balance_loss_mlp": 1.03261912, "epoch": 0.8717270404328874, "flos": 29784058462080.0, "grad_norm": 1.8346380725208307, "language_loss": 0.57491362, "learning_rate": 1.6018164141853972e-07, "loss": 0.59650099, "num_input_tokens_seen": 312651900, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 14499, "time_per_iteration": 2.8057236671447754 }, { "auxiliary_loss_clip": 0.01130886, "auxiliary_loss_mlp": 0.01030807, "balance_loss_clip": 1.01845098, "balance_loss_mlp": 1.03376508, "epoch": 0.8717871636855554, "flos": 22018125363840.0, "grad_norm": 1.8010790916719621, "language_loss": 0.79705906, "learning_rate": 1.600335182180863e-07, "loss": 0.81867599, "num_input_tokens_seen": 312671380, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 14500, "time_per_iteration": 2.5964412689208984 }, { "auxiliary_loss_clip": 0.0111267, "auxiliary_loss_mlp": 0.01030124, "balance_loss_clip": 1.01782131, "balance_loss_mlp": 1.0345633, "epoch": 0.8718472869382233, "flos": 16654256426880.0, "grad_norm": 2.033078788941925, "language_loss": 0.73150992, "learning_rate": 1.5988546068130315e-07, "loss": 0.75293785, "num_input_tokens_seen": 312689215, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 14501, "time_per_iteration": 2.6801249980926514 }, { "auxiliary_loss_clip": 0.01135522, "auxiliary_loss_mlp": 0.0102932, "balance_loss_clip": 1.01630855, "balance_loss_mlp": 1.03646803, "epoch": 0.8719074101908914, "flos": 19495328613120.0, "grad_norm": 1.991515152563303, "language_loss": 0.64559925, "learning_rate": 1.5973746881347517e-07, "loss": 0.66724765, "num_input_tokens_seen": 312706400, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 14502, "time_per_iteration": 2.6780335903167725 }, { "auxiliary_loss_clip": 0.01123266, "auxiliary_loss_mlp": 0.01033584, "balance_loss_clip": 1.02074504, "balance_loss_mlp": 1.03462565, "epoch": 0.8719675334435593, "flos": 33070890430080.0, "grad_norm": 1.6432834174911501, "language_loss": 0.68976712, "learning_rate": 1.59589542619883e-07, "loss": 0.71133566, "num_input_tokens_seen": 312727985, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 14503, "time_per_iteration": 2.639906406402588 }, { "auxiliary_loss_clip": 0.01120623, "auxiliary_loss_mlp": 0.01027997, "balance_loss_clip": 1.01566529, "balance_loss_mlp": 1.03425264, "epoch": 0.8720276566962273, "flos": 13626268842240.0, "grad_norm": 2.2329301770256142, "language_loss": 0.69758189, "learning_rate": 1.59441682105806e-07, "loss": 0.71906805, "num_input_tokens_seen": 312745025, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 14504, "time_per_iteration": 2.7658798694610596 }, { "auxiliary_loss_clip": 0.01112784, "auxiliary_loss_mlp": 0.01274216, "balance_loss_clip": 1.01473188, "balance_loss_mlp": 1.03441215, "epoch": 0.8720877799488952, "flos": 23514163845120.0, "grad_norm": 1.675928457180004, "language_loss": 0.69945592, "learning_rate": 1.592938872765206e-07, "loss": 0.72332597, "num_input_tokens_seen": 312764170, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 14505, "time_per_iteration": 2.7049427032470703 }, { "auxiliary_loss_clip": 0.01120466, "auxiliary_loss_mlp": 0.01031351, "balance_loss_clip": 1.01996088, "balance_loss_mlp": 1.035043, "epoch": 0.8721479032015632, "flos": 20814148368000.0, "grad_norm": 2.0028873902744233, "language_loss": 0.78302115, "learning_rate": 1.5914615813730214e-07, "loss": 0.80453938, "num_input_tokens_seen": 312783830, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 14506, "time_per_iteration": 2.636526346206665 }, { "auxiliary_loss_clip": 0.01121738, "auxiliary_loss_mlp": 0.01027989, "balance_loss_clip": 1.01547194, "balance_loss_mlp": 1.03309834, "epoch": 0.8722080264542311, "flos": 19463655795840.0, "grad_norm": 2.611931554728952, "language_loss": 0.74447602, "learning_rate": 1.589984946934213e-07, "loss": 0.76597327, "num_input_tokens_seen": 312802015, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 14507, "time_per_iteration": 2.6627140045166016 }, { "auxiliary_loss_clip": 0.01151007, "auxiliary_loss_mlp": 0.0102984, "balance_loss_clip": 1.01660836, "balance_loss_mlp": 1.03530622, "epoch": 0.8722681497068991, "flos": 21761866759680.0, "grad_norm": 3.131050355717461, "language_loss": 0.72178406, "learning_rate": 1.5885089695014898e-07, "loss": 0.7435925, "num_input_tokens_seen": 312820650, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 14508, "time_per_iteration": 2.727612018585205 }, { "auxiliary_loss_clip": 0.01125539, "auxiliary_loss_mlp": 0.01036912, "balance_loss_clip": 1.02365029, "balance_loss_mlp": 1.03588009, "epoch": 0.8723282729595672, "flos": 28877134942080.0, "grad_norm": 2.162478759025482, "language_loss": 0.68648672, "learning_rate": 1.5870336491275205e-07, "loss": 0.70811129, "num_input_tokens_seen": 312841310, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 14509, "time_per_iteration": 2.615283727645874 }, { "auxiliary_loss_clip": 0.01120551, "auxiliary_loss_mlp": 0.01030135, "balance_loss_clip": 1.01809502, "balance_loss_mlp": 1.03392506, "epoch": 0.8723883962122351, "flos": 26469145036800.0, "grad_norm": 1.8414752011461357, "language_loss": 0.58287597, "learning_rate": 1.5855589858649586e-07, "loss": 0.60438281, "num_input_tokens_seen": 312862100, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 14510, "time_per_iteration": 2.6528377532958984 }, { "auxiliary_loss_clip": 0.01109163, "auxiliary_loss_mlp": 0.01029076, "balance_loss_clip": 1.01528931, "balance_loss_mlp": 1.03456426, "epoch": 0.8724485194649031, "flos": 20521476351360.0, "grad_norm": 15.433888211025725, "language_loss": 0.67070532, "learning_rate": 1.5840849797664246e-07, "loss": 0.69208771, "num_input_tokens_seen": 312880220, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.74609375, "step": 14511, "time_per_iteration": 2.6127474308013916 }, { "auxiliary_loss_clip": 0.01150464, "auxiliary_loss_mlp": 0.01032605, "balance_loss_clip": 1.01940823, "balance_loss_mlp": 1.03433275, "epoch": 0.872508642717571, "flos": 24353360271360.0, "grad_norm": 1.635387395210831, "language_loss": 0.81829739, "learning_rate": 1.5826116308845295e-07, "loss": 0.84012806, "num_input_tokens_seen": 312900765, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 14512, "time_per_iteration": 2.587444543838501 }, { "auxiliary_loss_clip": 0.01040236, "auxiliary_loss_mlp": 0.01004566, "balance_loss_clip": 1.00328445, "balance_loss_mlp": 1.00133038, "epoch": 0.872568765970239, "flos": 61410012485760.0, "grad_norm": 0.7896628507918635, "language_loss": 0.58619857, "learning_rate": 1.581138939271849e-07, "loss": 0.60664666, "num_input_tokens_seen": 312955840, "router_z_loss_clip": 0.01281738, "router_z_loss_mlp": 0.21191406, "step": 14513, "time_per_iteration": 4.491479873657227 }, { "auxiliary_loss_clip": 0.01112932, "auxiliary_loss_mlp": 0.01031525, "balance_loss_clip": 1.01828718, "balance_loss_mlp": 1.03548503, "epoch": 0.8726288892229069, "flos": 22598046443520.0, "grad_norm": 1.659912503020356, "language_loss": 0.77093095, "learning_rate": 1.579666904980943e-07, "loss": 0.7923755, "num_input_tokens_seen": 312973565, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6875, "step": 14514, "time_per_iteration": 2.6316492557525635 }, { "auxiliary_loss_clip": 0.01116839, "auxiliary_loss_mlp": 0.01026197, "balance_loss_clip": 1.0143894, "balance_loss_mlp": 1.03629768, "epoch": 0.872689012475575, "flos": 25885201633920.0, "grad_norm": 1.9312095262217241, "language_loss": 0.6541667, "learning_rate": 1.578195528064341e-07, "loss": 0.67559707, "num_input_tokens_seen": 312994660, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.71484375, "step": 14515, "time_per_iteration": 2.619136095046997 }, { "auxiliary_loss_clip": 0.01114455, "auxiliary_loss_mlp": 0.01032301, "balance_loss_clip": 1.01899719, "balance_loss_mlp": 1.03496659, "epoch": 0.8727491357282429, "flos": 21506721477120.0, "grad_norm": 1.940415738385174, "language_loss": 0.78752148, "learning_rate": 1.5767248085745543e-07, "loss": 0.80898911, "num_input_tokens_seen": 313009860, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 14516, "time_per_iteration": 2.686936378479004 }, { "auxiliary_loss_clip": 0.01106168, "auxiliary_loss_mlp": 0.01029876, "balance_loss_clip": 1.01723349, "balance_loss_mlp": 1.03495359, "epoch": 0.8728092589809109, "flos": 19207504932480.0, "grad_norm": 2.699508160962976, "language_loss": 0.71985269, "learning_rate": 1.5752547465640675e-07, "loss": 0.74121308, "num_input_tokens_seen": 313027025, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 14517, "time_per_iteration": 2.7406766414642334 }, { "auxiliary_loss_clip": 0.01124594, "auxiliary_loss_mlp": 0.01022998, "balance_loss_clip": 1.01157808, "balance_loss_mlp": 1.0319823, "epoch": 0.8728693822335788, "flos": 20595308757120.0, "grad_norm": 3.031648767148705, "language_loss": 0.72509491, "learning_rate": 1.573785342085343e-07, "loss": 0.74657083, "num_input_tokens_seen": 313046830, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 14518, "time_per_iteration": 4.051185369491577 }, { "auxiliary_loss_clip": 0.0111992, "auxiliary_loss_mlp": 0.01033819, "balance_loss_clip": 1.02217841, "balance_loss_mlp": 1.03497875, "epoch": 0.8729295054862468, "flos": 21728613744000.0, "grad_norm": 1.3251142838167238, "language_loss": 0.74017119, "learning_rate": 1.5723165951908256e-07, "loss": 0.76170862, "num_input_tokens_seen": 313067715, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 14519, "time_per_iteration": 2.667816638946533 }, { "auxiliary_loss_clip": 0.01114171, "auxiliary_loss_mlp": 0.01028246, "balance_loss_clip": 1.01469827, "balance_loss_mlp": 1.03404295, "epoch": 0.8729896287389147, "flos": 17673436926720.0, "grad_norm": 2.5198092975377206, "language_loss": 0.76155937, "learning_rate": 1.5708485059329268e-07, "loss": 0.78298354, "num_input_tokens_seen": 313082305, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 14520, "time_per_iteration": 2.7626326084136963 }, { "auxiliary_loss_clip": 0.0112204, "auxiliary_loss_mlp": 0.01033326, "balance_loss_clip": 1.02029026, "balance_loss_mlp": 1.03472555, "epoch": 0.8730497519915827, "flos": 24571804832640.0, "grad_norm": 1.546122748474385, "language_loss": 0.81710041, "learning_rate": 1.5693810743640378e-07, "loss": 0.83865404, "num_input_tokens_seen": 313101190, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 14521, "time_per_iteration": 2.571559190750122 }, { "auxiliary_loss_clip": 0.01114939, "auxiliary_loss_mlp": 0.01035537, "balance_loss_clip": 1.02216208, "balance_loss_mlp": 1.03532183, "epoch": 0.8731098752442508, "flos": 13443734903040.0, "grad_norm": 2.016454129608226, "language_loss": 0.76271546, "learning_rate": 1.567914300536528e-07, "loss": 0.78422016, "num_input_tokens_seen": 313118965, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 14522, "time_per_iteration": 2.6238863468170166 }, { "auxiliary_loss_clip": 0.01109021, "auxiliary_loss_mlp": 0.01274608, "balance_loss_clip": 1.01628757, "balance_loss_mlp": 1.03302288, "epoch": 0.8731699984969187, "flos": 23474446381440.0, "grad_norm": 1.8604577310209778, "language_loss": 0.75676525, "learning_rate": 1.566448184502749e-07, "loss": 0.7806015, "num_input_tokens_seen": 313139280, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 14523, "time_per_iteration": 2.530378818511963 }, { "auxiliary_loss_clip": 0.01109438, "auxiliary_loss_mlp": 0.01031013, "balance_loss_clip": 1.0187881, "balance_loss_mlp": 1.03339291, "epoch": 0.8732301217495867, "flos": 17712651600000.0, "grad_norm": 2.5637997256982215, "language_loss": 0.78495932, "learning_rate": 1.5649827263150116e-07, "loss": 0.80636382, "num_input_tokens_seen": 313156655, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 14524, "time_per_iteration": 2.6124839782714844 }, { "auxiliary_loss_clip": 0.01118322, "auxiliary_loss_mlp": 0.01029167, "balance_loss_clip": 1.0176636, "balance_loss_mlp": 1.03439653, "epoch": 0.8732902450022546, "flos": 22054359208320.0, "grad_norm": 2.048483291625584, "language_loss": 0.77522123, "learning_rate": 1.5635179260256236e-07, "loss": 0.79669607, "num_input_tokens_seen": 313174050, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66015625, "step": 14525, "time_per_iteration": 4.326208114624023 }, { "auxiliary_loss_clip": 0.01176048, "auxiliary_loss_mlp": 0.01033834, "balance_loss_clip": 1.02146602, "balance_loss_mlp": 1.0347445, "epoch": 0.8733503682549226, "flos": 22272983337600.0, "grad_norm": 1.7406739141102066, "language_loss": 0.6911279, "learning_rate": 1.5620537836868563e-07, "loss": 0.71322668, "num_input_tokens_seen": 313192765, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7109375, "step": 14526, "time_per_iteration": 2.7767138481140137 }, { "auxiliary_loss_clip": 0.01143047, "auxiliary_loss_mlp": 0.01038367, "balance_loss_clip": 1.02465773, "balance_loss_mlp": 1.03474283, "epoch": 0.8734104915075905, "flos": 23364344217600.0, "grad_norm": 1.95918546083008, "language_loss": 0.61493444, "learning_rate": 1.560590299350961e-07, "loss": 0.63674855, "num_input_tokens_seen": 313210925, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 14527, "time_per_iteration": 4.219867706298828 }, { "auxiliary_loss_clip": 0.01119564, "auxiliary_loss_mlp": 0.01034436, "balance_loss_clip": 1.0204227, "balance_loss_mlp": 1.0366509, "epoch": 0.8734706147602586, "flos": 17712292464000.0, "grad_norm": 1.990881167611905, "language_loss": 0.65512109, "learning_rate": 1.559127473070163e-07, "loss": 0.67666113, "num_input_tokens_seen": 313228250, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 14528, "time_per_iteration": 2.5360286235809326 }, { "auxiliary_loss_clip": 0.01129276, "auxiliary_loss_mlp": 0.01028134, "balance_loss_clip": 1.01590347, "balance_loss_mlp": 1.03518867, "epoch": 0.8735307380129265, "flos": 22049367217920.0, "grad_norm": 1.7294702523966785, "language_loss": 0.89034009, "learning_rate": 1.5576653048966737e-07, "loss": 0.91191423, "num_input_tokens_seen": 313247880, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 14529, "time_per_iteration": 2.9265458583831787 }, { "auxiliary_loss_clip": 0.01120265, "auxiliary_loss_mlp": 0.01028465, "balance_loss_clip": 1.01619267, "balance_loss_mlp": 1.03508198, "epoch": 0.8735908612655945, "flos": 25338425829120.0, "grad_norm": 1.786260725853259, "language_loss": 0.84649116, "learning_rate": 1.5562037948826734e-07, "loss": 0.86797845, "num_input_tokens_seen": 313266790, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 14530, "time_per_iteration": 2.666295051574707 }, { "auxiliary_loss_clip": 0.01121666, "auxiliary_loss_mlp": 0.01030804, "balance_loss_clip": 1.01926506, "balance_loss_mlp": 1.0344696, "epoch": 0.8736509845182624, "flos": 21540908246400.0, "grad_norm": 1.520687502948437, "language_loss": 0.7413944, "learning_rate": 1.5547429430803093e-07, "loss": 0.76291913, "num_input_tokens_seen": 313286805, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6953125, "step": 14531, "time_per_iteration": 2.784675121307373 }, { "auxiliary_loss_clip": 0.01107617, "auxiliary_loss_mlp": 0.01030414, "balance_loss_clip": 1.01842797, "balance_loss_mlp": 1.03241289, "epoch": 0.8737111077709304, "flos": 22017227523840.0, "grad_norm": 1.9126292361116288, "language_loss": 0.6145249, "learning_rate": 1.5532827495417268e-07, "loss": 0.63590521, "num_input_tokens_seen": 313305415, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6640625, "step": 14532, "time_per_iteration": 2.614882469177246 }, { "auxiliary_loss_clip": 0.01031115, "auxiliary_loss_mlp": 0.01001165, "balance_loss_clip": 0.99989569, "balance_loss_mlp": 1.00112236, "epoch": 0.8737712310235983, "flos": 70066315912320.0, "grad_norm": 0.825249338803865, "language_loss": 0.58808792, "learning_rate": 1.5518232143190302e-07, "loss": 0.60841072, "num_input_tokens_seen": 313369940, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.21289062, "step": 14533, "time_per_iteration": 3.3989574909210205 }, { "auxiliary_loss_clip": 0.01120213, "auxiliary_loss_mlp": 0.01030502, "balance_loss_clip": 1.01894438, "balance_loss_mlp": 1.03509212, "epoch": 0.8738313542762663, "flos": 28658331244800.0, "grad_norm": 1.5067109196340152, "language_loss": 0.76653904, "learning_rate": 1.5503643374643116e-07, "loss": 0.78804618, "num_input_tokens_seen": 313390965, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 14534, "time_per_iteration": 2.6036975383758545 }, { "auxiliary_loss_clip": 0.01119756, "auxiliary_loss_mlp": 0.01030922, "balance_loss_clip": 1.01857185, "balance_loss_mlp": 1.03292465, "epoch": 0.8738914775289344, "flos": 22346384780160.0, "grad_norm": 1.704983366835616, "language_loss": 0.74966395, "learning_rate": 1.5489061190296272e-07, "loss": 0.77117068, "num_input_tokens_seen": 313409680, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 14535, "time_per_iteration": 2.6606688499450684 }, { "auxiliary_loss_clip": 0.01102997, "auxiliary_loss_mlp": 0.01030746, "balance_loss_clip": 1.01813996, "balance_loss_mlp": 1.03366327, "epoch": 0.8739516007816023, "flos": 31759648444800.0, "grad_norm": 2.030018047838763, "language_loss": 0.74613273, "learning_rate": 1.547448559067024e-07, "loss": 0.76747012, "num_input_tokens_seen": 313431335, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 14536, "time_per_iteration": 2.52095627784729 }, { "auxiliary_loss_clip": 0.01120087, "auxiliary_loss_mlp": 0.01032402, "balance_loss_clip": 1.02051687, "balance_loss_mlp": 1.03306413, "epoch": 0.8740117240342703, "flos": 21211715076480.0, "grad_norm": 1.7020765640990447, "language_loss": 0.64279372, "learning_rate": 1.545991657628518e-07, "loss": 0.66431862, "num_input_tokens_seen": 313449225, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 14537, "time_per_iteration": 2.512221336364746 }, { "auxiliary_loss_clip": 0.01111729, "auxiliary_loss_mlp": 0.01026695, "balance_loss_clip": 1.01351035, "balance_loss_mlp": 1.03453851, "epoch": 0.8740718472869382, "flos": 25186666867200.0, "grad_norm": 1.9127962158875056, "language_loss": 0.57968855, "learning_rate": 1.5445354147660995e-07, "loss": 0.60107273, "num_input_tokens_seen": 313467715, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6796875, "step": 14538, "time_per_iteration": 2.5988659858703613 }, { "auxiliary_loss_clip": 0.01110875, "auxiliary_loss_mlp": 0.01028368, "balance_loss_clip": 1.0160296, "balance_loss_mlp": 1.03659868, "epoch": 0.8741319705396062, "flos": 19500931134720.0, "grad_norm": 1.9014247543086473, "language_loss": 0.68415308, "learning_rate": 1.543079830531735e-07, "loss": 0.70554554, "num_input_tokens_seen": 313486805, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.65625, "step": 14539, "time_per_iteration": 2.558084487915039 }, { "auxiliary_loss_clip": 0.01113925, "auxiliary_loss_mlp": 0.01028935, "balance_loss_clip": 1.016698, "balance_loss_mlp": 1.03313172, "epoch": 0.8741920937922741, "flos": 14100900180480.0, "grad_norm": 2.349559460391705, "language_loss": 0.74732774, "learning_rate": 1.541624904977381e-07, "loss": 0.76875627, "num_input_tokens_seen": 313504880, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.71875, "step": 14540, "time_per_iteration": 2.5026724338531494 }, { "auxiliary_loss_clip": 0.01120946, "auxiliary_loss_mlp": 0.01033684, "balance_loss_clip": 1.02179909, "balance_loss_mlp": 1.03433967, "epoch": 0.8742522170449422, "flos": 27709858667520.0, "grad_norm": 3.6074398622268897, "language_loss": 0.78753817, "learning_rate": 1.5401706381549472e-07, "loss": 0.80908453, "num_input_tokens_seen": 313524995, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 14541, "time_per_iteration": 2.573399066925049 }, { "auxiliary_loss_clip": 0.0114344, "auxiliary_loss_mlp": 0.01034736, "balance_loss_clip": 1.020818, "balance_loss_mlp": 1.03528953, "epoch": 0.8743123402976101, "flos": 21142587352320.0, "grad_norm": 2.156871183916201, "language_loss": 0.66984487, "learning_rate": 1.538717030116343e-07, "loss": 0.69162661, "num_input_tokens_seen": 313541740, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 14542, "time_per_iteration": 2.5661368370056152 }, { "auxiliary_loss_clip": 0.01022334, "auxiliary_loss_mlp": 0.01002393, "balance_loss_clip": 1.00112963, "balance_loss_mlp": 1.00122547, "epoch": 0.8743724635502781, "flos": 60870024351360.0, "grad_norm": 0.7842425623596053, "language_loss": 0.54520535, "learning_rate": 1.5372640809134385e-07, "loss": 0.56545269, "num_input_tokens_seen": 313593445, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.2109375, "step": 14543, "time_per_iteration": 2.9653186798095703 }, { "auxiliary_loss_clip": 0.01124628, "auxiliary_loss_mlp": 0.01033155, "balance_loss_clip": 1.0201261, "balance_loss_mlp": 1.03614521, "epoch": 0.874432586802946, "flos": 28109292883200.0, "grad_norm": 2.083188030714167, "language_loss": 0.69895631, "learning_rate": 1.5358117905980893e-07, "loss": 0.72053409, "num_input_tokens_seen": 313615640, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 14544, "time_per_iteration": 2.589437484741211 }, { "auxiliary_loss_clip": 0.0113318, "auxiliary_loss_mlp": 0.01026725, "balance_loss_clip": 1.01571631, "balance_loss_mlp": 1.03319287, "epoch": 0.874492710055614, "flos": 23550289948800.0, "grad_norm": 1.6933072810502452, "language_loss": 0.76099837, "learning_rate": 1.5343601592221212e-07, "loss": 0.78259742, "num_input_tokens_seen": 313635550, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6484375, "step": 14545, "time_per_iteration": 2.700894594192505 }, { "auxiliary_loss_clip": 0.01113411, "auxiliary_loss_mlp": 0.01280615, "balance_loss_clip": 1.01933146, "balance_loss_mlp": 1.0340333, "epoch": 0.8745528333082819, "flos": 40915647924480.0, "grad_norm": 1.6666771851818314, "language_loss": 0.66510975, "learning_rate": 1.5329091868373345e-07, "loss": 0.68905002, "num_input_tokens_seen": 313659275, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.703125, "step": 14546, "time_per_iteration": 2.8223485946655273 }, { "auxiliary_loss_clip": 0.0110198, "auxiliary_loss_mlp": 0.0103061, "balance_loss_clip": 1.01844513, "balance_loss_mlp": 1.0336237, "epoch": 0.87461295656095, "flos": 23622901292160.0, "grad_norm": 1.764226623209209, "language_loss": 0.72936499, "learning_rate": 1.5314588734955237e-07, "loss": 0.75069088, "num_input_tokens_seen": 313680595, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 14547, "time_per_iteration": 2.551609516143799 }, { "auxiliary_loss_clip": 0.01110389, "auxiliary_loss_mlp": 0.01039002, "balance_loss_clip": 1.02523375, "balance_loss_mlp": 1.0356853, "epoch": 0.874673079813618, "flos": 38794116983040.0, "grad_norm": 2.7841258297782088, "language_loss": 0.69570303, "learning_rate": 1.5300092192484337e-07, "loss": 0.71719694, "num_input_tokens_seen": 313699730, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 14548, "time_per_iteration": 2.625370740890503 }, { "auxiliary_loss_clip": 0.01125351, "auxiliary_loss_mlp": 0.01032304, "balance_loss_clip": 1.01975703, "balance_loss_mlp": 1.03564274, "epoch": 0.8747332030662859, "flos": 23696159080320.0, "grad_norm": 1.9136995895238431, "language_loss": 0.70875382, "learning_rate": 1.5285602241478058e-07, "loss": 0.73033035, "num_input_tokens_seen": 313720090, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 14549, "time_per_iteration": 2.54707407951355 }, { "auxiliary_loss_clip": 0.01118383, "auxiliary_loss_mlp": 0.01285667, "balance_loss_clip": 1.02419722, "balance_loss_mlp": 1.03549349, "epoch": 0.8747933263189539, "flos": 24462456854400.0, "grad_norm": 4.615872098871551, "language_loss": 0.83777928, "learning_rate": 1.5271118882453492e-07, "loss": 0.86181974, "num_input_tokens_seen": 313736795, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.73828125, "step": 14550, "time_per_iteration": 2.551783800125122 }, { "auxiliary_loss_clip": 0.0111468, "auxiliary_loss_mlp": 0.01037413, "balance_loss_clip": 1.02518201, "balance_loss_mlp": 1.03455925, "epoch": 0.8748534495716218, "flos": 13809161917440.0, "grad_norm": 1.648569593218714, "language_loss": 0.71955109, "learning_rate": 1.525664211592752e-07, "loss": 0.741072, "num_input_tokens_seen": 313754820, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 14551, "time_per_iteration": 2.5965068340301514 }, { "auxiliary_loss_clip": 0.01111292, "auxiliary_loss_mlp": 0.01278384, "balance_loss_clip": 1.01884043, "balance_loss_mlp": 1.03477263, "epoch": 0.8749135728242898, "flos": 29862092759040.0, "grad_norm": 1.946287251173769, "language_loss": 0.64161754, "learning_rate": 1.5242171942416726e-07, "loss": 0.66551435, "num_input_tokens_seen": 313775830, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.67578125, "step": 14552, "time_per_iteration": 2.681057929992676 }, { "auxiliary_loss_clip": 0.01132543, "auxiliary_loss_mlp": 0.01027357, "balance_loss_clip": 1.01287925, "balance_loss_mlp": 1.03452694, "epoch": 0.8749736960769577, "flos": 24133479166080.0, "grad_norm": 2.080528565111541, "language_loss": 0.79596603, "learning_rate": 1.522770836243765e-07, "loss": 0.81756496, "num_input_tokens_seen": 313795745, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.71484375, "step": 14553, "time_per_iteration": 2.7408154010772705 }, { "auxiliary_loss_clip": 0.01113938, "auxiliary_loss_mlp": 0.01027967, "balance_loss_clip": 1.01589155, "balance_loss_mlp": 1.03344834, "epoch": 0.8750338193296258, "flos": 17202540602880.0, "grad_norm": 2.0812689743541704, "language_loss": 0.69879872, "learning_rate": 1.5213251376506285e-07, "loss": 0.7202177, "num_input_tokens_seen": 313813895, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71484375, "step": 14554, "time_per_iteration": 3.8277533054351807 }, { "auxiliary_loss_clip": 0.01121517, "auxiliary_loss_mlp": 0.01273873, "balance_loss_clip": 1.01503658, "balance_loss_mlp": 1.03490067, "epoch": 0.8750939425822937, "flos": 23733218937600.0, "grad_norm": 1.4196066381047379, "language_loss": 0.83671474, "learning_rate": 1.5198800985138704e-07, "loss": 0.8606686, "num_input_tokens_seen": 313834225, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69140625, "step": 14555, "time_per_iteration": 2.524397850036621 }, { "auxiliary_loss_clip": 0.01112721, "auxiliary_loss_mlp": 0.01032794, "balance_loss_clip": 1.01988983, "balance_loss_mlp": 1.03481269, "epoch": 0.8751540658349617, "flos": 26541684552960.0, "grad_norm": 1.7807414586037338, "language_loss": 0.70625842, "learning_rate": 1.5184357188850516e-07, "loss": 0.72771353, "num_input_tokens_seen": 313854430, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 14556, "time_per_iteration": 2.5117146968841553 }, { "auxiliary_loss_clip": 0.01123918, "auxiliary_loss_mlp": 0.01035178, "balance_loss_clip": 1.02163601, "balance_loss_mlp": 1.03419471, "epoch": 0.8752141890876296, "flos": 19386806647680.0, "grad_norm": 2.1162397039442618, "language_loss": 0.76530755, "learning_rate": 1.5169919988157264e-07, "loss": 0.78689849, "num_input_tokens_seen": 313871600, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 14557, "time_per_iteration": 2.5186941623687744 }, { "auxiliary_loss_clip": 0.01132462, "auxiliary_loss_mlp": 0.01038985, "balance_loss_clip": 1.0256753, "balance_loss_mlp": 1.03635132, "epoch": 0.8752743123402976, "flos": 25374408278400.0, "grad_norm": 1.881424268222062, "language_loss": 0.82925737, "learning_rate": 1.5155489383574072e-07, "loss": 0.85097176, "num_input_tokens_seen": 313891570, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 14558, "time_per_iteration": 2.5457887649536133 }, { "auxiliary_loss_clip": 0.01022651, "auxiliary_loss_mlp": 0.01000314, "balance_loss_clip": 0.99904472, "balance_loss_mlp": 1.00145352, "epoch": 0.8753344355929655, "flos": 59952398578560.0, "grad_norm": 0.8093720495650348, "language_loss": 0.56073868, "learning_rate": 1.5141065375616036e-07, "loss": 0.58096838, "num_input_tokens_seen": 313951290, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.21191406, "step": 14559, "time_per_iteration": 4.469848394393921 }, { "auxiliary_loss_clip": 0.01111832, "auxiliary_loss_mlp": 0.01031427, "balance_loss_clip": 1.01917815, "balance_loss_mlp": 1.03453434, "epoch": 0.8753945588456336, "flos": 17894646835200.0, "grad_norm": 1.6258378840643295, "language_loss": 0.65787613, "learning_rate": 1.512664796479788e-07, "loss": 0.67930871, "num_input_tokens_seen": 313968645, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 14560, "time_per_iteration": 2.51603102684021 }, { "auxiliary_loss_clip": 0.01101345, "auxiliary_loss_mlp": 0.01028974, "balance_loss_clip": 1.01726127, "balance_loss_mlp": 1.03348041, "epoch": 0.8754546820983016, "flos": 30914885410560.0, "grad_norm": 2.2169688770261833, "language_loss": 0.78753984, "learning_rate": 1.5112237151634032e-07, "loss": 0.80884308, "num_input_tokens_seen": 313987580, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 14561, "time_per_iteration": 2.5782947540283203 }, { "auxiliary_loss_clip": 0.01104644, "auxiliary_loss_mlp": 0.01035174, "balance_loss_clip": 1.02171504, "balance_loss_mlp": 1.03280699, "epoch": 0.8755148053509695, "flos": 20631075724800.0, "grad_norm": 2.0323041561953947, "language_loss": 0.77625692, "learning_rate": 1.5097832936638889e-07, "loss": 0.79765511, "num_input_tokens_seen": 314004460, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 14562, "time_per_iteration": 2.521038770675659 }, { "auxiliary_loss_clip": 0.01096986, "auxiliary_loss_mlp": 0.01027199, "balance_loss_clip": 1.01582599, "balance_loss_mlp": 1.03211522, "epoch": 0.8755749286036375, "flos": 34969739005440.0, "grad_norm": 1.4656285308550023, "language_loss": 0.71613306, "learning_rate": 1.5083435320326453e-07, "loss": 0.73737484, "num_input_tokens_seen": 314026855, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6484375, "step": 14563, "time_per_iteration": 2.6643409729003906 }, { "auxiliary_loss_clip": 0.01108349, "auxiliary_loss_mlp": 0.01034139, "balance_loss_clip": 1.021384, "balance_loss_mlp": 1.03680241, "epoch": 0.8756350518563054, "flos": 18186456925440.0, "grad_norm": 3.990119460690863, "language_loss": 0.65662205, "learning_rate": 1.5069044303210588e-07, "loss": 0.67804694, "num_input_tokens_seen": 314042830, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 14564, "time_per_iteration": 2.496711015701294 }, { "auxiliary_loss_clip": 0.01110504, "auxiliary_loss_mlp": 0.01037146, "balance_loss_clip": 1.02456379, "balance_loss_mlp": 1.03228378, "epoch": 0.8756951751089734, "flos": 20084012611200.0, "grad_norm": 1.49016131922028, "language_loss": 0.70155555, "learning_rate": 1.5054659885804766e-07, "loss": 0.72303206, "num_input_tokens_seen": 314062225, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 14565, "time_per_iteration": 2.510411024093628 }, { "auxiliary_loss_clip": 0.01127572, "auxiliary_loss_mlp": 0.01033206, "balance_loss_clip": 1.01919937, "balance_loss_mlp": 1.03607643, "epoch": 0.8757552983616413, "flos": 27525241739520.0, "grad_norm": 1.6138067535410352, "language_loss": 0.77531981, "learning_rate": 1.5040282068622444e-07, "loss": 0.79692769, "num_input_tokens_seen": 314082325, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73828125, "step": 14566, "time_per_iteration": 2.54573655128479 }, { "auxiliary_loss_clip": 0.01124537, "auxiliary_loss_mlp": 0.01034126, "balance_loss_clip": 1.02168071, "balance_loss_mlp": 1.03774929, "epoch": 0.8758154216143094, "flos": 18073014796800.0, "grad_norm": 2.624104981648841, "language_loss": 0.71032649, "learning_rate": 1.502591085217668e-07, "loss": 0.73191315, "num_input_tokens_seen": 314100310, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 14567, "time_per_iteration": 3.9856314659118652 }, { "auxiliary_loss_clip": 0.01120403, "auxiliary_loss_mlp": 0.01283049, "balance_loss_clip": 1.02300954, "balance_loss_mlp": 1.0343554, "epoch": 0.8758755448669773, "flos": 25045681985280.0, "grad_norm": 1.858932950002807, "language_loss": 0.7393471, "learning_rate": 1.5011546236980355e-07, "loss": 0.76338166, "num_input_tokens_seen": 314121330, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.68359375, "step": 14568, "time_per_iteration": 2.5757482051849365 }, { "auxiliary_loss_clip": 0.01127809, "auxiliary_loss_mlp": 0.0102948, "balance_loss_clip": 1.01731467, "balance_loss_mlp": 1.03367877, "epoch": 0.8759356681196453, "flos": 22856818999680.0, "grad_norm": 2.201966894593352, "language_loss": 0.86810994, "learning_rate": 1.4997188223546053e-07, "loss": 0.88968283, "num_input_tokens_seen": 314139875, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.67578125, "step": 14569, "time_per_iteration": 4.051290988922119 }, { "auxiliary_loss_clip": 0.01128993, "auxiliary_loss_mlp": 0.0103282, "balance_loss_clip": 1.02060676, "balance_loss_mlp": 1.03279388, "epoch": 0.8759957913723132, "flos": 21032521102080.0, "grad_norm": 2.5421086354459046, "language_loss": 0.74161077, "learning_rate": 1.498283681238628e-07, "loss": 0.76322895, "num_input_tokens_seen": 314157850, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 14570, "time_per_iteration": 2.519458055496216 }, { "auxiliary_loss_clip": 0.01113936, "auxiliary_loss_mlp": 0.01277447, "balance_loss_clip": 1.01750016, "balance_loss_mlp": 1.0340817, "epoch": 0.8760559146249812, "flos": 20010467514240.0, "grad_norm": 1.9917476999223125, "language_loss": 0.67709076, "learning_rate": 1.496849200401309e-07, "loss": 0.70100456, "num_input_tokens_seen": 314176720, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 14571, "time_per_iteration": 2.509465217590332 }, { "auxiliary_loss_clip": 0.01128196, "auxiliary_loss_mlp": 0.01029607, "balance_loss_clip": 1.01725745, "balance_loss_mlp": 1.03356028, "epoch": 0.8761160378776491, "flos": 19974161842560.0, "grad_norm": 2.0303334825899806, "language_loss": 0.62722719, "learning_rate": 1.4954153798938474e-07, "loss": 0.64880526, "num_input_tokens_seen": 314196645, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.671875, "step": 14572, "time_per_iteration": 2.6606557369232178 }, { "auxiliary_loss_clip": 0.01108069, "auxiliary_loss_mlp": 0.0102846, "balance_loss_clip": 1.01732016, "balance_loss_mlp": 1.03355527, "epoch": 0.8761761611303172, "flos": 28804415857920.0, "grad_norm": 1.8247753867135121, "language_loss": 0.73340273, "learning_rate": 1.493982219767409e-07, "loss": 0.75476801, "num_input_tokens_seen": 314217430, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.65625, "step": 14573, "time_per_iteration": 2.6503331661224365 }, { "auxiliary_loss_clip": 0.01135512, "auxiliary_loss_mlp": 0.01277116, "balance_loss_clip": 1.01903749, "balance_loss_mlp": 1.03437543, "epoch": 0.8762362843829851, "flos": 18332505624960.0, "grad_norm": 2.6249145057668937, "language_loss": 0.73172873, "learning_rate": 1.4925497200731508e-07, "loss": 0.75585496, "num_input_tokens_seen": 314235310, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66015625, "step": 14574, "time_per_iteration": 2.6231000423431396 }, { "auxiliary_loss_clip": 0.0110907, "auxiliary_loss_mlp": 0.0103367, "balance_loss_clip": 1.02191615, "balance_loss_mlp": 1.03242886, "epoch": 0.8762964076356531, "flos": 15779149378560.0, "grad_norm": 1.6923900232615032, "language_loss": 0.76164627, "learning_rate": 1.4911178808621805e-07, "loss": 0.78307366, "num_input_tokens_seen": 314252355, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 14575, "time_per_iteration": 2.531074285507202 }, { "auxiliary_loss_clip": 0.01121815, "auxiliary_loss_mlp": 0.01035298, "balance_loss_clip": 1.02364516, "balance_loss_mlp": 1.03598309, "epoch": 0.8763565308883211, "flos": 33176754789120.0, "grad_norm": 2.7269597380673436, "language_loss": 0.66757411, "learning_rate": 1.4896867021855997e-07, "loss": 0.68914527, "num_input_tokens_seen": 314272755, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 14576, "time_per_iteration": 2.838840961456299 }, { "auxiliary_loss_clip": 0.01118457, "auxiliary_loss_mlp": 0.01028957, "balance_loss_clip": 1.01745355, "balance_loss_mlp": 1.03291178, "epoch": 0.876416654140989, "flos": 15888102307200.0, "grad_norm": 1.9476490246403002, "language_loss": 0.66676176, "learning_rate": 1.488256184094494e-07, "loss": 0.68823594, "num_input_tokens_seen": 314291365, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.67578125, "step": 14577, "time_per_iteration": 2.7441647052764893 }, { "auxiliary_loss_clip": 0.01113985, "auxiliary_loss_mlp": 0.0103329, "balance_loss_clip": 1.0200516, "balance_loss_mlp": 1.03608763, "epoch": 0.876476777393657, "flos": 25885237547520.0, "grad_norm": 2.5720852663393763, "language_loss": 0.71648622, "learning_rate": 1.4868263266398984e-07, "loss": 0.73795891, "num_input_tokens_seen": 314310075, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69140625, "step": 14578, "time_per_iteration": 2.658829689025879 }, { "auxiliary_loss_clip": 0.0110577, "auxiliary_loss_mlp": 0.01033503, "balance_loss_clip": 1.02130759, "balance_loss_mlp": 1.03459775, "epoch": 0.876536900646325, "flos": 13589675861760.0, "grad_norm": 2.0915528700939228, "language_loss": 0.71291775, "learning_rate": 1.485397129872854e-07, "loss": 0.73431039, "num_input_tokens_seen": 314325695, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 14579, "time_per_iteration": 2.6558821201324463 }, { "auxiliary_loss_clip": 0.01117085, "auxiliary_loss_mlp": 0.01029067, "balance_loss_clip": 1.01736629, "balance_loss_mlp": 1.03390241, "epoch": 0.876597023898993, "flos": 12203344494720.0, "grad_norm": 1.6089182994792914, "language_loss": 0.69734704, "learning_rate": 1.483968593844358e-07, "loss": 0.71880859, "num_input_tokens_seen": 314343605, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.65234375, "step": 14580, "time_per_iteration": 2.7239162921905518 }, { "auxiliary_loss_clip": 0.01101299, "auxiliary_loss_mlp": 0.01275502, "balance_loss_clip": 1.0166564, "balance_loss_mlp": 1.03482985, "epoch": 0.8766571471516609, "flos": 25336773803520.0, "grad_norm": 2.578118961880442, "language_loss": 0.65318549, "learning_rate": 1.4825407186054007e-07, "loss": 0.67695349, "num_input_tokens_seen": 314364275, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6640625, "step": 14581, "time_per_iteration": 2.692342758178711 }, { "auxiliary_loss_clip": 0.01118552, "auxiliary_loss_mlp": 0.01032327, "balance_loss_clip": 1.02045405, "balance_loss_mlp": 1.03363264, "epoch": 0.8767172704043289, "flos": 30113287545600.0, "grad_norm": 1.5485489608458716, "language_loss": 0.73888892, "learning_rate": 1.4811135042069257e-07, "loss": 0.76039773, "num_input_tokens_seen": 314385140, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 14582, "time_per_iteration": 2.8196921348571777 }, { "auxiliary_loss_clip": 0.01101159, "auxiliary_loss_mlp": 0.01277197, "balance_loss_clip": 1.01748729, "balance_loss_mlp": 1.03276038, "epoch": 0.8767773936569968, "flos": 19281157770240.0, "grad_norm": 2.4851841920754736, "language_loss": 0.66784889, "learning_rate": 1.4796869506998766e-07, "loss": 0.69163239, "num_input_tokens_seen": 314403715, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.68359375, "step": 14583, "time_per_iteration": 2.854379892349243 }, { "auxiliary_loss_clip": 0.01112844, "auxiliary_loss_mlp": 0.01272091, "balance_loss_clip": 1.01337767, "balance_loss_mlp": 1.03371692, "epoch": 0.8768375169096648, "flos": 21247230648960.0, "grad_norm": 1.66581993101986, "language_loss": 0.78917116, "learning_rate": 1.4782610581351596e-07, "loss": 0.81302053, "num_input_tokens_seen": 314421880, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.703125, "step": 14584, "time_per_iteration": 2.713770866394043 }, { "auxiliary_loss_clip": 0.0113051, "auxiliary_loss_mlp": 0.01027863, "balance_loss_clip": 1.01569784, "balance_loss_mlp": 1.0334866, "epoch": 0.8768976401623327, "flos": 23295539715840.0, "grad_norm": 2.006333324434569, "language_loss": 0.72183597, "learning_rate": 1.4768358265636626e-07, "loss": 0.74341965, "num_input_tokens_seen": 314441585, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.70703125, "step": 14585, "time_per_iteration": 2.7924201488494873 }, { "auxiliary_loss_clip": 0.01123095, "auxiliary_loss_mlp": 0.01031773, "balance_loss_clip": 1.01841521, "balance_loss_mlp": 1.03549922, "epoch": 0.8769577634150008, "flos": 21361247395200.0, "grad_norm": 2.119621949578132, "language_loss": 0.74599338, "learning_rate": 1.4754112560362452e-07, "loss": 0.767542, "num_input_tokens_seen": 314459020, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 14586, "time_per_iteration": 2.7581441402435303 }, { "auxiliary_loss_clip": 0.01101847, "auxiliary_loss_mlp": 0.01031161, "balance_loss_clip": 1.01826882, "balance_loss_mlp": 1.03320503, "epoch": 0.8770178866676687, "flos": 23514056104320.0, "grad_norm": 1.8066799372738203, "language_loss": 0.78637558, "learning_rate": 1.4739873466037534e-07, "loss": 0.80770564, "num_input_tokens_seen": 314478935, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 14587, "time_per_iteration": 2.812232255935669 }, { "auxiliary_loss_clip": 0.01111288, "auxiliary_loss_mlp": 0.0103129, "balance_loss_clip": 1.01724136, "balance_loss_mlp": 1.03383446, "epoch": 0.8770780099203367, "flos": 19719052473600.0, "grad_norm": 2.516801041515828, "language_loss": 0.73924375, "learning_rate": 1.4725640983169951e-07, "loss": 0.76066953, "num_input_tokens_seen": 314497635, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.6875, "step": 14588, "time_per_iteration": 2.7976882457733154 }, { "auxiliary_loss_clip": 0.01144142, "auxiliary_loss_mlp": 0.01277789, "balance_loss_clip": 1.01941013, "balance_loss_mlp": 1.03274143, "epoch": 0.8771381331730047, "flos": 21395901041280.0, "grad_norm": 1.7197870749424775, "language_loss": 0.66659713, "learning_rate": 1.4711415112267654e-07, "loss": 0.69081646, "num_input_tokens_seen": 314515445, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 14589, "time_per_iteration": 2.9573957920074463 }, { "auxiliary_loss_clip": 0.01112941, "auxiliary_loss_mlp": 0.01032511, "balance_loss_clip": 1.0204947, "balance_loss_mlp": 1.03564906, "epoch": 0.8771982564256726, "flos": 20261770041600.0, "grad_norm": 2.8396451089676336, "language_loss": 0.70383704, "learning_rate": 1.4697195853838373e-07, "loss": 0.72529161, "num_input_tokens_seen": 314533040, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 14590, "time_per_iteration": 2.81549334526062 }, { "auxiliary_loss_clip": 0.0112688, "auxiliary_loss_mlp": 0.01043401, "balance_loss_clip": 1.02983522, "balance_loss_mlp": 1.03512895, "epoch": 0.8772583796783406, "flos": 12489372495360.0, "grad_norm": 2.028185599227967, "language_loss": 0.75250876, "learning_rate": 1.4682983208389499e-07, "loss": 0.77421159, "num_input_tokens_seen": 314548280, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 14591, "time_per_iteration": 2.755202054977417 }, { "auxiliary_loss_clip": 0.01097863, "auxiliary_loss_mlp": 0.01024831, "balance_loss_clip": 1.0136497, "balance_loss_mlp": 1.03316283, "epoch": 0.8773185029310085, "flos": 15921103927680.0, "grad_norm": 1.61147800134011, "language_loss": 0.79999924, "learning_rate": 1.4668777176428247e-07, "loss": 0.82122624, "num_input_tokens_seen": 314565345, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6484375, "step": 14592, "time_per_iteration": 2.773277521133423 }, { "auxiliary_loss_clip": 0.01099046, "auxiliary_loss_mlp": 0.01031239, "balance_loss_clip": 1.02028334, "balance_loss_mlp": 1.03350091, "epoch": 0.8773786261836766, "flos": 21504530747520.0, "grad_norm": 1.9964956281540989, "language_loss": 0.82689655, "learning_rate": 1.4654577758461595e-07, "loss": 0.84819937, "num_input_tokens_seen": 314584190, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.65625, "step": 14593, "time_per_iteration": 2.7910797595977783 }, { "auxiliary_loss_clip": 0.01111421, "auxiliary_loss_mlp": 0.0102832, "balance_loss_clip": 1.01688218, "balance_loss_mlp": 1.03476334, "epoch": 0.8774387494363445, "flos": 26761493831040.0, "grad_norm": 1.526955429605408, "language_loss": 0.75810707, "learning_rate": 1.4640384954996354e-07, "loss": 0.77950454, "num_input_tokens_seen": 314605625, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.67578125, "step": 14594, "time_per_iteration": 2.864478588104248 }, { "auxiliary_loss_clip": 0.01139284, "auxiliary_loss_mlp": 0.01031362, "balance_loss_clip": 1.0188868, "balance_loss_mlp": 1.03369999, "epoch": 0.8774988726890125, "flos": 18478841633280.0, "grad_norm": 2.5785280558635435, "language_loss": 0.77871501, "learning_rate": 1.46261987665389e-07, "loss": 0.80042154, "num_input_tokens_seen": 314622630, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 14595, "time_per_iteration": 4.271026849746704 }, { "auxiliary_loss_clip": 0.01129024, "auxiliary_loss_mlp": 0.01033903, "balance_loss_clip": 1.0215174, "balance_loss_mlp": 1.03451085, "epoch": 0.8775589959416804, "flos": 24426366664320.0, "grad_norm": 1.3756671690128521, "language_loss": 0.70679498, "learning_rate": 1.4612019193595626e-07, "loss": 0.72842419, "num_input_tokens_seen": 314642460, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 14596, "time_per_iteration": 2.884512424468994 }, { "auxiliary_loss_clip": 0.01128403, "auxiliary_loss_mlp": 0.01025461, "balance_loss_clip": 1.01403463, "balance_loss_mlp": 1.03465712, "epoch": 0.8776191191943484, "flos": 23440151871360.0, "grad_norm": 1.7715055316578279, "language_loss": 0.85697162, "learning_rate": 1.4597846236672505e-07, "loss": 0.87851024, "num_input_tokens_seen": 314659875, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.67578125, "step": 14597, "time_per_iteration": 2.9668819904327393 }, { "auxiliary_loss_clip": 0.01111657, "auxiliary_loss_mlp": 0.01031949, "balance_loss_clip": 1.02036738, "balance_loss_mlp": 1.03480625, "epoch": 0.8776792424470163, "flos": 26830872950400.0, "grad_norm": 1.6835636296804415, "language_loss": 0.72824645, "learning_rate": 1.4583679896275336e-07, "loss": 0.74968249, "num_input_tokens_seen": 314680260, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 14598, "time_per_iteration": 2.9125661849975586 }, { "auxiliary_loss_clip": 0.01113706, "auxiliary_loss_mlp": 0.01277581, "balance_loss_clip": 1.01843238, "balance_loss_mlp": 1.03567171, "epoch": 0.8777393656996844, "flos": 15626169354240.0, "grad_norm": 2.1658492209748794, "language_loss": 0.77476597, "learning_rate": 1.4569520172909644e-07, "loss": 0.79867887, "num_input_tokens_seen": 314696260, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 14599, "time_per_iteration": 2.8189523220062256 }, { "auxiliary_loss_clip": 0.01128095, "auxiliary_loss_mlp": 0.01026819, "balance_loss_clip": 1.01484466, "balance_loss_mlp": 1.03283679, "epoch": 0.8777994889523523, "flos": 18879999701760.0, "grad_norm": 2.467631231225996, "language_loss": 0.67762595, "learning_rate": 1.4555367067080849e-07, "loss": 0.69917512, "num_input_tokens_seen": 314714215, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 14600, "time_per_iteration": 2.8444743156433105 }, { "auxiliary_loss_clip": 0.01111135, "auxiliary_loss_mlp": 0.0103716, "balance_loss_clip": 1.02475023, "balance_loss_mlp": 1.0326426, "epoch": 0.8778596122050203, "flos": 48826516400640.0, "grad_norm": 1.7677937440973062, "language_loss": 0.69472528, "learning_rate": 1.4541220579293966e-07, "loss": 0.71620822, "num_input_tokens_seen": 314735700, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 14601, "time_per_iteration": 4.340956211090088 }, { "auxiliary_loss_clip": 0.01114963, "auxiliary_loss_mlp": 0.01032552, "balance_loss_clip": 1.02014279, "balance_loss_mlp": 1.03474593, "epoch": 0.8779197354576883, "flos": 25660184883840.0, "grad_norm": 1.7541572162648709, "language_loss": 0.73310983, "learning_rate": 1.4527080710053862e-07, "loss": 0.75458503, "num_input_tokens_seen": 314753335, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 14602, "time_per_iteration": 2.958986759185791 }, { "auxiliary_loss_clip": 0.01100437, "auxiliary_loss_mlp": 0.01034597, "balance_loss_clip": 1.02266431, "balance_loss_mlp": 1.03316116, "epoch": 0.8779798587103562, "flos": 18843227153280.0, "grad_norm": 1.8963277790133668, "language_loss": 0.70875573, "learning_rate": 1.4512947459865134e-07, "loss": 0.73010606, "num_input_tokens_seen": 314770800, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 14603, "time_per_iteration": 2.8527956008911133 }, { "auxiliary_loss_clip": 0.01039777, "auxiliary_loss_mlp": 0.01003458, "balance_loss_clip": 1.00221789, "balance_loss_mlp": 1.00135827, "epoch": 0.8780399819630242, "flos": 68613119377920.0, "grad_norm": 0.7324106751928274, "language_loss": 0.54175651, "learning_rate": 1.449882082923224e-07, "loss": 0.56218886, "num_input_tokens_seen": 314837275, "router_z_loss_clip": 0.01239014, "router_z_loss_mlp": 0.21289062, "step": 14604, "time_per_iteration": 3.467463731765747 }, { "auxiliary_loss_clip": 0.01049446, "auxiliary_loss_mlp": 0.01001623, "balance_loss_clip": 1.00032926, "balance_loss_mlp": 1.00120735, "epoch": 0.8781001052156922, "flos": 65734807766400.0, "grad_norm": 0.7146042149907468, "language_loss": 0.59295213, "learning_rate": 1.4484700818659223e-07, "loss": 0.61346281, "num_input_tokens_seen": 314902220, "router_z_loss_clip": 0.01293945, "router_z_loss_mlp": 0.21289062, "step": 14605, "time_per_iteration": 3.412799119949341 }, { "auxiliary_loss_clip": 0.01139732, "auxiliary_loss_mlp": 0.0103151, "balance_loss_clip": 1.01865923, "balance_loss_mlp": 1.03393745, "epoch": 0.8781602284683602, "flos": 22049654526720.0, "grad_norm": 1.5944318842048044, "language_loss": 0.85183227, "learning_rate": 1.4470587428649994e-07, "loss": 0.87354469, "num_input_tokens_seen": 314921645, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 14606, "time_per_iteration": 2.8615341186523438 }, { "auxiliary_loss_clip": 0.011063, "auxiliary_loss_mlp": 0.01030606, "balance_loss_clip": 1.01768386, "balance_loss_mlp": 1.03589511, "epoch": 0.8782203517210281, "flos": 17562939713280.0, "grad_norm": 3.5358112334124994, "language_loss": 0.7025224, "learning_rate": 1.4456480659708304e-07, "loss": 0.7238915, "num_input_tokens_seen": 314939390, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 14607, "time_per_iteration": 2.724405288696289 }, { "auxiliary_loss_clip": 0.01119504, "auxiliary_loss_mlp": 0.0103862, "balance_loss_clip": 1.02730155, "balance_loss_mlp": 1.03441358, "epoch": 0.8782804749736961, "flos": 25520421064320.0, "grad_norm": 1.7834759584071633, "language_loss": 0.72122157, "learning_rate": 1.444238051233755e-07, "loss": 0.7428028, "num_input_tokens_seen": 314959205, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 14608, "time_per_iteration": 5.3814966678619385 }, { "auxiliary_loss_clip": 0.01116527, "auxiliary_loss_mlp": 0.01032623, "balance_loss_clip": 1.0202198, "balance_loss_mlp": 1.03517294, "epoch": 0.878340598226364, "flos": 21798747048960.0, "grad_norm": 2.002455745355996, "language_loss": 0.87283683, "learning_rate": 1.4428286987040928e-07, "loss": 0.89432836, "num_input_tokens_seen": 314977485, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.72265625, "step": 14609, "time_per_iteration": 2.961911916732788 }, { "auxiliary_loss_clip": 0.01140598, "auxiliary_loss_mlp": 0.01027584, "balance_loss_clip": 1.01575792, "balance_loss_mlp": 1.03584862, "epoch": 0.878400721479032, "flos": 21102403011840.0, "grad_norm": 1.4842925970224576, "language_loss": 0.70407307, "learning_rate": 1.4414200084321349e-07, "loss": 0.72575492, "num_input_tokens_seen": 314997830, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 14610, "time_per_iteration": 4.65478777885437 }, { "auxiliary_loss_clip": 0.01110547, "auxiliary_loss_mlp": 0.01277367, "balance_loss_clip": 1.01860321, "balance_loss_mlp": 1.03473604, "epoch": 0.8784608447316999, "flos": 20923532259840.0, "grad_norm": 2.2247791260753855, "language_loss": 0.80313993, "learning_rate": 1.4400119804681654e-07, "loss": 0.8270191, "num_input_tokens_seen": 315016480, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6640625, "step": 14611, "time_per_iteration": 2.878422260284424 }, { "auxiliary_loss_clip": 0.01104846, "auxiliary_loss_mlp": 0.01034591, "balance_loss_clip": 1.02241337, "balance_loss_mlp": 1.03528237, "epoch": 0.878520967984368, "flos": 23330660238720.0, "grad_norm": 1.7166041381431227, "language_loss": 0.76761812, "learning_rate": 1.43860461486242e-07, "loss": 0.78901249, "num_input_tokens_seen": 315036135, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 14612, "time_per_iteration": 2.7607407569885254 }, { "auxiliary_loss_clip": 0.01137384, "auxiliary_loss_mlp": 0.01033177, "balance_loss_clip": 1.02080333, "balance_loss_mlp": 1.03429973, "epoch": 0.8785810912370359, "flos": 25518984520320.0, "grad_norm": 1.4279637505781841, "language_loss": 0.72536546, "learning_rate": 1.4371979116651334e-07, "loss": 0.74707103, "num_input_tokens_seen": 315057995, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.67578125, "step": 14613, "time_per_iteration": 3.0381221771240234 }, { "auxiliary_loss_clip": 0.01048948, "auxiliary_loss_mlp": 0.01000841, "balance_loss_clip": 0.99953556, "balance_loss_mlp": 1.00113297, "epoch": 0.8786412144897039, "flos": 61841047691520.0, "grad_norm": 1.0378868287402792, "language_loss": 0.64609504, "learning_rate": 1.435791870926506e-07, "loss": 0.6665929, "num_input_tokens_seen": 315104010, "router_z_loss_clip": 0.01306152, "router_z_loss_mlp": 0.2109375, "step": 14614, "time_per_iteration": 3.238917350769043 }, { "auxiliary_loss_clip": 0.01022481, "auxiliary_loss_mlp": 0.01002159, "balance_loss_clip": 1.0009135, "balance_loss_mlp": 1.00140858, "epoch": 0.8787013377423719, "flos": 70989364638720.0, "grad_norm": 0.6645274280740111, "language_loss": 0.58637959, "learning_rate": 1.4343864926967109e-07, "loss": 0.60662597, "num_input_tokens_seen": 315174550, "router_z_loss_clip": 0.01245117, "router_z_loss_mlp": 0.2109375, "step": 14615, "time_per_iteration": 3.3739984035491943 }, { "auxiliary_loss_clip": 0.01137409, "auxiliary_loss_mlp": 0.01032891, "balance_loss_clip": 1.02067769, "balance_loss_mlp": 1.03301716, "epoch": 0.8787614609950398, "flos": 17347404153600.0, "grad_norm": 1.9063747336188537, "language_loss": 0.72855747, "learning_rate": 1.4329817770259035e-07, "loss": 0.75026047, "num_input_tokens_seen": 315191825, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 14616, "time_per_iteration": 2.8485469818115234 }, { "auxiliary_loss_clip": 0.01022477, "auxiliary_loss_mlp": 0.01006118, "balance_loss_clip": 1.00493801, "balance_loss_mlp": 1.00132465, "epoch": 0.8788215842477078, "flos": 72511401588480.0, "grad_norm": 0.8156441552813223, "language_loss": 0.57956862, "learning_rate": 1.4315777239642212e-07, "loss": 0.59985459, "num_input_tokens_seen": 315255075, "router_z_loss_clip": 0.01177979, "router_z_loss_mlp": 0.2109375, "step": 14617, "time_per_iteration": 3.474886178970337 }, { "auxiliary_loss_clip": 0.01117698, "auxiliary_loss_mlp": 0.01028979, "balance_loss_clip": 1.01702261, "balance_loss_mlp": 1.03296018, "epoch": 0.8788817075003758, "flos": 24827452905600.0, "grad_norm": 1.6769221480715504, "language_loss": 0.83227158, "learning_rate": 1.4301743335617645e-07, "loss": 0.85373831, "num_input_tokens_seen": 315273995, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.66796875, "step": 14618, "time_per_iteration": 2.7982797622680664 }, { "auxiliary_loss_clip": 0.01115817, "auxiliary_loss_mlp": 0.01028828, "balance_loss_clip": 1.01722908, "balance_loss_mlp": 1.03358817, "epoch": 0.8789418307530438, "flos": 22638769488000.0, "grad_norm": 1.603320163889601, "language_loss": 0.69026947, "learning_rate": 1.4287716058686195e-07, "loss": 0.71171594, "num_input_tokens_seen": 315294485, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.64453125, "step": 14619, "time_per_iteration": 2.823373317718506 }, { "auxiliary_loss_clip": 0.01141798, "auxiliary_loss_mlp": 0.01036545, "balance_loss_clip": 1.02439117, "balance_loss_mlp": 1.03521645, "epoch": 0.8790019540057117, "flos": 19785738072960.0, "grad_norm": 1.7297530650428354, "language_loss": 0.77136314, "learning_rate": 1.4273695409348441e-07, "loss": 0.79314661, "num_input_tokens_seen": 315310420, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 14620, "time_per_iteration": 3.057891845703125 }, { "auxiliary_loss_clip": 0.01123184, "auxiliary_loss_mlp": 0.01033739, "balance_loss_clip": 1.02028072, "balance_loss_mlp": 1.03507662, "epoch": 0.8790620772583797, "flos": 20229774001920.0, "grad_norm": 1.649755589874542, "language_loss": 0.79022324, "learning_rate": 1.4259681388104738e-07, "loss": 0.81179243, "num_input_tokens_seen": 315330110, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69921875, "step": 14621, "time_per_iteration": 2.633972406387329 }, { "auxiliary_loss_clip": 0.01121907, "auxiliary_loss_mlp": 0.01035903, "balance_loss_clip": 1.02380347, "balance_loss_mlp": 1.03350079, "epoch": 0.8791222005110476, "flos": 24130785646080.0, "grad_norm": 2.0625630945181443, "language_loss": 0.66654932, "learning_rate": 1.4245673995455242e-07, "loss": 0.68812746, "num_input_tokens_seen": 315350080, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 14622, "time_per_iteration": 3.127586841583252 }, { "auxiliary_loss_clip": 0.01121425, "auxiliary_loss_mlp": 0.01034934, "balance_loss_clip": 1.02318573, "balance_loss_mlp": 1.03565133, "epoch": 0.8791823237637156, "flos": 21614201948160.0, "grad_norm": 1.9558094274193216, "language_loss": 0.73070705, "learning_rate": 1.4231673231899798e-07, "loss": 0.75227058, "num_input_tokens_seen": 315366360, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 14623, "time_per_iteration": 2.633408308029175 }, { "auxiliary_loss_clip": 0.01133375, "auxiliary_loss_mlp": 0.01031058, "balance_loss_clip": 1.01908934, "balance_loss_mlp": 1.03036332, "epoch": 0.8792424470163835, "flos": 24243401761920.0, "grad_norm": 1.7181951679237069, "language_loss": 0.78450751, "learning_rate": 1.4217679097938118e-07, "loss": 0.80615187, "num_input_tokens_seen": 315385890, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 14624, "time_per_iteration": 3.0267271995544434 }, { "auxiliary_loss_clip": 0.01113108, "auxiliary_loss_mlp": 0.0103339, "balance_loss_clip": 1.0200268, "balance_loss_mlp": 1.03413761, "epoch": 0.8793025702690516, "flos": 24893204751360.0, "grad_norm": 1.7304634906225078, "language_loss": 0.79922235, "learning_rate": 1.4203691594069555e-07, "loss": 0.82068729, "num_input_tokens_seen": 315403400, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 14625, "time_per_iteration": 2.618826389312744 }, { "auxiliary_loss_clip": 0.01117524, "auxiliary_loss_mlp": 0.0103641, "balance_loss_clip": 1.02456713, "balance_loss_mlp": 1.03412151, "epoch": 0.8793626935217195, "flos": 25373115388800.0, "grad_norm": 1.6442604327516124, "language_loss": 0.73976284, "learning_rate": 1.4189710720793336e-07, "loss": 0.76130217, "num_input_tokens_seen": 315423670, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66015625, "step": 14626, "time_per_iteration": 3.1041243076324463 }, { "auxiliary_loss_clip": 0.01112933, "auxiliary_loss_mlp": 0.01033691, "balance_loss_clip": 1.02117419, "balance_loss_mlp": 1.03310156, "epoch": 0.8794228167743875, "flos": 34678000742400.0, "grad_norm": 2.341604391711364, "language_loss": 0.70826983, "learning_rate": 1.4175736478608346e-07, "loss": 0.72973609, "num_input_tokens_seen": 315446265, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 14627, "time_per_iteration": 2.653160572052002 }, { "auxiliary_loss_clip": 0.01129678, "auxiliary_loss_mlp": 0.01032426, "balance_loss_clip": 1.01971221, "balance_loss_mlp": 1.03372073, "epoch": 0.8794829400270555, "flos": 19464014931840.0, "grad_norm": 1.7246978663101675, "language_loss": 0.72203135, "learning_rate": 1.4161768868013392e-07, "loss": 0.7436524, "num_input_tokens_seen": 315464655, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 14628, "time_per_iteration": 3.177154064178467 }, { "auxiliary_loss_clip": 0.01129526, "auxiliary_loss_mlp": 0.0103648, "balance_loss_clip": 1.0237422, "balance_loss_mlp": 1.03404081, "epoch": 0.8795430632797234, "flos": 15231403906560.0, "grad_norm": 4.011506523903523, "language_loss": 0.68904579, "learning_rate": 1.41478078895068e-07, "loss": 0.71070582, "num_input_tokens_seen": 315481090, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 14629, "time_per_iteration": 2.5827276706695557 }, { "auxiliary_loss_clip": 0.01116548, "auxiliary_loss_mlp": 0.01030916, "balance_loss_clip": 1.01975191, "balance_loss_mlp": 1.03229284, "epoch": 0.8796031865323914, "flos": 27744727795200.0, "grad_norm": 1.6256214193647982, "language_loss": 0.69136965, "learning_rate": 1.4133853543586915e-07, "loss": 0.71284437, "num_input_tokens_seen": 315502010, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6640625, "step": 14630, "time_per_iteration": 3.002864122390747 }, { "auxiliary_loss_clip": 0.01116862, "auxiliary_loss_mlp": 0.01036536, "balance_loss_clip": 1.02386415, "balance_loss_mlp": 1.03358233, "epoch": 0.8796633097850594, "flos": 31285412156160.0, "grad_norm": 1.560995688588532, "language_loss": 0.74295497, "learning_rate": 1.411990583075169e-07, "loss": 0.76448888, "num_input_tokens_seen": 315523040, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.65625, "step": 14631, "time_per_iteration": 2.883204460144043 }, { "auxiliary_loss_clip": 0.01117896, "auxiliary_loss_mlp": 0.01280796, "balance_loss_clip": 1.02116072, "balance_loss_mlp": 1.03480124, "epoch": 0.8797234330377274, "flos": 24243150366720.0, "grad_norm": 2.0236328922693665, "language_loss": 0.69200993, "learning_rate": 1.410596475149888e-07, "loss": 0.71599686, "num_input_tokens_seen": 315541865, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.65234375, "step": 14632, "time_per_iteration": 2.7649738788604736 }, { "auxiliary_loss_clip": 0.01121122, "auxiliary_loss_mlp": 0.01027038, "balance_loss_clip": 1.01597571, "balance_loss_mlp": 1.03534722, "epoch": 0.8797835562903953, "flos": 24167414540160.0, "grad_norm": 1.8265485381127744, "language_loss": 0.65119946, "learning_rate": 1.4092030306325974e-07, "loss": 0.67268109, "num_input_tokens_seen": 315561470, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6796875, "step": 14633, "time_per_iteration": 2.672962188720703 }, { "auxiliary_loss_clip": 0.01128596, "auxiliary_loss_mlp": 0.01034002, "balance_loss_clip": 1.02214706, "balance_loss_mlp": 1.03436542, "epoch": 0.8798436795430633, "flos": 19284677303040.0, "grad_norm": 1.4202806311626672, "language_loss": 0.84234095, "learning_rate": 1.4078102495730338e-07, "loss": 0.86396694, "num_input_tokens_seen": 315583140, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 14634, "time_per_iteration": 2.5919909477233887 }, { "auxiliary_loss_clip": 0.01119512, "auxiliary_loss_mlp": 0.01280171, "balance_loss_clip": 1.02100301, "balance_loss_mlp": 1.032722, "epoch": 0.8799038027957312, "flos": 28179390274560.0, "grad_norm": 1.6408034768087625, "language_loss": 0.79888511, "learning_rate": 1.4064181320208968e-07, "loss": 0.82288194, "num_input_tokens_seen": 315601935, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 14635, "time_per_iteration": 2.6736600399017334 }, { "auxiliary_loss_clip": 0.01121392, "auxiliary_loss_mlp": 0.01026652, "balance_loss_clip": 1.01446235, "balance_loss_mlp": 1.03395057, "epoch": 0.8799639260483992, "flos": 24644703484800.0, "grad_norm": 3.5760953443821473, "language_loss": 0.65222383, "learning_rate": 1.405026678025869e-07, "loss": 0.67370427, "num_input_tokens_seen": 315619995, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 14636, "time_per_iteration": 2.572443723678589 }, { "auxiliary_loss_clip": 0.01121678, "auxiliary_loss_mlp": 0.01035931, "balance_loss_clip": 1.02310979, "balance_loss_mlp": 1.03421009, "epoch": 0.8800240493010671, "flos": 22200479735040.0, "grad_norm": 1.8266590569861074, "language_loss": 0.70460153, "learning_rate": 1.4036358876376065e-07, "loss": 0.72617769, "num_input_tokens_seen": 315637895, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 14637, "time_per_iteration": 3.933027744293213 }, { "auxiliary_loss_clip": 0.01122255, "auxiliary_loss_mlp": 0.01031076, "balance_loss_clip": 1.01925671, "balance_loss_mlp": 1.03723431, "epoch": 0.8800841725537352, "flos": 14246086953600.0, "grad_norm": 1.8814245097589801, "language_loss": 0.65818989, "learning_rate": 1.4022457609057447e-07, "loss": 0.6797232, "num_input_tokens_seen": 315655520, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 14638, "time_per_iteration": 2.5151784420013428 }, { "auxiliary_loss_clip": 0.01112134, "auxiliary_loss_mlp": 0.01029031, "balance_loss_clip": 1.01720548, "balance_loss_mlp": 1.03429627, "epoch": 0.8801442958064031, "flos": 37415794348800.0, "grad_norm": 2.094152179325316, "language_loss": 0.58299971, "learning_rate": 1.400856297879891e-07, "loss": 0.60441136, "num_input_tokens_seen": 315678955, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69140625, "step": 14639, "time_per_iteration": 2.6103382110595703 }, { "auxiliary_loss_clip": 0.01121806, "auxiliary_loss_mlp": 0.01036511, "balance_loss_clip": 1.02369022, "balance_loss_mlp": 1.03401279, "epoch": 0.8802044190590711, "flos": 25047334010880.0, "grad_norm": 1.4415750611879699, "language_loss": 0.74434102, "learning_rate": 1.3994674986096322e-07, "loss": 0.76592422, "num_input_tokens_seen": 315700360, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 14640, "time_per_iteration": 2.599071979522705 }, { "auxiliary_loss_clip": 0.01121508, "auxiliary_loss_mlp": 0.01040354, "balance_loss_clip": 1.02772379, "balance_loss_mlp": 1.03404069, "epoch": 0.8802645423117391, "flos": 24133874215680.0, "grad_norm": 2.0675688067831226, "language_loss": 0.69612467, "learning_rate": 1.3980793631445377e-07, "loss": 0.71774334, "num_input_tokens_seen": 315719270, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 14641, "time_per_iteration": 2.586221218109131 }, { "auxiliary_loss_clip": 0.0111181, "auxiliary_loss_mlp": 0.01027704, "balance_loss_clip": 1.01462698, "balance_loss_mlp": 1.03255188, "epoch": 0.880324665564407, "flos": 17931203902080.0, "grad_norm": 1.956599228364541, "language_loss": 0.85189378, "learning_rate": 1.3966918915341342e-07, "loss": 0.87328887, "num_input_tokens_seen": 315737425, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 14642, "time_per_iteration": 2.550687313079834 }, { "auxiliary_loss_clip": 0.01138997, "auxiliary_loss_mlp": 0.01034442, "balance_loss_clip": 1.02107871, "balance_loss_mlp": 1.03599572, "epoch": 0.880384788817075, "flos": 21287630471040.0, "grad_norm": 1.6976348678882809, "language_loss": 0.78841931, "learning_rate": 1.3953050838279446e-07, "loss": 0.81015366, "num_input_tokens_seen": 315755725, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6796875, "step": 14643, "time_per_iteration": 3.9420835971832275 }, { "auxiliary_loss_clip": 0.01105018, "auxiliary_loss_mlp": 0.0102561, "balance_loss_clip": 1.01426125, "balance_loss_mlp": 1.03117239, "epoch": 0.880444912069743, "flos": 12458489777280.0, "grad_norm": 1.676460914648308, "language_loss": 0.72953904, "learning_rate": 1.3939189400754603e-07, "loss": 0.75084531, "num_input_tokens_seen": 315773835, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6484375, "step": 14644, "time_per_iteration": 2.4786839485168457 }, { "auxiliary_loss_clip": 0.01104628, "auxiliary_loss_mlp": 0.01278752, "balance_loss_clip": 1.01877403, "balance_loss_mlp": 1.0342648, "epoch": 0.880505035322411, "flos": 13625945619840.0, "grad_norm": 1.9553971239787087, "language_loss": 0.79317081, "learning_rate": 1.392533460326153e-07, "loss": 0.81700462, "num_input_tokens_seen": 315790615, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 14645, "time_per_iteration": 2.4767134189605713 }, { "auxiliary_loss_clip": 0.01110266, "auxiliary_loss_mlp": 0.01037174, "balance_loss_clip": 1.02434111, "balance_loss_mlp": 1.03408766, "epoch": 0.8805651585750789, "flos": 23183067254400.0, "grad_norm": 1.5914853034267018, "language_loss": 0.64514291, "learning_rate": 1.391148644629454e-07, "loss": 0.66661727, "num_input_tokens_seen": 315811010, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.671875, "step": 14646, "time_per_iteration": 2.5372698307037354 }, { "auxiliary_loss_clip": 0.01125825, "auxiliary_loss_mlp": 0.01033327, "balance_loss_clip": 1.02044678, "balance_loss_mlp": 1.03549492, "epoch": 0.8806252818277469, "flos": 14903000835840.0, "grad_norm": 1.9139928416085215, "language_loss": 0.768327, "learning_rate": 1.3897644930347973e-07, "loss": 0.78991848, "num_input_tokens_seen": 315828130, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 14647, "time_per_iteration": 2.4996612071990967 }, { "auxiliary_loss_clip": 0.01099621, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 1.02055025, "balance_loss_mlp": 1.0335058, "epoch": 0.8806854050804148, "flos": 17639178330240.0, "grad_norm": 1.909929954559407, "language_loss": 0.75275099, "learning_rate": 1.388381005591577e-07, "loss": 0.77406907, "num_input_tokens_seen": 315844900, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 14648, "time_per_iteration": 2.452669382095337 }, { "auxiliary_loss_clip": 0.01120518, "auxiliary_loss_mlp": 0.01030338, "balance_loss_clip": 1.018435, "balance_loss_mlp": 1.03292823, "epoch": 0.8807455283330828, "flos": 25332392344320.0, "grad_norm": 2.011002526957072, "language_loss": 0.65463531, "learning_rate": 1.3869981823491595e-07, "loss": 0.67614388, "num_input_tokens_seen": 315863745, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 14649, "time_per_iteration": 2.5825581550598145 }, { "auxiliary_loss_clip": 0.01129788, "auxiliary_loss_mlp": 0.01032823, "balance_loss_clip": 1.02009761, "balance_loss_mlp": 1.03512192, "epoch": 0.8808056515857507, "flos": 25265168040960.0, "grad_norm": 1.3190472472086854, "language_loss": 0.62430412, "learning_rate": 1.3856160233569015e-07, "loss": 0.64593029, "num_input_tokens_seen": 315885765, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 14650, "time_per_iteration": 4.10599422454834 }, { "auxiliary_loss_clip": 0.0111772, "auxiliary_loss_mlp": 0.01032364, "balance_loss_clip": 1.02083087, "balance_loss_mlp": 1.03333426, "epoch": 0.8808657748384188, "flos": 26578852151040.0, "grad_norm": 1.3883539623564465, "language_loss": 0.72783083, "learning_rate": 1.384234528664121e-07, "loss": 0.74933171, "num_input_tokens_seen": 315907340, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 14651, "time_per_iteration": 2.5688085556030273 }, { "auxiliary_loss_clip": 0.0111417, "auxiliary_loss_mlp": 0.01031901, "balance_loss_clip": 1.01858568, "balance_loss_mlp": 1.0352056, "epoch": 0.8809258980910867, "flos": 18661231918080.0, "grad_norm": 1.7430784339420686, "language_loss": 0.7179184, "learning_rate": 1.382853698320132e-07, "loss": 0.73937905, "num_input_tokens_seen": 315924935, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 14652, "time_per_iteration": 4.040740966796875 }, { "auxiliary_loss_clip": 0.01109299, "auxiliary_loss_mlp": 0.01033604, "balance_loss_clip": 1.02131343, "balance_loss_mlp": 1.0323782, "epoch": 0.8809860213437547, "flos": 19792274348160.0, "grad_norm": 1.9548276346761089, "language_loss": 0.74829304, "learning_rate": 1.3814735323741977e-07, "loss": 0.76972204, "num_input_tokens_seen": 315943165, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 14653, "time_per_iteration": 2.50229811668396 }, { "auxiliary_loss_clip": 0.01105326, "auxiliary_loss_mlp": 0.01032443, "balance_loss_clip": 1.01958013, "balance_loss_mlp": 1.03400278, "epoch": 0.8810461445964227, "flos": 17894467267200.0, "grad_norm": 2.1877022795123655, "language_loss": 0.70984572, "learning_rate": 1.380094030875585e-07, "loss": 0.7312234, "num_input_tokens_seen": 315961340, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 14654, "time_per_iteration": 2.4894325733184814 }, { "auxiliary_loss_clip": 0.01138075, "auxiliary_loss_mlp": 0.01036982, "balance_loss_clip": 1.02366614, "balance_loss_mlp": 1.03394639, "epoch": 0.8811062678490906, "flos": 29163917128320.0, "grad_norm": 2.106877192686228, "language_loss": 0.71478295, "learning_rate": 1.378715193873521e-07, "loss": 0.73653352, "num_input_tokens_seen": 315981335, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6875, "step": 14655, "time_per_iteration": 2.570467710494995 }, { "auxiliary_loss_clip": 0.01137473, "auxiliary_loss_mlp": 0.0103041, "balance_loss_clip": 1.01833999, "balance_loss_mlp": 1.03422201, "epoch": 0.8811663911017587, "flos": 20338834671360.0, "grad_norm": 2.015808298135483, "language_loss": 0.81608891, "learning_rate": 1.377337021417211e-07, "loss": 0.83776766, "num_input_tokens_seen": 316001325, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 14656, "time_per_iteration": 2.5644161701202393 }, { "auxiliary_loss_clip": 0.01120778, "auxiliary_loss_mlp": 0.01033123, "balance_loss_clip": 1.02168465, "balance_loss_mlp": 1.03426051, "epoch": 0.8812265143544266, "flos": 25885704424320.0, "grad_norm": 1.6332699050427837, "language_loss": 0.68741572, "learning_rate": 1.3759595135558355e-07, "loss": 0.70895475, "num_input_tokens_seen": 316022540, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.68359375, "step": 14657, "time_per_iteration": 2.5504157543182373 }, { "auxiliary_loss_clip": 0.01122298, "auxiliary_loss_mlp": 0.01035345, "balance_loss_clip": 1.0232867, "balance_loss_mlp": 1.03460431, "epoch": 0.8812866376070946, "flos": 27195509865600.0, "grad_norm": 1.852637826537522, "language_loss": 0.8379637, "learning_rate": 1.3745826703385665e-07, "loss": 0.85954016, "num_input_tokens_seen": 316037735, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6953125, "step": 14658, "time_per_iteration": 2.562457799911499 }, { "auxiliary_loss_clip": 0.01127575, "auxiliary_loss_mlp": 0.01036534, "balance_loss_clip": 1.0230571, "balance_loss_mlp": 1.03681922, "epoch": 0.8813467608597625, "flos": 23807194997760.0, "grad_norm": 1.785147393918243, "language_loss": 0.77582729, "learning_rate": 1.3732064918145226e-07, "loss": 0.7974683, "num_input_tokens_seen": 316058105, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 14659, "time_per_iteration": 2.5752005577087402 }, { "auxiliary_loss_clip": 0.01119467, "auxiliary_loss_mlp": 0.01035775, "balance_loss_clip": 1.0229007, "balance_loss_mlp": 1.03356731, "epoch": 0.8814068841124305, "flos": 21105455667840.0, "grad_norm": 1.9326056102697913, "language_loss": 0.60495323, "learning_rate": 1.3718309780328309e-07, "loss": 0.62650561, "num_input_tokens_seen": 316074415, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 14660, "time_per_iteration": 2.638505458831787 }, { "auxiliary_loss_clip": 0.01120733, "auxiliary_loss_mlp": 0.01038655, "balance_loss_clip": 1.02513719, "balance_loss_mlp": 1.0338186, "epoch": 0.8814670073650984, "flos": 24716991605760.0, "grad_norm": 1.6688917011552948, "language_loss": 0.77551371, "learning_rate": 1.3704561290425697e-07, "loss": 0.79710764, "num_input_tokens_seen": 316094405, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69140625, "step": 14661, "time_per_iteration": 2.572338581085205 }, { "auxiliary_loss_clip": 0.01131905, "auxiliary_loss_mlp": 0.01040222, "balance_loss_clip": 1.02726984, "balance_loss_mlp": 1.03521276, "epoch": 0.8815271306177664, "flos": 22966274718720.0, "grad_norm": 1.7316240190485528, "language_loss": 0.76639664, "learning_rate": 1.3690819448928158e-07, "loss": 0.78811789, "num_input_tokens_seen": 316113390, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 14662, "time_per_iteration": 2.5879502296447754 }, { "auxiliary_loss_clip": 0.01136743, "auxiliary_loss_mlp": 0.01026162, "balance_loss_clip": 1.01493216, "balance_loss_mlp": 1.0339731, "epoch": 0.8815872538704344, "flos": 19460064435840.0, "grad_norm": 1.7551009223089418, "language_loss": 0.73543358, "learning_rate": 1.3677084256325966e-07, "loss": 0.75706261, "num_input_tokens_seen": 316131085, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 14663, "time_per_iteration": 2.5810744762420654 }, { "auxiliary_loss_clip": 0.01111547, "auxiliary_loss_mlp": 0.01281751, "balance_loss_clip": 1.02264094, "balance_loss_mlp": 1.03465903, "epoch": 0.8816473771231024, "flos": 32156604622080.0, "grad_norm": 1.5244925650822911, "language_loss": 0.69886804, "learning_rate": 1.366335571310937e-07, "loss": 0.72280097, "num_input_tokens_seen": 316151440, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 14664, "time_per_iteration": 2.66068434715271 }, { "auxiliary_loss_clip": 0.01153368, "auxiliary_loss_mlp": 0.01031107, "balance_loss_clip": 1.02007389, "balance_loss_mlp": 1.03355074, "epoch": 0.8817075003757703, "flos": 19682279925120.0, "grad_norm": 1.46063770242201, "language_loss": 0.81100237, "learning_rate": 1.3649633819768313e-07, "loss": 0.83284712, "num_input_tokens_seen": 316170750, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6640625, "step": 14665, "time_per_iteration": 2.612560272216797 }, { "auxiliary_loss_clip": 0.01122404, "auxiliary_loss_mlp": 0.01037433, "balance_loss_clip": 1.02609634, "balance_loss_mlp": 1.03524005, "epoch": 0.8817676236284383, "flos": 15668616251520.0, "grad_norm": 2.070088741636229, "language_loss": 0.58545762, "learning_rate": 1.3635918576792448e-07, "loss": 0.60705602, "num_input_tokens_seen": 316187265, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.69140625, "step": 14666, "time_per_iteration": 2.521867513656616 }, { "auxiliary_loss_clip": 0.01114755, "auxiliary_loss_mlp": 0.01032701, "balance_loss_clip": 1.01923621, "balance_loss_mlp": 1.0366354, "epoch": 0.8818277468811063, "flos": 17895185539200.0, "grad_norm": 1.9833130927736868, "language_loss": 0.83657908, "learning_rate": 1.3622209984671296e-07, "loss": 0.85805368, "num_input_tokens_seen": 316206555, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 14667, "time_per_iteration": 2.5474350452423096 }, { "auxiliary_loss_clip": 0.01101862, "auxiliary_loss_mlp": 0.01034298, "balance_loss_clip": 1.02147698, "balance_loss_mlp": 1.03379595, "epoch": 0.8818878701337742, "flos": 18770508069120.0, "grad_norm": 1.924343143819196, "language_loss": 0.62318218, "learning_rate": 1.3608508043893995e-07, "loss": 0.64454377, "num_input_tokens_seen": 316225210, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 14668, "time_per_iteration": 2.467893362045288 }, { "auxiliary_loss_clip": 0.011093, "auxiliary_loss_mlp": 0.0102861, "balance_loss_clip": 1.01761878, "balance_loss_mlp": 1.03419471, "epoch": 0.8819479933864423, "flos": 17712292464000.0, "grad_norm": 2.1276057772269676, "language_loss": 0.57254565, "learning_rate": 1.359481275494967e-07, "loss": 0.59392464, "num_input_tokens_seen": 316242685, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6640625, "step": 14669, "time_per_iteration": 2.5150301456451416 }, { "auxiliary_loss_clip": 0.01101642, "auxiliary_loss_mlp": 0.01032457, "balance_loss_clip": 1.02045226, "balance_loss_mlp": 1.03321719, "epoch": 0.8820081166391102, "flos": 11728749070080.0, "grad_norm": 2.381304969991621, "language_loss": 0.71598136, "learning_rate": 1.3581124118326902e-07, "loss": 0.73732239, "num_input_tokens_seen": 316260935, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 14670, "time_per_iteration": 2.459657669067383 }, { "auxiliary_loss_clip": 0.01109604, "auxiliary_loss_mlp": 0.01029526, "balance_loss_clip": 1.01766419, "balance_loss_mlp": 1.03241968, "epoch": 0.8820682398917782, "flos": 27490372611840.0, "grad_norm": 4.008737322597259, "language_loss": 0.73814541, "learning_rate": 1.356744213451433e-07, "loss": 0.75953668, "num_input_tokens_seen": 316281190, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 14671, "time_per_iteration": 2.5825283527374268 }, { "auxiliary_loss_clip": 0.0112234, "auxiliary_loss_mlp": 0.01029469, "balance_loss_clip": 1.01664829, "balance_loss_mlp": 1.03317511, "epoch": 0.8821283631444461, "flos": 16873850223360.0, "grad_norm": 3.0279534466761677, "language_loss": 0.85027915, "learning_rate": 1.3553766804000179e-07, "loss": 0.8717972, "num_input_tokens_seen": 316297115, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 14672, "time_per_iteration": 2.533691167831421 }, { "auxiliary_loss_clip": 0.01126641, "auxiliary_loss_mlp": 0.0102985, "balance_loss_clip": 1.01789308, "balance_loss_mlp": 1.03395367, "epoch": 0.8821884863971141, "flos": 24280964409600.0, "grad_norm": 1.4597270255215098, "language_loss": 0.72606939, "learning_rate": 1.3540098127272504e-07, "loss": 0.74763429, "num_input_tokens_seen": 316318235, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.66015625, "step": 14673, "time_per_iteration": 2.558598518371582 }, { "auxiliary_loss_clip": 0.01039851, "auxiliary_loss_mlp": 0.01001503, "balance_loss_clip": 1.00019169, "balance_loss_mlp": 1.00090659, "epoch": 0.882248609649782, "flos": 59432342492160.0, "grad_norm": 0.8001339772616012, "language_loss": 0.49416381, "learning_rate": 1.352643610481905e-07, "loss": 0.51457739, "num_input_tokens_seen": 316384705, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.2109375, "step": 14674, "time_per_iteration": 3.2284886837005615 }, { "auxiliary_loss_clip": 0.01120837, "auxiliary_loss_mlp": 0.01276059, "balance_loss_clip": 1.01685548, "balance_loss_mlp": 1.03281927, "epoch": 0.88230873290245, "flos": 19937784343680.0, "grad_norm": 2.1998392437970247, "language_loss": 0.76253271, "learning_rate": 1.3512780737127493e-07, "loss": 0.78650165, "num_input_tokens_seen": 316401165, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 14675, "time_per_iteration": 2.6506223678588867 }, { "auxiliary_loss_clip": 0.0111765, "auxiliary_loss_mlp": 0.01032097, "balance_loss_clip": 1.02013445, "balance_loss_mlp": 1.03367341, "epoch": 0.882368856155118, "flos": 16362769559040.0, "grad_norm": 1.931567214240882, "language_loss": 0.79242206, "learning_rate": 1.3499132024685022e-07, "loss": 0.81391948, "num_input_tokens_seen": 316418780, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.66015625, "step": 14676, "time_per_iteration": 2.5135700702667236 }, { "auxiliary_loss_clip": 0.01048873, "auxiliary_loss_mlp": 0.01000429, "balance_loss_clip": 0.99910557, "balance_loss_mlp": 1.00102997, "epoch": 0.882428979407786, "flos": 70594563277440.0, "grad_norm": 0.6865475769966174, "language_loss": 0.54783058, "learning_rate": 1.3485489967978847e-07, "loss": 0.56832361, "num_input_tokens_seen": 316482030, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.2109375, "step": 14677, "time_per_iteration": 3.256361246109009 }, { "auxiliary_loss_clip": 0.01099861, "auxiliary_loss_mlp": 0.01030011, "balance_loss_clip": 1.01786947, "balance_loss_mlp": 1.03289485, "epoch": 0.8824891026604539, "flos": 18150294908160.0, "grad_norm": 2.821591851449798, "language_loss": 0.64777762, "learning_rate": 1.347185456749571e-07, "loss": 0.66907632, "num_input_tokens_seen": 316499175, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 14678, "time_per_iteration": 3.8646395206451416 }, { "auxiliary_loss_clip": 0.01123386, "auxiliary_loss_mlp": 0.01034627, "balance_loss_clip": 1.02246761, "balance_loss_mlp": 1.03485394, "epoch": 0.8825492259131219, "flos": 27232713377280.0, "grad_norm": 3.002816737452375, "language_loss": 0.71422601, "learning_rate": 1.3458225823722335e-07, "loss": 0.73580611, "num_input_tokens_seen": 316519495, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.703125, "step": 14679, "time_per_iteration": 2.5822055339813232 }, { "auxiliary_loss_clip": 0.011177, "auxiliary_loss_mlp": 0.01034047, "balance_loss_clip": 1.02323508, "balance_loss_mlp": 1.03383482, "epoch": 0.8826093491657898, "flos": 22274419881600.0, "grad_norm": 1.6823048518321098, "language_loss": 0.63654149, "learning_rate": 1.3444603737144977e-07, "loss": 0.65805894, "num_input_tokens_seen": 316538180, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.6640625, "step": 14680, "time_per_iteration": 2.5545382499694824 }, { "auxiliary_loss_clip": 0.01101679, "auxiliary_loss_mlp": 0.01032634, "balance_loss_clip": 1.02037907, "balance_loss_mlp": 1.03407443, "epoch": 0.8826694724184578, "flos": 14204753377920.0, "grad_norm": 2.25277973941884, "language_loss": 0.77492076, "learning_rate": 1.343098830824987e-07, "loss": 0.79626393, "num_input_tokens_seen": 316551750, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 14681, "time_per_iteration": 2.4653396606445312 }, { "auxiliary_loss_clip": 0.01123575, "auxiliary_loss_mlp": 0.01029216, "balance_loss_clip": 1.01660419, "balance_loss_mlp": 1.03531241, "epoch": 0.8827295956711259, "flos": 20631686256000.0, "grad_norm": 1.7074260433284107, "language_loss": 0.72684759, "learning_rate": 1.341737953752291e-07, "loss": 0.74837554, "num_input_tokens_seen": 316570680, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 14682, "time_per_iteration": 2.527881145477295 }, { "auxiliary_loss_clip": 0.01143286, "auxiliary_loss_mlp": 0.01032573, "balance_loss_clip": 1.01827383, "balance_loss_mlp": 1.03590608, "epoch": 0.8827897189237938, "flos": 18513064316160.0, "grad_norm": 1.9369162928858468, "language_loss": 0.74959588, "learning_rate": 1.3403777425449669e-07, "loss": 0.77135444, "num_input_tokens_seen": 316588635, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.71875, "step": 14683, "time_per_iteration": 2.565673828125 }, { "auxiliary_loss_clip": 0.01130571, "auxiliary_loss_mlp": 0.01031583, "balance_loss_clip": 1.0186007, "balance_loss_mlp": 1.03547287, "epoch": 0.8828498421764618, "flos": 22747399194240.0, "grad_norm": 1.6271146481290002, "language_loss": 0.65430743, "learning_rate": 1.3390181972515646e-07, "loss": 0.67592895, "num_input_tokens_seen": 316607550, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 14684, "time_per_iteration": 3.9810070991516113 }, { "auxiliary_loss_clip": 0.01109604, "auxiliary_loss_mlp": 0.01031204, "balance_loss_clip": 1.01947355, "balance_loss_mlp": 1.03310263, "epoch": 0.8829099654291297, "flos": 15012384727680.0, "grad_norm": 1.8661597723911565, "language_loss": 0.6956563, "learning_rate": 1.3376593179206008e-07, "loss": 0.71706444, "num_input_tokens_seen": 316624460, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.67578125, "step": 14685, "time_per_iteration": 2.5578081607818604 }, { "auxiliary_loss_clip": 0.01150744, "auxiliary_loss_mlp": 0.01030537, "balance_loss_clip": 1.01703119, "balance_loss_mlp": 1.03469324, "epoch": 0.8829700886817977, "flos": 16720546976640.0, "grad_norm": 1.8285323358337922, "language_loss": 0.7440834, "learning_rate": 1.3363011046005722e-07, "loss": 0.7658962, "num_input_tokens_seen": 316640765, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 14686, "time_per_iteration": 2.629018783569336 }, { "auxiliary_loss_clip": 0.01135213, "auxiliary_loss_mlp": 0.01026596, "balance_loss_clip": 1.01469314, "balance_loss_mlp": 1.03273153, "epoch": 0.8830302119344656, "flos": 15263256291840.0, "grad_norm": 1.7349293713223193, "language_loss": 0.62924474, "learning_rate": 1.3349435573399448e-07, "loss": 0.65086281, "num_input_tokens_seen": 316656120, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 14687, "time_per_iteration": 2.50710129737854 }, { "auxiliary_loss_clip": 0.01113462, "auxiliary_loss_mlp": 0.01035266, "balance_loss_clip": 1.02245736, "balance_loss_mlp": 1.03458977, "epoch": 0.8830903351871336, "flos": 28617751854720.0, "grad_norm": 1.8696799039972105, "language_loss": 0.68874574, "learning_rate": 1.333586676187173e-07, "loss": 0.71023297, "num_input_tokens_seen": 316676095, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 14688, "time_per_iteration": 2.585191011428833 }, { "auxiliary_loss_clip": 0.01108229, "auxiliary_loss_mlp": 0.01028861, "balance_loss_clip": 1.01678538, "balance_loss_mlp": 1.03318834, "epoch": 0.8831504584398016, "flos": 28001632844160.0, "grad_norm": 2.058455314610115, "language_loss": 0.66878057, "learning_rate": 1.3322304611906775e-07, "loss": 0.69015145, "num_input_tokens_seen": 316696235, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6640625, "step": 14689, "time_per_iteration": 2.602116107940674 }, { "auxiliary_loss_clip": 0.01122457, "auxiliary_loss_mlp": 0.01027441, "balance_loss_clip": 1.01603889, "balance_loss_mlp": 1.03754354, "epoch": 0.8832105816924696, "flos": 26579642250240.0, "grad_norm": 1.2888020053704379, "language_loss": 0.74583888, "learning_rate": 1.3308749123988562e-07, "loss": 0.76733786, "num_input_tokens_seen": 316719680, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66796875, "step": 14690, "time_per_iteration": 2.633216619491577 }, { "auxiliary_loss_clip": 0.01121773, "auxiliary_loss_mlp": 0.01032913, "balance_loss_clip": 1.01972818, "balance_loss_mlp": 1.03460228, "epoch": 0.8832707049451375, "flos": 24898771359360.0, "grad_norm": 1.4184135760959378, "language_loss": 0.72724259, "learning_rate": 1.3295200298600828e-07, "loss": 0.74878943, "num_input_tokens_seen": 316739830, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69140625, "step": 14691, "time_per_iteration": 2.6013877391815186 }, { "auxiliary_loss_clip": 0.01134041, "auxiliary_loss_mlp": 0.01029439, "balance_loss_clip": 1.01652288, "balance_loss_mlp": 1.03533673, "epoch": 0.8833308281978055, "flos": 15451141357440.0, "grad_norm": 3.3794534339588145, "language_loss": 0.52128685, "learning_rate": 1.3281658136227213e-07, "loss": 0.54292166, "num_input_tokens_seen": 316758105, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 14692, "time_per_iteration": 3.9937782287597656 }, { "auxiliary_loss_clip": 0.01131914, "auxiliary_loss_mlp": 0.01030547, "balance_loss_clip": 1.01805365, "balance_loss_mlp": 1.03538275, "epoch": 0.8833909514504734, "flos": 20301523418880.0, "grad_norm": 2.022431948372787, "language_loss": 0.60570055, "learning_rate": 1.3268122637350865e-07, "loss": 0.62732518, "num_input_tokens_seen": 316777455, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 14693, "time_per_iteration": 2.5344078540802 }, { "auxiliary_loss_clip": 0.0109967, "auxiliary_loss_mlp": 0.01026237, "balance_loss_clip": 1.01518011, "balance_loss_mlp": 1.03491092, "epoch": 0.8834510747031414, "flos": 20374027021440.0, "grad_norm": 1.6814215755925863, "language_loss": 0.75219208, "learning_rate": 1.3254593802454904e-07, "loss": 0.77345115, "num_input_tokens_seen": 316796300, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6484375, "step": 14694, "time_per_iteration": 4.086346864700317 }, { "auxiliary_loss_clip": 0.01112605, "auxiliary_loss_mlp": 0.0102958, "balance_loss_clip": 1.01823163, "balance_loss_mlp": 1.03212166, "epoch": 0.8835111979558095, "flos": 29752026508800.0, "grad_norm": 1.6754443762507105, "language_loss": 0.72903311, "learning_rate": 1.3241071632022126e-07, "loss": 0.7504549, "num_input_tokens_seen": 316819090, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.7109375, "step": 14695, "time_per_iteration": 2.6596713066101074 }, { "auxiliary_loss_clip": 0.01106322, "auxiliary_loss_mlp": 0.01025413, "balance_loss_clip": 1.01390338, "balance_loss_mlp": 1.03328931, "epoch": 0.8835713212084774, "flos": 24134556574080.0, "grad_norm": 1.7076952009154494, "language_loss": 0.80140954, "learning_rate": 1.32275561265351e-07, "loss": 0.82272691, "num_input_tokens_seen": 316839250, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.640625, "step": 14696, "time_per_iteration": 2.6696879863739014 }, { "auxiliary_loss_clip": 0.01138869, "auxiliary_loss_mlp": 0.01033618, "balance_loss_clip": 1.02185249, "balance_loss_mlp": 1.03504348, "epoch": 0.8836314444611454, "flos": 27672331933440.0, "grad_norm": 1.5474249207996482, "language_loss": 0.75003421, "learning_rate": 1.321404728647617e-07, "loss": 0.77175909, "num_input_tokens_seen": 316861315, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 14697, "time_per_iteration": 2.593402147293091 }, { "auxiliary_loss_clip": 0.0111964, "auxiliary_loss_mlp": 0.01034666, "balance_loss_clip": 1.02294135, "balance_loss_mlp": 1.03402221, "epoch": 0.8836915677138133, "flos": 16836969934080.0, "grad_norm": 1.865517902221796, "language_loss": 0.71568036, "learning_rate": 1.3200545112327378e-07, "loss": 0.73722345, "num_input_tokens_seen": 316879325, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 14698, "time_per_iteration": 2.5310144424438477 }, { "auxiliary_loss_clip": 0.01030893, "auxiliary_loss_mlp": 0.010017, "balance_loss_clip": 1.0003947, "balance_loss_mlp": 1.0012095, "epoch": 0.8837516909664813, "flos": 69310540823040.0, "grad_norm": 0.8093448798817837, "language_loss": 0.53701979, "learning_rate": 1.3187049604570688e-07, "loss": 0.55734569, "num_input_tokens_seen": 316936425, "router_z_loss_clip": 0.01306152, "router_z_loss_mlp": 0.2109375, "step": 14699, "time_per_iteration": 3.123386859893799 }, { "auxiliary_loss_clip": 0.01125605, "auxiliary_loss_mlp": 0.01027812, "balance_loss_clip": 1.01504469, "balance_loss_mlp": 1.03598762, "epoch": 0.8838118142191492, "flos": 21324726241920.0, "grad_norm": 1.6514096410104049, "language_loss": 0.76745337, "learning_rate": 1.3173560763687607e-07, "loss": 0.78898752, "num_input_tokens_seen": 316956360, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 14700, "time_per_iteration": 2.5527076721191406 }, { "auxiliary_loss_clip": 0.01127017, "auxiliary_loss_mlp": 0.01030038, "balance_loss_clip": 1.01777172, "balance_loss_mlp": 1.03393817, "epoch": 0.8838719374718172, "flos": 21470559459840.0, "grad_norm": 1.8203881764658838, "language_loss": 0.73193198, "learning_rate": 1.3160078590159595e-07, "loss": 0.75350255, "num_input_tokens_seen": 316975295, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6640625, "step": 14701, "time_per_iteration": 2.5242910385131836 }, { "auxiliary_loss_clip": 0.01124099, "auxiliary_loss_mlp": 0.01031851, "balance_loss_clip": 1.01892304, "balance_loss_mlp": 1.03524113, "epoch": 0.8839320607244852, "flos": 26468929555200.0, "grad_norm": 2.053320728139665, "language_loss": 0.70818901, "learning_rate": 1.3146603084467756e-07, "loss": 0.72974849, "num_input_tokens_seen": 316994520, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 14702, "time_per_iteration": 2.575050115585327 }, { "auxiliary_loss_clip": 0.0113175, "auxiliary_loss_mlp": 0.01033715, "balance_loss_clip": 1.02110898, "balance_loss_mlp": 1.03597927, "epoch": 0.8839921839771532, "flos": 45222270923520.0, "grad_norm": 2.9523638113918316, "language_loss": 0.71716332, "learning_rate": 1.3133134247093015e-07, "loss": 0.73881799, "num_input_tokens_seen": 317018095, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 14703, "time_per_iteration": 2.7677242755889893 }, { "auxiliary_loss_clip": 0.01136265, "auxiliary_loss_mlp": 0.01030278, "balance_loss_clip": 1.01850629, "balance_loss_mlp": 1.03436303, "epoch": 0.8840523072298211, "flos": 20006876154240.0, "grad_norm": 1.8170145416948749, "language_loss": 0.66675842, "learning_rate": 1.311967207851601e-07, "loss": 0.68842387, "num_input_tokens_seen": 317035755, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6640625, "step": 14704, "time_per_iteration": 2.542194128036499 }, { "auxiliary_loss_clip": 0.01107586, "auxiliary_loss_mlp": 0.01025915, "balance_loss_clip": 1.01463211, "balance_loss_mlp": 1.03186131, "epoch": 0.8841124304824891, "flos": 24426007528320.0, "grad_norm": 1.5981842174485734, "language_loss": 0.70794988, "learning_rate": 1.3106216579217221e-07, "loss": 0.72928488, "num_input_tokens_seen": 317055765, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6640625, "step": 14705, "time_per_iteration": 2.570061206817627 }, { "auxiliary_loss_clip": 0.01119332, "auxiliary_loss_mlp": 0.01032011, "balance_loss_clip": 1.02135396, "balance_loss_mlp": 1.0346334, "epoch": 0.884172553735157, "flos": 31284622056960.0, "grad_norm": 1.550275091258582, "language_loss": 0.70917189, "learning_rate": 1.3092767749676824e-07, "loss": 0.73068535, "num_input_tokens_seen": 317077955, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.671875, "step": 14706, "time_per_iteration": 2.600215435028076 }, { "auxiliary_loss_clip": 0.01109805, "auxiliary_loss_mlp": 0.01032999, "balance_loss_clip": 1.02094066, "balance_loss_mlp": 1.03311586, "epoch": 0.884232676987825, "flos": 21391160446080.0, "grad_norm": 1.6560316344618513, "language_loss": 0.74113786, "learning_rate": 1.3079325590374746e-07, "loss": 0.76256597, "num_input_tokens_seen": 317095825, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 14707, "time_per_iteration": 2.514378070831299 }, { "auxiliary_loss_clip": 0.01048595, "auxiliary_loss_mlp": 0.01000939, "balance_loss_clip": 0.99971145, "balance_loss_mlp": 1.00088096, "epoch": 0.8842928002404931, "flos": 57911451799680.0, "grad_norm": 0.797572307884926, "language_loss": 0.60380578, "learning_rate": 1.3065890101790688e-07, "loss": 0.62430108, "num_input_tokens_seen": 317152875, "router_z_loss_clip": 0.01226807, "router_z_loss_mlp": 0.2109375, "step": 14708, "time_per_iteration": 3.026493787765503 }, { "auxiliary_loss_clip": 0.01114012, "auxiliary_loss_mlp": 0.01030531, "balance_loss_clip": 1.01952767, "balance_loss_mlp": 1.03235173, "epoch": 0.884352923493161, "flos": 41463896186880.0, "grad_norm": 1.5785872213949363, "language_loss": 0.6746875, "learning_rate": 1.3052461284404248e-07, "loss": 0.6961329, "num_input_tokens_seen": 317176725, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.63671875, "step": 14709, "time_per_iteration": 2.7037429809570312 }, { "auxiliary_loss_clip": 0.01123456, "auxiliary_loss_mlp": 0.01036546, "balance_loss_clip": 1.02267027, "balance_loss_mlp": 1.0335499, "epoch": 0.884413046745829, "flos": 46541234332800.0, "grad_norm": 1.5270569324933088, "language_loss": 0.62390876, "learning_rate": 1.3039039138694485e-07, "loss": 0.64550877, "num_input_tokens_seen": 317206880, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 14710, "time_per_iteration": 2.8068552017211914 }, { "auxiliary_loss_clip": 0.01126163, "auxiliary_loss_mlp": 0.01027228, "balance_loss_clip": 1.01581407, "balance_loss_mlp": 1.03419232, "epoch": 0.8844731699984969, "flos": 19135324552320.0, "grad_norm": 2.6563277744354643, "language_loss": 0.63522643, "learning_rate": 1.302562366514055e-07, "loss": 0.65676033, "num_input_tokens_seen": 317224135, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.65625, "step": 14711, "time_per_iteration": 2.558076858520508 }, { "auxiliary_loss_clip": 0.01131823, "auxiliary_loss_mlp": 0.01031997, "balance_loss_clip": 1.01885414, "balance_loss_mlp": 1.03356576, "epoch": 0.8845332932511649, "flos": 23260634674560.0, "grad_norm": 2.4324464252971025, "language_loss": 0.76556879, "learning_rate": 1.3012214864221126e-07, "loss": 0.78720695, "num_input_tokens_seen": 317244505, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 14712, "time_per_iteration": 2.5662436485290527 }, { "auxiliary_loss_clip": 0.01146207, "auxiliary_loss_mlp": 0.01027243, "balance_loss_clip": 1.01522088, "balance_loss_mlp": 1.03236389, "epoch": 0.8845934165038328, "flos": 17564591738880.0, "grad_norm": 2.365552654540985, "language_loss": 0.81125569, "learning_rate": 1.2998812736414788e-07, "loss": 0.83299023, "num_input_tokens_seen": 317257830, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 14713, "time_per_iteration": 2.538095235824585 }, { "auxiliary_loss_clip": 0.0111894, "auxiliary_loss_mlp": 0.0102879, "balance_loss_clip": 1.01635671, "balance_loss_mlp": 1.0326674, "epoch": 0.8846535397565009, "flos": 27485739757440.0, "grad_norm": 1.3837380559914105, "language_loss": 0.55637914, "learning_rate": 1.2985417282199794e-07, "loss": 0.57785648, "num_input_tokens_seen": 317278430, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 14714, "time_per_iteration": 2.5559864044189453 }, { "auxiliary_loss_clip": 0.01146173, "auxiliary_loss_mlp": 0.01034368, "balance_loss_clip": 1.02232826, "balance_loss_mlp": 1.03341722, "epoch": 0.8847136630091688, "flos": 28761430256640.0, "grad_norm": 1.7008236800776362, "language_loss": 0.74191105, "learning_rate": 1.2972028502054145e-07, "loss": 0.7637164, "num_input_tokens_seen": 317295970, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 14715, "time_per_iteration": 2.637232780456543 }, { "auxiliary_loss_clip": 0.01111025, "auxiliary_loss_mlp": 0.01025992, "balance_loss_clip": 1.01410043, "balance_loss_mlp": 1.03328633, "epoch": 0.8847737862618368, "flos": 23476924419840.0, "grad_norm": 1.4700395436214433, "language_loss": 0.75282443, "learning_rate": 1.2958646396455786e-07, "loss": 0.7741946, "num_input_tokens_seen": 317316185, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 14716, "time_per_iteration": 2.5539298057556152 }, { "auxiliary_loss_clip": 0.01127848, "auxiliary_loss_mlp": 0.01037133, "balance_loss_clip": 1.02504551, "balance_loss_mlp": 1.03391862, "epoch": 0.8848339095145047, "flos": 18660872782080.0, "grad_norm": 2.0453460208059346, "language_loss": 0.70883226, "learning_rate": 1.294527096588216e-07, "loss": 0.73048204, "num_input_tokens_seen": 317333275, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 14717, "time_per_iteration": 2.5737709999084473 }, { "auxiliary_loss_clip": 0.01107441, "auxiliary_loss_mlp": 0.01030788, "balance_loss_clip": 1.01920724, "balance_loss_mlp": 1.0340035, "epoch": 0.8848940327671727, "flos": 32270298145920.0, "grad_norm": 2.1810990904860654, "language_loss": 0.74010682, "learning_rate": 1.2931902210810663e-07, "loss": 0.76148915, "num_input_tokens_seen": 317351245, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6484375, "step": 14718, "time_per_iteration": 2.5797290802001953 }, { "auxiliary_loss_clip": 0.01148381, "auxiliary_loss_mlp": 0.01028008, "balance_loss_clip": 1.01550293, "balance_loss_mlp": 1.03342509, "epoch": 0.8849541560198406, "flos": 24021832717440.0, "grad_norm": 1.5045224266464257, "language_loss": 0.78569776, "learning_rate": 1.291854013171838e-07, "loss": 0.80746162, "num_input_tokens_seen": 317370740, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 14719, "time_per_iteration": 2.617955207824707 }, { "auxiliary_loss_clip": 0.0110402, "auxiliary_loss_mlp": 0.01028809, "balance_loss_clip": 1.01657796, "balance_loss_mlp": 1.03485668, "epoch": 0.8850142792725086, "flos": 16873060124160.0, "grad_norm": 1.5871342789827432, "language_loss": 0.72077799, "learning_rate": 1.2905184729082176e-07, "loss": 0.74210626, "num_input_tokens_seen": 317388370, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 14720, "time_per_iteration": 3.8622446060180664 }, { "auxiliary_loss_clip": 0.01123234, "auxiliary_loss_mlp": 0.01033838, "balance_loss_clip": 1.02087426, "balance_loss_mlp": 1.03387547, "epoch": 0.8850744025251767, "flos": 24024059360640.0, "grad_norm": 1.8733273141476077, "language_loss": 0.82184052, "learning_rate": 1.289183600337862e-07, "loss": 0.84341121, "num_input_tokens_seen": 317407390, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 14721, "time_per_iteration": 2.5257787704467773 }, { "auxiliary_loss_clip": 0.01119031, "auxiliary_loss_mlp": 0.01031952, "balance_loss_clip": 1.01901174, "balance_loss_mlp": 1.03212535, "epoch": 0.8851345257778446, "flos": 47955575329920.0, "grad_norm": 2.321974373707496, "language_loss": 0.6208843, "learning_rate": 1.287849395508418e-07, "loss": 0.64239419, "num_input_tokens_seen": 317430825, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 14722, "time_per_iteration": 2.8735339641571045 }, { "auxiliary_loss_clip": 0.01112496, "auxiliary_loss_mlp": 0.01028269, "balance_loss_clip": 1.01598454, "balance_loss_mlp": 1.03344488, "epoch": 0.8851946490305126, "flos": 36611000173440.0, "grad_norm": 1.8881908815529882, "language_loss": 0.68824112, "learning_rate": 1.2865158584674962e-07, "loss": 0.70964879, "num_input_tokens_seen": 317451905, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 14723, "time_per_iteration": 2.684643507003784 }, { "auxiliary_loss_clip": 0.01129854, "auxiliary_loss_mlp": 0.01031609, "balance_loss_clip": 1.01947331, "balance_loss_mlp": 1.03522336, "epoch": 0.8852547722831805, "flos": 21544248211200.0, "grad_norm": 1.6874802817510854, "language_loss": 0.78134632, "learning_rate": 1.2851829892626852e-07, "loss": 0.80296099, "num_input_tokens_seen": 317470030, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 14724, "time_per_iteration": 2.5437042713165283 }, { "auxiliary_loss_clip": 0.01030094, "auxiliary_loss_mlp": 0.01003746, "balance_loss_clip": 1.00257766, "balance_loss_mlp": 1.00070071, "epoch": 0.8853148955358485, "flos": 63059246472960.0, "grad_norm": 0.8001412151176172, "language_loss": 0.58899045, "learning_rate": 1.2838507879415515e-07, "loss": 0.60932887, "num_input_tokens_seen": 317527460, "router_z_loss_clip": 0.01165771, "router_z_loss_mlp": 0.20996094, "step": 14725, "time_per_iteration": 2.980823516845703 }, { "auxiliary_loss_clip": 0.01125399, "auxiliary_loss_mlp": 0.01288293, "balance_loss_clip": 1.0284586, "balance_loss_mlp": 1.03411889, "epoch": 0.8853750187885164, "flos": 18149828031360.0, "grad_norm": 2.851042274006394, "language_loss": 0.69058013, "learning_rate": 1.282519254551644e-07, "loss": 0.71471709, "num_input_tokens_seen": 317544070, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 14726, "time_per_iteration": 3.928650379180908 }, { "auxiliary_loss_clip": 0.01111664, "auxiliary_loss_mlp": 0.01278934, "balance_loss_clip": 1.01966572, "balance_loss_mlp": 1.03571689, "epoch": 0.8854351420411845, "flos": 39570542392320.0, "grad_norm": 1.6985745343657859, "language_loss": 0.69783586, "learning_rate": 1.281188389140475e-07, "loss": 0.7217418, "num_input_tokens_seen": 317570275, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.66796875, "step": 14727, "time_per_iteration": 2.7417871952056885 }, { "auxiliary_loss_clip": 0.01123958, "auxiliary_loss_mlp": 0.01034198, "balance_loss_clip": 1.02137136, "balance_loss_mlp": 1.03791094, "epoch": 0.8854952652938524, "flos": 23769309127680.0, "grad_norm": 1.5978548703382855, "language_loss": 0.69907951, "learning_rate": 1.2798581917555385e-07, "loss": 0.72066098, "num_input_tokens_seen": 317590160, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 14728, "time_per_iteration": 2.5218420028686523 }, { "auxiliary_loss_clip": 0.01129081, "auxiliary_loss_mlp": 0.0102956, "balance_loss_clip": 1.01797891, "balance_loss_mlp": 1.03345251, "epoch": 0.8855553885465204, "flos": 18290310122880.0, "grad_norm": 2.010850147801514, "language_loss": 0.66349339, "learning_rate": 1.278528662444316e-07, "loss": 0.68507987, "num_input_tokens_seen": 317608340, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.69140625, "step": 14729, "time_per_iteration": 2.542447805404663 }, { "auxiliary_loss_clip": 0.01119705, "auxiliary_loss_mlp": 0.01033086, "balance_loss_clip": 1.02174973, "balance_loss_mlp": 1.03531742, "epoch": 0.8856155117991883, "flos": 36867402432000.0, "grad_norm": 1.9173515487822024, "language_loss": 0.62719655, "learning_rate": 1.2771998012542407e-07, "loss": 0.64872444, "num_input_tokens_seen": 317629910, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66796875, "step": 14730, "time_per_iteration": 2.75642728805542 }, { "auxiliary_loss_clip": 0.01124719, "auxiliary_loss_mlp": 0.01033479, "balance_loss_clip": 1.02157617, "balance_loss_mlp": 1.03168976, "epoch": 0.8856756350518563, "flos": 22163886754560.0, "grad_norm": 1.438464065083964, "language_loss": 0.79584914, "learning_rate": 1.275871608232748e-07, "loss": 0.81743115, "num_input_tokens_seen": 317650265, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6640625, "step": 14731, "time_per_iteration": 2.5996663570404053 }, { "auxiliary_loss_clip": 0.01128939, "auxiliary_loss_mlp": 0.01036157, "balance_loss_clip": 1.02400374, "balance_loss_mlp": 1.03252518, "epoch": 0.8857357583045242, "flos": 20740962407040.0, "grad_norm": 1.6220999056391092, "language_loss": 0.82932466, "learning_rate": 1.2745440834272313e-07, "loss": 0.85097563, "num_input_tokens_seen": 317669045, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69921875, "step": 14732, "time_per_iteration": 2.5159928798675537 }, { "auxiliary_loss_clip": 0.01039071, "auxiliary_loss_mlp": 0.01001255, "balance_loss_clip": 0.99997979, "balance_loss_mlp": 1.00092101, "epoch": 0.8857958815571922, "flos": 69892329409920.0, "grad_norm": 0.8948132318243353, "language_loss": 0.59948319, "learning_rate": 1.2732172268850727e-07, "loss": 0.61988646, "num_input_tokens_seen": 317728065, "router_z_loss_clip": 0.01275635, "router_z_loss_mlp": 0.2109375, "step": 14733, "time_per_iteration": 4.509424686431885 }, { "auxiliary_loss_clip": 0.01104129, "auxiliary_loss_mlp": 0.01277542, "balance_loss_clip": 1.01849675, "balance_loss_mlp": 1.0354203, "epoch": 0.8858560048098603, "flos": 15121948187520.0, "grad_norm": 2.3615262391780325, "language_loss": 0.66638005, "learning_rate": 1.271891038653614e-07, "loss": 0.69019675, "num_input_tokens_seen": 317746120, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 14734, "time_per_iteration": 2.483863115310669 }, { "auxiliary_loss_clip": 0.01120959, "auxiliary_loss_mlp": 0.01038047, "balance_loss_clip": 1.02666879, "balance_loss_mlp": 1.03571916, "epoch": 0.8859161280625282, "flos": 16611019430400.0, "grad_norm": 1.9464224043286948, "language_loss": 0.75404072, "learning_rate": 1.2705655187801933e-07, "loss": 0.77563083, "num_input_tokens_seen": 317762280, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.671875, "step": 14735, "time_per_iteration": 3.9822239875793457 }, { "auxiliary_loss_clip": 0.01129716, "auxiliary_loss_mlp": 0.01033424, "balance_loss_clip": 1.02112162, "balance_loss_mlp": 1.03380585, "epoch": 0.8859762513151962, "flos": 18694484933760.0, "grad_norm": 1.6337980413623872, "language_loss": 0.70445955, "learning_rate": 1.2692406673121126e-07, "loss": 0.72609097, "num_input_tokens_seen": 317780615, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 14736, "time_per_iteration": 2.493907928466797 }, { "auxiliary_loss_clip": 0.01123136, "auxiliary_loss_mlp": 0.01028771, "balance_loss_clip": 1.01669526, "balance_loss_mlp": 1.03477025, "epoch": 0.8860363745678641, "flos": 19536877670400.0, "grad_norm": 1.7619540026887561, "language_loss": 0.84456575, "learning_rate": 1.2679164842966494e-07, "loss": 0.86608481, "num_input_tokens_seen": 317798830, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69921875, "step": 14737, "time_per_iteration": 2.5321409702301025 }, { "auxiliary_loss_clip": 0.0113671, "auxiliary_loss_mlp": 0.01034434, "balance_loss_clip": 1.02058172, "balance_loss_mlp": 1.03609896, "epoch": 0.8860964978205321, "flos": 35954912304000.0, "grad_norm": 2.1825321100701762, "language_loss": 0.68185729, "learning_rate": 1.2665929697810617e-07, "loss": 0.7035687, "num_input_tokens_seen": 317819235, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 14738, "time_per_iteration": 2.6578755378723145 }, { "auxiliary_loss_clip": 0.0111627, "auxiliary_loss_mlp": 0.01029871, "balance_loss_clip": 1.01675844, "balance_loss_mlp": 1.03352058, "epoch": 0.8861566210732, "flos": 21212577002880.0, "grad_norm": 2.0134991967432176, "language_loss": 0.75315881, "learning_rate": 1.265270123812583e-07, "loss": 0.77462018, "num_input_tokens_seen": 317836785, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 14739, "time_per_iteration": 2.5561819076538086 }, { "auxiliary_loss_clip": 0.01110952, "auxiliary_loss_mlp": 0.01032972, "balance_loss_clip": 1.02149212, "balance_loss_mlp": 1.0348587, "epoch": 0.886216744325868, "flos": 14609071843200.0, "grad_norm": 3.230301405278888, "language_loss": 0.87144804, "learning_rate": 1.263947946438424e-07, "loss": 0.89288723, "num_input_tokens_seen": 317854225, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.671875, "step": 14740, "time_per_iteration": 2.5701303482055664 }, { "auxiliary_loss_clip": 0.01138709, "auxiliary_loss_mlp": 0.01032785, "balance_loss_clip": 1.02013099, "balance_loss_mlp": 1.03553796, "epoch": 0.886276867578536, "flos": 26651643062400.0, "grad_norm": 1.6737572607207607, "language_loss": 0.62945151, "learning_rate": 1.262626437705767e-07, "loss": 0.65116644, "num_input_tokens_seen": 317874865, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 14741, "time_per_iteration": 2.644862651824951 }, { "auxiliary_loss_clip": 0.01101389, "auxiliary_loss_mlp": 0.01028369, "balance_loss_clip": 1.01629949, "balance_loss_mlp": 1.0338515, "epoch": 0.886336990831204, "flos": 16764071281920.0, "grad_norm": 1.8006884644518482, "language_loss": 0.72938555, "learning_rate": 1.2613055976617748e-07, "loss": 0.75068313, "num_input_tokens_seen": 317892830, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 14742, "time_per_iteration": 2.484462022781372 }, { "auxiliary_loss_clip": 0.01111432, "auxiliary_loss_mlp": 0.01278581, "balance_loss_clip": 1.01954687, "balance_loss_mlp": 1.03449869, "epoch": 0.8863971140838719, "flos": 25265275781760.0, "grad_norm": 1.9163505470377928, "language_loss": 0.59252512, "learning_rate": 1.2599854263535824e-07, "loss": 0.61642516, "num_input_tokens_seen": 317911780, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 14743, "time_per_iteration": 2.5770938396453857 }, { "auxiliary_loss_clip": 0.01132269, "auxiliary_loss_mlp": 0.01033576, "balance_loss_clip": 1.02025986, "balance_loss_mlp": 1.03435731, "epoch": 0.8864572373365399, "flos": 23404313076480.0, "grad_norm": 1.706584208694846, "language_loss": 0.60069287, "learning_rate": 1.258665923828308e-07, "loss": 0.62235129, "num_input_tokens_seen": 317932855, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 14744, "time_per_iteration": 2.5724122524261475 }, { "auxiliary_loss_clip": 0.01114148, "auxiliary_loss_mlp": 0.01281967, "balance_loss_clip": 1.02212369, "balance_loss_mlp": 1.03552663, "epoch": 0.8865173605892078, "flos": 18548759456640.0, "grad_norm": 1.6530749559500124, "language_loss": 0.76733601, "learning_rate": 1.257347090133034e-07, "loss": 0.7912972, "num_input_tokens_seen": 317952090, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 14745, "time_per_iteration": 2.5772626399993896 }, { "auxiliary_loss_clip": 0.01101902, "auxiliary_loss_mlp": 0.01279312, "balance_loss_clip": 1.02056503, "balance_loss_mlp": 1.03447533, "epoch": 0.8865774838418758, "flos": 19025868833280.0, "grad_norm": 1.6148770227415095, "language_loss": 0.77068841, "learning_rate": 1.256028925314836e-07, "loss": 0.79450059, "num_input_tokens_seen": 317970370, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 14746, "time_per_iteration": 2.5041606426239014 }, { "auxiliary_loss_clip": 0.01125246, "auxiliary_loss_mlp": 0.01037108, "balance_loss_clip": 1.02389908, "balance_loss_mlp": 1.03642201, "epoch": 0.8866376070945439, "flos": 22163168482560.0, "grad_norm": 2.193893902519011, "language_loss": 0.76886261, "learning_rate": 1.254711429420743e-07, "loss": 0.7904861, "num_input_tokens_seen": 317989125, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 14747, "time_per_iteration": 2.667083263397217 }, { "auxiliary_loss_clip": 0.01116258, "auxiliary_loss_mlp": 0.01274601, "balance_loss_clip": 1.0157752, "balance_loss_mlp": 1.03167796, "epoch": 0.8866977303472118, "flos": 20704261685760.0, "grad_norm": 2.1676226088375894, "language_loss": 0.82761765, "learning_rate": 1.253394602497786e-07, "loss": 0.85152626, "num_input_tokens_seen": 318007820, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66796875, "step": 14748, "time_per_iteration": 2.516921281814575 }, { "auxiliary_loss_clip": 0.01113161, "auxiliary_loss_mlp": 0.01035441, "balance_loss_clip": 1.02282917, "balance_loss_mlp": 1.03525257, "epoch": 0.8867578535998798, "flos": 48794448533760.0, "grad_norm": 3.1367497061819236, "language_loss": 0.77690899, "learning_rate": 1.25207844459295e-07, "loss": 0.79839504, "num_input_tokens_seen": 318030435, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 14749, "time_per_iteration": 2.7970798015594482 }, { "auxiliary_loss_clip": 0.01149507, "auxiliary_loss_mlp": 0.012751, "balance_loss_clip": 1.01559258, "balance_loss_mlp": 1.0330447, "epoch": 0.8868179768525477, "flos": 21105312013440.0, "grad_norm": 2.0006655794356902, "language_loss": 0.69574434, "learning_rate": 1.2507629557532152e-07, "loss": 0.71999037, "num_input_tokens_seen": 318049465, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 14750, "time_per_iteration": 2.5927329063415527 }, { "auxiliary_loss_clip": 0.01111106, "auxiliary_loss_mlp": 0.0103063, "balance_loss_clip": 1.01860154, "balance_loss_mlp": 1.03399503, "epoch": 0.8868781001052157, "flos": 21830922656640.0, "grad_norm": 2.3391780485469096, "language_loss": 0.59620643, "learning_rate": 1.249448136025515e-07, "loss": 0.61762381, "num_input_tokens_seen": 318067760, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 14751, "time_per_iteration": 2.578249216079712 }, { "auxiliary_loss_clip": 0.01102619, "auxiliary_loss_mlp": 0.01029736, "balance_loss_clip": 1.01768994, "balance_loss_mlp": 1.03449368, "epoch": 0.8869382233578836, "flos": 13516418073600.0, "grad_norm": 2.073669431070268, "language_loss": 0.81452513, "learning_rate": 1.2481339854567806e-07, "loss": 0.83584869, "num_input_tokens_seen": 318082785, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.68359375, "step": 14752, "time_per_iteration": 2.523012638092041 }, { "auxiliary_loss_clip": 0.01125424, "auxiliary_loss_mlp": 0.01033971, "balance_loss_clip": 1.02088821, "balance_loss_mlp": 1.03484082, "epoch": 0.8869983466105517, "flos": 22704988210560.0, "grad_norm": 1.9185633319418982, "language_loss": 0.79979169, "learning_rate": 1.2468205040939106e-07, "loss": 0.82138562, "num_input_tokens_seen": 318101925, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 14753, "time_per_iteration": 2.6339986324310303 }, { "auxiliary_loss_clip": 0.01101172, "auxiliary_loss_mlp": 0.01032026, "balance_loss_clip": 1.02035594, "balance_loss_mlp": 1.0334518, "epoch": 0.8870584698632196, "flos": 15340751884800.0, "grad_norm": 2.3781631537292056, "language_loss": 0.65289879, "learning_rate": 1.245507691983776e-07, "loss": 0.67423081, "num_input_tokens_seen": 318119945, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 14754, "time_per_iteration": 2.5367214679718018 }, { "auxiliary_loss_clip": 0.01105007, "auxiliary_loss_mlp": 0.01033864, "balance_loss_clip": 1.0212394, "balance_loss_mlp": 1.03543806, "epoch": 0.8871185931158876, "flos": 26615624699520.0, "grad_norm": 2.123997884612316, "language_loss": 0.74219978, "learning_rate": 1.244195549173228e-07, "loss": 0.76358849, "num_input_tokens_seen": 318139685, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 14755, "time_per_iteration": 2.6210477352142334 }, { "auxiliary_loss_clip": 0.01121607, "auxiliary_loss_mlp": 0.01032498, "balance_loss_clip": 1.01871097, "balance_loss_mlp": 1.03365433, "epoch": 0.8871787163685555, "flos": 21799034357760.0, "grad_norm": 1.516559791446696, "language_loss": 0.77980459, "learning_rate": 1.2428840757090987e-07, "loss": 0.80134565, "num_input_tokens_seen": 318160375, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.69921875, "step": 14756, "time_per_iteration": 2.5069990158081055 }, { "auxiliary_loss_clip": 0.01121844, "auxiliary_loss_mlp": 0.0103228, "balance_loss_clip": 1.01970315, "balance_loss_mlp": 1.03486741, "epoch": 0.8872388396212235, "flos": 14902964922240.0, "grad_norm": 2.5476654727751975, "language_loss": 0.76444542, "learning_rate": 1.24157327163819e-07, "loss": 0.78598666, "num_input_tokens_seen": 318177995, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 14757, "time_per_iteration": 2.4857635498046875 }, { "auxiliary_loss_clip": 0.01144012, "auxiliary_loss_mlp": 0.01033477, "balance_loss_clip": 1.02181244, "balance_loss_mlp": 1.03371, "epoch": 0.8872989628738914, "flos": 26432157006720.0, "grad_norm": 2.100514415263354, "language_loss": 0.67692357, "learning_rate": 1.2402631370072714e-07, "loss": 0.69869852, "num_input_tokens_seen": 318197030, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.65625, "step": 14758, "time_per_iteration": 2.6103851795196533 }, { "auxiliary_loss_clip": 0.01103167, "auxiliary_loss_mlp": 0.01034134, "balance_loss_clip": 1.02075887, "balance_loss_mlp": 1.03515875, "epoch": 0.8873590861265594, "flos": 48142562555520.0, "grad_norm": 1.8217317665408772, "language_loss": 0.69036382, "learning_rate": 1.23895367186311e-07, "loss": 0.7117368, "num_input_tokens_seen": 318221780, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6796875, "step": 14759, "time_per_iteration": 2.784715414047241 }, { "auxiliary_loss_clip": 0.01100177, "auxiliary_loss_mlp": 0.01030899, "balance_loss_clip": 1.01867461, "balance_loss_mlp": 1.03219557, "epoch": 0.8874192093792275, "flos": 18332972501760.0, "grad_norm": 1.9592724798439298, "language_loss": 0.7498247, "learning_rate": 1.2376448762524326e-07, "loss": 0.77113545, "num_input_tokens_seen": 318239710, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 14760, "time_per_iteration": 2.540119171142578 }, { "auxiliary_loss_clip": 0.01123828, "auxiliary_loss_mlp": 0.01033655, "balance_loss_clip": 1.02085149, "balance_loss_mlp": 1.03578031, "epoch": 0.8874793326318954, "flos": 17894215872000.0, "grad_norm": 2.178892529399914, "language_loss": 0.76079631, "learning_rate": 1.236336750221947e-07, "loss": 0.78237116, "num_input_tokens_seen": 318257425, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 14761, "time_per_iteration": 2.501777410507202 }, { "auxiliary_loss_clip": 0.01103188, "auxiliary_loss_mlp": 0.01036978, "balance_loss_clip": 1.02421093, "balance_loss_mlp": 1.03355956, "epoch": 0.8875394558845634, "flos": 31102231772160.0, "grad_norm": 1.6644050777029853, "language_loss": 0.61384636, "learning_rate": 1.2350292938183348e-07, "loss": 0.63524801, "num_input_tokens_seen": 318278485, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 14762, "time_per_iteration": 3.9707465171813965 }, { "auxiliary_loss_clip": 0.01111601, "auxiliary_loss_mlp": 0.0103484, "balance_loss_clip": 1.02169776, "balance_loss_mlp": 1.03259945, "epoch": 0.8875995791372313, "flos": 21142048648320.0, "grad_norm": 1.7246294286826866, "language_loss": 0.64536655, "learning_rate": 1.2337225070882616e-07, "loss": 0.6668309, "num_input_tokens_seen": 318297560, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69921875, "step": 14763, "time_per_iteration": 2.5065484046936035 }, { "auxiliary_loss_clip": 0.01120904, "auxiliary_loss_mlp": 0.01277834, "balance_loss_clip": 1.01800811, "balance_loss_mlp": 1.03318846, "epoch": 0.8876597023898993, "flos": 17455136019840.0, "grad_norm": 2.3082835838100455, "language_loss": 0.70853019, "learning_rate": 1.232416390078357e-07, "loss": 0.7325176, "num_input_tokens_seen": 318313060, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 14764, "time_per_iteration": 2.624643564224243 }, { "auxiliary_loss_clip": 0.01105243, "auxiliary_loss_mlp": 0.01036479, "balance_loss_clip": 1.02412915, "balance_loss_mlp": 1.03317976, "epoch": 0.8877198256425672, "flos": 22707933125760.0, "grad_norm": 2.829982327425785, "language_loss": 0.65824735, "learning_rate": 1.2311109428352363e-07, "loss": 0.67966449, "num_input_tokens_seen": 318332030, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71875, "step": 14765, "time_per_iteration": 2.4901747703552246 }, { "auxiliary_loss_clip": 0.01125448, "auxiliary_loss_mlp": 0.01026744, "balance_loss_clip": 1.01600909, "balance_loss_mlp": 1.03429794, "epoch": 0.8877799488952353, "flos": 24535104111360.0, "grad_norm": 1.5178940061632429, "language_loss": 0.76379108, "learning_rate": 1.2298061654054825e-07, "loss": 0.78531295, "num_input_tokens_seen": 318351090, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.64453125, "step": 14766, "time_per_iteration": 2.569973945617676 }, { "auxiliary_loss_clip": 0.01117307, "auxiliary_loss_mlp": 0.01031776, "balance_loss_clip": 1.01963449, "balance_loss_mlp": 1.03281665, "epoch": 0.8878400721479032, "flos": 20959191486720.0, "grad_norm": 1.7078784356840935, "language_loss": 0.73209333, "learning_rate": 1.228502057835672e-07, "loss": 0.75358421, "num_input_tokens_seen": 318372000, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6640625, "step": 14767, "time_per_iteration": 2.5686724185943604 }, { "auxiliary_loss_clip": 0.01098156, "auxiliary_loss_mlp": 0.01028703, "balance_loss_clip": 1.01699638, "balance_loss_mlp": 1.03318501, "epoch": 0.8879001954005712, "flos": 25295260659840.0, "grad_norm": 1.7382659318765732, "language_loss": 0.71166825, "learning_rate": 1.2271986201723294e-07, "loss": 0.73293686, "num_input_tokens_seen": 318391530, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6484375, "step": 14768, "time_per_iteration": 3.914360761642456 }, { "auxiliary_loss_clip": 0.01111359, "auxiliary_loss_mlp": 0.01031865, "balance_loss_clip": 1.02027237, "balance_loss_mlp": 1.03454447, "epoch": 0.8879603186532391, "flos": 23185329811200.0, "grad_norm": 1.9924835767482976, "language_loss": 0.70285976, "learning_rate": 1.2258958524619822e-07, "loss": 0.72429204, "num_input_tokens_seen": 318410690, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 14769, "time_per_iteration": 2.542595386505127 }, { "auxiliary_loss_clip": 0.01117747, "auxiliary_loss_mlp": 0.01031575, "balance_loss_clip": 1.02014327, "balance_loss_mlp": 1.03398108, "epoch": 0.8880204419059071, "flos": 13655427707520.0, "grad_norm": 1.930680246398618, "language_loss": 0.67068362, "learning_rate": 1.2245937547511199e-07, "loss": 0.69217682, "num_input_tokens_seen": 318427380, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66015625, "step": 14770, "time_per_iteration": 2.493648052215576 }, { "auxiliary_loss_clip": 0.01107106, "auxiliary_loss_mlp": 0.01029245, "balance_loss_clip": 1.01728249, "balance_loss_mlp": 1.03346765, "epoch": 0.888080565158575, "flos": 20631865824000.0, "grad_norm": 2.1368671523221243, "language_loss": 0.65415186, "learning_rate": 1.2232923270862094e-07, "loss": 0.67551541, "num_input_tokens_seen": 318448530, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.65234375, "step": 14771, "time_per_iteration": 2.5653181076049805 }, { "auxiliary_loss_clip": 0.01119031, "auxiliary_loss_mlp": 0.01026889, "balance_loss_clip": 1.01503396, "balance_loss_mlp": 1.03311825, "epoch": 0.888140688411243, "flos": 28620014411520.0, "grad_norm": 1.4827024522343608, "language_loss": 0.82665741, "learning_rate": 1.2219915695136984e-07, "loss": 0.84811664, "num_input_tokens_seen": 318468655, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 14772, "time_per_iteration": 2.6067521572113037 }, { "auxiliary_loss_clip": 0.01114956, "auxiliary_loss_mlp": 0.01023481, "balance_loss_clip": 1.01224577, "balance_loss_mlp": 1.03189957, "epoch": 0.8882008116639111, "flos": 25520241496320.0, "grad_norm": 1.9612738272850632, "language_loss": 0.76365662, "learning_rate": 1.2206914820800028e-07, "loss": 0.78504097, "num_input_tokens_seen": 318488740, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6484375, "step": 14773, "time_per_iteration": 2.7462854385375977 }, { "auxiliary_loss_clip": 0.01117191, "auxiliary_loss_mlp": 0.01026069, "balance_loss_clip": 1.01468468, "balance_loss_mlp": 1.03416467, "epoch": 0.888260934916579, "flos": 27673696650240.0, "grad_norm": 1.8711770075600944, "language_loss": 0.75229943, "learning_rate": 1.2193920648315238e-07, "loss": 0.77373201, "num_input_tokens_seen": 318508810, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.65625, "step": 14774, "time_per_iteration": 2.5948665142059326 }, { "auxiliary_loss_clip": 0.0114195, "auxiliary_loss_mlp": 0.0128194, "balance_loss_clip": 1.02191901, "balance_loss_mlp": 1.03589356, "epoch": 0.888321058169247, "flos": 21611077464960.0, "grad_norm": 6.114056193372572, "language_loss": 0.72465444, "learning_rate": 1.2180933178146303e-07, "loss": 0.74889338, "num_input_tokens_seen": 318526860, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 14775, "time_per_iteration": 4.013475179672241 }, { "auxiliary_loss_clip": 0.01100889, "auxiliary_loss_mlp": 0.01029577, "balance_loss_clip": 1.01772165, "balance_loss_mlp": 1.03313005, "epoch": 0.8883811814219149, "flos": 18149109759360.0, "grad_norm": 1.5894037550958926, "language_loss": 0.80467308, "learning_rate": 1.2167952410756742e-07, "loss": 0.82597768, "num_input_tokens_seen": 318545180, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 14776, "time_per_iteration": 2.4929895401000977 }, { "auxiliary_loss_clip": 0.01115553, "auxiliary_loss_mlp": 0.01035083, "balance_loss_clip": 1.02203548, "balance_loss_mlp": 1.03388262, "epoch": 0.8884413046745829, "flos": 28324648874880.0, "grad_norm": 3.5298879444492277, "language_loss": 0.69395018, "learning_rate": 1.2154978346609812e-07, "loss": 0.7154566, "num_input_tokens_seen": 318564350, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 14777, "time_per_iteration": 4.056652307510376 }, { "auxiliary_loss_clip": 0.01108807, "auxiliary_loss_mlp": 0.01035967, "balance_loss_clip": 1.02396846, "balance_loss_mlp": 1.03185499, "epoch": 0.8885014279272508, "flos": 15158756649600.0, "grad_norm": 1.6328266990658875, "language_loss": 0.70514441, "learning_rate": 1.2142010986168493e-07, "loss": 0.72659206, "num_input_tokens_seen": 318582275, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 14778, "time_per_iteration": 2.5205516815185547 }, { "auxiliary_loss_clip": 0.01103419, "auxiliary_loss_mlp": 0.01032348, "balance_loss_clip": 1.02046299, "balance_loss_mlp": 1.03451848, "epoch": 0.8885615511799189, "flos": 19099593498240.0, "grad_norm": 1.5487478239579011, "language_loss": 0.77167571, "learning_rate": 1.212905032989555e-07, "loss": 0.79303342, "num_input_tokens_seen": 318601230, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 14779, "time_per_iteration": 2.481212854385376 }, { "auxiliary_loss_clip": 0.01040256, "auxiliary_loss_mlp": 0.01001112, "balance_loss_clip": 0.99987841, "balance_loss_mlp": 1.00137424, "epoch": 0.8886216744325868, "flos": 51186567605760.0, "grad_norm": 0.8795166968927024, "language_loss": 0.56804806, "learning_rate": 1.2116096378253616e-07, "loss": 0.58846176, "num_input_tokens_seen": 318645595, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.2109375, "step": 14780, "time_per_iteration": 2.919813871383667 }, { "auxiliary_loss_clip": 0.01022238, "auxiliary_loss_mlp": 0.01001804, "balance_loss_clip": 1.00055218, "balance_loss_mlp": 1.00112486, "epoch": 0.8886817976852548, "flos": 61612981263360.0, "grad_norm": 0.8052675984246302, "language_loss": 0.62428153, "learning_rate": 1.2103149131704827e-07, "loss": 0.64452195, "num_input_tokens_seen": 318707850, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.2109375, "step": 14781, "time_per_iteration": 3.171459436416626 }, { "auxiliary_loss_clip": 0.01082725, "auxiliary_loss_mlp": 0.01004147, "balance_loss_clip": 1.00291288, "balance_loss_mlp": 1.00106382, "epoch": 0.8887419209379227, "flos": 60646946935680.0, "grad_norm": 0.7857742587898149, "language_loss": 0.58121449, "learning_rate": 1.209020859071137e-07, "loss": 0.60208321, "num_input_tokens_seen": 318764915, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.2109375, "step": 14782, "time_per_iteration": 3.039487361907959 }, { "auxiliary_loss_clip": 0.0110064, "auxiliary_loss_mlp": 0.01028955, "balance_loss_clip": 1.01760602, "balance_loss_mlp": 1.03435457, "epoch": 0.8888020441905907, "flos": 29205861235200.0, "grad_norm": 2.128692654288903, "language_loss": 0.65901721, "learning_rate": 1.2077274755734967e-07, "loss": 0.68031311, "num_input_tokens_seen": 318785660, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6640625, "step": 14783, "time_per_iteration": 2.6107184886932373 }, { "auxiliary_loss_clip": 0.01104335, "auxiliary_loss_mlp": 0.01031039, "balance_loss_clip": 1.01827168, "balance_loss_mlp": 1.03369331, "epoch": 0.8888621674432586, "flos": 22162701605760.0, "grad_norm": 1.4850726716948133, "language_loss": 0.77530539, "learning_rate": 1.206434762723727e-07, "loss": 0.79665911, "num_input_tokens_seen": 318806080, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 14784, "time_per_iteration": 2.492439031600952 }, { "auxiliary_loss_clip": 0.01122665, "auxiliary_loss_mlp": 0.01033905, "balance_loss_clip": 1.02155471, "balance_loss_mlp": 1.03539777, "epoch": 0.8889222906959267, "flos": 27672834723840.0, "grad_norm": 4.670102752487299, "language_loss": 0.60838169, "learning_rate": 1.2051427205679554e-07, "loss": 0.62994742, "num_input_tokens_seen": 318826445, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 14785, "time_per_iteration": 2.5780222415924072 }, { "auxiliary_loss_clip": 0.01130152, "auxiliary_loss_mlp": 0.01030607, "balance_loss_clip": 1.01789308, "balance_loss_mlp": 1.0356015, "epoch": 0.8889824139485947, "flos": 14168627274240.0, "grad_norm": 1.7198481216103034, "language_loss": 0.65153611, "learning_rate": 1.2038513491522938e-07, "loss": 0.67314368, "num_input_tokens_seen": 318843915, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 14786, "time_per_iteration": 2.552997350692749 }, { "auxiliary_loss_clip": 0.01133491, "auxiliary_loss_mlp": 0.01279157, "balance_loss_clip": 1.01841068, "balance_loss_mlp": 1.03452528, "epoch": 0.8890425372012626, "flos": 12853003829760.0, "grad_norm": 3.019941216360414, "language_loss": 0.8544277, "learning_rate": 1.2025606485228324e-07, "loss": 0.87855417, "num_input_tokens_seen": 318859670, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 14787, "time_per_iteration": 2.557004928588867 }, { "auxiliary_loss_clip": 0.01120163, "auxiliary_loss_mlp": 0.010304, "balance_loss_clip": 1.01844335, "balance_loss_mlp": 1.0335952, "epoch": 0.8891026604539306, "flos": 21689291329920.0, "grad_norm": 2.254150305013642, "language_loss": 0.71022701, "learning_rate": 1.2012706187256227e-07, "loss": 0.73173267, "num_input_tokens_seen": 318877855, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 14788, "time_per_iteration": 2.531403064727783 }, { "auxiliary_loss_clip": 0.01100597, "auxiliary_loss_mlp": 0.01028193, "balance_loss_clip": 1.01665914, "balance_loss_mlp": 1.03435564, "epoch": 0.8891627837065985, "flos": 21871430219520.0, "grad_norm": 1.5765593243815317, "language_loss": 0.70205212, "learning_rate": 1.1999812598067128e-07, "loss": 0.72334009, "num_input_tokens_seen": 318896045, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 14789, "time_per_iteration": 2.556544542312622 }, { "auxiliary_loss_clip": 0.01112232, "auxiliary_loss_mlp": 0.01027131, "balance_loss_clip": 1.01508451, "balance_loss_mlp": 1.03554356, "epoch": 0.8892229069592665, "flos": 21580230660480.0, "grad_norm": 2.1887920836479715, "language_loss": 0.70575881, "learning_rate": 1.19869257181211e-07, "loss": 0.72715247, "num_input_tokens_seen": 318915515, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.67578125, "step": 14790, "time_per_iteration": 2.5319390296936035 }, { "auxiliary_loss_clip": 0.01120938, "auxiliary_loss_mlp": 0.01028418, "balance_loss_clip": 1.01720667, "balance_loss_mlp": 1.0349133, "epoch": 0.8892830302119344, "flos": 23075981832960.0, "grad_norm": 1.6699384539830975, "language_loss": 0.728742, "learning_rate": 1.1974045547878087e-07, "loss": 0.75023556, "num_input_tokens_seen": 318934305, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.68359375, "step": 14791, "time_per_iteration": 2.5429258346557617 }, { "auxiliary_loss_clip": 0.01127569, "auxiliary_loss_mlp": 0.01036215, "balance_loss_clip": 1.02374554, "balance_loss_mlp": 1.03536892, "epoch": 0.8893431534646025, "flos": 23072139077760.0, "grad_norm": 1.4449577418477908, "language_loss": 0.73902917, "learning_rate": 1.1961172087797677e-07, "loss": 0.76066697, "num_input_tokens_seen": 318953880, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.65625, "step": 14792, "time_per_iteration": 2.5597715377807617 }, { "auxiliary_loss_clip": 0.01160184, "auxiliary_loss_mlp": 0.01032944, "balance_loss_clip": 1.01915717, "balance_loss_mlp": 1.03532863, "epoch": 0.8894032767172704, "flos": 22454978572800.0, "grad_norm": 1.9060583705309353, "language_loss": 0.65894336, "learning_rate": 1.1948305338339392e-07, "loss": 0.68087459, "num_input_tokens_seen": 318971395, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71484375, "step": 14793, "time_per_iteration": 2.592176675796509 }, { "auxiliary_loss_clip": 0.01130952, "auxiliary_loss_mlp": 0.01034765, "balance_loss_clip": 1.02142549, "balance_loss_mlp": 1.03421474, "epoch": 0.8894633999699384, "flos": 25338246261120.0, "grad_norm": 1.6018303619689886, "language_loss": 0.7165575, "learning_rate": 1.1935445299962355e-07, "loss": 0.73821467, "num_input_tokens_seen": 318990580, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.70703125, "step": 14794, "time_per_iteration": 2.606549024581909 }, { "auxiliary_loss_clip": 0.01117651, "auxiliary_loss_mlp": 0.01033522, "balance_loss_clip": 1.02045667, "balance_loss_mlp": 1.03666997, "epoch": 0.8895235232226063, "flos": 20994096528000.0, "grad_norm": 1.568349095730693, "language_loss": 0.75333726, "learning_rate": 1.1922591973125507e-07, "loss": 0.77484894, "num_input_tokens_seen": 319010040, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 14795, "time_per_iteration": 2.688469171524048 }, { "auxiliary_loss_clip": 0.01048931, "auxiliary_loss_mlp": 0.01000448, "balance_loss_clip": 0.99914855, "balance_loss_mlp": 1.00124383, "epoch": 0.8895836464752743, "flos": 69732956764800.0, "grad_norm": 0.768223368826408, "language_loss": 0.56140757, "learning_rate": 1.1909745358287527e-07, "loss": 0.58190131, "num_input_tokens_seen": 319063860, "router_z_loss_clip": 0.01300049, "router_z_loss_mlp": 0.2109375, "step": 14796, "time_per_iteration": 3.074242115020752 }, { "auxiliary_loss_clip": 0.01105342, "auxiliary_loss_mlp": 0.01032501, "balance_loss_clip": 1.01867294, "balance_loss_mlp": 1.03500891, "epoch": 0.8896437697279422, "flos": 23221815050880.0, "grad_norm": 1.8249942906711658, "language_loss": 0.70532966, "learning_rate": 1.189690545590698e-07, "loss": 0.72670817, "num_input_tokens_seen": 319082335, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.703125, "step": 14797, "time_per_iteration": 2.5504086017608643 }, { "auxiliary_loss_clip": 0.01040218, "auxiliary_loss_mlp": 0.01001767, "balance_loss_clip": 1.0005213, "balance_loss_mlp": 1.00107026, "epoch": 0.8897038929806103, "flos": 70752711882240.0, "grad_norm": 0.7620966596229978, "language_loss": 0.5809108, "learning_rate": 1.188407226644197e-07, "loss": 0.60133076, "num_input_tokens_seen": 319147075, "router_z_loss_clip": 0.01245117, "router_z_loss_mlp": 0.2109375, "step": 14798, "time_per_iteration": 3.2171831130981445 }, { "auxiliary_loss_clip": 0.01111263, "auxiliary_loss_mlp": 0.01025919, "balance_loss_clip": 1.01345003, "balance_loss_mlp": 1.03453326, "epoch": 0.8897640162332782, "flos": 19245103493760.0, "grad_norm": 1.6422672779272773, "language_loss": 0.79142189, "learning_rate": 1.1871245790350526e-07, "loss": 0.81279373, "num_input_tokens_seen": 319166630, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 14799, "time_per_iteration": 2.4874825477600098 }, { "auxiliary_loss_clip": 0.01030967, "auxiliary_loss_mlp": 0.01002179, "balance_loss_clip": 1.00090933, "balance_loss_mlp": 1.00096726, "epoch": 0.8898241394859462, "flos": 41356275039360.0, "grad_norm": 0.8814230862581653, "language_loss": 0.5813235, "learning_rate": 1.1858426028090396e-07, "loss": 0.60165501, "num_input_tokens_seen": 319221865, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.2109375, "step": 14800, "time_per_iteration": 3.0514512062072754 }, { "auxiliary_loss_clip": 0.01112245, "auxiliary_loss_mlp": 0.01033572, "balance_loss_clip": 1.02181816, "balance_loss_mlp": 1.03549361, "epoch": 0.8898842627386142, "flos": 18986295024000.0, "grad_norm": 2.0669146387760313, "language_loss": 0.66451639, "learning_rate": 1.1845612980119124e-07, "loss": 0.6859746, "num_input_tokens_seen": 319240710, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 14801, "time_per_iteration": 2.493488311767578 }, { "auxiliary_loss_clip": 0.01132568, "auxiliary_loss_mlp": 0.01036641, "balance_loss_clip": 1.02261567, "balance_loss_mlp": 1.03490186, "epoch": 0.8899443859912821, "flos": 13217173868160.0, "grad_norm": 2.7858057841914934, "language_loss": 0.75714326, "learning_rate": 1.1832806646893922e-07, "loss": 0.77883536, "num_input_tokens_seen": 319256495, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.703125, "step": 14802, "time_per_iteration": 2.5640196800231934 }, { "auxiliary_loss_clip": 0.01110914, "auxiliary_loss_mlp": 0.01030151, "balance_loss_clip": 1.01837873, "balance_loss_mlp": 1.03492165, "epoch": 0.8900045092439501, "flos": 22674680110080.0, "grad_norm": 1.7972392408080964, "language_loss": 0.73694378, "learning_rate": 1.1820007028871781e-07, "loss": 0.75835443, "num_input_tokens_seen": 319273620, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.66796875, "step": 14803, "time_per_iteration": 3.9207279682159424 }, { "auxiliary_loss_clip": 0.01118634, "auxiliary_loss_mlp": 0.01031658, "balance_loss_clip": 1.01956463, "balance_loss_mlp": 1.03319085, "epoch": 0.890064632496618, "flos": 21141617685120.0, "grad_norm": 1.9099094417842082, "language_loss": 0.72163248, "learning_rate": 1.1807214126509602e-07, "loss": 0.74313533, "num_input_tokens_seen": 319291720, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 14804, "time_per_iteration": 2.513643741607666 }, { "auxiliary_loss_clip": 0.01111283, "auxiliary_loss_mlp": 0.01031233, "balance_loss_clip": 1.01841164, "balance_loss_mlp": 1.03217649, "epoch": 0.8901247557492861, "flos": 23397058529280.0, "grad_norm": 1.7603458304766006, "language_loss": 0.81279552, "learning_rate": 1.1794427940263774e-07, "loss": 0.83422065, "num_input_tokens_seen": 319310380, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 14805, "time_per_iteration": 2.5131142139434814 }, { "auxiliary_loss_clip": 0.01114036, "auxiliary_loss_mlp": 0.01031746, "balance_loss_clip": 1.01911032, "balance_loss_mlp": 1.03455889, "epoch": 0.890184879001954, "flos": 29169591477120.0, "grad_norm": 1.9379852000845155, "language_loss": 0.67590845, "learning_rate": 1.1781648470590755e-07, "loss": 0.69736624, "num_input_tokens_seen": 319331765, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 14806, "time_per_iteration": 2.590261697769165 }, { "auxiliary_loss_clip": 0.01119679, "auxiliary_loss_mlp": 0.01031441, "balance_loss_clip": 1.01909089, "balance_loss_mlp": 1.03418779, "epoch": 0.890245002254622, "flos": 14427830793600.0, "grad_norm": 2.0315138685638594, "language_loss": 0.67105961, "learning_rate": 1.1768875717946536e-07, "loss": 0.69257081, "num_input_tokens_seen": 319349135, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 14807, "time_per_iteration": 2.5197012424468994 }, { "auxiliary_loss_clip": 0.01117176, "auxiliary_loss_mlp": 0.01026824, "balance_loss_clip": 1.01535046, "balance_loss_mlp": 1.03242373, "epoch": 0.8903051255072899, "flos": 22382187661440.0, "grad_norm": 1.7329252039807965, "language_loss": 0.75453627, "learning_rate": 1.1756109682786952e-07, "loss": 0.7759763, "num_input_tokens_seen": 319368410, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.671875, "step": 14808, "time_per_iteration": 2.60427188873291 }, { "auxiliary_loss_clip": 0.01129344, "auxiliary_loss_mlp": 0.01033231, "balance_loss_clip": 1.021608, "balance_loss_mlp": 1.03341007, "epoch": 0.8903652487599579, "flos": 20777375819520.0, "grad_norm": 2.8110870215718067, "language_loss": 0.81508338, "learning_rate": 1.174335036556755e-07, "loss": 0.83670914, "num_input_tokens_seen": 319387535, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 14809, "time_per_iteration": 3.922858715057373 }, { "auxiliary_loss_clip": 0.01110263, "auxiliary_loss_mlp": 0.01031933, "balance_loss_clip": 1.01994681, "balance_loss_mlp": 1.03307867, "epoch": 0.8904253720126258, "flos": 24424499157120.0, "grad_norm": 1.7733940238044898, "language_loss": 0.68727148, "learning_rate": 1.1730597766743766e-07, "loss": 0.70869339, "num_input_tokens_seen": 319407210, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.68359375, "step": 14810, "time_per_iteration": 2.6235971450805664 }, { "auxiliary_loss_clip": 0.0113066, "auxiliary_loss_mlp": 0.01026885, "balance_loss_clip": 1.01426625, "balance_loss_mlp": 1.03340483, "epoch": 0.8904854952652939, "flos": 19463871277440.0, "grad_norm": 2.051133945125283, "language_loss": 0.70174479, "learning_rate": 1.1717851886770658e-07, "loss": 0.72332025, "num_input_tokens_seen": 319425340, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 14811, "time_per_iteration": 2.543534994125366 }, { "auxiliary_loss_clip": 0.01131467, "auxiliary_loss_mlp": 0.01276067, "balance_loss_clip": 1.01715446, "balance_loss_mlp": 1.03534293, "epoch": 0.8905456185179618, "flos": 50800741666560.0, "grad_norm": 2.3308668242808026, "language_loss": 0.65275759, "learning_rate": 1.1705112726103106e-07, "loss": 0.67683291, "num_input_tokens_seen": 319448150, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6953125, "step": 14812, "time_per_iteration": 2.806642770767212 }, { "auxiliary_loss_clip": 0.01137954, "auxiliary_loss_mlp": 0.01030823, "balance_loss_clip": 1.01934338, "balance_loss_mlp": 1.03465152, "epoch": 0.8906057417706298, "flos": 17784867893760.0, "grad_norm": 1.7131961206574238, "language_loss": 0.68342328, "learning_rate": 1.169238028519568e-07, "loss": 0.70511103, "num_input_tokens_seen": 319466115, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.67578125, "step": 14813, "time_per_iteration": 2.5677225589752197 }, { "auxiliary_loss_clip": 0.01126656, "auxiliary_loss_mlp": 0.0103285, "balance_loss_clip": 1.01851511, "balance_loss_mlp": 1.03515053, "epoch": 0.8906658650232978, "flos": 21944867575680.0, "grad_norm": 2.1272149801003146, "language_loss": 0.75486541, "learning_rate": 1.1679654564502884e-07, "loss": 0.77646047, "num_input_tokens_seen": 319485255, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.734375, "step": 14814, "time_per_iteration": 2.571660280227661 }, { "auxiliary_loss_clip": 0.01112776, "auxiliary_loss_mlp": 0.01028294, "balance_loss_clip": 1.0154016, "balance_loss_mlp": 1.03330588, "epoch": 0.8907259882759657, "flos": 21287810039040.0, "grad_norm": 2.1672384925696053, "language_loss": 0.7422958, "learning_rate": 1.1666935564478753e-07, "loss": 0.76370651, "num_input_tokens_seen": 319501800, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 14815, "time_per_iteration": 2.5209381580352783 }, { "auxiliary_loss_clip": 0.0112715, "auxiliary_loss_mlp": 0.01031436, "balance_loss_clip": 1.01912212, "balance_loss_mlp": 1.03401589, "epoch": 0.8907861115286337, "flos": 20120426023680.0, "grad_norm": 1.8533859427755306, "language_loss": 0.75526506, "learning_rate": 1.1654223285577259e-07, "loss": 0.77685094, "num_input_tokens_seen": 319520415, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.66015625, "step": 14816, "time_per_iteration": 4.108804702758789 }, { "auxiliary_loss_clip": 0.01121057, "auxiliary_loss_mlp": 0.01028268, "balance_loss_clip": 1.01603103, "balance_loss_mlp": 1.03300095, "epoch": 0.8908462347813016, "flos": 20084156265600.0, "grad_norm": 1.8742609383684572, "language_loss": 0.77951354, "learning_rate": 1.1641517728252082e-07, "loss": 0.80100679, "num_input_tokens_seen": 319538410, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 14817, "time_per_iteration": 2.5513203144073486 }, { "auxiliary_loss_clip": 0.01049013, "auxiliary_loss_mlp": 0.01003197, "balance_loss_clip": 1.00185585, "balance_loss_mlp": 1.00086212, "epoch": 0.8909063580339697, "flos": 65503649790720.0, "grad_norm": 0.7553115175863838, "language_loss": 0.56549811, "learning_rate": 1.1628818892956549e-07, "loss": 0.58602023, "num_input_tokens_seen": 319602565, "router_z_loss_clip": 0.01342773, "router_z_loss_mlp": 0.21191406, "step": 14818, "time_per_iteration": 4.718410968780518 }, { "auxiliary_loss_clip": 0.01138283, "auxiliary_loss_mlp": 0.01033983, "balance_loss_clip": 1.02223468, "balance_loss_mlp": 1.03467369, "epoch": 0.8909664812866376, "flos": 29863062426240.0, "grad_norm": 1.8544161813066178, "language_loss": 0.64394617, "learning_rate": 1.1616126780143964e-07, "loss": 0.66566885, "num_input_tokens_seen": 319624645, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.67578125, "step": 14819, "time_per_iteration": 2.6729142665863037 }, { "auxiliary_loss_clip": 0.01113266, "auxiliary_loss_mlp": 0.01029468, "balance_loss_clip": 1.01609826, "balance_loss_mlp": 1.03466988, "epoch": 0.8910266045393056, "flos": 25447127362560.0, "grad_norm": 1.8153114462198066, "language_loss": 0.78546834, "learning_rate": 1.160344139026721e-07, "loss": 0.80689567, "num_input_tokens_seen": 319644040, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 14820, "time_per_iteration": 2.5467135906219482 }, { "auxiliary_loss_clip": 0.01129556, "auxiliary_loss_mlp": 0.01281706, "balance_loss_clip": 1.02299571, "balance_loss_mlp": 1.0340308, "epoch": 0.8910867277919735, "flos": 24499121662080.0, "grad_norm": 1.70872883157785, "language_loss": 0.764862, "learning_rate": 1.1590762723779057e-07, "loss": 0.78897464, "num_input_tokens_seen": 319663930, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 14821, "time_per_iteration": 2.667888641357422 }, { "auxiliary_loss_clip": 0.01031148, "auxiliary_loss_mlp": 0.01246645, "balance_loss_clip": 0.99948466, "balance_loss_mlp": 1.00085902, "epoch": 0.8911468510446415, "flos": 60688136856960.0, "grad_norm": 0.7916407054860656, "language_loss": 0.59308517, "learning_rate": 1.15780907811319e-07, "loss": 0.61586308, "num_input_tokens_seen": 319721245, "router_z_loss_clip": 0.01245117, "router_z_loss_mlp": 0.21191406, "step": 14822, "time_per_iteration": 3.0691189765930176 }, { "auxiliary_loss_clip": 0.01112084, "auxiliary_loss_mlp": 0.01029932, "balance_loss_clip": 1.01752877, "balance_loss_mlp": 1.03312814, "epoch": 0.8912069742973094, "flos": 25337492075520.0, "grad_norm": 6.159272454041832, "language_loss": 0.6906637, "learning_rate": 1.156542556277802e-07, "loss": 0.71208382, "num_input_tokens_seen": 319741200, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 14823, "time_per_iteration": 2.6316027641296387 }, { "auxiliary_loss_clip": 0.01109098, "auxiliary_loss_mlp": 0.01029507, "balance_loss_clip": 1.01737714, "balance_loss_mlp": 1.0332495, "epoch": 0.8912670975499775, "flos": 18223516782720.0, "grad_norm": 2.031297496343854, "language_loss": 0.69139791, "learning_rate": 1.1552767069169389e-07, "loss": 0.71278387, "num_input_tokens_seen": 319759265, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.66796875, "step": 14824, "time_per_iteration": 2.596982479095459 }, { "auxiliary_loss_clip": 0.01140631, "auxiliary_loss_mlp": 0.01030875, "balance_loss_clip": 1.01804245, "balance_loss_mlp": 1.03389645, "epoch": 0.8913272208026454, "flos": 26504481041280.0, "grad_norm": 2.027888107171236, "language_loss": 0.70408958, "learning_rate": 1.154011530075778e-07, "loss": 0.72580469, "num_input_tokens_seen": 319777560, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 14825, "time_per_iteration": 2.6026241779327393 }, { "auxiliary_loss_clip": 0.01119433, "auxiliary_loss_mlp": 0.01030716, "balance_loss_clip": 1.01784706, "balance_loss_mlp": 1.03351796, "epoch": 0.8913873440553134, "flos": 18802324540800.0, "grad_norm": 1.816282785839059, "language_loss": 0.71133387, "learning_rate": 1.1527470257994632e-07, "loss": 0.73283535, "num_input_tokens_seen": 319794125, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 14826, "time_per_iteration": 2.518594980239868 }, { "auxiliary_loss_clip": 0.01110211, "auxiliary_loss_mlp": 0.01025869, "balance_loss_clip": 1.01401925, "balance_loss_mlp": 1.03409374, "epoch": 0.8914474673079814, "flos": 20884892204160.0, "grad_norm": 1.9050651123088838, "language_loss": 0.74785674, "learning_rate": 1.1514831941331272e-07, "loss": 0.76921749, "num_input_tokens_seen": 319810310, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 14827, "time_per_iteration": 2.5361785888671875 }, { "auxiliary_loss_clip": 0.01112149, "auxiliary_loss_mlp": 0.01030635, "balance_loss_clip": 1.0180887, "balance_loss_mlp": 1.03410256, "epoch": 0.8915075905606493, "flos": 20952439729920.0, "grad_norm": 1.7360427960402303, "language_loss": 0.77936018, "learning_rate": 1.1502200351218738e-07, "loss": 0.80078804, "num_input_tokens_seen": 319828505, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 14828, "time_per_iteration": 2.7360687255859375 }, { "auxiliary_loss_clip": 0.01128982, "auxiliary_loss_mlp": 0.01031355, "balance_loss_clip": 1.01870108, "balance_loss_mlp": 1.03379464, "epoch": 0.8915677138133173, "flos": 23076305055360.0, "grad_norm": 1.7021076651331029, "language_loss": 0.75459182, "learning_rate": 1.1489575488107805e-07, "loss": 0.77619517, "num_input_tokens_seen": 319848680, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 14829, "time_per_iteration": 2.5823662281036377 }, { "auxiliary_loss_clip": 0.01102896, "auxiliary_loss_mlp": 0.01033268, "balance_loss_clip": 1.02185404, "balance_loss_mlp": 1.03329659, "epoch": 0.8916278370659853, "flos": 23440259612160.0, "grad_norm": 1.5173112782788498, "language_loss": 0.84338826, "learning_rate": 1.1476957352448979e-07, "loss": 0.86474991, "num_input_tokens_seen": 319868835, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6953125, "step": 14830, "time_per_iteration": 2.567936658859253 }, { "auxiliary_loss_clip": 0.0110717, "auxiliary_loss_mlp": 0.01026504, "balance_loss_clip": 1.01514304, "balance_loss_mlp": 1.03277719, "epoch": 0.8916879603186533, "flos": 25160488830720.0, "grad_norm": 1.4696268451607852, "language_loss": 0.74922359, "learning_rate": 1.1464345944692677e-07, "loss": 0.77056032, "num_input_tokens_seen": 319891585, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.65625, "step": 14831, "time_per_iteration": 2.6267197132110596 }, { "auxiliary_loss_clip": 0.01113197, "auxiliary_loss_mlp": 0.01028359, "balance_loss_clip": 1.01569295, "balance_loss_mlp": 1.0333215, "epoch": 0.8917480835713212, "flos": 20229845829120.0, "grad_norm": 2.748161981673709, "language_loss": 0.73228294, "learning_rate": 1.145174126528885e-07, "loss": 0.75369847, "num_input_tokens_seen": 319910315, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 14832, "time_per_iteration": 2.6659209728240967 }, { "auxiliary_loss_clip": 0.01148463, "auxiliary_loss_mlp": 0.01276365, "balance_loss_clip": 1.01702356, "balance_loss_mlp": 1.03296423, "epoch": 0.8918082068239892, "flos": 26101922342400.0, "grad_norm": 1.9037999532349887, "language_loss": 0.66654694, "learning_rate": 1.1439143314687361e-07, "loss": 0.69079524, "num_input_tokens_seen": 319932275, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7109375, "step": 14833, "time_per_iteration": 2.5823750495910645 }, { "auxiliary_loss_clip": 0.01116407, "auxiliary_loss_mlp": 0.01032074, "balance_loss_clip": 1.01900864, "balance_loss_mlp": 1.03615665, "epoch": 0.8918683300766571, "flos": 24831439315200.0, "grad_norm": 1.8313172415964112, "language_loss": 0.7420584, "learning_rate": 1.1426552093337848e-07, "loss": 0.76354325, "num_input_tokens_seen": 319955335, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 14834, "time_per_iteration": 2.6305558681488037 }, { "auxiliary_loss_clip": 0.01128593, "auxiliary_loss_mlp": 0.01032464, "balance_loss_clip": 1.02087736, "balance_loss_mlp": 1.03371191, "epoch": 0.8919284533293251, "flos": 22305158945280.0, "grad_norm": 2.128661980202119, "language_loss": 0.79092139, "learning_rate": 1.1413967601689578e-07, "loss": 0.81253195, "num_input_tokens_seen": 319973990, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 14835, "time_per_iteration": 2.6002297401428223 }, { "auxiliary_loss_clip": 0.01098782, "auxiliary_loss_mlp": 0.01025448, "balance_loss_clip": 1.01412332, "balance_loss_mlp": 1.03231668, "epoch": 0.891988576581993, "flos": 30373532559360.0, "grad_norm": 2.4080309364674117, "language_loss": 0.73752457, "learning_rate": 1.1401389840191722e-07, "loss": 0.75876683, "num_input_tokens_seen": 319995555, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6640625, "step": 14836, "time_per_iteration": 2.626155376434326 }, { "auxiliary_loss_clip": 0.01119489, "auxiliary_loss_mlp": 0.0103149, "balance_loss_clip": 1.01876485, "balance_loss_mlp": 1.03266907, "epoch": 0.8920486998346611, "flos": 15552947479680.0, "grad_norm": 4.151165577309592, "language_loss": 0.68330091, "learning_rate": 1.1388818809293099e-07, "loss": 0.70481068, "num_input_tokens_seen": 320012385, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 14837, "time_per_iteration": 2.5913376808166504 }, { "auxiliary_loss_clip": 0.01152295, "auxiliary_loss_mlp": 0.01032626, "balance_loss_clip": 1.01921463, "balance_loss_mlp": 1.0348928, "epoch": 0.892108823087329, "flos": 21214983214080.0, "grad_norm": 1.823094313932328, "language_loss": 0.6769551, "learning_rate": 1.137625450944244e-07, "loss": 0.69880426, "num_input_tokens_seen": 320032390, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 14838, "time_per_iteration": 2.763122797012329 }, { "auxiliary_loss_clip": 0.01113064, "auxiliary_loss_mlp": 0.01030242, "balance_loss_clip": 1.01706314, "balance_loss_mlp": 1.03348947, "epoch": 0.892168946339997, "flos": 21978982517760.0, "grad_norm": 4.601509564814051, "language_loss": 0.76349819, "learning_rate": 1.1363696941087986e-07, "loss": 0.7849313, "num_input_tokens_seen": 320052885, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 14839, "time_per_iteration": 2.640000820159912 }, { "auxiliary_loss_clip": 0.01048674, "auxiliary_loss_mlp": 0.0125019, "balance_loss_clip": 1.00316525, "balance_loss_mlp": 1.00080299, "epoch": 0.892229069592665, "flos": 67475289277440.0, "grad_norm": 0.6813923347962692, "language_loss": 0.49436146, "learning_rate": 1.1351146104678e-07, "loss": 0.51735008, "num_input_tokens_seen": 320113685, "router_z_loss_clip": 0.01116943, "router_z_loss_mlp": 0.2109375, "step": 14840, "time_per_iteration": 3.245704174041748 }, { "auxiliary_loss_clip": 0.01108927, "auxiliary_loss_mlp": 0.0103226, "balance_loss_clip": 1.01852155, "balance_loss_mlp": 1.03497362, "epoch": 0.8922891928453329, "flos": 19459561645440.0, "grad_norm": 2.4308736539887867, "language_loss": 0.64198482, "learning_rate": 1.1338602000660347e-07, "loss": 0.66339672, "num_input_tokens_seen": 320130810, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73828125, "step": 14841, "time_per_iteration": 2.498145341873169 }, { "auxiliary_loss_clip": 0.01124916, "auxiliary_loss_mlp": 0.01280005, "balance_loss_clip": 1.02215266, "balance_loss_mlp": 1.03338253, "epoch": 0.8923493160980009, "flos": 23367396873600.0, "grad_norm": 1.9175133610901625, "language_loss": 0.68386632, "learning_rate": 1.132606462948269e-07, "loss": 0.70791554, "num_input_tokens_seen": 320152170, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.65234375, "step": 14842, "time_per_iteration": 2.7241055965423584 }, { "auxiliary_loss_clip": 0.01101669, "auxiliary_loss_mlp": 0.01033139, "balance_loss_clip": 1.02224374, "balance_loss_mlp": 1.03485286, "epoch": 0.8924094393506689, "flos": 26177047637760.0, "grad_norm": 2.2637355300029087, "language_loss": 0.8014791, "learning_rate": 1.1313533991592428e-07, "loss": 0.82282722, "num_input_tokens_seen": 320172360, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.66796875, "step": 14843, "time_per_iteration": 2.5957579612731934 }, { "auxiliary_loss_clip": 0.01124272, "auxiliary_loss_mlp": 0.01032544, "balance_loss_clip": 1.01966906, "balance_loss_mlp": 1.0352242, "epoch": 0.8924695626033369, "flos": 22018520413440.0, "grad_norm": 2.034105389528174, "language_loss": 0.68706429, "learning_rate": 1.1301010087436802e-07, "loss": 0.70863247, "num_input_tokens_seen": 320192130, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 14844, "time_per_iteration": 2.6552183628082275 }, { "auxiliary_loss_clip": 0.01138462, "auxiliary_loss_mlp": 0.01029745, "balance_loss_clip": 1.01764476, "balance_loss_mlp": 1.03399837, "epoch": 0.8925296858560048, "flos": 14793940166400.0, "grad_norm": 2.113411672182631, "language_loss": 0.91729331, "learning_rate": 1.1288492917462744e-07, "loss": 0.93897545, "num_input_tokens_seen": 320207760, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 14845, "time_per_iteration": 4.023139715194702 }, { "auxiliary_loss_clip": 0.01109676, "auxiliary_loss_mlp": 0.01025031, "balance_loss_clip": 1.01378989, "balance_loss_mlp": 1.03409719, "epoch": 0.8925898091086728, "flos": 22346636175360.0, "grad_norm": 1.9890127732730212, "language_loss": 0.72404599, "learning_rate": 1.1275982482116941e-07, "loss": 0.74539304, "num_input_tokens_seen": 320225325, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 14846, "time_per_iteration": 2.551987648010254 }, { "auxiliary_loss_clip": 0.01131208, "auxiliary_loss_mlp": 0.01034317, "balance_loss_clip": 1.02205634, "balance_loss_mlp": 1.0351634, "epoch": 0.8926499323613407, "flos": 45806322067200.0, "grad_norm": 1.6438236628995946, "language_loss": 0.5700435, "learning_rate": 1.1263478781845859e-07, "loss": 0.59169877, "num_input_tokens_seen": 320247645, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 14847, "time_per_iteration": 2.8739492893218994 }, { "auxiliary_loss_clip": 0.01128712, "auxiliary_loss_mlp": 0.01029666, "balance_loss_clip": 1.01810288, "balance_loss_mlp": 1.03267264, "epoch": 0.8927100556140087, "flos": 22127042378880.0, "grad_norm": 2.11170806240505, "language_loss": 0.76843905, "learning_rate": 1.1250981817095762e-07, "loss": 0.79002285, "num_input_tokens_seen": 320266005, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.69140625, "step": 14848, "time_per_iteration": 2.5725276470184326 }, { "auxiliary_loss_clip": 0.01146545, "auxiliary_loss_mlp": 0.01031828, "balance_loss_clip": 1.01979351, "balance_loss_mlp": 1.0345161, "epoch": 0.8927701788666766, "flos": 28330143655680.0, "grad_norm": 1.7561349319013049, "language_loss": 0.69010365, "learning_rate": 1.1238491588312582e-07, "loss": 0.71188736, "num_input_tokens_seen": 320285555, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 14849, "time_per_iteration": 2.7543833255767822 }, { "auxiliary_loss_clip": 0.0111379, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.0218606, "balance_loss_mlp": 1.03603005, "epoch": 0.8928303021193447, "flos": 25294973351040.0, "grad_norm": 1.9479444057210917, "language_loss": 0.81100309, "learning_rate": 1.1226008095942052e-07, "loss": 0.83248258, "num_input_tokens_seen": 320305395, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 14850, "time_per_iteration": 2.5389716625213623 }, { "auxiliary_loss_clip": 0.01119548, "auxiliary_loss_mlp": 0.01035613, "balance_loss_clip": 1.02314949, "balance_loss_mlp": 1.03498983, "epoch": 0.8928904253720126, "flos": 22236713579520.0, "grad_norm": 1.5369239975095612, "language_loss": 0.75085771, "learning_rate": 1.1213531340429772e-07, "loss": 0.77240932, "num_input_tokens_seen": 320324220, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.66796875, "step": 14851, "time_per_iteration": 3.898308515548706 }, { "auxiliary_loss_clip": 0.01118301, "auxiliary_loss_mlp": 0.01027656, "balance_loss_clip": 1.01575303, "balance_loss_mlp": 1.03334463, "epoch": 0.8929505486246806, "flos": 27092374940160.0, "grad_norm": 1.4888490585638916, "language_loss": 0.78301167, "learning_rate": 1.1201061322220895e-07, "loss": 0.80447125, "num_input_tokens_seen": 320347195, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 14852, "time_per_iteration": 2.612622022628784 }, { "auxiliary_loss_clip": 0.01110356, "auxiliary_loss_mlp": 0.01032575, "balance_loss_clip": 1.02099967, "balance_loss_mlp": 1.03498149, "epoch": 0.8930106718773486, "flos": 23039352938880.0, "grad_norm": 1.5102156244987037, "language_loss": 0.69287789, "learning_rate": 1.1188598041760489e-07, "loss": 0.71430719, "num_input_tokens_seen": 320366850, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6640625, "step": 14853, "time_per_iteration": 2.664238929748535 }, { "auxiliary_loss_clip": 0.0114062, "auxiliary_loss_mlp": 0.01033384, "balance_loss_clip": 1.02079582, "balance_loss_mlp": 1.03617406, "epoch": 0.8930707951300165, "flos": 35626652887680.0, "grad_norm": 1.9709954560221863, "language_loss": 0.67253637, "learning_rate": 1.117614149949333e-07, "loss": 0.69427645, "num_input_tokens_seen": 320388895, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 14854, "time_per_iteration": 2.65079402923584 }, { "auxiliary_loss_clip": 0.01118585, "auxiliary_loss_mlp": 0.01030231, "balance_loss_clip": 1.01820922, "balance_loss_mlp": 1.03464687, "epoch": 0.8931309183826845, "flos": 23039891642880.0, "grad_norm": 1.4912946957610993, "language_loss": 0.74717969, "learning_rate": 1.1163691695864041e-07, "loss": 0.76866782, "num_input_tokens_seen": 320408520, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66015625, "step": 14855, "time_per_iteration": 2.729297637939453 }, { "auxiliary_loss_clip": 0.01118698, "auxiliary_loss_mlp": 0.0103175, "balance_loss_clip": 1.01957262, "balance_loss_mlp": 1.03362477, "epoch": 0.8931910416353525, "flos": 26504624695680.0, "grad_norm": 3.050568652235809, "language_loss": 0.64336759, "learning_rate": 1.1151248631316779e-07, "loss": 0.66487205, "num_input_tokens_seen": 320427400, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 14856, "time_per_iteration": 2.6213507652282715 }, { "auxiliary_loss_clip": 0.01102272, "auxiliary_loss_mlp": 0.01030048, "balance_loss_clip": 1.01798964, "balance_loss_mlp": 1.03364849, "epoch": 0.8932511648880205, "flos": 24973609345920.0, "grad_norm": 1.5836258149938938, "language_loss": 0.66852629, "learning_rate": 1.1138812306295697e-07, "loss": 0.6898495, "num_input_tokens_seen": 320447570, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.68359375, "step": 14857, "time_per_iteration": 2.6613223552703857 }, { "auxiliary_loss_clip": 0.01140567, "auxiliary_loss_mlp": 0.01033886, "balance_loss_clip": 1.02122641, "balance_loss_mlp": 1.03448844, "epoch": 0.8933112881406884, "flos": 24460733001600.0, "grad_norm": 2.3539947926412026, "language_loss": 0.75484729, "learning_rate": 1.1126382721244598e-07, "loss": 0.77659178, "num_input_tokens_seen": 320464405, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 14858, "time_per_iteration": 3.9487814903259277 }, { "auxiliary_loss_clip": 0.01109657, "auxiliary_loss_mlp": 0.01030055, "balance_loss_clip": 1.01744258, "balance_loss_mlp": 1.03294432, "epoch": 0.8933714113933564, "flos": 28293083798400.0, "grad_norm": 1.583795674733172, "language_loss": 0.69358629, "learning_rate": 1.111395987660706e-07, "loss": 0.7149834, "num_input_tokens_seen": 320485525, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 14859, "time_per_iteration": 2.668882131576538 }, { "auxiliary_loss_clip": 0.0113249, "auxiliary_loss_mlp": 0.01028848, "balance_loss_clip": 1.01775515, "balance_loss_mlp": 1.03281736, "epoch": 0.8934315346460243, "flos": 21434864319360.0, "grad_norm": 1.605272974448936, "language_loss": 0.75509608, "learning_rate": 1.1101543772826394e-07, "loss": 0.77670944, "num_input_tokens_seen": 320506725, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.63671875, "step": 14860, "time_per_iteration": 4.18671441078186 }, { "auxiliary_loss_clip": 0.01124141, "auxiliary_loss_mlp": 0.01031067, "balance_loss_clip": 1.01773381, "balance_loss_mlp": 1.03603196, "epoch": 0.8934916578986923, "flos": 23769596436480.0, "grad_norm": 1.877056272108725, "language_loss": 0.57520461, "learning_rate": 1.1089134410345757e-07, "loss": 0.5967567, "num_input_tokens_seen": 320525425, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.69921875, "step": 14861, "time_per_iteration": 2.595306158065796 }, { "auxiliary_loss_clip": 0.01117002, "auxiliary_loss_mlp": 0.01030662, "balance_loss_clip": 1.01883614, "balance_loss_mlp": 1.03207421, "epoch": 0.8935517811513602, "flos": 18916161719040.0, "grad_norm": 1.8074178427586691, "language_loss": 0.63464987, "learning_rate": 1.1076731789607995e-07, "loss": 0.6561265, "num_input_tokens_seen": 320543010, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66796875, "step": 14862, "time_per_iteration": 2.5336451530456543 }, { "auxiliary_loss_clip": 0.01107185, "auxiliary_loss_mlp": 0.0102821, "balance_loss_clip": 1.01591396, "balance_loss_mlp": 1.03151882, "epoch": 0.8936119044040283, "flos": 24061370613120.0, "grad_norm": 1.6531331314633162, "language_loss": 0.78039861, "learning_rate": 1.1064335911055667e-07, "loss": 0.80175257, "num_input_tokens_seen": 320562180, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.66796875, "step": 14863, "time_per_iteration": 2.6363441944122314 }, { "auxiliary_loss_clip": 0.01115575, "auxiliary_loss_mlp": 0.01028461, "balance_loss_clip": 1.01731491, "balance_loss_mlp": 1.031955, "epoch": 0.8936720276566962, "flos": 21324079797120.0, "grad_norm": 1.774274574864546, "language_loss": 0.71290511, "learning_rate": 1.1051946775131194e-07, "loss": 0.73434544, "num_input_tokens_seen": 320580395, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.65625, "step": 14864, "time_per_iteration": 2.5535740852355957 }, { "auxiliary_loss_clip": 0.01119888, "auxiliary_loss_mlp": 0.01035604, "balance_loss_clip": 1.02278852, "balance_loss_mlp": 1.03765845, "epoch": 0.8937321509093642, "flos": 18406122549120.0, "grad_norm": 3.374172305746446, "language_loss": 0.75914896, "learning_rate": 1.1039564382276734e-07, "loss": 0.7807039, "num_input_tokens_seen": 320599505, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 14865, "time_per_iteration": 2.742187976837158 }, { "auxiliary_loss_clip": 0.0112541, "auxiliary_loss_mlp": 0.01029684, "balance_loss_clip": 1.01826406, "balance_loss_mlp": 1.0323602, "epoch": 0.8937922741620322, "flos": 22054754257920.0, "grad_norm": 1.6396589598114575, "language_loss": 0.71865553, "learning_rate": 1.1027188732934134e-07, "loss": 0.74020642, "num_input_tokens_seen": 320619825, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 14866, "time_per_iteration": 2.6028149127960205 }, { "auxiliary_loss_clip": 0.01115383, "auxiliary_loss_mlp": 0.01031759, "balance_loss_clip": 1.01855671, "balance_loss_mlp": 1.03560245, "epoch": 0.8938523974147001, "flos": 25664386775040.0, "grad_norm": 2.425020997696337, "language_loss": 0.83937716, "learning_rate": 1.1014819827545063e-07, "loss": 0.86084855, "num_input_tokens_seen": 320638515, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 14867, "time_per_iteration": 2.7840654850006104 }, { "auxiliary_loss_clip": 0.01110587, "auxiliary_loss_mlp": 0.01028504, "balance_loss_clip": 1.01788223, "balance_loss_mlp": 1.0333668, "epoch": 0.8939125206673681, "flos": 25742852035200.0, "grad_norm": 1.6959728281774447, "language_loss": 0.80478841, "learning_rate": 1.100245766655099e-07, "loss": 0.82617939, "num_input_tokens_seen": 320659430, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.68359375, "step": 14868, "time_per_iteration": 2.601412773132324 }, { "auxiliary_loss_clip": 0.01120854, "auxiliary_loss_mlp": 0.01032807, "balance_loss_clip": 1.02043891, "balance_loss_mlp": 1.03411281, "epoch": 0.8939726439200361, "flos": 27344503480320.0, "grad_norm": 2.0095128002740164, "language_loss": 0.77474356, "learning_rate": 1.0990102250392985e-07, "loss": 0.79628015, "num_input_tokens_seen": 320679295, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 14869, "time_per_iteration": 2.639407157897949 }, { "auxiliary_loss_clip": 0.01110186, "auxiliary_loss_mlp": 0.01269388, "balance_loss_clip": 1.01012874, "balance_loss_mlp": 1.03372645, "epoch": 0.8940327671727041, "flos": 20338834671360.0, "grad_norm": 1.4605045847145668, "language_loss": 0.65678227, "learning_rate": 1.0977753579512072e-07, "loss": 0.68057799, "num_input_tokens_seen": 320697535, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 14870, "time_per_iteration": 2.6230392456054688 }, { "auxiliary_loss_clip": 0.01058132, "auxiliary_loss_mlp": 0.00999807, "balance_loss_clip": 0.99852514, "balance_loss_mlp": 1.00115383, "epoch": 0.894092890425372, "flos": 58410573235200.0, "grad_norm": 0.7946944362984822, "language_loss": 0.55936092, "learning_rate": 1.0965411654348877e-07, "loss": 0.57994032, "num_input_tokens_seen": 320758635, "router_z_loss_clip": 0.01281738, "router_z_loss_mlp": 0.2109375, "step": 14871, "time_per_iteration": 3.217721462249756 }, { "auxiliary_loss_clip": 0.01110004, "auxiliary_loss_mlp": 0.01032901, "balance_loss_clip": 1.02120697, "balance_loss_mlp": 1.03429151, "epoch": 0.89415301367804, "flos": 19829657427840.0, "grad_norm": 1.8807032504431787, "language_loss": 0.77021599, "learning_rate": 1.0953076475343959e-07, "loss": 0.79164493, "num_input_tokens_seen": 320777175, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 14872, "time_per_iteration": 2.6951181888580322 }, { "auxiliary_loss_clip": 0.01107459, "auxiliary_loss_mlp": 0.01026167, "balance_loss_clip": 1.01515806, "balance_loss_mlp": 1.03372777, "epoch": 0.8942131369307079, "flos": 25775781828480.0, "grad_norm": 1.4552647150140092, "language_loss": 0.66798377, "learning_rate": 1.0940748042937387e-07, "loss": 0.68932003, "num_input_tokens_seen": 320797670, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.65234375, "step": 14873, "time_per_iteration": 2.5363025665283203 }, { "auxiliary_loss_clip": 0.01099649, "auxiliary_loss_mlp": 0.01032489, "balance_loss_clip": 1.02036595, "balance_loss_mlp": 1.03330994, "epoch": 0.8942732601833759, "flos": 23149024139520.0, "grad_norm": 1.5731600945093085, "language_loss": 0.59739876, "learning_rate": 1.0928426357569231e-07, "loss": 0.61872011, "num_input_tokens_seen": 320817410, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6640625, "step": 14874, "time_per_iteration": 2.620781183242798 }, { "auxiliary_loss_clip": 0.01132886, "auxiliary_loss_mlp": 0.01032274, "balance_loss_clip": 1.01889884, "balance_loss_mlp": 1.03718495, "epoch": 0.8943333834360438, "flos": 27855548231040.0, "grad_norm": 2.121392945951504, "language_loss": 0.75457144, "learning_rate": 1.0916111419679164e-07, "loss": 0.77622294, "num_input_tokens_seen": 320836745, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69140625, "step": 14875, "time_per_iteration": 2.586916208267212 }, { "auxiliary_loss_clip": 0.01135524, "auxiliary_loss_mlp": 0.0103285, "balance_loss_clip": 1.01913548, "balance_loss_mlp": 1.03607702, "epoch": 0.8943935066887119, "flos": 22163958581760.0, "grad_norm": 2.134882897250912, "language_loss": 0.77725232, "learning_rate": 1.0903803229706721e-07, "loss": 0.79893607, "num_input_tokens_seen": 320853305, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 14876, "time_per_iteration": 2.5511019229888916 }, { "auxiliary_loss_clip": 0.01110725, "auxiliary_loss_mlp": 0.01029455, "balance_loss_clip": 1.0177784, "balance_loss_mlp": 1.03470469, "epoch": 0.8944536299413798, "flos": 21470056669440.0, "grad_norm": 1.9059277602796278, "language_loss": 0.7856037, "learning_rate": 1.0891501788091129e-07, "loss": 0.80700552, "num_input_tokens_seen": 320872885, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 14877, "time_per_iteration": 2.792051315307617 }, { "auxiliary_loss_clip": 0.01138459, "auxiliary_loss_mlp": 0.01034352, "balance_loss_clip": 1.02178133, "balance_loss_mlp": 1.03483653, "epoch": 0.8945137531940478, "flos": 17748777703680.0, "grad_norm": 18.59566769612315, "language_loss": 0.75151634, "learning_rate": 1.0879207095271393e-07, "loss": 0.7732445, "num_input_tokens_seen": 320889755, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 14878, "time_per_iteration": 2.639514684677124 }, { "auxiliary_loss_clip": 0.01110962, "auxiliary_loss_mlp": 0.01026297, "balance_loss_clip": 1.0133152, "balance_loss_mlp": 1.03264534, "epoch": 0.8945738764467158, "flos": 21142264129920.0, "grad_norm": 1.5525092403481586, "language_loss": 0.75966775, "learning_rate": 1.0866919151686272e-07, "loss": 0.78104037, "num_input_tokens_seen": 320907860, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 14879, "time_per_iteration": 2.7156851291656494 }, { "auxiliary_loss_clip": 0.01131143, "auxiliary_loss_mlp": 0.01034227, "balance_loss_clip": 1.02181768, "balance_loss_mlp": 1.03624713, "epoch": 0.8946339996993837, "flos": 14903000835840.0, "grad_norm": 1.8647128394586734, "language_loss": 0.74606121, "learning_rate": 1.0854637957774281e-07, "loss": 0.76771492, "num_input_tokens_seen": 320925825, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 14880, "time_per_iteration": 2.5703117847442627 }, { "auxiliary_loss_clip": 0.01111348, "auxiliary_loss_mlp": 0.01025192, "balance_loss_clip": 1.01263881, "balance_loss_mlp": 1.03352582, "epoch": 0.8946941229520518, "flos": 27382173868800.0, "grad_norm": 1.6557773901305661, "language_loss": 0.82945871, "learning_rate": 1.084236351397374e-07, "loss": 0.85082418, "num_input_tokens_seen": 320946165, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 14881, "time_per_iteration": 2.6143712997436523 }, { "auxiliary_loss_clip": 0.0112581, "auxiliary_loss_mlp": 0.01274113, "balance_loss_clip": 1.01567698, "balance_loss_mlp": 1.03237295, "epoch": 0.8947542462047197, "flos": 31796277338880.0, "grad_norm": 1.8551047849641389, "language_loss": 0.6706745, "learning_rate": 1.0830095820722673e-07, "loss": 0.69467372, "num_input_tokens_seen": 320969330, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 14882, "time_per_iteration": 2.8456718921661377 }, { "auxiliary_loss_clip": 0.01125022, "auxiliary_loss_mlp": 0.01032038, "balance_loss_clip": 1.01815033, "balance_loss_mlp": 1.03461337, "epoch": 0.8948143694573877, "flos": 20883599314560.0, "grad_norm": 2.5220888741512266, "language_loss": 0.75661337, "learning_rate": 1.0817834878458887e-07, "loss": 0.778184, "num_input_tokens_seen": 320985055, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 14883, "time_per_iteration": 2.57258939743042 }, { "auxiliary_loss_clip": 0.01115499, "auxiliary_loss_mlp": 0.01034358, "balance_loss_clip": 1.02214527, "balance_loss_mlp": 1.03249586, "epoch": 0.8948744927100556, "flos": 28215552291840.0, "grad_norm": 1.5884513172295247, "language_loss": 0.72314978, "learning_rate": 1.0805580687619897e-07, "loss": 0.74464834, "num_input_tokens_seen": 321004720, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.65625, "step": 14884, "time_per_iteration": 2.8322556018829346 }, { "auxiliary_loss_clip": 0.01114978, "auxiliary_loss_mlp": 0.01023731, "balance_loss_clip": 1.01253128, "balance_loss_mlp": 1.03099024, "epoch": 0.8949346159627236, "flos": 21902672073600.0, "grad_norm": 1.5458724361250562, "language_loss": 0.75278723, "learning_rate": 1.0793333248643133e-07, "loss": 0.77417433, "num_input_tokens_seen": 321022350, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.66015625, "step": 14885, "time_per_iteration": 2.540548086166382 }, { "auxiliary_loss_clip": 0.01119887, "auxiliary_loss_mlp": 0.0103222, "balance_loss_clip": 1.01951873, "balance_loss_mlp": 1.03168285, "epoch": 0.8949947392153915, "flos": 21359128492800.0, "grad_norm": 2.192953720749081, "language_loss": 0.81719351, "learning_rate": 1.0781092561965555e-07, "loss": 0.8387146, "num_input_tokens_seen": 321040450, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 14886, "time_per_iteration": 3.970494031906128 }, { "auxiliary_loss_clip": 0.01148139, "auxiliary_loss_mlp": 0.01029348, "balance_loss_clip": 1.01757622, "balance_loss_mlp": 1.03510427, "epoch": 0.8950548624680595, "flos": 52445342799360.0, "grad_norm": 1.915161190573023, "language_loss": 0.6348772, "learning_rate": 1.076885862802408e-07, "loss": 0.65665209, "num_input_tokens_seen": 321063970, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 14887, "time_per_iteration": 2.958719253540039 }, { "auxiliary_loss_clip": 0.01124183, "auxiliary_loss_mlp": 0.01033762, "balance_loss_clip": 1.02129889, "balance_loss_mlp": 1.03576136, "epoch": 0.8951149857207275, "flos": 20121323863680.0, "grad_norm": 1.9350769660730258, "language_loss": 0.60316062, "learning_rate": 1.075663144725525e-07, "loss": 0.62474, "num_input_tokens_seen": 321083840, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.70703125, "step": 14888, "time_per_iteration": 2.5554747581481934 }, { "auxiliary_loss_clip": 0.01105498, "auxiliary_loss_mlp": 0.01025296, "balance_loss_clip": 1.01333964, "balance_loss_mlp": 1.03513825, "epoch": 0.8951751089733955, "flos": 29862631463040.0, "grad_norm": 2.2748864645582434, "language_loss": 0.70016515, "learning_rate": 1.0744411020095512e-07, "loss": 0.7214731, "num_input_tokens_seen": 321104165, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.703125, "step": 14889, "time_per_iteration": 2.790846586227417 }, { "auxiliary_loss_clip": 0.01113763, "auxiliary_loss_mlp": 0.0128172, "balance_loss_clip": 1.02154207, "balance_loss_mlp": 1.03512931, "epoch": 0.8952352322260634, "flos": 15262789415040.0, "grad_norm": 2.9973663197092035, "language_loss": 0.71814954, "learning_rate": 1.0732197346980854e-07, "loss": 0.74210441, "num_input_tokens_seen": 321117290, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 14890, "time_per_iteration": 2.516260862350464 }, { "auxiliary_loss_clip": 0.01163736, "auxiliary_loss_mlp": 0.01275276, "balance_loss_clip": 1.01580834, "balance_loss_mlp": 1.03223848, "epoch": 0.8952953554787314, "flos": 26798338206720.0, "grad_norm": 1.6513005578006559, "language_loss": 0.75495863, "learning_rate": 1.0719990428347259e-07, "loss": 0.77934867, "num_input_tokens_seen": 321137115, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 14891, "time_per_iteration": 2.6698715686798096 }, { "auxiliary_loss_clip": 0.01148333, "auxiliary_loss_mlp": 0.01029239, "balance_loss_clip": 1.01664424, "balance_loss_mlp": 1.03249621, "epoch": 0.8953554787313994, "flos": 14137205852160.0, "grad_norm": 1.6638933283117163, "language_loss": 0.76859087, "learning_rate": 1.0707790264630356e-07, "loss": 0.79036659, "num_input_tokens_seen": 321154490, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 14892, "time_per_iteration": 4.090404272079468 }, { "auxiliary_loss_clip": 0.01111261, "auxiliary_loss_mlp": 0.01031536, "balance_loss_clip": 1.01988304, "balance_loss_mlp": 1.03382373, "epoch": 0.8954156019840673, "flos": 25703314139520.0, "grad_norm": 1.340142574905494, "language_loss": 0.81797624, "learning_rate": 1.069559685626542e-07, "loss": 0.83940423, "num_input_tokens_seen": 321175625, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 14893, "time_per_iteration": 2.5607223510742188 }, { "auxiliary_loss_clip": 0.01126089, "auxiliary_loss_mlp": 0.01030821, "balance_loss_clip": 1.01915002, "balance_loss_mlp": 1.03324485, "epoch": 0.8954757252367354, "flos": 21907987286400.0, "grad_norm": 1.7029598720399823, "language_loss": 0.74910182, "learning_rate": 1.0683410203687726e-07, "loss": 0.77067089, "num_input_tokens_seen": 321193895, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.65625, "step": 14894, "time_per_iteration": 2.713996648788452 }, { "auxiliary_loss_clip": 0.01115074, "auxiliary_loss_mlp": 0.0103299, "balance_loss_clip": 1.02059221, "balance_loss_mlp": 1.03621912, "epoch": 0.8955358484894033, "flos": 12970396454400.0, "grad_norm": 2.7063375671768606, "language_loss": 0.6671086, "learning_rate": 1.0671230307332146e-07, "loss": 0.68858922, "num_input_tokens_seen": 321211610, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 14895, "time_per_iteration": 2.5387215614318848 }, { "auxiliary_loss_clip": 0.01114615, "auxiliary_loss_mlp": 0.01029122, "balance_loss_clip": 1.01607466, "balance_loss_mlp": 1.03512657, "epoch": 0.8955959717420713, "flos": 17273966797440.0, "grad_norm": 1.7452958025948524, "language_loss": 0.66965455, "learning_rate": 1.0659057167633335e-07, "loss": 0.6910919, "num_input_tokens_seen": 321229805, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 14896, "time_per_iteration": 2.537304401397705 }, { "auxiliary_loss_clip": 0.01111865, "auxiliary_loss_mlp": 0.01031423, "balance_loss_clip": 1.01904321, "balance_loss_mlp": 1.0344913, "epoch": 0.8956560949947392, "flos": 14793868339200.0, "grad_norm": 1.7851911467417687, "language_loss": 0.76007283, "learning_rate": 1.0646890785025697e-07, "loss": 0.7815057, "num_input_tokens_seen": 321247165, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 14897, "time_per_iteration": 2.718977451324463 }, { "auxiliary_loss_clip": 0.0105797, "auxiliary_loss_mlp": 0.01249047, "balance_loss_clip": 1.00186372, "balance_loss_mlp": 1.00088644, "epoch": 0.8957162182474072, "flos": 63607817957760.0, "grad_norm": 0.7648068638065477, "language_loss": 0.55353183, "learning_rate": 1.0634731159943489e-07, "loss": 0.57660198, "num_input_tokens_seen": 321308425, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.21191406, "step": 14898, "time_per_iteration": 3.2335448265075684 }, { "auxiliary_loss_clip": 0.01104451, "auxiliary_loss_mlp": 0.01277352, "balance_loss_clip": 1.01776314, "balance_loss_mlp": 1.03577077, "epoch": 0.8957763415000751, "flos": 25009843190400.0, "grad_norm": 1.8378348467709429, "language_loss": 0.70206034, "learning_rate": 1.0622578292820628e-07, "loss": 0.72587836, "num_input_tokens_seen": 321329295, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 14899, "time_per_iteration": 4.213734865188599 }, { "auxiliary_loss_clip": 0.01148924, "auxiliary_loss_mlp": 0.01034901, "balance_loss_clip": 1.02138233, "balance_loss_mlp": 1.03522539, "epoch": 0.8958364647527431, "flos": 19828615933440.0, "grad_norm": 1.6466455581535144, "language_loss": 0.73962575, "learning_rate": 1.061043218409079e-07, "loss": 0.761464, "num_input_tokens_seen": 321347580, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 14900, "time_per_iteration": 2.576180934906006 }, { "auxiliary_loss_clip": 0.01058065, "auxiliary_loss_mlp": 0.01002319, "balance_loss_clip": 1.0011332, "balance_loss_mlp": 1.00115347, "epoch": 0.895896588005411, "flos": 65537190115200.0, "grad_norm": 0.780571103998949, "language_loss": 0.61802578, "learning_rate": 1.059829283418745e-07, "loss": 0.63862967, "num_input_tokens_seen": 321407820, "router_z_loss_clip": 0.01184082, "router_z_loss_mlp": 0.21191406, "step": 14901, "time_per_iteration": 3.1611008644104004 }, { "auxiliary_loss_clip": 0.01111821, "auxiliary_loss_mlp": 0.01028564, "balance_loss_clip": 1.01606464, "balance_loss_mlp": 1.03279567, "epoch": 0.8959567112580791, "flos": 25591021246080.0, "grad_norm": 1.6998919272575013, "language_loss": 0.70614576, "learning_rate": 1.0586160243543884e-07, "loss": 0.72754961, "num_input_tokens_seen": 321426745, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 14902, "time_per_iteration": 4.506518840789795 }, { "auxiliary_loss_clip": 0.01114989, "auxiliary_loss_mlp": 0.01027847, "balance_loss_clip": 1.01625419, "balance_loss_mlp": 1.03192496, "epoch": 0.896016834510747, "flos": 24201780877440.0, "grad_norm": 1.6564538937921136, "language_loss": 0.77786213, "learning_rate": 1.0574034412592992e-07, "loss": 0.79929054, "num_input_tokens_seen": 321446165, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6484375, "step": 14903, "time_per_iteration": 2.664020538330078 }, { "auxiliary_loss_clip": 0.01124861, "auxiliary_loss_mlp": 0.01031971, "balance_loss_clip": 1.01878643, "balance_loss_mlp": 1.03525162, "epoch": 0.896076957763415, "flos": 23075945919360.0, "grad_norm": 3.602264729569987, "language_loss": 0.73093581, "learning_rate": 1.0561915341767557e-07, "loss": 0.75250411, "num_input_tokens_seen": 321465285, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 14904, "time_per_iteration": 2.755531072616577 }, { "auxiliary_loss_clip": 0.01112375, "auxiliary_loss_mlp": 0.01025805, "balance_loss_clip": 1.01289487, "balance_loss_mlp": 1.03401399, "epoch": 0.8961370810160829, "flos": 22236605838720.0, "grad_norm": 2.1158252570523026, "language_loss": 0.74632561, "learning_rate": 1.0549803031500037e-07, "loss": 0.76770741, "num_input_tokens_seen": 321483670, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 14905, "time_per_iteration": 2.572136402130127 }, { "auxiliary_loss_clip": 0.01108923, "auxiliary_loss_mlp": 0.01030471, "balance_loss_clip": 1.01840746, "balance_loss_mlp": 1.03263557, "epoch": 0.8961972042687509, "flos": 23072318645760.0, "grad_norm": 1.6683173976683083, "language_loss": 0.77126801, "learning_rate": 1.0537697482222796e-07, "loss": 0.79266202, "num_input_tokens_seen": 321501190, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.671875, "step": 14906, "time_per_iteration": 2.556577444076538 }, { "auxiliary_loss_clip": 0.01113201, "auxiliary_loss_mlp": 0.01031061, "balance_loss_clip": 1.01891387, "balance_loss_mlp": 1.03520977, "epoch": 0.896257327521419, "flos": 18185882307840.0, "grad_norm": 1.812446953355978, "language_loss": 0.74485278, "learning_rate": 1.0525598694367754e-07, "loss": 0.76629531, "num_input_tokens_seen": 321518540, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 14907, "time_per_iteration": 2.5381667613983154 }, { "auxiliary_loss_clip": 0.01110434, "auxiliary_loss_mlp": 0.01033466, "balance_loss_clip": 1.02172971, "balance_loss_mlp": 1.03458142, "epoch": 0.8963174507740869, "flos": 17895472848000.0, "grad_norm": 1.7425797021629783, "language_loss": 0.82627302, "learning_rate": 1.0513506668366656e-07, "loss": 0.84771198, "num_input_tokens_seen": 321536555, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 14908, "time_per_iteration": 2.7147586345672607 }, { "auxiliary_loss_clip": 0.01122364, "auxiliary_loss_mlp": 0.01029845, "balance_loss_clip": 1.01623154, "balance_loss_mlp": 1.03377223, "epoch": 0.8963775740267549, "flos": 21032269706880.0, "grad_norm": 1.5906987937102612, "language_loss": 0.70535034, "learning_rate": 1.0501421404651156e-07, "loss": 0.72687244, "num_input_tokens_seen": 321557655, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.70703125, "step": 14909, "time_per_iteration": 2.582475423812866 }, { "auxiliary_loss_clip": 0.01132146, "auxiliary_loss_mlp": 0.01035215, "balance_loss_clip": 1.02237666, "balance_loss_mlp": 1.03560758, "epoch": 0.8964376972794228, "flos": 23179619548800.0, "grad_norm": 2.056316405206866, "language_loss": 0.72729504, "learning_rate": 1.0489342903652421e-07, "loss": 0.74896866, "num_input_tokens_seen": 321576160, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 14910, "time_per_iteration": 2.6126019954681396 }, { "auxiliary_loss_clip": 0.01109546, "auxiliary_loss_mlp": 0.0102625, "balance_loss_clip": 1.01404881, "balance_loss_mlp": 1.03253186, "epoch": 0.8964978205320908, "flos": 24972998814720.0, "grad_norm": 2.4264700358353952, "language_loss": 0.63307977, "learning_rate": 1.0477271165801594e-07, "loss": 0.65443772, "num_input_tokens_seen": 321596205, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 14911, "time_per_iteration": 2.793729543685913 }, { "auxiliary_loss_clip": 0.01124516, "auxiliary_loss_mlp": 0.01279148, "balance_loss_clip": 1.01906669, "balance_loss_mlp": 1.03637457, "epoch": 0.8965579437847587, "flos": 19172025273600.0, "grad_norm": 1.659796222715234, "language_loss": 0.74724627, "learning_rate": 1.046520619152944e-07, "loss": 0.77128291, "num_input_tokens_seen": 321614800, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 14912, "time_per_iteration": 2.583979606628418 }, { "auxiliary_loss_clip": 0.01119897, "auxiliary_loss_mlp": 0.01033387, "balance_loss_clip": 1.02108443, "balance_loss_mlp": 1.03235292, "epoch": 0.8966180670374267, "flos": 24276690691200.0, "grad_norm": 1.7139571862851906, "language_loss": 0.81964886, "learning_rate": 1.045314798126653e-07, "loss": 0.8411817, "num_input_tokens_seen": 321633445, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 14913, "time_per_iteration": 2.58022403717041 }, { "auxiliary_loss_clip": 0.01114235, "auxiliary_loss_mlp": 0.01036094, "balance_loss_clip": 1.02329683, "balance_loss_mlp": 1.03438282, "epoch": 0.8966781902900947, "flos": 13553190622080.0, "grad_norm": 2.1508015390794895, "language_loss": 0.61338967, "learning_rate": 1.0441096535443183e-07, "loss": 0.63489294, "num_input_tokens_seen": 321650890, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 14914, "time_per_iteration": 2.7311806678771973 }, { "auxiliary_loss_clip": 0.01040105, "auxiliary_loss_mlp": 0.01001936, "balance_loss_clip": 1.00073814, "balance_loss_mlp": 1.00090921, "epoch": 0.8967383135427627, "flos": 65066114223360.0, "grad_norm": 0.7215445252110158, "language_loss": 0.55058259, "learning_rate": 1.0429051854489524e-07, "loss": 0.57100302, "num_input_tokens_seen": 321710960, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.2109375, "step": 14915, "time_per_iteration": 3.153806447982788 }, { "auxiliary_loss_clip": 0.01125863, "auxiliary_loss_mlp": 0.0103035, "balance_loss_clip": 1.01912689, "balance_loss_mlp": 1.03323722, "epoch": 0.8967984367954306, "flos": 29713027317120.0, "grad_norm": 1.441916795822517, "language_loss": 0.71319795, "learning_rate": 1.0417013938835362e-07, "loss": 0.73476017, "num_input_tokens_seen": 321733290, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.66015625, "step": 14916, "time_per_iteration": 2.689831256866455 }, { "auxiliary_loss_clip": 0.01121147, "auxiliary_loss_mlp": 0.01029791, "balance_loss_clip": 1.01789403, "balance_loss_mlp": 1.03457785, "epoch": 0.8968585600480986, "flos": 25702488126720.0, "grad_norm": 1.5153101986866635, "language_loss": 0.77865779, "learning_rate": 1.040498278891031e-07, "loss": 0.8001672, "num_input_tokens_seen": 321753120, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 14917, "time_per_iteration": 2.729154348373413 }, { "auxiliary_loss_clip": 0.01116883, "auxiliary_loss_mlp": 0.01038851, "balance_loss_clip": 1.02551794, "balance_loss_mlp": 1.0362246, "epoch": 0.8969186833007665, "flos": 30044698525440.0, "grad_norm": 1.8421822090501396, "language_loss": 0.68182188, "learning_rate": 1.0392958405143693e-07, "loss": 0.70337927, "num_input_tokens_seen": 321772840, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 14918, "time_per_iteration": 2.752697229385376 }, { "auxiliary_loss_clip": 0.01138601, "auxiliary_loss_mlp": 0.01030843, "balance_loss_clip": 1.01916075, "balance_loss_mlp": 1.03367233, "epoch": 0.8969788065534345, "flos": 22818143030400.0, "grad_norm": 1.7339651491222556, "language_loss": 0.83402634, "learning_rate": 1.038094078796472e-07, "loss": 0.85572076, "num_input_tokens_seen": 321791020, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 14919, "time_per_iteration": 2.5762887001037598 }, { "auxiliary_loss_clip": 0.01103872, "auxiliary_loss_mlp": 0.01278246, "balance_loss_clip": 1.01787174, "balance_loss_mlp": 1.03359485, "epoch": 0.8970389298061026, "flos": 13261488272640.0, "grad_norm": 2.2051515057505937, "language_loss": 0.71726143, "learning_rate": 1.0368929937802163e-07, "loss": 0.74108261, "num_input_tokens_seen": 321810075, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 14920, "time_per_iteration": 2.486726999282837 }, { "auxiliary_loss_clip": 0.01111719, "auxiliary_loss_mlp": 0.0103255, "balance_loss_clip": 1.01968157, "balance_loss_mlp": 1.03363061, "epoch": 0.8970990530587705, "flos": 10266071345280.0, "grad_norm": 2.3997146707111976, "language_loss": 0.90958488, "learning_rate": 1.0356925855084719e-07, "loss": 0.93102753, "num_input_tokens_seen": 321822635, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 14921, "time_per_iteration": 2.7031936645507812 }, { "auxiliary_loss_clip": 0.01150571, "auxiliary_loss_mlp": 0.01030277, "balance_loss_clip": 1.01793313, "balance_loss_mlp": 1.03489876, "epoch": 0.8971591763114385, "flos": 20302708567680.0, "grad_norm": 1.9140392806490942, "language_loss": 0.74072397, "learning_rate": 1.0344928540240805e-07, "loss": 0.76253247, "num_input_tokens_seen": 321841130, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71484375, "step": 14922, "time_per_iteration": 2.582298755645752 }, { "auxiliary_loss_clip": 0.01136756, "auxiliary_loss_mlp": 0.01031179, "balance_loss_clip": 1.01811957, "balance_loss_mlp": 1.03305936, "epoch": 0.8972192995641064, "flos": 23257043314560.0, "grad_norm": 1.987337265466669, "language_loss": 0.70496261, "learning_rate": 1.0332937993698498e-07, "loss": 0.72664201, "num_input_tokens_seen": 321859855, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 14923, "time_per_iteration": 2.642099380493164 }, { "auxiliary_loss_clip": 0.01121052, "auxiliary_loss_mlp": 0.01028686, "balance_loss_clip": 1.01543641, "balance_loss_mlp": 1.033144, "epoch": 0.8972794228167744, "flos": 18369601395840.0, "grad_norm": 2.9086182146664914, "language_loss": 0.7070328, "learning_rate": 1.0320954215885768e-07, "loss": 0.72853017, "num_input_tokens_seen": 321877990, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 14924, "time_per_iteration": 2.707793712615967 }, { "auxiliary_loss_clip": 0.01109938, "auxiliary_loss_mlp": 0.01031328, "balance_loss_clip": 1.02003908, "balance_loss_mlp": 1.03302073, "epoch": 0.8973395460694423, "flos": 23952058548480.0, "grad_norm": 1.4256648829201288, "language_loss": 0.72125298, "learning_rate": 1.0308977207230252e-07, "loss": 0.74266559, "num_input_tokens_seen": 321898120, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6796875, "step": 14925, "time_per_iteration": 2.7012081146240234 }, { "auxiliary_loss_clip": 0.01126548, "auxiliary_loss_mlp": 0.01277358, "balance_loss_clip": 1.01787984, "balance_loss_mlp": 1.03804016, "epoch": 0.8973996693221103, "flos": 24970843998720.0, "grad_norm": 1.5508901036942648, "language_loss": 0.8259722, "learning_rate": 1.0297006968159427e-07, "loss": 0.85001123, "num_input_tokens_seen": 321918140, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 14926, "time_per_iteration": 2.600334405899048 }, { "auxiliary_loss_clip": 0.01136937, "auxiliary_loss_mlp": 0.01031347, "balance_loss_clip": 1.01978993, "balance_loss_mlp": 1.03282905, "epoch": 0.8974597925747783, "flos": 25738937452800.0, "grad_norm": 1.689029796082503, "language_loss": 0.79149127, "learning_rate": 1.028504349910042e-07, "loss": 0.81317413, "num_input_tokens_seen": 321938580, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 14927, "time_per_iteration": 2.6415445804595947 }, { "auxiliary_loss_clip": 0.01119458, "auxiliary_loss_mlp": 0.01027175, "balance_loss_clip": 1.01564121, "balance_loss_mlp": 1.03357029, "epoch": 0.8975199158274463, "flos": 38071918131840.0, "grad_norm": 1.6078982621777484, "language_loss": 0.66639507, "learning_rate": 1.0273086800480225e-07, "loss": 0.68786132, "num_input_tokens_seen": 321961135, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 14928, "time_per_iteration": 4.31438422203064 }, { "auxiliary_loss_clip": 0.01124076, "auxiliary_loss_mlp": 0.01042051, "balance_loss_clip": 1.02700078, "balance_loss_mlp": 1.03499103, "epoch": 0.8975800390801142, "flos": 25411683617280.0, "grad_norm": 1.7973258314465332, "language_loss": 0.70947725, "learning_rate": 1.0261136872725517e-07, "loss": 0.73113847, "num_input_tokens_seen": 321980945, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.71875, "step": 14929, "time_per_iteration": 2.524491310119629 }, { "auxiliary_loss_clip": 0.01121495, "auxiliary_loss_mlp": 0.01028391, "balance_loss_clip": 1.01708436, "balance_loss_mlp": 1.03519869, "epoch": 0.8976401623327822, "flos": 21759604202880.0, "grad_norm": 1.5451904816252204, "language_loss": 0.67964065, "learning_rate": 1.0249193716262782e-07, "loss": 0.70113945, "num_input_tokens_seen": 322000350, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6875, "step": 14930, "time_per_iteration": 2.50970721244812 }, { "auxiliary_loss_clip": 0.01117287, "auxiliary_loss_mlp": 0.01031815, "balance_loss_clip": 1.01963139, "balance_loss_mlp": 1.03166902, "epoch": 0.8977002855854501, "flos": 13845323934720.0, "grad_norm": 1.79333561400101, "language_loss": 0.74651039, "learning_rate": 1.023725733151819e-07, "loss": 0.76800144, "num_input_tokens_seen": 322018980, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 14931, "time_per_iteration": 2.71063232421875 }, { "auxiliary_loss_clip": 0.01110791, "auxiliary_loss_mlp": 0.01025181, "balance_loss_clip": 1.01386166, "balance_loss_mlp": 1.03472257, "epoch": 0.8977604088381181, "flos": 19427529692160.0, "grad_norm": 1.6117455616421716, "language_loss": 0.63375866, "learning_rate": 1.0225327718917775e-07, "loss": 0.65511841, "num_input_tokens_seen": 322037675, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 14932, "time_per_iteration": 2.567885398864746 }, { "auxiliary_loss_clip": 0.01106433, "auxiliary_loss_mlp": 0.01029263, "balance_loss_clip": 1.01819444, "balance_loss_mlp": 1.03277862, "epoch": 0.8978205320907862, "flos": 22742083981440.0, "grad_norm": 1.749779151252223, "language_loss": 0.71828282, "learning_rate": 1.0213404878887266e-07, "loss": 0.73963982, "num_input_tokens_seen": 322055130, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6484375, "step": 14933, "time_per_iteration": 2.5457520484924316 }, { "auxiliary_loss_clip": 0.01138907, "auxiliary_loss_mlp": 0.01032156, "balance_loss_clip": 1.018942, "balance_loss_mlp": 1.03400755, "epoch": 0.8978806553434541, "flos": 21360529123200.0, "grad_norm": 1.3702469635623329, "language_loss": 0.74912381, "learning_rate": 1.0201488811852166e-07, "loss": 0.77083445, "num_input_tokens_seen": 322074850, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 14934, "time_per_iteration": 3.9835422039031982 }, { "auxiliary_loss_clip": 0.01128333, "auxiliary_loss_mlp": 0.01028218, "balance_loss_clip": 1.01589179, "balance_loss_mlp": 1.03373444, "epoch": 0.8979407785961221, "flos": 20924178704640.0, "grad_norm": 3.362986397759065, "language_loss": 0.60443199, "learning_rate": 1.0189579518237645e-07, "loss": 0.62599754, "num_input_tokens_seen": 322093315, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.67578125, "step": 14935, "time_per_iteration": 2.8307390213012695 }, { "auxiliary_loss_clip": 0.01126789, "auxiliary_loss_mlp": 0.01028488, "balance_loss_clip": 1.01586342, "balance_loss_mlp": 1.03394377, "epoch": 0.89800090184879, "flos": 25228934196480.0, "grad_norm": 1.9956417235811459, "language_loss": 0.76734889, "learning_rate": 1.0177676998468854e-07, "loss": 0.78890163, "num_input_tokens_seen": 322112555, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6640625, "step": 14936, "time_per_iteration": 2.593829393386841 }, { "auxiliary_loss_clip": 0.01120493, "auxiliary_loss_mlp": 0.0102939, "balance_loss_clip": 1.01730251, "balance_loss_mlp": 1.03447187, "epoch": 0.898061025101458, "flos": 22562674525440.0, "grad_norm": 1.5681658491081736, "language_loss": 0.73741198, "learning_rate": 1.0165781252970473e-07, "loss": 0.75891089, "num_input_tokens_seen": 322130440, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 14937, "time_per_iteration": 2.623490810394287 }, { "auxiliary_loss_clip": 0.01049404, "auxiliary_loss_mlp": 0.01001624, "balance_loss_clip": 1.00037277, "balance_loss_mlp": 1.00126648, "epoch": 0.8981211483541259, "flos": 56192551384320.0, "grad_norm": 0.8452676012153536, "language_loss": 0.63472193, "learning_rate": 1.015389228216701e-07, "loss": 0.65523225, "num_input_tokens_seen": 322187295, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.21289062, "step": 14938, "time_per_iteration": 3.2615153789520264 }, { "auxiliary_loss_clip": 0.01118801, "auxiliary_loss_mlp": 0.01031674, "balance_loss_clip": 1.01908588, "balance_loss_mlp": 1.03419065, "epoch": 0.898181271606794, "flos": 16392718523520.0, "grad_norm": 2.6166141956613145, "language_loss": 0.80396855, "learning_rate": 1.0142010086482833e-07, "loss": 0.82547325, "num_input_tokens_seen": 322202965, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.66796875, "step": 14939, "time_per_iteration": 2.5607059001922607 }, { "auxiliary_loss_clip": 0.01047824, "auxiliary_loss_mlp": 0.00998785, "balance_loss_clip": 0.99746132, "balance_loss_mlp": 1.00082624, "epoch": 0.8982413948594619, "flos": 63440259989760.0, "grad_norm": 0.7184772669984424, "language_loss": 0.52904558, "learning_rate": 1.0130134666341894e-07, "loss": 0.54951167, "num_input_tokens_seen": 322269490, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.2109375, "step": 14940, "time_per_iteration": 3.306663990020752 }, { "auxiliary_loss_clip": 0.0110208, "auxiliary_loss_mlp": 0.01035014, "balance_loss_clip": 1.02157271, "balance_loss_mlp": 1.03279495, "epoch": 0.8983015181121299, "flos": 21835340029440.0, "grad_norm": 1.9285655470301093, "language_loss": 0.77822125, "learning_rate": 1.0118266022168076e-07, "loss": 0.79959214, "num_input_tokens_seen": 322288060, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 14941, "time_per_iteration": 5.022096395492554 }, { "auxiliary_loss_clip": 0.01112669, "auxiliary_loss_mlp": 0.01034052, "balance_loss_clip": 1.02121937, "balance_loss_mlp": 1.03458452, "epoch": 0.8983616413647978, "flos": 28949961767040.0, "grad_norm": 2.6625754279295597, "language_loss": 0.73536241, "learning_rate": 1.0106404154384885e-07, "loss": 0.75682962, "num_input_tokens_seen": 322307930, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 14942, "time_per_iteration": 2.6124565601348877 }, { "auxiliary_loss_clip": 0.01122499, "auxiliary_loss_mlp": 0.01035622, "balance_loss_clip": 1.0228008, "balance_loss_mlp": 1.03446662, "epoch": 0.8984217646174658, "flos": 17785083375360.0, "grad_norm": 2.2302172874905852, "language_loss": 0.79898679, "learning_rate": 1.0094549063415714e-07, "loss": 0.82056797, "num_input_tokens_seen": 322326155, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 14943, "time_per_iteration": 4.183795928955078 }, { "auxiliary_loss_clip": 0.0112261, "auxiliary_loss_mlp": 0.01031227, "balance_loss_clip": 1.01782811, "balance_loss_mlp": 1.03379607, "epoch": 0.8984818878701337, "flos": 23404528558080.0, "grad_norm": 1.5006672734267064, "language_loss": 0.71332043, "learning_rate": 1.0082700749683537e-07, "loss": 0.73485881, "num_input_tokens_seen": 322345850, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 14944, "time_per_iteration": 2.616147041320801 }, { "auxiliary_loss_clip": 0.01135912, "auxiliary_loss_mlp": 0.0103271, "balance_loss_clip": 1.01954949, "balance_loss_mlp": 1.03732097, "epoch": 0.8985420111228017, "flos": 22346061557760.0, "grad_norm": 1.8976693271652303, "language_loss": 0.75778413, "learning_rate": 1.0070859213611283e-07, "loss": 0.77947044, "num_input_tokens_seen": 322364715, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 14945, "time_per_iteration": 2.80985426902771 }, { "auxiliary_loss_clip": 0.01108256, "auxiliary_loss_mlp": 0.01034918, "balance_loss_clip": 1.02179956, "balance_loss_mlp": 1.03420365, "epoch": 0.8986021343754698, "flos": 21392776558080.0, "grad_norm": 2.1072000360436047, "language_loss": 0.7387917, "learning_rate": 1.0059024455621501e-07, "loss": 0.76022351, "num_input_tokens_seen": 322383570, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 14946, "time_per_iteration": 2.598771572113037 }, { "auxiliary_loss_clip": 0.01136581, "auxiliary_loss_mlp": 0.01028146, "balance_loss_clip": 1.01575375, "balance_loss_mlp": 1.03255725, "epoch": 0.8986622576281377, "flos": 21325372686720.0, "grad_norm": 2.4967896098426525, "language_loss": 0.64393318, "learning_rate": 1.0047196476136544e-07, "loss": 0.66558045, "num_input_tokens_seen": 322401375, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 14947, "time_per_iteration": 2.5811710357666016 }, { "auxiliary_loss_clip": 0.01121156, "auxiliary_loss_mlp": 0.01032773, "balance_loss_clip": 1.02047026, "balance_loss_mlp": 1.03446758, "epoch": 0.8987223808808057, "flos": 23988292392960.0, "grad_norm": 1.8868297463443497, "language_loss": 0.69653779, "learning_rate": 1.0035375275578517e-07, "loss": 0.71807706, "num_input_tokens_seen": 322421890, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 14948, "time_per_iteration": 2.6665987968444824 }, { "auxiliary_loss_clip": 0.01129088, "auxiliary_loss_mlp": 0.01029319, "balance_loss_clip": 1.0172435, "balance_loss_mlp": 1.03389812, "epoch": 0.8987825041334736, "flos": 41500956044160.0, "grad_norm": 1.7846059387951771, "language_loss": 0.7466948, "learning_rate": 1.0023560854369306e-07, "loss": 0.7682789, "num_input_tokens_seen": 322445730, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 14949, "time_per_iteration": 3.04842472076416 }, { "auxiliary_loss_clip": 0.01140976, "auxiliary_loss_mlp": 0.0103399, "balance_loss_clip": 1.01974487, "balance_loss_mlp": 1.03478622, "epoch": 0.8988426273861416, "flos": 27564276844800.0, "grad_norm": 1.6296657085600896, "language_loss": 0.75732327, "learning_rate": 1.0011753212930529e-07, "loss": 0.779073, "num_input_tokens_seen": 322464595, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.70703125, "step": 14950, "time_per_iteration": 2.612825393676758 }, { "auxiliary_loss_clip": 0.01121915, "auxiliary_loss_mlp": 0.01029611, "balance_loss_clip": 1.01814914, "balance_loss_mlp": 1.03592014, "epoch": 0.8989027506388095, "flos": 17092653920640.0, "grad_norm": 1.6255621514970404, "language_loss": 0.66191852, "learning_rate": 9.999952351683583e-08, "loss": 0.68343377, "num_input_tokens_seen": 322483305, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 14951, "time_per_iteration": 2.575554847717285 }, { "auxiliary_loss_clip": 0.0110457, "auxiliary_loss_mlp": 0.01027883, "balance_loss_clip": 1.01633132, "balance_loss_mlp": 1.03203809, "epoch": 0.8989628738914776, "flos": 20555124416640.0, "grad_norm": 1.9156364763233193, "language_loss": 0.73990762, "learning_rate": 9.988158271049596e-08, "loss": 0.76123214, "num_input_tokens_seen": 322501905, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.72265625, "step": 14952, "time_per_iteration": 2.546449899673462 }, { "auxiliary_loss_clip": 0.01102342, "auxiliary_loss_mlp": 0.0103505, "balance_loss_clip": 1.02265191, "balance_loss_mlp": 1.03384149, "epoch": 0.8990229971441455, "flos": 16251087196800.0, "grad_norm": 5.049474164076655, "language_loss": 0.57382071, "learning_rate": 9.976370971449477e-08, "loss": 0.59519464, "num_input_tokens_seen": 322518135, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 14953, "time_per_iteration": 2.4761037826538086 }, { "auxiliary_loss_clip": 0.01119596, "auxiliary_loss_mlp": 0.01034483, "balance_loss_clip": 1.02200198, "balance_loss_mlp": 1.03348804, "epoch": 0.8990831203968135, "flos": 21981316901760.0, "grad_norm": 1.9072009743898888, "language_loss": 0.81866735, "learning_rate": 9.964590453303867e-08, "loss": 0.84020817, "num_input_tokens_seen": 322537905, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 14954, "time_per_iteration": 2.8061702251434326 }, { "auxiliary_loss_clip": 0.01101659, "auxiliary_loss_mlp": 0.01031487, "balance_loss_clip": 1.01923239, "balance_loss_mlp": 1.03453708, "epoch": 0.8991432436494814, "flos": 27447171528960.0, "grad_norm": 1.757181798047181, "language_loss": 0.60343862, "learning_rate": 9.952816717033185e-08, "loss": 0.62477005, "num_input_tokens_seen": 322557945, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.671875, "step": 14955, "time_per_iteration": 2.562886953353882 }, { "auxiliary_loss_clip": 0.0110908, "auxiliary_loss_mlp": 0.01031328, "balance_loss_clip": 1.0197413, "balance_loss_mlp": 1.03341913, "epoch": 0.8992033669021494, "flos": 21579835610880.0, "grad_norm": 1.9251958349755134, "language_loss": 0.55050188, "learning_rate": 9.941049763057651e-08, "loss": 0.57190597, "num_input_tokens_seen": 322575765, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66796875, "step": 14956, "time_per_iteration": 2.560636043548584 }, { "auxiliary_loss_clip": 0.01138165, "auxiliary_loss_mlp": 0.01031973, "balance_loss_clip": 1.01944971, "balance_loss_mlp": 1.03415191, "epoch": 0.8992634901548173, "flos": 28584211530240.0, "grad_norm": 1.963114455680319, "language_loss": 0.79875183, "learning_rate": 9.92928959179713e-08, "loss": 0.82045323, "num_input_tokens_seen": 322595665, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 14957, "time_per_iteration": 2.608752727508545 }, { "auxiliary_loss_clip": 0.01106461, "auxiliary_loss_mlp": 0.01031083, "balance_loss_clip": 1.01773143, "balance_loss_mlp": 1.03537774, "epoch": 0.8993236134074853, "flos": 19867435557120.0, "grad_norm": 1.8103536717578117, "language_loss": 0.79109943, "learning_rate": 9.917536203671351e-08, "loss": 0.81247491, "num_input_tokens_seen": 322614755, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 14958, "time_per_iteration": 2.5619916915893555 }, { "auxiliary_loss_clip": 0.01156538, "auxiliary_loss_mlp": 0.01029766, "balance_loss_clip": 1.01826251, "balance_loss_mlp": 1.03642821, "epoch": 0.8993837366601534, "flos": 19390649402880.0, "grad_norm": 1.6409924890707266, "language_loss": 0.74680597, "learning_rate": 9.905789599099734e-08, "loss": 0.76866901, "num_input_tokens_seen": 322633425, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 14959, "time_per_iteration": 2.878377914428711 }, { "auxiliary_loss_clip": 0.01099475, "auxiliary_loss_mlp": 0.01029474, "balance_loss_clip": 1.01736236, "balance_loss_mlp": 1.03261018, "epoch": 0.8994438599128213, "flos": 18551740285440.0, "grad_norm": 2.1476108181093805, "language_loss": 0.68509257, "learning_rate": 9.894049778501546e-08, "loss": 0.70638204, "num_input_tokens_seen": 322652065, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.66796875, "step": 14960, "time_per_iteration": 2.551501750946045 }, { "auxiliary_loss_clip": 0.01120514, "auxiliary_loss_mlp": 0.01029545, "balance_loss_clip": 1.0179162, "balance_loss_mlp": 1.03520882, "epoch": 0.8995039831654893, "flos": 24427587726720.0, "grad_norm": 1.6157788098562735, "language_loss": 0.65509117, "learning_rate": 9.882316742295671e-08, "loss": 0.67659175, "num_input_tokens_seen": 322673275, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 14961, "time_per_iteration": 2.6720683574676514 }, { "auxiliary_loss_clip": 0.01117196, "auxiliary_loss_mlp": 0.01025887, "balance_loss_clip": 1.01452684, "balance_loss_mlp": 1.03236413, "epoch": 0.8995641064181572, "flos": 21251324799360.0, "grad_norm": 1.479741157529379, "language_loss": 0.83055532, "learning_rate": 9.870590490900887e-08, "loss": 0.85198617, "num_input_tokens_seen": 322693375, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.66796875, "step": 14962, "time_per_iteration": 2.629542112350464 }, { "auxiliary_loss_clip": 0.01125827, "auxiliary_loss_mlp": 0.0103285, "balance_loss_clip": 1.01920605, "balance_loss_mlp": 1.03597093, "epoch": 0.8996242296708252, "flos": 23513661054720.0, "grad_norm": 1.9228761851688332, "language_loss": 0.76223755, "learning_rate": 9.85887102473566e-08, "loss": 0.78382432, "num_input_tokens_seen": 322712615, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 14963, "time_per_iteration": 2.6023576259613037 }, { "auxiliary_loss_clip": 0.01066123, "auxiliary_loss_mlp": 0.01002295, "balance_loss_clip": 1.00109696, "balance_loss_mlp": 1.00094247, "epoch": 0.8996843529234931, "flos": 62403230430720.0, "grad_norm": 0.7754207874553739, "language_loss": 0.5754565, "learning_rate": 9.847158344218232e-08, "loss": 0.59614062, "num_input_tokens_seen": 322766855, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.2109375, "step": 14964, "time_per_iteration": 3.0066630840301514 }, { "auxiliary_loss_clip": 0.01119566, "auxiliary_loss_mlp": 0.01032521, "balance_loss_clip": 1.01925898, "balance_loss_mlp": 1.03693891, "epoch": 0.8997444761761612, "flos": 24236829573120.0, "grad_norm": 5.380958797890161, "language_loss": 0.67547613, "learning_rate": 9.835452449766557e-08, "loss": 0.69699699, "num_input_tokens_seen": 322781130, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 14965, "time_per_iteration": 2.5912232398986816 }, { "auxiliary_loss_clip": 0.0110481, "auxiliary_loss_mlp": 0.01031104, "balance_loss_clip": 1.01933169, "balance_loss_mlp": 1.0344528, "epoch": 0.8998045994288291, "flos": 21361103740800.0, "grad_norm": 1.9314565975122868, "language_loss": 0.72289646, "learning_rate": 9.82375334179848e-08, "loss": 0.74425554, "num_input_tokens_seen": 322800310, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.703125, "step": 14966, "time_per_iteration": 2.5226333141326904 }, { "auxiliary_loss_clip": 0.01132366, "auxiliary_loss_mlp": 0.01033534, "balance_loss_clip": 1.02089238, "balance_loss_mlp": 1.03598738, "epoch": 0.8998647226814971, "flos": 28986159697920.0, "grad_norm": 1.7312673480475507, "language_loss": 0.73436606, "learning_rate": 9.812061020731443e-08, "loss": 0.75602508, "num_input_tokens_seen": 322820955, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 14967, "time_per_iteration": 2.5625152587890625 }, { "auxiliary_loss_clip": 0.01137824, "auxiliary_loss_mlp": 0.01277625, "balance_loss_clip": 1.01857042, "balance_loss_mlp": 1.03317261, "epoch": 0.899924845934165, "flos": 13625909706240.0, "grad_norm": 2.203413242461138, "language_loss": 0.72233415, "learning_rate": 9.80037548698267e-08, "loss": 0.74648863, "num_input_tokens_seen": 322838780, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 14968, "time_per_iteration": 2.513498306274414 }, { "auxiliary_loss_clip": 0.01112248, "auxiliary_loss_mlp": 0.01033105, "balance_loss_clip": 1.02085042, "balance_loss_mlp": 1.03418267, "epoch": 0.899984969186833, "flos": 20882629647360.0, "grad_norm": 2.2325327438532256, "language_loss": 0.71069831, "learning_rate": 9.788696740969293e-08, "loss": 0.73215187, "num_input_tokens_seen": 322856710, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 14969, "time_per_iteration": 3.867605209350586 }, { "auxiliary_loss_clip": 0.01106594, "auxiliary_loss_mlp": 0.01030461, "balance_loss_clip": 1.01811671, "balance_loss_mlp": 1.03701961, "epoch": 0.9000450924395009, "flos": 20921808407040.0, "grad_norm": 1.9344281117646491, "language_loss": 0.76113063, "learning_rate": 9.777024783108045e-08, "loss": 0.7825011, "num_input_tokens_seen": 322876070, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 14970, "time_per_iteration": 2.517185688018799 }, { "auxiliary_loss_clip": 0.01112955, "auxiliary_loss_mlp": 0.01031219, "balance_loss_clip": 1.01944733, "balance_loss_mlp": 1.0338124, "epoch": 0.900105215692169, "flos": 17165049782400.0, "grad_norm": 1.58387482787334, "language_loss": 0.73252082, "learning_rate": 9.76535961381546e-08, "loss": 0.75396258, "num_input_tokens_seen": 322895095, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.703125, "step": 14971, "time_per_iteration": 2.5683722496032715 }, { "auxiliary_loss_clip": 0.01130462, "auxiliary_loss_mlp": 0.01032103, "balance_loss_clip": 1.01845968, "balance_loss_mlp": 1.03419256, "epoch": 0.900165338944837, "flos": 19931930426880.0, "grad_norm": 2.4061356043549846, "language_loss": 0.81271851, "learning_rate": 9.753701233507828e-08, "loss": 0.83434415, "num_input_tokens_seen": 322911845, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.6953125, "step": 14972, "time_per_iteration": 2.540855884552002 }, { "auxiliary_loss_clip": 0.0112436, "auxiliary_loss_mlp": 0.01031059, "balance_loss_clip": 1.01861989, "balance_loss_mlp": 1.03396297, "epoch": 0.9002254621975049, "flos": 16107085572480.0, "grad_norm": 1.927793153708072, "language_loss": 0.81560588, "learning_rate": 9.742049642601279e-08, "loss": 0.83716011, "num_input_tokens_seen": 322928170, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 14973, "time_per_iteration": 2.5737593173980713 }, { "auxiliary_loss_clip": 0.01101764, "auxiliary_loss_mlp": 0.01036378, "balance_loss_clip": 1.02349162, "balance_loss_mlp": 1.03361273, "epoch": 0.9002855854501729, "flos": 28476120528000.0, "grad_norm": 1.9147229533483392, "language_loss": 0.5801608, "learning_rate": 9.730404841511531e-08, "loss": 0.60154223, "num_input_tokens_seen": 322948165, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 14974, "time_per_iteration": 2.6125001907348633 }, { "auxiliary_loss_clip": 0.0110652, "auxiliary_loss_mlp": 0.01033983, "balance_loss_clip": 1.02184117, "balance_loss_mlp": 1.03719437, "epoch": 0.9003457087028408, "flos": 25630307746560.0, "grad_norm": 1.616820778900516, "language_loss": 0.63180578, "learning_rate": 9.718766830654201e-08, "loss": 0.65321076, "num_input_tokens_seen": 322968880, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 14975, "time_per_iteration": 2.5537359714508057 }, { "auxiliary_loss_clip": 0.01112009, "auxiliary_loss_mlp": 0.0103133, "balance_loss_clip": 1.01874721, "balance_loss_mlp": 1.03355408, "epoch": 0.9004058319555088, "flos": 24389414547840.0, "grad_norm": 1.9359441788851175, "language_loss": 0.72868806, "learning_rate": 9.707135610444627e-08, "loss": 0.75012147, "num_input_tokens_seen": 322989395, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 14976, "time_per_iteration": 4.041009902954102 }, { "auxiliary_loss_clip": 0.01113495, "auxiliary_loss_mlp": 0.0103249, "balance_loss_clip": 1.02030039, "balance_loss_mlp": 1.03565454, "epoch": 0.9004659552081767, "flos": 29059345658880.0, "grad_norm": 1.898623075715326, "language_loss": 0.69468296, "learning_rate": 9.695511181297922e-08, "loss": 0.71614283, "num_input_tokens_seen": 323009060, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 14977, "time_per_iteration": 2.7461917400360107 }, { "auxiliary_loss_clip": 0.01123888, "auxiliary_loss_mlp": 0.01283286, "balance_loss_clip": 1.02207637, "balance_loss_mlp": 1.0348624, "epoch": 0.9005260784608448, "flos": 16763855800320.0, "grad_norm": 5.639227847488241, "language_loss": 0.656407, "learning_rate": 9.683893543628863e-08, "loss": 0.68047869, "num_input_tokens_seen": 323027530, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.70703125, "step": 14978, "time_per_iteration": 2.580148696899414 }, { "auxiliary_loss_clip": 0.01135226, "auxiliary_loss_mlp": 0.01034717, "balance_loss_clip": 1.02122808, "balance_loss_mlp": 1.03612208, "epoch": 0.9005862017135127, "flos": 20376002269440.0, "grad_norm": 1.694873585928632, "language_loss": 0.78879982, "learning_rate": 9.672282697852118e-08, "loss": 0.81049931, "num_input_tokens_seen": 323045370, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 14979, "time_per_iteration": 2.5394482612609863 }, { "auxiliary_loss_clip": 0.01039768, "auxiliary_loss_mlp": 0.0100256, "balance_loss_clip": 1.00132036, "balance_loss_mlp": 1.00087118, "epoch": 0.9006463249661807, "flos": 67580255796480.0, "grad_norm": 0.7227723847396597, "language_loss": 0.53626895, "learning_rate": 9.660678644382025e-08, "loss": 0.55669224, "num_input_tokens_seen": 323105660, "router_z_loss_clip": 0.01239014, "router_z_loss_mlp": 0.21289062, "step": 14980, "time_per_iteration": 3.237882375717163 }, { "auxiliary_loss_clip": 0.01115902, "auxiliary_loss_mlp": 0.01038998, "balance_loss_clip": 1.02587914, "balance_loss_mlp": 1.0355916, "epoch": 0.9007064482188486, "flos": 28293335193600.0, "grad_norm": 1.7669624261791272, "language_loss": 0.82547796, "learning_rate": 9.649081383632695e-08, "loss": 0.84702694, "num_input_tokens_seen": 323126365, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 14981, "time_per_iteration": 2.733654260635376 }, { "auxiliary_loss_clip": 0.01136608, "auxiliary_loss_mlp": 0.01029623, "balance_loss_clip": 1.01733816, "balance_loss_mlp": 1.03462696, "epoch": 0.9007665714715166, "flos": 21616320850560.0, "grad_norm": 1.4227238254094408, "language_loss": 0.81538939, "learning_rate": 9.637490916017998e-08, "loss": 0.83705175, "num_input_tokens_seen": 323145655, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 14982, "time_per_iteration": 2.8856468200683594 }, { "auxiliary_loss_clip": 0.01117237, "auxiliary_loss_mlp": 0.01030366, "balance_loss_clip": 1.0193516, "balance_loss_mlp": 1.03400123, "epoch": 0.9008266947241845, "flos": 26541864120960.0, "grad_norm": 1.8822165709554508, "language_loss": 0.7211132, "learning_rate": 9.62590724195158e-08, "loss": 0.74258924, "num_input_tokens_seen": 323164540, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6484375, "step": 14983, "time_per_iteration": 4.231679677963257 }, { "auxiliary_loss_clip": 0.01110737, "auxiliary_loss_mlp": 0.01024935, "balance_loss_clip": 1.01389611, "balance_loss_mlp": 1.03531551, "epoch": 0.9008868179768525, "flos": 23110527738240.0, "grad_norm": 1.333380756408315, "language_loss": 0.75146282, "learning_rate": 9.614330361846846e-08, "loss": 0.77281952, "num_input_tokens_seen": 323186960, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6640625, "step": 14984, "time_per_iteration": 4.165903091430664 }, { "auxiliary_loss_clip": 0.01098115, "auxiliary_loss_mlp": 0.01033546, "balance_loss_clip": 1.02135682, "balance_loss_mlp": 1.03192592, "epoch": 0.9009469412295206, "flos": 19060809788160.0, "grad_norm": 1.4132377717271871, "language_loss": 0.70214081, "learning_rate": 9.602760276116906e-08, "loss": 0.7234574, "num_input_tokens_seen": 323206135, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6640625, "step": 14985, "time_per_iteration": 2.668689250946045 }, { "auxiliary_loss_clip": 0.01120877, "auxiliary_loss_mlp": 0.01033395, "balance_loss_clip": 1.02057362, "balance_loss_mlp": 1.03269565, "epoch": 0.9010070644821885, "flos": 23222281927680.0, "grad_norm": 2.039601198159744, "language_loss": 0.70182043, "learning_rate": 9.591196985174721e-08, "loss": 0.72336316, "num_input_tokens_seen": 323225980, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 14986, "time_per_iteration": 2.630556106567383 }, { "auxiliary_loss_clip": 0.0111228, "auxiliary_loss_mlp": 0.01033789, "balance_loss_clip": 1.02106905, "balance_loss_mlp": 1.03337693, "epoch": 0.9010671877348565, "flos": 17384823146880.0, "grad_norm": 1.8616349495709599, "language_loss": 0.76387918, "learning_rate": 9.579640489432917e-08, "loss": 0.78533983, "num_input_tokens_seen": 323243700, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 14987, "time_per_iteration": 2.500988245010376 }, { "auxiliary_loss_clip": 0.0111105, "auxiliary_loss_mlp": 0.01033214, "balance_loss_clip": 1.02085817, "balance_loss_mlp": 1.03386569, "epoch": 0.9011273109875244, "flos": 21908166854400.0, "grad_norm": 2.17305471372113, "language_loss": 0.73517168, "learning_rate": 9.568090789303917e-08, "loss": 0.75661433, "num_input_tokens_seen": 323261535, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 14988, "time_per_iteration": 2.5911569595336914 }, { "auxiliary_loss_clip": 0.01126246, "auxiliary_loss_mlp": 0.010304, "balance_loss_clip": 1.01697087, "balance_loss_mlp": 1.03628814, "epoch": 0.9011874342401924, "flos": 24060831909120.0, "grad_norm": 1.7381985071944828, "language_loss": 0.69670951, "learning_rate": 9.556547885199883e-08, "loss": 0.7182759, "num_input_tokens_seen": 323281855, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 14989, "time_per_iteration": 2.670290470123291 }, { "auxiliary_loss_clip": 0.01103795, "auxiliary_loss_mlp": 0.01030865, "balance_loss_clip": 1.018592, "balance_loss_mlp": 1.03476024, "epoch": 0.9012475574928603, "flos": 16758791982720.0, "grad_norm": 1.823205199699631, "language_loss": 0.79821408, "learning_rate": 9.54501177753284e-08, "loss": 0.81956065, "num_input_tokens_seen": 323299505, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69140625, "step": 14990, "time_per_iteration": 2.622903823852539 }, { "auxiliary_loss_clip": 0.01143043, "auxiliary_loss_mlp": 0.01030996, "balance_loss_clip": 1.01865757, "balance_loss_mlp": 1.03617859, "epoch": 0.9013076807455284, "flos": 19971109186560.0, "grad_norm": 4.046453731230086, "language_loss": 0.77840114, "learning_rate": 9.533482466714349e-08, "loss": 0.80014145, "num_input_tokens_seen": 323318365, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71484375, "step": 14991, "time_per_iteration": 2.613490581512451 }, { "auxiliary_loss_clip": 0.01048458, "auxiliary_loss_mlp": 0.01250442, "balance_loss_clip": 1.00323153, "balance_loss_mlp": 1.00069737, "epoch": 0.9013678039981963, "flos": 65180274624000.0, "grad_norm": 0.7740499214191511, "language_loss": 0.60234761, "learning_rate": 9.521959953155967e-08, "loss": 0.62533665, "num_input_tokens_seen": 323371835, "router_z_loss_clip": 0.01275635, "router_z_loss_mlp": 0.21191406, "step": 14992, "time_per_iteration": 3.138993263244629 }, { "auxiliary_loss_clip": 0.0111067, "auxiliary_loss_mlp": 0.01029204, "balance_loss_clip": 1.01705694, "balance_loss_mlp": 1.03340173, "epoch": 0.9014279272508643, "flos": 20521224956160.0, "grad_norm": 1.9939879608216984, "language_loss": 0.83004594, "learning_rate": 9.510444237268833e-08, "loss": 0.8514446, "num_input_tokens_seen": 323388495, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 14993, "time_per_iteration": 2.628577947616577 }, { "auxiliary_loss_clip": 0.011032, "auxiliary_loss_mlp": 0.01034241, "balance_loss_clip": 1.02151513, "balance_loss_mlp": 1.03384757, "epoch": 0.9014880505035322, "flos": 17309051406720.0, "grad_norm": 2.305341394429723, "language_loss": 0.73460317, "learning_rate": 9.498935319464019e-08, "loss": 0.75597751, "num_input_tokens_seen": 323405280, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 14994, "time_per_iteration": 2.604828119277954 }, { "auxiliary_loss_clip": 0.0110815, "auxiliary_loss_mlp": 0.01027874, "balance_loss_clip": 1.01680565, "balance_loss_mlp": 1.033656, "epoch": 0.9015481737562002, "flos": 28402862739840.0, "grad_norm": 1.6478518925568701, "language_loss": 0.64419794, "learning_rate": 9.487433200152107e-08, "loss": 0.66555822, "num_input_tokens_seen": 323425310, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66015625, "step": 14995, "time_per_iteration": 2.602998971939087 }, { "auxiliary_loss_clip": 0.01130306, "auxiliary_loss_mlp": 0.01031846, "balance_loss_clip": 1.01885843, "balance_loss_mlp": 1.03305113, "epoch": 0.9016082970088681, "flos": 29752672953600.0, "grad_norm": 1.6100253701496907, "language_loss": 0.66687775, "learning_rate": 9.475937879743679e-08, "loss": 0.68849927, "num_input_tokens_seen": 323447805, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 14996, "time_per_iteration": 2.68923282623291 }, { "auxiliary_loss_clip": 0.01108804, "auxiliary_loss_mlp": 0.01028783, "balance_loss_clip": 1.01721334, "balance_loss_mlp": 1.03260648, "epoch": 0.9016684202615362, "flos": 23513230091520.0, "grad_norm": 1.6900214205602038, "language_loss": 0.66040653, "learning_rate": 9.464449358648962e-08, "loss": 0.68178236, "num_input_tokens_seen": 323467150, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 14997, "time_per_iteration": 2.6688907146453857 }, { "auxiliary_loss_clip": 0.01119513, "auxiliary_loss_mlp": 0.01033301, "balance_loss_clip": 1.0213443, "balance_loss_mlp": 1.03311181, "epoch": 0.9017285435142042, "flos": 16979247705600.0, "grad_norm": 1.7954552477362289, "language_loss": 0.77408254, "learning_rate": 9.452967637277875e-08, "loss": 0.79561067, "num_input_tokens_seen": 323484250, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.68359375, "step": 14998, "time_per_iteration": 2.49149751663208 }, { "auxiliary_loss_clip": 0.01103535, "auxiliary_loss_mlp": 0.01030657, "balance_loss_clip": 1.01845598, "balance_loss_mlp": 1.03395283, "epoch": 0.9017886667668721, "flos": 21393351175680.0, "grad_norm": 2.153520379853357, "language_loss": 0.75124544, "learning_rate": 9.441492716040267e-08, "loss": 0.77258742, "num_input_tokens_seen": 323502910, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 14999, "time_per_iteration": 2.592602491378784 }, { "auxiliary_loss_clip": 0.01147239, "auxiliary_loss_mlp": 0.01030411, "balance_loss_clip": 1.01750684, "balance_loss_mlp": 1.03353953, "epoch": 0.9018487900195401, "flos": 20996574566400.0, "grad_norm": 2.5333834312311616, "language_loss": 0.75540721, "learning_rate": 9.430024595345609e-08, "loss": 0.77718377, "num_input_tokens_seen": 323521820, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 15000, "time_per_iteration": 2.7129428386688232 }, { "auxiliary_loss_clip": 0.01108075, "auxiliary_loss_mlp": 0.01025069, "balance_loss_clip": 1.01355386, "balance_loss_mlp": 1.03205347, "epoch": 0.901908913272208, "flos": 53358443458560.0, "grad_norm": 2.0143451557682046, "language_loss": 0.8109073, "learning_rate": 9.418563275603153e-08, "loss": 0.83223879, "num_input_tokens_seen": 323543200, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 15001, "time_per_iteration": 2.7800698280334473 }, { "auxiliary_loss_clip": 0.01128512, "auxiliary_loss_mlp": 0.01026677, "balance_loss_clip": 1.01515532, "balance_loss_mlp": 1.03488016, "epoch": 0.901969036524876, "flos": 22089838867200.0, "grad_norm": 1.7438388603482118, "language_loss": 0.78287315, "learning_rate": 9.407108757221927e-08, "loss": 0.804425, "num_input_tokens_seen": 323563075, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 15002, "time_per_iteration": 2.6263701915740967 }, { "auxiliary_loss_clip": 0.0112107, "auxiliary_loss_mlp": 0.01032574, "balance_loss_clip": 1.01958656, "balance_loss_mlp": 1.03422582, "epoch": 0.9020291597775439, "flos": 23835025059840.0, "grad_norm": 2.0710934657184765, "language_loss": 0.68161589, "learning_rate": 9.395661040610758e-08, "loss": 0.7031523, "num_input_tokens_seen": 323579065, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 15003, "time_per_iteration": 2.678162097930908 }, { "auxiliary_loss_clip": 0.0112308, "auxiliary_loss_mlp": 0.01034469, "balance_loss_clip": 1.02167773, "balance_loss_mlp": 1.03412163, "epoch": 0.902089283030212, "flos": 18326005263360.0, "grad_norm": 2.437304539520658, "language_loss": 0.85885358, "learning_rate": 9.384220126178144e-08, "loss": 0.88042909, "num_input_tokens_seen": 323594835, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 15004, "time_per_iteration": 2.5675055980682373 }, { "auxiliary_loss_clip": 0.01131759, "auxiliary_loss_mlp": 0.01030973, "balance_loss_clip": 1.01830673, "balance_loss_mlp": 1.03505528, "epoch": 0.9021494062828799, "flos": 24170359455360.0, "grad_norm": 2.2368715270821022, "language_loss": 0.72442925, "learning_rate": 9.372786014332378e-08, "loss": 0.74605656, "num_input_tokens_seen": 323611475, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 15005, "time_per_iteration": 2.6328821182250977 }, { "auxiliary_loss_clip": 0.01111322, "auxiliary_loss_mlp": 0.01028775, "balance_loss_clip": 1.01607919, "balance_loss_mlp": 1.03148913, "epoch": 0.9022095295355479, "flos": 21616859554560.0, "grad_norm": 1.995909484323522, "language_loss": 0.71167505, "learning_rate": 9.361358705481515e-08, "loss": 0.73307598, "num_input_tokens_seen": 323629730, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 15006, "time_per_iteration": 2.5238027572631836 }, { "auxiliary_loss_clip": 0.01122591, "auxiliary_loss_mlp": 0.01029918, "balance_loss_clip": 1.01759744, "balance_loss_mlp": 1.0341301, "epoch": 0.9022696527882158, "flos": 18726229578240.0, "grad_norm": 2.291146577501474, "language_loss": 0.84865046, "learning_rate": 9.349938200033446e-08, "loss": 0.87017548, "num_input_tokens_seen": 323646000, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 15007, "time_per_iteration": 2.618961811065674 }, { "auxiliary_loss_clip": 0.01120139, "auxiliary_loss_mlp": 0.01027839, "balance_loss_clip": 1.01579905, "balance_loss_mlp": 1.03535032, "epoch": 0.9023297760408838, "flos": 26761206522240.0, "grad_norm": 1.433189366188307, "language_loss": 0.78624928, "learning_rate": 9.338524498395606e-08, "loss": 0.80772901, "num_input_tokens_seen": 323667250, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.66796875, "step": 15008, "time_per_iteration": 2.7033839225769043 }, { "auxiliary_loss_clip": 0.01126035, "auxiliary_loss_mlp": 0.01029004, "balance_loss_clip": 1.01731503, "balance_loss_mlp": 1.03465366, "epoch": 0.9023898992935517, "flos": 20522553759360.0, "grad_norm": 1.6081973811375643, "language_loss": 0.735232, "learning_rate": 9.327117600975399e-08, "loss": 0.75678229, "num_input_tokens_seen": 323687150, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.64453125, "step": 15009, "time_per_iteration": 2.608959197998047 }, { "auxiliary_loss_clip": 0.01101415, "auxiliary_loss_mlp": 0.01034429, "balance_loss_clip": 1.02220464, "balance_loss_mlp": 1.03403962, "epoch": 0.9024500225462198, "flos": 17456644391040.0, "grad_norm": 1.7328711343289114, "language_loss": 0.73613632, "learning_rate": 9.315717508179921e-08, "loss": 0.75749481, "num_input_tokens_seen": 323703660, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 15010, "time_per_iteration": 2.5617637634277344 }, { "auxiliary_loss_clip": 0.01121746, "auxiliary_loss_mlp": 0.01032972, "balance_loss_clip": 1.02007353, "balance_loss_mlp": 1.03418732, "epoch": 0.9025101457988878, "flos": 20813609664000.0, "grad_norm": 1.888907214500062, "language_loss": 0.73978293, "learning_rate": 9.30432422041596e-08, "loss": 0.76133013, "num_input_tokens_seen": 323722060, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 15011, "time_per_iteration": 4.0986058712005615 }, { "auxiliary_loss_clip": 0.0111559, "auxiliary_loss_mlp": 0.01033597, "balance_loss_clip": 1.02097821, "balance_loss_mlp": 1.03445911, "epoch": 0.9025702690515557, "flos": 19026371623680.0, "grad_norm": 3.4295597648359077, "language_loss": 0.72977686, "learning_rate": 9.292937738090168e-08, "loss": 0.75126874, "num_input_tokens_seen": 323740645, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 15012, "time_per_iteration": 2.5922460556030273 }, { "auxiliary_loss_clip": 0.01129228, "auxiliary_loss_mlp": 0.01036488, "balance_loss_clip": 1.02487707, "balance_loss_mlp": 1.03448677, "epoch": 0.9026303923042237, "flos": 19682818629120.0, "grad_norm": 1.6388258440163925, "language_loss": 0.69212419, "learning_rate": 9.281558061608818e-08, "loss": 0.71378136, "num_input_tokens_seen": 323758905, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 15013, "time_per_iteration": 2.7102818489074707 }, { "auxiliary_loss_clip": 0.01106465, "auxiliary_loss_mlp": 0.01030043, "balance_loss_clip": 1.01824689, "balance_loss_mlp": 1.03152633, "epoch": 0.9026905155568916, "flos": 24608110504320.0, "grad_norm": 1.4630678980531782, "language_loss": 0.72934139, "learning_rate": 9.270185191378144e-08, "loss": 0.75070643, "num_input_tokens_seen": 323780595, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66015625, "step": 15014, "time_per_iteration": 2.6020712852478027 }, { "auxiliary_loss_clip": 0.01101642, "auxiliary_loss_mlp": 0.0102994, "balance_loss_clip": 1.01722622, "balance_loss_mlp": 1.03261185, "epoch": 0.9027506388095596, "flos": 20521799573760.0, "grad_norm": 1.791032645016125, "language_loss": 0.72021532, "learning_rate": 9.258819127803886e-08, "loss": 0.74153113, "num_input_tokens_seen": 323798160, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 15015, "time_per_iteration": 2.6653048992156982 }, { "auxiliary_loss_clip": 0.01120705, "auxiliary_loss_mlp": 0.01030169, "balance_loss_clip": 1.01823592, "balance_loss_mlp": 1.03332889, "epoch": 0.9028107620622275, "flos": 22784494965120.0, "grad_norm": 2.089498984658803, "language_loss": 0.69165164, "learning_rate": 9.247459871291763e-08, "loss": 0.7131604, "num_input_tokens_seen": 323816810, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 15016, "time_per_iteration": 2.735102415084839 }, { "auxiliary_loss_clip": 0.01100812, "auxiliary_loss_mlp": 0.01026878, "balance_loss_clip": 1.01573801, "balance_loss_mlp": 1.0335238, "epoch": 0.9028708853148956, "flos": 25410534382080.0, "grad_norm": 1.6809014765154426, "language_loss": 0.70401883, "learning_rate": 9.23610742224712e-08, "loss": 0.72529566, "num_input_tokens_seen": 323836900, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.671875, "step": 15017, "time_per_iteration": 4.013059139251709 }, { "auxiliary_loss_clip": 0.0112287, "auxiliary_loss_mlp": 0.01033731, "balance_loss_clip": 1.02240014, "balance_loss_mlp": 1.03774464, "epoch": 0.9029310085675635, "flos": 21871322478720.0, "grad_norm": 5.901765185426073, "language_loss": 0.69483685, "learning_rate": 9.224761781075075e-08, "loss": 0.71640289, "num_input_tokens_seen": 323855325, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.67578125, "step": 15018, "time_per_iteration": 2.6984097957611084 }, { "auxiliary_loss_clip": 0.01112814, "auxiliary_loss_mlp": 0.01031355, "balance_loss_clip": 1.01854646, "balance_loss_mlp": 1.03492343, "epoch": 0.9029911318202315, "flos": 26214394803840.0, "grad_norm": 2.2989333105969108, "language_loss": 0.68790317, "learning_rate": 9.213422948180549e-08, "loss": 0.70934486, "num_input_tokens_seen": 323875650, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 15019, "time_per_iteration": 2.7485902309417725 }, { "auxiliary_loss_clip": 0.01099575, "auxiliary_loss_mlp": 0.01031899, "balance_loss_clip": 1.02039552, "balance_loss_mlp": 1.03293896, "epoch": 0.9030512550728994, "flos": 15961360095360.0, "grad_norm": 2.124840419536666, "language_loss": 0.72294879, "learning_rate": 9.20209092396822e-08, "loss": 0.74426347, "num_input_tokens_seen": 323892920, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 15020, "time_per_iteration": 2.5507733821868896 }, { "auxiliary_loss_clip": 0.01112047, "auxiliary_loss_mlp": 0.0103172, "balance_loss_clip": 1.01913738, "balance_loss_mlp": 1.03318727, "epoch": 0.9031113783255674, "flos": 23987610034560.0, "grad_norm": 1.5083174488685107, "language_loss": 0.74186611, "learning_rate": 9.190765708842474e-08, "loss": 0.76330376, "num_input_tokens_seen": 323913835, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 15021, "time_per_iteration": 2.656867027282715 }, { "auxiliary_loss_clip": 0.01106656, "auxiliary_loss_mlp": 0.0103138, "balance_loss_clip": 1.01817143, "balance_loss_mlp": 1.03497934, "epoch": 0.9031715015782353, "flos": 22237216369920.0, "grad_norm": 2.1263838428510478, "language_loss": 0.72871822, "learning_rate": 9.179447303207477e-08, "loss": 0.75009859, "num_input_tokens_seen": 323933440, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 15022, "time_per_iteration": 2.5342838764190674 }, { "auxiliary_loss_clip": 0.01123057, "auxiliary_loss_mlp": 0.01026119, "balance_loss_clip": 1.01422238, "balance_loss_mlp": 1.0324831, "epoch": 0.9032316248309034, "flos": 26323168164480.0, "grad_norm": 1.6716560433941525, "language_loss": 0.72657132, "learning_rate": 9.168135707467128e-08, "loss": 0.74806309, "num_input_tokens_seen": 323954090, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.640625, "step": 15023, "time_per_iteration": 2.722245216369629 }, { "auxiliary_loss_clip": 0.0111202, "auxiliary_loss_mlp": 0.0103082, "balance_loss_clip": 1.01878619, "balance_loss_mlp": 1.03468585, "epoch": 0.9032917480835713, "flos": 22636686499200.0, "grad_norm": 1.7694022032734258, "language_loss": 0.83012354, "learning_rate": 9.156830922025194e-08, "loss": 0.85155189, "num_input_tokens_seen": 323974040, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.68359375, "step": 15024, "time_per_iteration": 4.2211833000183105 }, { "auxiliary_loss_clip": 0.01113395, "auxiliary_loss_mlp": 0.01029992, "balance_loss_clip": 1.01820862, "balance_loss_mlp": 1.03446925, "epoch": 0.9033518713362393, "flos": 23878764846720.0, "grad_norm": 1.7262883387251513, "language_loss": 0.69864666, "learning_rate": 9.145532947285017e-08, "loss": 0.72008049, "num_input_tokens_seen": 323996125, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.703125, "step": 15025, "time_per_iteration": 2.6248698234558105 }, { "auxiliary_loss_clip": 0.01126203, "auxiliary_loss_mlp": 0.01030147, "balance_loss_clip": 1.01817274, "balance_loss_mlp": 1.03236699, "epoch": 0.9034119945889073, "flos": 26905279973760.0, "grad_norm": 2.0501233248092032, "language_loss": 0.76856738, "learning_rate": 9.134241783649855e-08, "loss": 0.79013085, "num_input_tokens_seen": 324017645, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 15026, "time_per_iteration": 4.290085554122925 }, { "auxiliary_loss_clip": 0.01110652, "auxiliary_loss_mlp": 0.01027935, "balance_loss_clip": 1.01636004, "balance_loss_mlp": 1.03452444, "epoch": 0.9034721178415752, "flos": 20850166730880.0, "grad_norm": 1.7122618738392217, "language_loss": 0.68130279, "learning_rate": 9.122957431522648e-08, "loss": 0.70268869, "num_input_tokens_seen": 324036875, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 15027, "time_per_iteration": 2.703437566757202 }, { "auxiliary_loss_clip": 0.01134974, "auxiliary_loss_mlp": 0.0102944, "balance_loss_clip": 1.01822853, "balance_loss_mlp": 1.03430748, "epoch": 0.9035322410942432, "flos": 22234307368320.0, "grad_norm": 1.7497427423405383, "language_loss": 0.75595307, "learning_rate": 9.111679891306057e-08, "loss": 0.77759731, "num_input_tokens_seen": 324057045, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6484375, "step": 15028, "time_per_iteration": 2.6435437202453613 }, { "auxiliary_loss_clip": 0.01138276, "auxiliary_loss_mlp": 0.01035997, "balance_loss_clip": 1.02356994, "balance_loss_mlp": 1.0351553, "epoch": 0.9035923643469111, "flos": 25923410726400.0, "grad_norm": 1.8964920385538635, "language_loss": 0.69016081, "learning_rate": 9.100409163402601e-08, "loss": 0.71190357, "num_input_tokens_seen": 324079735, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.671875, "step": 15029, "time_per_iteration": 2.789039373397827 }, { "auxiliary_loss_clip": 0.01115052, "auxiliary_loss_mlp": 0.01032164, "balance_loss_clip": 1.01924133, "balance_loss_mlp": 1.03538918, "epoch": 0.9036524875995792, "flos": 20339804338560.0, "grad_norm": 1.9754737539887894, "language_loss": 0.73714554, "learning_rate": 9.08914524821447e-08, "loss": 0.75861776, "num_input_tokens_seen": 324097785, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 15030, "time_per_iteration": 2.5436995029449463 }, { "auxiliary_loss_clip": 0.01113114, "auxiliary_loss_mlp": 0.01033004, "balance_loss_clip": 1.02045703, "balance_loss_mlp": 1.03383017, "epoch": 0.9037126108522471, "flos": 40114624677120.0, "grad_norm": 2.203404149677633, "language_loss": 0.68284029, "learning_rate": 9.07788814614372e-08, "loss": 0.70430148, "num_input_tokens_seen": 324121625, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 15031, "time_per_iteration": 2.845616102218628 }, { "auxiliary_loss_clip": 0.01116698, "auxiliary_loss_mlp": 0.01025496, "balance_loss_clip": 1.01341462, "balance_loss_mlp": 1.03198981, "epoch": 0.9037727341049151, "flos": 23332024955520.0, "grad_norm": 1.585642819470976, "language_loss": 0.75966978, "learning_rate": 9.066637857591985e-08, "loss": 0.78109169, "num_input_tokens_seen": 324142535, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 15032, "time_per_iteration": 2.5877480506896973 }, { "auxiliary_loss_clip": 0.01103706, "auxiliary_loss_mlp": 0.01034161, "balance_loss_clip": 1.02176309, "balance_loss_mlp": 1.03415203, "epoch": 0.903832857357583, "flos": 12822659815680.0, "grad_norm": 2.2533691034317256, "language_loss": 0.75242239, "learning_rate": 9.055394382960813e-08, "loss": 0.77380097, "num_input_tokens_seen": 324159610, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 15033, "time_per_iteration": 2.6270484924316406 }, { "auxiliary_loss_clip": 0.01130046, "auxiliary_loss_mlp": 0.01034283, "balance_loss_clip": 1.02188492, "balance_loss_mlp": 1.03374314, "epoch": 0.903892980610251, "flos": 25703026830720.0, "grad_norm": 1.6412535308066165, "language_loss": 0.74074417, "learning_rate": 9.044157722651458e-08, "loss": 0.76238745, "num_input_tokens_seen": 324182510, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 15034, "time_per_iteration": 2.6416661739349365 }, { "auxiliary_loss_clip": 0.01130052, "auxiliary_loss_mlp": 0.01031671, "balance_loss_clip": 1.01982796, "balance_loss_mlp": 1.03471184, "epoch": 0.9039531038629189, "flos": 14684089397760.0, "grad_norm": 1.5589951176561616, "language_loss": 0.63182205, "learning_rate": 9.032927877064911e-08, "loss": 0.65343928, "num_input_tokens_seen": 324200555, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 15035, "time_per_iteration": 2.6633851528167725 }, { "auxiliary_loss_clip": 0.01103234, "auxiliary_loss_mlp": 0.01031052, "balance_loss_clip": 1.01824856, "balance_loss_mlp": 1.03477669, "epoch": 0.904013227115587, "flos": 24024921287040.0, "grad_norm": 2.163354775876305, "language_loss": 0.61681229, "learning_rate": 9.02170484660194e-08, "loss": 0.6381551, "num_input_tokens_seen": 324220255, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.68359375, "step": 15036, "time_per_iteration": 2.5453591346740723 }, { "auxiliary_loss_clip": 0.01134462, "auxiliary_loss_mlp": 0.01026117, "balance_loss_clip": 1.01414251, "balance_loss_mlp": 1.03337359, "epoch": 0.9040733503682549, "flos": 22856459863680.0, "grad_norm": 1.631005551244134, "language_loss": 0.82545269, "learning_rate": 9.010488631663072e-08, "loss": 0.84705842, "num_input_tokens_seen": 324237855, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.65234375, "step": 15037, "time_per_iteration": 2.7535252571105957 }, { "auxiliary_loss_clip": 0.01105682, "auxiliary_loss_mlp": 0.01030374, "balance_loss_clip": 1.01806021, "balance_loss_mlp": 1.03497386, "epoch": 0.9041334736209229, "flos": 19974951941760.0, "grad_norm": 1.7840748614939848, "language_loss": 0.67667639, "learning_rate": 8.999279232648582e-08, "loss": 0.69803691, "num_input_tokens_seen": 324257050, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 15038, "time_per_iteration": 2.5987322330474854 }, { "auxiliary_loss_clip": 0.01119558, "auxiliary_loss_mlp": 0.01036129, "balance_loss_clip": 1.0233258, "balance_loss_mlp": 1.03298569, "epoch": 0.9041935968735909, "flos": 17530548624000.0, "grad_norm": 1.8200025228374572, "language_loss": 0.7527526, "learning_rate": 8.98807664995851e-08, "loss": 0.77430952, "num_input_tokens_seen": 324275510, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 15039, "time_per_iteration": 2.6427559852600098 }, { "auxiliary_loss_clip": 0.01118392, "auxiliary_loss_mlp": 0.01028928, "balance_loss_clip": 1.01712072, "balance_loss_mlp": 1.03392577, "epoch": 0.9042537201262588, "flos": 22780149419520.0, "grad_norm": 1.6321513935471181, "language_loss": 0.700881, "learning_rate": 8.976880883992599e-08, "loss": 0.72235417, "num_input_tokens_seen": 324295150, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66796875, "step": 15040, "time_per_iteration": 2.5906171798706055 }, { "auxiliary_loss_clip": 0.01125379, "auxiliary_loss_mlp": 0.01027167, "balance_loss_clip": 1.01534748, "balance_loss_mlp": 1.03191388, "epoch": 0.9043138433789268, "flos": 20595416497920.0, "grad_norm": 1.6963157242686713, "language_loss": 0.68033004, "learning_rate": 8.96569193515051e-08, "loss": 0.70185548, "num_input_tokens_seen": 324313855, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66796875, "step": 15041, "time_per_iteration": 2.687382221221924 }, { "auxiliary_loss_clip": 0.01111263, "auxiliary_loss_mlp": 0.01030887, "balance_loss_clip": 1.01816714, "balance_loss_mlp": 1.0334841, "epoch": 0.9043739666315948, "flos": 32962978995840.0, "grad_norm": 2.2625019577992473, "language_loss": 0.57278043, "learning_rate": 8.954509803831455e-08, "loss": 0.59420192, "num_input_tokens_seen": 324338465, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 15042, "time_per_iteration": 2.6307103633880615 }, { "auxiliary_loss_clip": 0.01039758, "auxiliary_loss_mlp": 0.0124704, "balance_loss_clip": 0.9999041, "balance_loss_mlp": 1.00108957, "epoch": 0.9044340898842628, "flos": 67296418525440.0, "grad_norm": 0.7801925217307407, "language_loss": 0.5689739, "learning_rate": 8.943334490434473e-08, "loss": 0.59184194, "num_input_tokens_seen": 324398740, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.21191406, "step": 15043, "time_per_iteration": 3.1408333778381348 }, { "auxiliary_loss_clip": 0.01120388, "auxiliary_loss_mlp": 0.0102902, "balance_loss_clip": 1.01582909, "balance_loss_mlp": 1.03363848, "epoch": 0.9044942131369307, "flos": 17713154390400.0, "grad_norm": 2.107961668112218, "language_loss": 0.70377958, "learning_rate": 8.932165995358487e-08, "loss": 0.72527367, "num_input_tokens_seen": 324417335, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69140625, "step": 15044, "time_per_iteration": 2.5539474487304688 }, { "auxiliary_loss_clip": 0.0112165, "auxiliary_loss_mlp": 0.01037263, "balance_loss_clip": 1.02382207, "balance_loss_mlp": 1.03355837, "epoch": 0.9045543363895987, "flos": 17820563034240.0, "grad_norm": 2.557452718467562, "language_loss": 0.69515073, "learning_rate": 8.921004319001957e-08, "loss": 0.71673989, "num_input_tokens_seen": 324433240, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 15045, "time_per_iteration": 2.6781749725341797 }, { "auxiliary_loss_clip": 0.01133094, "auxiliary_loss_mlp": 0.0103418, "balance_loss_clip": 1.02153206, "balance_loss_mlp": 1.03634775, "epoch": 0.9046144596422666, "flos": 21872723109120.0, "grad_norm": 1.9446193688959794, "language_loss": 0.66185415, "learning_rate": 8.909849461763297e-08, "loss": 0.68352681, "num_input_tokens_seen": 324452675, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 15046, "time_per_iteration": 2.591524362564087 }, { "auxiliary_loss_clip": 0.01131772, "auxiliary_loss_mlp": 0.01034457, "balance_loss_clip": 1.02142727, "balance_loss_mlp": 1.0339241, "epoch": 0.9046745828949346, "flos": 17672646827520.0, "grad_norm": 2.2708204964144927, "language_loss": 0.62307501, "learning_rate": 8.898701424040545e-08, "loss": 0.64473724, "num_input_tokens_seen": 324467865, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 15047, "time_per_iteration": 2.632758617401123 }, { "auxiliary_loss_clip": 0.01102614, "auxiliary_loss_mlp": 0.01277388, "balance_loss_clip": 1.01913452, "balance_loss_mlp": 1.03514493, "epoch": 0.9047347061476025, "flos": 25702559953920.0, "grad_norm": 1.604987219993028, "language_loss": 0.71355981, "learning_rate": 8.887560206231626e-08, "loss": 0.73735988, "num_input_tokens_seen": 324490430, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.67578125, "step": 15048, "time_per_iteration": 2.72037410736084 }, { "auxiliary_loss_clip": 0.01112063, "auxiliary_loss_mlp": 0.01032654, "balance_loss_clip": 1.01913524, "balance_loss_mlp": 1.03252077, "epoch": 0.9047948294002706, "flos": 15158146118400.0, "grad_norm": 2.903111086022661, "language_loss": 0.74678707, "learning_rate": 8.876425808734022e-08, "loss": 0.76823425, "num_input_tokens_seen": 324506620, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 15049, "time_per_iteration": 2.5543699264526367 }, { "auxiliary_loss_clip": 0.01111148, "auxiliary_loss_mlp": 0.01029636, "balance_loss_clip": 1.01692879, "balance_loss_mlp": 1.03569603, "epoch": 0.9048549526529385, "flos": 21872292145920.0, "grad_norm": 1.66834297815156, "language_loss": 0.7581377, "learning_rate": 8.865298231945173e-08, "loss": 0.77954555, "num_input_tokens_seen": 324525505, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6640625, "step": 15050, "time_per_iteration": 2.675025224685669 }, { "auxiliary_loss_clip": 0.01103334, "auxiliary_loss_mlp": 0.01281267, "balance_loss_clip": 1.02285957, "balance_loss_mlp": 1.03406036, "epoch": 0.9049150759056065, "flos": 23546626761600.0, "grad_norm": 1.729386797984623, "language_loss": 0.82033432, "learning_rate": 8.854177476262181e-08, "loss": 0.84418035, "num_input_tokens_seen": 324544415, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.69140625, "step": 15051, "time_per_iteration": 2.5486299991607666 }, { "auxiliary_loss_clip": 0.01110579, "auxiliary_loss_mlp": 0.01029154, "balance_loss_clip": 1.01742387, "balance_loss_mlp": 1.0332824, "epoch": 0.9049751991582745, "flos": 19645902426240.0, "grad_norm": 1.9391177045225587, "language_loss": 0.88930762, "learning_rate": 8.843063542081908e-08, "loss": 0.91070503, "num_input_tokens_seen": 324562555, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 15052, "time_per_iteration": 2.726142168045044 }, { "auxiliary_loss_clip": 0.01116522, "auxiliary_loss_mlp": 0.0103309, "balance_loss_clip": 1.02090132, "balance_loss_mlp": 1.03574491, "epoch": 0.9050353224109424, "flos": 15596220389760.0, "grad_norm": 2.3052331012646214, "language_loss": 0.77466238, "learning_rate": 8.831956429800946e-08, "loss": 0.79615849, "num_input_tokens_seen": 324580865, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71875, "step": 15053, "time_per_iteration": 3.9124863147735596 }, { "auxiliary_loss_clip": 0.01109015, "auxiliary_loss_mlp": 0.01035998, "balance_loss_clip": 1.02370119, "balance_loss_mlp": 1.03334641, "epoch": 0.9050954456636104, "flos": 28183592165760.0, "grad_norm": 1.8631557070485167, "language_loss": 0.7297104, "learning_rate": 8.82085613981578e-08, "loss": 0.75116056, "num_input_tokens_seen": 324600665, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.66796875, "step": 15054, "time_per_iteration": 2.8000075817108154 }, { "auxiliary_loss_clip": 0.01137606, "auxiliary_loss_mlp": 0.01031857, "balance_loss_clip": 1.01995373, "balance_loss_mlp": 1.03373682, "epoch": 0.9051555689162784, "flos": 25731611078400.0, "grad_norm": 1.758469979248819, "language_loss": 0.83353639, "learning_rate": 8.809762672522425e-08, "loss": 0.85523099, "num_input_tokens_seen": 324618145, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 15055, "time_per_iteration": 2.673671007156372 }, { "auxiliary_loss_clip": 0.01108496, "auxiliary_loss_mlp": 0.01037381, "balance_loss_clip": 1.02507854, "balance_loss_mlp": 1.0333035, "epoch": 0.9052156921689464, "flos": 23257258796160.0, "grad_norm": 1.6888807551634708, "language_loss": 0.85191894, "learning_rate": 8.798676028316876e-08, "loss": 0.87337768, "num_input_tokens_seen": 324638165, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.66796875, "step": 15056, "time_per_iteration": 2.658888578414917 }, { "auxiliary_loss_clip": 0.01118606, "auxiliary_loss_mlp": 0.01029906, "balance_loss_clip": 1.01761532, "balance_loss_mlp": 1.03332281, "epoch": 0.9052758154216143, "flos": 29564285097600.0, "grad_norm": 2.30225144071818, "language_loss": 0.72427833, "learning_rate": 8.78759620759475e-08, "loss": 0.74576354, "num_input_tokens_seen": 324658560, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 15057, "time_per_iteration": 2.751744031906128 }, { "auxiliary_loss_clip": 0.01109709, "auxiliary_loss_mlp": 0.01028106, "balance_loss_clip": 1.01633358, "balance_loss_mlp": 1.03413916, "epoch": 0.9053359386742823, "flos": 22127688823680.0, "grad_norm": 1.6031087871295662, "language_loss": 0.81230158, "learning_rate": 8.776523210751463e-08, "loss": 0.83367968, "num_input_tokens_seen": 324679185, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6640625, "step": 15058, "time_per_iteration": 2.68245530128479 }, { "auxiliary_loss_clip": 0.01030799, "auxiliary_loss_mlp": 0.01003577, "balance_loss_clip": 1.00238466, "balance_loss_mlp": 1.00111449, "epoch": 0.9053960619269502, "flos": 67090112760960.0, "grad_norm": 0.6864277551716109, "language_loss": 0.51361054, "learning_rate": 8.765457038182166e-08, "loss": 0.53395426, "num_input_tokens_seen": 324744830, "router_z_loss_clip": 0.01190186, "router_z_loss_mlp": 0.2109375, "step": 15059, "time_per_iteration": 4.699098110198975 }, { "auxiliary_loss_clip": 0.0113738, "auxiliary_loss_mlp": 0.01029066, "balance_loss_clip": 1.01672757, "balance_loss_mlp": 1.03343272, "epoch": 0.9054561851796182, "flos": 15815419136640.0, "grad_norm": 2.068307182948565, "language_loss": 0.67046374, "learning_rate": 8.754397690281768e-08, "loss": 0.69212818, "num_input_tokens_seen": 324762905, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 15060, "time_per_iteration": 2.71187162399292 }, { "auxiliary_loss_clip": 0.01121954, "auxiliary_loss_mlp": 0.01032836, "balance_loss_clip": 1.02087367, "balance_loss_mlp": 1.03476834, "epoch": 0.9055163084322861, "flos": 17566997950080.0, "grad_norm": 1.97081625243571, "language_loss": 0.64212096, "learning_rate": 8.743345167445038e-08, "loss": 0.66366887, "num_input_tokens_seen": 324781905, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 15061, "time_per_iteration": 2.6491928100585938 }, { "auxiliary_loss_clip": 0.01123066, "auxiliary_loss_mlp": 0.01035045, "balance_loss_clip": 1.02291536, "balance_loss_mlp": 1.03617287, "epoch": 0.9055764316849542, "flos": 17639573379840.0, "grad_norm": 1.7119197425963237, "language_loss": 0.71326995, "learning_rate": 8.732299470066285e-08, "loss": 0.734851, "num_input_tokens_seen": 324799260, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 15062, "time_per_iteration": 2.7658865451812744 }, { "auxiliary_loss_clip": 0.01099739, "auxiliary_loss_mlp": 0.01032758, "balance_loss_clip": 1.02087879, "balance_loss_mlp": 1.03253067, "epoch": 0.9056365549376221, "flos": 20120856986880.0, "grad_norm": 1.9753446306243823, "language_loss": 0.70966655, "learning_rate": 8.721260598539792e-08, "loss": 0.73099148, "num_input_tokens_seen": 324817800, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 15063, "time_per_iteration": 2.5552570819854736 }, { "auxiliary_loss_clip": 0.01125333, "auxiliary_loss_mlp": 0.01029093, "balance_loss_clip": 1.01713037, "balance_loss_mlp": 1.03207207, "epoch": 0.9056966781902901, "flos": 27598786836480.0, "grad_norm": 1.6203555822591047, "language_loss": 0.72069418, "learning_rate": 8.710228553259469e-08, "loss": 0.7422384, "num_input_tokens_seen": 324838445, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66015625, "step": 15064, "time_per_iteration": 2.6122167110443115 }, { "auxiliary_loss_clip": 0.01110792, "auxiliary_loss_mlp": 0.01034811, "balance_loss_clip": 1.02229977, "balance_loss_mlp": 1.03138244, "epoch": 0.9057568014429581, "flos": 25920106675200.0, "grad_norm": 1.826451953388549, "language_loss": 0.69775224, "learning_rate": 8.699203334619065e-08, "loss": 0.71920818, "num_input_tokens_seen": 324859895, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 15065, "time_per_iteration": 4.165670394897461 }, { "auxiliary_loss_clip": 0.01122486, "auxiliary_loss_mlp": 0.01035603, "balance_loss_clip": 1.02369404, "balance_loss_mlp": 1.03593731, "epoch": 0.905816924695626, "flos": 22930364096640.0, "grad_norm": 1.5087732479120533, "language_loss": 0.62805784, "learning_rate": 8.688184943011956e-08, "loss": 0.64963871, "num_input_tokens_seen": 324879580, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 15066, "time_per_iteration": 2.561328887939453 }, { "auxiliary_loss_clip": 0.01158926, "auxiliary_loss_mlp": 0.01028734, "balance_loss_clip": 1.01512671, "balance_loss_mlp": 1.03237629, "epoch": 0.905877047948294, "flos": 22157422306560.0, "grad_norm": 2.325104878127691, "language_loss": 0.79943126, "learning_rate": 8.677173378831448e-08, "loss": 0.82130778, "num_input_tokens_seen": 324898950, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 15067, "time_per_iteration": 5.108846426010132 }, { "auxiliary_loss_clip": 0.01122379, "auxiliary_loss_mlp": 0.01034236, "balance_loss_clip": 1.0211463, "balance_loss_mlp": 1.03461933, "epoch": 0.905937171200962, "flos": 15304805349120.0, "grad_norm": 2.0069537942745432, "language_loss": 0.69899917, "learning_rate": 8.666168642470495e-08, "loss": 0.72056532, "num_input_tokens_seen": 324917455, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 15068, "time_per_iteration": 2.558293342590332 }, { "auxiliary_loss_clip": 0.01099403, "auxiliary_loss_mlp": 0.01026975, "balance_loss_clip": 1.01523852, "balance_loss_mlp": 1.03305757, "epoch": 0.90599729445363, "flos": 19462973437440.0, "grad_norm": 1.8934860220648455, "language_loss": 0.85497665, "learning_rate": 8.655170734321804e-08, "loss": 0.87624043, "num_input_tokens_seen": 324934495, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 15069, "time_per_iteration": 2.752598762512207 }, { "auxiliary_loss_clip": 0.01116277, "auxiliary_loss_mlp": 0.01030401, "balance_loss_clip": 1.0174191, "balance_loss_mlp": 1.03426814, "epoch": 0.9060574177062979, "flos": 23732967542400.0, "grad_norm": 1.916816640684055, "language_loss": 0.59555352, "learning_rate": 8.644179654777839e-08, "loss": 0.61702037, "num_input_tokens_seen": 324953230, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 15070, "time_per_iteration": 2.5263030529022217 }, { "auxiliary_loss_clip": 0.01131495, "auxiliary_loss_mlp": 0.01024945, "balance_loss_clip": 1.01233315, "balance_loss_mlp": 1.03361607, "epoch": 0.9061175409589659, "flos": 17311134395520.0, "grad_norm": 2.8209558613335504, "language_loss": 0.81675017, "learning_rate": 8.633195404230931e-08, "loss": 0.83831459, "num_input_tokens_seen": 324969880, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 15071, "time_per_iteration": 2.6955535411834717 }, { "auxiliary_loss_clip": 0.0110734, "auxiliary_loss_mlp": 0.01035155, "balance_loss_clip": 1.02171409, "balance_loss_mlp": 1.03485453, "epoch": 0.9061776642116338, "flos": 17778439359360.0, "grad_norm": 2.0431185470107756, "language_loss": 0.61798644, "learning_rate": 8.622217983072988e-08, "loss": 0.63941139, "num_input_tokens_seen": 324987005, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 15072, "time_per_iteration": 2.5248942375183105 }, { "auxiliary_loss_clip": 0.01120161, "auxiliary_loss_mlp": 0.01032786, "balance_loss_clip": 1.02002418, "balance_loss_mlp": 1.03345418, "epoch": 0.9062377874643018, "flos": 42777688037760.0, "grad_norm": 2.1814816881857224, "language_loss": 0.7343055, "learning_rate": 8.611247391695787e-08, "loss": 0.75583494, "num_input_tokens_seen": 325010700, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 15073, "time_per_iteration": 2.7992711067199707 }, { "auxiliary_loss_clip": 0.01109046, "auxiliary_loss_mlp": 0.01025667, "balance_loss_clip": 1.01413941, "balance_loss_mlp": 1.03382957, "epoch": 0.9062979107169697, "flos": 16361620323840.0, "grad_norm": 2.0078450975881443, "language_loss": 0.8145023, "learning_rate": 8.600283630490901e-08, "loss": 0.83584946, "num_input_tokens_seen": 325028760, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 15074, "time_per_iteration": 2.7753212451934814 }, { "auxiliary_loss_clip": 0.01066486, "auxiliary_loss_mlp": 0.01248843, "balance_loss_clip": 1.00176799, "balance_loss_mlp": 1.001158, "epoch": 0.9063580339696378, "flos": 69313988528640.0, "grad_norm": 0.7398256386288505, "language_loss": 0.52370667, "learning_rate": 8.58932669984953e-08, "loss": 0.54685998, "num_input_tokens_seen": 325093545, "router_z_loss_clip": 0.01184082, "router_z_loss_mlp": 0.21191406, "step": 15075, "time_per_iteration": 3.340449571609497 }, { "auxiliary_loss_clip": 0.01048952, "auxiliary_loss_mlp": 0.01002018, "balance_loss_clip": 1.0008316, "balance_loss_mlp": 1.00107455, "epoch": 0.9064181572223057, "flos": 58794747148800.0, "grad_norm": 0.7265699335493493, "language_loss": 0.62125885, "learning_rate": 8.578376600162718e-08, "loss": 0.64176857, "num_input_tokens_seen": 325152295, "router_z_loss_clip": 0.01184082, "router_z_loss_mlp": 0.21191406, "step": 15076, "time_per_iteration": 3.1488115787506104 }, { "auxiliary_loss_clip": 0.01109282, "auxiliary_loss_mlp": 0.01034333, "balance_loss_clip": 1.02275753, "balance_loss_mlp": 1.03251863, "epoch": 0.9064782804749737, "flos": 21762692772480.0, "grad_norm": 1.7507467726305825, "language_loss": 0.82120889, "learning_rate": 8.567433331821239e-08, "loss": 0.84264505, "num_input_tokens_seen": 325169705, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 15077, "time_per_iteration": 2.499812126159668 }, { "auxiliary_loss_clip": 0.01116659, "auxiliary_loss_mlp": 0.01270676, "balance_loss_clip": 1.01274729, "balance_loss_mlp": 1.0320977, "epoch": 0.9065384037276417, "flos": 21397373498880.0, "grad_norm": 1.7367183011240526, "language_loss": 0.8380037, "learning_rate": 8.556496895215692e-08, "loss": 0.86187708, "num_input_tokens_seen": 325189175, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.66796875, "step": 15078, "time_per_iteration": 2.5022408962249756 }, { "auxiliary_loss_clip": 0.01107926, "auxiliary_loss_mlp": 0.0127913, "balance_loss_clip": 1.02039421, "balance_loss_mlp": 1.03397679, "epoch": 0.9065985269803096, "flos": 38283646849920.0, "grad_norm": 2.0385506871463304, "language_loss": 0.65594697, "learning_rate": 8.545567290736255e-08, "loss": 0.67981756, "num_input_tokens_seen": 325211020, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.65234375, "step": 15079, "time_per_iteration": 2.8461179733276367 }, { "auxiliary_loss_clip": 0.01153867, "auxiliary_loss_mlp": 0.0102736, "balance_loss_clip": 1.0155884, "balance_loss_mlp": 1.03249586, "epoch": 0.9066586502329776, "flos": 44818562989440.0, "grad_norm": 1.5074159209086433, "language_loss": 0.70808816, "learning_rate": 8.534644518773083e-08, "loss": 0.72990048, "num_input_tokens_seen": 325236970, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 15080, "time_per_iteration": 2.7630810737609863 }, { "auxiliary_loss_clip": 0.01115388, "auxiliary_loss_mlp": 0.01031053, "balance_loss_clip": 1.01826143, "balance_loss_mlp": 1.03517509, "epoch": 0.9067187734856456, "flos": 18623992492800.0, "grad_norm": 2.0575068167111494, "language_loss": 0.71200818, "learning_rate": 8.523728579715905e-08, "loss": 0.73347259, "num_input_tokens_seen": 325252670, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 15081, "time_per_iteration": 2.476207971572876 }, { "auxiliary_loss_clip": 0.01118925, "auxiliary_loss_mlp": 0.01030297, "balance_loss_clip": 1.01811361, "balance_loss_mlp": 1.03412557, "epoch": 0.9067788967383136, "flos": 22747578762240.0, "grad_norm": 1.8034215467592756, "language_loss": 0.74669021, "learning_rate": 8.51281947395437e-08, "loss": 0.7681824, "num_input_tokens_seen": 325273860, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 15082, "time_per_iteration": 2.74082612991333 }, { "auxiliary_loss_clip": 0.0112702, "auxiliary_loss_mlp": 0.01035607, "balance_loss_clip": 1.02370381, "balance_loss_mlp": 1.03405964, "epoch": 0.9068390199909815, "flos": 27670787648640.0, "grad_norm": 2.1345353255649093, "language_loss": 0.78257537, "learning_rate": 8.501917201877695e-08, "loss": 0.80420166, "num_input_tokens_seen": 325294140, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 15083, "time_per_iteration": 2.5699498653411865 }, { "auxiliary_loss_clip": 0.01115905, "auxiliary_loss_mlp": 0.01280564, "balance_loss_clip": 1.02233064, "balance_loss_mlp": 1.03265715, "epoch": 0.9068991432436495, "flos": 24244012293120.0, "grad_norm": 1.58642404847122, "language_loss": 0.69014925, "learning_rate": 8.49102176387504e-08, "loss": 0.71411395, "num_input_tokens_seen": 325313130, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.65625, "step": 15084, "time_per_iteration": 2.5207679271698 }, { "auxiliary_loss_clip": 0.01102196, "auxiliary_loss_mlp": 0.01030326, "balance_loss_clip": 1.01836371, "balance_loss_mlp": 1.03435087, "epoch": 0.9069592664963174, "flos": 28033305661440.0, "grad_norm": 1.762188567904502, "language_loss": 0.66844094, "learning_rate": 8.480133160335179e-08, "loss": 0.68976617, "num_input_tokens_seen": 325334880, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 15085, "time_per_iteration": 2.6628973484039307 }, { "auxiliary_loss_clip": 0.01050049, "auxiliary_loss_mlp": 0.01002346, "balance_loss_clip": 1.00114751, "balance_loss_mlp": 1.00150919, "epoch": 0.9070193897489854, "flos": 68778414789120.0, "grad_norm": 0.7634277670414836, "language_loss": 0.61285877, "learning_rate": 8.469251391646737e-08, "loss": 0.63338268, "num_input_tokens_seen": 325394175, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.21191406, "step": 15086, "time_per_iteration": 3.181917190551758 }, { "auxiliary_loss_clip": 0.01110975, "auxiliary_loss_mlp": 0.01035892, "balance_loss_clip": 1.02308345, "balance_loss_mlp": 1.0343802, "epoch": 0.9070795130016533, "flos": 23441624328960.0, "grad_norm": 1.5471056228399889, "language_loss": 0.72229481, "learning_rate": 8.458376458198025e-08, "loss": 0.74376345, "num_input_tokens_seen": 325415020, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.671875, "step": 15087, "time_per_iteration": 2.550335645675659 }, { "auxiliary_loss_clip": 0.01118537, "auxiliary_loss_mlp": 0.01026564, "balance_loss_clip": 1.01489913, "balance_loss_mlp": 1.03338647, "epoch": 0.9071396362543214, "flos": 18916413114240.0, "grad_norm": 1.9493413020702044, "language_loss": 0.7692579, "learning_rate": 8.447508360377153e-08, "loss": 0.7907089, "num_input_tokens_seen": 325433595, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 15088, "time_per_iteration": 2.6666431427001953 }, { "auxiliary_loss_clip": 0.01117731, "auxiliary_loss_mlp": 0.01029811, "balance_loss_clip": 1.01849842, "balance_loss_mlp": 1.03390527, "epoch": 0.9071997595069893, "flos": 25228646887680.0, "grad_norm": 1.9067945015322405, "language_loss": 0.73431933, "learning_rate": 8.436647098571969e-08, "loss": 0.75579476, "num_input_tokens_seen": 325451605, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.65625, "step": 15089, "time_per_iteration": 2.5857503414154053 }, { "auxiliary_loss_clip": 0.01097081, "auxiliary_loss_mlp": 0.01030211, "balance_loss_clip": 1.01933897, "balance_loss_mlp": 1.03275037, "epoch": 0.9072598827596573, "flos": 24346608514560.0, "grad_norm": 1.7788635672740667, "language_loss": 0.75521457, "learning_rate": 8.425792673170074e-08, "loss": 0.77648753, "num_input_tokens_seen": 325470645, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.640625, "step": 15090, "time_per_iteration": 2.586864948272705 }, { "auxiliary_loss_clip": 0.01122007, "auxiliary_loss_mlp": 0.01028187, "balance_loss_clip": 1.01531863, "balance_loss_mlp": 1.03376782, "epoch": 0.9073200060123253, "flos": 22674967418880.0, "grad_norm": 2.549362290157815, "language_loss": 0.77657509, "learning_rate": 8.414945084558866e-08, "loss": 0.79807699, "num_input_tokens_seen": 325488070, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 15091, "time_per_iteration": 2.7012174129486084 }, { "auxiliary_loss_clip": 0.01113361, "auxiliary_loss_mlp": 0.01024096, "balance_loss_clip": 1.01327753, "balance_loss_mlp": 1.03194666, "epoch": 0.9073801292649932, "flos": 23695476721920.0, "grad_norm": 1.5177534497231562, "language_loss": 0.85784709, "learning_rate": 8.404104333125462e-08, "loss": 0.87922156, "num_input_tokens_seen": 325509285, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.63671875, "step": 15092, "time_per_iteration": 2.578839063644409 }, { "auxiliary_loss_clip": 0.01120313, "auxiliary_loss_mlp": 0.01031598, "balance_loss_clip": 1.01948667, "balance_loss_mlp": 1.03592658, "epoch": 0.9074402525176612, "flos": 25375413859200.0, "grad_norm": 1.441917400204509, "language_loss": 0.78617287, "learning_rate": 8.393270419256749e-08, "loss": 0.80769193, "num_input_tokens_seen": 325529360, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 15093, "time_per_iteration": 2.6424496173858643 }, { "auxiliary_loss_clip": 0.01031139, "auxiliary_loss_mlp": 0.01002287, "balance_loss_clip": 1.00107098, "balance_loss_mlp": 1.00092232, "epoch": 0.9075003757703292, "flos": 67649024384640.0, "grad_norm": 0.7432367761283327, "language_loss": 0.57019442, "learning_rate": 8.382443343339284e-08, "loss": 0.59052873, "num_input_tokens_seen": 325583565, "router_z_loss_clip": 0.012146, "router_z_loss_mlp": 0.2109375, "step": 15094, "time_per_iteration": 4.495383977890015 }, { "auxiliary_loss_clip": 0.01113332, "auxiliary_loss_mlp": 0.01031219, "balance_loss_clip": 1.01802897, "balance_loss_mlp": 1.03418899, "epoch": 0.9075604990229972, "flos": 22453649769600.0, "grad_norm": 1.9104518672708515, "language_loss": 0.70988882, "learning_rate": 8.371623105759584e-08, "loss": 0.73133433, "num_input_tokens_seen": 325603690, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 15095, "time_per_iteration": 2.5749332904815674 }, { "auxiliary_loss_clip": 0.01129014, "auxiliary_loss_mlp": 0.01032409, "balance_loss_clip": 1.02048779, "balance_loss_mlp": 1.03343344, "epoch": 0.9076206222756651, "flos": 19536662188800.0, "grad_norm": 1.6637216863193904, "language_loss": 0.73908174, "learning_rate": 8.36080970690367e-08, "loss": 0.76069593, "num_input_tokens_seen": 325622255, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 15096, "time_per_iteration": 2.60669207572937 }, { "auxiliary_loss_clip": 0.01103707, "auxiliary_loss_mlp": 0.01036098, "balance_loss_clip": 1.02243066, "balance_loss_mlp": 1.03400159, "epoch": 0.9076807455283331, "flos": 30116914819200.0, "grad_norm": 1.7997789197218939, "language_loss": 0.5723384, "learning_rate": 8.350003147157548e-08, "loss": 0.59373641, "num_input_tokens_seen": 325640165, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.69921875, "step": 15097, "time_per_iteration": 2.8050665855407715 }, { "auxiliary_loss_clip": 0.01106214, "auxiliary_loss_mlp": 0.01023506, "balance_loss_clip": 1.0121212, "balance_loss_mlp": 1.03025341, "epoch": 0.907740868781001, "flos": 18697537589760.0, "grad_norm": 1.6144184234543437, "language_loss": 0.79664248, "learning_rate": 8.339203426906816e-08, "loss": 0.81793964, "num_input_tokens_seen": 325659455, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.66796875, "step": 15098, "time_per_iteration": 2.535637855529785 }, { "auxiliary_loss_clip": 0.01140274, "auxiliary_loss_mlp": 0.01278633, "balance_loss_clip": 1.01922905, "balance_loss_mlp": 1.03399956, "epoch": 0.907800992033669, "flos": 22638805401600.0, "grad_norm": 2.283339506099385, "language_loss": 0.65875554, "learning_rate": 8.328410546536901e-08, "loss": 0.68294454, "num_input_tokens_seen": 325678095, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 15099, "time_per_iteration": 2.6599009037017822 }, { "auxiliary_loss_clip": 0.01123405, "auxiliary_loss_mlp": 0.01033462, "balance_loss_clip": 1.0208919, "balance_loss_mlp": 1.03588867, "epoch": 0.907861115286337, "flos": 21287666384640.0, "grad_norm": 2.1097185295052365, "language_loss": 0.70154452, "learning_rate": 8.317624506432963e-08, "loss": 0.72311318, "num_input_tokens_seen": 325695825, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 15100, "time_per_iteration": 4.1354289054870605 }, { "auxiliary_loss_clip": 0.0103076, "auxiliary_loss_mlp": 0.0100275, "balance_loss_clip": 1.00151575, "balance_loss_mlp": 1.00077713, "epoch": 0.907921238539005, "flos": 69739493040000.0, "grad_norm": 0.6264855862414347, "language_loss": 0.53486234, "learning_rate": 8.306845306979959e-08, "loss": 0.55519742, "num_input_tokens_seen": 325764515, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.2109375, "step": 15101, "time_per_iteration": 3.2202751636505127 }, { "auxiliary_loss_clip": 0.01128677, "auxiliary_loss_mlp": 0.01026807, "balance_loss_clip": 1.01434374, "balance_loss_mlp": 1.0341866, "epoch": 0.9079813617916729, "flos": 23477391296640.0, "grad_norm": 2.945421050943598, "language_loss": 0.68077111, "learning_rate": 8.296072948562561e-08, "loss": 0.70232594, "num_input_tokens_seen": 325783235, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 15102, "time_per_iteration": 2.667543888092041 }, { "auxiliary_loss_clip": 0.01111075, "auxiliary_loss_mlp": 0.01030293, "balance_loss_clip": 1.01855087, "balance_loss_mlp": 1.03508413, "epoch": 0.9080414850443409, "flos": 22929933133440.0, "grad_norm": 1.815290259506698, "language_loss": 0.79337287, "learning_rate": 8.28530743156517e-08, "loss": 0.81478655, "num_input_tokens_seen": 325800195, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.66796875, "step": 15103, "time_per_iteration": 2.824880599975586 }, { "auxiliary_loss_clip": 0.01120385, "auxiliary_loss_mlp": 0.01029414, "balance_loss_clip": 1.01702809, "balance_loss_mlp": 1.03275228, "epoch": 0.9081016082970089, "flos": 26177083551360.0, "grad_norm": 2.0708967677795607, "language_loss": 0.71512848, "learning_rate": 8.274548756372013e-08, "loss": 0.73662645, "num_input_tokens_seen": 325820215, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 15104, "time_per_iteration": 2.7038657665252686 }, { "auxiliary_loss_clip": 0.01118277, "auxiliary_loss_mlp": 0.01026574, "balance_loss_clip": 1.01551747, "balance_loss_mlp": 1.03373146, "epoch": 0.9081617315496768, "flos": 24462169545600.0, "grad_norm": 1.6260769917068405, "language_loss": 0.77332008, "learning_rate": 8.26379692336705e-08, "loss": 0.79476863, "num_input_tokens_seen": 325838415, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66796875, "step": 15105, "time_per_iteration": 2.536503553390503 }, { "auxiliary_loss_clip": 0.01101804, "auxiliary_loss_mlp": 0.01281436, "balance_loss_clip": 1.02236438, "balance_loss_mlp": 1.03260648, "epoch": 0.9082218548023449, "flos": 24746868743040.0, "grad_norm": 1.7622729026988901, "language_loss": 0.7378056, "learning_rate": 8.253051932933974e-08, "loss": 0.76163799, "num_input_tokens_seen": 325855580, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 15106, "time_per_iteration": 2.52260684967041 }, { "auxiliary_loss_clip": 0.01138472, "auxiliary_loss_mlp": 0.01030362, "balance_loss_clip": 1.01828599, "balance_loss_mlp": 1.03340602, "epoch": 0.9082819780550128, "flos": 24421302846720.0, "grad_norm": 1.6620372109385149, "language_loss": 0.80175775, "learning_rate": 8.242313785456212e-08, "loss": 0.82344604, "num_input_tokens_seen": 325874890, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69140625, "step": 15107, "time_per_iteration": 4.806830406188965 }, { "auxiliary_loss_clip": 0.01039549, "auxiliary_loss_mlp": 0.01001707, "balance_loss_clip": 1.00053239, "balance_loss_mlp": 1.00104439, "epoch": 0.9083421013076808, "flos": 59609704872960.0, "grad_norm": 0.8169194270149538, "language_loss": 0.59665871, "learning_rate": 8.231582481317035e-08, "loss": 0.61707121, "num_input_tokens_seen": 325935835, "router_z_loss_clip": 0.01171875, "router_z_loss_mlp": 0.2109375, "step": 15108, "time_per_iteration": 3.112445116043091 }, { "auxiliary_loss_clip": 0.01119505, "auxiliary_loss_mlp": 0.01026826, "balance_loss_clip": 1.01454687, "balance_loss_mlp": 1.03266323, "epoch": 0.9084022245603487, "flos": 33620216100480.0, "grad_norm": 1.720912892546839, "language_loss": 0.73516965, "learning_rate": 8.220858020899379e-08, "loss": 0.75663292, "num_input_tokens_seen": 325958035, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 15109, "time_per_iteration": 4.51032018661499 }, { "auxiliary_loss_clip": 0.01125211, "auxiliary_loss_mlp": 0.01025138, "balance_loss_clip": 1.01358688, "balance_loss_mlp": 1.0322808, "epoch": 0.9084623478130167, "flos": 24971705925120.0, "grad_norm": 2.6284409159694793, "language_loss": 0.7116909, "learning_rate": 8.210140404586008e-08, "loss": 0.73319435, "num_input_tokens_seen": 325979870, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 15110, "time_per_iteration": 2.6048099994659424 }, { "auxiliary_loss_clip": 0.01117033, "auxiliary_loss_mlp": 0.010278, "balance_loss_clip": 1.01615918, "balance_loss_mlp": 1.03260279, "epoch": 0.9085224710656846, "flos": 31461804869760.0, "grad_norm": 1.6624238306882106, "language_loss": 0.68852884, "learning_rate": 8.199429632759347e-08, "loss": 0.70997715, "num_input_tokens_seen": 325998245, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6640625, "step": 15111, "time_per_iteration": 2.6484687328338623 }, { "auxiliary_loss_clip": 0.01040248, "auxiliary_loss_mlp": 0.01001504, "balance_loss_clip": 1.00028217, "balance_loss_mlp": 1.00109756, "epoch": 0.9085825943183526, "flos": 64917012867840.0, "grad_norm": 0.6874430050107174, "language_loss": 0.51779634, "learning_rate": 8.188725705801713e-08, "loss": 0.53821391, "num_input_tokens_seen": 326061770, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.2109375, "step": 15112, "time_per_iteration": 3.270833969116211 }, { "auxiliary_loss_clip": 0.01111351, "auxiliary_loss_mlp": 0.01031299, "balance_loss_clip": 1.01881838, "balance_loss_mlp": 1.03330779, "epoch": 0.9086427175710206, "flos": 18953221576320.0, "grad_norm": 1.6747216212434848, "language_loss": 0.69545901, "learning_rate": 8.178028624095023e-08, "loss": 0.71688545, "num_input_tokens_seen": 326080945, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 15113, "time_per_iteration": 2.721104860305786 }, { "auxiliary_loss_clip": 0.01111973, "auxiliary_loss_mlp": 0.01030099, "balance_loss_clip": 1.01810694, "balance_loss_mlp": 1.03452945, "epoch": 0.9087028408236886, "flos": 34014873807360.0, "grad_norm": 2.0413090143698973, "language_loss": 0.79268909, "learning_rate": 8.167338388021106e-08, "loss": 0.8141098, "num_input_tokens_seen": 326100630, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 15114, "time_per_iteration": 2.7224934101104736 }, { "auxiliary_loss_clip": 0.01140339, "auxiliary_loss_mlp": 0.01032439, "balance_loss_clip": 1.01886714, "balance_loss_mlp": 1.03410017, "epoch": 0.9087629640763565, "flos": 21944580266880.0, "grad_norm": 13.820460648185746, "language_loss": 0.70066917, "learning_rate": 8.15665499796141e-08, "loss": 0.72239697, "num_input_tokens_seen": 326120145, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 15115, "time_per_iteration": 2.5931034088134766 }, { "auxiliary_loss_clip": 0.01134435, "auxiliary_loss_mlp": 0.01032346, "balance_loss_clip": 1.01900613, "balance_loss_mlp": 1.03629756, "epoch": 0.9088230873290245, "flos": 21762908254080.0, "grad_norm": 1.6038375373755807, "language_loss": 0.66065812, "learning_rate": 8.145978454297209e-08, "loss": 0.6823259, "num_input_tokens_seen": 326140715, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 15116, "time_per_iteration": 2.6291985511779785 }, { "auxiliary_loss_clip": 0.01100693, "auxiliary_loss_mlp": 0.01032496, "balance_loss_clip": 1.02053332, "balance_loss_mlp": 1.03497219, "epoch": 0.9088832105816925, "flos": 21541267382400.0, "grad_norm": 1.7918211175601992, "language_loss": 0.69638914, "learning_rate": 8.135308757409553e-08, "loss": 0.71772099, "num_input_tokens_seen": 326159130, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.65625, "step": 15117, "time_per_iteration": 2.546201229095459 }, { "auxiliary_loss_clip": 0.01126791, "auxiliary_loss_mlp": 0.01027182, "balance_loss_clip": 1.01566029, "balance_loss_mlp": 1.03275871, "epoch": 0.9089433338343604, "flos": 25996704428160.0, "grad_norm": 1.5207251521140575, "language_loss": 0.74596071, "learning_rate": 8.12464590767914e-08, "loss": 0.7675004, "num_input_tokens_seen": 326181375, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.67578125, "step": 15118, "time_per_iteration": 2.587080240249634 }, { "auxiliary_loss_clip": 0.01124356, "auxiliary_loss_mlp": 0.01031106, "balance_loss_clip": 1.01777864, "balance_loss_mlp": 1.03276312, "epoch": 0.9090034570870285, "flos": 21178426147200.0, "grad_norm": 2.0858858371158715, "language_loss": 0.73340648, "learning_rate": 8.113989905486618e-08, "loss": 0.75496107, "num_input_tokens_seen": 326199740, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 15119, "time_per_iteration": 2.5781874656677246 }, { "auxiliary_loss_clip": 0.01133697, "auxiliary_loss_mlp": 0.01032045, "balance_loss_clip": 1.01823413, "balance_loss_mlp": 1.03544497, "epoch": 0.9090635803396964, "flos": 16141811045760.0, "grad_norm": 2.475988192553149, "language_loss": 0.71638656, "learning_rate": 8.103340751212173e-08, "loss": 0.7380439, "num_input_tokens_seen": 326214350, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71484375, "step": 15120, "time_per_iteration": 2.569127321243286 }, { "auxiliary_loss_clip": 0.01103461, "auxiliary_loss_mlp": 0.01271819, "balance_loss_clip": 1.01216888, "balance_loss_mlp": 1.0340569, "epoch": 0.9091237035923644, "flos": 20591537829120.0, "grad_norm": 2.1268897134262383, "language_loss": 0.65908158, "learning_rate": 8.09269844523588e-08, "loss": 0.68283439, "num_input_tokens_seen": 326234580, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 15121, "time_per_iteration": 2.59029221534729 }, { "auxiliary_loss_clip": 0.01111197, "auxiliary_loss_mlp": 0.01038782, "balance_loss_clip": 1.02631235, "balance_loss_mlp": 1.03440905, "epoch": 0.9091838268450323, "flos": 52227760164480.0, "grad_norm": 1.7040070345839193, "language_loss": 0.7018466, "learning_rate": 8.082062987937543e-08, "loss": 0.72334641, "num_input_tokens_seen": 326259080, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 15122, "time_per_iteration": 2.754028558731079 }, { "auxiliary_loss_clip": 0.01134937, "auxiliary_loss_mlp": 0.01031108, "balance_loss_clip": 1.01910949, "balance_loss_mlp": 1.03185928, "epoch": 0.9092439500977003, "flos": 20559613616640.0, "grad_norm": 1.406731505044457, "language_loss": 0.7464695, "learning_rate": 8.071434379696707e-08, "loss": 0.76812994, "num_input_tokens_seen": 326280175, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66796875, "step": 15123, "time_per_iteration": 2.60067081451416 }, { "auxiliary_loss_clip": 0.01107933, "auxiliary_loss_mlp": 0.01029415, "balance_loss_clip": 1.01804245, "balance_loss_mlp": 1.03333056, "epoch": 0.9093040733503682, "flos": 28617859595520.0, "grad_norm": 1.9208806362691107, "language_loss": 0.75931817, "learning_rate": 8.060812620892643e-08, "loss": 0.78069162, "num_input_tokens_seen": 326297990, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.65625, "step": 15124, "time_per_iteration": 2.6028032302856445 }, { "auxiliary_loss_clip": 0.01111277, "auxiliary_loss_mlp": 0.0103147, "balance_loss_clip": 1.01876831, "balance_loss_mlp": 1.03471112, "epoch": 0.9093641966030362, "flos": 23440187784960.0, "grad_norm": 2.3563131724131465, "language_loss": 0.71999788, "learning_rate": 8.050197711904493e-08, "loss": 0.7414254, "num_input_tokens_seen": 326316735, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.67578125, "step": 15125, "time_per_iteration": 2.4957563877105713 }, { "auxiliary_loss_clip": 0.0112965, "auxiliary_loss_mlp": 0.01034999, "balance_loss_clip": 1.02275634, "balance_loss_mlp": 1.03546023, "epoch": 0.9094243198557042, "flos": 16800197385600.0, "grad_norm": 1.669398060634091, "language_loss": 0.79278451, "learning_rate": 8.039589653111023e-08, "loss": 0.81443107, "num_input_tokens_seen": 326334370, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.671875, "step": 15126, "time_per_iteration": 2.503485679626465 }, { "auxiliary_loss_clip": 0.01127364, "auxiliary_loss_mlp": 0.01035609, "balance_loss_clip": 1.02280629, "balance_loss_mlp": 1.03290248, "epoch": 0.9094844431083722, "flos": 24273278899200.0, "grad_norm": 1.568552964688529, "language_loss": 0.75697029, "learning_rate": 8.028988444890817e-08, "loss": 0.77860004, "num_input_tokens_seen": 326353435, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.671875, "step": 15127, "time_per_iteration": 2.5587286949157715 }, { "auxiliary_loss_clip": 0.0115424, "auxiliary_loss_mlp": 0.01033624, "balance_loss_clip": 1.02136958, "balance_loss_mlp": 1.03151715, "epoch": 0.9095445663610401, "flos": 21944652094080.0, "grad_norm": 2.0900612433396875, "language_loss": 0.63071787, "learning_rate": 8.018394087622193e-08, "loss": 0.65259653, "num_input_tokens_seen": 326371810, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 15128, "time_per_iteration": 2.5597097873687744 }, { "auxiliary_loss_clip": 0.01126861, "auxiliary_loss_mlp": 0.01026323, "balance_loss_clip": 1.01412845, "balance_loss_mlp": 1.03353488, "epoch": 0.9096046896137081, "flos": 19792848965760.0, "grad_norm": 1.6874174725475402, "language_loss": 0.7680428, "learning_rate": 8.007806581683274e-08, "loss": 0.78957456, "num_input_tokens_seen": 326391380, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 15129, "time_per_iteration": 2.5123298168182373 }, { "auxiliary_loss_clip": 0.01112178, "auxiliary_loss_mlp": 0.01027019, "balance_loss_clip": 1.01556933, "balance_loss_mlp": 1.03427577, "epoch": 0.909664812866376, "flos": 22638087129600.0, "grad_norm": 1.9691736039890084, "language_loss": 0.83296943, "learning_rate": 7.997225927451845e-08, "loss": 0.85436141, "num_input_tokens_seen": 326408800, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6875, "step": 15130, "time_per_iteration": 2.484130859375 }, { "auxiliary_loss_clip": 0.01121981, "auxiliary_loss_mlp": 0.01032489, "balance_loss_clip": 1.01979935, "balance_loss_mlp": 1.03484154, "epoch": 0.909724936119044, "flos": 39852153020160.0, "grad_norm": 1.7747724532054883, "language_loss": 0.75065362, "learning_rate": 7.98665212530556e-08, "loss": 0.77219832, "num_input_tokens_seen": 326431565, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 15131, "time_per_iteration": 2.6346566677093506 }, { "auxiliary_loss_clip": 0.01122088, "auxiliary_loss_mlp": 0.01029671, "balance_loss_clip": 1.01726091, "balance_loss_mlp": 1.03512478, "epoch": 0.9097850593717121, "flos": 35071616954880.0, "grad_norm": 1.8467360582745744, "language_loss": 0.59573215, "learning_rate": 7.97608517562176e-08, "loss": 0.61724979, "num_input_tokens_seen": 326451715, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 15132, "time_per_iteration": 2.578174114227295 }, { "auxiliary_loss_clip": 0.01131266, "auxiliary_loss_mlp": 0.01030265, "balance_loss_clip": 1.01839757, "balance_loss_mlp": 1.03620386, "epoch": 0.90984518262438, "flos": 23367468700800.0, "grad_norm": 1.417105402575001, "language_loss": 0.82472384, "learning_rate": 7.96552507877748e-08, "loss": 0.84633911, "num_input_tokens_seen": 326470855, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 15133, "time_per_iteration": 2.5027387142181396 }, { "auxiliary_loss_clip": 0.01120742, "auxiliary_loss_mlp": 0.01031026, "balance_loss_clip": 1.01878369, "balance_loss_mlp": 1.03311682, "epoch": 0.909905305877048, "flos": 27523302405120.0, "grad_norm": 1.6287250117202456, "language_loss": 0.73922372, "learning_rate": 7.95497183514966e-08, "loss": 0.76074135, "num_input_tokens_seen": 326490480, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69921875, "step": 15134, "time_per_iteration": 2.510331869125366 }, { "auxiliary_loss_clip": 0.01135033, "auxiliary_loss_mlp": 0.01032773, "balance_loss_clip": 1.02060151, "balance_loss_mlp": 1.0353179, "epoch": 0.9099654291297159, "flos": 24347865490560.0, "grad_norm": 1.552766911932828, "language_loss": 0.72648013, "learning_rate": 7.944425445114889e-08, "loss": 0.74815816, "num_input_tokens_seen": 326509445, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7265625, "step": 15135, "time_per_iteration": 3.9571807384490967 }, { "auxiliary_loss_clip": 0.01110398, "auxiliary_loss_mlp": 0.01029448, "balance_loss_clip": 1.01771224, "balance_loss_mlp": 1.0350132, "epoch": 0.9100255523823839, "flos": 21215234609280.0, "grad_norm": 2.0174582980795996, "language_loss": 0.70360744, "learning_rate": 7.933885909049575e-08, "loss": 0.72500587, "num_input_tokens_seen": 326528380, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66796875, "step": 15136, "time_per_iteration": 2.4945085048675537 }, { "auxiliary_loss_clip": 0.01049022, "auxiliary_loss_mlp": 0.01004467, "balance_loss_clip": 1.00322711, "balance_loss_mlp": 1.00072455, "epoch": 0.9100856756350518, "flos": 46052276446080.0, "grad_norm": 0.8187307487389271, "language_loss": 0.59246886, "learning_rate": 7.923353227329799e-08, "loss": 0.61300373, "num_input_tokens_seen": 326576940, "router_z_loss_clip": 0.01239014, "router_z_loss_mlp": 0.2109375, "step": 15137, "time_per_iteration": 2.877730369567871 }, { "auxiliary_loss_clip": 0.01111419, "auxiliary_loss_mlp": 0.01035482, "balance_loss_clip": 1.02346575, "balance_loss_mlp": 1.03347099, "epoch": 0.9101457988877198, "flos": 20229917656320.0, "grad_norm": 1.6789995978519905, "language_loss": 0.82396889, "learning_rate": 7.912827400331479e-08, "loss": 0.84543788, "num_input_tokens_seen": 326596100, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 15138, "time_per_iteration": 2.4742653369903564 }, { "auxiliary_loss_clip": 0.01107811, "auxiliary_loss_mlp": 0.01023227, "balance_loss_clip": 1.01197922, "balance_loss_mlp": 1.03383207, "epoch": 0.9102059221403878, "flos": 26615157822720.0, "grad_norm": 1.9489422558722593, "language_loss": 0.81192935, "learning_rate": 7.902308428430226e-08, "loss": 0.83323967, "num_input_tokens_seen": 326615700, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6484375, "step": 15139, "time_per_iteration": 2.5297470092773438 }, { "auxiliary_loss_clip": 0.0112875, "auxiliary_loss_mlp": 0.0103482, "balance_loss_clip": 1.02308416, "balance_loss_mlp": 1.03455555, "epoch": 0.9102660453930558, "flos": 21908561904000.0, "grad_norm": 1.879315806848236, "language_loss": 0.77619469, "learning_rate": 7.891796312001475e-08, "loss": 0.7978304, "num_input_tokens_seen": 326635905, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 15140, "time_per_iteration": 2.5189483165740967 }, { "auxiliary_loss_clip": 0.0110106, "auxiliary_loss_mlp": 0.01028283, "balance_loss_clip": 1.0159924, "balance_loss_mlp": 1.03296423, "epoch": 0.9103261686457237, "flos": 36176660916480.0, "grad_norm": 1.472949304801667, "language_loss": 0.66387105, "learning_rate": 7.881291051420302e-08, "loss": 0.68516451, "num_input_tokens_seen": 326661855, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 15141, "time_per_iteration": 2.625365734100342 }, { "auxiliary_loss_clip": 0.0111241, "auxiliary_loss_mlp": 0.01033431, "balance_loss_clip": 1.0216589, "balance_loss_mlp": 1.03404963, "epoch": 0.9103862918983917, "flos": 23878549365120.0, "grad_norm": 1.7049059345185926, "language_loss": 0.7492727, "learning_rate": 7.870792647061741e-08, "loss": 0.77073109, "num_input_tokens_seen": 326679320, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6953125, "step": 15142, "time_per_iteration": 4.0312089920043945 }, { "auxiliary_loss_clip": 0.01126193, "auxiliary_loss_mlp": 0.01030339, "balance_loss_clip": 1.01904416, "balance_loss_mlp": 1.0325377, "epoch": 0.9104464151510596, "flos": 14939521989120.0, "grad_norm": 1.6579382961579145, "language_loss": 0.64155656, "learning_rate": 7.860301099300314e-08, "loss": 0.66312182, "num_input_tokens_seen": 326698110, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 15143, "time_per_iteration": 2.53271222114563 }, { "auxiliary_loss_clip": 0.01103336, "auxiliary_loss_mlp": 0.01038547, "balance_loss_clip": 1.0252254, "balance_loss_mlp": 1.03287446, "epoch": 0.9105065384037276, "flos": 33655803500160.0, "grad_norm": 2.331969320991207, "language_loss": 0.61053741, "learning_rate": 7.849816408510502e-08, "loss": 0.63195622, "num_input_tokens_seen": 326718370, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 15144, "time_per_iteration": 2.5703864097595215 }, { "auxiliary_loss_clip": 0.01126645, "auxiliary_loss_mlp": 0.01025953, "balance_loss_clip": 1.01400793, "balance_loss_mlp": 1.03413236, "epoch": 0.9105666616563957, "flos": 24316695463680.0, "grad_norm": 2.1275896666352407, "language_loss": 0.71213305, "learning_rate": 7.839338575066467e-08, "loss": 0.73365891, "num_input_tokens_seen": 326738445, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.65625, "step": 15145, "time_per_iteration": 2.548250436782837 }, { "auxiliary_loss_clip": 0.0113542, "auxiliary_loss_mlp": 0.01028327, "balance_loss_clip": 1.01671588, "balance_loss_mlp": 1.03236914, "epoch": 0.9106267849090636, "flos": 29971692132480.0, "grad_norm": 1.5300706139641747, "language_loss": 0.70501763, "learning_rate": 7.828867599342203e-08, "loss": 0.72665513, "num_input_tokens_seen": 326758855, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 15146, "time_per_iteration": 2.5922586917877197 }, { "auxiliary_loss_clip": 0.01118665, "auxiliary_loss_mlp": 0.01030445, "balance_loss_clip": 1.01861382, "balance_loss_mlp": 1.03290629, "epoch": 0.9106869081617316, "flos": 25337743470720.0, "grad_norm": 1.9136914078113354, "language_loss": 0.73270929, "learning_rate": 7.8184034817113e-08, "loss": 0.7542004, "num_input_tokens_seen": 326777140, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 15147, "time_per_iteration": 2.581260919570923 }, { "auxiliary_loss_clip": 0.01130558, "auxiliary_loss_mlp": 0.01030077, "balance_loss_clip": 1.01804852, "balance_loss_mlp": 1.0329864, "epoch": 0.9107470314143995, "flos": 18187031543040.0, "grad_norm": 1.9678596756713473, "language_loss": 0.80222625, "learning_rate": 7.80794622254719e-08, "loss": 0.82383263, "num_input_tokens_seen": 326794070, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 15148, "time_per_iteration": 3.9763576984405518 }, { "auxiliary_loss_clip": 0.01129441, "auxiliary_loss_mlp": 0.01037572, "balance_loss_clip": 1.02474487, "balance_loss_mlp": 1.03284347, "epoch": 0.9108071546670675, "flos": 23550828652800.0, "grad_norm": 2.188048339509492, "language_loss": 0.67762518, "learning_rate": 7.797495822223155e-08, "loss": 0.69929534, "num_input_tokens_seen": 326814695, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 15149, "time_per_iteration": 2.5605063438415527 }, { "auxiliary_loss_clip": 0.01102789, "auxiliary_loss_mlp": 0.01028271, "balance_loss_clip": 1.01584971, "balance_loss_mlp": 1.03457475, "epoch": 0.9108672779197354, "flos": 25630307746560.0, "grad_norm": 1.6144897772709272, "language_loss": 0.63151717, "learning_rate": 7.78705228111205e-08, "loss": 0.65282774, "num_input_tokens_seen": 326835295, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 15150, "time_per_iteration": 2.5427377223968506 }, { "auxiliary_loss_clip": 0.01127294, "auxiliary_loss_mlp": 0.01034685, "balance_loss_clip": 1.02259707, "balance_loss_mlp": 1.03274679, "epoch": 0.9109274011724035, "flos": 22339094319360.0, "grad_norm": 1.8597219543387808, "language_loss": 0.72685909, "learning_rate": 7.776615599586645e-08, "loss": 0.74847889, "num_input_tokens_seen": 326853350, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.67578125, "step": 15151, "time_per_iteration": 4.103704929351807 }, { "auxiliary_loss_clip": 0.01109576, "auxiliary_loss_mlp": 0.01026188, "balance_loss_clip": 1.01401651, "balance_loss_mlp": 1.0332073, "epoch": 0.9109875244250714, "flos": 26688200129280.0, "grad_norm": 1.886529354193196, "language_loss": 0.64384234, "learning_rate": 7.76618577801933e-08, "loss": 0.66519994, "num_input_tokens_seen": 326873425, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 15152, "time_per_iteration": 2.560887336730957 }, { "auxiliary_loss_clip": 0.0112164, "auxiliary_loss_mlp": 0.01274049, "balance_loss_clip": 1.01410937, "balance_loss_mlp": 1.03506207, "epoch": 0.9110476476777394, "flos": 22930112701440.0, "grad_norm": 1.761921903430098, "language_loss": 0.7319482, "learning_rate": 7.755762816782408e-08, "loss": 0.75590515, "num_input_tokens_seen": 326893455, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.68359375, "step": 15153, "time_per_iteration": 2.5442488193511963 }, { "auxiliary_loss_clip": 0.01130852, "auxiliary_loss_mlp": 0.0103187, "balance_loss_clip": 1.01966929, "balance_loss_mlp": 1.03473532, "epoch": 0.9111077709304073, "flos": 13472857854720.0, "grad_norm": 2.1642749565300488, "language_loss": 0.72506863, "learning_rate": 7.74534671624778e-08, "loss": 0.74669588, "num_input_tokens_seen": 326910210, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 15154, "time_per_iteration": 2.5038344860076904 }, { "auxiliary_loss_clip": 0.01120837, "auxiliary_loss_mlp": 0.01030197, "balance_loss_clip": 1.01818633, "balance_loss_mlp": 1.03586912, "epoch": 0.9111678941830753, "flos": 20850561780480.0, "grad_norm": 1.9763245570225902, "language_loss": 0.82042611, "learning_rate": 7.734937476787195e-08, "loss": 0.84193641, "num_input_tokens_seen": 326929350, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.67578125, "step": 15155, "time_per_iteration": 2.499203681945801 }, { "auxiliary_loss_clip": 0.01112282, "auxiliary_loss_mlp": 0.0102776, "balance_loss_clip": 1.01532626, "balance_loss_mlp": 1.03523076, "epoch": 0.9112280174357432, "flos": 19682244011520.0, "grad_norm": 1.3954329286500715, "language_loss": 0.5970549, "learning_rate": 7.724535098772111e-08, "loss": 0.61845529, "num_input_tokens_seen": 326949060, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 15156, "time_per_iteration": 2.4866011142730713 }, { "auxiliary_loss_clip": 0.01113671, "auxiliary_loss_mlp": 0.01027483, "balance_loss_clip": 1.01434612, "balance_loss_mlp": 1.03283691, "epoch": 0.9112881406884112, "flos": 25447163276160.0, "grad_norm": 1.7458884200258804, "language_loss": 0.73830968, "learning_rate": 7.714139582573808e-08, "loss": 0.75972128, "num_input_tokens_seen": 326968950, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 15157, "time_per_iteration": 2.577714681625366 }, { "auxiliary_loss_clip": 0.01137896, "auxiliary_loss_mlp": 0.01029072, "balance_loss_clip": 1.017717, "balance_loss_mlp": 1.03370762, "epoch": 0.9113482639410793, "flos": 33066975847680.0, "grad_norm": 1.6460927818351316, "language_loss": 0.59575045, "learning_rate": 7.703750928563213e-08, "loss": 0.61742008, "num_input_tokens_seen": 326989455, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.68359375, "step": 15158, "time_per_iteration": 2.6317596435546875 }, { "auxiliary_loss_clip": 0.01146165, "auxiliary_loss_mlp": 0.01032529, "balance_loss_clip": 1.02078152, "balance_loss_mlp": 1.0336442, "epoch": 0.9114083871937472, "flos": 21835591424640.0, "grad_norm": 1.433673454377423, "language_loss": 0.67497587, "learning_rate": 7.693369137111139e-08, "loss": 0.6967628, "num_input_tokens_seen": 327009640, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 15159, "time_per_iteration": 2.5228219032287598 }, { "auxiliary_loss_clip": 0.01120289, "auxiliary_loss_mlp": 0.01029886, "balance_loss_clip": 1.01763773, "balance_loss_mlp": 1.03384948, "epoch": 0.9114685104464152, "flos": 27088999061760.0, "grad_norm": 2.1370652465352116, "language_loss": 0.78679889, "learning_rate": 7.682994208588e-08, "loss": 0.80830061, "num_input_tokens_seen": 327027690, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 15160, "time_per_iteration": 2.5267841815948486 }, { "auxiliary_loss_clip": 0.01103396, "auxiliary_loss_mlp": 0.01030289, "balance_loss_clip": 1.01688433, "balance_loss_mlp": 1.03475285, "epoch": 0.9115286336990831, "flos": 17967042696960.0, "grad_norm": 1.8294240340075676, "language_loss": 0.68997365, "learning_rate": 7.672626143364125e-08, "loss": 0.71131051, "num_input_tokens_seen": 327045915, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6875, "step": 15161, "time_per_iteration": 2.4673383235931396 }, { "auxiliary_loss_clip": 0.01138998, "auxiliary_loss_mlp": 0.01032304, "balance_loss_clip": 1.01960254, "balance_loss_mlp": 1.03477871, "epoch": 0.9115887569517511, "flos": 22929861306240.0, "grad_norm": 1.7714482634795088, "language_loss": 0.76460361, "learning_rate": 7.662264941809505e-08, "loss": 0.78631663, "num_input_tokens_seen": 327066355, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 15162, "time_per_iteration": 2.559760332107544 }, { "auxiliary_loss_clip": 0.01120666, "auxiliary_loss_mlp": 0.0103535, "balance_loss_clip": 1.02303576, "balance_loss_mlp": 1.03429067, "epoch": 0.911648880204419, "flos": 23988436047360.0, "grad_norm": 4.627261623538343, "language_loss": 0.66972828, "learning_rate": 7.651910604293909e-08, "loss": 0.69128847, "num_input_tokens_seen": 327086735, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 15163, "time_per_iteration": 2.571582317352295 }, { "auxiliary_loss_clip": 0.01142046, "auxiliary_loss_mlp": 0.01032618, "balance_loss_clip": 1.0201124, "balance_loss_mlp": 1.03553247, "epoch": 0.911709003457087, "flos": 17055306754560.0, "grad_norm": 2.100586618987651, "language_loss": 0.70559388, "learning_rate": 7.641563131186824e-08, "loss": 0.72734058, "num_input_tokens_seen": 327104035, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 15164, "time_per_iteration": 2.5374715328216553 }, { "auxiliary_loss_clip": 0.011011, "auxiliary_loss_mlp": 0.01032585, "balance_loss_clip": 1.02075934, "balance_loss_mlp": 1.03601718, "epoch": 0.911769126709755, "flos": 21653344794240.0, "grad_norm": 1.8184066987248884, "language_loss": 0.76219696, "learning_rate": 7.631222522857528e-08, "loss": 0.78353381, "num_input_tokens_seen": 327124370, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.65234375, "step": 15165, "time_per_iteration": 2.4990899562835693 }, { "auxiliary_loss_clip": 0.01112502, "auxiliary_loss_mlp": 0.01031978, "balance_loss_clip": 1.01972938, "balance_loss_mlp": 1.03475189, "epoch": 0.911829249962423, "flos": 24790321221120.0, "grad_norm": 2.1233220512164297, "language_loss": 0.72445327, "learning_rate": 7.620888779675128e-08, "loss": 0.74589813, "num_input_tokens_seen": 327140915, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 15166, "time_per_iteration": 2.5396037101745605 }, { "auxiliary_loss_clip": 0.01138198, "auxiliary_loss_mlp": 0.01032096, "balance_loss_clip": 1.01898313, "balance_loss_mlp": 1.03261614, "epoch": 0.9118893732150909, "flos": 20959406968320.0, "grad_norm": 1.8680983726586833, "language_loss": 0.72953963, "learning_rate": 7.610561902008283e-08, "loss": 0.75124252, "num_input_tokens_seen": 327158940, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 15167, "time_per_iteration": 2.5442183017730713 }, { "auxiliary_loss_clip": 0.01039955, "auxiliary_loss_mlp": 0.0099978, "balance_loss_clip": 0.99861175, "balance_loss_mlp": 1.00096393, "epoch": 0.9119494964677589, "flos": 67917385872000.0, "grad_norm": 0.7724097202870868, "language_loss": 0.65644389, "learning_rate": 7.600241890225633e-08, "loss": 0.6768412, "num_input_tokens_seen": 327217450, "router_z_loss_clip": 0.01165771, "router_z_loss_mlp": 0.2109375, "step": 15168, "time_per_iteration": 3.161227226257324 }, { "auxiliary_loss_clip": 0.01117825, "auxiliary_loss_mlp": 0.01031669, "balance_loss_clip": 1.02023053, "balance_loss_mlp": 1.03282166, "epoch": 0.9120096197204268, "flos": 18551524803840.0, "grad_norm": 1.7996910459008701, "language_loss": 0.78005612, "learning_rate": 7.589928744695417e-08, "loss": 0.80155098, "num_input_tokens_seen": 327233905, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 15169, "time_per_iteration": 2.5796127319335938 }, { "auxiliary_loss_clip": 0.01110207, "auxiliary_loss_mlp": 0.01028994, "balance_loss_clip": 1.01710916, "balance_loss_mlp": 1.03321099, "epoch": 0.9120697429730948, "flos": 19025725178880.0, "grad_norm": 2.3107232194575533, "language_loss": 0.82144451, "learning_rate": 7.57962246578574e-08, "loss": 0.8428365, "num_input_tokens_seen": 327252430, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 15170, "time_per_iteration": 2.5253069400787354 }, { "auxiliary_loss_clip": 0.01112856, "auxiliary_loss_mlp": 0.01029755, "balance_loss_clip": 1.01702404, "balance_loss_mlp": 1.03537345, "epoch": 0.9121298662257629, "flos": 17163685065600.0, "grad_norm": 3.404570506792565, "language_loss": 0.77414477, "learning_rate": 7.569323053864329e-08, "loss": 0.79557091, "num_input_tokens_seen": 327269215, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 15171, "time_per_iteration": 2.5186660289764404 }, { "auxiliary_loss_clip": 0.0115251, "auxiliary_loss_mlp": 0.0103274, "balance_loss_clip": 1.02085471, "balance_loss_mlp": 1.03162742, "epoch": 0.9121899894784308, "flos": 19682710888320.0, "grad_norm": 1.4900833335739514, "language_loss": 0.66882914, "learning_rate": 7.559030509298825e-08, "loss": 0.69068164, "num_input_tokens_seen": 327290320, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 15172, "time_per_iteration": 2.6087417602539062 }, { "auxiliary_loss_clip": 0.01123939, "auxiliary_loss_mlp": 0.01031322, "balance_loss_clip": 1.01937103, "balance_loss_mlp": 1.03570175, "epoch": 0.9122501127310988, "flos": 22235743912320.0, "grad_norm": 1.6374581826424297, "language_loss": 0.75191534, "learning_rate": 7.548744832456488e-08, "loss": 0.77346802, "num_input_tokens_seen": 327310150, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69921875, "step": 15173, "time_per_iteration": 2.5347492694854736 }, { "auxiliary_loss_clip": 0.01126187, "auxiliary_loss_mlp": 0.01032477, "balance_loss_clip": 1.01790357, "balance_loss_mlp": 1.03544378, "epoch": 0.9123102359837667, "flos": 15957122290560.0, "grad_norm": 2.292157972886746, "language_loss": 0.66278315, "learning_rate": 7.538466023704426e-08, "loss": 0.6843698, "num_input_tokens_seen": 327326660, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.73046875, "step": 15174, "time_per_iteration": 2.5898654460906982 }, { "auxiliary_loss_clip": 0.0111386, "auxiliary_loss_mlp": 0.01031855, "balance_loss_clip": 1.01830137, "balance_loss_mlp": 1.03481174, "epoch": 0.9123703592364347, "flos": 25155784149120.0, "grad_norm": 1.8673421400978236, "language_loss": 0.74932945, "learning_rate": 7.528194083409411e-08, "loss": 0.77078652, "num_input_tokens_seen": 327346700, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.69921875, "step": 15175, "time_per_iteration": 2.5366485118865967 }, { "auxiliary_loss_clip": 0.01030842, "auxiliary_loss_mlp": 0.01004252, "balance_loss_clip": 1.00306606, "balance_loss_mlp": 1.00099587, "epoch": 0.9124304824891026, "flos": 60801650812800.0, "grad_norm": 0.9644911168058679, "language_loss": 0.58466065, "learning_rate": 7.517929011938084e-08, "loss": 0.60501164, "num_input_tokens_seen": 327403050, "router_z_loss_clip": 0.01184082, "router_z_loss_mlp": 0.2109375, "step": 15176, "time_per_iteration": 3.211101531982422 }, { "auxiliary_loss_clip": 0.01122055, "auxiliary_loss_mlp": 0.01034032, "balance_loss_clip": 1.02169371, "balance_loss_mlp": 1.03436017, "epoch": 0.9124906057417707, "flos": 18150941352960.0, "grad_norm": 2.091119901122576, "language_loss": 0.65540898, "learning_rate": 7.507670809656729e-08, "loss": 0.67696977, "num_input_tokens_seen": 327422225, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 15177, "time_per_iteration": 3.8897311687469482 }, { "auxiliary_loss_clip": 0.01119251, "auxiliary_loss_mlp": 0.01028091, "balance_loss_clip": 1.01635468, "balance_loss_mlp": 1.03327906, "epoch": 0.9125507289944386, "flos": 11686769049600.0, "grad_norm": 2.1031473765131476, "language_loss": 0.81056422, "learning_rate": 7.497419476931432e-08, "loss": 0.83203763, "num_input_tokens_seen": 327437025, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.67578125, "step": 15178, "time_per_iteration": 2.5143625736236572 }, { "auxiliary_loss_clip": 0.01147109, "auxiliary_loss_mlp": 0.01026657, "balance_loss_clip": 1.01468265, "balance_loss_mlp": 1.03520703, "epoch": 0.9126108522471066, "flos": 17748813617280.0, "grad_norm": 3.132001894212166, "language_loss": 0.79332173, "learning_rate": 7.4871750141281e-08, "loss": 0.81505942, "num_input_tokens_seen": 327453915, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 15179, "time_per_iteration": 2.5995349884033203 }, { "auxiliary_loss_clip": 0.01116557, "auxiliary_loss_mlp": 0.01029821, "balance_loss_clip": 1.01809716, "balance_loss_mlp": 1.03236568, "epoch": 0.9126709754997745, "flos": 27635738952960.0, "grad_norm": 2.8316592977073234, "language_loss": 0.67850828, "learning_rate": 7.476937421612262e-08, "loss": 0.69997209, "num_input_tokens_seen": 327474415, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 15180, "time_per_iteration": 2.6100592613220215 }, { "auxiliary_loss_clip": 0.01107154, "auxiliary_loss_mlp": 0.01026576, "balance_loss_clip": 1.01557899, "balance_loss_mlp": 1.03197622, "epoch": 0.9127310987524425, "flos": 15924982596480.0, "grad_norm": 1.7061565276307693, "language_loss": 0.74853075, "learning_rate": 7.466706699749315e-08, "loss": 0.76986802, "num_input_tokens_seen": 327492750, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66015625, "step": 15181, "time_per_iteration": 2.500490188598633 }, { "auxiliary_loss_clip": 0.01106466, "auxiliary_loss_mlp": 0.0102987, "balance_loss_clip": 1.01743066, "balance_loss_mlp": 1.03539848, "epoch": 0.9127912220051104, "flos": 21536885923200.0, "grad_norm": 1.8377740222599563, "language_loss": 0.74719352, "learning_rate": 7.456482848904322e-08, "loss": 0.76855689, "num_input_tokens_seen": 327509470, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 15182, "time_per_iteration": 2.5079033374786377 }, { "auxiliary_loss_clip": 0.01104324, "auxiliary_loss_mlp": 0.01031782, "balance_loss_clip": 1.01938474, "balance_loss_mlp": 1.03343832, "epoch": 0.9128513452577784, "flos": 24063561342720.0, "grad_norm": 1.476286364671211, "language_loss": 0.76418293, "learning_rate": 7.446265869442236e-08, "loss": 0.78554398, "num_input_tokens_seen": 327530520, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 15183, "time_per_iteration": 2.6101419925689697 }, { "auxiliary_loss_clip": 0.01116353, "auxiliary_loss_mlp": 0.01029103, "balance_loss_clip": 1.01674652, "balance_loss_mlp": 1.03611517, "epoch": 0.9129114685104465, "flos": 16216469464320.0, "grad_norm": 1.9533781986358543, "language_loss": 0.76384425, "learning_rate": 7.436055761727544e-08, "loss": 0.78529882, "num_input_tokens_seen": 327546960, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 15184, "time_per_iteration": 3.908770799636841 }, { "auxiliary_loss_clip": 0.01113723, "auxiliary_loss_mlp": 0.01032988, "balance_loss_clip": 1.02094746, "balance_loss_mlp": 1.0354569, "epoch": 0.9129715917631144, "flos": 19384364522880.0, "grad_norm": 1.6531284433515712, "language_loss": 0.74390459, "learning_rate": 7.425852526124732e-08, "loss": 0.76537168, "num_input_tokens_seen": 327564830, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69140625, "step": 15185, "time_per_iteration": 2.531569242477417 }, { "auxiliary_loss_clip": 0.01120708, "auxiliary_loss_mlp": 0.01029583, "balance_loss_clip": 1.0175364, "balance_loss_mlp": 1.03457451, "epoch": 0.9130317150157824, "flos": 20590460421120.0, "grad_norm": 1.9134317863971662, "language_loss": 0.68324453, "learning_rate": 7.415656162997863e-08, "loss": 0.70474744, "num_input_tokens_seen": 327583675, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.68359375, "step": 15186, "time_per_iteration": 2.5027759075164795 }, { "auxiliary_loss_clip": 0.01129697, "auxiliary_loss_mlp": 0.010329, "balance_loss_clip": 1.01956022, "balance_loss_mlp": 1.03679097, "epoch": 0.9130918382684503, "flos": 20189230525440.0, "grad_norm": 2.130180254256507, "language_loss": 0.77605987, "learning_rate": 7.405466672710848e-08, "loss": 0.7976858, "num_input_tokens_seen": 327602280, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 15187, "time_per_iteration": 2.5443594455718994 }, { "auxiliary_loss_clip": 0.01101056, "auxiliary_loss_mlp": 0.01277306, "balance_loss_clip": 1.01910889, "balance_loss_mlp": 1.03225946, "epoch": 0.9131519615211183, "flos": 25556870390400.0, "grad_norm": 2.1194533178344885, "language_loss": 0.65730166, "learning_rate": 7.395284055627305e-08, "loss": 0.68108535, "num_input_tokens_seen": 327623515, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6875, "step": 15188, "time_per_iteration": 2.5234665870666504 }, { "auxiliary_loss_clip": 0.0115155, "auxiliary_loss_mlp": 0.01028967, "balance_loss_clip": 1.01720715, "balance_loss_mlp": 1.03142369, "epoch": 0.9132120847737862, "flos": 17931563038080.0, "grad_norm": 2.1286880997689464, "language_loss": 0.76202601, "learning_rate": 7.385108312110655e-08, "loss": 0.78383112, "num_input_tokens_seen": 327642875, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.671875, "step": 15189, "time_per_iteration": 2.5352208614349365 }, { "auxiliary_loss_clip": 0.01109146, "auxiliary_loss_mlp": 0.01028892, "balance_loss_clip": 1.01813912, "balance_loss_mlp": 1.0334065, "epoch": 0.9132722080264543, "flos": 20047635112320.0, "grad_norm": 1.685331344447322, "language_loss": 0.75249279, "learning_rate": 7.374939442524009e-08, "loss": 0.77387321, "num_input_tokens_seen": 327662450, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.66796875, "step": 15190, "time_per_iteration": 3.9816489219665527 }, { "auxiliary_loss_clip": 0.0103061, "auxiliary_loss_mlp": 0.01004825, "balance_loss_clip": 1.00368679, "balance_loss_mlp": 1.00069737, "epoch": 0.9133323312791222, "flos": 70439967141120.0, "grad_norm": 0.7047567449516988, "language_loss": 0.57373917, "learning_rate": 7.364777447230298e-08, "loss": 0.59409356, "num_input_tokens_seen": 327723845, "router_z_loss_clip": 0.01141357, "router_z_loss_mlp": 0.21191406, "step": 15191, "time_per_iteration": 3.2109169960021973 }, { "auxiliary_loss_clip": 0.01105934, "auxiliary_loss_mlp": 0.01035424, "balance_loss_clip": 1.02278173, "balance_loss_mlp": 1.03561032, "epoch": 0.9133924545317902, "flos": 25483792170240.0, "grad_norm": 2.1752654445141117, "language_loss": 0.74413002, "learning_rate": 7.354622326592163e-08, "loss": 0.76554358, "num_input_tokens_seen": 327742590, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 15192, "time_per_iteration": 4.068048477172852 }, { "auxiliary_loss_clip": 0.01109893, "auxiliary_loss_mlp": 0.01027804, "balance_loss_clip": 1.01589537, "balance_loss_mlp": 1.0353564, "epoch": 0.9134525777844581, "flos": 39930690107520.0, "grad_norm": 1.6093207450918212, "language_loss": 0.69079232, "learning_rate": 7.344474080972008e-08, "loss": 0.71216929, "num_input_tokens_seen": 327764350, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.65625, "step": 15193, "time_per_iteration": 2.697711229324341 }, { "auxiliary_loss_clip": 0.01108592, "auxiliary_loss_mlp": 0.01036968, "balance_loss_clip": 1.02561343, "balance_loss_mlp": 1.03407907, "epoch": 0.9135127010371261, "flos": 20886723797760.0, "grad_norm": 1.9046671112099667, "language_loss": 0.7293191, "learning_rate": 7.334332710732005e-08, "loss": 0.75077462, "num_input_tokens_seen": 327783120, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.66015625, "step": 15194, "time_per_iteration": 2.528795003890991 }, { "auxiliary_loss_clip": 0.01140111, "auxiliary_loss_mlp": 0.01035228, "balance_loss_clip": 1.02123308, "balance_loss_mlp": 1.03305912, "epoch": 0.913572824289794, "flos": 20813250528000.0, "grad_norm": 2.0294651350816655, "language_loss": 0.61752355, "learning_rate": 7.324198216234046e-08, "loss": 0.63927692, "num_input_tokens_seen": 327801960, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71484375, "step": 15195, "time_per_iteration": 2.609999895095825 }, { "auxiliary_loss_clip": 0.01111863, "auxiliary_loss_mlp": 0.01030867, "balance_loss_clip": 1.01822472, "balance_loss_mlp": 1.03410482, "epoch": 0.913632947542462, "flos": 25703278225920.0, "grad_norm": 1.5722784001313859, "language_loss": 0.71423215, "learning_rate": 7.314070597839861e-08, "loss": 0.73565948, "num_input_tokens_seen": 327823795, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 15196, "time_per_iteration": 2.5437557697296143 }, { "auxiliary_loss_clip": 0.01119825, "auxiliary_loss_mlp": 0.01031381, "balance_loss_clip": 1.0194658, "balance_loss_mlp": 1.03451192, "epoch": 0.9136930707951301, "flos": 26286216048000.0, "grad_norm": 1.5546149448534734, "language_loss": 0.71098757, "learning_rate": 7.303949855910829e-08, "loss": 0.73249966, "num_input_tokens_seen": 327845175, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 15197, "time_per_iteration": 2.629801034927368 }, { "auxiliary_loss_clip": 0.01121638, "auxiliary_loss_mlp": 0.01028909, "balance_loss_clip": 1.01628447, "balance_loss_mlp": 1.03332329, "epoch": 0.913753194047798, "flos": 22091885942400.0, "grad_norm": 1.9470848475910483, "language_loss": 0.77976602, "learning_rate": 7.293835990808173e-08, "loss": 0.80127144, "num_input_tokens_seen": 327863150, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 15198, "time_per_iteration": 2.603503465652466 }, { "auxiliary_loss_clip": 0.01108468, "auxiliary_loss_mlp": 0.01034199, "balance_loss_clip": 1.02235532, "balance_loss_mlp": 1.03356314, "epoch": 0.913813317300466, "flos": 23587206151680.0, "grad_norm": 1.523803482314595, "language_loss": 0.67917615, "learning_rate": 7.28372900289278e-08, "loss": 0.70060283, "num_input_tokens_seen": 327883445, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.65625, "step": 15199, "time_per_iteration": 2.5589587688446045 }, { "auxiliary_loss_clip": 0.0111161, "auxiliary_loss_mlp": 0.01280828, "balance_loss_clip": 1.02246976, "balance_loss_mlp": 1.03372395, "epoch": 0.9138734405531339, "flos": 28876452583680.0, "grad_norm": 1.7123328865969043, "language_loss": 0.67547381, "learning_rate": 7.273628892525429e-08, "loss": 0.69939816, "num_input_tokens_seen": 327905745, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 15200, "time_per_iteration": 2.6386234760284424 }, { "auxiliary_loss_clip": 0.01109746, "auxiliary_loss_mlp": 0.0103208, "balance_loss_clip": 1.02088034, "balance_loss_mlp": 1.03490472, "epoch": 0.9139335638058019, "flos": 22821087945600.0, "grad_norm": 1.546681981658961, "language_loss": 0.71339667, "learning_rate": 7.263535660066456e-08, "loss": 0.734815, "num_input_tokens_seen": 327925435, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6640625, "step": 15201, "time_per_iteration": 2.5808043479919434 }, { "auxiliary_loss_clip": 0.01109004, "auxiliary_loss_mlp": 0.01027963, "balance_loss_clip": 1.01692998, "balance_loss_mlp": 1.03272963, "epoch": 0.9139936870584698, "flos": 18004174381440.0, "grad_norm": 1.7552230521424004, "language_loss": 0.71094894, "learning_rate": 7.253449305876148e-08, "loss": 0.73231858, "num_input_tokens_seen": 327944145, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.67578125, "step": 15202, "time_per_iteration": 2.5317366123199463 }, { "auxiliary_loss_clip": 0.01101594, "auxiliary_loss_mlp": 0.01030334, "balance_loss_clip": 1.0177995, "balance_loss_mlp": 1.03338706, "epoch": 0.9140538103111379, "flos": 15813767111040.0, "grad_norm": 2.015262683490185, "language_loss": 0.66696274, "learning_rate": 7.243369830314438e-08, "loss": 0.68828201, "num_input_tokens_seen": 327960565, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 15203, "time_per_iteration": 2.5441107749938965 }, { "auxiliary_loss_clip": 0.01113029, "auxiliary_loss_mlp": 0.01032297, "balance_loss_clip": 1.0190289, "balance_loss_mlp": 1.03609896, "epoch": 0.9141139335638058, "flos": 23987035416960.0, "grad_norm": 1.7213109851525203, "language_loss": 0.69190294, "learning_rate": 7.233297233741021e-08, "loss": 0.71335626, "num_input_tokens_seen": 327981180, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6796875, "step": 15204, "time_per_iteration": 2.610846519470215 }, { "auxiliary_loss_clip": 0.01109075, "auxiliary_loss_mlp": 0.01024314, "balance_loss_clip": 1.01197588, "balance_loss_mlp": 1.03095376, "epoch": 0.9141740568164738, "flos": 24024418496640.0, "grad_norm": 1.9156958361350132, "language_loss": 0.5940001, "learning_rate": 7.223231516515338e-08, "loss": 0.61533397, "num_input_tokens_seen": 328001500, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 15205, "time_per_iteration": 2.518883228302002 }, { "auxiliary_loss_clip": 0.01125405, "auxiliary_loss_mlp": 0.01030341, "balance_loss_clip": 1.0182296, "balance_loss_mlp": 1.03293729, "epoch": 0.9142341800691417, "flos": 27018291139200.0, "grad_norm": 1.910883153136065, "language_loss": 0.8118577, "learning_rate": 7.213172678996681e-08, "loss": 0.83341515, "num_input_tokens_seen": 328023025, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.65625, "step": 15206, "time_per_iteration": 2.5752217769622803 }, { "auxiliary_loss_clip": 0.01138602, "auxiliary_loss_mlp": 0.01027017, "balance_loss_clip": 1.01435733, "balance_loss_mlp": 1.03317833, "epoch": 0.9142943033218097, "flos": 20412487509120.0, "grad_norm": 2.473889963977556, "language_loss": 0.74178362, "learning_rate": 7.203120721543966e-08, "loss": 0.76343977, "num_input_tokens_seen": 328041410, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 15207, "time_per_iteration": 2.520249366760254 }, { "auxiliary_loss_clip": 0.01136397, "auxiliary_loss_mlp": 0.0103676, "balance_loss_clip": 1.02412343, "balance_loss_mlp": 1.03263319, "epoch": 0.9143544265744776, "flos": 19755322231680.0, "grad_norm": 1.7564823789772337, "language_loss": 0.73164099, "learning_rate": 7.193075644515945e-08, "loss": 0.75337255, "num_input_tokens_seen": 328060495, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 15208, "time_per_iteration": 2.5660924911499023 }, { "auxiliary_loss_clip": 0.01109267, "auxiliary_loss_mlp": 0.01028807, "balance_loss_clip": 1.01726747, "balance_loss_mlp": 1.0334233, "epoch": 0.9144145498271457, "flos": 17165444832000.0, "grad_norm": 2.2380636874492104, "language_loss": 0.86553574, "learning_rate": 7.183037448271112e-08, "loss": 0.88691652, "num_input_tokens_seen": 328076905, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 15209, "time_per_iteration": 2.487154483795166 }, { "auxiliary_loss_clip": 0.01119488, "auxiliary_loss_mlp": 0.01032738, "balance_loss_clip": 1.02110302, "balance_loss_mlp": 1.03433847, "epoch": 0.9144746730798137, "flos": 23726072131200.0, "grad_norm": 2.1894859035192895, "language_loss": 0.75249434, "learning_rate": 7.173006133167669e-08, "loss": 0.77401662, "num_input_tokens_seen": 328096960, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 15210, "time_per_iteration": 2.567037343978882 }, { "auxiliary_loss_clip": 0.01118633, "auxiliary_loss_mlp": 0.01029745, "balance_loss_clip": 1.01725829, "balance_loss_mlp": 1.03150892, "epoch": 0.9145347963324816, "flos": 25847854467840.0, "grad_norm": 1.8921313839774994, "language_loss": 0.77805209, "learning_rate": 7.162981699563642e-08, "loss": 0.79953587, "num_input_tokens_seen": 328115445, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 15211, "time_per_iteration": 2.5116734504699707 }, { "auxiliary_loss_clip": 0.01113536, "auxiliary_loss_mlp": 0.01031059, "balance_loss_clip": 1.01918542, "balance_loss_mlp": 1.03585649, "epoch": 0.9145949195851496, "flos": 19242769109760.0, "grad_norm": 1.638288087262253, "language_loss": 0.83204305, "learning_rate": 7.15296414781672e-08, "loss": 0.85348904, "num_input_tokens_seen": 328133965, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 15212, "time_per_iteration": 2.4957571029663086 }, { "auxiliary_loss_clip": 0.0110791, "auxiliary_loss_mlp": 0.01028273, "balance_loss_clip": 1.0164777, "balance_loss_mlp": 1.0329051, "epoch": 0.9146550428378175, "flos": 18296379521280.0, "grad_norm": 1.6837877747111725, "language_loss": 0.83846509, "learning_rate": 7.142953478284486e-08, "loss": 0.85982692, "num_input_tokens_seen": 328151520, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6640625, "step": 15213, "time_per_iteration": 2.4771878719329834 }, { "auxiliary_loss_clip": 0.01112146, "auxiliary_loss_mlp": 0.01028393, "balance_loss_clip": 1.01694894, "balance_loss_mlp": 1.03429484, "epoch": 0.9147151660904855, "flos": 20084264006400.0, "grad_norm": 1.9288485012866643, "language_loss": 0.70510685, "learning_rate": 7.132949691324141e-08, "loss": 0.72651225, "num_input_tokens_seen": 328171275, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.68359375, "step": 15214, "time_per_iteration": 2.640765428543091 }, { "auxiliary_loss_clip": 0.01116033, "auxiliary_loss_mlp": 0.01036204, "balance_loss_clip": 1.02324641, "balance_loss_mlp": 1.03608406, "epoch": 0.9147752893431534, "flos": 24389127239040.0, "grad_norm": 1.8230616812329268, "language_loss": 0.62817442, "learning_rate": 7.122952787292713e-08, "loss": 0.64969683, "num_input_tokens_seen": 328192115, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 15215, "time_per_iteration": 2.6488258838653564 }, { "auxiliary_loss_clip": 0.0113372, "auxiliary_loss_mlp": 0.01031877, "balance_loss_clip": 1.01879978, "balance_loss_mlp": 1.03499627, "epoch": 0.9148354125958215, "flos": 18150402648960.0, "grad_norm": 2.3811715170316683, "language_loss": 0.75706714, "learning_rate": 7.112962766546937e-08, "loss": 0.77872312, "num_input_tokens_seen": 328208990, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 15216, "time_per_iteration": 2.5500104427337646 }, { "auxiliary_loss_clip": 0.01121058, "auxiliary_loss_mlp": 0.01034579, "balance_loss_clip": 1.02191925, "balance_loss_mlp": 1.03506064, "epoch": 0.9148955358484894, "flos": 23367540528000.0, "grad_norm": 2.554232887872675, "language_loss": 0.68270499, "learning_rate": 7.102979629443418e-08, "loss": 0.70426136, "num_input_tokens_seen": 328227840, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 15217, "time_per_iteration": 2.67738938331604 }, { "auxiliary_loss_clip": 0.01108634, "auxiliary_loss_mlp": 0.01031911, "balance_loss_clip": 1.02046132, "balance_loss_mlp": 1.03382206, "epoch": 0.9149556591011574, "flos": 18076498416000.0, "grad_norm": 1.9994981953343278, "language_loss": 0.79883617, "learning_rate": 7.093003376338314e-08, "loss": 0.82024157, "num_input_tokens_seen": 328246250, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66015625, "step": 15218, "time_per_iteration": 3.9356472492218018 }, { "auxiliary_loss_clip": 0.01048572, "auxiliary_loss_mlp": 0.01001934, "balance_loss_clip": 1.00088501, "balance_loss_mlp": 1.00100708, "epoch": 0.9150157823538253, "flos": 54586374825600.0, "grad_norm": 0.8450289462965808, "language_loss": 0.59208256, "learning_rate": 7.083034007587718e-08, "loss": 0.61258763, "num_input_tokens_seen": 328303625, "router_z_loss_clip": 0.01049805, "router_z_loss_mlp": 0.21191406, "step": 15219, "time_per_iteration": 3.1193459033966064 }, { "auxiliary_loss_clip": 0.01109671, "auxiliary_loss_mlp": 0.01033705, "balance_loss_clip": 1.02188587, "balance_loss_mlp": 1.03394413, "epoch": 0.9150759056064933, "flos": 17893102550400.0, "grad_norm": 2.326713221085825, "language_loss": 0.78628093, "learning_rate": 7.073071523547391e-08, "loss": 0.8077147, "num_input_tokens_seen": 328322135, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66796875, "step": 15220, "time_per_iteration": 2.5601961612701416 }, { "auxiliary_loss_clip": 0.01140174, "auxiliary_loss_mlp": 0.01035269, "balance_loss_clip": 1.0218997, "balance_loss_mlp": 1.03283167, "epoch": 0.9151360288591612, "flos": 18073517587200.0, "grad_norm": 3.620349129949942, "language_loss": 0.65752763, "learning_rate": 7.06311592457287e-08, "loss": 0.67928207, "num_input_tokens_seen": 328340750, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 15221, "time_per_iteration": 2.5615620613098145 }, { "auxiliary_loss_clip": 0.0111952, "auxiliary_loss_mlp": 0.01029713, "balance_loss_clip": 1.01825738, "balance_loss_mlp": 1.03346992, "epoch": 0.9151961521118293, "flos": 19354523299200.0, "grad_norm": 2.9762742974915968, "language_loss": 0.8407712, "learning_rate": 7.053167211019473e-08, "loss": 0.86226356, "num_input_tokens_seen": 328359995, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.68359375, "step": 15222, "time_per_iteration": 2.51387357711792 }, { "auxiliary_loss_clip": 0.01039954, "auxiliary_loss_mlp": 0.01002642, "balance_loss_clip": 1.0014317, "balance_loss_mlp": 1.00125575, "epoch": 0.9152562753644973, "flos": 72146621018880.0, "grad_norm": 0.7229457931523474, "language_loss": 0.49634632, "learning_rate": 7.04322538324218e-08, "loss": 0.51677227, "num_input_tokens_seen": 328426865, "router_z_loss_clip": 0.01208496, "router_z_loss_mlp": 0.21191406, "step": 15223, "time_per_iteration": 3.238081455230713 }, { "auxiliary_loss_clip": 0.01129905, "auxiliary_loss_mlp": 0.01037787, "balance_loss_clip": 1.02508581, "balance_loss_mlp": 1.03443837, "epoch": 0.9153163986171652, "flos": 20777016683520.0, "grad_norm": 1.726693211910296, "language_loss": 0.72142649, "learning_rate": 7.033290441595884e-08, "loss": 0.74310338, "num_input_tokens_seen": 328445970, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 15224, "time_per_iteration": 2.574263334274292 }, { "auxiliary_loss_clip": 0.0110904, "auxiliary_loss_mlp": 0.01026223, "balance_loss_clip": 1.01387858, "balance_loss_mlp": 1.03264308, "epoch": 0.9153765218698332, "flos": 23040107124480.0, "grad_norm": 1.9075195519522783, "language_loss": 0.81308508, "learning_rate": 7.023362386435017e-08, "loss": 0.83443773, "num_input_tokens_seen": 328464585, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.67578125, "step": 15225, "time_per_iteration": 4.116075038909912 }, { "auxiliary_loss_clip": 0.01099686, "auxiliary_loss_mlp": 0.01022569, "balance_loss_clip": 1.01089835, "balance_loss_mlp": 1.03177655, "epoch": 0.9154366451225011, "flos": 28990900293120.0, "grad_norm": 1.2688369846771972, "language_loss": 0.71009219, "learning_rate": 7.013441218114002e-08, "loss": 0.73131478, "num_input_tokens_seen": 328490155, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 15226, "time_per_iteration": 2.679421901702881 }, { "auxiliary_loss_clip": 0.01157368, "auxiliary_loss_mlp": 0.01029856, "balance_loss_clip": 1.01759529, "balance_loss_mlp": 1.0335319, "epoch": 0.9154967683751691, "flos": 22309504490880.0, "grad_norm": 2.9161895920622247, "language_loss": 0.74817222, "learning_rate": 7.003526936986825e-08, "loss": 0.77004445, "num_input_tokens_seen": 328508275, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 15227, "time_per_iteration": 2.6486716270446777 }, { "auxiliary_loss_clip": 0.01119072, "auxiliary_loss_mlp": 0.01028999, "balance_loss_clip": 1.01770961, "balance_loss_mlp": 1.03371298, "epoch": 0.915556891627837, "flos": 24571481610240.0, "grad_norm": 1.4037703072099343, "language_loss": 0.73734784, "learning_rate": 6.993619543407337e-08, "loss": 0.75882852, "num_input_tokens_seen": 328529425, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6796875, "step": 15228, "time_per_iteration": 2.8321990966796875 }, { "auxiliary_loss_clip": 0.01125701, "auxiliary_loss_mlp": 0.01031441, "balance_loss_clip": 1.01965714, "balance_loss_mlp": 1.03170896, "epoch": 0.9156170148805051, "flos": 22164676853760.0, "grad_norm": 1.555923162548164, "language_loss": 0.72126722, "learning_rate": 6.983719037729074e-08, "loss": 0.74283868, "num_input_tokens_seen": 328550200, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66796875, "step": 15229, "time_per_iteration": 2.649176836013794 }, { "auxiliary_loss_clip": 0.01105835, "auxiliary_loss_mlp": 0.01034267, "balance_loss_clip": 1.02118981, "balance_loss_mlp": 1.03642416, "epoch": 0.915677138133173, "flos": 20920659171840.0, "grad_norm": 1.6099110301578685, "language_loss": 0.83170402, "learning_rate": 6.973825420305424e-08, "loss": 0.85310507, "num_input_tokens_seen": 328568540, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 15230, "time_per_iteration": 2.5715620517730713 }, { "auxiliary_loss_clip": 0.01135273, "auxiliary_loss_mlp": 0.01029841, "balance_loss_clip": 1.0177654, "balance_loss_mlp": 1.03327274, "epoch": 0.915737261385841, "flos": 24345136056960.0, "grad_norm": 1.884456890861107, "language_loss": 0.83350068, "learning_rate": 6.963938691489368e-08, "loss": 0.85515183, "num_input_tokens_seen": 328587300, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6640625, "step": 15231, "time_per_iteration": 2.6116082668304443 }, { "auxiliary_loss_clip": 0.01106368, "auxiliary_loss_mlp": 0.01025586, "balance_loss_clip": 1.01394498, "balance_loss_mlp": 1.03038394, "epoch": 0.9157973846385089, "flos": 26761386090240.0, "grad_norm": 1.5271591520300225, "language_loss": 0.72439468, "learning_rate": 6.954058851633826e-08, "loss": 0.74571419, "num_input_tokens_seen": 328610055, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.66796875, "step": 15232, "time_per_iteration": 4.0719592571258545 }, { "auxiliary_loss_clip": 0.01102801, "auxiliary_loss_mlp": 0.01030564, "balance_loss_clip": 1.01834452, "balance_loss_mlp": 1.03319383, "epoch": 0.9158575078911769, "flos": 18478733892480.0, "grad_norm": 2.7623094832723916, "language_loss": 0.6786797, "learning_rate": 6.944185901091337e-08, "loss": 0.70001328, "num_input_tokens_seen": 328626815, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 15233, "time_per_iteration": 2.4823338985443115 }, { "auxiliary_loss_clip": 0.01123357, "auxiliary_loss_mlp": 0.01033913, "balance_loss_clip": 1.02109754, "balance_loss_mlp": 1.03545868, "epoch": 0.9159176311438448, "flos": 21798926616960.0, "grad_norm": 1.9042455642032092, "language_loss": 0.69726694, "learning_rate": 6.934319840214287e-08, "loss": 0.71883965, "num_input_tokens_seen": 328643995, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 15234, "time_per_iteration": 4.013181686401367 }, { "auxiliary_loss_clip": 0.01121507, "auxiliary_loss_mlp": 0.01031188, "balance_loss_clip": 1.01882625, "balance_loss_mlp": 1.03493869, "epoch": 0.9159777543965129, "flos": 24783749032320.0, "grad_norm": 2.582783325149246, "language_loss": 0.88054079, "learning_rate": 6.924460669354681e-08, "loss": 0.90206778, "num_input_tokens_seen": 328659565, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 15235, "time_per_iteration": 2.6049721240997314 }, { "auxiliary_loss_clip": 0.01148204, "auxiliary_loss_mlp": 0.01029949, "balance_loss_clip": 1.01742578, "balance_loss_mlp": 1.03412616, "epoch": 0.9160378776491809, "flos": 26868758820480.0, "grad_norm": 1.6674008995009053, "language_loss": 0.77399504, "learning_rate": 6.914608388864462e-08, "loss": 0.79577661, "num_input_tokens_seen": 328679045, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 15236, "time_per_iteration": 2.7883026599884033 }, { "auxiliary_loss_clip": 0.01066601, "auxiliary_loss_mlp": 0.01002511, "balance_loss_clip": 1.00131893, "balance_loss_mlp": 1.00106049, "epoch": 0.9160980009018488, "flos": 70578222589440.0, "grad_norm": 0.677772593103377, "language_loss": 0.62267405, "learning_rate": 6.904762999095193e-08, "loss": 0.6433652, "num_input_tokens_seen": 328744565, "router_z_loss_clip": 0.01190186, "router_z_loss_mlp": 0.21191406, "step": 15237, "time_per_iteration": 3.234166383743286 }, { "auxiliary_loss_clip": 0.01105218, "auxiliary_loss_mlp": 0.0127907, "balance_loss_clip": 1.01909304, "balance_loss_mlp": 1.03473723, "epoch": 0.9161581241545168, "flos": 16289332202880.0, "grad_norm": 2.532685948578606, "language_loss": 0.74569654, "learning_rate": 6.894924500398236e-08, "loss": 0.76953942, "num_input_tokens_seen": 328762455, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 15238, "time_per_iteration": 2.535452127456665 }, { "auxiliary_loss_clip": 0.01104385, "auxiliary_loss_mlp": 0.01026729, "balance_loss_clip": 1.01442695, "balance_loss_mlp": 1.03450692, "epoch": 0.9162182474071847, "flos": 18438154502400.0, "grad_norm": 1.5951774751899885, "language_loss": 0.74781442, "learning_rate": 6.885092893124688e-08, "loss": 0.76912546, "num_input_tokens_seen": 328780320, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 15239, "time_per_iteration": 2.5047223567962646 }, { "auxiliary_loss_clip": 0.01130549, "auxiliary_loss_mlp": 0.01032773, "balance_loss_clip": 1.02060795, "balance_loss_mlp": 1.03388882, "epoch": 0.9162783706598527, "flos": 19167248764800.0, "grad_norm": 1.9330137405640682, "language_loss": 0.63369447, "learning_rate": 6.875268177625404e-08, "loss": 0.65532768, "num_input_tokens_seen": 328797570, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.703125, "step": 15240, "time_per_iteration": 2.600034475326538 }, { "auxiliary_loss_clip": 0.01120054, "auxiliary_loss_mlp": 0.01271641, "balance_loss_clip": 1.01351714, "balance_loss_mlp": 1.03543782, "epoch": 0.9163384939125206, "flos": 20412990299520.0, "grad_norm": 1.8382707164412122, "language_loss": 0.76412785, "learning_rate": 6.865450354251078e-08, "loss": 0.78804487, "num_input_tokens_seen": 328814075, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66796875, "step": 15241, "time_per_iteration": 2.6008169651031494 }, { "auxiliary_loss_clip": 0.01119119, "auxiliary_loss_mlp": 0.01031322, "balance_loss_clip": 1.01783347, "balance_loss_mlp": 1.03517127, "epoch": 0.9163986171651887, "flos": 19645902426240.0, "grad_norm": 2.275960064168259, "language_loss": 0.67679012, "learning_rate": 6.855639423351966e-08, "loss": 0.69829452, "num_input_tokens_seen": 328831990, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 15242, "time_per_iteration": 2.7096996307373047 }, { "auxiliary_loss_clip": 0.01111618, "auxiliary_loss_mlp": 0.01030162, "balance_loss_clip": 1.01750207, "balance_loss_mlp": 1.03353536, "epoch": 0.9164587404178566, "flos": 12823054865280.0, "grad_norm": 1.733502586944855, "language_loss": 0.80787635, "learning_rate": 6.845835385278298e-08, "loss": 0.82929415, "num_input_tokens_seen": 328849105, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 15243, "time_per_iteration": 2.5870273113250732 }, { "auxiliary_loss_clip": 0.0112115, "auxiliary_loss_mlp": 0.01029115, "balance_loss_clip": 1.01681256, "balance_loss_mlp": 1.03209889, "epoch": 0.9165188636705246, "flos": 22309396750080.0, "grad_norm": 1.9674884548050378, "language_loss": 0.81963742, "learning_rate": 6.836038240379905e-08, "loss": 0.84114009, "num_input_tokens_seen": 328866810, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71484375, "step": 15244, "time_per_iteration": 2.5584332942962646 }, { "auxiliary_loss_clip": 0.01107088, "auxiliary_loss_mlp": 0.01035301, "balance_loss_clip": 1.02205682, "balance_loss_mlp": 1.03479636, "epoch": 0.9165789869231925, "flos": 18223337214720.0, "grad_norm": 2.2904807386414094, "language_loss": 0.72171688, "learning_rate": 6.826247989006418e-08, "loss": 0.74314076, "num_input_tokens_seen": 328885325, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 15245, "time_per_iteration": 2.4944069385528564 }, { "auxiliary_loss_clip": 0.01117909, "auxiliary_loss_mlp": 0.01031583, "balance_loss_clip": 1.02045524, "balance_loss_mlp": 1.03292525, "epoch": 0.9166391101758605, "flos": 13691553811200.0, "grad_norm": 1.6928171006809847, "language_loss": 0.7484479, "learning_rate": 6.816464631507224e-08, "loss": 0.76994288, "num_input_tokens_seen": 328902655, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.67578125, "step": 15246, "time_per_iteration": 2.586132526397705 }, { "auxiliary_loss_clip": 0.01119588, "auxiliary_loss_mlp": 0.01031022, "balance_loss_clip": 1.01815343, "balance_loss_mlp": 1.03361416, "epoch": 0.9166992334285284, "flos": 17346793622400.0, "grad_norm": 2.4006378821599395, "language_loss": 0.75706303, "learning_rate": 6.806688168231534e-08, "loss": 0.7785691, "num_input_tokens_seen": 328918440, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 15247, "time_per_iteration": 2.472608804702759 }, { "auxiliary_loss_clip": 0.0111483, "auxiliary_loss_mlp": 0.01027748, "balance_loss_clip": 1.01512337, "balance_loss_mlp": 1.03634739, "epoch": 0.9167593566811965, "flos": 23731135948800.0, "grad_norm": 1.708480405675637, "language_loss": 0.75735283, "learning_rate": 6.796918599528134e-08, "loss": 0.77877867, "num_input_tokens_seen": 328938055, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 15248, "time_per_iteration": 2.6827518939971924 }, { "auxiliary_loss_clip": 0.01112745, "auxiliary_loss_mlp": 0.01035936, "balance_loss_clip": 1.02228045, "balance_loss_mlp": 1.03408182, "epoch": 0.9168194799338644, "flos": 27818201064960.0, "grad_norm": 2.189090259134521, "language_loss": 0.72647208, "learning_rate": 6.787155925745769e-08, "loss": 0.7479589, "num_input_tokens_seen": 328957895, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.6953125, "step": 15249, "time_per_iteration": 2.520977020263672 }, { "auxiliary_loss_clip": 0.01140493, "auxiliary_loss_mlp": 0.01028382, "balance_loss_clip": 1.01545978, "balance_loss_mlp": 1.03544855, "epoch": 0.9168796031865324, "flos": 21717552355200.0, "grad_norm": 1.9065847271539968, "language_loss": 0.75887835, "learning_rate": 6.777400147232759e-08, "loss": 0.78056717, "num_input_tokens_seen": 328971365, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 15250, "time_per_iteration": 2.5077695846557617 }, { "auxiliary_loss_clip": 0.0113666, "auxiliary_loss_mlp": 0.01026105, "balance_loss_clip": 1.01426172, "balance_loss_mlp": 1.03283668, "epoch": 0.9169397264392004, "flos": 23404420817280.0, "grad_norm": 1.653552378818734, "language_loss": 0.75814462, "learning_rate": 6.767651264337382e-08, "loss": 0.77977234, "num_input_tokens_seen": 328990830, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 15251, "time_per_iteration": 2.5454728603363037 }, { "auxiliary_loss_clip": 0.01125232, "auxiliary_loss_mlp": 0.01030931, "balance_loss_clip": 1.01716185, "balance_loss_mlp": 1.03738678, "epoch": 0.9169998496918683, "flos": 23950981140480.0, "grad_norm": 2.0492667866045933, "language_loss": 0.79819429, "learning_rate": 6.757909277407426e-08, "loss": 0.81975591, "num_input_tokens_seen": 329008345, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.69921875, "step": 15252, "time_per_iteration": 2.532768964767456 }, { "auxiliary_loss_clip": 0.01102923, "auxiliary_loss_mlp": 0.01032188, "balance_loss_clip": 1.02002871, "balance_loss_mlp": 1.03342557, "epoch": 0.9170599729445363, "flos": 18332469711360.0, "grad_norm": 1.6048818516219494, "language_loss": 0.775226, "learning_rate": 6.748174186790611e-08, "loss": 0.7965771, "num_input_tokens_seen": 329027440, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 15253, "time_per_iteration": 2.48555588722229 }, { "auxiliary_loss_clip": 0.01115465, "auxiliary_loss_mlp": 0.01027058, "balance_loss_clip": 1.01479721, "balance_loss_mlp": 1.03697085, "epoch": 0.9171200961972042, "flos": 22674859678080.0, "grad_norm": 1.7559813345633886, "language_loss": 0.7302115, "learning_rate": 6.738445992834397e-08, "loss": 0.75163668, "num_input_tokens_seen": 329046445, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 15254, "time_per_iteration": 2.7084734439849854 }, { "auxiliary_loss_clip": 0.01126694, "auxiliary_loss_mlp": 0.01025001, "balance_loss_clip": 1.01331306, "balance_loss_mlp": 1.03367805, "epoch": 0.9171802194498723, "flos": 26719298328960.0, "grad_norm": 1.5838590540975397, "language_loss": 0.79458046, "learning_rate": 6.728724695885878e-08, "loss": 0.81609738, "num_input_tokens_seen": 329065555, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6640625, "step": 15255, "time_per_iteration": 2.684969186782837 }, { "auxiliary_loss_clip": 0.01113646, "auxiliary_loss_mlp": 0.01032058, "balance_loss_clip": 1.01969588, "balance_loss_mlp": 1.03416443, "epoch": 0.9172403427025402, "flos": 37889240538240.0, "grad_norm": 1.974532306690649, "language_loss": 0.6837607, "learning_rate": 6.719010296292027e-08, "loss": 0.70521772, "num_input_tokens_seen": 329087515, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.70703125, "step": 15256, "time_per_iteration": 2.7005629539489746 }, { "auxiliary_loss_clip": 0.01108832, "auxiliary_loss_mlp": 0.01035303, "balance_loss_clip": 1.02399004, "balance_loss_mlp": 1.03266501, "epoch": 0.9173004659552082, "flos": 17055163100160.0, "grad_norm": 1.6286682106227663, "language_loss": 0.83544195, "learning_rate": 6.709302794399519e-08, "loss": 0.85688329, "num_input_tokens_seen": 329106820, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 15257, "time_per_iteration": 2.5425825119018555 }, { "auxiliary_loss_clip": 0.01117644, "auxiliary_loss_mlp": 0.01030903, "balance_loss_clip": 1.01962614, "balance_loss_mlp": 1.0334146, "epoch": 0.9173605892078761, "flos": 22201593056640.0, "grad_norm": 1.582319123917644, "language_loss": 0.77681231, "learning_rate": 6.69960219055481e-08, "loss": 0.79829776, "num_input_tokens_seen": 329126515, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6640625, "step": 15258, "time_per_iteration": 2.604919910430908 }, { "auxiliary_loss_clip": 0.01113634, "auxiliary_loss_mlp": 0.01031938, "balance_loss_clip": 1.02010691, "balance_loss_mlp": 1.03678381, "epoch": 0.9174207124605441, "flos": 16507776764160.0, "grad_norm": 2.1479686259989794, "language_loss": 0.78202963, "learning_rate": 6.689908485104045e-08, "loss": 0.80348539, "num_input_tokens_seen": 329142660, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 15259, "time_per_iteration": 2.5929179191589355 }, { "auxiliary_loss_clip": 0.01136593, "auxiliary_loss_mlp": 0.01032037, "balance_loss_clip": 1.02071261, "balance_loss_mlp": 1.03394198, "epoch": 0.917480835713212, "flos": 24535606901760.0, "grad_norm": 1.7441787379457627, "language_loss": 0.76521057, "learning_rate": 6.680221678393216e-08, "loss": 0.78689688, "num_input_tokens_seen": 329162575, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 15260, "time_per_iteration": 4.180397987365723 }, { "auxiliary_loss_clip": 0.01136505, "auxiliary_loss_mlp": 0.01028555, "balance_loss_clip": 1.01661587, "balance_loss_mlp": 1.03300786, "epoch": 0.9175409589658801, "flos": 20880726226560.0, "grad_norm": 1.6576618095732865, "language_loss": 0.60777718, "learning_rate": 6.670541770768002e-08, "loss": 0.62942779, "num_input_tokens_seen": 329182090, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 15261, "time_per_iteration": 2.581749439239502 }, { "auxiliary_loss_clip": 0.01117341, "auxiliary_loss_mlp": 0.01028737, "balance_loss_clip": 1.01674497, "balance_loss_mlp": 1.03233743, "epoch": 0.917601082218548, "flos": 14276035918080.0, "grad_norm": 1.7945728707256638, "language_loss": 0.7346673, "learning_rate": 6.660868762573835e-08, "loss": 0.75612807, "num_input_tokens_seen": 329196535, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 15262, "time_per_iteration": 2.446465492248535 }, { "auxiliary_loss_clip": 0.01105652, "auxiliary_loss_mlp": 0.01034408, "balance_loss_clip": 1.02165914, "balance_loss_mlp": 1.03343701, "epoch": 0.917661205471216, "flos": 19099234362240.0, "grad_norm": 2.062505759648912, "language_loss": 0.77515578, "learning_rate": 6.651202654155907e-08, "loss": 0.79655635, "num_input_tokens_seen": 329215135, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 15263, "time_per_iteration": 2.529820442199707 }, { "auxiliary_loss_clip": 0.01106016, "auxiliary_loss_mlp": 0.01029445, "balance_loss_clip": 1.01710105, "balance_loss_mlp": 1.03424394, "epoch": 0.917721328723884, "flos": 21106568989440.0, "grad_norm": 2.095691600619649, "language_loss": 0.75795895, "learning_rate": 6.641543445859255e-08, "loss": 0.77931356, "num_input_tokens_seen": 329235150, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71875, "step": 15264, "time_per_iteration": 2.533219337463379 }, { "auxiliary_loss_clip": 0.01120226, "auxiliary_loss_mlp": 0.01032235, "balance_loss_clip": 1.02079129, "balance_loss_mlp": 1.03411794, "epoch": 0.9177814519765519, "flos": 21943215550080.0, "grad_norm": 2.1197271252766487, "language_loss": 0.83267093, "learning_rate": 6.631891138028511e-08, "loss": 0.85419559, "num_input_tokens_seen": 329254365, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 15265, "time_per_iteration": 2.514677047729492 }, { "auxiliary_loss_clip": 0.01107767, "auxiliary_loss_mlp": 0.01040964, "balance_loss_clip": 1.02789283, "balance_loss_mlp": 1.03500462, "epoch": 0.9178415752292199, "flos": 24205982768640.0, "grad_norm": 2.7817889395649575, "language_loss": 0.73501545, "learning_rate": 6.622245731008181e-08, "loss": 0.75650275, "num_input_tokens_seen": 329274385, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 15266, "time_per_iteration": 2.726658344268799 }, { "auxiliary_loss_clip": 0.01101971, "auxiliary_loss_mlp": 0.01028567, "balance_loss_clip": 1.0168252, "balance_loss_mlp": 1.03317273, "epoch": 0.9179016984818879, "flos": 20042068504320.0, "grad_norm": 1.7698260004182844, "language_loss": 0.77694672, "learning_rate": 6.612607225142475e-08, "loss": 0.79825211, "num_input_tokens_seen": 329292160, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 15267, "time_per_iteration": 4.175321817398071 }, { "auxiliary_loss_clip": 0.01030966, "auxiliary_loss_mlp": 0.01248144, "balance_loss_clip": 1.00093555, "balance_loss_mlp": 1.00104141, "epoch": 0.9179618217345559, "flos": 65555901100800.0, "grad_norm": 0.7332765426842909, "language_loss": 0.56202942, "learning_rate": 6.602975620775364e-08, "loss": 0.58482051, "num_input_tokens_seen": 329351870, "router_z_loss_clip": 0.01300049, "router_z_loss_mlp": 0.2109375, "step": 15268, "time_per_iteration": 3.176184892654419 }, { "auxiliary_loss_clip": 0.01104307, "auxiliary_loss_mlp": 0.01279004, "balance_loss_clip": 1.01945639, "balance_loss_mlp": 1.03373408, "epoch": 0.9180219449872238, "flos": 21324618501120.0, "grad_norm": 2.713668046491339, "language_loss": 0.76233208, "learning_rate": 6.593350918250573e-08, "loss": 0.78616518, "num_input_tokens_seen": 329370930, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 15269, "time_per_iteration": 2.8023781776428223 }, { "auxiliary_loss_clip": 0.0112775, "auxiliary_loss_mlp": 0.01030081, "balance_loss_clip": 1.01745081, "balance_loss_mlp": 1.03265762, "epoch": 0.9180820682398918, "flos": 41060008684800.0, "grad_norm": 1.8778414366016207, "language_loss": 0.72568905, "learning_rate": 6.58373311791156e-08, "loss": 0.74726737, "num_input_tokens_seen": 329391275, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 15270, "time_per_iteration": 2.8013510704040527 }, { "auxiliary_loss_clip": 0.01099127, "auxiliary_loss_mlp": 0.01031522, "balance_loss_clip": 1.01893997, "balance_loss_mlp": 1.03223503, "epoch": 0.9181421914925597, "flos": 28072915384320.0, "grad_norm": 1.9818036262092464, "language_loss": 0.79894102, "learning_rate": 6.574122220101653e-08, "loss": 0.82024759, "num_input_tokens_seen": 329412775, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.66796875, "step": 15271, "time_per_iteration": 2.5413966178894043 }, { "auxiliary_loss_clip": 0.01129271, "auxiliary_loss_mlp": 0.01029323, "balance_loss_clip": 1.01725364, "balance_loss_mlp": 1.03246593, "epoch": 0.9182023147452277, "flos": 29169411909120.0, "grad_norm": 1.80070451722674, "language_loss": 0.7278415, "learning_rate": 6.564518225163707e-08, "loss": 0.7494275, "num_input_tokens_seen": 329432440, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 15272, "time_per_iteration": 2.700070381164551 }, { "auxiliary_loss_clip": 0.01120478, "auxiliary_loss_mlp": 0.01035256, "balance_loss_clip": 1.02350163, "balance_loss_mlp": 1.03360128, "epoch": 0.9182624379978956, "flos": 24060831909120.0, "grad_norm": 1.9544905707541487, "language_loss": 0.72513217, "learning_rate": 6.55492113344056e-08, "loss": 0.74668956, "num_input_tokens_seen": 329450605, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 15273, "time_per_iteration": 3.961219072341919 }, { "auxiliary_loss_clip": 0.01121696, "auxiliary_loss_mlp": 0.01025624, "balance_loss_clip": 1.01438868, "balance_loss_mlp": 1.03144991, "epoch": 0.9183225612505637, "flos": 23293528554240.0, "grad_norm": 1.4822068577360947, "language_loss": 0.74191093, "learning_rate": 6.545330945274674e-08, "loss": 0.76338416, "num_input_tokens_seen": 329470550, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.640625, "step": 15274, "time_per_iteration": 2.5537445545196533 }, { "auxiliary_loss_clip": 0.01112369, "auxiliary_loss_mlp": 0.01038214, "balance_loss_clip": 1.02645373, "balance_loss_mlp": 1.03308535, "epoch": 0.9183826845032316, "flos": 19609237618560.0, "grad_norm": 1.7146991080798617, "language_loss": 0.68778229, "learning_rate": 6.535747661008306e-08, "loss": 0.70928812, "num_input_tokens_seen": 329489765, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.703125, "step": 15275, "time_per_iteration": 2.5323476791381836 }, { "auxiliary_loss_clip": 0.01108685, "auxiliary_loss_mlp": 0.01030071, "balance_loss_clip": 1.01825142, "balance_loss_mlp": 1.03228331, "epoch": 0.9184428077558996, "flos": 18479057114880.0, "grad_norm": 1.8031849286039692, "language_loss": 0.72577178, "learning_rate": 6.526171280983428e-08, "loss": 0.74715936, "num_input_tokens_seen": 329507040, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 15276, "time_per_iteration": 4.026610851287842 }, { "auxiliary_loss_clip": 0.0112061, "auxiliary_loss_mlp": 0.0103086, "balance_loss_clip": 1.0189923, "balance_loss_mlp": 1.03436756, "epoch": 0.9185029310085676, "flos": 20741034234240.0, "grad_norm": 1.6185884393618923, "language_loss": 0.73207176, "learning_rate": 6.516601805541855e-08, "loss": 0.75358641, "num_input_tokens_seen": 329525540, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 15277, "time_per_iteration": 2.826046943664551 }, { "auxiliary_loss_clip": 0.01048827, "auxiliary_loss_mlp": 0.01002886, "balance_loss_clip": 1.00175333, "balance_loss_mlp": 1.00090337, "epoch": 0.9185630542612355, "flos": 68209231875840.0, "grad_norm": 0.7270337568453473, "language_loss": 0.59245217, "learning_rate": 6.507039235025047e-08, "loss": 0.61296928, "num_input_tokens_seen": 329592905, "router_z_loss_clip": 0.01135254, "router_z_loss_mlp": 0.21289062, "step": 15278, "time_per_iteration": 3.2613472938537598 }, { "auxiliary_loss_clip": 0.01122088, "auxiliary_loss_mlp": 0.01027465, "balance_loss_clip": 1.0165689, "balance_loss_mlp": 1.03024697, "epoch": 0.9186231775139035, "flos": 12239470598400.0, "grad_norm": 1.7198616895355783, "language_loss": 0.64437592, "learning_rate": 6.497483569774287e-08, "loss": 0.66587138, "num_input_tokens_seen": 329610150, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.65625, "step": 15279, "time_per_iteration": 2.5305240154266357 }, { "auxiliary_loss_clip": 0.01021988, "auxiliary_loss_mlp": 0.01251033, "balance_loss_clip": 1.00397336, "balance_loss_mlp": 1.0008378, "epoch": 0.9186833007665715, "flos": 63939237770880.0, "grad_norm": 0.8591681618897828, "language_loss": 0.6022529, "learning_rate": 6.487934810130591e-08, "loss": 0.62498313, "num_input_tokens_seen": 329673650, "router_z_loss_clip": 0.01165771, "router_z_loss_mlp": 0.2109375, "step": 15280, "time_per_iteration": 3.205326795578003 }, { "auxiliary_loss_clip": 0.01120832, "auxiliary_loss_mlp": 0.01035751, "balance_loss_clip": 1.02318692, "balance_loss_mlp": 1.0331099, "epoch": 0.9187434240192395, "flos": 19974700546560.0, "grad_norm": 2.0199782208166623, "language_loss": 0.69571912, "learning_rate": 6.478392956434753e-08, "loss": 0.71728492, "num_input_tokens_seen": 329692520, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 15281, "time_per_iteration": 2.5610451698303223 }, { "auxiliary_loss_clip": 0.01120687, "auxiliary_loss_mlp": 0.01029444, "balance_loss_clip": 1.01600349, "balance_loss_mlp": 1.03342879, "epoch": 0.9188035472719074, "flos": 25227820874880.0, "grad_norm": 1.874741633327359, "language_loss": 0.84795702, "learning_rate": 6.468858009027234e-08, "loss": 0.86945832, "num_input_tokens_seen": 329713750, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69921875, "step": 15282, "time_per_iteration": 2.634202241897583 }, { "auxiliary_loss_clip": 0.01118771, "auxiliary_loss_mlp": 0.01030543, "balance_loss_clip": 1.0178771, "balance_loss_mlp": 1.0341413, "epoch": 0.9188636705245754, "flos": 18405547931520.0, "grad_norm": 1.7808752429652182, "language_loss": 0.60117555, "learning_rate": 6.459329968248384e-08, "loss": 0.62266874, "num_input_tokens_seen": 329730960, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.66796875, "step": 15283, "time_per_iteration": 2.678359270095825 }, { "auxiliary_loss_clip": 0.01116114, "auxiliary_loss_mlp": 0.0127907, "balance_loss_clip": 1.01960576, "balance_loss_mlp": 1.03497028, "epoch": 0.9189237937772433, "flos": 23769129559680.0, "grad_norm": 1.8439661455885357, "language_loss": 0.65703994, "learning_rate": 6.449808834438175e-08, "loss": 0.68099183, "num_input_tokens_seen": 329750975, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 15284, "time_per_iteration": 2.6679813861846924 }, { "auxiliary_loss_clip": 0.01100111, "auxiliary_loss_mlp": 0.01032178, "balance_loss_clip": 1.01930881, "balance_loss_mlp": 1.03193223, "epoch": 0.9189839170299113, "flos": 21214624078080.0, "grad_norm": 1.7902962215868983, "language_loss": 0.74024796, "learning_rate": 6.440294607936447e-08, "loss": 0.76157093, "num_input_tokens_seen": 329769645, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 15285, "time_per_iteration": 2.5463473796844482 }, { "auxiliary_loss_clip": 0.01100289, "auxiliary_loss_mlp": 0.01032848, "balance_loss_clip": 1.02066481, "balance_loss_mlp": 1.03253865, "epoch": 0.9190440402825792, "flos": 16727370560640.0, "grad_norm": 1.9012598571219028, "language_loss": 0.72050709, "learning_rate": 6.430787289082706e-08, "loss": 0.74183846, "num_input_tokens_seen": 329788185, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 15286, "time_per_iteration": 2.490683078765869 }, { "auxiliary_loss_clip": 0.0110373, "auxiliary_loss_mlp": 0.01031696, "balance_loss_clip": 1.01926279, "balance_loss_mlp": 1.0345757, "epoch": 0.9191041635352473, "flos": 23441193365760.0, "grad_norm": 1.705655023892017, "language_loss": 0.73517364, "learning_rate": 6.421286878216214e-08, "loss": 0.7565279, "num_input_tokens_seen": 329806780, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 15287, "time_per_iteration": 2.546398878097534 }, { "auxiliary_loss_clip": 0.01104624, "auxiliary_loss_mlp": 0.01028652, "balance_loss_clip": 1.01515126, "balance_loss_mlp": 1.03529823, "epoch": 0.9191642867879152, "flos": 18807532012800.0, "grad_norm": 2.6098556793566083, "language_loss": 0.65874511, "learning_rate": 6.4117933756761e-08, "loss": 0.68007785, "num_input_tokens_seen": 329826350, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69140625, "step": 15288, "time_per_iteration": 2.6061408519744873 }, { "auxiliary_loss_clip": 0.01106067, "auxiliary_loss_mlp": 0.01036291, "balance_loss_clip": 1.02398276, "balance_loss_mlp": 1.03707075, "epoch": 0.9192244100405832, "flos": 32160950167680.0, "grad_norm": 2.1149115088155948, "language_loss": 0.71629357, "learning_rate": 6.402306781801048e-08, "loss": 0.73771715, "num_input_tokens_seen": 329846160, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 15289, "time_per_iteration": 2.5984199047088623 }, { "auxiliary_loss_clip": 0.01132215, "auxiliary_loss_mlp": 0.01030077, "balance_loss_clip": 1.01770329, "balance_loss_mlp": 1.03583705, "epoch": 0.9192845332932512, "flos": 16357669827840.0, "grad_norm": 2.03188309144502, "language_loss": 0.74663591, "learning_rate": 6.39282709692972e-08, "loss": 0.76825881, "num_input_tokens_seen": 329862020, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 15290, "time_per_iteration": 3.086489677429199 }, { "auxiliary_loss_clip": 0.01137097, "auxiliary_loss_mlp": 0.01030075, "balance_loss_clip": 1.01826108, "balance_loss_mlp": 1.03435266, "epoch": 0.9193446565459191, "flos": 26614475464320.0, "grad_norm": 2.140651427239695, "language_loss": 0.71930408, "learning_rate": 6.38335432140038e-08, "loss": 0.7409758, "num_input_tokens_seen": 329880185, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 15291, "time_per_iteration": 2.6960957050323486 }, { "auxiliary_loss_clip": 0.01119202, "auxiliary_loss_mlp": 0.01027381, "balance_loss_clip": 1.01588964, "balance_loss_mlp": 1.03411984, "epoch": 0.9194047797985871, "flos": 22492182084480.0, "grad_norm": 1.7457136697211744, "language_loss": 0.70707256, "learning_rate": 6.373888455551069e-08, "loss": 0.72853839, "num_input_tokens_seen": 329900255, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.67578125, "step": 15292, "time_per_iteration": 2.7935431003570557 }, { "auxiliary_loss_clip": 0.01119845, "auxiliary_loss_mlp": 0.01026714, "balance_loss_clip": 1.01420927, "balance_loss_mlp": 1.0340395, "epoch": 0.9194649030512551, "flos": 25078791346560.0, "grad_norm": 1.6044788286621061, "language_loss": 0.73195457, "learning_rate": 6.364429499719581e-08, "loss": 0.75342011, "num_input_tokens_seen": 329919095, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 15293, "time_per_iteration": 2.8619298934936523 }, { "auxiliary_loss_clip": 0.01100822, "auxiliary_loss_mlp": 0.01028935, "balance_loss_clip": 1.01743102, "balance_loss_mlp": 1.0337081, "epoch": 0.9195250263039231, "flos": 11911139354880.0, "grad_norm": 2.060051882591514, "language_loss": 0.77819359, "learning_rate": 6.35497745424356e-08, "loss": 0.79949123, "num_input_tokens_seen": 329936505, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 15294, "time_per_iteration": 2.716977119445801 }, { "auxiliary_loss_clip": 0.01123709, "auxiliary_loss_mlp": 0.01033327, "balance_loss_clip": 1.01942086, "balance_loss_mlp": 1.03456414, "epoch": 0.919585149556591, "flos": 21834154880640.0, "grad_norm": 1.5058631583384334, "language_loss": 0.77151871, "learning_rate": 6.345532319460267e-08, "loss": 0.79308909, "num_input_tokens_seen": 329956795, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7109375, "step": 15295, "time_per_iteration": 2.9060776233673096 }, { "auxiliary_loss_clip": 0.01100281, "auxiliary_loss_mlp": 0.01026702, "balance_loss_clip": 1.01573443, "balance_loss_mlp": 1.03440189, "epoch": 0.919645272809259, "flos": 28184059042560.0, "grad_norm": 2.2076061219064136, "language_loss": 0.71528953, "learning_rate": 6.33609409570679e-08, "loss": 0.73655939, "num_input_tokens_seen": 329977195, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.65625, "step": 15296, "time_per_iteration": 3.3018221855163574 }, { "auxiliary_loss_clip": 0.01103385, "auxiliary_loss_mlp": 0.01041245, "balance_loss_clip": 1.02943695, "balance_loss_mlp": 1.03507471, "epoch": 0.9197053960619269, "flos": 18332828847360.0, "grad_norm": 1.9875524792500694, "language_loss": 0.75276983, "learning_rate": 6.326662783319925e-08, "loss": 0.77421618, "num_input_tokens_seen": 329992095, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 15297, "time_per_iteration": 2.9006199836730957 }, { "auxiliary_loss_clip": 0.01110963, "auxiliary_loss_mlp": 0.01272227, "balance_loss_clip": 1.01272416, "balance_loss_mlp": 1.0341363, "epoch": 0.9197655193145949, "flos": 28183448511360.0, "grad_norm": 1.5869450088121604, "language_loss": 0.73487127, "learning_rate": 6.317238382636314e-08, "loss": 0.75870317, "num_input_tokens_seen": 330011490, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.67578125, "step": 15298, "time_per_iteration": 2.9950897693634033 }, { "auxiliary_loss_clip": 0.01121087, "auxiliary_loss_mlp": 0.01036035, "balance_loss_clip": 1.02364874, "balance_loss_mlp": 1.03383183, "epoch": 0.9198256425672628, "flos": 17306321973120.0, "grad_norm": 3.454802102819871, "language_loss": 0.79335654, "learning_rate": 6.307820893992244e-08, "loss": 0.8149277, "num_input_tokens_seen": 330027885, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 15299, "time_per_iteration": 2.8926444053649902 }, { "auxiliary_loss_clip": 0.01129463, "auxiliary_loss_mlp": 0.01024353, "balance_loss_clip": 1.01289082, "balance_loss_mlp": 1.03506863, "epoch": 0.9198857658199309, "flos": 17858520731520.0, "grad_norm": 2.0577825236684633, "language_loss": 0.64082289, "learning_rate": 6.298410317723801e-08, "loss": 0.66236109, "num_input_tokens_seen": 330046230, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.67578125, "step": 15300, "time_per_iteration": 2.889000177383423 }, { "auxiliary_loss_clip": 0.01114108, "auxiliary_loss_mlp": 0.01278458, "balance_loss_clip": 1.02121782, "balance_loss_mlp": 1.03295279, "epoch": 0.9199458890725988, "flos": 22127545169280.0, "grad_norm": 1.6593226589696883, "language_loss": 0.6937108, "learning_rate": 6.289006654166828e-08, "loss": 0.71763647, "num_input_tokens_seen": 330065535, "router_z_loss_clip": 0.10546875, "router_z_loss_mlp": 0.6328125, "step": 15301, "time_per_iteration": 2.9662444591522217 }, { "auxiliary_loss_clip": 0.01114539, "auxiliary_loss_mlp": 0.0103621, "balance_loss_clip": 1.02297163, "balance_loss_mlp": 1.03489935, "epoch": 0.9200060123252668, "flos": 16034043265920.0, "grad_norm": 1.70452951758533, "language_loss": 0.71337014, "learning_rate": 6.279609903656946e-08, "loss": 0.73487771, "num_input_tokens_seen": 330082920, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 15302, "time_per_iteration": 4.4852752685546875 }, { "auxiliary_loss_clip": 0.01103989, "auxiliary_loss_mlp": 0.01032881, "balance_loss_clip": 1.02099538, "balance_loss_mlp": 1.03451204, "epoch": 0.9200661355779348, "flos": 26864521015680.0, "grad_norm": 1.733575232958931, "language_loss": 0.76487625, "learning_rate": 6.270220066529464e-08, "loss": 0.78624499, "num_input_tokens_seen": 330101165, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 15303, "time_per_iteration": 2.9711666107177734 }, { "auxiliary_loss_clip": 0.01121315, "auxiliary_loss_mlp": 0.01030543, "balance_loss_clip": 1.01884305, "balance_loss_mlp": 1.03532231, "epoch": 0.9201262588306027, "flos": 12786749193600.0, "grad_norm": 2.726777905765988, "language_loss": 0.87461478, "learning_rate": 6.260837143119468e-08, "loss": 0.89613342, "num_input_tokens_seen": 330118775, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 15304, "time_per_iteration": 2.8633008003234863 }, { "auxiliary_loss_clip": 0.01152314, "auxiliary_loss_mlp": 0.01033163, "balance_loss_clip": 1.018471, "balance_loss_mlp": 1.0359869, "epoch": 0.9201863820832707, "flos": 20631614428800.0, "grad_norm": 2.5075825968790015, "language_loss": 0.77098751, "learning_rate": 6.251461133761892e-08, "loss": 0.79284227, "num_input_tokens_seen": 330135570, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.71484375, "step": 15305, "time_per_iteration": 2.986652374267578 }, { "auxiliary_loss_clip": 0.0111214, "auxiliary_loss_mlp": 0.01035406, "balance_loss_clip": 1.02298999, "balance_loss_mlp": 1.03362548, "epoch": 0.9202465053359387, "flos": 26395815421440.0, "grad_norm": 1.9384019592189476, "language_loss": 0.81306779, "learning_rate": 6.242092038791246e-08, "loss": 0.83454317, "num_input_tokens_seen": 330152840, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 15306, "time_per_iteration": 3.003735065460205 }, { "auxiliary_loss_clip": 0.01126046, "auxiliary_loss_mlp": 0.01032049, "balance_loss_clip": 1.02086139, "balance_loss_mlp": 1.03291595, "epoch": 0.9203066285886067, "flos": 10488179093760.0, "grad_norm": 1.9491061487523271, "language_loss": 0.72109157, "learning_rate": 6.232729858541952e-08, "loss": 0.74267244, "num_input_tokens_seen": 330168605, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6640625, "step": 15307, "time_per_iteration": 2.6938092708587646 }, { "auxiliary_loss_clip": 0.01112629, "auxiliary_loss_mlp": 0.01031328, "balance_loss_clip": 1.01801217, "balance_loss_mlp": 1.03308821, "epoch": 0.9203667518412746, "flos": 19390721230080.0, "grad_norm": 1.7872364067493043, "language_loss": 0.78660631, "learning_rate": 6.223374593348096e-08, "loss": 0.80804592, "num_input_tokens_seen": 330186160, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.70703125, "step": 15308, "time_per_iteration": 2.640220880508423 }, { "auxiliary_loss_clip": 0.01118994, "auxiliary_loss_mlp": 0.01028853, "balance_loss_clip": 1.01597881, "balance_loss_mlp": 1.03301954, "epoch": 0.9204268750939426, "flos": 15924982596480.0, "grad_norm": 1.889360306057622, "language_loss": 0.77909589, "learning_rate": 6.214026243543568e-08, "loss": 0.80057436, "num_input_tokens_seen": 330201780, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 15309, "time_per_iteration": 4.10058331489563 }, { "auxiliary_loss_clip": 0.01136692, "auxiliary_loss_mlp": 0.01028069, "balance_loss_clip": 1.01618946, "balance_loss_mlp": 1.03216779, "epoch": 0.9204869983466105, "flos": 16471758401280.0, "grad_norm": 2.224432534659133, "language_loss": 0.66556489, "learning_rate": 6.204684809461924e-08, "loss": 0.68721247, "num_input_tokens_seen": 330219165, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.68359375, "step": 15310, "time_per_iteration": 2.788050413131714 }, { "auxiliary_loss_clip": 0.01145932, "auxiliary_loss_mlp": 0.0103151, "balance_loss_clip": 1.01886785, "balance_loss_mlp": 1.03212667, "epoch": 0.9205471215992785, "flos": 21539220307200.0, "grad_norm": 1.6610183276654238, "language_loss": 0.66237628, "learning_rate": 6.195350291436585e-08, "loss": 0.6841507, "num_input_tokens_seen": 330238975, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 15311, "time_per_iteration": 2.6492817401885986 }, { "auxiliary_loss_clip": 0.01111763, "auxiliary_loss_mlp": 0.01034624, "balance_loss_clip": 1.022542, "balance_loss_mlp": 1.03344691, "epoch": 0.9206072448519464, "flos": 25005892694400.0, "grad_norm": 1.5715958561305126, "language_loss": 0.76044375, "learning_rate": 6.186022689800707e-08, "loss": 0.78190768, "num_input_tokens_seen": 330259755, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 15312, "time_per_iteration": 2.561124086380005 }, { "auxiliary_loss_clip": 0.01109282, "auxiliary_loss_mlp": 0.01034884, "balance_loss_clip": 1.02339816, "balance_loss_mlp": 1.03451383, "epoch": 0.9206673681046145, "flos": 20522661500160.0, "grad_norm": 1.9283185290356177, "language_loss": 0.79341519, "learning_rate": 6.176702004887091e-08, "loss": 0.81485683, "num_input_tokens_seen": 330277660, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66015625, "step": 15313, "time_per_iteration": 2.5420491695404053 }, { "auxiliary_loss_clip": 0.01113075, "auxiliary_loss_mlp": 0.01029981, "balance_loss_clip": 1.01811337, "balance_loss_mlp": 1.03438842, "epoch": 0.9207274913572824, "flos": 20883455660160.0, "grad_norm": 1.8247318056151727, "language_loss": 0.78061372, "learning_rate": 6.167388237028426e-08, "loss": 0.80204427, "num_input_tokens_seen": 330295455, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69921875, "step": 15314, "time_per_iteration": 2.4949212074279785 }, { "auxiliary_loss_clip": 0.01124326, "auxiliary_loss_mlp": 0.01032998, "balance_loss_clip": 1.01900303, "balance_loss_mlp": 1.03465366, "epoch": 0.9207876146099504, "flos": 22708256348160.0, "grad_norm": 1.87252236716364, "language_loss": 0.78953755, "learning_rate": 6.15808138655709e-08, "loss": 0.81111079, "num_input_tokens_seen": 330315310, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 15315, "time_per_iteration": 4.002120494842529 }, { "auxiliary_loss_clip": 0.01123644, "auxiliary_loss_mlp": 0.01029533, "balance_loss_clip": 1.01710534, "balance_loss_mlp": 1.03440201, "epoch": 0.9208477378626184, "flos": 18507354053760.0, "grad_norm": 1.9565082838291035, "language_loss": 0.76174301, "learning_rate": 6.148781453805197e-08, "loss": 0.78327477, "num_input_tokens_seen": 330333260, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 15316, "time_per_iteration": 2.443854808807373 }, { "auxiliary_loss_clip": 0.01101866, "auxiliary_loss_mlp": 0.01032858, "balance_loss_clip": 1.02024007, "balance_loss_mlp": 1.03162003, "epoch": 0.9209078611152863, "flos": 22999635475200.0, "grad_norm": 1.5843056854047481, "language_loss": 0.69121152, "learning_rate": 6.139488439104612e-08, "loss": 0.71255875, "num_input_tokens_seen": 330352465, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 15317, "time_per_iteration": 4.261960029602051 }, { "auxiliary_loss_clip": 0.01112037, "auxiliary_loss_mlp": 0.01031285, "balance_loss_clip": 1.01896441, "balance_loss_mlp": 1.03481829, "epoch": 0.9209679843679544, "flos": 24061514267520.0, "grad_norm": 1.669746207147785, "language_loss": 0.83420944, "learning_rate": 6.130202342787094e-08, "loss": 0.85564274, "num_input_tokens_seen": 330372685, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 15318, "time_per_iteration": 2.582181930541992 }, { "auxiliary_loss_clip": 0.01121239, "auxiliary_loss_mlp": 0.01027487, "balance_loss_clip": 1.015167, "balance_loss_mlp": 1.03511357, "epoch": 0.9210281076206223, "flos": 13553370190080.0, "grad_norm": 1.901075418597308, "language_loss": 0.85654819, "learning_rate": 6.120923165183888e-08, "loss": 0.87803549, "num_input_tokens_seen": 330388860, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 15319, "time_per_iteration": 2.479900360107422 }, { "auxiliary_loss_clip": 0.01022169, "auxiliary_loss_mlp": 0.01001261, "balance_loss_clip": 1.00008094, "balance_loss_mlp": 1.00096381, "epoch": 0.9210882308732903, "flos": 71426289674880.0, "grad_norm": 0.7726827979429585, "language_loss": 0.56249976, "learning_rate": 6.111650906626242e-08, "loss": 0.58273411, "num_input_tokens_seen": 330448735, "router_z_loss_clip": 0.01177979, "router_z_loss_mlp": 0.21289062, "step": 15320, "time_per_iteration": 3.1212964057922363 }, { "auxiliary_loss_clip": 0.01142869, "auxiliary_loss_mlp": 0.01037151, "balance_loss_clip": 1.02367401, "balance_loss_mlp": 1.0347178, "epoch": 0.9211483541259582, "flos": 18509113820160.0, "grad_norm": 2.0027143531880265, "language_loss": 0.63543338, "learning_rate": 6.102385567445023e-08, "loss": 0.65723354, "num_input_tokens_seen": 330465600, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 15321, "time_per_iteration": 2.551299810409546 }, { "auxiliary_loss_clip": 0.0112491, "auxiliary_loss_mlp": 0.01026125, "balance_loss_clip": 1.01480651, "balance_loss_mlp": 1.03353822, "epoch": 0.9212084773786262, "flos": 23258228463360.0, "grad_norm": 1.5300727206203995, "language_loss": 0.71414208, "learning_rate": 6.093127147970944e-08, "loss": 0.73565245, "num_input_tokens_seen": 330485770, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.64453125, "step": 15322, "time_per_iteration": 2.642486810684204 }, { "auxiliary_loss_clip": 0.0113767, "auxiliary_loss_mlp": 0.01032716, "balance_loss_clip": 1.02155221, "balance_loss_mlp": 1.03363371, "epoch": 0.9212686006312941, "flos": 16289511770880.0, "grad_norm": 1.8629715781882736, "language_loss": 0.70185697, "learning_rate": 6.08387564853432e-08, "loss": 0.72356087, "num_input_tokens_seen": 330504255, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.68359375, "step": 15323, "time_per_iteration": 2.7704451084136963 }, { "auxiliary_loss_clip": 0.01121631, "auxiliary_loss_mlp": 0.01036879, "balance_loss_clip": 1.02330685, "balance_loss_mlp": 1.03259742, "epoch": 0.9213287238839621, "flos": 19785773986560.0, "grad_norm": 1.9530714890471148, "language_loss": 0.74605072, "learning_rate": 6.074631069465396e-08, "loss": 0.76763582, "num_input_tokens_seen": 330520705, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 15324, "time_per_iteration": 2.6824629306793213 }, { "auxiliary_loss_clip": 0.0111335, "auxiliary_loss_mlp": 0.0103911, "balance_loss_clip": 1.02689695, "balance_loss_mlp": 1.0349617, "epoch": 0.92138884713663, "flos": 21030402199680.0, "grad_norm": 2.5272818147871217, "language_loss": 0.71100736, "learning_rate": 6.065393411094044e-08, "loss": 0.73253191, "num_input_tokens_seen": 330539245, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 15325, "time_per_iteration": 2.6507408618927 }, { "auxiliary_loss_clip": 0.01118984, "auxiliary_loss_mlp": 0.01030478, "balance_loss_clip": 1.01860499, "balance_loss_mlp": 1.03323925, "epoch": 0.9214489703892981, "flos": 28587264186240.0, "grad_norm": 1.5805067918753402, "language_loss": 0.78607786, "learning_rate": 6.056162673749932e-08, "loss": 0.80757248, "num_input_tokens_seen": 330561815, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 15326, "time_per_iteration": 2.651923894882202 }, { "auxiliary_loss_clip": 0.01113703, "auxiliary_loss_mlp": 0.01033745, "balance_loss_clip": 1.02095425, "balance_loss_mlp": 1.03456926, "epoch": 0.921509093641966, "flos": 16361476669440.0, "grad_norm": 2.1301899046607335, "language_loss": 0.71137363, "learning_rate": 6.046938857762484e-08, "loss": 0.73284811, "num_input_tokens_seen": 330579760, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 15327, "time_per_iteration": 2.605208158493042 }, { "auxiliary_loss_clip": 0.01116432, "auxiliary_loss_mlp": 0.01041961, "balance_loss_clip": 1.02887213, "balance_loss_mlp": 1.03648853, "epoch": 0.921569216894634, "flos": 26830837036800.0, "grad_norm": 1.7970857798367912, "language_loss": 0.78170896, "learning_rate": 6.037721963460885e-08, "loss": 0.80329293, "num_input_tokens_seen": 330598545, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 15328, "time_per_iteration": 2.6436843872070312 }, { "auxiliary_loss_clip": 0.01118814, "auxiliary_loss_mlp": 0.0102953, "balance_loss_clip": 1.01790738, "balance_loss_mlp": 1.03313279, "epoch": 0.921629340147302, "flos": 24645134448000.0, "grad_norm": 2.299895648500467, "language_loss": 0.71753371, "learning_rate": 6.02851199117409e-08, "loss": 0.73901719, "num_input_tokens_seen": 330616700, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 15329, "time_per_iteration": 2.729811906814575 }, { "auxiliary_loss_clip": 0.01109416, "auxiliary_loss_mlp": 0.01027735, "balance_loss_clip": 1.01635027, "balance_loss_mlp": 1.03311777, "epoch": 0.9216894633999699, "flos": 15086504442240.0, "grad_norm": 1.770451150048901, "language_loss": 0.86580825, "learning_rate": 6.019308941230727e-08, "loss": 0.88717973, "num_input_tokens_seen": 330633355, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.67578125, "step": 15330, "time_per_iteration": 2.746528148651123 }, { "auxiliary_loss_clip": 0.01130961, "auxiliary_loss_mlp": 0.01030861, "balance_loss_clip": 1.01877964, "balance_loss_mlp": 1.03419256, "epoch": 0.921749586652638, "flos": 19204524103680.0, "grad_norm": 2.1222366624088718, "language_loss": 0.76103246, "learning_rate": 6.010112813959245e-08, "loss": 0.78265071, "num_input_tokens_seen": 330651470, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69921875, "step": 15331, "time_per_iteration": 2.80561900138855 }, { "auxiliary_loss_clip": 0.01109749, "auxiliary_loss_mlp": 0.0102655, "balance_loss_clip": 1.01450968, "balance_loss_mlp": 1.03241622, "epoch": 0.9218097099053059, "flos": 20522446018560.0, "grad_norm": 2.306875373527835, "language_loss": 0.75399697, "learning_rate": 6.000923609687847e-08, "loss": 0.77535993, "num_input_tokens_seen": 330669170, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.68359375, "step": 15332, "time_per_iteration": 2.6079044342041016 }, { "auxiliary_loss_clip": 0.01138056, "auxiliary_loss_mlp": 0.01030805, "balance_loss_clip": 1.01953375, "balance_loss_mlp": 1.03362083, "epoch": 0.9218698331579739, "flos": 17348625216000.0, "grad_norm": 1.7042759460040418, "language_loss": 0.74776191, "learning_rate": 5.991741328744449e-08, "loss": 0.76945055, "num_input_tokens_seen": 330686635, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6875, "step": 15333, "time_per_iteration": 3.068502902984619 }, { "auxiliary_loss_clip": 0.01031233, "auxiliary_loss_mlp": 0.01000804, "balance_loss_clip": 0.99973077, "balance_loss_mlp": 1.00132358, "epoch": 0.9219299564106418, "flos": 61958332575360.0, "grad_norm": 0.808216723605205, "language_loss": 0.52931952, "learning_rate": 5.982565971456744e-08, "loss": 0.54963982, "num_input_tokens_seen": 330749160, "router_z_loss_clip": 0.01074219, "router_z_loss_mlp": 0.2109375, "step": 15334, "time_per_iteration": 3.646458625793457 }, { "auxiliary_loss_clip": 0.01118696, "auxiliary_loss_mlp": 0.01031567, "balance_loss_clip": 1.02000976, "balance_loss_mlp": 1.03452897, "epoch": 0.9219900796633098, "flos": 15701761526400.0, "grad_norm": 1.8000798445074402, "language_loss": 0.62630177, "learning_rate": 5.973397538152225e-08, "loss": 0.64780444, "num_input_tokens_seen": 330766840, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66015625, "step": 15335, "time_per_iteration": 3.4556429386138916 }, { "auxiliary_loss_clip": 0.01137179, "auxiliary_loss_mlp": 0.01026974, "balance_loss_clip": 1.01468349, "balance_loss_mlp": 1.03345144, "epoch": 0.9220502029159777, "flos": 24932670819840.0, "grad_norm": 1.7501220442656762, "language_loss": 0.71373606, "learning_rate": 5.96423602915801e-08, "loss": 0.73537755, "num_input_tokens_seen": 330785585, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.67578125, "step": 15336, "time_per_iteration": 3.3754873275756836 }, { "auxiliary_loss_clip": 0.01116119, "auxiliary_loss_mlp": 0.01033416, "balance_loss_clip": 1.0227344, "balance_loss_mlp": 1.03268945, "epoch": 0.9221103261686457, "flos": 15667215621120.0, "grad_norm": 2.0310977017485206, "language_loss": 0.71820056, "learning_rate": 5.955081444801102e-08, "loss": 0.73969591, "num_input_tokens_seen": 330800750, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.65625, "step": 15337, "time_per_iteration": 3.5853686332702637 }, { "auxiliary_loss_clip": 0.01030917, "auxiliary_loss_mlp": 0.01001891, "balance_loss_clip": 1.00083637, "balance_loss_mlp": 1.00080287, "epoch": 0.9221704494213137, "flos": 67516299630720.0, "grad_norm": 0.8858548052434185, "language_loss": 0.65375948, "learning_rate": 5.9459337854081745e-08, "loss": 0.67408752, "num_input_tokens_seen": 330863640, "router_z_loss_clip": 0.01055908, "router_z_loss_mlp": 0.2109375, "step": 15338, "time_per_iteration": 3.2297284603118896 }, { "auxiliary_loss_clip": 0.01113211, "auxiliary_loss_mlp": 0.01030174, "balance_loss_clip": 1.01771021, "balance_loss_mlp": 1.03500772, "epoch": 0.9222305726739817, "flos": 30226945155840.0, "grad_norm": 2.6666822591892516, "language_loss": 0.66957057, "learning_rate": 5.936793051305766e-08, "loss": 0.6910044, "num_input_tokens_seen": 330884675, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 15339, "time_per_iteration": 2.6413216590881348 }, { "auxiliary_loss_clip": 0.01103301, "auxiliary_loss_mlp": 0.01029875, "balance_loss_clip": 1.0177753, "balance_loss_mlp": 1.03429556, "epoch": 0.9222906959266496, "flos": 25337204766720.0, "grad_norm": 2.0848651826603053, "language_loss": 0.71730238, "learning_rate": 5.927659242819971e-08, "loss": 0.73863411, "num_input_tokens_seen": 330904125, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 15340, "time_per_iteration": 2.6073062419891357 }, { "auxiliary_loss_clip": 0.01128195, "auxiliary_loss_mlp": 0.01028346, "balance_loss_clip": 1.01642525, "balance_loss_mlp": 1.03463244, "epoch": 0.9223508191793176, "flos": 27599864244480.0, "grad_norm": 1.395050601451348, "language_loss": 0.69876873, "learning_rate": 5.918532360276818e-08, "loss": 0.72033417, "num_input_tokens_seen": 330925140, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66796875, "step": 15341, "time_per_iteration": 2.60636305809021 }, { "auxiliary_loss_clip": 0.0113337, "auxiliary_loss_mlp": 0.01027652, "balance_loss_clip": 1.01653528, "balance_loss_mlp": 1.03324103, "epoch": 0.9224109424319856, "flos": 27307587277440.0, "grad_norm": 1.4899749632942065, "language_loss": 0.67351174, "learning_rate": 5.9094124040020235e-08, "loss": 0.69512206, "num_input_tokens_seen": 330946625, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.64453125, "step": 15342, "time_per_iteration": 2.5580272674560547 }, { "auxiliary_loss_clip": 0.01137804, "auxiliary_loss_mlp": 0.0103346, "balance_loss_clip": 1.02229643, "balance_loss_mlp": 1.03348804, "epoch": 0.9224710656846535, "flos": 48208314401280.0, "grad_norm": 2.225326269348152, "language_loss": 0.6983524, "learning_rate": 5.9002993743210164e-08, "loss": 0.72006506, "num_input_tokens_seen": 330967795, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6875, "step": 15343, "time_per_iteration": 4.3366289138793945 }, { "auxiliary_loss_clip": 0.01039842, "auxiliary_loss_mlp": 0.01001801, "balance_loss_clip": 1.00066304, "balance_loss_mlp": 1.00079465, "epoch": 0.9225311889373216, "flos": 66722171794560.0, "grad_norm": 0.721136009428016, "language_loss": 0.52008754, "learning_rate": 5.891193271559047e-08, "loss": 0.54050398, "num_input_tokens_seen": 331040850, "router_z_loss_clip": 0.01141357, "router_z_loss_mlp": 0.2109375, "step": 15344, "time_per_iteration": 3.4247491359710693 }, { "auxiliary_loss_clip": 0.01096693, "auxiliary_loss_mlp": 0.01031605, "balance_loss_clip": 1.02055454, "balance_loss_mlp": 1.03312325, "epoch": 0.9225913121899895, "flos": 22271295398400.0, "grad_norm": 1.5037294632339664, "language_loss": 0.70086277, "learning_rate": 5.882094096041079e-08, "loss": 0.7221458, "num_input_tokens_seen": 331060595, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.63671875, "step": 15345, "time_per_iteration": 2.5676302909851074 }, { "auxiliary_loss_clip": 0.01120534, "auxiliary_loss_mlp": 0.01035457, "balance_loss_clip": 1.02329135, "balance_loss_mlp": 1.03366923, "epoch": 0.9226514354426575, "flos": 20082719721600.0, "grad_norm": 2.010583664659116, "language_loss": 0.77610862, "learning_rate": 5.873001848091874e-08, "loss": 0.79766858, "num_input_tokens_seen": 331080195, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 15346, "time_per_iteration": 2.618114948272705 }, { "auxiliary_loss_clip": 0.0112458, "auxiliary_loss_mlp": 0.01037449, "balance_loss_clip": 1.0248543, "balance_loss_mlp": 1.0344038, "epoch": 0.9227115586953254, "flos": 16070851728000.0, "grad_norm": 1.746423164864603, "language_loss": 0.76473337, "learning_rate": 5.863916528035839e-08, "loss": 0.78635365, "num_input_tokens_seen": 331097645, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 15347, "time_per_iteration": 2.5792317390441895 }, { "auxiliary_loss_clip": 0.01107632, "auxiliary_loss_mlp": 0.01029371, "balance_loss_clip": 1.01814187, "balance_loss_mlp": 1.03430164, "epoch": 0.9227716819479934, "flos": 20446027833600.0, "grad_norm": 1.576606734381404, "language_loss": 0.76987958, "learning_rate": 5.854838136197249e-08, "loss": 0.79124969, "num_input_tokens_seen": 331116830, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.64453125, "step": 15348, "time_per_iteration": 2.5844812393188477 }, { "auxiliary_loss_clip": 0.01120841, "auxiliary_loss_mlp": 0.01033771, "balance_loss_clip": 1.02095008, "balance_loss_mlp": 1.03367698, "epoch": 0.9228318052006613, "flos": 25007401065600.0, "grad_norm": 1.6979578287075123, "language_loss": 0.67610276, "learning_rate": 5.8457666729001096e-08, "loss": 0.69764888, "num_input_tokens_seen": 331137235, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 15349, "time_per_iteration": 2.5683064460754395 }, { "auxiliary_loss_clip": 0.01128596, "auxiliary_loss_mlp": 0.01028605, "balance_loss_clip": 1.01673198, "balance_loss_mlp": 1.03176284, "epoch": 0.9228919284533293, "flos": 12677257560960.0, "grad_norm": 2.175028963536656, "language_loss": 0.87256902, "learning_rate": 5.836702138468119e-08, "loss": 0.89414102, "num_input_tokens_seen": 331153155, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 15350, "time_per_iteration": 4.230589389801025 }, { "auxiliary_loss_clip": 0.01039478, "auxiliary_loss_mlp": 0.0100145, "balance_loss_clip": 1.00034738, "balance_loss_mlp": 1.00084794, "epoch": 0.9229520517059973, "flos": 69986162712960.0, "grad_norm": 0.7816188794508666, "language_loss": 0.60367262, "learning_rate": 5.8276445332247514e-08, "loss": 0.62408185, "num_input_tokens_seen": 331214895, "router_z_loss_clip": 0.01104736, "router_z_loss_mlp": 0.2109375, "step": 15351, "time_per_iteration": 3.148738145828247 }, { "auxiliary_loss_clip": 0.01119302, "auxiliary_loss_mlp": 0.01024403, "balance_loss_clip": 1.01252961, "balance_loss_mlp": 1.03135347, "epoch": 0.9230121749586653, "flos": 14793832425600.0, "grad_norm": 1.978223397963439, "language_loss": 0.77868801, "learning_rate": 5.818593857493348e-08, "loss": 0.80012506, "num_input_tokens_seen": 331232185, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.703125, "step": 15352, "time_per_iteration": 2.5919954776763916 }, { "auxiliary_loss_clip": 0.01132139, "auxiliary_loss_mlp": 0.01034659, "balance_loss_clip": 1.02214229, "balance_loss_mlp": 1.03349245, "epoch": 0.9230722982113332, "flos": 22967208472320.0, "grad_norm": 2.866563557450188, "language_loss": 0.5999766, "learning_rate": 5.809550111596784e-08, "loss": 0.62164456, "num_input_tokens_seen": 331251065, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 15353, "time_per_iteration": 2.6724658012390137 }, { "auxiliary_loss_clip": 0.01118999, "auxiliary_loss_mlp": 0.01028933, "balance_loss_clip": 1.01730466, "balance_loss_mlp": 1.034266, "epoch": 0.9231324214640012, "flos": 18551452976640.0, "grad_norm": 1.801825165618182, "language_loss": 0.74568689, "learning_rate": 5.8005132958578674e-08, "loss": 0.76716626, "num_input_tokens_seen": 331269110, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 15354, "time_per_iteration": 2.5946743488311768 }, { "auxiliary_loss_clip": 0.01106125, "auxiliary_loss_mlp": 0.0103478, "balance_loss_clip": 1.02129114, "balance_loss_mlp": 1.03462172, "epoch": 0.9231925447166691, "flos": 22082727974400.0, "grad_norm": 2.222894707774654, "language_loss": 0.6494872, "learning_rate": 5.791483410599074e-08, "loss": 0.67089623, "num_input_tokens_seen": 331286555, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 15355, "time_per_iteration": 2.5687694549560547 }, { "auxiliary_loss_clip": 0.0111777, "auxiliary_loss_mlp": 0.01032752, "balance_loss_clip": 1.02111733, "balance_loss_mlp": 1.03410053, "epoch": 0.9232526679693371, "flos": 26541145848960.0, "grad_norm": 1.463344293311575, "language_loss": 0.74434775, "learning_rate": 5.782460456142724e-08, "loss": 0.76585299, "num_input_tokens_seen": 331307660, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.65625, "step": 15356, "time_per_iteration": 2.6307804584503174 }, { "auxiliary_loss_clip": 0.01121853, "auxiliary_loss_mlp": 0.01031345, "balance_loss_clip": 1.01901281, "balance_loss_mlp": 1.03468835, "epoch": 0.9233127912220052, "flos": 14756664827520.0, "grad_norm": 1.8937240561101807, "language_loss": 0.6117329, "learning_rate": 5.7734444328107366e-08, "loss": 0.6332649, "num_input_tokens_seen": 331324885, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 15357, "time_per_iteration": 4.854215383529663 }, { "auxiliary_loss_clip": 0.01131093, "auxiliary_loss_mlp": 0.01031166, "balance_loss_clip": 1.01851153, "balance_loss_mlp": 1.03354251, "epoch": 0.9233729144746731, "flos": 25261792162560.0, "grad_norm": 1.5005052587512229, "language_loss": 0.700562, "learning_rate": 5.7644353409249004e-08, "loss": 0.72218454, "num_input_tokens_seen": 331345885, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 15358, "time_per_iteration": 4.834606647491455 }, { "auxiliary_loss_clip": 0.01101643, "auxiliary_loss_mlp": 0.01033451, "balance_loss_clip": 1.02120197, "balance_loss_mlp": 1.0331974, "epoch": 0.9234330377273411, "flos": 27849837968640.0, "grad_norm": 2.187469970654424, "language_loss": 0.73491609, "learning_rate": 5.7554331808067794e-08, "loss": 0.75626707, "num_input_tokens_seen": 331364320, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 15359, "time_per_iteration": 2.6204378604888916 }, { "auxiliary_loss_clip": 0.01156523, "auxiliary_loss_mlp": 0.01030139, "balance_loss_clip": 1.01820016, "balance_loss_mlp": 1.03478837, "epoch": 0.923493160980009, "flos": 24608361899520.0, "grad_norm": 1.9070455561566324, "language_loss": 0.64831972, "learning_rate": 5.7464379527775386e-08, "loss": 0.6701864, "num_input_tokens_seen": 331384135, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 15360, "time_per_iteration": 2.7212469577789307 }, { "auxiliary_loss_clip": 0.01031328, "auxiliary_loss_mlp": 0.01248112, "balance_loss_clip": 1.00093174, "balance_loss_mlp": 1.00122178, "epoch": 0.923553284232677, "flos": 67700916558720.0, "grad_norm": 0.7686388214948138, "language_loss": 0.55105865, "learning_rate": 5.737449657158278e-08, "loss": 0.57385302, "num_input_tokens_seen": 331440645, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.21289062, "step": 15361, "time_per_iteration": 3.1653740406036377 }, { "auxiliary_loss_clip": 0.01131527, "auxiliary_loss_mlp": 0.01031772, "balance_loss_clip": 1.0187006, "balance_loss_mlp": 1.03355122, "epoch": 0.9236134074853449, "flos": 18807244704000.0, "grad_norm": 1.7117879843819808, "language_loss": 0.69564432, "learning_rate": 5.7284682942697395e-08, "loss": 0.71727729, "num_input_tokens_seen": 331459580, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 15362, "time_per_iteration": 2.7297492027282715 }, { "auxiliary_loss_clip": 0.01114677, "auxiliary_loss_mlp": 0.01032937, "balance_loss_clip": 1.01872766, "balance_loss_mlp": 1.03451014, "epoch": 0.923673530738013, "flos": 27782362270080.0, "grad_norm": 1.4846198470460006, "language_loss": 0.75885332, "learning_rate": 5.719493864432423e-08, "loss": 0.78032953, "num_input_tokens_seen": 331481560, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7109375, "step": 15363, "time_per_iteration": 2.749046802520752 }, { "auxiliary_loss_clip": 0.01101938, "auxiliary_loss_mlp": 0.01029555, "balance_loss_clip": 1.01746106, "balance_loss_mlp": 1.03323567, "epoch": 0.9237336539906809, "flos": 26797117144320.0, "grad_norm": 1.9480298424694287, "language_loss": 0.83200443, "learning_rate": 5.710526367966606e-08, "loss": 0.85331935, "num_input_tokens_seen": 331499090, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 15364, "time_per_iteration": 2.601651430130005 }, { "auxiliary_loss_clip": 0.01125687, "auxiliary_loss_mlp": 0.01024591, "balance_loss_clip": 1.01352262, "balance_loss_mlp": 1.03257036, "epoch": 0.9237937772433489, "flos": 23587708942080.0, "grad_norm": 1.5282994499857128, "language_loss": 0.67880851, "learning_rate": 5.701565805192365e-08, "loss": 0.7003113, "num_input_tokens_seen": 331519420, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.66796875, "step": 15365, "time_per_iteration": 2.656630754470825 }, { "auxiliary_loss_clip": 0.01104811, "auxiliary_loss_mlp": 0.01028937, "balance_loss_clip": 1.01637864, "balance_loss_mlp": 1.03570521, "epoch": 0.9238539004960168, "flos": 26140562398080.0, "grad_norm": 2.174507970129018, "language_loss": 0.62427294, "learning_rate": 5.692612176429423e-08, "loss": 0.64561045, "num_input_tokens_seen": 331538720, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 15366, "time_per_iteration": 2.548827886581421 }, { "auxiliary_loss_clip": 0.01130327, "auxiliary_loss_mlp": 0.01028051, "balance_loss_clip": 1.01515222, "balance_loss_mlp": 1.03572416, "epoch": 0.9239140237486848, "flos": 21068000760960.0, "grad_norm": 1.4245906926629508, "language_loss": 0.74199384, "learning_rate": 5.6836654819973464e-08, "loss": 0.76357758, "num_input_tokens_seen": 331558505, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 15367, "time_per_iteration": 2.581939935684204 }, { "auxiliary_loss_clip": 0.0111601, "auxiliary_loss_mlp": 0.01278863, "balance_loss_clip": 1.02082014, "balance_loss_mlp": 1.03344679, "epoch": 0.9239741470013527, "flos": 24607930936320.0, "grad_norm": 1.8434395356502749, "language_loss": 0.64474529, "learning_rate": 5.6747257222153896e-08, "loss": 0.66869408, "num_input_tokens_seen": 331578440, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6484375, "step": 15368, "time_per_iteration": 2.5564990043640137 }, { "auxiliary_loss_clip": 0.01114521, "auxiliary_loss_mlp": 0.01032966, "balance_loss_clip": 1.02054477, "balance_loss_mlp": 1.03509605, "epoch": 0.9240342702540207, "flos": 24718248581760.0, "grad_norm": 1.7707151124118514, "language_loss": 0.74943089, "learning_rate": 5.6657928974026546e-08, "loss": 0.77090573, "num_input_tokens_seen": 331598945, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 15369, "time_per_iteration": 2.5807852745056152 }, { "auxiliary_loss_clip": 0.01118265, "auxiliary_loss_mlp": 0.01040908, "balance_loss_clip": 1.0270741, "balance_loss_mlp": 1.03702223, "epoch": 0.9240943935066888, "flos": 20849987162880.0, "grad_norm": 1.8877686004359981, "language_loss": 0.7680192, "learning_rate": 5.6568670078778414e-08, "loss": 0.78961098, "num_input_tokens_seen": 331616700, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 15370, "time_per_iteration": 2.778477191925049 }, { "auxiliary_loss_clip": 0.01102947, "auxiliary_loss_mlp": 0.01031433, "balance_loss_clip": 1.01911306, "balance_loss_mlp": 1.03391504, "epoch": 0.9241545167593567, "flos": 24462313200000.0, "grad_norm": 2.1649695005603737, "language_loss": 0.66808474, "learning_rate": 5.647948053959539e-08, "loss": 0.68942857, "num_input_tokens_seen": 331635625, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 15371, "time_per_iteration": 2.608701467514038 }, { "auxiliary_loss_clip": 0.01040544, "auxiliary_loss_mlp": 0.01003844, "balance_loss_clip": 1.00265157, "balance_loss_mlp": 1.00141001, "epoch": 0.9242146400120247, "flos": 61827259847040.0, "grad_norm": 0.7325809428334823, "language_loss": 0.5773471, "learning_rate": 5.6390360359660274e-08, "loss": 0.59779096, "num_input_tokens_seen": 331698595, "router_z_loss_clip": 0.01190186, "router_z_loss_mlp": 0.21289062, "step": 15372, "time_per_iteration": 3.13218092918396 }, { "auxiliary_loss_clip": 0.01102201, "auxiliary_loss_mlp": 0.01030071, "balance_loss_clip": 1.0192349, "balance_loss_mlp": 1.03540611, "epoch": 0.9242747632646926, "flos": 22048397550720.0, "grad_norm": 2.2292978615099823, "language_loss": 0.70001161, "learning_rate": 5.6301309542154064e-08, "loss": 0.72133434, "num_input_tokens_seen": 331717975, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.66796875, "step": 15373, "time_per_iteration": 2.572803258895874 }, { "auxiliary_loss_clip": 0.01112299, "auxiliary_loss_mlp": 0.01037911, "balance_loss_clip": 1.02575159, "balance_loss_mlp": 1.03466952, "epoch": 0.9243348865173606, "flos": 18478338842880.0, "grad_norm": 2.4641292646603685, "language_loss": 0.75564003, "learning_rate": 5.6212328090254e-08, "loss": 0.77714211, "num_input_tokens_seen": 331737220, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 15374, "time_per_iteration": 2.5357205867767334 }, { "auxiliary_loss_clip": 0.01131454, "auxiliary_loss_mlp": 0.01033586, "balance_loss_clip": 1.02174854, "balance_loss_mlp": 1.03592038, "epoch": 0.9243950097700285, "flos": 23258767167360.0, "grad_norm": 2.1291229664764155, "language_loss": 0.65217751, "learning_rate": 5.612341600713577e-08, "loss": 0.67382789, "num_input_tokens_seen": 331757300, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 15375, "time_per_iteration": 2.5899300575256348 }, { "auxiliary_loss_clip": 0.01120146, "auxiliary_loss_mlp": 0.01029226, "balance_loss_clip": 1.01707852, "balance_loss_mlp": 1.03507233, "epoch": 0.9244551330226966, "flos": 29749081593600.0, "grad_norm": 2.1300267740024266, "language_loss": 0.66534138, "learning_rate": 5.6034573295973056e-08, "loss": 0.68683505, "num_input_tokens_seen": 331776995, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 15376, "time_per_iteration": 2.8900504112243652 }, { "auxiliary_loss_clip": 0.01022358, "auxiliary_loss_mlp": 0.01000819, "balance_loss_clip": 0.99960345, "balance_loss_mlp": 1.00115323, "epoch": 0.9245152562753645, "flos": 68864960609280.0, "grad_norm": 0.6459329935037059, "language_loss": 0.61155057, "learning_rate": 5.594579995993553e-08, "loss": 0.63178241, "num_input_tokens_seen": 331845015, "router_z_loss_clip": 0.012146, "router_z_loss_mlp": 0.21191406, "step": 15377, "time_per_iteration": 3.4023053646087646 }, { "auxiliary_loss_clip": 0.01128668, "auxiliary_loss_mlp": 0.01035383, "balance_loss_clip": 1.02292609, "balance_loss_mlp": 1.03394997, "epoch": 0.9245753795280325, "flos": 21579260993280.0, "grad_norm": 1.6801770957459143, "language_loss": 0.73658842, "learning_rate": 5.5857096002192015e-08, "loss": 0.7582289, "num_input_tokens_seen": 331862795, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 15378, "time_per_iteration": 2.6952664852142334 }, { "auxiliary_loss_clip": 0.01104829, "auxiliary_loss_mlp": 0.01029753, "balance_loss_clip": 1.01698613, "balance_loss_mlp": 1.0354085, "epoch": 0.9246355027807004, "flos": 34496077334400.0, "grad_norm": 2.0078191296778227, "language_loss": 0.6230365, "learning_rate": 5.576846142590752e-08, "loss": 0.64438236, "num_input_tokens_seen": 331882535, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 15379, "time_per_iteration": 3.2118115425109863 }, { "auxiliary_loss_clip": 0.01154644, "auxiliary_loss_mlp": 0.01030999, "balance_loss_clip": 1.01866078, "balance_loss_mlp": 1.03521872, "epoch": 0.9246956260333684, "flos": 15953854152960.0, "grad_norm": 3.3146469325886296, "language_loss": 0.83846676, "learning_rate": 5.567989623424574e-08, "loss": 0.86032319, "num_input_tokens_seen": 331899335, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6640625, "step": 15380, "time_per_iteration": 3.2842798233032227 }, { "auxiliary_loss_clip": 0.01102904, "auxiliary_loss_mlp": 0.01030227, "balance_loss_clip": 1.01695323, "balance_loss_mlp": 1.03242874, "epoch": 0.9247557492860363, "flos": 23368366540800.0, "grad_norm": 1.4461150823463835, "language_loss": 0.73552442, "learning_rate": 5.5591400430366806e-08, "loss": 0.75685573, "num_input_tokens_seen": 331919030, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 15381, "time_per_iteration": 3.2062668800354004 }, { "auxiliary_loss_clip": 0.01111857, "auxiliary_loss_mlp": 0.01030131, "balance_loss_clip": 1.01844287, "balance_loss_mlp": 1.03463519, "epoch": 0.9248158725387043, "flos": 23039855729280.0, "grad_norm": 1.8216590228086897, "language_loss": 0.78526127, "learning_rate": 5.5502974017429313e-08, "loss": 0.80668116, "num_input_tokens_seen": 331936465, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 15382, "time_per_iteration": 3.0941996574401855 }, { "auxiliary_loss_clip": 0.01119781, "auxiliary_loss_mlp": 0.01034339, "balance_loss_clip": 1.02167284, "balance_loss_mlp": 1.03273869, "epoch": 0.9248759957913724, "flos": 22522418357760.0, "grad_norm": 1.6782216156795586, "language_loss": 0.74995047, "learning_rate": 5.5414616998588734e-08, "loss": 0.77149171, "num_input_tokens_seen": 331954625, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 15383, "time_per_iteration": 2.704064130783081 }, { "auxiliary_loss_clip": 0.01116195, "auxiliary_loss_mlp": 0.01031009, "balance_loss_clip": 1.01948118, "balance_loss_mlp": 1.03165507, "epoch": 0.9249361190440403, "flos": 25447271016960.0, "grad_norm": 1.5153343745645773, "language_loss": 0.75446945, "learning_rate": 5.532632937699855e-08, "loss": 0.77594155, "num_input_tokens_seen": 331975865, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 15384, "time_per_iteration": 2.729684829711914 }, { "auxiliary_loss_clip": 0.01117057, "auxiliary_loss_mlp": 0.01032445, "balance_loss_clip": 1.02023196, "balance_loss_mlp": 1.03277755, "epoch": 0.9249962422967083, "flos": 12378623886720.0, "grad_norm": 2.101305408691604, "language_loss": 0.66572797, "learning_rate": 5.523811115580912e-08, "loss": 0.68722308, "num_input_tokens_seen": 331992760, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.65625, "step": 15385, "time_per_iteration": 4.061114311218262 }, { "auxiliary_loss_clip": 0.01099902, "auxiliary_loss_mlp": 0.01033538, "balance_loss_clip": 1.02177167, "balance_loss_mlp": 1.03251481, "epoch": 0.9250563655493762, "flos": 22929430343040.0, "grad_norm": 3.136221688913055, "language_loss": 0.80513209, "learning_rate": 5.514996233816949e-08, "loss": 0.82646644, "num_input_tokens_seen": 332011890, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 15386, "time_per_iteration": 2.7323548793792725 }, { "auxiliary_loss_clip": 0.01109202, "auxiliary_loss_mlp": 0.01037307, "balance_loss_clip": 1.02462363, "balance_loss_mlp": 1.03474069, "epoch": 0.9251164888020442, "flos": 18478662065280.0, "grad_norm": 1.6595983265963894, "language_loss": 0.75196922, "learning_rate": 5.506188292722447e-08, "loss": 0.77343434, "num_input_tokens_seen": 332029485, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.65625, "step": 15387, "time_per_iteration": 2.5774660110473633 }, { "auxiliary_loss_clip": 0.01109059, "auxiliary_loss_mlp": 0.01033952, "balance_loss_clip": 1.02303886, "balance_loss_mlp": 1.0328716, "epoch": 0.9251766120547121, "flos": 33037062796800.0, "grad_norm": 1.2465214143094927, "language_loss": 0.69733495, "learning_rate": 5.497387292611799e-08, "loss": 0.71876502, "num_input_tokens_seen": 332052970, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.671875, "step": 15388, "time_per_iteration": 2.6675822734832764 }, { "auxiliary_loss_clip": 0.01113066, "auxiliary_loss_mlp": 0.01027996, "balance_loss_clip": 1.0155921, "balance_loss_mlp": 1.03397083, "epoch": 0.9252367353073802, "flos": 24387906176640.0, "grad_norm": 1.6810472303040176, "language_loss": 0.82203686, "learning_rate": 5.488593233799088e-08, "loss": 0.84344745, "num_input_tokens_seen": 332070395, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 15389, "time_per_iteration": 2.6113903522491455 }, { "auxiliary_loss_clip": 0.01119302, "auxiliary_loss_mlp": 0.01032014, "balance_loss_clip": 1.0184536, "balance_loss_mlp": 1.03671241, "epoch": 0.9252968585600481, "flos": 18916844077440.0, "grad_norm": 3.3744266559389136, "language_loss": 0.79388785, "learning_rate": 5.4798061165981514e-08, "loss": 0.81540096, "num_input_tokens_seen": 332090185, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 15390, "time_per_iteration": 2.5392136573791504 }, { "auxiliary_loss_clip": 0.01130522, "auxiliary_loss_mlp": 0.01038276, "balance_loss_clip": 1.02566338, "balance_loss_mlp": 1.03397417, "epoch": 0.9253569818127161, "flos": 21725345606400.0, "grad_norm": 1.8635262923224798, "language_loss": 0.7555033, "learning_rate": 5.4710259413225604e-08, "loss": 0.77719128, "num_input_tokens_seen": 332109050, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 15391, "time_per_iteration": 4.090721368789673 }, { "auxiliary_loss_clip": 0.01120049, "auxiliary_loss_mlp": 0.0103392, "balance_loss_clip": 1.02016914, "balance_loss_mlp": 1.03253376, "epoch": 0.925417105065384, "flos": 34240357434240.0, "grad_norm": 2.176892190777975, "language_loss": 0.52533072, "learning_rate": 5.4622527082856416e-08, "loss": 0.54687041, "num_input_tokens_seen": 332131180, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.6953125, "step": 15392, "time_per_iteration": 2.875939130783081 }, { "auxiliary_loss_clip": 0.01100304, "auxiliary_loss_mlp": 0.01030999, "balance_loss_clip": 1.01920891, "balance_loss_mlp": 1.03126061, "epoch": 0.925477228318052, "flos": 25959536830080.0, "grad_norm": 4.119846876876522, "language_loss": 0.77255762, "learning_rate": 5.453486417800546e-08, "loss": 0.79387063, "num_input_tokens_seen": 332149555, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69140625, "step": 15393, "time_per_iteration": 2.568350315093994 }, { "auxiliary_loss_clip": 0.01118327, "auxiliary_loss_mlp": 0.01033923, "balance_loss_clip": 1.02151322, "balance_loss_mlp": 1.03294325, "epoch": 0.9255373515707199, "flos": 11838240702720.0, "grad_norm": 1.8161586305195965, "language_loss": 0.69308996, "learning_rate": 5.444727070180044e-08, "loss": 0.71461254, "num_input_tokens_seen": 332165830, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.671875, "step": 15394, "time_per_iteration": 2.5215139389038086 }, { "auxiliary_loss_clip": 0.01104639, "auxiliary_loss_mlp": 0.01029225, "balance_loss_clip": 1.01709521, "balance_loss_mlp": 1.03534651, "epoch": 0.925597474823388, "flos": 21434325615360.0, "grad_norm": 1.776661115160696, "language_loss": 0.72812164, "learning_rate": 5.43597466573682e-08, "loss": 0.74946022, "num_input_tokens_seen": 332185130, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 15395, "time_per_iteration": 2.5359511375427246 }, { "auxiliary_loss_clip": 0.0113293, "auxiliary_loss_mlp": 0.01028261, "balance_loss_clip": 1.01701975, "balance_loss_mlp": 1.03159285, "epoch": 0.925657598076056, "flos": 22857573185280.0, "grad_norm": 3.025919944551797, "language_loss": 0.71309829, "learning_rate": 5.427229204783157e-08, "loss": 0.73471022, "num_input_tokens_seen": 332203695, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.65625, "step": 15396, "time_per_iteration": 2.615687370300293 }, { "auxiliary_loss_clip": 0.01100961, "auxiliary_loss_mlp": 0.01028514, "balance_loss_clip": 1.0158124, "balance_loss_mlp": 1.03350842, "epoch": 0.9257177213287239, "flos": 25704032411520.0, "grad_norm": 1.6882987439250725, "language_loss": 0.86851007, "learning_rate": 5.418490687631205e-08, "loss": 0.88980484, "num_input_tokens_seen": 332224850, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.67578125, "step": 15397, "time_per_iteration": 2.59092116355896 }, { "auxiliary_loss_clip": 0.01022106, "auxiliary_loss_mlp": 0.01001613, "balance_loss_clip": 1.00032604, "balance_loss_mlp": 1.00097585, "epoch": 0.9257778445813919, "flos": 59592933221760.0, "grad_norm": 0.8154893814821332, "language_loss": 0.55172205, "learning_rate": 5.40975911459276e-08, "loss": 0.57195926, "num_input_tokens_seen": 332278085, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.2109375, "step": 15398, "time_per_iteration": 4.670790672302246 }, { "auxiliary_loss_clip": 0.01126208, "auxiliary_loss_mlp": 0.01032863, "balance_loss_clip": 1.01863527, "balance_loss_mlp": 1.03481936, "epoch": 0.9258379678340598, "flos": 27709427704320.0, "grad_norm": 2.235798068478638, "language_loss": 0.75924516, "learning_rate": 5.4010344859795056e-08, "loss": 0.78083587, "num_input_tokens_seen": 332297875, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.734375, "step": 15399, "time_per_iteration": 2.599083185195923 }, { "auxiliary_loss_clip": 0.01114679, "auxiliary_loss_mlp": 0.01028843, "balance_loss_clip": 1.01775706, "balance_loss_mlp": 1.03190708, "epoch": 0.9258980910867278, "flos": 24863543095680.0, "grad_norm": 1.5828713355576307, "language_loss": 0.78166479, "learning_rate": 5.3923168021027474e-08, "loss": 0.80309999, "num_input_tokens_seen": 332318500, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6484375, "step": 15400, "time_per_iteration": 4.323949337005615 }, { "auxiliary_loss_clip": 0.01031338, "auxiliary_loss_mlp": 0.01003084, "balance_loss_clip": 1.00189769, "balance_loss_mlp": 1.00100708, "epoch": 0.9259582143393957, "flos": 63134587249920.0, "grad_norm": 0.7385775838191934, "language_loss": 0.51345986, "learning_rate": 5.383606063273616e-08, "loss": 0.53380412, "num_input_tokens_seen": 332381980, "router_z_loss_clip": 0.01184082, "router_z_loss_mlp": 0.2109375, "step": 15401, "time_per_iteration": 3.13210129737854 }, { "auxiliary_loss_clip": 0.01108483, "auxiliary_loss_mlp": 0.01034132, "balance_loss_clip": 1.02262819, "balance_loss_mlp": 1.0342195, "epoch": 0.9260183375920638, "flos": 24127122458880.0, "grad_norm": 1.6368806447083086, "language_loss": 0.82222092, "learning_rate": 5.37490226980295e-08, "loss": 0.84364706, "num_input_tokens_seen": 332399510, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.65234375, "step": 15402, "time_per_iteration": 2.596065044403076 }, { "auxiliary_loss_clip": 0.01021917, "auxiliary_loss_mlp": 0.01001442, "balance_loss_clip": 1.00030947, "balance_loss_mlp": 1.00079882, "epoch": 0.9260784608447317, "flos": 63042872849280.0, "grad_norm": 0.7383700292924456, "language_loss": 0.51128471, "learning_rate": 5.366205422001413e-08, "loss": 0.53151828, "num_input_tokens_seen": 332459130, "router_z_loss_clip": 0.01135254, "router_z_loss_mlp": 0.2109375, "step": 15403, "time_per_iteration": 3.2621850967407227 }, { "auxiliary_loss_clip": 0.01039377, "auxiliary_loss_mlp": 0.00999089, "balance_loss_clip": 0.99800986, "balance_loss_mlp": 1.0009954, "epoch": 0.9261385840973997, "flos": 70585979927040.0, "grad_norm": 0.6820380508848055, "language_loss": 0.58690643, "learning_rate": 5.3575155201793346e-08, "loss": 0.6072911, "num_input_tokens_seen": 332526555, "router_z_loss_clip": 0.01080322, "router_z_loss_mlp": 0.21289062, "step": 15404, "time_per_iteration": 3.1918246746063232 }, { "auxiliary_loss_clip": 0.01121837, "auxiliary_loss_mlp": 0.01032485, "balance_loss_clip": 1.01946139, "balance_loss_mlp": 1.03325963, "epoch": 0.9261987073500676, "flos": 20229917656320.0, "grad_norm": 2.4429129294449985, "language_loss": 0.71517015, "learning_rate": 5.348832564646843e-08, "loss": 0.73671341, "num_input_tokens_seen": 332544005, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 15405, "time_per_iteration": 2.53909969329834 }, { "auxiliary_loss_clip": 0.01142349, "auxiliary_loss_mlp": 0.01036244, "balance_loss_clip": 1.02259469, "balance_loss_mlp": 1.03573966, "epoch": 0.9262588306027356, "flos": 20954163582720.0, "grad_norm": 1.7333720492606768, "language_loss": 0.69064558, "learning_rate": 5.3401565557138265e-08, "loss": 0.71243143, "num_input_tokens_seen": 332563070, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 15406, "time_per_iteration": 2.6101365089416504 }, { "auxiliary_loss_clip": 0.01140812, "auxiliary_loss_mlp": 0.01033483, "balance_loss_clip": 1.02078724, "balance_loss_mlp": 1.03481555, "epoch": 0.9263189538554035, "flos": 22158679282560.0, "grad_norm": 2.4833489918873064, "language_loss": 0.76351309, "learning_rate": 5.331487493689879e-08, "loss": 0.78525603, "num_input_tokens_seen": 332579620, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 15407, "time_per_iteration": 2.8947179317474365 }, { "auxiliary_loss_clip": 0.01132724, "auxiliary_loss_mlp": 0.01039705, "balance_loss_clip": 1.02625799, "balance_loss_mlp": 1.0327009, "epoch": 0.9263790771080715, "flos": 18187211111040.0, "grad_norm": 2.1219639608434786, "language_loss": 0.72692692, "learning_rate": 5.3228253788844216e-08, "loss": 0.74865115, "num_input_tokens_seen": 332597795, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 15408, "time_per_iteration": 2.6953790187835693 }, { "auxiliary_loss_clip": 0.01118209, "auxiliary_loss_mlp": 0.01027895, "balance_loss_clip": 1.01760077, "balance_loss_mlp": 1.03340554, "epoch": 0.9264392003607396, "flos": 48178545004800.0, "grad_norm": 1.7305972322251517, "language_loss": 0.68430102, "learning_rate": 5.314170211606517e-08, "loss": 0.70576203, "num_input_tokens_seen": 332620375, "router_z_loss_clip": 0.10302734, "router_z_loss_mlp": 0.66796875, "step": 15409, "time_per_iteration": 2.794297456741333 }, { "auxiliary_loss_clip": 0.01130715, "auxiliary_loss_mlp": 0.01036396, "balance_loss_clip": 1.02346814, "balance_loss_mlp": 1.03557825, "epoch": 0.9264993236134075, "flos": 26389458714240.0, "grad_norm": 2.010433762684784, "language_loss": 0.75713134, "learning_rate": 5.305521992165141e-08, "loss": 0.77880239, "num_input_tokens_seen": 332639510, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 15410, "time_per_iteration": 2.6359078884124756 }, { "auxiliary_loss_clip": 0.01138202, "auxiliary_loss_mlp": 0.01026083, "balance_loss_clip": 1.01408494, "balance_loss_mlp": 1.03318214, "epoch": 0.9265594468660755, "flos": 20920084554240.0, "grad_norm": 1.5382359616988568, "language_loss": 0.82065809, "learning_rate": 5.2968807208688236e-08, "loss": 0.84230089, "num_input_tokens_seen": 332658350, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 15411, "time_per_iteration": 2.6351852416992188 }, { "auxiliary_loss_clip": 0.01125257, "auxiliary_loss_mlp": 0.0103452, "balance_loss_clip": 1.02262866, "balance_loss_mlp": 1.03802443, "epoch": 0.9266195701187434, "flos": 17525017929600.0, "grad_norm": 1.832503840039476, "language_loss": 0.75640357, "learning_rate": 5.288246398026008e-08, "loss": 0.77800131, "num_input_tokens_seen": 332676715, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 15412, "time_per_iteration": 2.5967161655426025 }, { "auxiliary_loss_clip": 0.01100082, "auxiliary_loss_mlp": 0.01030344, "balance_loss_clip": 1.01795268, "balance_loss_mlp": 1.03198397, "epoch": 0.9266796933714114, "flos": 33688733293440.0, "grad_norm": 1.6555952458923602, "language_loss": 0.66854835, "learning_rate": 5.279619023944848e-08, "loss": 0.6898526, "num_input_tokens_seen": 332701470, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 15413, "time_per_iteration": 2.6698825359344482 }, { "auxiliary_loss_clip": 0.01148558, "auxiliary_loss_mlp": 0.01035553, "balance_loss_clip": 1.02298856, "balance_loss_mlp": 1.03398132, "epoch": 0.9267398166240793, "flos": 18916520855040.0, "grad_norm": 2.791756735019865, "language_loss": 0.76026917, "learning_rate": 5.2709985989331854e-08, "loss": 0.78211027, "num_input_tokens_seen": 332719060, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 15414, "time_per_iteration": 2.5711474418640137 }, { "auxiliary_loss_clip": 0.01110559, "auxiliary_loss_mlp": 0.01029264, "balance_loss_clip": 1.01765871, "balance_loss_mlp": 1.03418922, "epoch": 0.9267999398767474, "flos": 20478957626880.0, "grad_norm": 1.9959055451963008, "language_loss": 0.81681085, "learning_rate": 5.262385123298663e-08, "loss": 0.83820903, "num_input_tokens_seen": 332736345, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 15415, "time_per_iteration": 2.8093791007995605 }, { "auxiliary_loss_clip": 0.01120589, "auxiliary_loss_mlp": 0.01032309, "balance_loss_clip": 1.01970243, "balance_loss_mlp": 1.03235912, "epoch": 0.9268600631294153, "flos": 24789351553920.0, "grad_norm": 1.378596938087283, "language_loss": 0.5435012, "learning_rate": 5.253778597348702e-08, "loss": 0.5650301, "num_input_tokens_seen": 332756270, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 15416, "time_per_iteration": 2.620420455932617 }, { "auxiliary_loss_clip": 0.01127139, "auxiliary_loss_mlp": 0.01034718, "balance_loss_clip": 1.02310145, "balance_loss_mlp": 1.03335392, "epoch": 0.9269201863820833, "flos": 18697178453760.0, "grad_norm": 1.6882107290418429, "language_loss": 0.72050774, "learning_rate": 5.245179021390433e-08, "loss": 0.74212635, "num_input_tokens_seen": 332775185, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 15417, "time_per_iteration": 2.575469732284546 }, { "auxiliary_loss_clip": 0.01110913, "auxiliary_loss_mlp": 0.01030156, "balance_loss_clip": 1.01743674, "balance_loss_mlp": 1.03414106, "epoch": 0.9269803096347512, "flos": 20923999136640.0, "grad_norm": 2.1071906881725013, "language_loss": 0.75735402, "learning_rate": 5.236586395730769e-08, "loss": 0.77876467, "num_input_tokens_seen": 332794320, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.67578125, "step": 15418, "time_per_iteration": 2.5930259227752686 }, { "auxiliary_loss_clip": 0.01108696, "auxiliary_loss_mlp": 0.01027752, "balance_loss_clip": 1.01523542, "balance_loss_mlp": 1.03247046, "epoch": 0.9270404328874192, "flos": 28002710252160.0, "grad_norm": 1.488331808016133, "language_loss": 0.76318836, "learning_rate": 5.228000720676351e-08, "loss": 0.78455287, "num_input_tokens_seen": 332818095, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.671875, "step": 15419, "time_per_iteration": 2.7103962898254395 }, { "auxiliary_loss_clip": 0.01098234, "auxiliary_loss_mlp": 0.01031905, "balance_loss_clip": 1.02065766, "balance_loss_mlp": 1.03258777, "epoch": 0.9271005561400871, "flos": 25889870401920.0, "grad_norm": 1.653282976614927, "language_loss": 0.75842416, "learning_rate": 5.2194219965335575e-08, "loss": 0.77972555, "num_input_tokens_seen": 332839860, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.65625, "step": 15420, "time_per_iteration": 2.5687222480773926 }, { "auxiliary_loss_clip": 0.01114215, "auxiliary_loss_mlp": 0.010282, "balance_loss_clip": 1.01574302, "balance_loss_mlp": 1.03453398, "epoch": 0.9271606793927551, "flos": 27053914452480.0, "grad_norm": 1.7814106780954753, "language_loss": 0.76565284, "learning_rate": 5.210850223608565e-08, "loss": 0.78707695, "num_input_tokens_seen": 332861155, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 15421, "time_per_iteration": 2.5789525508880615 }, { "auxiliary_loss_clip": 0.01121569, "auxiliary_loss_mlp": 0.01029708, "balance_loss_clip": 1.01757824, "balance_loss_mlp": 1.03443408, "epoch": 0.9272208026454232, "flos": 26209869690240.0, "grad_norm": 2.0407997076978175, "language_loss": 0.7322523, "learning_rate": 5.202285402207263e-08, "loss": 0.75376505, "num_input_tokens_seen": 332881110, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 15422, "time_per_iteration": 2.5770256519317627 }, { "auxiliary_loss_clip": 0.01114464, "auxiliary_loss_mlp": 0.01039958, "balance_loss_clip": 1.02652907, "balance_loss_mlp": 1.03513479, "epoch": 0.9272809258980911, "flos": 12458453863680.0, "grad_norm": 1.9026858082618907, "language_loss": 0.77690679, "learning_rate": 5.193727532635339e-08, "loss": 0.79845101, "num_input_tokens_seen": 332899350, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 15423, "time_per_iteration": 2.728398084640503 }, { "auxiliary_loss_clip": 0.0111141, "auxiliary_loss_mlp": 0.01027075, "balance_loss_clip": 1.01412296, "balance_loss_mlp": 1.03273249, "epoch": 0.9273410491507591, "flos": 22856890826880.0, "grad_norm": 1.9980717085940156, "language_loss": 0.75374311, "learning_rate": 5.185176615198128e-08, "loss": 0.77512795, "num_input_tokens_seen": 332918105, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 15424, "time_per_iteration": 2.5881383419036865 }, { "auxiliary_loss_clip": 0.01031117, "auxiliary_loss_mlp": 0.01000337, "balance_loss_clip": 0.99907362, "balance_loss_mlp": 1.00089049, "epoch": 0.927401172403427, "flos": 47665384329600.0, "grad_norm": 0.7797561543365716, "language_loss": 0.60720593, "learning_rate": 5.176632650200874e-08, "loss": 0.6275205, "num_input_tokens_seen": 332969490, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.2109375, "step": 15425, "time_per_iteration": 2.9333584308624268 }, { "auxiliary_loss_clip": 0.01120653, "auxiliary_loss_mlp": 0.0127857, "balance_loss_clip": 1.01917231, "balance_loss_mlp": 1.03467798, "epoch": 0.927461295656095, "flos": 29972374490880.0, "grad_norm": 1.4030754459732115, "language_loss": 0.70166165, "learning_rate": 5.168095637948444e-08, "loss": 0.72565389, "num_input_tokens_seen": 332988805, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 15426, "time_per_iteration": 4.039050579071045 }, { "auxiliary_loss_clip": 0.01120168, "auxiliary_loss_mlp": 0.01024681, "balance_loss_clip": 1.01311719, "balance_loss_mlp": 1.03502524, "epoch": 0.9275214189087629, "flos": 20375427651840.0, "grad_norm": 1.8710259555501665, "language_loss": 0.83264434, "learning_rate": 5.159565578745551e-08, "loss": 0.85409284, "num_input_tokens_seen": 333007960, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.67578125, "step": 15427, "time_per_iteration": 2.5434012413024902 }, { "auxiliary_loss_clip": 0.01130954, "auxiliary_loss_mlp": 0.01032679, "balance_loss_clip": 1.02076364, "balance_loss_mlp": 1.03564274, "epoch": 0.927581542161431, "flos": 22383193242240.0, "grad_norm": 1.7013873773785952, "language_loss": 0.76999098, "learning_rate": 5.1510424728965275e-08, "loss": 0.79162729, "num_input_tokens_seen": 333026035, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 15428, "time_per_iteration": 2.588754892349243 }, { "auxiliary_loss_clip": 0.01127463, "auxiliary_loss_mlp": 0.01033663, "balance_loss_clip": 1.02088976, "balance_loss_mlp": 1.03216994, "epoch": 0.9276416654140989, "flos": 23952453598080.0, "grad_norm": 1.8972240429964249, "language_loss": 0.74528158, "learning_rate": 5.142526320705598e-08, "loss": 0.76689279, "num_input_tokens_seen": 333045590, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 15429, "time_per_iteration": 2.620595693588257 }, { "auxiliary_loss_clip": 0.01126871, "auxiliary_loss_mlp": 0.01029473, "balance_loss_clip": 1.01773667, "balance_loss_mlp": 1.03350616, "epoch": 0.9277017886667669, "flos": 13917719796480.0, "grad_norm": 2.170641386842156, "language_loss": 0.75003695, "learning_rate": 5.134017122476675e-08, "loss": 0.77160037, "num_input_tokens_seen": 333063355, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 15430, "time_per_iteration": 2.540990114212036 }, { "auxiliary_loss_clip": 0.01112382, "auxiliary_loss_mlp": 0.01030357, "balance_loss_clip": 1.01715469, "balance_loss_mlp": 1.03410339, "epoch": 0.9277619119194348, "flos": 35666478092160.0, "grad_norm": 1.3796538670263396, "language_loss": 0.76455832, "learning_rate": 5.125514878513426e-08, "loss": 0.78598571, "num_input_tokens_seen": 333088045, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 15431, "time_per_iteration": 2.9505767822265625 }, { "auxiliary_loss_clip": 0.0110916, "auxiliary_loss_mlp": 0.01029585, "balance_loss_clip": 1.01785517, "balance_loss_mlp": 1.03403831, "epoch": 0.9278220351721028, "flos": 23841238112640.0, "grad_norm": 1.4974020880144698, "language_loss": 0.70529628, "learning_rate": 5.1170195891192534e-08, "loss": 0.72668374, "num_input_tokens_seen": 333108005, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 15432, "time_per_iteration": 2.574235677719116 }, { "auxiliary_loss_clip": 0.01140225, "auxiliary_loss_mlp": 0.01030475, "balance_loss_clip": 1.01829195, "balance_loss_mlp": 1.03409278, "epoch": 0.9278821584247707, "flos": 15228135768960.0, "grad_norm": 2.7046533974393214, "language_loss": 0.82369262, "learning_rate": 5.108531254597381e-08, "loss": 0.84539962, "num_input_tokens_seen": 333124335, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 15433, "time_per_iteration": 3.9717276096343994 }, { "auxiliary_loss_clip": 0.01099218, "auxiliary_loss_mlp": 0.01028084, "balance_loss_clip": 1.0170815, "balance_loss_mlp": 1.03386617, "epoch": 0.9279422816774388, "flos": 24681404206080.0, "grad_norm": 1.846026646402291, "language_loss": 0.67227781, "learning_rate": 5.100049875250723e-08, "loss": 0.69355094, "num_input_tokens_seen": 333143995, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.65234375, "step": 15434, "time_per_iteration": 2.4969005584716797 }, { "auxiliary_loss_clip": 0.01114419, "auxiliary_loss_mlp": 0.01028285, "balance_loss_clip": 1.01651335, "balance_loss_mlp": 1.03223014, "epoch": 0.9280024049301068, "flos": 19169188099200.0, "grad_norm": 1.759856394729026, "language_loss": 0.68607128, "learning_rate": 5.091575451381902e-08, "loss": 0.70749831, "num_input_tokens_seen": 333162805, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.64453125, "step": 15435, "time_per_iteration": 2.5069823265075684 }, { "auxiliary_loss_clip": 0.01102847, "auxiliary_loss_mlp": 0.01031521, "balance_loss_clip": 1.0185864, "balance_loss_mlp": 1.03361702, "epoch": 0.9280625281827747, "flos": 11393701983360.0, "grad_norm": 2.1052972434353987, "language_loss": 0.72195828, "learning_rate": 5.083107983293411e-08, "loss": 0.74330199, "num_input_tokens_seen": 333175770, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 15436, "time_per_iteration": 2.4158475399017334 }, { "auxiliary_loss_clip": 0.01119791, "auxiliary_loss_mlp": 0.01028678, "balance_loss_clip": 1.0166502, "balance_loss_mlp": 1.03391695, "epoch": 0.9281226514354427, "flos": 24785616539520.0, "grad_norm": 2.0752571534907363, "language_loss": 0.66689301, "learning_rate": 5.0746474712874074e-08, "loss": 0.68837774, "num_input_tokens_seen": 333194775, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 15437, "time_per_iteration": 2.526306390762329 }, { "auxiliary_loss_clip": 0.01113524, "auxiliary_loss_mlp": 0.01034805, "balance_loss_clip": 1.02139378, "balance_loss_mlp": 1.03381252, "epoch": 0.9281827746881106, "flos": 15083128563840.0, "grad_norm": 2.06140215941911, "language_loss": 0.7130878, "learning_rate": 5.0661939156658285e-08, "loss": 0.7345711, "num_input_tokens_seen": 333208920, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 15438, "time_per_iteration": 2.449370861053467 }, { "auxiliary_loss_clip": 0.01101999, "auxiliary_loss_mlp": 0.0102724, "balance_loss_clip": 1.01525974, "balance_loss_mlp": 1.03337932, "epoch": 0.9282428979407786, "flos": 24135059364480.0, "grad_norm": 1.6652576033585795, "language_loss": 0.64402831, "learning_rate": 5.057747316730343e-08, "loss": 0.66532063, "num_input_tokens_seen": 333229350, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 15439, "time_per_iteration": 2.5321366786956787 }, { "auxiliary_loss_clip": 0.0112003, "auxiliary_loss_mlp": 0.01029121, "balance_loss_clip": 1.01727128, "balance_loss_mlp": 1.03453934, "epoch": 0.9283030211934465, "flos": 24823215100800.0, "grad_norm": 1.8621202242131414, "language_loss": 0.70255852, "learning_rate": 5.049307674782444e-08, "loss": 0.72405005, "num_input_tokens_seen": 333246125, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 15440, "time_per_iteration": 4.659243106842041 }, { "auxiliary_loss_clip": 0.01133231, "auxiliary_loss_mlp": 0.01284445, "balance_loss_clip": 1.02450919, "balance_loss_mlp": 1.03477442, "epoch": 0.9283631444461146, "flos": 23981037845760.0, "grad_norm": 1.7228094474786781, "language_loss": 0.77029085, "learning_rate": 5.040874990123223e-08, "loss": 0.79446757, "num_input_tokens_seen": 333263685, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 15441, "time_per_iteration": 2.603961229324341 }, { "auxiliary_loss_clip": 0.01106985, "auxiliary_loss_mlp": 0.01026581, "balance_loss_clip": 1.01635253, "balance_loss_mlp": 1.03296232, "epoch": 0.9284232676987825, "flos": 22784530878720.0, "grad_norm": 1.5082355448688696, "language_loss": 0.63931626, "learning_rate": 5.032449263053729e-08, "loss": 0.66065192, "num_input_tokens_seen": 333282435, "router_z_loss_clip": 0.10253906, "router_z_loss_mlp": 0.6484375, "step": 15442, "time_per_iteration": 4.108468532562256 }, { "auxiliary_loss_clip": 0.01132962, "auxiliary_loss_mlp": 0.01038305, "balance_loss_clip": 1.02586579, "balance_loss_mlp": 1.03499222, "epoch": 0.9284833909514505, "flos": 19500500171520.0, "grad_norm": 1.7417829092124328, "language_loss": 0.80886465, "learning_rate": 5.024030493874565e-08, "loss": 0.83057731, "num_input_tokens_seen": 333300400, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 15443, "time_per_iteration": 2.5511348247528076 }, { "auxiliary_loss_clip": 0.01111696, "auxiliary_loss_mlp": 0.0127434, "balance_loss_clip": 1.01534164, "balance_loss_mlp": 1.03494895, "epoch": 0.9285435142041184, "flos": 23185976256000.0, "grad_norm": 1.9347941621842022, "language_loss": 0.65619105, "learning_rate": 5.0156186828862465e-08, "loss": 0.68005139, "num_input_tokens_seen": 333318980, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 15444, "time_per_iteration": 2.578056573867798 }, { "auxiliary_loss_clip": 0.01129553, "auxiliary_loss_mlp": 0.01033003, "balance_loss_clip": 1.02124321, "balance_loss_mlp": 1.03345728, "epoch": 0.9286036374567864, "flos": 17675519915520.0, "grad_norm": 2.5318118430456797, "language_loss": 0.73315275, "learning_rate": 5.007213830388912e-08, "loss": 0.75477839, "num_input_tokens_seen": 333334135, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 15445, "time_per_iteration": 2.535889148712158 }, { "auxiliary_loss_clip": 0.01113816, "auxiliary_loss_mlp": 0.01032544, "balance_loss_clip": 1.01969886, "balance_loss_mlp": 1.03664911, "epoch": 0.9286637607094543, "flos": 20886687884160.0, "grad_norm": 1.6377846945266472, "language_loss": 0.71463716, "learning_rate": 4.9988159366825655e-08, "loss": 0.73610079, "num_input_tokens_seen": 333353325, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 15446, "time_per_iteration": 2.4862453937530518 }, { "auxiliary_loss_clip": 0.01150657, "auxiliary_loss_mlp": 0.01028243, "balance_loss_clip": 1.01574385, "balance_loss_mlp": 1.03409147, "epoch": 0.9287238839621224, "flos": 19026012487680.0, "grad_norm": 1.9536273585203665, "language_loss": 0.69493699, "learning_rate": 4.990425002066856e-08, "loss": 0.71672601, "num_input_tokens_seen": 333371110, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 15447, "time_per_iteration": 2.528028964996338 }, { "auxiliary_loss_clip": 0.01095165, "auxiliary_loss_mlp": 0.01029519, "balance_loss_clip": 1.01868308, "balance_loss_mlp": 1.03190684, "epoch": 0.9287840072147904, "flos": 20557027837440.0, "grad_norm": 1.8038498342655602, "language_loss": 0.72308898, "learning_rate": 4.982041026841255e-08, "loss": 0.74433577, "num_input_tokens_seen": 333391420, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.6328125, "step": 15448, "time_per_iteration": 2.7904906272888184 }, { "auxiliary_loss_clip": 0.01101807, "auxiliary_loss_mlp": 0.01026821, "balance_loss_clip": 1.01343966, "balance_loss_mlp": 1.03317189, "epoch": 0.9288441304674583, "flos": 21250822008960.0, "grad_norm": 1.6754660879256893, "language_loss": 0.74305004, "learning_rate": 4.973664011304968e-08, "loss": 0.76433629, "num_input_tokens_seen": 333410365, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6875, "step": 15449, "time_per_iteration": 2.508580446243286 }, { "auxiliary_loss_clip": 0.01102043, "auxiliary_loss_mlp": 0.01270587, "balance_loss_clip": 1.0114646, "balance_loss_mlp": 1.03331757, "epoch": 0.9289042537201263, "flos": 27669853895040.0, "grad_norm": 1.899669595209306, "language_loss": 0.67819381, "learning_rate": 4.965293955756933e-08, "loss": 0.70192015, "num_input_tokens_seen": 333430000, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 15450, "time_per_iteration": 2.57087779045105 }, { "auxiliary_loss_clip": 0.01129278, "auxiliary_loss_mlp": 0.01023453, "balance_loss_clip": 1.01110256, "balance_loss_mlp": 1.03377128, "epoch": 0.9289643769727942, "flos": 19317750750720.0, "grad_norm": 1.9125118217952968, "language_loss": 0.71941662, "learning_rate": 4.9569308604958896e-08, "loss": 0.74094391, "num_input_tokens_seen": 333445800, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 15451, "time_per_iteration": 2.513977289199829 }, { "auxiliary_loss_clip": 0.01121651, "auxiliary_loss_mlp": 0.01033388, "balance_loss_clip": 1.02175272, "balance_loss_mlp": 1.03463352, "epoch": 0.9290245002254622, "flos": 14058058233600.0, "grad_norm": 2.1218812109834224, "language_loss": 0.73534518, "learning_rate": 4.948574725820243e-08, "loss": 0.75689554, "num_input_tokens_seen": 333461550, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.69140625, "step": 15452, "time_per_iteration": 2.4888036251068115 }, { "auxiliary_loss_clip": 0.01122908, "auxiliary_loss_mlp": 0.01028222, "balance_loss_clip": 1.0161345, "balance_loss_mlp": 1.03681982, "epoch": 0.9290846234781301, "flos": 20047132321920.0, "grad_norm": 1.6629449112944559, "language_loss": 0.74236786, "learning_rate": 4.9402255520282656e-08, "loss": 0.76387918, "num_input_tokens_seen": 333478835, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 15453, "time_per_iteration": 2.627821445465088 }, { "auxiliary_loss_clip": 0.01048945, "auxiliary_loss_mlp": 0.01001581, "balance_loss_clip": 1.00026405, "balance_loss_mlp": 1.00091553, "epoch": 0.9291447467307982, "flos": 61227514460160.0, "grad_norm": 0.8156619874416924, "language_loss": 0.60706604, "learning_rate": 4.9318833394178524e-08, "loss": 0.62757134, "num_input_tokens_seen": 333535250, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.21289062, "step": 15454, "time_per_iteration": 3.0396063327789307 }, { "auxiliary_loss_clip": 0.01120581, "auxiliary_loss_mlp": 0.01030721, "balance_loss_clip": 1.01842451, "balance_loss_mlp": 1.0330267, "epoch": 0.9292048699834661, "flos": 20553328736640.0, "grad_norm": 1.5669020513769458, "language_loss": 0.68787396, "learning_rate": 4.9235480882867667e-08, "loss": 0.70938694, "num_input_tokens_seen": 333553805, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 15455, "time_per_iteration": 2.510869264602661 }, { "auxiliary_loss_clip": 0.0113176, "auxiliary_loss_mlp": 0.01030087, "balance_loss_clip": 1.01772463, "balance_loss_mlp": 1.03557944, "epoch": 0.9292649932361341, "flos": 23623655477760.0, "grad_norm": 1.6950370800095869, "language_loss": 0.6433655, "learning_rate": 4.9152197989324353e-08, "loss": 0.66498393, "num_input_tokens_seen": 333572800, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 15456, "time_per_iteration": 2.6143929958343506 }, { "auxiliary_loss_clip": 0.0113829, "auxiliary_loss_mlp": 0.01030582, "balance_loss_clip": 1.01758838, "balance_loss_mlp": 1.03547895, "epoch": 0.929325116488802, "flos": 15009942602880.0, "grad_norm": 1.6172398983723513, "language_loss": 0.68386847, "learning_rate": 4.906898471652132e-08, "loss": 0.70555711, "num_input_tokens_seen": 333588520, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.671875, "step": 15457, "time_per_iteration": 2.8925580978393555 }, { "auxiliary_loss_clip": 0.01121283, "auxiliary_loss_mlp": 0.01027602, "balance_loss_clip": 1.01602066, "balance_loss_mlp": 1.03315091, "epoch": 0.92938523974147, "flos": 17967365919360.0, "grad_norm": 2.0657216720203015, "language_loss": 0.81086445, "learning_rate": 4.8985841067427533e-08, "loss": 0.83235335, "num_input_tokens_seen": 333603435, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.703125, "step": 15458, "time_per_iteration": 2.563908100128174 }, { "auxiliary_loss_clip": 0.01111488, "auxiliary_loss_mlp": 0.0103139, "balance_loss_clip": 1.01931381, "balance_loss_mlp": 1.03345942, "epoch": 0.9294453629941379, "flos": 23003047267200.0, "grad_norm": 1.8235507608655435, "language_loss": 0.72587776, "learning_rate": 4.89027670450104e-08, "loss": 0.74730659, "num_input_tokens_seen": 333623305, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 15459, "time_per_iteration": 2.5727949142456055 }, { "auxiliary_loss_clip": 0.01130176, "auxiliary_loss_mlp": 0.01032788, "balance_loss_clip": 1.02046132, "balance_loss_mlp": 1.03460979, "epoch": 0.929505486246806, "flos": 17055234927360.0, "grad_norm": 2.2247211579461603, "language_loss": 0.58673108, "learning_rate": 4.881976265223464e-08, "loss": 0.60836065, "num_input_tokens_seen": 333641205, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 15460, "time_per_iteration": 2.526700258255005 }, { "auxiliary_loss_clip": 0.01141401, "auxiliary_loss_mlp": 0.01031483, "balance_loss_clip": 1.01899552, "balance_loss_mlp": 1.03518474, "epoch": 0.9295656094994739, "flos": 27340409329920.0, "grad_norm": 2.384161398783128, "language_loss": 0.80855221, "learning_rate": 4.8736827892063015e-08, "loss": 0.83028102, "num_input_tokens_seen": 333659615, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 15461, "time_per_iteration": 2.596930980682373 }, { "auxiliary_loss_clip": 0.01116568, "auxiliary_loss_mlp": 0.01027979, "balance_loss_clip": 1.01623106, "balance_loss_mlp": 1.03309393, "epoch": 0.9296257327521419, "flos": 21470954509440.0, "grad_norm": 1.885768862868342, "language_loss": 0.78667021, "learning_rate": 4.865396276745426e-08, "loss": 0.80811566, "num_input_tokens_seen": 333678985, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66015625, "step": 15462, "time_per_iteration": 2.5276906490325928 }, { "auxiliary_loss_clip": 0.01128275, "auxiliary_loss_mlp": 0.01025098, "balance_loss_clip": 1.01361799, "balance_loss_mlp": 1.03203154, "epoch": 0.9296858560048099, "flos": 16362661818240.0, "grad_norm": 1.866086272165393, "language_loss": 0.62385952, "learning_rate": 4.8571167281366235e-08, "loss": 0.64539325, "num_input_tokens_seen": 333696410, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.69140625, "step": 15463, "time_per_iteration": 2.5768234729766846 }, { "auxiliary_loss_clip": 0.01116974, "auxiliary_loss_mlp": 0.01027692, "balance_loss_clip": 1.01642704, "balance_loss_mlp": 1.03155792, "epoch": 0.9297459792574778, "flos": 29858609139840.0, "grad_norm": 1.4182938447223252, "language_loss": 0.71105719, "learning_rate": 4.84884414367539e-08, "loss": 0.73250389, "num_input_tokens_seen": 333716615, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.67578125, "step": 15464, "time_per_iteration": 2.6340644359588623 }, { "auxiliary_loss_clip": 0.01121034, "auxiliary_loss_mlp": 0.01027911, "balance_loss_clip": 1.01637149, "balance_loss_mlp": 1.03380108, "epoch": 0.9298061025101458, "flos": 15924838942080.0, "grad_norm": 2.814262937864682, "language_loss": 0.77429694, "learning_rate": 4.840578523656868e-08, "loss": 0.79578638, "num_input_tokens_seen": 333732800, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 15465, "time_per_iteration": 2.5476160049438477 }, { "auxiliary_loss_clip": 0.01065978, "auxiliary_loss_mlp": 0.01001869, "balance_loss_clip": 1.0006355, "balance_loss_mlp": 1.0010289, "epoch": 0.9298662257628137, "flos": 64096994304000.0, "grad_norm": 0.6936956622618424, "language_loss": 0.56522036, "learning_rate": 4.832319868376111e-08, "loss": 0.58589888, "num_input_tokens_seen": 333799300, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.2109375, "step": 15466, "time_per_iteration": 3.367398977279663 }, { "auxiliary_loss_clip": 0.0111972, "auxiliary_loss_mlp": 0.01037518, "balance_loss_clip": 1.02395749, "balance_loss_mlp": 1.0328294, "epoch": 0.9299263490154818, "flos": 24280210224000.0, "grad_norm": 1.4152361806452676, "language_loss": 0.72905219, "learning_rate": 4.824068178127838e-08, "loss": 0.75062454, "num_input_tokens_seen": 333820360, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6875, "step": 15467, "time_per_iteration": 3.95725154876709 }, { "auxiliary_loss_clip": 0.01122366, "auxiliary_loss_mlp": 0.01032671, "balance_loss_clip": 1.02000499, "balance_loss_mlp": 1.03432012, "epoch": 0.9299864722681497, "flos": 23294354567040.0, "grad_norm": 2.9026788584168015, "language_loss": 0.71970046, "learning_rate": 4.8158234532065025e-08, "loss": 0.74125087, "num_input_tokens_seen": 333840415, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 15468, "time_per_iteration": 2.5093801021575928 }, { "auxiliary_loss_clip": 0.01112819, "auxiliary_loss_mlp": 0.01028181, "balance_loss_clip": 1.01577759, "balance_loss_mlp": 1.03297698, "epoch": 0.9300465955208177, "flos": 21395972868480.0, "grad_norm": 1.6525320871368867, "language_loss": 0.75582629, "learning_rate": 4.8075856939063355e-08, "loss": 0.77723634, "num_input_tokens_seen": 333859910, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 15469, "time_per_iteration": 2.4771571159362793 }, { "auxiliary_loss_clip": 0.01120356, "auxiliary_loss_mlp": 0.01034672, "balance_loss_clip": 1.02219725, "balance_loss_mlp": 1.03348625, "epoch": 0.9301067187734856, "flos": 24571445696640.0, "grad_norm": 3.914144318991497, "language_loss": 0.75254995, "learning_rate": 4.7993549005213684e-08, "loss": 0.77410024, "num_input_tokens_seen": 333880495, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 15470, "time_per_iteration": 2.5437264442443848 }, { "auxiliary_loss_clip": 0.01127636, "auxiliary_loss_mlp": 0.01029173, "balance_loss_clip": 1.0176934, "balance_loss_mlp": 1.03259468, "epoch": 0.9301668420261536, "flos": 22196960202240.0, "grad_norm": 2.572739516185074, "language_loss": 0.74820387, "learning_rate": 4.7911310733453225e-08, "loss": 0.76977193, "num_input_tokens_seen": 333897640, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 15471, "time_per_iteration": 2.528930425643921 }, { "auxiliary_loss_clip": 0.01110727, "auxiliary_loss_mlp": 0.01026181, "balance_loss_clip": 1.01423573, "balance_loss_mlp": 1.03331912, "epoch": 0.9302269652788215, "flos": 17747628468480.0, "grad_norm": 3.3292031925320975, "language_loss": 0.68380284, "learning_rate": 4.7829142126716516e-08, "loss": 0.70517194, "num_input_tokens_seen": 333913670, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 15472, "time_per_iteration": 2.506180763244629 }, { "auxiliary_loss_clip": 0.0111378, "auxiliary_loss_mlp": 0.01030238, "balance_loss_clip": 1.01910949, "balance_loss_mlp": 1.0319438, "epoch": 0.9302870885314896, "flos": 19390793057280.0, "grad_norm": 1.8094046907158652, "language_loss": 0.88479233, "learning_rate": 4.77470431879361e-08, "loss": 0.90623248, "num_input_tokens_seen": 333934105, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.640625, "step": 15473, "time_per_iteration": 2.5552122592926025 }, { "auxiliary_loss_clip": 0.01125181, "auxiliary_loss_mlp": 0.01039136, "balance_loss_clip": 1.0251466, "balance_loss_mlp": 1.03515649, "epoch": 0.9303472117841575, "flos": 21760286561280.0, "grad_norm": 2.573527593747216, "language_loss": 0.64431202, "learning_rate": 4.7665013920042076e-08, "loss": 0.66595525, "num_input_tokens_seen": 333953635, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 15474, "time_per_iteration": 3.9476873874664307 }, { "auxiliary_loss_clip": 0.01101433, "auxiliary_loss_mlp": 0.01031223, "balance_loss_clip": 1.01903367, "balance_loss_mlp": 1.03365278, "epoch": 0.9304073350368255, "flos": 19609740408960.0, "grad_norm": 1.849621988612331, "language_loss": 0.74586558, "learning_rate": 4.758305432596143e-08, "loss": 0.76719218, "num_input_tokens_seen": 333971825, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 15475, "time_per_iteration": 2.9542763233184814 }, { "auxiliary_loss_clip": 0.01111241, "auxiliary_loss_mlp": 0.01026969, "balance_loss_clip": 1.01510215, "balance_loss_mlp": 1.03382397, "epoch": 0.9304674582894935, "flos": 30441582875520.0, "grad_norm": 2.895365344683936, "language_loss": 0.66678107, "learning_rate": 4.750116440861962e-08, "loss": 0.68816316, "num_input_tokens_seen": 333990120, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.68359375, "step": 15476, "time_per_iteration": 2.632436752319336 }, { "auxiliary_loss_clip": 0.0113928, "auxiliary_loss_mlp": 0.01034652, "balance_loss_clip": 1.02209949, "balance_loss_mlp": 1.03502679, "epoch": 0.9305275815421614, "flos": 17785693906560.0, "grad_norm": 1.8500037467672736, "language_loss": 0.68758976, "learning_rate": 4.7419344170938514e-08, "loss": 0.70932901, "num_input_tokens_seen": 334007970, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 15477, "time_per_iteration": 2.5936622619628906 }, { "auxiliary_loss_clip": 0.01119108, "auxiliary_loss_mlp": 0.01029678, "balance_loss_clip": 1.01808441, "balance_loss_mlp": 1.03436995, "epoch": 0.9305877047948294, "flos": 25298456970240.0, "grad_norm": 1.7888761350171507, "language_loss": 0.5866996, "learning_rate": 4.733759361583889e-08, "loss": 0.60818744, "num_input_tokens_seen": 334027120, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 15478, "time_per_iteration": 2.5772416591644287 }, { "auxiliary_loss_clip": 0.01126108, "auxiliary_loss_mlp": 0.01029245, "balance_loss_clip": 1.01561308, "balance_loss_mlp": 1.03472018, "epoch": 0.9306478280474973, "flos": 16977236544000.0, "grad_norm": 1.8426646772192803, "language_loss": 0.78342795, "learning_rate": 4.7255912746237524e-08, "loss": 0.80498147, "num_input_tokens_seen": 334042785, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 15479, "time_per_iteration": 2.504068613052368 }, { "auxiliary_loss_clip": 0.01102388, "auxiliary_loss_mlp": 0.01034783, "balance_loss_clip": 1.02291012, "balance_loss_mlp": 1.03428435, "epoch": 0.9307079513001654, "flos": 35663353608960.0, "grad_norm": 1.635886246012865, "language_loss": 0.68288386, "learning_rate": 4.7174301565049203e-08, "loss": 0.70425558, "num_input_tokens_seen": 334063480, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 15480, "time_per_iteration": 2.593355655670166 }, { "auxiliary_loss_clip": 0.01103616, "auxiliary_loss_mlp": 0.01031554, "balance_loss_clip": 1.01967442, "balance_loss_mlp": 1.03492689, "epoch": 0.9307680745528333, "flos": 19208151377280.0, "grad_norm": 1.7420605158895939, "language_loss": 0.67381483, "learning_rate": 4.709276007518736e-08, "loss": 0.69516647, "num_input_tokens_seen": 334082005, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 15481, "time_per_iteration": 4.007604360580444 }, { "auxiliary_loss_clip": 0.01116801, "auxiliary_loss_mlp": 0.0102638, "balance_loss_clip": 1.01490593, "balance_loss_mlp": 1.0322926, "epoch": 0.9308281978055013, "flos": 19062641381760.0, "grad_norm": 2.058269743713203, "language_loss": 0.74601299, "learning_rate": 4.701128827956102e-08, "loss": 0.76744479, "num_input_tokens_seen": 334101375, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66796875, "step": 15482, "time_per_iteration": 2.5302188396453857 }, { "auxiliary_loss_clip": 0.01107653, "auxiliary_loss_mlp": 0.01028846, "balance_loss_clip": 1.01704383, "balance_loss_mlp": 1.03253043, "epoch": 0.9308883210581692, "flos": 20521548178560.0, "grad_norm": 1.9508555937186143, "language_loss": 0.80057979, "learning_rate": 4.6929886181078294e-08, "loss": 0.82194477, "num_input_tokens_seen": 334119460, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 15483, "time_per_iteration": 4.311863422393799 }, { "auxiliary_loss_clip": 0.01121057, "auxiliary_loss_mlp": 0.010303, "balance_loss_clip": 1.0182066, "balance_loss_mlp": 1.03445625, "epoch": 0.9309484443108372, "flos": 13001422826880.0, "grad_norm": 1.7619626751655642, "language_loss": 0.74294049, "learning_rate": 4.684855378264396e-08, "loss": 0.76445413, "num_input_tokens_seen": 334136065, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 15484, "time_per_iteration": 2.687567949295044 }, { "auxiliary_loss_clip": 0.01123123, "auxiliary_loss_mlp": 0.01032558, "balance_loss_clip": 1.0202136, "balance_loss_mlp": 1.03574824, "epoch": 0.9310085675635051, "flos": 12567765928320.0, "grad_norm": 2.3042647373622915, "language_loss": 0.76576799, "learning_rate": 4.67672910871606e-08, "loss": 0.78732485, "num_input_tokens_seen": 334153690, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69921875, "step": 15485, "time_per_iteration": 2.604889154434204 }, { "auxiliary_loss_clip": 0.01139647, "auxiliary_loss_mlp": 0.0102989, "balance_loss_clip": 1.01615667, "balance_loss_mlp": 1.03374553, "epoch": 0.9310686908161732, "flos": 23477570864640.0, "grad_norm": 3.798651553426305, "language_loss": 0.77969539, "learning_rate": 4.668609809752832e-08, "loss": 0.80139083, "num_input_tokens_seen": 334171880, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 15486, "time_per_iteration": 2.7042548656463623 }, { "auxiliary_loss_clip": 0.01134268, "auxiliary_loss_mlp": 0.01030602, "balance_loss_clip": 1.01782262, "balance_loss_mlp": 1.03681886, "epoch": 0.9311288140688411, "flos": 24170287628160.0, "grad_norm": 1.8991878380976968, "language_loss": 0.77141947, "learning_rate": 4.6604974816644606e-08, "loss": 0.79306817, "num_input_tokens_seen": 334190005, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 15487, "time_per_iteration": 2.572399377822876 }, { "auxiliary_loss_clip": 0.01129024, "auxiliary_loss_mlp": 0.01029891, "balance_loss_clip": 1.01672459, "balance_loss_mlp": 1.03460789, "epoch": 0.9311889373215091, "flos": 35590203561600.0, "grad_norm": 1.919466281577276, "language_loss": 0.66903925, "learning_rate": 4.652392124740468e-08, "loss": 0.69062835, "num_input_tokens_seen": 334209545, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6796875, "step": 15488, "time_per_iteration": 2.623478651046753 }, { "auxiliary_loss_clip": 0.01116183, "auxiliary_loss_mlp": 0.0103095, "balance_loss_clip": 1.01857615, "balance_loss_mlp": 1.03330863, "epoch": 0.9312490605741771, "flos": 20230528187520.0, "grad_norm": 1.7856484867180287, "language_loss": 0.74707317, "learning_rate": 4.644293739270133e-08, "loss": 0.76854455, "num_input_tokens_seen": 334228900, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.65625, "step": 15489, "time_per_iteration": 2.5246336460113525 }, { "auxiliary_loss_clip": 0.01103975, "auxiliary_loss_mlp": 0.01035322, "balance_loss_clip": 1.02240574, "balance_loss_mlp": 1.03353477, "epoch": 0.931309183826845, "flos": 13950577762560.0, "grad_norm": 6.60944295507674, "language_loss": 0.80862081, "learning_rate": 4.6362023255423823e-08, "loss": 0.83001375, "num_input_tokens_seen": 334245500, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 15490, "time_per_iteration": 2.5017573833465576 }, { "auxiliary_loss_clip": 0.01138406, "auxiliary_loss_mlp": 0.01032548, "balance_loss_clip": 1.01907122, "balance_loss_mlp": 1.03339005, "epoch": 0.931369307079513, "flos": 20156731695360.0, "grad_norm": 1.6145390051542061, "language_loss": 0.71769869, "learning_rate": 4.628117883846094e-08, "loss": 0.73940825, "num_input_tokens_seen": 334264370, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 15491, "time_per_iteration": 2.5386674404144287 }, { "auxiliary_loss_clip": 0.01115165, "auxiliary_loss_mlp": 0.01272428, "balance_loss_clip": 1.0145185, "balance_loss_mlp": 1.03221989, "epoch": 0.931429430332181, "flos": 25338569483520.0, "grad_norm": 1.6638348512886763, "language_loss": 0.74442482, "learning_rate": 4.620040414469684e-08, "loss": 0.76830083, "num_input_tokens_seen": 334283905, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.65625, "step": 15492, "time_per_iteration": 2.5907225608825684 }, { "auxiliary_loss_clip": 0.01031409, "auxiliary_loss_mlp": 0.01001534, "balance_loss_clip": 1.00021124, "balance_loss_mlp": 1.00101161, "epoch": 0.931489553584849, "flos": 71226193985280.0, "grad_norm": 0.7125219013474675, "language_loss": 0.53446245, "learning_rate": 4.611969917701475e-08, "loss": 0.55479193, "num_input_tokens_seen": 334339925, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.21289062, "step": 15493, "time_per_iteration": 3.0850167274475098 }, { "auxiliary_loss_clip": 0.01110471, "auxiliary_loss_mlp": 0.01030145, "balance_loss_clip": 1.01854622, "balance_loss_mlp": 1.0338223, "epoch": 0.9315496768375169, "flos": 14643653662080.0, "grad_norm": 2.0466747864037926, "language_loss": 0.7110579, "learning_rate": 4.6039063938294596e-08, "loss": 0.73246413, "num_input_tokens_seen": 334357225, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 15494, "time_per_iteration": 2.5289149284362793 }, { "auxiliary_loss_clip": 0.01111795, "auxiliary_loss_mlp": 0.0102584, "balance_loss_clip": 1.01309037, "balance_loss_mlp": 1.03234506, "epoch": 0.9316098000901849, "flos": 28329928174080.0, "grad_norm": 2.1002312669688976, "language_loss": 0.68400061, "learning_rate": 4.595849843141386e-08, "loss": 0.70537698, "num_input_tokens_seen": 334375945, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 15495, "time_per_iteration": 2.639160394668579 }, { "auxiliary_loss_clip": 0.01099469, "auxiliary_loss_mlp": 0.01031698, "balance_loss_clip": 1.02007484, "balance_loss_mlp": 1.03247452, "epoch": 0.9316699233428528, "flos": 28512677594880.0, "grad_norm": 2.0685076819176063, "language_loss": 0.61346149, "learning_rate": 4.587800265924824e-08, "loss": 0.63477319, "num_input_tokens_seen": 334395310, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 15496, "time_per_iteration": 2.6173312664031982 }, { "auxiliary_loss_clip": 0.01112735, "auxiliary_loss_mlp": 0.01033914, "balance_loss_clip": 1.02159953, "balance_loss_mlp": 1.0348568, "epoch": 0.9317300465955208, "flos": 33693402061440.0, "grad_norm": 1.6200325279992482, "language_loss": 0.7708199, "learning_rate": 4.5797576624669654e-08, "loss": 0.7922864, "num_input_tokens_seen": 334416965, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 15497, "time_per_iteration": 2.6898725032806396 }, { "auxiliary_loss_clip": 0.01114295, "auxiliary_loss_mlp": 0.01281908, "balance_loss_clip": 1.0224458, "balance_loss_mlp": 1.03504562, "epoch": 0.9317901698481887, "flos": 23658237296640.0, "grad_norm": 1.668046603444203, "language_loss": 0.6650219, "learning_rate": 4.571722033054937e-08, "loss": 0.68898392, "num_input_tokens_seen": 334435620, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 15498, "time_per_iteration": 2.642204761505127 }, { "auxiliary_loss_clip": 0.01122639, "auxiliary_loss_mlp": 0.01037365, "balance_loss_clip": 1.0245024, "balance_loss_mlp": 1.03509927, "epoch": 0.9318502931008568, "flos": 20960017499520.0, "grad_norm": 1.7234744903533783, "language_loss": 0.80143964, "learning_rate": 4.563693377975397e-08, "loss": 0.82303965, "num_input_tokens_seen": 334456210, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 15499, "time_per_iteration": 2.6348485946655273 }, { "auxiliary_loss_clip": 0.01136985, "auxiliary_loss_mlp": 0.01031629, "balance_loss_clip": 1.01949978, "balance_loss_mlp": 1.03518116, "epoch": 0.9319104163535247, "flos": 23441049711360.0, "grad_norm": 1.482420477691625, "language_loss": 0.76746082, "learning_rate": 4.55567169751494e-08, "loss": 0.7891469, "num_input_tokens_seen": 334475485, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6640625, "step": 15500, "time_per_iteration": 2.689772367477417 }, { "auxiliary_loss_clip": 0.01125066, "auxiliary_loss_mlp": 0.0102609, "balance_loss_clip": 1.01369858, "balance_loss_mlp": 1.03140032, "epoch": 0.9319705396061927, "flos": 22347426274560.0, "grad_norm": 1.7458266648939667, "language_loss": 0.7214613, "learning_rate": 4.547656991959803e-08, "loss": 0.74297285, "num_input_tokens_seen": 334494740, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.671875, "step": 15501, "time_per_iteration": 2.5980544090270996 }, { "auxiliary_loss_clip": 0.01104899, "auxiliary_loss_mlp": 0.01034605, "balance_loss_clip": 1.02244568, "balance_loss_mlp": 1.03428912, "epoch": 0.9320306628588607, "flos": 20993557824000.0, "grad_norm": 1.7319775476078207, "language_loss": 0.66499954, "learning_rate": 4.5396492615960456e-08, "loss": 0.68639457, "num_input_tokens_seen": 334511910, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.70703125, "step": 15502, "time_per_iteration": 2.6546754837036133 }, { "auxiliary_loss_clip": 0.01113605, "auxiliary_loss_mlp": 0.01280511, "balance_loss_clip": 1.02180767, "balance_loss_mlp": 1.0345459, "epoch": 0.9320907861115286, "flos": 24538300421760.0, "grad_norm": 1.5264195445958042, "language_loss": 0.65905201, "learning_rate": 4.5316485067093956e-08, "loss": 0.68299311, "num_input_tokens_seen": 334533150, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.703125, "step": 15503, "time_per_iteration": 2.675004005432129 }, { "auxiliary_loss_clip": 0.01131686, "auxiliary_loss_mlp": 0.01031386, "balance_loss_clip": 1.01875043, "balance_loss_mlp": 1.03403211, "epoch": 0.9321509093641966, "flos": 19785414850560.0, "grad_norm": 2.4910595735490837, "language_loss": 0.74945927, "learning_rate": 4.523654727585424e-08, "loss": 0.77108997, "num_input_tokens_seen": 334550940, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 15504, "time_per_iteration": 2.584110736846924 }, { "auxiliary_loss_clip": 0.01113262, "auxiliary_loss_mlp": 0.01276914, "balance_loss_clip": 1.01646566, "balance_loss_mlp": 1.03511572, "epoch": 0.9322110326168646, "flos": 24972675592320.0, "grad_norm": 5.0312349538008645, "language_loss": 0.71670055, "learning_rate": 4.51566792450937e-08, "loss": 0.74060237, "num_input_tokens_seen": 334570935, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.69140625, "step": 15505, "time_per_iteration": 2.5909500122070312 }, { "auxiliary_loss_clip": 0.01121537, "auxiliary_loss_mlp": 0.01030131, "balance_loss_clip": 1.01756644, "balance_loss_mlp": 1.03484559, "epoch": 0.9322711558695326, "flos": 16143642639360.0, "grad_norm": 1.7512306343754445, "language_loss": 0.75502455, "learning_rate": 4.507688097766316e-08, "loss": 0.77654123, "num_input_tokens_seen": 334589315, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 15506, "time_per_iteration": 2.553274393081665 }, { "auxiliary_loss_clip": 0.01126391, "auxiliary_loss_mlp": 0.01028301, "balance_loss_clip": 1.01649344, "balance_loss_mlp": 1.03382516, "epoch": 0.9323312791222005, "flos": 23732428838400.0, "grad_norm": 1.9458136233080994, "language_loss": 0.82957423, "learning_rate": 4.499715247640967e-08, "loss": 0.85112119, "num_input_tokens_seen": 334608990, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.65625, "step": 15507, "time_per_iteration": 2.527038812637329 }, { "auxiliary_loss_clip": 0.01104269, "auxiliary_loss_mlp": 0.01030933, "balance_loss_clip": 1.01864851, "balance_loss_mlp": 1.03434324, "epoch": 0.9323914023748685, "flos": 20777914523520.0, "grad_norm": 1.4611641123552495, "language_loss": 0.67688888, "learning_rate": 4.491749374417941e-08, "loss": 0.69824088, "num_input_tokens_seen": 334628655, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 15508, "time_per_iteration": 2.5039916038513184 }, { "auxiliary_loss_clip": 0.01128784, "auxiliary_loss_mlp": 0.01030566, "balance_loss_clip": 1.01838255, "balance_loss_mlp": 1.03404558, "epoch": 0.9324515256275364, "flos": 23915178259200.0, "grad_norm": 1.815366339012464, "language_loss": 0.72523367, "learning_rate": 4.4837904783814554e-08, "loss": 0.74682713, "num_input_tokens_seen": 334648295, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 15509, "time_per_iteration": 4.084963083267212 }, { "auxiliary_loss_clip": 0.01111745, "auxiliary_loss_mlp": 0.01030985, "balance_loss_clip": 1.0189805, "balance_loss_mlp": 1.03409028, "epoch": 0.9325116488802044, "flos": 18005215875840.0, "grad_norm": 2.355156438951397, "language_loss": 0.7411744, "learning_rate": 4.4758385598155255e-08, "loss": 0.76260167, "num_input_tokens_seen": 334666280, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 15510, "time_per_iteration": 2.4923911094665527 }, { "auxiliary_loss_clip": 0.01115388, "auxiliary_loss_mlp": 0.0102746, "balance_loss_clip": 1.01536059, "balance_loss_mlp": 1.03466344, "epoch": 0.9325717721328723, "flos": 29021603443200.0, "grad_norm": 1.460054340914922, "language_loss": 0.69967705, "learning_rate": 4.467893619004015e-08, "loss": 0.72110552, "num_input_tokens_seen": 334688830, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71875, "step": 15511, "time_per_iteration": 2.67287015914917 }, { "auxiliary_loss_clip": 0.01120945, "auxiliary_loss_mlp": 0.01035769, "balance_loss_clip": 1.02327585, "balance_loss_mlp": 1.03416562, "epoch": 0.9326318953855404, "flos": 21646341642240.0, "grad_norm": 1.7682170311439558, "language_loss": 0.78049469, "learning_rate": 4.4599556562303854e-08, "loss": 0.80206186, "num_input_tokens_seen": 334705205, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 15512, "time_per_iteration": 2.8145785331726074 }, { "auxiliary_loss_clip": 0.01099783, "auxiliary_loss_mlp": 0.01027693, "balance_loss_clip": 1.01461554, "balance_loss_mlp": 1.03159118, "epoch": 0.9326920186382083, "flos": 26065724411520.0, "grad_norm": 1.9099016915300746, "language_loss": 0.80483228, "learning_rate": 4.4520246717779654e-08, "loss": 0.82610703, "num_input_tokens_seen": 334723830, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6796875, "step": 15513, "time_per_iteration": 2.620633602142334 }, { "auxiliary_loss_clip": 0.01125632, "auxiliary_loss_mlp": 0.01032662, "balance_loss_clip": 1.01914346, "balance_loss_mlp": 1.03490901, "epoch": 0.9327521418908763, "flos": 12057116227200.0, "grad_norm": 2.3247179938745535, "language_loss": 0.79887712, "learning_rate": 4.444100665929751e-08, "loss": 0.82046002, "num_input_tokens_seen": 334740825, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 15514, "time_per_iteration": 2.5490729808807373 }, { "auxiliary_loss_clip": 0.01119144, "auxiliary_loss_mlp": 0.01040238, "balance_loss_clip": 1.02856123, "balance_loss_mlp": 1.0347811, "epoch": 0.9328122651435443, "flos": 43834395271680.0, "grad_norm": 1.6629519923630025, "language_loss": 0.71504045, "learning_rate": 4.4361836389685823e-08, "loss": 0.73663425, "num_input_tokens_seen": 334765825, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.65625, "step": 15515, "time_per_iteration": 2.884762763977051 }, { "auxiliary_loss_clip": 0.01123451, "auxiliary_loss_mlp": 0.01032611, "balance_loss_clip": 1.01968837, "balance_loss_mlp": 1.03439271, "epoch": 0.9328723883962122, "flos": 15194954580480.0, "grad_norm": 2.4476190906939523, "language_loss": 0.81331861, "learning_rate": 4.428273591176945e-08, "loss": 0.83487922, "num_input_tokens_seen": 334782680, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 15516, "time_per_iteration": 4.134262800216675 }, { "auxiliary_loss_clip": 0.01131329, "auxiliary_loss_mlp": 0.01027278, "balance_loss_clip": 1.01510656, "balance_loss_mlp": 1.03367519, "epoch": 0.9329325116488802, "flos": 20261770041600.0, "grad_norm": 1.7084684523080154, "language_loss": 0.8094517, "learning_rate": 4.420370522837169e-08, "loss": 0.83103782, "num_input_tokens_seen": 334800160, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.703125, "step": 15517, "time_per_iteration": 2.752437114715576 }, { "auxiliary_loss_clip": 0.01121973, "auxiliary_loss_mlp": 0.01032602, "balance_loss_clip": 1.01932192, "balance_loss_mlp": 1.03397059, "epoch": 0.9329926349015482, "flos": 13115008609920.0, "grad_norm": 2.000464757941843, "language_loss": 0.74587834, "learning_rate": 4.4124744342312726e-08, "loss": 0.76742405, "num_input_tokens_seen": 334815840, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 15518, "time_per_iteration": 3.2752065658569336 }, { "auxiliary_loss_clip": 0.01119805, "auxiliary_loss_mlp": 0.01034024, "balance_loss_clip": 1.02147758, "balance_loss_mlp": 1.03345394, "epoch": 0.9330527581542162, "flos": 23040250778880.0, "grad_norm": 3.5454977452434333, "language_loss": 0.75655496, "learning_rate": 4.404585325641075e-08, "loss": 0.77809322, "num_input_tokens_seen": 334834735, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 15519, "time_per_iteration": 3.217707633972168 }, { "auxiliary_loss_clip": 0.01146231, "auxiliary_loss_mlp": 0.01036096, "balance_loss_clip": 1.02271461, "balance_loss_mlp": 1.033288, "epoch": 0.9331128814068841, "flos": 15559627409280.0, "grad_norm": 2.086165090596153, "language_loss": 0.82601571, "learning_rate": 4.3967031973480394e-08, "loss": 0.84783894, "num_input_tokens_seen": 334853490, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.68359375, "step": 15520, "time_per_iteration": 3.417591094970703 }, { "auxiliary_loss_clip": 0.01109857, "auxiliary_loss_mlp": 0.01027011, "balance_loss_clip": 1.01531017, "balance_loss_mlp": 1.0331887, "epoch": 0.9331730046595521, "flos": 22271762275200.0, "grad_norm": 1.5505956930963487, "language_loss": 0.76117837, "learning_rate": 4.388828049633564e-08, "loss": 0.78254712, "num_input_tokens_seen": 334873675, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 15521, "time_per_iteration": 2.8756604194641113 }, { "auxiliary_loss_clip": 0.01113471, "auxiliary_loss_mlp": 0.01033794, "balance_loss_clip": 1.02158725, "balance_loss_mlp": 1.03686428, "epoch": 0.93323312791222, "flos": 15777641007360.0, "grad_norm": 1.7345948025501596, "language_loss": 0.77448797, "learning_rate": 4.380959882778623e-08, "loss": 0.79596066, "num_input_tokens_seen": 334890970, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 15522, "time_per_iteration": 2.667675733566284 }, { "auxiliary_loss_clip": 0.01129574, "auxiliary_loss_mlp": 0.0127661, "balance_loss_clip": 1.01730871, "balance_loss_mlp": 1.03403318, "epoch": 0.933293251164888, "flos": 22010978557440.0, "grad_norm": 1.8581801541300613, "language_loss": 0.63125432, "learning_rate": 4.3730986970640596e-08, "loss": 0.65531611, "num_input_tokens_seen": 334906635, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 15523, "time_per_iteration": 4.029750108718872 }, { "auxiliary_loss_clip": 0.01117919, "auxiliary_loss_mlp": 0.01029537, "balance_loss_clip": 1.01772952, "balance_loss_mlp": 1.03323269, "epoch": 0.933353374417556, "flos": 26031358074240.0, "grad_norm": 1.7122695561768888, "language_loss": 0.67888981, "learning_rate": 4.365244492770359e-08, "loss": 0.70036435, "num_input_tokens_seen": 334926230, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 15524, "time_per_iteration": 4.226076364517212 }, { "auxiliary_loss_clip": 0.0113904, "auxiliary_loss_mlp": 0.01035574, "balance_loss_clip": 1.02262211, "balance_loss_mlp": 1.03390753, "epoch": 0.933413497670224, "flos": 24900100162560.0, "grad_norm": 1.5761297944494825, "language_loss": 0.74068272, "learning_rate": 4.357397270177876e-08, "loss": 0.76242888, "num_input_tokens_seen": 334946680, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 15525, "time_per_iteration": 2.5782711505889893 }, { "auxiliary_loss_clip": 0.01136169, "auxiliary_loss_mlp": 0.01036456, "balance_loss_clip": 1.02436233, "balance_loss_mlp": 1.0332619, "epoch": 0.9334736209228919, "flos": 23688689051520.0, "grad_norm": 1.631866923436455, "language_loss": 0.83738565, "learning_rate": 4.3495570295666085e-08, "loss": 0.85911191, "num_input_tokens_seen": 334964785, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 15526, "time_per_iteration": 2.565675735473633 }, { "auxiliary_loss_clip": 0.0113991, "auxiliary_loss_mlp": 0.01028366, "balance_loss_clip": 1.01590312, "balance_loss_mlp": 1.03489184, "epoch": 0.9335337441755599, "flos": 15377344865280.0, "grad_norm": 1.8964583418941925, "language_loss": 0.69226682, "learning_rate": 4.3417237712163766e-08, "loss": 0.71394956, "num_input_tokens_seen": 334982400, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69921875, "step": 15527, "time_per_iteration": 2.5763068199157715 }, { "auxiliary_loss_clip": 0.01105044, "auxiliary_loss_mlp": 0.01031238, "balance_loss_clip": 1.01722503, "balance_loss_mlp": 1.03425658, "epoch": 0.9335938674282279, "flos": 16106726436480.0, "grad_norm": 3.034395716050434, "language_loss": 0.65123314, "learning_rate": 4.333897495406758e-08, "loss": 0.67259592, "num_input_tokens_seen": 334999685, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.70703125, "step": 15528, "time_per_iteration": 2.5038490295410156 }, { "auxiliary_loss_clip": 0.01104438, "auxiliary_loss_mlp": 0.01031299, "balance_loss_clip": 1.0182637, "balance_loss_mlp": 1.03371, "epoch": 0.9336539906808958, "flos": 18952898353920.0, "grad_norm": 2.2616495241733596, "language_loss": 0.74640167, "learning_rate": 4.326078202416994e-08, "loss": 0.76775897, "num_input_tokens_seen": 335019160, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 15529, "time_per_iteration": 2.4987540245056152 }, { "auxiliary_loss_clip": 0.0113703, "auxiliary_loss_mlp": 0.01030669, "balance_loss_clip": 1.01911139, "balance_loss_mlp": 1.03367615, "epoch": 0.9337141139335638, "flos": 18109104986880.0, "grad_norm": 3.8839209097845946, "language_loss": 0.63229513, "learning_rate": 4.318265892526174e-08, "loss": 0.65397215, "num_input_tokens_seen": 335037350, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 15530, "time_per_iteration": 2.8171064853668213 }, { "auxiliary_loss_clip": 0.01118413, "auxiliary_loss_mlp": 0.0102811, "balance_loss_clip": 1.01641536, "balance_loss_mlp": 1.03353405, "epoch": 0.9337742371862318, "flos": 17345716214400.0, "grad_norm": 1.657181660918233, "language_loss": 0.72901255, "learning_rate": 4.3104605660130744e-08, "loss": 0.75047779, "num_input_tokens_seen": 335056060, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 15531, "time_per_iteration": 2.5753798484802246 }, { "auxiliary_loss_clip": 0.01119952, "auxiliary_loss_mlp": 0.01029022, "balance_loss_clip": 1.01693428, "balance_loss_mlp": 1.03423977, "epoch": 0.9338343604388998, "flos": 29058986522880.0, "grad_norm": 1.6394928454323305, "language_loss": 0.71039003, "learning_rate": 4.302662223156317e-08, "loss": 0.73187971, "num_input_tokens_seen": 335075410, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 15532, "time_per_iteration": 2.607032299041748 }, { "auxiliary_loss_clip": 0.01103181, "auxiliary_loss_mlp": 0.01277525, "balance_loss_clip": 1.01842737, "balance_loss_mlp": 1.03308642, "epoch": 0.9338944836915677, "flos": 18660908695680.0, "grad_norm": 1.9839089688259965, "language_loss": 0.73012263, "learning_rate": 4.294870864234101e-08, "loss": 0.75392973, "num_input_tokens_seen": 335095190, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 15533, "time_per_iteration": 2.53834867477417 }, { "auxiliary_loss_clip": 0.01126772, "auxiliary_loss_mlp": 0.01026965, "balance_loss_clip": 1.01500857, "balance_loss_mlp": 1.03357089, "epoch": 0.9339546069442357, "flos": 16617735273600.0, "grad_norm": 2.017557059889856, "language_loss": 0.80222797, "learning_rate": 4.2870864895245385e-08, "loss": 0.82376528, "num_input_tokens_seen": 335113825, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 15534, "time_per_iteration": 2.6312308311462402 }, { "auxiliary_loss_clip": 0.010967, "auxiliary_loss_mlp": 0.01030414, "balance_loss_clip": 1.01950645, "balance_loss_mlp": 1.0336802, "epoch": 0.9340147301969036, "flos": 23693106424320.0, "grad_norm": 2.452464578957956, "language_loss": 0.74613094, "learning_rate": 4.2793090993054506e-08, "loss": 0.76740217, "num_input_tokens_seen": 335136425, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6328125, "step": 15535, "time_per_iteration": 2.617382526397705 }, { "auxiliary_loss_clip": 0.01112993, "auxiliary_loss_mlp": 0.01030608, "balance_loss_clip": 1.01791835, "balance_loss_mlp": 1.03463197, "epoch": 0.9340748534495716, "flos": 28654452576000.0, "grad_norm": 1.9077557249376957, "language_loss": 0.77685392, "learning_rate": 4.271538693854349e-08, "loss": 0.7982899, "num_input_tokens_seen": 335157925, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 15536, "time_per_iteration": 2.555013656616211 }, { "auxiliary_loss_clip": 0.01125777, "auxiliary_loss_mlp": 0.01028218, "balance_loss_clip": 1.01592791, "balance_loss_mlp": 1.03135908, "epoch": 0.9341349767022396, "flos": 24899633285760.0, "grad_norm": 1.5195332896252576, "language_loss": 0.80738467, "learning_rate": 4.2637752734485444e-08, "loss": 0.8289246, "num_input_tokens_seen": 335177840, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.67578125, "step": 15537, "time_per_iteration": 2.5521786212921143 }, { "auxiliary_loss_clip": 0.01121734, "auxiliary_loss_mlp": 0.01031863, "balance_loss_clip": 1.01979923, "balance_loss_mlp": 1.03457189, "epoch": 0.9341950999549076, "flos": 29059525226880.0, "grad_norm": 1.8135748763551909, "language_loss": 0.7792955, "learning_rate": 4.256018838365128e-08, "loss": 0.8008315, "num_input_tokens_seen": 335199470, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6953125, "step": 15538, "time_per_iteration": 2.5692431926727295 }, { "auxiliary_loss_clip": 0.01118757, "auxiliary_loss_mlp": 0.01029483, "balance_loss_clip": 1.01755023, "balance_loss_mlp": 1.03343821, "epoch": 0.9342552232075755, "flos": 23587062497280.0, "grad_norm": 1.4946410398971435, "language_loss": 0.73227412, "learning_rate": 4.2482693888808985e-08, "loss": 0.75375658, "num_input_tokens_seen": 335218885, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 15539, "time_per_iteration": 2.640855073928833 }, { "auxiliary_loss_clip": 0.01136756, "auxiliary_loss_mlp": 0.01029905, "balance_loss_clip": 1.01878834, "balance_loss_mlp": 1.03319955, "epoch": 0.9343153464602435, "flos": 36721389646080.0, "grad_norm": 1.496152142679702, "language_loss": 0.64695752, "learning_rate": 4.24052692527237e-08, "loss": 0.6686241, "num_input_tokens_seen": 335239485, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6796875, "step": 15540, "time_per_iteration": 2.7131385803222656 }, { "auxiliary_loss_clip": 0.01109459, "auxiliary_loss_mlp": 0.01031194, "balance_loss_clip": 1.01958311, "balance_loss_mlp": 1.03289843, "epoch": 0.9343754697129115, "flos": 22236498097920.0, "grad_norm": 1.5759101945179106, "language_loss": 0.76603806, "learning_rate": 4.232791447815898e-08, "loss": 0.78744453, "num_input_tokens_seen": 335258355, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 15541, "time_per_iteration": 2.7884862422943115 }, { "auxiliary_loss_clip": 0.01114032, "auxiliary_loss_mlp": 0.01033739, "balance_loss_clip": 1.02142489, "balance_loss_mlp": 1.03624129, "epoch": 0.9344355929655794, "flos": 29710333797120.0, "grad_norm": 1.9122450761638885, "language_loss": 0.66975284, "learning_rate": 4.225062956787528e-08, "loss": 0.69123054, "num_input_tokens_seen": 335276835, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 15542, "time_per_iteration": 2.5773448944091797 }, { "auxiliary_loss_clip": 0.01129707, "auxiliary_loss_mlp": 0.0102782, "balance_loss_clip": 1.01573765, "balance_loss_mlp": 1.0339781, "epoch": 0.9344957162182475, "flos": 18880394751360.0, "grad_norm": 1.6177222863517098, "language_loss": 0.69426274, "learning_rate": 4.217341452463064e-08, "loss": 0.71583796, "num_input_tokens_seen": 335296220, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 15543, "time_per_iteration": 2.4979705810546875 }, { "auxiliary_loss_clip": 0.01112536, "auxiliary_loss_mlp": 0.01028343, "balance_loss_clip": 1.01521158, "balance_loss_mlp": 1.0329951, "epoch": 0.9345558394709154, "flos": 27417761268480.0, "grad_norm": 1.8259641826101725, "language_loss": 0.69582993, "learning_rate": 4.209626935118038e-08, "loss": 0.71723878, "num_input_tokens_seen": 335316335, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 15544, "time_per_iteration": 2.536747932434082 }, { "auxiliary_loss_clip": 0.01112868, "auxiliary_loss_mlp": 0.0103738, "balance_loss_clip": 1.02519667, "balance_loss_mlp": 1.03447866, "epoch": 0.9346159627235834, "flos": 20741285629440.0, "grad_norm": 1.723711782677576, "language_loss": 0.7706213, "learning_rate": 4.201919405027854e-08, "loss": 0.79212379, "num_input_tokens_seen": 335335545, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 15545, "time_per_iteration": 2.470048427581787 }, { "auxiliary_loss_clip": 0.01112452, "auxiliary_loss_mlp": 0.01028853, "balance_loss_clip": 1.01755834, "balance_loss_mlp": 1.03411198, "epoch": 0.9346760859762513, "flos": 21069221823360.0, "grad_norm": 1.5663359631976579, "language_loss": 0.68975294, "learning_rate": 4.194218862467469e-08, "loss": 0.71116602, "num_input_tokens_seen": 335355350, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6953125, "step": 15546, "time_per_iteration": 2.5322132110595703 }, { "auxiliary_loss_clip": 0.01125124, "auxiliary_loss_mlp": 0.01028529, "balance_loss_clip": 1.01695347, "balance_loss_mlp": 1.03339911, "epoch": 0.9347362092289193, "flos": 29204927481600.0, "grad_norm": 1.780823985399598, "language_loss": 0.82376641, "learning_rate": 4.186525307711752e-08, "loss": 0.84530294, "num_input_tokens_seen": 335375160, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6484375, "step": 15547, "time_per_iteration": 2.597223997116089 }, { "auxiliary_loss_clip": 0.01107663, "auxiliary_loss_mlp": 0.01032823, "balance_loss_clip": 1.0214982, "balance_loss_mlp": 1.03214419, "epoch": 0.9347963324815872, "flos": 19427350124160.0, "grad_norm": 3.596234967548186, "language_loss": 0.83441359, "learning_rate": 4.17883874103524e-08, "loss": 0.85581851, "num_input_tokens_seen": 335394080, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6640625, "step": 15548, "time_per_iteration": 2.503145933151245 }, { "auxiliary_loss_clip": 0.01114884, "auxiliary_loss_mlp": 0.01032749, "balance_loss_clip": 1.01938534, "balance_loss_mlp": 1.03337646, "epoch": 0.9348564557342552, "flos": 36901840596480.0, "grad_norm": 1.8449430446007864, "language_loss": 0.65631342, "learning_rate": 4.17115916271229e-08, "loss": 0.67778969, "num_input_tokens_seen": 335414230, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 15549, "time_per_iteration": 2.626836061477661 }, { "auxiliary_loss_clip": 0.01113454, "auxiliary_loss_mlp": 0.01032345, "balance_loss_clip": 1.01890397, "balance_loss_mlp": 1.03504586, "epoch": 0.9349165789869232, "flos": 21690117342720.0, "grad_norm": 2.7038619842781118, "language_loss": 0.80271292, "learning_rate": 4.1634865730169276e-08, "loss": 0.82417095, "num_input_tokens_seen": 335432890, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 15550, "time_per_iteration": 2.5098471641540527 }, { "auxiliary_loss_clip": 0.01112185, "auxiliary_loss_mlp": 0.01030906, "balance_loss_clip": 1.01914644, "balance_loss_mlp": 1.03528249, "epoch": 0.9349767022395912, "flos": 18844053166080.0, "grad_norm": 1.7236175320762963, "language_loss": 0.75716925, "learning_rate": 4.155820972222979e-08, "loss": 0.7786001, "num_input_tokens_seen": 335452085, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 15551, "time_per_iteration": 3.91778826713562 }, { "auxiliary_loss_clip": 0.01143232, "auxiliary_loss_mlp": 0.01031621, "balance_loss_clip": 1.02111852, "balance_loss_mlp": 1.03365779, "epoch": 0.9350368254922591, "flos": 24973429777920.0, "grad_norm": 1.662147767885041, "language_loss": 0.73481202, "learning_rate": 4.148162360604002e-08, "loss": 0.75656056, "num_input_tokens_seen": 335472130, "router_z_loss_clip": 0.10498047, "router_z_loss_mlp": 0.65625, "step": 15552, "time_per_iteration": 2.95196270942688 }, { "auxiliary_loss_clip": 0.01136526, "auxiliary_loss_mlp": 0.0103368, "balance_loss_clip": 1.02149129, "balance_loss_mlp": 1.03340387, "epoch": 0.9350969487449271, "flos": 23070594792960.0, "grad_norm": 2.2452513930518427, "language_loss": 0.77305424, "learning_rate": 4.1405107384333335e-08, "loss": 0.79475629, "num_input_tokens_seen": 335489970, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 15553, "time_per_iteration": 2.5809407234191895 }, { "auxiliary_loss_clip": 0.01132078, "auxiliary_loss_mlp": 0.01034824, "balance_loss_clip": 1.02171075, "balance_loss_mlp": 1.03419495, "epoch": 0.9351570719975951, "flos": 18625177641600.0, "grad_norm": 1.6757622724194712, "language_loss": 0.78154469, "learning_rate": 4.1328661059840006e-08, "loss": 0.80321372, "num_input_tokens_seen": 335509125, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 15554, "time_per_iteration": 2.50618052482605 }, { "auxiliary_loss_clip": 0.01115838, "auxiliary_loss_mlp": 0.0102654, "balance_loss_clip": 1.01607919, "balance_loss_mlp": 1.0329504, "epoch": 0.935217195250263, "flos": 15888353702400.0, "grad_norm": 1.484479708471013, "language_loss": 0.6930511, "learning_rate": 4.1252284635288735e-08, "loss": 0.71447486, "num_input_tokens_seen": 335525620, "router_z_loss_clip": 0.10449219, "router_z_loss_mlp": 0.6484375, "step": 15555, "time_per_iteration": 2.534886121749878 }, { "auxiliary_loss_clip": 0.01163467, "auxiliary_loss_mlp": 0.010387, "balance_loss_clip": 1.02565813, "balance_loss_mlp": 1.03280711, "epoch": 0.935277318502931, "flos": 24390312387840.0, "grad_norm": 1.7118367342300747, "language_loss": 0.75729346, "learning_rate": 4.11759781134049e-08, "loss": 0.77931517, "num_input_tokens_seen": 335547565, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.68359375, "step": 15556, "time_per_iteration": 2.6021487712860107 }, { "auxiliary_loss_clip": 0.01117671, "auxiliary_loss_mlp": 0.01031536, "balance_loss_clip": 1.02003849, "balance_loss_mlp": 1.03212833, "epoch": 0.935337441755599, "flos": 27600259294080.0, "grad_norm": 1.73812996249626, "language_loss": 0.72186816, "learning_rate": 4.109974149691142e-08, "loss": 0.74336022, "num_input_tokens_seen": 335570285, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 15557, "time_per_iteration": 2.5379183292388916 }, { "auxiliary_loss_clip": 0.01102006, "auxiliary_loss_mlp": 0.01032311, "balance_loss_clip": 1.01978862, "balance_loss_mlp": 1.03279316, "epoch": 0.935397565008267, "flos": 20482872209280.0, "grad_norm": 1.8540195459759437, "language_loss": 0.63108313, "learning_rate": 4.102357478852947e-08, "loss": 0.65242636, "num_input_tokens_seen": 335588600, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 15558, "time_per_iteration": 3.8725295066833496 }, { "auxiliary_loss_clip": 0.01119165, "auxiliary_loss_mlp": 0.01032125, "balance_loss_clip": 1.01874971, "balance_loss_mlp": 1.03300965, "epoch": 0.9354576882609349, "flos": 23654394541440.0, "grad_norm": 2.1147405463516558, "language_loss": 0.7303375, "learning_rate": 4.0947477990977084e-08, "loss": 0.75185037, "num_input_tokens_seen": 335606235, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6875, "step": 15559, "time_per_iteration": 2.5229177474975586 }, { "auxiliary_loss_clip": 0.01135288, "auxiliary_loss_mlp": 0.01270293, "balance_loss_clip": 1.01290774, "balance_loss_mlp": 1.03379929, "epoch": 0.9355178115136029, "flos": 36684904406400.0, "grad_norm": 6.464029306000336, "language_loss": 0.63733625, "learning_rate": 4.0871451106969876e-08, "loss": 0.66139209, "num_input_tokens_seen": 335628240, "router_z_loss_clip": 0.10546875, "router_z_loss_mlp": 0.65625, "step": 15560, "time_per_iteration": 2.6716175079345703 }, { "auxiliary_loss_clip": 0.01164699, "auxiliary_loss_mlp": 0.01034392, "balance_loss_clip": 1.02009904, "balance_loss_mlp": 1.03560126, "epoch": 0.9355779347662708, "flos": 14460401450880.0, "grad_norm": 2.154936434924377, "language_loss": 0.64164078, "learning_rate": 4.0795494139220785e-08, "loss": 0.66363168, "num_input_tokens_seen": 335643755, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75390625, "step": 15561, "time_per_iteration": 2.568103790283203 }, { "auxiliary_loss_clip": 0.01102549, "auxiliary_loss_mlp": 0.01034583, "balance_loss_clip": 1.02182198, "balance_loss_mlp": 1.03530979, "epoch": 0.9356380580189388, "flos": 23185976256000.0, "grad_norm": 1.7972833828296058, "language_loss": 0.7540921, "learning_rate": 4.071960709044142e-08, "loss": 0.77546346, "num_input_tokens_seen": 335665160, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.671875, "step": 15562, "time_per_iteration": 2.5625476837158203 }, { "auxiliary_loss_clip": 0.01030576, "auxiliary_loss_mlp": 0.01000108, "balance_loss_clip": 0.99888641, "balance_loss_mlp": 1.00083601, "epoch": 0.9356981812716068, "flos": 67471626090240.0, "grad_norm": 0.9496236440872852, "language_loss": 0.62410665, "learning_rate": 4.064378996333895e-08, "loss": 0.64441347, "num_input_tokens_seen": 335715240, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.2109375, "step": 15563, "time_per_iteration": 3.1406519412994385 }, { "auxiliary_loss_clip": 0.01135421, "auxiliary_loss_mlp": 0.010345, "balance_loss_clip": 1.02237689, "balance_loss_mlp": 1.03322697, "epoch": 0.9357583045242748, "flos": 24681619687680.0, "grad_norm": 1.5565291826788736, "language_loss": 0.7819733, "learning_rate": 4.056804276061987e-08, "loss": 0.80367255, "num_input_tokens_seen": 335734970, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6640625, "step": 15564, "time_per_iteration": 2.7317070960998535 }, { "auxiliary_loss_clip": 0.0110306, "auxiliary_loss_mlp": 0.01032867, "balance_loss_clip": 1.01966453, "balance_loss_mlp": 1.032897, "epoch": 0.9358184277769427, "flos": 19463727623040.0, "grad_norm": 1.8304747458733666, "language_loss": 0.78292769, "learning_rate": 4.049236548498669e-08, "loss": 0.80428702, "num_input_tokens_seen": 335753435, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 15565, "time_per_iteration": 4.939566135406494 }, { "auxiliary_loss_clip": 0.01108851, "auxiliary_loss_mlp": 0.01029504, "balance_loss_clip": 1.01799428, "balance_loss_mlp": 1.03322101, "epoch": 0.9358785510296107, "flos": 18916987731840.0, "grad_norm": 1.9133596750159831, "language_loss": 0.71789157, "learning_rate": 4.041675813914103e-08, "loss": 0.7392751, "num_input_tokens_seen": 335772105, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 15566, "time_per_iteration": 4.065150260925293 }, { "auxiliary_loss_clip": 0.01127642, "auxiliary_loss_mlp": 0.01277019, "balance_loss_clip": 1.01833999, "balance_loss_mlp": 1.03216648, "epoch": 0.9359386742822787, "flos": 23441265192960.0, "grad_norm": 2.346579171367723, "language_loss": 0.67464852, "learning_rate": 4.034122072578028e-08, "loss": 0.69869518, "num_input_tokens_seen": 335789125, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 15567, "time_per_iteration": 2.618457794189453 }, { "auxiliary_loss_clip": 0.01120855, "auxiliary_loss_mlp": 0.01030632, "balance_loss_clip": 1.01878834, "balance_loss_mlp": 1.03427815, "epoch": 0.9359987975349466, "flos": 32744067557760.0, "grad_norm": 2.251406100916877, "language_loss": 0.62023562, "learning_rate": 4.026575324760051e-08, "loss": 0.64175051, "num_input_tokens_seen": 335810995, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 15568, "time_per_iteration": 2.7059617042541504 }, { "auxiliary_loss_clip": 0.01118641, "auxiliary_loss_mlp": 0.0103319, "balance_loss_clip": 1.0212332, "balance_loss_mlp": 1.03230953, "epoch": 0.9360589207876147, "flos": 22819651401600.0, "grad_norm": 1.589090036938718, "language_loss": 0.79106283, "learning_rate": 4.019035570729534e-08, "loss": 0.81258112, "num_input_tokens_seen": 335830580, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.68359375, "step": 15569, "time_per_iteration": 2.61569881439209 }, { "auxiliary_loss_clip": 0.01122524, "auxiliary_loss_mlp": 0.01034738, "balance_loss_clip": 1.02206063, "balance_loss_mlp": 1.03525805, "epoch": 0.9361190440402826, "flos": 20885251340160.0, "grad_norm": 6.790726588678585, "language_loss": 0.69375646, "learning_rate": 4.011502810755485e-08, "loss": 0.71532917, "num_input_tokens_seen": 335846515, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 15570, "time_per_iteration": 2.5336110591888428 }, { "auxiliary_loss_clip": 0.01111347, "auxiliary_loss_mlp": 0.0103108, "balance_loss_clip": 1.01880717, "balance_loss_mlp": 1.03299725, "epoch": 0.9361791672929506, "flos": 19317822577920.0, "grad_norm": 1.9807714254479634, "language_loss": 0.78704786, "learning_rate": 4.003977045106754e-08, "loss": 0.80847216, "num_input_tokens_seen": 335863350, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69140625, "step": 15571, "time_per_iteration": 2.531660795211792 }, { "auxiliary_loss_clip": 0.01128117, "auxiliary_loss_mlp": 0.01031101, "balance_loss_clip": 1.01853681, "balance_loss_mlp": 1.03266478, "epoch": 0.9362392905456185, "flos": 15158182032000.0, "grad_norm": 2.0240231566896174, "language_loss": 0.77567744, "learning_rate": 3.996458274051928e-08, "loss": 0.79726958, "num_input_tokens_seen": 335880510, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 15572, "time_per_iteration": 2.6030449867248535 }, { "auxiliary_loss_clip": 0.01118471, "auxiliary_loss_mlp": 0.01040456, "balance_loss_clip": 1.02831495, "balance_loss_mlp": 1.03428841, "epoch": 0.9362994137982865, "flos": 22085888371200.0, "grad_norm": 1.7162541196273426, "language_loss": 0.77933097, "learning_rate": 3.988946497859325e-08, "loss": 0.80092025, "num_input_tokens_seen": 335899440, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6640625, "step": 15573, "time_per_iteration": 2.848635196685791 }, { "auxiliary_loss_clip": 0.01125926, "auxiliary_loss_mlp": 0.01025003, "balance_loss_clip": 1.01314759, "balance_loss_mlp": 1.03341174, "epoch": 0.9363595370509544, "flos": 23512260424320.0, "grad_norm": 1.5655815421074053, "language_loss": 0.74715161, "learning_rate": 3.981441716796996e-08, "loss": 0.7686609, "num_input_tokens_seen": 335919540, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.65625, "step": 15574, "time_per_iteration": 2.624608278274536 }, { "auxiliary_loss_clip": 0.01134619, "auxiliary_loss_mlp": 0.01030765, "balance_loss_clip": 1.01865947, "balance_loss_mlp": 1.03266013, "epoch": 0.9364196603036224, "flos": 27123473139840.0, "grad_norm": 1.8886196884536937, "language_loss": 0.68064868, "learning_rate": 3.9739439311328396e-08, "loss": 0.70230258, "num_input_tokens_seen": 335939665, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6640625, "step": 15575, "time_per_iteration": 2.746001958847046 }, { "auxiliary_loss_clip": 0.01117295, "auxiliary_loss_mlp": 0.01033366, "balance_loss_clip": 1.02111089, "balance_loss_mlp": 1.03326905, "epoch": 0.9364797835562904, "flos": 18479057114880.0, "grad_norm": 1.590873174519387, "language_loss": 0.65415549, "learning_rate": 3.9664531411343516e-08, "loss": 0.6756621, "num_input_tokens_seen": 335958580, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.66015625, "step": 15576, "time_per_iteration": 2.5070905685424805 }, { "auxiliary_loss_clip": 0.01112334, "auxiliary_loss_mlp": 0.01028793, "balance_loss_clip": 1.0149647, "balance_loss_mlp": 1.03245115, "epoch": 0.9365399068089584, "flos": 13005552890880.0, "grad_norm": 2.201872415316571, "language_loss": 0.75505197, "learning_rate": 3.9589693470688967e-08, "loss": 0.77646315, "num_input_tokens_seen": 335974965, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.70703125, "step": 15577, "time_per_iteration": 2.5079762935638428 }, { "auxiliary_loss_clip": 0.01136607, "auxiliary_loss_mlp": 0.01027964, "balance_loss_clip": 1.01532197, "balance_loss_mlp": 1.03254426, "epoch": 0.9366000300616263, "flos": 25666433850240.0, "grad_norm": 2.0238935524664194, "language_loss": 0.52061152, "learning_rate": 3.95149254920355e-08, "loss": 0.54225719, "num_input_tokens_seen": 335996575, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 15578, "time_per_iteration": 2.596691131591797 }, { "auxiliary_loss_clip": 0.01145023, "auxiliary_loss_mlp": 0.01032924, "balance_loss_clip": 1.02123523, "balance_loss_mlp": 1.03332019, "epoch": 0.9366601533142943, "flos": 21433355948160.0, "grad_norm": 1.7796473357547637, "language_loss": 0.70449764, "learning_rate": 3.944022747805165e-08, "loss": 0.72627711, "num_input_tokens_seen": 336017265, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 15579, "time_per_iteration": 2.602090358734131 }, { "auxiliary_loss_clip": 0.01117972, "auxiliary_loss_mlp": 0.01025947, "balance_loss_clip": 1.01459861, "balance_loss_mlp": 1.03348255, "epoch": 0.9367202765669622, "flos": 24093222998400.0, "grad_norm": 2.8785607700921036, "language_loss": 0.7681005, "learning_rate": 3.936559943140239e-08, "loss": 0.78953969, "num_input_tokens_seen": 336035905, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6640625, "step": 15580, "time_per_iteration": 2.551494836807251 }, { "auxiliary_loss_clip": 0.01115925, "auxiliary_loss_mlp": 0.01032772, "balance_loss_clip": 1.01983213, "balance_loss_mlp": 1.03506756, "epoch": 0.9367803998196302, "flos": 20888842700160.0, "grad_norm": 1.8037393595480635, "language_loss": 0.66362524, "learning_rate": 3.929104135475203e-08, "loss": 0.68511224, "num_input_tokens_seen": 336055585, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 15581, "time_per_iteration": 2.5698673725128174 }, { "auxiliary_loss_clip": 0.01112257, "auxiliary_loss_mlp": 0.01027875, "balance_loss_clip": 1.01548266, "balance_loss_mlp": 1.03494, "epoch": 0.9368405230722983, "flos": 22564362464640.0, "grad_norm": 1.5020266730237342, "language_loss": 0.76599622, "learning_rate": 3.921655325076045e-08, "loss": 0.78739762, "num_input_tokens_seen": 336076695, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 15582, "time_per_iteration": 2.6032536029815674 }, { "auxiliary_loss_clip": 0.01154202, "auxiliary_loss_mlp": 0.01035576, "balance_loss_clip": 1.02317834, "balance_loss_mlp": 1.03347015, "epoch": 0.9369006463249662, "flos": 18880215183360.0, "grad_norm": 1.7252168889234265, "language_loss": 0.7369591, "learning_rate": 3.914213512208664e-08, "loss": 0.75885683, "num_input_tokens_seen": 336094740, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.66796875, "step": 15583, "time_per_iteration": 2.568514108657837 }, { "auxiliary_loss_clip": 0.01107453, "auxiliary_loss_mlp": 0.01033354, "balance_loss_clip": 1.0207653, "balance_loss_mlp": 1.03581238, "epoch": 0.9369607695776342, "flos": 26432516142720.0, "grad_norm": 2.2528093114279284, "language_loss": 0.84136993, "learning_rate": 3.906778697138602e-08, "loss": 0.86277795, "num_input_tokens_seen": 336113985, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 15584, "time_per_iteration": 2.806638240814209 }, { "auxiliary_loss_clip": 0.01115751, "auxiliary_loss_mlp": 0.01284718, "balance_loss_clip": 1.02450109, "balance_loss_mlp": 1.03406262, "epoch": 0.9370208928303021, "flos": 39567346081920.0, "grad_norm": 1.56693246380681, "language_loss": 0.72022521, "learning_rate": 3.89935088013118e-08, "loss": 0.74422991, "num_input_tokens_seen": 336136395, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 15585, "time_per_iteration": 2.7419593334198 }, { "auxiliary_loss_clip": 0.01124639, "auxiliary_loss_mlp": 0.01024747, "balance_loss_clip": 1.01368463, "balance_loss_mlp": 1.03150606, "epoch": 0.9370810160829701, "flos": 22963114321920.0, "grad_norm": 2.015930583342348, "language_loss": 0.8041178, "learning_rate": 3.8919300614515424e-08, "loss": 0.82561159, "num_input_tokens_seen": 336156345, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66796875, "step": 15586, "time_per_iteration": 2.7310616970062256 }, { "auxiliary_loss_clip": 0.01128633, "auxiliary_loss_mlp": 0.01034762, "balance_loss_clip": 1.02254272, "balance_loss_mlp": 1.03376114, "epoch": 0.937141139335638, "flos": 23075048079360.0, "grad_norm": 2.1172560015098263, "language_loss": 0.76498175, "learning_rate": 3.8845162413644106e-08, "loss": 0.78661567, "num_input_tokens_seen": 336176760, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 15587, "time_per_iteration": 2.6419756412506104 }, { "auxiliary_loss_clip": 0.01123615, "auxiliary_loss_mlp": 0.01031006, "balance_loss_clip": 1.01880467, "balance_loss_mlp": 1.0348475, "epoch": 0.937201262588306, "flos": 16356664247040.0, "grad_norm": 1.7893363855758977, "language_loss": 0.87380588, "learning_rate": 3.877109420134461e-08, "loss": 0.89535207, "num_input_tokens_seen": 336193285, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 15588, "time_per_iteration": 2.442866086959839 }, { "auxiliary_loss_clip": 0.0109973, "auxiliary_loss_mlp": 0.01277111, "balance_loss_clip": 1.01886976, "balance_loss_mlp": 1.03386796, "epoch": 0.937261385840974, "flos": 20194078861440.0, "grad_norm": 1.6514790278155387, "language_loss": 0.78325486, "learning_rate": 3.869709598025994e-08, "loss": 0.80702329, "num_input_tokens_seen": 336211425, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.65625, "step": 15589, "time_per_iteration": 2.5788373947143555 }, { "auxiliary_loss_clip": 0.01100734, "auxiliary_loss_mlp": 0.01033494, "balance_loss_clip": 1.02125764, "balance_loss_mlp": 1.03394938, "epoch": 0.937321509093642, "flos": 18295948558080.0, "grad_norm": 1.8258867457934698, "language_loss": 0.77983934, "learning_rate": 3.862316775303065e-08, "loss": 0.80118161, "num_input_tokens_seen": 336230205, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.66796875, "step": 15590, "time_per_iteration": 2.666250705718994 }, { "auxiliary_loss_clip": 0.01124079, "auxiliary_loss_mlp": 0.01038109, "balance_loss_clip": 1.02455473, "balance_loss_mlp": 1.03380811, "epoch": 0.9373816323463099, "flos": 25884662929920.0, "grad_norm": 1.657996026736727, "language_loss": 0.71210885, "learning_rate": 3.854930952229507e-08, "loss": 0.73373073, "num_input_tokens_seen": 336252440, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 15591, "time_per_iteration": 2.580892562866211 }, { "auxiliary_loss_clip": 0.01116281, "auxiliary_loss_mlp": 0.01280819, "balance_loss_clip": 1.01948416, "balance_loss_mlp": 1.03518748, "epoch": 0.9374417555989779, "flos": 27198849830400.0, "grad_norm": 1.9627699057628873, "language_loss": 0.53177059, "learning_rate": 3.847552129068954e-08, "loss": 0.55574155, "num_input_tokens_seen": 336273845, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.71875, "step": 15592, "time_per_iteration": 4.031423091888428 }, { "auxiliary_loss_clip": 0.01110474, "auxiliary_loss_mlp": 0.01027131, "balance_loss_clip": 1.01560938, "balance_loss_mlp": 1.03415489, "epoch": 0.9375018788516458, "flos": 23621249266560.0, "grad_norm": 1.40243703489143, "language_loss": 0.67251396, "learning_rate": 3.840180306084684e-08, "loss": 0.69389009, "num_input_tokens_seen": 336292790, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 15593, "time_per_iteration": 2.5864920616149902 }, { "auxiliary_loss_clip": 0.01118919, "auxiliary_loss_mlp": 0.01028246, "balance_loss_clip": 1.01660466, "balance_loss_mlp": 1.03236997, "epoch": 0.9375620021043138, "flos": 22678774260480.0, "grad_norm": 1.473853670587252, "language_loss": 0.74211073, "learning_rate": 3.832815483539798e-08, "loss": 0.76358241, "num_input_tokens_seen": 336312600, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 15594, "time_per_iteration": 2.676436424255371 }, { "auxiliary_loss_clip": 0.01107241, "auxiliary_loss_mlp": 0.0102998, "balance_loss_clip": 1.01857197, "balance_loss_mlp": 1.03219604, "epoch": 0.9376221253569819, "flos": 27560254521600.0, "grad_norm": 1.6728191368341598, "language_loss": 0.73705184, "learning_rate": 3.825457661697107e-08, "loss": 0.7584241, "num_input_tokens_seen": 336332770, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 15595, "time_per_iteration": 2.7906181812286377 }, { "auxiliary_loss_clip": 0.01120841, "auxiliary_loss_mlp": 0.01026583, "balance_loss_clip": 1.01479959, "balance_loss_mlp": 1.03331065, "epoch": 0.9376822486096498, "flos": 24129887806080.0, "grad_norm": 1.8000066753276047, "language_loss": 0.76184464, "learning_rate": 3.818106840819246e-08, "loss": 0.78331888, "num_input_tokens_seen": 336351445, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6953125, "step": 15596, "time_per_iteration": 2.6406540870666504 }, { "auxiliary_loss_clip": 0.01119727, "auxiliary_loss_mlp": 0.01031648, "balance_loss_clip": 1.01945853, "balance_loss_mlp": 1.03345108, "epoch": 0.9377423718623178, "flos": 22784028088320.0, "grad_norm": 1.9926314482199856, "language_loss": 0.78924263, "learning_rate": 3.8107630211684724e-08, "loss": 0.81075639, "num_input_tokens_seen": 336368690, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 15597, "time_per_iteration": 2.585796356201172 }, { "auxiliary_loss_clip": 0.01105975, "auxiliary_loss_mlp": 0.01030976, "balance_loss_clip": 1.01845288, "balance_loss_mlp": 1.03430176, "epoch": 0.9378024951149857, "flos": 19168900790400.0, "grad_norm": 1.4970896780090404, "language_loss": 0.80981189, "learning_rate": 3.803426203006932e-08, "loss": 0.83118141, "num_input_tokens_seen": 336388165, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 15598, "time_per_iteration": 2.5508151054382324 }, { "auxiliary_loss_clip": 0.01132865, "auxiliary_loss_mlp": 0.01031064, "balance_loss_clip": 1.01808167, "balance_loss_mlp": 1.03297567, "epoch": 0.9378626183676537, "flos": 23505508667520.0, "grad_norm": 2.705326761498827, "language_loss": 0.62910742, "learning_rate": 3.796096386596459e-08, "loss": 0.6507467, "num_input_tokens_seen": 336406475, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 15599, "time_per_iteration": 2.5324313640594482 }, { "auxiliary_loss_clip": 0.01130099, "auxiliary_loss_mlp": 0.01030491, "balance_loss_clip": 1.01817632, "balance_loss_mlp": 1.03399348, "epoch": 0.9379227416203216, "flos": 18405655672320.0, "grad_norm": 1.7097522304608457, "language_loss": 0.73160636, "learning_rate": 3.788773572198556e-08, "loss": 0.75321227, "num_input_tokens_seen": 336424690, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 15600, "time_per_iteration": 3.915416955947876 }, { "auxiliary_loss_clip": 0.01117544, "auxiliary_loss_mlp": 0.01030342, "balance_loss_clip": 1.01895738, "balance_loss_mlp": 1.03323054, "epoch": 0.9379828648729897, "flos": 17821855923840.0, "grad_norm": 1.7038534050223295, "language_loss": 0.69560248, "learning_rate": 3.781457760074636e-08, "loss": 0.71708137, "num_input_tokens_seen": 336443055, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6640625, "step": 15601, "time_per_iteration": 2.554647922515869 }, { "auxiliary_loss_clip": 0.01031536, "auxiliary_loss_mlp": 0.01002482, "balance_loss_clip": 1.00120091, "balance_loss_mlp": 1.00124085, "epoch": 0.9380429881256576, "flos": 68024399466240.0, "grad_norm": 0.7590214275989818, "language_loss": 0.5809468, "learning_rate": 3.774148950485756e-08, "loss": 0.60128701, "num_input_tokens_seen": 336510190, "router_z_loss_clip": 0.01281738, "router_z_loss_mlp": 0.21191406, "step": 15602, "time_per_iteration": 3.249300718307495 }, { "auxiliary_loss_clip": 0.01156432, "auxiliary_loss_mlp": 0.01031718, "balance_loss_clip": 1.02013695, "balance_loss_mlp": 1.03333724, "epoch": 0.9381031113783256, "flos": 20776980769920.0, "grad_norm": 1.9435402946515998, "language_loss": 0.71771187, "learning_rate": 3.766847143692753e-08, "loss": 0.73959339, "num_input_tokens_seen": 336529250, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.69140625, "step": 15603, "time_per_iteration": 2.582942008972168 }, { "auxiliary_loss_clip": 0.01093785, "auxiliary_loss_mlp": 0.0102514, "balance_loss_clip": 1.01416111, "balance_loss_mlp": 1.0308286, "epoch": 0.9381632346309935, "flos": 19025078734080.0, "grad_norm": 1.6021239115757995, "language_loss": 0.75908142, "learning_rate": 3.759552339956174e-08, "loss": 0.78027064, "num_input_tokens_seen": 336548530, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.62890625, "step": 15604, "time_per_iteration": 2.5666208267211914 }, { "auxiliary_loss_clip": 0.0112192, "auxiliary_loss_mlp": 0.01031459, "balance_loss_clip": 1.01860237, "balance_loss_mlp": 1.03426623, "epoch": 0.9382233578836615, "flos": 23513840622720.0, "grad_norm": 1.9484817082219301, "language_loss": 0.65414238, "learning_rate": 3.75226453953641e-08, "loss": 0.67567611, "num_input_tokens_seen": 336568510, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 15605, "time_per_iteration": 2.8397693634033203 }, { "auxiliary_loss_clip": 0.01133303, "auxiliary_loss_mlp": 0.01282735, "balance_loss_clip": 1.02279317, "balance_loss_mlp": 1.03429794, "epoch": 0.9382834811363294, "flos": 43067882016000.0, "grad_norm": 1.6977281273473661, "language_loss": 0.67693138, "learning_rate": 3.744983742693497e-08, "loss": 0.70109171, "num_input_tokens_seen": 336592020, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 15606, "time_per_iteration": 4.294109106063843 }, { "auxiliary_loss_clip": 0.01105634, "auxiliary_loss_mlp": 0.0102618, "balance_loss_clip": 1.01504028, "balance_loss_mlp": 1.03151846, "epoch": 0.9383436043889974, "flos": 16436242828800.0, "grad_norm": 1.7843636235088518, "language_loss": 0.77913344, "learning_rate": 3.7377099496872954e-08, "loss": 0.80045164, "num_input_tokens_seen": 336610010, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.65234375, "step": 15607, "time_per_iteration": 4.05420994758606 }, { "auxiliary_loss_clip": 0.01111019, "auxiliary_loss_mlp": 0.0102988, "balance_loss_clip": 1.01810789, "balance_loss_mlp": 1.03308845, "epoch": 0.9384037276416655, "flos": 20740603271040.0, "grad_norm": 2.3285551562725764, "language_loss": 0.82833141, "learning_rate": 3.7304431607773744e-08, "loss": 0.84974045, "num_input_tokens_seen": 336628520, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 15608, "time_per_iteration": 2.521122932434082 }, { "auxiliary_loss_clip": 0.01114316, "auxiliary_loss_mlp": 0.0103466, "balance_loss_clip": 1.02213657, "balance_loss_mlp": 1.03691256, "epoch": 0.9384638508943334, "flos": 19062677295360.0, "grad_norm": 2.1557896917734327, "language_loss": 0.68793964, "learning_rate": 3.723183376223082e-08, "loss": 0.70942938, "num_input_tokens_seen": 336647365, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 15609, "time_per_iteration": 2.572950839996338 }, { "auxiliary_loss_clip": 0.01130436, "auxiliary_loss_mlp": 0.01029748, "balance_loss_clip": 1.01662254, "balance_loss_mlp": 1.03278124, "epoch": 0.9385239741470014, "flos": 23404887694080.0, "grad_norm": 2.0760756301256373, "language_loss": 0.75234669, "learning_rate": 3.715930596283479e-08, "loss": 0.77394849, "num_input_tokens_seen": 336667165, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 15610, "time_per_iteration": 2.561250925064087 }, { "auxiliary_loss_clip": 0.01148573, "auxiliary_loss_mlp": 0.01029805, "balance_loss_clip": 1.01855791, "balance_loss_mlp": 1.03401554, "epoch": 0.9385840973996693, "flos": 12824742804480.0, "grad_norm": 2.027627316531777, "language_loss": 0.75420189, "learning_rate": 3.708684821217423e-08, "loss": 0.77598572, "num_input_tokens_seen": 336684130, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.69921875, "step": 15611, "time_per_iteration": 2.550976037979126 }, { "auxiliary_loss_clip": 0.01129174, "auxiliary_loss_mlp": 0.01030341, "balance_loss_clip": 1.01861119, "balance_loss_mlp": 1.03506899, "epoch": 0.9386442206523373, "flos": 15486980152320.0, "grad_norm": 2.013333856091639, "language_loss": 0.65569091, "learning_rate": 3.701446051283463e-08, "loss": 0.67728609, "num_input_tokens_seen": 336701520, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.67578125, "step": 15612, "time_per_iteration": 2.4915945529937744 }, { "auxiliary_loss_clip": 0.01108043, "auxiliary_loss_mlp": 0.01028909, "balance_loss_clip": 1.01739979, "balance_loss_mlp": 1.03274083, "epoch": 0.9387043439050052, "flos": 21178821196800.0, "grad_norm": 2.048219190584362, "language_loss": 0.56783068, "learning_rate": 3.6942142867399715e-08, "loss": 0.58920026, "num_input_tokens_seen": 336720675, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 15613, "time_per_iteration": 2.5150983333587646 }, { "auxiliary_loss_clip": 0.01119484, "auxiliary_loss_mlp": 0.01033487, "balance_loss_clip": 1.0214107, "balance_loss_mlp": 1.03265607, "epoch": 0.9387644671576733, "flos": 27668273696640.0, "grad_norm": 2.242807398200232, "language_loss": 0.70710731, "learning_rate": 3.686989527845008e-08, "loss": 0.72863698, "num_input_tokens_seen": 336741005, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69140625, "step": 15614, "time_per_iteration": 2.5435400009155273 }, { "auxiliary_loss_clip": 0.01114757, "auxiliary_loss_mlp": 0.01033128, "balance_loss_clip": 1.02005053, "balance_loss_mlp": 1.03468001, "epoch": 0.9388245904103412, "flos": 18836331742080.0, "grad_norm": 1.9697005758203197, "language_loss": 0.80972993, "learning_rate": 3.6797717748564105e-08, "loss": 0.83120877, "num_input_tokens_seen": 336757990, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 15615, "time_per_iteration": 2.490255832672119 }, { "auxiliary_loss_clip": 0.01117388, "auxiliary_loss_mlp": 0.01030004, "balance_loss_clip": 1.01818442, "balance_loss_mlp": 1.03302586, "epoch": 0.9388847136630092, "flos": 25483828083840.0, "grad_norm": 1.6490250321543487, "language_loss": 0.71755087, "learning_rate": 3.6725610280317734e-08, "loss": 0.73902476, "num_input_tokens_seen": 336777705, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66796875, "step": 15616, "time_per_iteration": 2.5882253646850586 }, { "auxiliary_loss_clip": 0.01103404, "auxiliary_loss_mlp": 0.01027376, "balance_loss_clip": 1.01550877, "balance_loss_mlp": 1.03579831, "epoch": 0.9389448369156771, "flos": 18734992496640.0, "grad_norm": 1.920571443178907, "language_loss": 0.66267288, "learning_rate": 3.6653572876284014e-08, "loss": 0.6839807, "num_input_tokens_seen": 336798275, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 15617, "time_per_iteration": 2.4935572147369385 }, { "auxiliary_loss_clip": 0.01115535, "auxiliary_loss_mlp": 0.01035908, "balance_loss_clip": 1.02258611, "balance_loss_mlp": 1.03521121, "epoch": 0.9390049601683451, "flos": 21717839664000.0, "grad_norm": 4.281532010742569, "language_loss": 0.73400533, "learning_rate": 3.6581605539034e-08, "loss": 0.75551975, "num_input_tokens_seen": 336813835, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 15618, "time_per_iteration": 2.5178778171539307 }, { "auxiliary_loss_clip": 0.01107414, "auxiliary_loss_mlp": 0.01032557, "balance_loss_clip": 1.01891923, "balance_loss_mlp": 1.0348978, "epoch": 0.939065083421013, "flos": 19391224020480.0, "grad_norm": 2.4814456457898957, "language_loss": 0.69965029, "learning_rate": 3.650970827113586e-08, "loss": 0.72105002, "num_input_tokens_seen": 336832210, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 15619, "time_per_iteration": 2.7364273071289062 }, { "auxiliary_loss_clip": 0.01122079, "auxiliary_loss_mlp": 0.01033885, "balance_loss_clip": 1.02155256, "balance_loss_mlp": 1.03458881, "epoch": 0.939125206673681, "flos": 24681511946880.0, "grad_norm": 1.8095302731214813, "language_loss": 0.77304292, "learning_rate": 3.6437881075155774e-08, "loss": 0.79460251, "num_input_tokens_seen": 336851380, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 15620, "time_per_iteration": 2.6387667655944824 }, { "auxiliary_loss_clip": 0.01106708, "auxiliary_loss_mlp": 0.01028475, "balance_loss_clip": 1.01632738, "balance_loss_mlp": 1.03212762, "epoch": 0.9391853299263491, "flos": 16325961096960.0, "grad_norm": 1.986241135662355, "language_loss": 0.73978293, "learning_rate": 3.636612395365657e-08, "loss": 0.76113474, "num_input_tokens_seen": 336868525, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.65625, "step": 15621, "time_per_iteration": 2.550051212310791 }, { "auxiliary_loss_clip": 0.01133969, "auxiliary_loss_mlp": 0.01033099, "balance_loss_clip": 1.02033758, "balance_loss_mlp": 1.03563666, "epoch": 0.939245453179017, "flos": 11655778590720.0, "grad_norm": 1.924593810791468, "language_loss": 0.66050428, "learning_rate": 3.6294436909199535e-08, "loss": 0.68217498, "num_input_tokens_seen": 336886200, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 15622, "time_per_iteration": 2.5691721439361572 }, { "auxiliary_loss_clip": 0.01118915, "auxiliary_loss_mlp": 0.01031894, "balance_loss_clip": 1.02000308, "balance_loss_mlp": 1.0337559, "epoch": 0.939305576431685, "flos": 23148700917120.0, "grad_norm": 1.5679856508868755, "language_loss": 0.7158016, "learning_rate": 3.622281994434262e-08, "loss": 0.73730969, "num_input_tokens_seen": 336905815, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 15623, "time_per_iteration": 2.5651488304138184 }, { "auxiliary_loss_clip": 0.01115435, "auxiliary_loss_mlp": 0.01030688, "balance_loss_clip": 1.01889229, "balance_loss_mlp": 1.0368309, "epoch": 0.9393656996843529, "flos": 33287790706560.0, "grad_norm": 1.8876076457814313, "language_loss": 0.72676051, "learning_rate": 3.6151273061642004e-08, "loss": 0.74822176, "num_input_tokens_seen": 336928460, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 15624, "time_per_iteration": 2.6203267574310303 }, { "auxiliary_loss_clip": 0.01126887, "auxiliary_loss_mlp": 0.01031813, "balance_loss_clip": 1.01980329, "balance_loss_mlp": 1.03316391, "epoch": 0.9394258229370209, "flos": 21689434984320.0, "grad_norm": 1.8337883918999378, "language_loss": 0.89359963, "learning_rate": 3.607979626365076e-08, "loss": 0.91518664, "num_input_tokens_seen": 336948320, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 15625, "time_per_iteration": 2.548096179962158 }, { "auxiliary_loss_clip": 0.01110062, "auxiliary_loss_mlp": 0.01032103, "balance_loss_clip": 1.01934147, "balance_loss_mlp": 1.03277981, "epoch": 0.9394859461896888, "flos": 23874203819520.0, "grad_norm": 1.619775724469894, "language_loss": 0.83386862, "learning_rate": 3.600838955291974e-08, "loss": 0.85529029, "num_input_tokens_seen": 336967670, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 15626, "time_per_iteration": 2.506322145462036 }, { "auxiliary_loss_clip": 0.01040011, "auxiliary_loss_mlp": 0.01001161, "balance_loss_clip": 0.9998377, "balance_loss_mlp": 1.00076985, "epoch": 0.9395460694423569, "flos": 61566116993280.0, "grad_norm": 0.7732970315328982, "language_loss": 0.56130421, "learning_rate": 3.5937052931997336e-08, "loss": 0.58171594, "num_input_tokens_seen": 337028395, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.2109375, "step": 15627, "time_per_iteration": 3.078273296356201 }, { "auxiliary_loss_clip": 0.01137755, "auxiliary_loss_mlp": 0.01030448, "balance_loss_clip": 1.01865232, "balance_loss_mlp": 1.03541398, "epoch": 0.9396061926950248, "flos": 20995712640000.0, "grad_norm": 1.3652285762505816, "language_loss": 0.69949174, "learning_rate": 3.58657864034293e-08, "loss": 0.72117382, "num_input_tokens_seen": 337048150, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66796875, "step": 15628, "time_per_iteration": 2.538132667541504 }, { "auxiliary_loss_clip": 0.01109983, "auxiliary_loss_mlp": 0.0102607, "balance_loss_clip": 1.01364851, "balance_loss_mlp": 1.03401363, "epoch": 0.9396663159476928, "flos": 27487786832640.0, "grad_norm": 1.6172635092596854, "language_loss": 0.75524825, "learning_rate": 3.579458996975915e-08, "loss": 0.77660871, "num_input_tokens_seen": 337069315, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.671875, "step": 15629, "time_per_iteration": 2.5881075859069824 }, { "auxiliary_loss_clip": 0.01142036, "auxiliary_loss_mlp": 0.01025613, "balance_loss_clip": 1.01434159, "balance_loss_mlp": 1.03287077, "epoch": 0.9397264392003607, "flos": 19427457864960.0, "grad_norm": 1.7057766215760075, "language_loss": 0.74439538, "learning_rate": 3.572346363352752e-08, "loss": 0.76607192, "num_input_tokens_seen": 337087765, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6484375, "step": 15630, "time_per_iteration": 2.5698623657226562 }, { "auxiliary_loss_clip": 0.01117454, "auxiliary_loss_mlp": 0.01029887, "balance_loss_clip": 1.01821589, "balance_loss_mlp": 1.03414798, "epoch": 0.9397865624530287, "flos": 18004820826240.0, "grad_norm": 1.9497382945128632, "language_loss": 0.69396031, "learning_rate": 3.56524073972726e-08, "loss": 0.71543372, "num_input_tokens_seen": 337106265, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6484375, "step": 15631, "time_per_iteration": 2.5125186443328857 }, { "auxiliary_loss_clip": 0.01135616, "auxiliary_loss_mlp": 0.01035157, "balance_loss_clip": 1.02391565, "balance_loss_mlp": 1.03299916, "epoch": 0.9398466857056966, "flos": 22564613859840.0, "grad_norm": 1.5612032307425445, "language_loss": 0.75280607, "learning_rate": 3.5581421263530364e-08, "loss": 0.77451384, "num_input_tokens_seen": 337126090, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.671875, "step": 15632, "time_per_iteration": 2.5785341262817383 }, { "auxiliary_loss_clip": 0.01117295, "auxiliary_loss_mlp": 0.01033489, "balance_loss_clip": 1.02014923, "balance_loss_mlp": 1.0367322, "epoch": 0.9399068089583646, "flos": 24535678728960.0, "grad_norm": 2.5999486792572895, "language_loss": 0.74312651, "learning_rate": 3.551050523483434e-08, "loss": 0.76463431, "num_input_tokens_seen": 337145655, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 15633, "time_per_iteration": 2.561732530593872 }, { "auxiliary_loss_clip": 0.01103724, "auxiliary_loss_mlp": 0.01035335, "balance_loss_clip": 1.02200699, "balance_loss_mlp": 1.03355753, "epoch": 0.9399669322110327, "flos": 25630343660160.0, "grad_norm": 1.7625757546786158, "language_loss": 0.72703212, "learning_rate": 3.543965931371473e-08, "loss": 0.74842274, "num_input_tokens_seen": 337164805, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 15634, "time_per_iteration": 4.0996479988098145 }, { "auxiliary_loss_clip": 0.01112615, "auxiliary_loss_mlp": 0.01027138, "balance_loss_clip": 1.01475763, "balance_loss_mlp": 1.03287959, "epoch": 0.9400270554637006, "flos": 17089385783040.0, "grad_norm": 1.61112837974681, "language_loss": 0.68928188, "learning_rate": 3.536888350270062e-08, "loss": 0.71067941, "num_input_tokens_seen": 337182280, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 15635, "time_per_iteration": 2.7177159786224365 }, { "auxiliary_loss_clip": 0.01135821, "auxiliary_loss_mlp": 0.01028841, "balance_loss_clip": 1.0168072, "balance_loss_mlp": 1.03440189, "epoch": 0.9400871787163686, "flos": 22055113393920.0, "grad_norm": 1.7148161302182143, "language_loss": 0.73747373, "learning_rate": 3.52981778043171e-08, "loss": 0.75912035, "num_input_tokens_seen": 337203495, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66015625, "step": 15636, "time_per_iteration": 2.6140425205230713 }, { "auxiliary_loss_clip": 0.01113312, "auxiliary_loss_mlp": 0.01031717, "balance_loss_clip": 1.01896131, "balance_loss_mlp": 1.03478336, "epoch": 0.9401473019690365, "flos": 16982767238400.0, "grad_norm": 2.2278516460488924, "language_loss": 0.82733512, "learning_rate": 3.522754222108815e-08, "loss": 0.8487854, "num_input_tokens_seen": 337220435, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 15637, "time_per_iteration": 2.5126841068267822 }, { "auxiliary_loss_clip": 0.01138547, "auxiliary_loss_mlp": 0.01028644, "balance_loss_clip": 1.01578152, "balance_loss_mlp": 1.03356969, "epoch": 0.9402074252217045, "flos": 19681956702720.0, "grad_norm": 3.403797981314476, "language_loss": 0.69443464, "learning_rate": 3.515697675553375e-08, "loss": 0.71610653, "num_input_tokens_seen": 337238095, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 15638, "time_per_iteration": 2.5318613052368164 }, { "auxiliary_loss_clip": 0.01106387, "auxiliary_loss_mlp": 0.01034599, "balance_loss_clip": 1.02373862, "balance_loss_mlp": 1.03315163, "epoch": 0.9402675484743724, "flos": 24754302858240.0, "grad_norm": 1.681962766500567, "language_loss": 0.84851134, "learning_rate": 3.508648141017301e-08, "loss": 0.86992115, "num_input_tokens_seen": 337256645, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.64453125, "step": 15639, "time_per_iteration": 2.4945576190948486 }, { "auxiliary_loss_clip": 0.01119029, "auxiliary_loss_mlp": 0.01272148, "balance_loss_clip": 1.01346481, "balance_loss_mlp": 1.03237581, "epoch": 0.9403276717270405, "flos": 25558630156800.0, "grad_norm": 1.483821138772063, "language_loss": 0.78137201, "learning_rate": 3.5016056187521235e-08, "loss": 0.80528378, "num_input_tokens_seen": 337278360, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 15640, "time_per_iteration": 2.6483616828918457 }, { "auxiliary_loss_clip": 0.01113171, "auxiliary_loss_mlp": 0.01033785, "balance_loss_clip": 1.02131569, "balance_loss_mlp": 1.03363299, "epoch": 0.9403877949797084, "flos": 26689852154880.0, "grad_norm": 1.5141129424149373, "language_loss": 0.73845708, "learning_rate": 3.494570109009198e-08, "loss": 0.75992656, "num_input_tokens_seen": 337302480, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.70703125, "step": 15641, "time_per_iteration": 3.9986531734466553 }, { "auxiliary_loss_clip": 0.01114842, "auxiliary_loss_mlp": 0.01032062, "balance_loss_clip": 1.0185318, "balance_loss_mlp": 1.03374577, "epoch": 0.9404479182323764, "flos": 15011666455680.0, "grad_norm": 1.8751149339895556, "language_loss": 0.82450259, "learning_rate": 3.4875416120395906e-08, "loss": 0.84597158, "num_input_tokens_seen": 337316600, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 15642, "time_per_iteration": 2.4866716861724854 }, { "auxiliary_loss_clip": 0.01108078, "auxiliary_loss_mlp": 0.01029813, "balance_loss_clip": 1.0181427, "balance_loss_mlp": 1.03220856, "epoch": 0.9405080414850443, "flos": 24973573432320.0, "grad_norm": 1.8032257117380195, "language_loss": 0.68395567, "learning_rate": 3.480520128094144e-08, "loss": 0.7053346, "num_input_tokens_seen": 337336895, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.66796875, "step": 15643, "time_per_iteration": 2.522676706314087 }, { "auxiliary_loss_clip": 0.0111816, "auxiliary_loss_mlp": 0.01036925, "balance_loss_clip": 1.02396107, "balance_loss_mlp": 1.03726816, "epoch": 0.9405681647377123, "flos": 20844743777280.0, "grad_norm": 1.8057295908480961, "language_loss": 0.76602405, "learning_rate": 3.4735056574234146e-08, "loss": 0.78757489, "num_input_tokens_seen": 337355105, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 15644, "time_per_iteration": 2.4639101028442383 }, { "auxiliary_loss_clip": 0.01108925, "auxiliary_loss_mlp": 0.01030476, "balance_loss_clip": 1.01939559, "balance_loss_mlp": 1.03396499, "epoch": 0.9406282879903802, "flos": 23805578885760.0, "grad_norm": 1.5174231980010593, "language_loss": 0.67507881, "learning_rate": 3.466498200277734e-08, "loss": 0.69647288, "num_input_tokens_seen": 337374905, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.65625, "step": 15645, "time_per_iteration": 2.470951557159424 }, { "auxiliary_loss_clip": 0.01115642, "auxiliary_loss_mlp": 0.01279246, "balance_loss_clip": 1.02078462, "balance_loss_mlp": 1.03250194, "epoch": 0.9406884112430483, "flos": 27674953626240.0, "grad_norm": 1.5045875630231045, "language_loss": 0.70287013, "learning_rate": 3.459497756907193e-08, "loss": 0.72681892, "num_input_tokens_seen": 337397130, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6484375, "step": 15646, "time_per_iteration": 2.593686819076538 }, { "auxiliary_loss_clip": 0.01108157, "auxiliary_loss_mlp": 0.01030512, "balance_loss_clip": 1.01891255, "balance_loss_mlp": 1.03274202, "epoch": 0.9407485344957163, "flos": 23075048079360.0, "grad_norm": 1.5570420453145284, "language_loss": 0.74112248, "learning_rate": 3.4525043275616114e-08, "loss": 0.76250923, "num_input_tokens_seen": 337418660, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 15647, "time_per_iteration": 2.5091304779052734 }, { "auxiliary_loss_clip": 0.01039854, "auxiliary_loss_mlp": 0.01004214, "balance_loss_clip": 1.00300443, "balance_loss_mlp": 1.00106418, "epoch": 0.9408086577483842, "flos": 71014034304000.0, "grad_norm": 0.8916922598273801, "language_loss": 0.63543451, "learning_rate": 3.445517912490592e-08, "loss": 0.65587521, "num_input_tokens_seen": 337478055, "router_z_loss_clip": 0.01208496, "router_z_loss_mlp": 0.2109375, "step": 15648, "time_per_iteration": 4.9168922901153564 }, { "auxiliary_loss_clip": 0.01098913, "auxiliary_loss_mlp": 0.01027958, "balance_loss_clip": 1.01688337, "balance_loss_mlp": 1.03349257, "epoch": 0.9408687810010522, "flos": 23730956380800.0, "grad_norm": 1.3673084317981958, "language_loss": 0.66431367, "learning_rate": 3.438538511943401e-08, "loss": 0.6855824, "num_input_tokens_seen": 337499405, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.65234375, "step": 15649, "time_per_iteration": 4.035240173339844 }, { "auxiliary_loss_clip": 0.01126869, "auxiliary_loss_mlp": 0.01029092, "balance_loss_clip": 1.01782048, "balance_loss_mlp": 1.03192282, "epoch": 0.9409289042537201, "flos": 18369314087040.0, "grad_norm": 2.1033431727747884, "language_loss": 0.77642542, "learning_rate": 3.431566126169194e-08, "loss": 0.79798508, "num_input_tokens_seen": 337517195, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.68359375, "step": 15650, "time_per_iteration": 2.544219970703125 }, { "auxiliary_loss_clip": 0.01138678, "auxiliary_loss_mlp": 0.01032172, "balance_loss_clip": 1.02012587, "balance_loss_mlp": 1.03549898, "epoch": 0.9409890275063881, "flos": 23878333883520.0, "grad_norm": 1.6909420379298419, "language_loss": 0.74273366, "learning_rate": 3.424600755416729e-08, "loss": 0.76444221, "num_input_tokens_seen": 337535245, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.67578125, "step": 15651, "time_per_iteration": 2.671168804168701 }, { "auxiliary_loss_clip": 0.01101432, "auxiliary_loss_mlp": 0.01034113, "balance_loss_clip": 1.02166176, "balance_loss_mlp": 1.0332638, "epoch": 0.941049150759056, "flos": 16545088016640.0, "grad_norm": 1.7981610648462714, "language_loss": 0.71889275, "learning_rate": 3.417642399934628e-08, "loss": 0.7402482, "num_input_tokens_seen": 337553040, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 15652, "time_per_iteration": 2.6093528270721436 }, { "auxiliary_loss_clip": 0.01135574, "auxiliary_loss_mlp": 0.01033629, "balance_loss_clip": 1.02055216, "balance_loss_mlp": 1.03522801, "epoch": 0.9411092740117241, "flos": 25118401069440.0, "grad_norm": 1.6513534633304188, "language_loss": 0.66151619, "learning_rate": 3.410691059971183e-08, "loss": 0.68320823, "num_input_tokens_seen": 337574580, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 15653, "time_per_iteration": 2.6275978088378906 }, { "auxiliary_loss_clip": 0.01102263, "auxiliary_loss_mlp": 0.01031239, "balance_loss_clip": 1.0188235, "balance_loss_mlp": 1.03336883, "epoch": 0.941169397264392, "flos": 21142264129920.0, "grad_norm": 1.8589148633568262, "language_loss": 0.77853656, "learning_rate": 3.403746735774504e-08, "loss": 0.79987156, "num_input_tokens_seen": 337593010, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 15654, "time_per_iteration": 2.500391960144043 }, { "auxiliary_loss_clip": 0.01115461, "auxiliary_loss_mlp": 0.01026256, "balance_loss_clip": 1.01389372, "balance_loss_mlp": 1.03422129, "epoch": 0.94122952051706, "flos": 38508914995200.0, "grad_norm": 1.4748864047217796, "language_loss": 0.70044684, "learning_rate": 3.396809427592373e-08, "loss": 0.72186399, "num_input_tokens_seen": 337616170, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.72265625, "step": 15655, "time_per_iteration": 2.6610305309295654 }, { "auxiliary_loss_clip": 0.01102988, "auxiliary_loss_mlp": 0.01033178, "balance_loss_clip": 1.01976132, "balance_loss_mlp": 1.03429842, "epoch": 0.9412896437697279, "flos": 18369206346240.0, "grad_norm": 1.6784027125276995, "language_loss": 0.72075164, "learning_rate": 3.3898791356724135e-08, "loss": 0.74211335, "num_input_tokens_seen": 337635215, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6875, "step": 15656, "time_per_iteration": 2.5005009174346924 }, { "auxiliary_loss_clip": 0.01119722, "auxiliary_loss_mlp": 0.01025063, "balance_loss_clip": 1.0140419, "balance_loss_mlp": 1.03371429, "epoch": 0.9413497670223959, "flos": 25884950238720.0, "grad_norm": 2.0037336000516315, "language_loss": 0.78020537, "learning_rate": 3.382955860261916e-08, "loss": 0.80165321, "num_input_tokens_seen": 337654195, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.67578125, "step": 15657, "time_per_iteration": 2.5883047580718994 }, { "auxiliary_loss_clip": 0.01126075, "auxiliary_loss_mlp": 0.01027491, "balance_loss_clip": 1.01496792, "balance_loss_mlp": 1.03092694, "epoch": 0.9414098902750638, "flos": 16618309891200.0, "grad_norm": 1.8069711371058927, "language_loss": 0.84493035, "learning_rate": 3.3760396016079716e-08, "loss": 0.86646599, "num_input_tokens_seen": 337671810, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 15658, "time_per_iteration": 2.5489184856414795 }, { "auxiliary_loss_clip": 0.01107203, "auxiliary_loss_mlp": 0.01033865, "balance_loss_clip": 1.01961398, "balance_loss_mlp": 1.03506696, "epoch": 0.9414700135277319, "flos": 18625033987200.0, "grad_norm": 2.5003161858389324, "language_loss": 0.70895088, "learning_rate": 3.369130359957406e-08, "loss": 0.73036152, "num_input_tokens_seen": 337689410, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.71875, "step": 15659, "time_per_iteration": 2.511927604675293 }, { "auxiliary_loss_clip": 0.01117124, "auxiliary_loss_mlp": 0.01038134, "balance_loss_clip": 1.02661204, "balance_loss_mlp": 1.03193212, "epoch": 0.9415301367803999, "flos": 26280146649600.0, "grad_norm": 1.735423312400196, "language_loss": 0.79471695, "learning_rate": 3.362228135556777e-08, "loss": 0.81626952, "num_input_tokens_seen": 337709950, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 15660, "time_per_iteration": 2.561269521713257 }, { "auxiliary_loss_clip": 0.01125407, "auxiliary_loss_mlp": 0.0102878, "balance_loss_clip": 1.0169661, "balance_loss_mlp": 1.03306437, "epoch": 0.9415902600330678, "flos": 23261388860160.0, "grad_norm": 1.3886840300195118, "language_loss": 0.68313932, "learning_rate": 3.355332928652399e-08, "loss": 0.70468116, "num_input_tokens_seen": 337731320, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66015625, "step": 15661, "time_per_iteration": 2.5667147636413574 }, { "auxiliary_loss_clip": 0.01109728, "auxiliary_loss_mlp": 0.01026968, "balance_loss_clip": 1.01535666, "balance_loss_mlp": 1.03278983, "epoch": 0.9416503832857358, "flos": 14719138093440.0, "grad_norm": 1.9795951347503506, "language_loss": 0.66255176, "learning_rate": 3.3484447394903414e-08, "loss": 0.68391871, "num_input_tokens_seen": 337747720, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 15662, "time_per_iteration": 2.502199411392212 }, { "auxiliary_loss_clip": 0.01039647, "auxiliary_loss_mlp": 0.009995, "balance_loss_clip": 0.9982425, "balance_loss_mlp": 1.00125337, "epoch": 0.9417105065384037, "flos": 70312698276480.0, "grad_norm": 0.7625256487811229, "language_loss": 0.59316468, "learning_rate": 3.341563568316474e-08, "loss": 0.61355615, "num_input_tokens_seen": 337806930, "router_z_loss_clip": 0.01257324, "router_z_loss_mlp": 0.21191406, "step": 15663, "time_per_iteration": 3.2176034450531006 }, { "auxiliary_loss_clip": 0.01116481, "auxiliary_loss_mlp": 0.01034821, "balance_loss_clip": 1.02140379, "balance_loss_mlp": 1.03547215, "epoch": 0.9417706297910717, "flos": 34057895322240.0, "grad_norm": 3.623269833147187, "language_loss": 0.66608441, "learning_rate": 3.334689415376335e-08, "loss": 0.68759739, "num_input_tokens_seen": 337828100, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 15664, "time_per_iteration": 2.624933958053589 }, { "auxiliary_loss_clip": 0.01119116, "auxiliary_loss_mlp": 0.01030304, "balance_loss_clip": 1.01853204, "balance_loss_mlp": 1.0340786, "epoch": 0.9418307530437396, "flos": 16471614746880.0, "grad_norm": 1.9491355326844821, "language_loss": 0.73756599, "learning_rate": 3.3278222809152376e-08, "loss": 0.75906014, "num_input_tokens_seen": 337844805, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.671875, "step": 15665, "time_per_iteration": 2.502753734588623 }, { "auxiliary_loss_clip": 0.01099984, "auxiliary_loss_mlp": 0.01029144, "balance_loss_clip": 1.0177598, "balance_loss_mlp": 1.03407073, "epoch": 0.9418908762964077, "flos": 11253543114240.0, "grad_norm": 1.8684147720906605, "language_loss": 0.63979369, "learning_rate": 3.3209621651782535e-08, "loss": 0.66108495, "num_input_tokens_seen": 337860490, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.66015625, "step": 15666, "time_per_iteration": 2.4351253509521484 }, { "auxiliary_loss_clip": 0.01129211, "auxiliary_loss_mlp": 0.01035173, "balance_loss_clip": 1.02266228, "balance_loss_mlp": 1.03354096, "epoch": 0.9419509995490756, "flos": 18438836860800.0, "grad_norm": 1.968420669410123, "language_loss": 0.78600293, "learning_rate": 3.3141090684102534e-08, "loss": 0.80764675, "num_input_tokens_seen": 337878360, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 15667, "time_per_iteration": 2.4875376224517822 }, { "auxiliary_loss_clip": 0.01100071, "auxiliary_loss_mlp": 0.01028609, "balance_loss_clip": 1.01742172, "balance_loss_mlp": 1.03330147, "epoch": 0.9420111228017436, "flos": 20737945664640.0, "grad_norm": 1.6151646745301422, "language_loss": 0.74948299, "learning_rate": 3.3072629908557085e-08, "loss": 0.77076983, "num_input_tokens_seen": 337895635, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.66796875, "step": 15668, "time_per_iteration": 2.485210418701172 }, { "auxiliary_loss_clip": 0.0114093, "auxiliary_loss_mlp": 0.01030933, "balance_loss_clip": 1.01851177, "balance_loss_mlp": 1.03447747, "epoch": 0.9420712460544115, "flos": 21141940907520.0, "grad_norm": 2.152335634386854, "language_loss": 0.59104645, "learning_rate": 3.300423932759022e-08, "loss": 0.61276507, "num_input_tokens_seen": 337913940, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 15669, "time_per_iteration": 2.515146017074585 }, { "auxiliary_loss_clip": 0.01123943, "auxiliary_loss_mlp": 0.01026486, "balance_loss_clip": 1.01514935, "balance_loss_mlp": 1.03193092, "epoch": 0.9421313693070795, "flos": 15851760721920.0, "grad_norm": 1.53027497527781, "language_loss": 0.76758486, "learning_rate": 3.293591894364223e-08, "loss": 0.78908914, "num_input_tokens_seen": 337932015, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.65234375, "step": 15670, "time_per_iteration": 2.5641419887542725 }, { "auxiliary_loss_clip": 0.01102779, "auxiliary_loss_mlp": 0.01037353, "balance_loss_clip": 1.02482414, "balance_loss_mlp": 1.03467953, "epoch": 0.9421914925597474, "flos": 32415915882240.0, "grad_norm": 4.124340116965809, "language_loss": 0.65297329, "learning_rate": 3.286766875915159e-08, "loss": 0.6743747, "num_input_tokens_seen": 337953345, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 15671, "time_per_iteration": 2.6256895065307617 }, { "auxiliary_loss_clip": 0.01119124, "auxiliary_loss_mlp": 0.01030762, "balance_loss_clip": 1.01830423, "balance_loss_mlp": 1.03350687, "epoch": 0.9422516158124155, "flos": 19718513769600.0, "grad_norm": 1.6998230825254632, "language_loss": 0.79351532, "learning_rate": 3.2799488776553696e-08, "loss": 0.81501424, "num_input_tokens_seen": 337973685, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 15672, "time_per_iteration": 2.4989943504333496 }, { "auxiliary_loss_clip": 0.01133199, "auxiliary_loss_mlp": 0.01035923, "balance_loss_clip": 1.02241063, "balance_loss_mlp": 1.03415203, "epoch": 0.9423117390650835, "flos": 16253277926400.0, "grad_norm": 2.0155910537353217, "language_loss": 0.73567021, "learning_rate": 3.273137899828171e-08, "loss": 0.75736141, "num_input_tokens_seen": 337989175, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 15673, "time_per_iteration": 2.5095643997192383 }, { "auxiliary_loss_clip": 0.01110048, "auxiliary_loss_mlp": 0.01029321, "balance_loss_clip": 1.01705444, "balance_loss_mlp": 1.0332756, "epoch": 0.9423718623177514, "flos": 31796564647680.0, "grad_norm": 1.5141794544423866, "language_loss": 0.701087, "learning_rate": 3.266333942676658e-08, "loss": 0.72248077, "num_input_tokens_seen": 338011800, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 15674, "time_per_iteration": 2.574164628982544 }, { "auxiliary_loss_clip": 0.01101205, "auxiliary_loss_mlp": 0.01024702, "balance_loss_clip": 1.01291764, "balance_loss_mlp": 1.03254664, "epoch": 0.9424319855704194, "flos": 23331809473920.0, "grad_norm": 2.026820411761694, "language_loss": 0.81145185, "learning_rate": 3.259537006443613e-08, "loss": 0.83271092, "num_input_tokens_seen": 338032120, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 15675, "time_per_iteration": 3.924386739730835 }, { "auxiliary_loss_clip": 0.01120446, "auxiliary_loss_mlp": 0.01026233, "balance_loss_clip": 1.01463342, "balance_loss_mlp": 1.03397608, "epoch": 0.9424921088230873, "flos": 20777627214720.0, "grad_norm": 1.6442816133879, "language_loss": 0.62483883, "learning_rate": 3.252747091371621e-08, "loss": 0.64630562, "num_input_tokens_seen": 338051880, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 15676, "time_per_iteration": 2.536393642425537 }, { "auxiliary_loss_clip": 0.01119326, "auxiliary_loss_mlp": 0.01032693, "balance_loss_clip": 1.02071285, "balance_loss_mlp": 1.03311229, "epoch": 0.9425522320757553, "flos": 19026658932480.0, "grad_norm": 1.70254180934503, "language_loss": 0.67171681, "learning_rate": 3.245964197702977e-08, "loss": 0.69323701, "num_input_tokens_seen": 338069665, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 15677, "time_per_iteration": 2.5604331493377686 }, { "auxiliary_loss_clip": 0.01129684, "auxiliary_loss_mlp": 0.01033968, "balance_loss_clip": 1.020509, "balance_loss_mlp": 1.03248501, "epoch": 0.9426123553284232, "flos": 25155353185920.0, "grad_norm": 4.426307185625788, "language_loss": 0.63991392, "learning_rate": 3.239188325679776e-08, "loss": 0.6615504, "num_input_tokens_seen": 338090490, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 15678, "time_per_iteration": 2.572927474975586 }, { "auxiliary_loss_clip": 0.01128276, "auxiliary_loss_mlp": 0.01027001, "balance_loss_clip": 1.01457906, "balance_loss_mlp": 1.0339781, "epoch": 0.9426724785810913, "flos": 21179359900800.0, "grad_norm": 1.9631793460224423, "language_loss": 0.74242055, "learning_rate": 3.2324194755438015e-08, "loss": 0.76397336, "num_input_tokens_seen": 338109825, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 15679, "time_per_iteration": 2.7509713172912598 }, { "auxiliary_loss_clip": 0.01123832, "auxiliary_loss_mlp": 0.01034009, "balance_loss_clip": 1.02133679, "balance_loss_mlp": 1.03512251, "epoch": 0.9427326018337592, "flos": 14756916222720.0, "grad_norm": 2.186634399357939, "language_loss": 0.77742851, "learning_rate": 3.225657647536639e-08, "loss": 0.79900694, "num_input_tokens_seen": 338125790, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 15680, "time_per_iteration": 2.511845588684082 }, { "auxiliary_loss_clip": 0.01109633, "auxiliary_loss_mlp": 0.01034109, "balance_loss_clip": 1.02162755, "balance_loss_mlp": 1.03305137, "epoch": 0.9427927250864272, "flos": 20923640000640.0, "grad_norm": 2.079662716832652, "language_loss": 0.75574887, "learning_rate": 3.2189028418995846e-08, "loss": 0.77718627, "num_input_tokens_seen": 338145610, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 15681, "time_per_iteration": 2.5570008754730225 }, { "auxiliary_loss_clip": 0.0112342, "auxiliary_loss_mlp": 0.01034752, "balance_loss_clip": 1.02167451, "balance_loss_mlp": 1.03512073, "epoch": 0.9428528483390951, "flos": 19752520970880.0, "grad_norm": 2.111813404229611, "language_loss": 0.65309024, "learning_rate": 3.212155058873711e-08, "loss": 0.67467195, "num_input_tokens_seen": 338165960, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 15682, "time_per_iteration": 2.619337797164917 }, { "auxiliary_loss_clip": 0.01126023, "auxiliary_loss_mlp": 0.01027165, "balance_loss_clip": 1.01511323, "balance_loss_mlp": 1.03225315, "epoch": 0.9429129715917631, "flos": 24534996370560.0, "grad_norm": 1.7215124650969051, "language_loss": 0.76737297, "learning_rate": 3.2054142986998044e-08, "loss": 0.78890479, "num_input_tokens_seen": 338187215, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 15683, "time_per_iteration": 4.0244410037994385 }, { "auxiliary_loss_clip": 0.0111531, "auxiliary_loss_mlp": 0.0104018, "balance_loss_clip": 1.02689385, "balance_loss_mlp": 1.03521562, "epoch": 0.942973094844431, "flos": 17959824063360.0, "grad_norm": 2.608668939097437, "language_loss": 0.75586253, "learning_rate": 3.198680561618472e-08, "loss": 0.77741736, "num_input_tokens_seen": 338201825, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 15684, "time_per_iteration": 2.4318506717681885 }, { "auxiliary_loss_clip": 0.01098657, "auxiliary_loss_mlp": 0.01027005, "balance_loss_clip": 1.01571035, "balance_loss_mlp": 1.03292918, "epoch": 0.9430332180970991, "flos": 26137689310080.0, "grad_norm": 1.7611051095105883, "language_loss": 0.77241707, "learning_rate": 3.191953847869988e-08, "loss": 0.79367369, "num_input_tokens_seen": 338220865, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.65625, "step": 15685, "time_per_iteration": 2.744739294052124 }, { "auxiliary_loss_clip": 0.01107825, "auxiliary_loss_mlp": 0.01029129, "balance_loss_clip": 1.01786327, "balance_loss_mlp": 1.03329539, "epoch": 0.943093341349767, "flos": 23951376190080.0, "grad_norm": 1.7506503862465956, "language_loss": 0.7517885, "learning_rate": 3.1852341576944275e-08, "loss": 0.77315807, "num_input_tokens_seen": 338240160, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.66015625, "step": 15686, "time_per_iteration": 2.5282864570617676 }, { "auxiliary_loss_clip": 0.01112366, "auxiliary_loss_mlp": 0.01028016, "balance_loss_clip": 1.01520157, "balance_loss_mlp": 1.03344142, "epoch": 0.943153464602435, "flos": 17968407413760.0, "grad_norm": 1.8988460698525143, "language_loss": 0.80748558, "learning_rate": 3.178521491331576e-08, "loss": 0.82888937, "num_input_tokens_seen": 338259305, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 15687, "time_per_iteration": 2.7893428802490234 }, { "auxiliary_loss_clip": 0.01124876, "auxiliary_loss_mlp": 0.01036138, "balance_loss_clip": 1.02219677, "balance_loss_mlp": 1.03550828, "epoch": 0.943213587855103, "flos": 14501519544960.0, "grad_norm": 3.043954543975062, "language_loss": 0.76392221, "learning_rate": 3.17181584902102e-08, "loss": 0.78553236, "num_input_tokens_seen": 338274950, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 15688, "time_per_iteration": 2.530816078186035 }, { "auxiliary_loss_clip": 0.01114476, "auxiliary_loss_mlp": 0.01025125, "balance_loss_clip": 1.01292419, "balance_loss_mlp": 1.03180552, "epoch": 0.9432737111077709, "flos": 28986411093120.0, "grad_norm": 1.598623399389374, "language_loss": 0.68455267, "learning_rate": 3.165117231002057e-08, "loss": 0.70594871, "num_input_tokens_seen": 338295585, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.64453125, "step": 15689, "time_per_iteration": 2.6186933517456055 }, { "auxiliary_loss_clip": 0.01110301, "auxiliary_loss_mlp": 0.01034336, "balance_loss_clip": 1.02370834, "balance_loss_mlp": 1.03398967, "epoch": 0.9433338343604389, "flos": 21609066303360.0, "grad_norm": 2.369466152697433, "language_loss": 0.80403495, "learning_rate": 3.158425637513718e-08, "loss": 0.82548136, "num_input_tokens_seen": 338314555, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.671875, "step": 15690, "time_per_iteration": 3.927398920059204 }, { "auxiliary_loss_clip": 0.01110809, "auxiliary_loss_mlp": 0.01027835, "balance_loss_clip": 1.01581264, "balance_loss_mlp": 1.03468049, "epoch": 0.9433939576131068, "flos": 33182285483520.0, "grad_norm": 1.8251091716145718, "language_loss": 0.60293221, "learning_rate": 3.1517410687948555e-08, "loss": 0.62431866, "num_input_tokens_seen": 338336260, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 15691, "time_per_iteration": 4.199806451797485 }, { "auxiliary_loss_clip": 0.01132813, "auxiliary_loss_mlp": 0.01028699, "balance_loss_clip": 1.01665282, "balance_loss_mlp": 1.03630567, "epoch": 0.9434540808657749, "flos": 20486391742080.0, "grad_norm": 1.5451337837904242, "language_loss": 0.6648401, "learning_rate": 3.145063525083991e-08, "loss": 0.68645525, "num_input_tokens_seen": 338354680, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6953125, "step": 15692, "time_per_iteration": 2.6049909591674805 }, { "auxiliary_loss_clip": 0.01113149, "auxiliary_loss_mlp": 0.01031556, "balance_loss_clip": 1.01964664, "balance_loss_mlp": 1.03474569, "epoch": 0.9435142041184428, "flos": 21542955321600.0, "grad_norm": 2.5707132347759587, "language_loss": 0.75162256, "learning_rate": 3.138393006619444e-08, "loss": 0.77306962, "num_input_tokens_seen": 338372490, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 15693, "time_per_iteration": 2.59135365486145 }, { "auxiliary_loss_clip": 0.01129224, "auxiliary_loss_mlp": 0.01031538, "balance_loss_clip": 1.01942658, "balance_loss_mlp": 1.03458166, "epoch": 0.9435743273711108, "flos": 25009089004800.0, "grad_norm": 1.7816882071361928, "language_loss": 0.73048002, "learning_rate": 3.1317295136392474e-08, "loss": 0.75208765, "num_input_tokens_seen": 338390870, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 15694, "time_per_iteration": 2.61012864112854 }, { "auxiliary_loss_clip": 0.01049677, "auxiliary_loss_mlp": 0.0100206, "balance_loss_clip": 1.00072527, "balance_loss_mlp": 1.00085926, "epoch": 0.9436344506237787, "flos": 60158707320960.0, "grad_norm": 0.6885640464633233, "language_loss": 0.50585717, "learning_rate": 3.1250730463812325e-08, "loss": 0.52637452, "num_input_tokens_seen": 338453075, "router_z_loss_clip": 0.0133667, "router_z_loss_mlp": 0.21191406, "step": 15695, "time_per_iteration": 3.288020133972168 }, { "auxiliary_loss_clip": 0.01115949, "auxiliary_loss_mlp": 0.01034711, "balance_loss_clip": 1.02326107, "balance_loss_mlp": 1.03284585, "epoch": 0.9436945738764467, "flos": 22237252283520.0, "grad_norm": 1.8429690747438698, "language_loss": 0.65251738, "learning_rate": 3.11842360508292e-08, "loss": 0.67402399, "num_input_tokens_seen": 338471770, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6484375, "step": 15696, "time_per_iteration": 2.628606081008911 }, { "auxiliary_loss_clip": 0.01112277, "auxiliary_loss_mlp": 0.01027607, "balance_loss_clip": 1.01600802, "balance_loss_mlp": 1.03492689, "epoch": 0.9437546971291146, "flos": 25045179194880.0, "grad_norm": 1.5689193795205632, "language_loss": 0.66045773, "learning_rate": 3.111781189981655e-08, "loss": 0.68185651, "num_input_tokens_seen": 338492190, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 15697, "time_per_iteration": 2.588313579559326 }, { "auxiliary_loss_clip": 0.01048579, "auxiliary_loss_mlp": 0.01000504, "balance_loss_clip": 0.99922276, "balance_loss_mlp": 1.00089478, "epoch": 0.9438148203817827, "flos": 56790788400000.0, "grad_norm": 0.9657684803453072, "language_loss": 0.63218862, "learning_rate": 3.105145801314446e-08, "loss": 0.65267944, "num_input_tokens_seen": 338552560, "router_z_loss_clip": 0.01281738, "router_z_loss_mlp": 0.21289062, "step": 15698, "time_per_iteration": 3.446979522705078 }, { "auxiliary_loss_clip": 0.01122809, "auxiliary_loss_mlp": 0.01036549, "balance_loss_clip": 1.02361465, "balance_loss_mlp": 1.03387809, "epoch": 0.9438749436344506, "flos": 22346384780160.0, "grad_norm": 1.6957956296867016, "language_loss": 0.69660634, "learning_rate": 3.0985174393181046e-08, "loss": 0.71819997, "num_input_tokens_seen": 338571770, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 15699, "time_per_iteration": 3.2565877437591553 }, { "auxiliary_loss_clip": 0.01124654, "auxiliary_loss_mlp": 0.01031084, "balance_loss_clip": 1.01958585, "balance_loss_mlp": 1.03231919, "epoch": 0.9439350668871186, "flos": 13370800337280.0, "grad_norm": 1.9694568609337828, "language_loss": 0.74248326, "learning_rate": 3.091896104229197e-08, "loss": 0.76404059, "num_input_tokens_seen": 338587310, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.65625, "step": 15700, "time_per_iteration": 3.0944740772247314 }, { "auxiliary_loss_clip": 0.01155214, "auxiliary_loss_mlp": 0.01032766, "balance_loss_clip": 1.02138758, "balance_loss_mlp": 1.03485203, "epoch": 0.9439951901397866, "flos": 17785334770560.0, "grad_norm": 1.8553666203620254, "language_loss": 0.70737147, "learning_rate": 3.085281796284023e-08, "loss": 0.72925127, "num_input_tokens_seen": 338606235, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 15701, "time_per_iteration": 3.3278584480285645 }, { "auxiliary_loss_clip": 0.01140864, "auxiliary_loss_mlp": 0.01028469, "balance_loss_clip": 1.0160656, "balance_loss_mlp": 1.03266358, "epoch": 0.9440553133924545, "flos": 18879568738560.0, "grad_norm": 2.1896294011653947, "language_loss": 0.77669901, "learning_rate": 3.0786745157185715e-08, "loss": 0.7983923, "num_input_tokens_seen": 338624090, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7265625, "step": 15702, "time_per_iteration": 3.227137565612793 }, { "auxiliary_loss_clip": 0.01112876, "auxiliary_loss_mlp": 0.0103596, "balance_loss_clip": 1.02396166, "balance_loss_mlp": 1.03418696, "epoch": 0.9441154366451225, "flos": 19572967860480.0, "grad_norm": 1.6717555655814529, "language_loss": 0.66813189, "learning_rate": 3.07207426276872e-08, "loss": 0.68962026, "num_input_tokens_seen": 338643695, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 15703, "time_per_iteration": 3.179727554321289 }, { "auxiliary_loss_clip": 0.0111468, "auxiliary_loss_mlp": 0.01031137, "balance_loss_clip": 1.01820862, "balance_loss_mlp": 1.03535748, "epoch": 0.9441755598977905, "flos": 30294995472000.0, "grad_norm": 1.5686620365555632, "language_loss": 0.73141158, "learning_rate": 3.065481037669992e-08, "loss": 0.75286973, "num_input_tokens_seen": 338664725, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 15704, "time_per_iteration": 3.3532371520996094 }, { "auxiliary_loss_clip": 0.01109026, "auxiliary_loss_mlp": 0.01032931, "balance_loss_clip": 1.0205214, "balance_loss_mlp": 1.03353894, "epoch": 0.9442356831504585, "flos": 20667884186880.0, "grad_norm": 1.7547630207257454, "language_loss": 0.74363852, "learning_rate": 3.058894840657622e-08, "loss": 0.76505816, "num_input_tokens_seen": 338683990, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.66796875, "step": 15705, "time_per_iteration": 3.1263837814331055 }, { "auxiliary_loss_clip": 0.01120416, "auxiliary_loss_mlp": 0.01031303, "balance_loss_clip": 1.0206399, "balance_loss_mlp": 1.03483987, "epoch": 0.9442958064031264, "flos": 16107265140480.0, "grad_norm": 1.7853128992146656, "language_loss": 0.77003789, "learning_rate": 3.052315671966732e-08, "loss": 0.79155511, "num_input_tokens_seen": 338702025, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.6796875, "step": 15706, "time_per_iteration": 3.206538200378418 }, { "auxiliary_loss_clip": 0.01107499, "auxiliary_loss_mlp": 0.0102714, "balance_loss_clip": 1.01589894, "balance_loss_mlp": 1.03281093, "epoch": 0.9443559296557944, "flos": 20447392550400.0, "grad_norm": 1.614796968883787, "language_loss": 0.6912021, "learning_rate": 3.045743531832068e-08, "loss": 0.71254849, "num_input_tokens_seen": 338720920, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.65625, "step": 15707, "time_per_iteration": 3.2168147563934326 }, { "auxiliary_loss_clip": 0.01094675, "auxiliary_loss_mlp": 0.0102706, "balance_loss_clip": 1.01622951, "balance_loss_mlp": 1.030424, "epoch": 0.9444160529084623, "flos": 21610897896960.0, "grad_norm": 1.8542966975386403, "language_loss": 0.69499087, "learning_rate": 3.039178420488242e-08, "loss": 0.71620822, "num_input_tokens_seen": 338739590, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.64453125, "step": 15708, "time_per_iteration": 3.5723235607147217 }, { "auxiliary_loss_clip": 0.01109762, "auxiliary_loss_mlp": 0.01029854, "balance_loss_clip": 1.01771903, "balance_loss_mlp": 1.03420019, "epoch": 0.9444761761611303, "flos": 18441781776000.0, "grad_norm": 2.5531796481647553, "language_loss": 0.70513129, "learning_rate": 3.032620338169445e-08, "loss": 0.72652745, "num_input_tokens_seen": 338757240, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6640625, "step": 15709, "time_per_iteration": 3.293264865875244 }, { "auxiliary_loss_clip": 0.01136807, "auxiliary_loss_mlp": 0.01030548, "balance_loss_clip": 1.01853156, "balance_loss_mlp": 1.03476667, "epoch": 0.9445362994137982, "flos": 20957144411520.0, "grad_norm": 3.508386811676818, "language_loss": 0.84907162, "learning_rate": 3.026069285109778e-08, "loss": 0.87074518, "num_input_tokens_seen": 338773750, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6640625, "step": 15710, "time_per_iteration": 3.3679134845733643 }, { "auxiliary_loss_clip": 0.01132779, "auxiliary_loss_mlp": 0.01034514, "balance_loss_clip": 1.02170515, "balance_loss_mlp": 1.03526545, "epoch": 0.9445964226664663, "flos": 20303283185280.0, "grad_norm": 1.5913580173656403, "language_loss": 0.71524787, "learning_rate": 3.01952526154301e-08, "loss": 0.73692083, "num_input_tokens_seen": 338792115, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 15711, "time_per_iteration": 3.295015573501587 }, { "auxiliary_loss_clip": 0.01112267, "auxiliary_loss_mlp": 0.01027848, "balance_loss_clip": 1.01602268, "balance_loss_mlp": 1.03541696, "epoch": 0.9446565459191342, "flos": 26396030903040.0, "grad_norm": 1.5807455018641636, "language_loss": 0.69208086, "learning_rate": 3.0129882677027096e-08, "loss": 0.71348202, "num_input_tokens_seen": 338812480, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 15712, "time_per_iteration": 3.638665199279785 }, { "auxiliary_loss_clip": 0.01115789, "auxiliary_loss_mlp": 0.01031465, "balance_loss_clip": 1.01813138, "balance_loss_mlp": 1.0360682, "epoch": 0.9447166691718022, "flos": 16544764794240.0, "grad_norm": 2.0806300537703524, "language_loss": 0.70580113, "learning_rate": 3.006458303822135e-08, "loss": 0.7272737, "num_input_tokens_seen": 338829105, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 15713, "time_per_iteration": 3.350412368774414 }, { "auxiliary_loss_clip": 0.01116715, "auxiliary_loss_mlp": 0.01032806, "balance_loss_clip": 1.02016425, "balance_loss_mlp": 1.03277409, "epoch": 0.9447767924244702, "flos": 29164635400320.0, "grad_norm": 1.5406807301190795, "language_loss": 0.76584959, "learning_rate": 2.999935370134343e-08, "loss": 0.78734481, "num_input_tokens_seen": 338850670, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.66015625, "step": 15714, "time_per_iteration": 3.2342121601104736 }, { "auxiliary_loss_clip": 0.01129494, "auxiliary_loss_mlp": 0.01033495, "balance_loss_clip": 1.02077508, "balance_loss_mlp": 1.03311539, "epoch": 0.9448369156771381, "flos": 19274908803840.0, "grad_norm": 1.8420741029602867, "language_loss": 0.67643249, "learning_rate": 2.993419466872105e-08, "loss": 0.69806242, "num_input_tokens_seen": 338867795, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 15715, "time_per_iteration": 3.3012423515319824 }, { "auxiliary_loss_clip": 0.01130273, "auxiliary_loss_mlp": 0.01031996, "balance_loss_clip": 1.01925278, "balance_loss_mlp": 1.03427458, "epoch": 0.9448970389298061, "flos": 23841166285440.0, "grad_norm": 1.7296069810211712, "language_loss": 0.74718571, "learning_rate": 2.9869105942679886e-08, "loss": 0.76880842, "num_input_tokens_seen": 338887205, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 15716, "time_per_iteration": 3.4412107467651367 }, { "auxiliary_loss_clip": 0.01124093, "auxiliary_loss_mlp": 0.01033001, "balance_loss_clip": 1.02043664, "balance_loss_mlp": 1.03731668, "epoch": 0.944957162182474, "flos": 22382259488640.0, "grad_norm": 2.8250798813584477, "language_loss": 0.62563694, "learning_rate": 2.9804087525542086e-08, "loss": 0.64720786, "num_input_tokens_seen": 338906130, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 15717, "time_per_iteration": 4.786174297332764 }, { "auxiliary_loss_clip": 0.01111526, "auxiliary_loss_mlp": 0.01030703, "balance_loss_clip": 1.0185256, "balance_loss_mlp": 1.03415108, "epoch": 0.9450172854351421, "flos": 17383889393280.0, "grad_norm": 1.6270486020613717, "language_loss": 0.79419136, "learning_rate": 2.9739139419629132e-08, "loss": 0.81561363, "num_input_tokens_seen": 338923045, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 15718, "time_per_iteration": 3.1163218021392822 }, { "auxiliary_loss_clip": 0.01030588, "auxiliary_loss_mlp": 0.01001636, "balance_loss_clip": 1.00047994, "balance_loss_mlp": 1.00062346, "epoch": 0.94507740868781, "flos": 68466352406400.0, "grad_norm": 0.6988151632924824, "language_loss": 0.57766914, "learning_rate": 2.9674261627257612e-08, "loss": 0.59799135, "num_input_tokens_seen": 338987545, "router_z_loss_clip": 0.01153564, "router_z_loss_mlp": 0.2109375, "step": 15719, "time_per_iteration": 3.7192890644073486 }, { "auxiliary_loss_clip": 0.01139349, "auxiliary_loss_mlp": 0.01277003, "balance_loss_clip": 1.0181365, "balance_loss_mlp": 1.03456414, "epoch": 0.945137531940478, "flos": 21142479611520.0, "grad_norm": 1.580091752195843, "language_loss": 0.75790334, "learning_rate": 2.9609454150743452e-08, "loss": 0.78206688, "num_input_tokens_seen": 339007830, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 15720, "time_per_iteration": 3.339484453201294 }, { "auxiliary_loss_clip": 0.01108521, "auxiliary_loss_mlp": 0.01026545, "balance_loss_clip": 1.0155127, "balance_loss_mlp": 1.03161538, "epoch": 0.9451976551931459, "flos": 24533918962560.0, "grad_norm": 1.8996599892396848, "language_loss": 0.72818232, "learning_rate": 2.954471699239991e-08, "loss": 0.74953294, "num_input_tokens_seen": 339028980, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6796875, "step": 15721, "time_per_iteration": 2.8495030403137207 }, { "auxiliary_loss_clip": 0.01138079, "auxiliary_loss_mlp": 0.01034412, "balance_loss_clip": 1.02187145, "balance_loss_mlp": 1.03402042, "epoch": 0.9452577784458139, "flos": 23440582834560.0, "grad_norm": 1.743110640338594, "language_loss": 0.85079628, "learning_rate": 2.948005015453625e-08, "loss": 0.87252116, "num_input_tokens_seen": 339047950, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 15722, "time_per_iteration": 2.6131417751312256 }, { "auxiliary_loss_clip": 0.01038976, "auxiliary_loss_mlp": 0.01001126, "balance_loss_clip": 0.99989241, "balance_loss_mlp": 1.00090122, "epoch": 0.9453179016984818, "flos": 52017686449920.0, "grad_norm": 0.8974719535704526, "language_loss": 0.64537948, "learning_rate": 2.9415453639461073e-08, "loss": 0.66578048, "num_input_tokens_seen": 339104535, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.21191406, "step": 15723, "time_per_iteration": 3.179353952407837 }, { "auxiliary_loss_clip": 0.01132486, "auxiliary_loss_mlp": 0.01028749, "balance_loss_clip": 1.01630926, "balance_loss_mlp": 1.03508306, "epoch": 0.9453780249511499, "flos": 13473001509120.0, "grad_norm": 3.1993139885756148, "language_loss": 0.73094374, "learning_rate": 2.93509274494792e-08, "loss": 0.75255609, "num_input_tokens_seen": 339122050, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 15724, "time_per_iteration": 3.9293735027313232 }, { "auxiliary_loss_clip": 0.01128701, "auxiliary_loss_mlp": 0.01026831, "balance_loss_clip": 1.01471305, "balance_loss_mlp": 1.03387594, "epoch": 0.9454381482038178, "flos": 23258515772160.0, "grad_norm": 3.106958419436495, "language_loss": 0.84866786, "learning_rate": 2.9286471586893902e-08, "loss": 0.87022316, "num_input_tokens_seen": 339138940, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 15725, "time_per_iteration": 2.6508145332336426 }, { "auxiliary_loss_clip": 0.01121064, "auxiliary_loss_mlp": 0.01029009, "balance_loss_clip": 1.01658773, "balance_loss_mlp": 1.03441048, "epoch": 0.9454982714564858, "flos": 19496621502720.0, "grad_norm": 2.0471605267690935, "language_loss": 0.70857608, "learning_rate": 2.922208605400489e-08, "loss": 0.73007685, "num_input_tokens_seen": 339158245, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 15726, "time_per_iteration": 2.623349905014038 }, { "auxiliary_loss_clip": 0.01120703, "auxiliary_loss_mlp": 0.01031461, "balance_loss_clip": 1.01871753, "balance_loss_mlp": 1.03408337, "epoch": 0.9455583947091538, "flos": 23258120722560.0, "grad_norm": 1.8507280227618839, "language_loss": 0.61478448, "learning_rate": 2.915777085311033e-08, "loss": 0.63630611, "num_input_tokens_seen": 339178200, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 15727, "time_per_iteration": 2.669194459915161 }, { "auxiliary_loss_clip": 0.01106755, "auxiliary_loss_mlp": 0.01033555, "balance_loss_clip": 1.02194369, "balance_loss_mlp": 1.03199494, "epoch": 0.9456185179618217, "flos": 17673041877120.0, "grad_norm": 1.6832681573381065, "language_loss": 0.81363118, "learning_rate": 2.9093525986505273e-08, "loss": 0.83503425, "num_input_tokens_seen": 339193950, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.65625, "step": 15728, "time_per_iteration": 2.5262649059295654 }, { "auxiliary_loss_clip": 0.01066586, "auxiliary_loss_mlp": 0.01002133, "balance_loss_clip": 1.00091672, "balance_loss_mlp": 1.00067997, "epoch": 0.9456786412144897, "flos": 61415040389760.0, "grad_norm": 0.736336638444505, "language_loss": 0.59085429, "learning_rate": 2.9029351456482553e-08, "loss": 0.61154139, "num_input_tokens_seen": 339252330, "router_z_loss_clip": 0.012146, "router_z_loss_mlp": 0.21191406, "step": 15729, "time_per_iteration": 3.16386079788208 }, { "auxiliary_loss_clip": 0.01100929, "auxiliary_loss_mlp": 0.01023847, "balance_loss_clip": 1.0112046, "balance_loss_mlp": 1.03312063, "epoch": 0.9457387644671577, "flos": 18588369179520.0, "grad_norm": 1.9348822911871422, "language_loss": 0.76459432, "learning_rate": 2.8965247265332337e-08, "loss": 0.78584206, "num_input_tokens_seen": 339270325, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 15730, "time_per_iteration": 2.5123181343078613 }, { "auxiliary_loss_clip": 0.01128505, "auxiliary_loss_mlp": 0.01029734, "balance_loss_clip": 1.01677632, "balance_loss_mlp": 1.03304434, "epoch": 0.9457988877198257, "flos": 21108544237440.0, "grad_norm": 1.3937584185178868, "language_loss": 0.62346935, "learning_rate": 2.890121341534235e-08, "loss": 0.64505172, "num_input_tokens_seen": 339291980, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 15731, "time_per_iteration": 4.047123432159424 }, { "auxiliary_loss_clip": 0.0111251, "auxiliary_loss_mlp": 0.01026798, "balance_loss_clip": 1.01413178, "balance_loss_mlp": 1.03342581, "epoch": 0.9458590109724936, "flos": 26688379697280.0, "grad_norm": 1.8382020864030786, "language_loss": 0.64224744, "learning_rate": 2.8837249908797655e-08, "loss": 0.6636405, "num_input_tokens_seen": 339311795, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 15732, "time_per_iteration": 4.133410930633545 }, { "auxiliary_loss_clip": 0.01116187, "auxiliary_loss_mlp": 0.01028892, "balance_loss_clip": 1.01569557, "balance_loss_mlp": 1.03452849, "epoch": 0.9459191342251616, "flos": 18661591054080.0, "grad_norm": 1.9926652999782213, "language_loss": 0.84108078, "learning_rate": 2.8773356747981315e-08, "loss": 0.86253154, "num_input_tokens_seen": 339327745, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 15733, "time_per_iteration": 2.4939987659454346 }, { "auxiliary_loss_clip": 0.011083, "auxiliary_loss_mlp": 0.01026625, "balance_loss_clip": 1.01571155, "balance_loss_mlp": 1.03351784, "epoch": 0.9459792574778295, "flos": 23398459159680.0, "grad_norm": 1.8222200410696099, "language_loss": 0.7225337, "learning_rate": 2.870953393517306e-08, "loss": 0.74388289, "num_input_tokens_seen": 339346445, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.66015625, "step": 15734, "time_per_iteration": 2.5138487815856934 }, { "auxiliary_loss_clip": 0.01104541, "auxiliary_loss_mlp": 0.01029789, "balance_loss_clip": 1.01957297, "balance_loss_mlp": 1.0308392, "epoch": 0.9460393807304975, "flos": 24392969994240.0, "grad_norm": 1.371041087972914, "language_loss": 0.86508948, "learning_rate": 2.8645781472650843e-08, "loss": 0.88643277, "num_input_tokens_seen": 339367945, "router_z_loss_clip": 0.10253906, "router_z_loss_mlp": 0.6484375, "step": 15735, "time_per_iteration": 2.5792760848999023 }, { "auxiliary_loss_clip": 0.01119484, "auxiliary_loss_mlp": 0.01028031, "balance_loss_clip": 1.01616931, "balance_loss_mlp": 1.03360105, "epoch": 0.9460995039831654, "flos": 21939408708480.0, "grad_norm": 2.2183318595025203, "language_loss": 0.67208302, "learning_rate": 2.8582099362689737e-08, "loss": 0.69355816, "num_input_tokens_seen": 339386060, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 15736, "time_per_iteration": 2.5123910903930664 }, { "auxiliary_loss_clip": 0.01057671, "auxiliary_loss_mlp": 0.01002616, "balance_loss_clip": 1.00141835, "balance_loss_mlp": 1.00089443, "epoch": 0.9461596272358335, "flos": 70322466775680.0, "grad_norm": 0.9027071382085398, "language_loss": 0.65332472, "learning_rate": 2.851848760756237e-08, "loss": 0.67392761, "num_input_tokens_seen": 339446695, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.21289062, "step": 15737, "time_per_iteration": 3.2215664386749268 }, { "auxiliary_loss_clip": 0.01030966, "auxiliary_loss_mlp": 0.0100208, "balance_loss_clip": 1.00074506, "balance_loss_mlp": 1.00103831, "epoch": 0.9462197504885014, "flos": 58591242645120.0, "grad_norm": 0.8015840660752783, "language_loss": 0.588175, "learning_rate": 2.8454946209539145e-08, "loss": 0.60850543, "num_input_tokens_seen": 339510080, "router_z_loss_clip": 0.0133667, "router_z_loss_mlp": 0.21191406, "step": 15738, "time_per_iteration": 3.112004041671753 }, { "auxiliary_loss_clip": 0.01108433, "auxiliary_loss_mlp": 0.0103093, "balance_loss_clip": 1.01980758, "balance_loss_mlp": 1.03324032, "epoch": 0.9462798737411694, "flos": 23433759250560.0, "grad_norm": 2.1972919102301525, "language_loss": 0.71656251, "learning_rate": 2.8391475170887135e-08, "loss": 0.73795617, "num_input_tokens_seen": 339529335, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6640625, "step": 15739, "time_per_iteration": 2.589458703994751 }, { "auxiliary_loss_clip": 0.01110845, "auxiliary_loss_mlp": 0.01028831, "balance_loss_clip": 1.01669598, "balance_loss_mlp": 1.03350604, "epoch": 0.9463399969938374, "flos": 25046077034880.0, "grad_norm": 2.2210910957579992, "language_loss": 0.64907062, "learning_rate": 2.832807449387209e-08, "loss": 0.67046738, "num_input_tokens_seen": 339548820, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 15740, "time_per_iteration": 2.536221981048584 }, { "auxiliary_loss_clip": 0.01117336, "auxiliary_loss_mlp": 0.01028544, "balance_loss_clip": 1.01688492, "balance_loss_mlp": 1.0330646, "epoch": 0.9464001202465053, "flos": 24352606085760.0, "grad_norm": 1.8522844021781013, "language_loss": 0.66760862, "learning_rate": 2.8264744180755974e-08, "loss": 0.68906742, "num_input_tokens_seen": 339566775, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.66796875, "step": 15741, "time_per_iteration": 2.6539289951324463 }, { "auxiliary_loss_clip": 0.0111093, "auxiliary_loss_mlp": 0.01024359, "balance_loss_clip": 1.01218188, "balance_loss_mlp": 1.03368247, "epoch": 0.9464602434991733, "flos": 17165444832000.0, "grad_norm": 1.9502127283033364, "language_loss": 0.75955564, "learning_rate": 2.8201484233799643e-08, "loss": 0.78090858, "num_input_tokens_seen": 339581905, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 15742, "time_per_iteration": 2.5056159496307373 }, { "auxiliary_loss_clip": 0.01108621, "auxiliary_loss_mlp": 0.01030046, "balance_loss_clip": 1.01910305, "balance_loss_mlp": 1.0333519, "epoch": 0.9465203667518413, "flos": 19938107566080.0, "grad_norm": 1.9483439608498407, "language_loss": 0.72256333, "learning_rate": 2.8138294655259964e-08, "loss": 0.74394995, "num_input_tokens_seen": 339599870, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6640625, "step": 15743, "time_per_iteration": 2.696000576019287 }, { "auxiliary_loss_clip": 0.01132233, "auxiliary_loss_mlp": 0.01029584, "balance_loss_clip": 1.01721048, "balance_loss_mlp": 1.03427672, "epoch": 0.9465804900045093, "flos": 20120318282880.0, "grad_norm": 2.0109823058926333, "language_loss": 0.79475909, "learning_rate": 2.8075175447392464e-08, "loss": 0.81637728, "num_input_tokens_seen": 339620250, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7109375, "step": 15744, "time_per_iteration": 2.5438644886016846 }, { "auxiliary_loss_clip": 0.01121894, "auxiliary_loss_mlp": 0.01035099, "balance_loss_clip": 1.02222431, "balance_loss_mlp": 1.03373885, "epoch": 0.9466406132571772, "flos": 23911622812800.0, "grad_norm": 1.7423314080396477, "language_loss": 0.7814908, "learning_rate": 2.801212661244956e-08, "loss": 0.80306077, "num_input_tokens_seen": 339639900, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 15745, "time_per_iteration": 2.6190645694732666 }, { "auxiliary_loss_clip": 0.01120933, "auxiliary_loss_mlp": 0.01031742, "balance_loss_clip": 1.01967847, "balance_loss_mlp": 1.03271306, "epoch": 0.9467007365098452, "flos": 19933223316480.0, "grad_norm": 1.3989053895587424, "language_loss": 0.70406491, "learning_rate": 2.7949148152681234e-08, "loss": 0.72559166, "num_input_tokens_seen": 339658970, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 15746, "time_per_iteration": 2.7338755130767822 }, { "auxiliary_loss_clip": 0.01120817, "auxiliary_loss_mlp": 0.01027038, "balance_loss_clip": 1.01478875, "balance_loss_mlp": 1.03491044, "epoch": 0.9467608597625131, "flos": 19310496203520.0, "grad_norm": 2.2178312003189973, "language_loss": 0.56358194, "learning_rate": 2.788624007033502e-08, "loss": 0.58506048, "num_input_tokens_seen": 339675600, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 15747, "time_per_iteration": 2.6113691329956055 }, { "auxiliary_loss_clip": 0.01075499, "auxiliary_loss_mlp": 0.01001465, "balance_loss_clip": 1.00030887, "balance_loss_mlp": 1.00079763, "epoch": 0.9468209830151811, "flos": 69630252802560.0, "grad_norm": 0.6563308608126533, "language_loss": 0.53239751, "learning_rate": 2.7823402367656235e-08, "loss": 0.55316716, "num_input_tokens_seen": 339744505, "router_z_loss_clip": 0.01153564, "router_z_loss_mlp": 0.21289062, "step": 15748, "time_per_iteration": 3.4577293395996094 }, { "auxiliary_loss_clip": 0.01121872, "auxiliary_loss_mlp": 0.01034979, "balance_loss_clip": 1.02262282, "balance_loss_mlp": 1.03551722, "epoch": 0.946881106267849, "flos": 27016639113600.0, "grad_norm": 1.626673969264738, "language_loss": 0.66123271, "learning_rate": 2.776063504688686e-08, "loss": 0.68280125, "num_input_tokens_seen": 339765810, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 15749, "time_per_iteration": 2.7793333530426025 }, { "auxiliary_loss_clip": 0.01129396, "auxiliary_loss_mlp": 0.01032065, "balance_loss_clip": 1.0182724, "balance_loss_mlp": 1.03319061, "epoch": 0.9469412295205171, "flos": 20190092451840.0, "grad_norm": 1.9584512513534589, "language_loss": 0.76529217, "learning_rate": 2.7697938110267547e-08, "loss": 0.78690678, "num_input_tokens_seen": 339784125, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.6953125, "step": 15750, "time_per_iteration": 2.9727084636688232 }, { "auxiliary_loss_clip": 0.01110515, "auxiliary_loss_mlp": 0.0102924, "balance_loss_clip": 1.01711655, "balance_loss_mlp": 1.034271, "epoch": 0.947001352773185, "flos": 21324905809920.0, "grad_norm": 1.6429977423205284, "language_loss": 0.6795215, "learning_rate": 2.7635311560035178e-08, "loss": 0.70091909, "num_input_tokens_seen": 339803450, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.67578125, "step": 15751, "time_per_iteration": 2.6819007396698 }, { "auxiliary_loss_clip": 0.01130768, "auxiliary_loss_mlp": 0.01028333, "balance_loss_clip": 1.0166626, "balance_loss_mlp": 1.03536069, "epoch": 0.947061476025853, "flos": 21944041562880.0, "grad_norm": 2.008145650408699, "language_loss": 0.65589368, "learning_rate": 2.7572755398425296e-08, "loss": 0.67748463, "num_input_tokens_seen": 339823215, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 15752, "time_per_iteration": 2.7990615367889404 }, { "auxiliary_loss_clip": 0.01109863, "auxiliary_loss_mlp": 0.01028441, "balance_loss_clip": 1.01590598, "balance_loss_mlp": 1.03268886, "epoch": 0.947121599278521, "flos": 21394715892480.0, "grad_norm": 6.017036541198085, "language_loss": 0.71857208, "learning_rate": 2.7510269627669892e-08, "loss": 0.73995513, "num_input_tokens_seen": 339842230, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 15753, "time_per_iteration": 2.7509238719940186 }, { "auxiliary_loss_clip": 0.01103881, "auxiliary_loss_mlp": 0.01031639, "balance_loss_clip": 1.01752436, "balance_loss_mlp": 1.03326321, "epoch": 0.9471817225311889, "flos": 23075730437760.0, "grad_norm": 2.4591840062524213, "language_loss": 0.69998908, "learning_rate": 2.7447854249998957e-08, "loss": 0.72134429, "num_input_tokens_seen": 339861640, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.70703125, "step": 15754, "time_per_iteration": 2.6898128986358643 }, { "auxiliary_loss_clip": 0.01104227, "auxiliary_loss_mlp": 0.01029632, "balance_loss_clip": 1.01777649, "balance_loss_mlp": 1.03337455, "epoch": 0.947241845783857, "flos": 11910744305280.0, "grad_norm": 4.928232804165877, "language_loss": 0.78609461, "learning_rate": 2.7385509267640495e-08, "loss": 0.80743325, "num_input_tokens_seen": 339878210, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.70703125, "step": 15755, "time_per_iteration": 2.7847402095794678 }, { "auxiliary_loss_clip": 0.01135203, "auxiliary_loss_mlp": 0.01036745, "balance_loss_clip": 1.02188528, "balance_loss_mlp": 1.03382409, "epoch": 0.9473019690365249, "flos": 20740675098240.0, "grad_norm": 2.228514144763919, "language_loss": 0.75162876, "learning_rate": 2.732323468281872e-08, "loss": 0.77334821, "num_input_tokens_seen": 339894255, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7421875, "step": 15756, "time_per_iteration": 2.7137441635131836 }, { "auxiliary_loss_clip": 0.01121222, "auxiliary_loss_mlp": 0.01028767, "balance_loss_clip": 1.01730537, "balance_loss_mlp": 1.03372145, "epoch": 0.9473620922891929, "flos": 17639896602240.0, "grad_norm": 1.8899868041661374, "language_loss": 0.74689198, "learning_rate": 2.726103049775652e-08, "loss": 0.76839191, "num_input_tokens_seen": 339912425, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6953125, "step": 15757, "time_per_iteration": 2.907397747039795 }, { "auxiliary_loss_clip": 0.01124444, "auxiliary_loss_mlp": 0.01031796, "balance_loss_clip": 1.01830184, "balance_loss_mlp": 1.03460026, "epoch": 0.9474222155418608, "flos": 23550002640000.0, "grad_norm": 1.9375417984692058, "language_loss": 0.79633349, "learning_rate": 2.719889671467346e-08, "loss": 0.81789589, "num_input_tokens_seen": 339929635, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 15758, "time_per_iteration": 4.266587972640991 }, { "auxiliary_loss_clip": 0.01120428, "auxiliary_loss_mlp": 0.01278863, "balance_loss_clip": 1.02018332, "balance_loss_mlp": 1.03484607, "epoch": 0.9474823387945288, "flos": 27089753247360.0, "grad_norm": 1.4371769552411855, "language_loss": 0.72199559, "learning_rate": 2.7136833335787534e-08, "loss": 0.74598849, "num_input_tokens_seen": 339951200, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 15759, "time_per_iteration": 2.9485838413238525 }, { "auxiliary_loss_clip": 0.01107695, "auxiliary_loss_mlp": 0.01026188, "balance_loss_clip": 1.01587069, "balance_loss_mlp": 1.03478169, "epoch": 0.9475424620471967, "flos": 22966526113920.0, "grad_norm": 1.545653683529714, "language_loss": 0.75732738, "learning_rate": 2.707484036331298e-08, "loss": 0.77866626, "num_input_tokens_seen": 339971820, "router_z_loss_clip": 0.10302734, "router_z_loss_mlp": 0.640625, "step": 15760, "time_per_iteration": 2.99147367477417 }, { "auxiliary_loss_clip": 0.01102485, "auxiliary_loss_mlp": 0.01278253, "balance_loss_clip": 1.01958406, "balance_loss_mlp": 1.03465629, "epoch": 0.9476025852998647, "flos": 20047671025920.0, "grad_norm": 2.598177426184601, "language_loss": 0.72551191, "learning_rate": 2.7012917799462242e-08, "loss": 0.74931931, "num_input_tokens_seen": 339989420, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 15761, "time_per_iteration": 2.748243808746338 }, { "auxiliary_loss_clip": 0.01121813, "auxiliary_loss_mlp": 0.01035723, "balance_loss_clip": 1.02382028, "balance_loss_mlp": 1.03707695, "epoch": 0.9476627085525327, "flos": 14975468524800.0, "grad_norm": 2.1844697879607837, "language_loss": 0.71548152, "learning_rate": 2.6951065646445558e-08, "loss": 0.73705691, "num_input_tokens_seen": 340006690, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 15762, "time_per_iteration": 2.806321144104004 }, { "auxiliary_loss_clip": 0.01104744, "auxiliary_loss_mlp": 0.01033176, "balance_loss_clip": 1.01949644, "balance_loss_mlp": 1.03223336, "epoch": 0.9477228318052007, "flos": 18697788984960.0, "grad_norm": 1.778950205328039, "language_loss": 0.67446935, "learning_rate": 2.688928390647005e-08, "loss": 0.69584846, "num_input_tokens_seen": 340025480, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 15763, "time_per_iteration": 2.7787232398986816 }, { "auxiliary_loss_clip": 0.0110181, "auxiliary_loss_mlp": 0.01037444, "balance_loss_clip": 1.02552867, "balance_loss_mlp": 1.03244293, "epoch": 0.9477829550578686, "flos": 23875065745920.0, "grad_norm": 3.429924030598325, "language_loss": 0.70206577, "learning_rate": 2.6827572581740398e-08, "loss": 0.72345829, "num_input_tokens_seen": 340043785, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 15764, "time_per_iteration": 2.7949483394622803 }, { "auxiliary_loss_clip": 0.01108555, "auxiliary_loss_mlp": 0.01274255, "balance_loss_clip": 1.01712775, "balance_loss_mlp": 1.0362699, "epoch": 0.9478430783105366, "flos": 22562890007040.0, "grad_norm": 1.7139717556145884, "language_loss": 0.71043372, "learning_rate": 2.6765931674459286e-08, "loss": 0.73426187, "num_input_tokens_seen": 340064360, "router_z_loss_clip": 0.10400391, "router_z_loss_mlp": 0.6328125, "step": 15765, "time_per_iteration": 2.8000450134277344 }, { "auxiliary_loss_clip": 0.01031382, "auxiliary_loss_mlp": 0.01000852, "balance_loss_clip": 0.99966586, "balance_loss_mlp": 1.00112009, "epoch": 0.9479032015632046, "flos": 60857885554560.0, "grad_norm": 0.8064612718157842, "language_loss": 0.59100991, "learning_rate": 2.6704361186826287e-08, "loss": 0.61133224, "num_input_tokens_seen": 340114425, "router_z_loss_clip": 0.01184082, "router_z_loss_mlp": 0.20996094, "step": 15766, "time_per_iteration": 4.479760646820068 }, { "auxiliary_loss_clip": 0.01140979, "auxiliary_loss_mlp": 0.0103565, "balance_loss_clip": 1.02412844, "balance_loss_mlp": 1.03575134, "epoch": 0.9479633248158725, "flos": 14683873916160.0, "grad_norm": 3.1549451383206444, "language_loss": 0.74361253, "learning_rate": 2.6642861121038306e-08, "loss": 0.76537883, "num_input_tokens_seen": 340132200, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.69140625, "step": 15767, "time_per_iteration": 2.821648359298706 }, { "auxiliary_loss_clip": 0.01133769, "auxiliary_loss_mlp": 0.01033886, "balance_loss_clip": 1.02075529, "balance_loss_mlp": 1.03538823, "epoch": 0.9480234480685406, "flos": 20333878594560.0, "grad_norm": 2.542015489541037, "language_loss": 0.73527491, "learning_rate": 2.6581431479290926e-08, "loss": 0.75695145, "num_input_tokens_seen": 340149175, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 15768, "time_per_iteration": 2.8970558643341064 }, { "auxiliary_loss_clip": 0.01057033, "auxiliary_loss_mlp": 0.01002626, "balance_loss_clip": 1.00142848, "balance_loss_mlp": 1.00070453, "epoch": 0.9480835713212085, "flos": 70293092428800.0, "grad_norm": 0.7361016743554654, "language_loss": 0.55304134, "learning_rate": 2.652007226377595e-08, "loss": 0.57363796, "num_input_tokens_seen": 340208155, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.2109375, "step": 15769, "time_per_iteration": 3.4145474433898926 }, { "auxiliary_loss_clip": 0.0113618, "auxiliary_loss_mlp": 0.01032548, "balance_loss_clip": 1.02047825, "balance_loss_mlp": 1.0313623, "epoch": 0.9481436945738765, "flos": 25449749055360.0, "grad_norm": 1.3807933884600752, "language_loss": 0.77479994, "learning_rate": 2.6458783476683176e-08, "loss": 0.79648727, "num_input_tokens_seen": 340229275, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 15770, "time_per_iteration": 2.8537497520446777 }, { "auxiliary_loss_clip": 0.01125543, "auxiliary_loss_mlp": 0.01033895, "balance_loss_clip": 1.01984024, "balance_loss_mlp": 1.03776002, "epoch": 0.9482038178265444, "flos": 26979902478720.0, "grad_norm": 1.8302585101412516, "language_loss": 0.79673547, "learning_rate": 2.639756512019975e-08, "loss": 0.81832981, "num_input_tokens_seen": 340248920, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.703125, "step": 15771, "time_per_iteration": 2.766604423522949 }, { "auxiliary_loss_clip": 0.01127278, "auxiliary_loss_mlp": 0.0102961, "balance_loss_clip": 1.01777864, "balance_loss_mlp": 1.03422475, "epoch": 0.9482639410792124, "flos": 17785442511360.0, "grad_norm": 1.8122447154644834, "language_loss": 0.69368029, "learning_rate": 2.633641719651081e-08, "loss": 0.71524918, "num_input_tokens_seen": 340266775, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66796875, "step": 15772, "time_per_iteration": 2.883718729019165 }, { "auxiliary_loss_clip": 0.01111796, "auxiliary_loss_mlp": 0.01028944, "balance_loss_clip": 1.01628399, "balance_loss_mlp": 1.03483033, "epoch": 0.9483240643318803, "flos": 20996682307200.0, "grad_norm": 1.5741954814471124, "language_loss": 0.73741472, "learning_rate": 2.627533970779794e-08, "loss": 0.75882208, "num_input_tokens_seen": 340285295, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 15773, "time_per_iteration": 6.810993671417236 }, { "auxiliary_loss_clip": 0.0113031, "auxiliary_loss_mlp": 0.01038362, "balance_loss_clip": 1.02523088, "balance_loss_mlp": 1.03272152, "epoch": 0.9483841875845483, "flos": 20083294339200.0, "grad_norm": 2.4461737787910165, "language_loss": 0.62898558, "learning_rate": 2.62143326562414e-08, "loss": 0.65067226, "num_input_tokens_seen": 340304265, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 15774, "time_per_iteration": 2.77789044380188 }, { "auxiliary_loss_clip": 0.01103095, "auxiliary_loss_mlp": 0.01033136, "balance_loss_clip": 1.02034533, "balance_loss_mlp": 1.03353083, "epoch": 0.9484443108372163, "flos": 20813645577600.0, "grad_norm": 1.754575592452814, "language_loss": 0.59346944, "learning_rate": 2.615339604401812e-08, "loss": 0.61483181, "num_input_tokens_seen": 340323690, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 15775, "time_per_iteration": 2.771303415298462 }, { "auxiliary_loss_clip": 0.01122507, "auxiliary_loss_mlp": 0.01028044, "balance_loss_clip": 1.01512766, "balance_loss_mlp": 1.03473926, "epoch": 0.9485044340898843, "flos": 36429184506240.0, "grad_norm": 1.4875892438679459, "language_loss": 0.61846709, "learning_rate": 2.6092529873302794e-08, "loss": 0.63997257, "num_input_tokens_seen": 340345830, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 15776, "time_per_iteration": 2.9390909671783447 }, { "auxiliary_loss_clip": 0.01120589, "auxiliary_loss_mlp": 0.01031429, "balance_loss_clip": 1.01844716, "balance_loss_mlp": 1.03332686, "epoch": 0.9485645573425522, "flos": 22602535643520.0, "grad_norm": 1.7755249429589322, "language_loss": 0.73496509, "learning_rate": 2.603173414626747e-08, "loss": 0.75648522, "num_input_tokens_seen": 340365910, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 15777, "time_per_iteration": 2.8500237464904785 }, { "auxiliary_loss_clip": 0.01124111, "auxiliary_loss_mlp": 0.01038823, "balance_loss_clip": 1.02547097, "balance_loss_mlp": 1.03900242, "epoch": 0.9486246805952202, "flos": 22017766227840.0, "grad_norm": 4.9938885729948, "language_loss": 0.72615051, "learning_rate": 2.5971008865081966e-08, "loss": 0.74777985, "num_input_tokens_seen": 340383935, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.76171875, "step": 15778, "time_per_iteration": 2.8874430656433105 }, { "auxiliary_loss_clip": 0.01106321, "auxiliary_loss_mlp": 0.01279597, "balance_loss_clip": 1.02216792, "balance_loss_mlp": 1.03453815, "epoch": 0.9486848038478882, "flos": 16508674604160.0, "grad_norm": 2.5736998192214022, "language_loss": 0.70453471, "learning_rate": 2.591035403191322e-08, "loss": 0.72839391, "num_input_tokens_seen": 340402760, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.62890625, "step": 15779, "time_per_iteration": 2.6493611335754395 }, { "auxiliary_loss_clip": 0.01103318, "auxiliary_loss_mlp": 0.01033386, "balance_loss_clip": 1.02080309, "balance_loss_mlp": 1.03446651, "epoch": 0.9487449271005561, "flos": 19244385221760.0, "grad_norm": 2.1356819353882446, "language_loss": 0.77904606, "learning_rate": 2.5849769648926157e-08, "loss": 0.80041307, "num_input_tokens_seen": 340422105, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 15780, "time_per_iteration": 2.689164876937866 }, { "auxiliary_loss_clip": 0.01121286, "auxiliary_loss_mlp": 0.01284207, "balance_loss_clip": 1.02486002, "balance_loss_mlp": 1.03477538, "epoch": 0.9488050503532242, "flos": 21762692772480.0, "grad_norm": 1.677071297504399, "language_loss": 0.66055667, "learning_rate": 2.578925571828261e-08, "loss": 0.68461168, "num_input_tokens_seen": 340441160, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 15781, "time_per_iteration": 2.884686231613159 }, { "auxiliary_loss_clip": 0.01102068, "auxiliary_loss_mlp": 0.01032366, "balance_loss_clip": 1.0200274, "balance_loss_mlp": 1.03477311, "epoch": 0.9488651736058921, "flos": 18368919037440.0, "grad_norm": 2.0314532126348825, "language_loss": 0.80063933, "learning_rate": 2.5728812242142405e-08, "loss": 0.82198364, "num_input_tokens_seen": 340458200, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.671875, "step": 15782, "time_per_iteration": 2.849215507507324 }, { "auxiliary_loss_clip": 0.01030671, "auxiliary_loss_mlp": 0.01002798, "balance_loss_clip": 1.00151682, "balance_loss_mlp": 1.00071526, "epoch": 0.9489252968585601, "flos": 70084057230720.0, "grad_norm": 0.7134587048341889, "language_loss": 0.59801352, "learning_rate": 2.5668439222662264e-08, "loss": 0.61834824, "num_input_tokens_seen": 340526420, "router_z_loss_clip": 0.01281738, "router_z_loss_mlp": 0.21191406, "step": 15783, "time_per_iteration": 3.436514377593994 }, { "auxiliary_loss_clip": 0.01111207, "auxiliary_loss_mlp": 0.01031249, "balance_loss_clip": 1.01872063, "balance_loss_mlp": 1.03324008, "epoch": 0.948985420111228, "flos": 27855440490240.0, "grad_norm": 1.800798593613742, "language_loss": 0.73914754, "learning_rate": 2.5608136661996903e-08, "loss": 0.76057208, "num_input_tokens_seen": 340546325, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 15784, "time_per_iteration": 2.6600890159606934 }, { "auxiliary_loss_clip": 0.01138032, "auxiliary_loss_mlp": 0.01028684, "balance_loss_clip": 1.01595235, "balance_loss_mlp": 1.03294659, "epoch": 0.949045543363896, "flos": 24316049018880.0, "grad_norm": 1.904737178668284, "language_loss": 0.69593257, "learning_rate": 2.5547904562298604e-08, "loss": 0.71759975, "num_input_tokens_seen": 340565145, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 15785, "time_per_iteration": 2.791919469833374 }, { "auxiliary_loss_clip": 0.01103727, "auxiliary_loss_mlp": 0.01027866, "balance_loss_clip": 1.01564133, "balance_loss_mlp": 1.03339493, "epoch": 0.9491056666165639, "flos": 24241677909120.0, "grad_norm": 2.634337093920014, "language_loss": 0.75852835, "learning_rate": 2.5487742925716538e-08, "loss": 0.77984416, "num_input_tokens_seen": 340585465, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 15786, "time_per_iteration": 2.8732893466949463 }, { "auxiliary_loss_clip": 0.01129946, "auxiliary_loss_mlp": 0.01030903, "balance_loss_clip": 1.01944065, "balance_loss_mlp": 1.03504145, "epoch": 0.949165789869232, "flos": 24531261356160.0, "grad_norm": 1.7438537459362744, "language_loss": 0.78926605, "learning_rate": 2.5427651754398095e-08, "loss": 0.81087458, "num_input_tokens_seen": 340606010, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.68359375, "step": 15787, "time_per_iteration": 2.793678045272827 }, { "auxiliary_loss_clip": 0.01110961, "auxiliary_loss_mlp": 0.01028345, "balance_loss_clip": 1.01690745, "balance_loss_mlp": 1.03661692, "epoch": 0.9492259131218999, "flos": 22235348862720.0, "grad_norm": 1.5642423936513001, "language_loss": 0.76248312, "learning_rate": 2.5367631050487115e-08, "loss": 0.78387618, "num_input_tokens_seen": 340626135, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.65625, "step": 15788, "time_per_iteration": 2.751056432723999 }, { "auxiliary_loss_clip": 0.01112644, "auxiliary_loss_mlp": 0.01028967, "balance_loss_clip": 1.01543069, "balance_loss_mlp": 1.03377569, "epoch": 0.9492860363745679, "flos": 22966310632320.0, "grad_norm": 1.8784108166305227, "language_loss": 0.7150228, "learning_rate": 2.5307680816126553e-08, "loss": 0.73643887, "num_input_tokens_seen": 340644870, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.69921875, "step": 15789, "time_per_iteration": 2.960205554962158 }, { "auxiliary_loss_clip": 0.01127652, "auxiliary_loss_mlp": 0.01029854, "balance_loss_clip": 1.01728964, "balance_loss_mlp": 1.03384757, "epoch": 0.9493461596272358, "flos": 18370283754240.0, "grad_norm": 1.8145714376980728, "language_loss": 0.73659056, "learning_rate": 2.524780105345492e-08, "loss": 0.7581656, "num_input_tokens_seen": 340663695, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6640625, "step": 15790, "time_per_iteration": 2.6389832496643066 }, { "auxiliary_loss_clip": 0.01130921, "auxiliary_loss_mlp": 0.01033618, "balance_loss_clip": 1.02071917, "balance_loss_mlp": 1.03382492, "epoch": 0.9494062828799038, "flos": 20011724490240.0, "grad_norm": 1.8347490656835657, "language_loss": 0.77551121, "learning_rate": 2.518799176460984e-08, "loss": 0.79715657, "num_input_tokens_seen": 340682970, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 15791, "time_per_iteration": 2.9308173656463623 }, { "auxiliary_loss_clip": 0.01116134, "auxiliary_loss_mlp": 0.01033546, "balance_loss_clip": 1.02013516, "balance_loss_mlp": 1.03514266, "epoch": 0.9494664061325718, "flos": 27228583313280.0, "grad_norm": 1.9562973236482089, "language_loss": 0.73556125, "learning_rate": 2.5128252951725603e-08, "loss": 0.75705802, "num_input_tokens_seen": 340702275, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 15792, "time_per_iteration": 2.9445176124572754 }, { "auxiliary_loss_clip": 0.01110795, "auxiliary_loss_mlp": 0.01035425, "balance_loss_clip": 1.02318215, "balance_loss_mlp": 1.03401399, "epoch": 0.9495265293852397, "flos": 18369816877440.0, "grad_norm": 3.6084596995200613, "language_loss": 0.78337741, "learning_rate": 2.506858461693384e-08, "loss": 0.80483955, "num_input_tokens_seen": 340719060, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 15793, "time_per_iteration": 2.577805280685425 }, { "auxiliary_loss_clip": 0.01120316, "auxiliary_loss_mlp": 0.01028416, "balance_loss_clip": 1.01665628, "balance_loss_mlp": 1.0340699, "epoch": 0.9495866526379078, "flos": 23075766351360.0, "grad_norm": 1.7103516052004213, "language_loss": 0.7749604, "learning_rate": 2.500898676236396e-08, "loss": 0.79644775, "num_input_tokens_seen": 340737815, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 15794, "time_per_iteration": 2.945003032684326 }, { "auxiliary_loss_clip": 0.01131607, "auxiliary_loss_mlp": 0.01278502, "balance_loss_clip": 1.01798677, "balance_loss_mlp": 1.03451693, "epoch": 0.9496467758905757, "flos": 17529902179200.0, "grad_norm": 2.056530416840458, "language_loss": 0.60836446, "learning_rate": 2.4949459390143367e-08, "loss": 0.6324656, "num_input_tokens_seen": 340756035, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 15795, "time_per_iteration": 2.560391902923584 }, { "auxiliary_loss_clip": 0.01109642, "auxiliary_loss_mlp": 0.01035103, "balance_loss_clip": 1.02289557, "balance_loss_mlp": 1.03433692, "epoch": 0.9497068991432437, "flos": 24133910129280.0, "grad_norm": 1.9063235795117672, "language_loss": 0.79452372, "learning_rate": 2.4890002502396368e-08, "loss": 0.8159712, "num_input_tokens_seen": 340775620, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 15796, "time_per_iteration": 3.1619110107421875 }, { "auxiliary_loss_clip": 0.01111892, "auxiliary_loss_mlp": 0.0102912, "balance_loss_clip": 1.01610196, "balance_loss_mlp": 1.03394079, "epoch": 0.9497670223959116, "flos": 20303319098880.0, "grad_norm": 2.329037891050265, "language_loss": 0.75255138, "learning_rate": 2.483061610124415e-08, "loss": 0.77396148, "num_input_tokens_seen": 340794510, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69140625, "step": 15797, "time_per_iteration": 2.608358144760132 }, { "auxiliary_loss_clip": 0.01131705, "auxiliary_loss_mlp": 0.01278354, "balance_loss_clip": 1.01948166, "balance_loss_mlp": 1.03561997, "epoch": 0.9498271456485796, "flos": 13698916099200.0, "grad_norm": 1.8206070386109297, "language_loss": 0.65808332, "learning_rate": 2.4771300188806576e-08, "loss": 0.68218386, "num_input_tokens_seen": 340812955, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 15798, "time_per_iteration": 3.0487186908721924 }, { "auxiliary_loss_clip": 0.01114678, "auxiliary_loss_mlp": 0.01037032, "balance_loss_clip": 1.02521205, "balance_loss_mlp": 1.03527749, "epoch": 0.9498872689012475, "flos": 20814004713600.0, "grad_norm": 1.898600291105226, "language_loss": 0.77475715, "learning_rate": 2.471205476720062e-08, "loss": 0.79627424, "num_input_tokens_seen": 340829200, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.70703125, "step": 15799, "time_per_iteration": 2.5561814308166504 }, { "auxiliary_loss_clip": 0.01110311, "auxiliary_loss_mlp": 0.0103304, "balance_loss_clip": 1.02133358, "balance_loss_mlp": 1.0345788, "epoch": 0.9499473921539155, "flos": 21032700670080.0, "grad_norm": 1.755762672780389, "language_loss": 0.71068019, "learning_rate": 2.4652879838540142e-08, "loss": 0.73211372, "num_input_tokens_seen": 340848035, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66796875, "step": 15800, "time_per_iteration": 4.3058555126190186 }, { "auxiliary_loss_clip": 0.01117066, "auxiliary_loss_mlp": 0.01027082, "balance_loss_clip": 1.01455927, "balance_loss_mlp": 1.03188264, "epoch": 0.9500075154065835, "flos": 20998693468800.0, "grad_norm": 1.7348025029376135, "language_loss": 0.71731389, "learning_rate": 2.4593775404937232e-08, "loss": 0.73875535, "num_input_tokens_seen": 340870025, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.671875, "step": 15801, "time_per_iteration": 2.7400872707366943 }, { "auxiliary_loss_clip": 0.01111928, "auxiliary_loss_mlp": 0.01028941, "balance_loss_clip": 1.01743698, "balance_loss_mlp": 1.03512645, "epoch": 0.9500676386592515, "flos": 25121956515840.0, "grad_norm": 1.5053343918276358, "language_loss": 0.81133044, "learning_rate": 2.4534741468501098e-08, "loss": 0.83273911, "num_input_tokens_seen": 340892290, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 15802, "time_per_iteration": 2.993746757507324 }, { "auxiliary_loss_clip": 0.0110542, "auxiliary_loss_mlp": 0.01032083, "balance_loss_clip": 1.02001858, "balance_loss_mlp": 1.03531122, "epoch": 0.9501277619119194, "flos": 27523625627520.0, "grad_norm": 1.6141898365501988, "language_loss": 0.677674, "learning_rate": 2.4475778031338713e-08, "loss": 0.699049, "num_input_tokens_seen": 340912260, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.703125, "step": 15803, "time_per_iteration": 2.609832763671875 }, { "auxiliary_loss_clip": 0.01118542, "auxiliary_loss_mlp": 0.01030787, "balance_loss_clip": 1.01885998, "balance_loss_mlp": 1.03359079, "epoch": 0.9501878851645874, "flos": 20813968800000.0, "grad_norm": 1.5736624358659885, "language_loss": 0.76116478, "learning_rate": 2.441688509555395e-08, "loss": 0.7826581, "num_input_tokens_seen": 340928930, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 15804, "time_per_iteration": 3.06296968460083 }, { "auxiliary_loss_clip": 0.01114701, "auxiliary_loss_mlp": 0.01033068, "balance_loss_clip": 1.02062249, "balance_loss_mlp": 1.03554845, "epoch": 0.9502480084172553, "flos": 18369385914240.0, "grad_norm": 1.5966872985193963, "language_loss": 0.73260438, "learning_rate": 2.4358062663248912e-08, "loss": 0.75408208, "num_input_tokens_seen": 340946615, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69921875, "step": 15805, "time_per_iteration": 2.661010265350342 }, { "auxiliary_loss_clip": 0.01122949, "auxiliary_loss_mlp": 0.01035762, "balance_loss_clip": 1.02345419, "balance_loss_mlp": 1.03488493, "epoch": 0.9503081316699233, "flos": 23549607590400.0, "grad_norm": 1.391927203547494, "language_loss": 0.80438036, "learning_rate": 2.429931073652258e-08, "loss": 0.82596743, "num_input_tokens_seen": 340967545, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 15806, "time_per_iteration": 2.7074759006500244 }, { "auxiliary_loss_clip": 0.01132318, "auxiliary_loss_mlp": 0.01031285, "balance_loss_clip": 1.01957917, "balance_loss_mlp": 1.0353024, "epoch": 0.9503682549225914, "flos": 18040444139520.0, "grad_norm": 3.2947034690997783, "language_loss": 0.8236599, "learning_rate": 2.4240629317471727e-08, "loss": 0.84529591, "num_input_tokens_seen": 340984955, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.703125, "step": 15807, "time_per_iteration": 2.591517686843872 }, { "auxiliary_loss_clip": 0.0110802, "auxiliary_loss_mlp": 0.01031743, "balance_loss_clip": 1.01978624, "balance_loss_mlp": 1.03155398, "epoch": 0.9504283781752593, "flos": 25886135387520.0, "grad_norm": 2.740998402136567, "language_loss": 0.71793318, "learning_rate": 2.4182018408190673e-08, "loss": 0.73933077, "num_input_tokens_seen": 341007300, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 15808, "time_per_iteration": 3.999685049057007 }, { "auxiliary_loss_clip": 0.01111509, "auxiliary_loss_mlp": 0.01028285, "balance_loss_clip": 1.01637614, "balance_loss_mlp": 1.03458655, "epoch": 0.9504885014279273, "flos": 22124025636480.0, "grad_norm": 1.4192623851402681, "language_loss": 0.69657254, "learning_rate": 2.4123478010770858e-08, "loss": 0.71797049, "num_input_tokens_seen": 341026695, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 15809, "time_per_iteration": 2.509896755218506 }, { "auxiliary_loss_clip": 0.01119304, "auxiliary_loss_mlp": 0.01028413, "balance_loss_clip": 1.01634932, "balance_loss_mlp": 1.03252196, "epoch": 0.9505486246805952, "flos": 21615961714560.0, "grad_norm": 1.572828577901956, "language_loss": 0.75210702, "learning_rate": 2.40650081273015e-08, "loss": 0.77358425, "num_input_tokens_seen": 341047080, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 15810, "time_per_iteration": 2.5577852725982666 }, { "auxiliary_loss_clip": 0.01127577, "auxiliary_loss_mlp": 0.01040175, "balance_loss_clip": 1.02728295, "balance_loss_mlp": 1.03208899, "epoch": 0.9506087479332632, "flos": 22528236360960.0, "grad_norm": 1.3839805659816269, "language_loss": 0.80074197, "learning_rate": 2.4006608759869374e-08, "loss": 0.82241952, "num_input_tokens_seen": 341067310, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 15811, "time_per_iteration": 2.574430465698242 }, { "auxiliary_loss_clip": 0.01120775, "auxiliary_loss_mlp": 0.01032434, "balance_loss_clip": 1.01999414, "balance_loss_mlp": 1.03406501, "epoch": 0.9506688711859311, "flos": 27527360641920.0, "grad_norm": 2.4776649185373087, "language_loss": 0.70067453, "learning_rate": 2.394827991055859e-08, "loss": 0.72220659, "num_input_tokens_seen": 341085110, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 15812, "time_per_iteration": 2.624547004699707 }, { "auxiliary_loss_clip": 0.01129044, "auxiliary_loss_mlp": 0.01029926, "balance_loss_clip": 1.01840448, "balance_loss_mlp": 1.03386784, "epoch": 0.9507289944385992, "flos": 19865783531520.0, "grad_norm": 1.5496094400549636, "language_loss": 0.65523994, "learning_rate": 2.3890021581450592e-08, "loss": 0.67682958, "num_input_tokens_seen": 341103190, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 15813, "time_per_iteration": 2.534942150115967 }, { "auxiliary_loss_clip": 0.01112483, "auxiliary_loss_mlp": 0.01033501, "balance_loss_clip": 1.02114511, "balance_loss_mlp": 1.03339267, "epoch": 0.9507891176912671, "flos": 25193274969600.0, "grad_norm": 1.3207645945292315, "language_loss": 0.70405531, "learning_rate": 2.383183377462461e-08, "loss": 0.72551513, "num_input_tokens_seen": 341125695, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 15814, "time_per_iteration": 2.5298218727111816 }, { "auxiliary_loss_clip": 0.01109823, "auxiliary_loss_mlp": 0.01027514, "balance_loss_clip": 1.01574779, "balance_loss_mlp": 1.03415143, "epoch": 0.9508492409439351, "flos": 24899561458560.0, "grad_norm": 4.098655403953133, "language_loss": 0.6391027, "learning_rate": 2.377371649215698e-08, "loss": 0.66047609, "num_input_tokens_seen": 341143930, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.66796875, "step": 15815, "time_per_iteration": 5.496275901794434 }, { "auxiliary_loss_clip": 0.01118261, "auxiliary_loss_mlp": 0.0102642, "balance_loss_clip": 1.0146004, "balance_loss_mlp": 1.03251815, "epoch": 0.950909364196603, "flos": 29784094375680.0, "grad_norm": 1.9380411162251807, "language_loss": 0.58854508, "learning_rate": 2.3715669736122267e-08, "loss": 0.60999185, "num_input_tokens_seen": 341164280, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 15816, "time_per_iteration": 2.6122756004333496 }, { "auxiliary_loss_clip": 0.01101934, "auxiliary_loss_mlp": 0.01036759, "balance_loss_clip": 1.02458167, "balance_loss_mlp": 1.03355837, "epoch": 0.950969487449271, "flos": 24717781704960.0, "grad_norm": 1.4184480679906597, "language_loss": 0.73601353, "learning_rate": 2.365769350859148e-08, "loss": 0.75740045, "num_input_tokens_seen": 341183670, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 15817, "time_per_iteration": 2.504859447479248 }, { "auxiliary_loss_clip": 0.0112194, "auxiliary_loss_mlp": 0.0103904, "balance_loss_clip": 1.02499676, "balance_loss_mlp": 1.03455305, "epoch": 0.9510296107019389, "flos": 13699167494400.0, "grad_norm": 1.5147425333909146, "language_loss": 0.60124981, "learning_rate": 2.3599787811633853e-08, "loss": 0.6228596, "num_input_tokens_seen": 341201900, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.69921875, "step": 15818, "time_per_iteration": 2.4995484352111816 }, { "auxiliary_loss_clip": 0.01104971, "auxiliary_loss_mlp": 0.01032172, "balance_loss_clip": 1.01963067, "balance_loss_mlp": 1.03650093, "epoch": 0.9510897339546069, "flos": 17311852667520.0, "grad_norm": 1.7711196629409196, "language_loss": 0.69585145, "learning_rate": 2.3541952647316175e-08, "loss": 0.71722287, "num_input_tokens_seen": 341218340, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 15819, "time_per_iteration": 2.455482244491577 }, { "auxiliary_loss_clip": 0.01142605, "auxiliary_loss_mlp": 0.0103284, "balance_loss_clip": 1.02060354, "balance_loss_mlp": 1.03658164, "epoch": 0.951149857207275, "flos": 14793940166400.0, "grad_norm": 1.8059520724888674, "language_loss": 0.74088299, "learning_rate": 2.348418801770169e-08, "loss": 0.76263744, "num_input_tokens_seen": 341235885, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 15820, "time_per_iteration": 2.516385078430176 }, { "auxiliary_loss_clip": 0.01122779, "auxiliary_loss_mlp": 0.01035751, "balance_loss_clip": 1.02257228, "balance_loss_mlp": 1.03449154, "epoch": 0.9512099804599429, "flos": 19391152193280.0, "grad_norm": 2.113744638732373, "language_loss": 0.78672767, "learning_rate": 2.342649392485252e-08, "loss": 0.80831301, "num_input_tokens_seen": 341255280, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 15821, "time_per_iteration": 2.476233959197998 }, { "auxiliary_loss_clip": 0.0112838, "auxiliary_loss_mlp": 0.01028114, "balance_loss_clip": 1.01563287, "balance_loss_mlp": 1.03296566, "epoch": 0.9512701037126109, "flos": 36757874885760.0, "grad_norm": 1.4053435102209333, "language_loss": 0.71083832, "learning_rate": 2.3368870370827465e-08, "loss": 0.73240328, "num_input_tokens_seen": 341279055, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 15822, "time_per_iteration": 2.6398279666900635 }, { "auxiliary_loss_clip": 0.01112068, "auxiliary_loss_mlp": 0.0103282, "balance_loss_clip": 1.02083349, "balance_loss_mlp": 1.03580964, "epoch": 0.9513302269652788, "flos": 22638266697600.0, "grad_norm": 1.5449122039023375, "language_loss": 0.66079491, "learning_rate": 2.3311317357683102e-08, "loss": 0.68224382, "num_input_tokens_seen": 341298560, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 15823, "time_per_iteration": 2.498528480529785 }, { "auxiliary_loss_clip": 0.01109951, "auxiliary_loss_mlp": 0.01027715, "balance_loss_clip": 1.01631308, "balance_loss_mlp": 1.03447032, "epoch": 0.9513903502179468, "flos": 22893232412160.0, "grad_norm": 1.6437110660041976, "language_loss": 0.77324629, "learning_rate": 2.3253834887472902e-08, "loss": 0.79462296, "num_input_tokens_seen": 341316650, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.66015625, "step": 15824, "time_per_iteration": 2.483776569366455 }, { "auxiliary_loss_clip": 0.01105281, "auxiliary_loss_mlp": 0.01029323, "balance_loss_clip": 1.01665735, "balance_loss_mlp": 1.03318131, "epoch": 0.9514504734706147, "flos": 27928626451200.0, "grad_norm": 2.000639621712917, "language_loss": 0.73794699, "learning_rate": 2.3196422962248553e-08, "loss": 0.75929296, "num_input_tokens_seen": 341336185, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 15825, "time_per_iteration": 2.4997308254241943 }, { "auxiliary_loss_clip": 0.01031085, "auxiliary_loss_mlp": 0.01002442, "balance_loss_clip": 1.00120795, "balance_loss_mlp": 1.00088215, "epoch": 0.9515105967232828, "flos": 67366767312000.0, "grad_norm": 0.8377954248515009, "language_loss": 0.53411222, "learning_rate": 2.3139081584059084e-08, "loss": 0.55444753, "num_input_tokens_seen": 341395795, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.2109375, "step": 15826, "time_per_iteration": 3.0283803939819336 }, { "auxiliary_loss_clip": 0.01119583, "auxiliary_loss_mlp": 0.01035525, "balance_loss_clip": 1.02304363, "balance_loss_mlp": 1.03366983, "epoch": 0.9515707199759507, "flos": 25846525664640.0, "grad_norm": 1.6085343891845363, "language_loss": 0.72760588, "learning_rate": 2.3081810754950636e-08, "loss": 0.74915695, "num_input_tokens_seen": 341415675, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 15827, "time_per_iteration": 2.510343551635742 }, { "auxiliary_loss_clip": 0.01120428, "auxiliary_loss_mlp": 0.01028644, "balance_loss_clip": 1.01624036, "balance_loss_mlp": 1.03416085, "epoch": 0.9516308432286187, "flos": 21828983322240.0, "grad_norm": 1.8293077624685739, "language_loss": 0.74554813, "learning_rate": 2.3024610476967132e-08, "loss": 0.76703888, "num_input_tokens_seen": 341432990, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 15828, "time_per_iteration": 2.4736974239349365 }, { "auxiliary_loss_clip": 0.01128995, "auxiliary_loss_mlp": 0.01032294, "balance_loss_clip": 1.02084959, "balance_loss_mlp": 1.03592491, "epoch": 0.9516909664812866, "flos": 27269593666560.0, "grad_norm": 1.5756576548569046, "language_loss": 0.73065519, "learning_rate": 2.2967480752149604e-08, "loss": 0.75226808, "num_input_tokens_seen": 341454100, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6640625, "step": 15829, "time_per_iteration": 2.5675888061523438 }, { "auxiliary_loss_clip": 0.01110652, "auxiliary_loss_mlp": 0.01028879, "balance_loss_clip": 1.01738739, "balance_loss_mlp": 1.03290391, "epoch": 0.9517510897339546, "flos": 21215342350080.0, "grad_norm": 1.9340463540434083, "language_loss": 0.60776103, "learning_rate": 2.2910421582537532e-08, "loss": 0.62915629, "num_input_tokens_seen": 341472955, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 15830, "time_per_iteration": 2.4916162490844727 }, { "auxiliary_loss_clip": 0.01114703, "auxiliary_loss_mlp": 0.01031351, "balance_loss_clip": 1.0176897, "balance_loss_mlp": 1.03473318, "epoch": 0.9518112129866225, "flos": 18733986915840.0, "grad_norm": 1.962236223368792, "language_loss": 0.72699493, "learning_rate": 2.2853432970166618e-08, "loss": 0.74845546, "num_input_tokens_seen": 341490165, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.70703125, "step": 15831, "time_per_iteration": 2.5148746967315674 }, { "auxiliary_loss_clip": 0.01134316, "auxiliary_loss_mlp": 0.01027529, "balance_loss_clip": 1.01644254, "balance_loss_mlp": 1.03202736, "epoch": 0.9518713362392905, "flos": 16763676232320.0, "grad_norm": 1.8278109109300058, "language_loss": 0.65787774, "learning_rate": 2.279651491707102e-08, "loss": 0.67949617, "num_input_tokens_seen": 341508055, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6640625, "step": 15832, "time_per_iteration": 2.5363545417785645 }, { "auxiliary_loss_clip": 0.01112743, "auxiliary_loss_mlp": 0.01033183, "balance_loss_clip": 1.02196574, "balance_loss_mlp": 1.03827786, "epoch": 0.9519314594919586, "flos": 15230649720960.0, "grad_norm": 1.7907971332381836, "language_loss": 0.78085589, "learning_rate": 2.2739667425281773e-08, "loss": 0.80231524, "num_input_tokens_seen": 341526155, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.65234375, "step": 15833, "time_per_iteration": 2.556565046310425 }, { "auxiliary_loss_clip": 0.01127965, "auxiliary_loss_mlp": 0.01029745, "balance_loss_clip": 1.01808667, "balance_loss_mlp": 1.03531313, "epoch": 0.9519915827446265, "flos": 19352943100800.0, "grad_norm": 2.310202831561678, "language_loss": 0.74653602, "learning_rate": 2.26828904968277e-08, "loss": 0.76811314, "num_input_tokens_seen": 341540450, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6640625, "step": 15834, "time_per_iteration": 2.6359031200408936 }, { "auxiliary_loss_clip": 0.01116808, "auxiliary_loss_mlp": 0.01036069, "balance_loss_clip": 1.02269936, "balance_loss_mlp": 1.03511667, "epoch": 0.9520517059972945, "flos": 22266303408000.0, "grad_norm": 2.61157879854253, "language_loss": 0.76555145, "learning_rate": 2.262618413373474e-08, "loss": 0.78708029, "num_input_tokens_seen": 341557865, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 15835, "time_per_iteration": 2.580005645751953 }, { "auxiliary_loss_clip": 0.01122449, "auxiliary_loss_mlp": 0.01031164, "balance_loss_clip": 1.01833153, "balance_loss_mlp": 1.03420138, "epoch": 0.9521118292499624, "flos": 14862313704960.0, "grad_norm": 1.9543465325667941, "language_loss": 0.65738535, "learning_rate": 2.2569548338027266e-08, "loss": 0.67892158, "num_input_tokens_seen": 341573890, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 15836, "time_per_iteration": 2.492398262023926 }, { "auxiliary_loss_clip": 0.0112612, "auxiliary_loss_mlp": 0.01029355, "balance_loss_clip": 1.0167253, "balance_loss_mlp": 1.03274679, "epoch": 0.9521719525026304, "flos": 23508812718720.0, "grad_norm": 1.6238695683999518, "language_loss": 0.70523316, "learning_rate": 2.251298311172589e-08, "loss": 0.72678792, "num_input_tokens_seen": 341593770, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.671875, "step": 15837, "time_per_iteration": 2.8077120780944824 }, { "auxiliary_loss_clip": 0.01103991, "auxiliary_loss_mlp": 0.0103227, "balance_loss_clip": 1.01818526, "balance_loss_mlp": 1.03365433, "epoch": 0.9522320757552983, "flos": 19714922409600.0, "grad_norm": 2.7294563662134674, "language_loss": 0.73595333, "learning_rate": 2.245648845684922e-08, "loss": 0.75731599, "num_input_tokens_seen": 341612065, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.703125, "step": 15838, "time_per_iteration": 2.609771966934204 }, { "auxiliary_loss_clip": 0.0112971, "auxiliary_loss_mlp": 0.01027766, "balance_loss_clip": 1.01527286, "balance_loss_mlp": 1.03279662, "epoch": 0.9522921990079664, "flos": 25921291824000.0, "grad_norm": 1.7985049648173, "language_loss": 0.77930248, "learning_rate": 2.2400064375413862e-08, "loss": 0.80087721, "num_input_tokens_seen": 341631365, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 15839, "time_per_iteration": 2.6464626789093018 }, { "auxiliary_loss_clip": 0.01131057, "auxiliary_loss_mlp": 0.01035373, "balance_loss_clip": 1.02317762, "balance_loss_mlp": 1.03485107, "epoch": 0.9523523222606343, "flos": 19208115463680.0, "grad_norm": 1.6410941796691327, "language_loss": 0.80821157, "learning_rate": 2.2343710869433318e-08, "loss": 0.82987583, "num_input_tokens_seen": 341650300, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 15840, "time_per_iteration": 2.6218597888946533 }, { "auxiliary_loss_clip": 0.01114624, "auxiliary_loss_mlp": 0.01028483, "balance_loss_clip": 1.01798677, "balance_loss_mlp": 1.03223407, "epoch": 0.9524124455133023, "flos": 20921269703040.0, "grad_norm": 2.4997005489024753, "language_loss": 0.73162359, "learning_rate": 2.2287427940918423e-08, "loss": 0.75305474, "num_input_tokens_seen": 341667680, "router_z_loss_clip": 0.10449219, "router_z_loss_mlp": 0.6484375, "step": 15841, "time_per_iteration": 4.114611625671387 }, { "auxiliary_loss_clip": 0.01114371, "auxiliary_loss_mlp": 0.01026711, "balance_loss_clip": 1.01572013, "balance_loss_mlp": 1.03322983, "epoch": 0.9524725687659702, "flos": 18843550375680.0, "grad_norm": 1.6050226752380703, "language_loss": 0.78976482, "learning_rate": 2.2231215591877796e-08, "loss": 0.81117564, "num_input_tokens_seen": 341685760, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.63671875, "step": 15842, "time_per_iteration": 2.682119846343994 }, { "auxiliary_loss_clip": 0.01122082, "auxiliary_loss_mlp": 0.01032278, "balance_loss_clip": 1.01979637, "balance_loss_mlp": 1.03627324, "epoch": 0.9525326920186382, "flos": 22674680110080.0, "grad_norm": 1.7303242671473902, "language_loss": 0.72430229, "learning_rate": 2.2175073824318047e-08, "loss": 0.74584591, "num_input_tokens_seen": 341705300, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 15843, "time_per_iteration": 2.638760566711426 }, { "auxiliary_loss_clip": 0.01122311, "auxiliary_loss_mlp": 0.01276574, "balance_loss_clip": 1.01639724, "balance_loss_mlp": 1.03445911, "epoch": 0.9525928152713061, "flos": 22086642556800.0, "grad_norm": 4.817865183161555, "language_loss": 0.78153133, "learning_rate": 2.2119002640242024e-08, "loss": 0.80552018, "num_input_tokens_seen": 341724565, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 15844, "time_per_iteration": 2.6758086681365967 }, { "auxiliary_loss_clip": 0.0113347, "auxiliary_loss_mlp": 0.01034673, "balance_loss_clip": 1.02045751, "balance_loss_mlp": 1.03406942, "epoch": 0.9526529385239741, "flos": 20704728562560.0, "grad_norm": 1.7458998241837247, "language_loss": 0.70489907, "learning_rate": 2.2063002041651235e-08, "loss": 0.72658044, "num_input_tokens_seen": 341743605, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7265625, "step": 15845, "time_per_iteration": 2.7447972297668457 }, { "auxiliary_loss_clip": 0.01130441, "auxiliary_loss_mlp": 0.01033913, "balance_loss_clip": 1.02167654, "balance_loss_mlp": 1.03323221, "epoch": 0.9527130617766422, "flos": 23368043318400.0, "grad_norm": 1.6675595900385278, "language_loss": 0.75614351, "learning_rate": 2.2007072030543862e-08, "loss": 0.77778703, "num_input_tokens_seen": 341763475, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 15846, "time_per_iteration": 2.632826805114746 }, { "auxiliary_loss_clip": 0.01128969, "auxiliary_loss_mlp": 0.01026658, "balance_loss_clip": 1.01433146, "balance_loss_mlp": 1.03354502, "epoch": 0.9527731850293101, "flos": 18985935888000.0, "grad_norm": 2.437160649187168, "language_loss": 0.78052658, "learning_rate": 2.1951212608916303e-08, "loss": 0.8020829, "num_input_tokens_seen": 341781265, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 15847, "time_per_iteration": 2.7572896480560303 }, { "auxiliary_loss_clip": 0.01126873, "auxiliary_loss_mlp": 0.01035352, "balance_loss_clip": 1.02410424, "balance_loss_mlp": 1.03299701, "epoch": 0.9528333082819781, "flos": 19318038059520.0, "grad_norm": 1.6849150943657358, "language_loss": 0.77903932, "learning_rate": 2.1895423778761636e-08, "loss": 0.8006615, "num_input_tokens_seen": 341798825, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.66796875, "step": 15848, "time_per_iteration": 2.7262699604034424 }, { "auxiliary_loss_clip": 0.01125502, "auxiliary_loss_mlp": 0.01038435, "balance_loss_clip": 1.02521467, "balance_loss_mlp": 1.03603935, "epoch": 0.952893431534646, "flos": 23951340276480.0, "grad_norm": 1.5894397507933171, "language_loss": 0.72094554, "learning_rate": 2.1839705542070928e-08, "loss": 0.74258494, "num_input_tokens_seen": 341819480, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 15849, "time_per_iteration": 4.156775951385498 }, { "auxiliary_loss_clip": 0.01119182, "auxiliary_loss_mlp": 0.01033103, "balance_loss_clip": 1.02025855, "balance_loss_mlp": 1.0369159, "epoch": 0.952953554787314, "flos": 21030545854080.0, "grad_norm": 1.7304983969043122, "language_loss": 0.75300318, "learning_rate": 2.1784057900833042e-08, "loss": 0.774526, "num_input_tokens_seen": 341838035, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 15850, "time_per_iteration": 2.6495962142944336 }, { "auxiliary_loss_clip": 0.01120739, "auxiliary_loss_mlp": 0.01032608, "balance_loss_clip": 1.02091348, "balance_loss_mlp": 1.0335995, "epoch": 0.9530136780399819, "flos": 22382870019840.0, "grad_norm": 2.0375276805085223, "language_loss": 0.72685552, "learning_rate": 2.172848085703327e-08, "loss": 0.74838901, "num_input_tokens_seen": 341855895, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 15851, "time_per_iteration": 2.757664918899536 }, { "auxiliary_loss_clip": 0.01117442, "auxiliary_loss_mlp": 0.01029895, "balance_loss_clip": 1.01796246, "balance_loss_mlp": 1.03250694, "epoch": 0.95307380129265, "flos": 22159613036160.0, "grad_norm": 1.7898336163771504, "language_loss": 0.79889607, "learning_rate": 2.1672974412655364e-08, "loss": 0.82036948, "num_input_tokens_seen": 341875240, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66796875, "step": 15852, "time_per_iteration": 2.7260658740997314 }, { "auxiliary_loss_clip": 0.01112617, "auxiliary_loss_mlp": 0.01034366, "balance_loss_clip": 1.02168846, "balance_loss_mlp": 1.0342598, "epoch": 0.9531339245453179, "flos": 44022747214080.0, "grad_norm": 7.103140905250658, "language_loss": 0.59773314, "learning_rate": 2.161753856968018e-08, "loss": 0.61920291, "num_input_tokens_seen": 341901020, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 15853, "time_per_iteration": 2.919674873352051 }, { "auxiliary_loss_clip": 0.01115524, "auxiliary_loss_mlp": 0.01027611, "balance_loss_clip": 1.01658452, "balance_loss_mlp": 1.03244114, "epoch": 0.9531940477979859, "flos": 20266690204800.0, "grad_norm": 2.237243253347429, "language_loss": 0.72870851, "learning_rate": 2.156217333008592e-08, "loss": 0.75013983, "num_input_tokens_seen": 341919365, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.65625, "step": 15854, "time_per_iteration": 2.7336440086364746 }, { "auxiliary_loss_clip": 0.01119541, "auxiliary_loss_mlp": 0.01029051, "balance_loss_clip": 1.0174098, "balance_loss_mlp": 1.03208601, "epoch": 0.9532541710506538, "flos": 25335732309120.0, "grad_norm": 1.91929465410328, "language_loss": 0.6741575, "learning_rate": 2.1506878695848772e-08, "loss": 0.69564337, "num_input_tokens_seen": 341939985, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 15855, "time_per_iteration": 2.792269468307495 }, { "auxiliary_loss_clip": 0.0110235, "auxiliary_loss_mlp": 0.0103182, "balance_loss_clip": 1.02006555, "balance_loss_mlp": 1.03369832, "epoch": 0.9533142943033218, "flos": 26469288691200.0, "grad_norm": 1.512121845918071, "language_loss": 0.76555645, "learning_rate": 2.145165466894161e-08, "loss": 0.78689814, "num_input_tokens_seen": 341959255, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 15856, "time_per_iteration": 5.031241416931152 }, { "auxiliary_loss_clip": 0.01118242, "auxiliary_loss_mlp": 0.0102665, "balance_loss_clip": 1.01522398, "balance_loss_mlp": 1.03276181, "epoch": 0.9533744175559897, "flos": 23656944407040.0, "grad_norm": 1.6955579241134526, "language_loss": 0.77895653, "learning_rate": 2.139650125133552e-08, "loss": 0.8004055, "num_input_tokens_seen": 341977205, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.67578125, "step": 15857, "time_per_iteration": 4.9344329833984375 }, { "auxiliary_loss_clip": 0.01031182, "auxiliary_loss_mlp": 0.01003, "balance_loss_clip": 1.00182581, "balance_loss_mlp": 1.00124931, "epoch": 0.9534345408086577, "flos": 61052055500160.0, "grad_norm": 0.7102755942714316, "language_loss": 0.62632036, "learning_rate": 2.1341418444998705e-08, "loss": 0.64666218, "num_input_tokens_seen": 342038545, "router_z_loss_clip": 0.01171875, "router_z_loss_mlp": 0.2109375, "step": 15858, "time_per_iteration": 3.2617642879486084 }, { "auxiliary_loss_clip": 0.01137228, "auxiliary_loss_mlp": 0.0102687, "balance_loss_clip": 1.01542544, "balance_loss_mlp": 1.03441954, "epoch": 0.9534946640613258, "flos": 18951677291520.0, "grad_norm": 1.5251814822459593, "language_loss": 0.56951463, "learning_rate": 2.1286406251896928e-08, "loss": 0.59115565, "num_input_tokens_seen": 342058195, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.67578125, "step": 15859, "time_per_iteration": 2.7411811351776123 }, { "auxiliary_loss_clip": 0.01097857, "auxiliary_loss_mlp": 0.01029125, "balance_loss_clip": 1.01794875, "balance_loss_mlp": 1.03209424, "epoch": 0.9535547873139937, "flos": 16654292340480.0, "grad_norm": 2.1186924221884738, "language_loss": 0.81462812, "learning_rate": 2.1231464673993727e-08, "loss": 0.83589792, "num_input_tokens_seen": 342075025, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.65625, "step": 15860, "time_per_iteration": 2.6017885208129883 }, { "auxiliary_loss_clip": 0.01135808, "auxiliary_loss_mlp": 0.0127321, "balance_loss_clip": 1.01510692, "balance_loss_mlp": 1.03188276, "epoch": 0.9536149105666617, "flos": 20667776446080.0, "grad_norm": 2.0583511723109034, "language_loss": 0.66729873, "learning_rate": 2.1176593713249312e-08, "loss": 0.69138891, "num_input_tokens_seen": 342094595, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.68359375, "step": 15861, "time_per_iteration": 2.7548696994781494 }, { "auxiliary_loss_clip": 0.01112224, "auxiliary_loss_mlp": 0.0103526, "balance_loss_clip": 1.02267158, "balance_loss_mlp": 1.0340575, "epoch": 0.9536750338193296, "flos": 30700499086080.0, "grad_norm": 3.2582483415351104, "language_loss": 0.65725672, "learning_rate": 2.1121793371622122e-08, "loss": 0.67873162, "num_input_tokens_seen": 342115970, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 15862, "time_per_iteration": 2.7751975059509277 }, { "auxiliary_loss_clip": 0.01126743, "auxiliary_loss_mlp": 0.0102822, "balance_loss_clip": 1.01697898, "balance_loss_mlp": 1.03464627, "epoch": 0.9537351570719976, "flos": 20405484357120.0, "grad_norm": 1.9285047803384832, "language_loss": 0.67904305, "learning_rate": 2.1067063651067917e-08, "loss": 0.70059276, "num_input_tokens_seen": 342134080, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.65625, "step": 15863, "time_per_iteration": 2.7042558193206787 }, { "auxiliary_loss_clip": 0.01119019, "auxiliary_loss_mlp": 0.0102715, "balance_loss_clip": 1.01550293, "balance_loss_mlp": 1.03319001, "epoch": 0.9537952803246655, "flos": 29929245235200.0, "grad_norm": 1.727738076257441, "language_loss": 0.77984321, "learning_rate": 2.1012404553539587e-08, "loss": 0.80130494, "num_input_tokens_seen": 342154725, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 15864, "time_per_iteration": 2.7896835803985596 }, { "auxiliary_loss_clip": 0.01129575, "auxiliary_loss_mlp": 0.01027451, "balance_loss_clip": 1.01523757, "balance_loss_mlp": 1.03580821, "epoch": 0.9538554035773336, "flos": 20521404524160.0, "grad_norm": 2.80260697242151, "language_loss": 0.60088784, "learning_rate": 2.0957816080988012e-08, "loss": 0.6224581, "num_input_tokens_seen": 342172275, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 15865, "time_per_iteration": 2.8267970085144043 }, { "auxiliary_loss_clip": 0.0113022, "auxiliary_loss_mlp": 0.01032126, "balance_loss_clip": 1.02016377, "balance_loss_mlp": 1.03421974, "epoch": 0.9539155268300015, "flos": 18406517598720.0, "grad_norm": 1.6809221982847748, "language_loss": 0.69648063, "learning_rate": 2.090329823536163e-08, "loss": 0.71810412, "num_input_tokens_seen": 342190880, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 15866, "time_per_iteration": 2.7137629985809326 }, { "auxiliary_loss_clip": 0.01120514, "auxiliary_loss_mlp": 0.01028838, "balance_loss_clip": 1.01600528, "balance_loss_mlp": 1.03341317, "epoch": 0.9539756500826695, "flos": 26213281482240.0, "grad_norm": 3.225115910840498, "language_loss": 0.85608613, "learning_rate": 2.084885101860534e-08, "loss": 0.87757969, "num_input_tokens_seen": 342208165, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 15867, "time_per_iteration": 2.6959993839263916 }, { "auxiliary_loss_clip": 0.01030634, "auxiliary_loss_mlp": 0.0100195, "balance_loss_clip": 1.00065696, "balance_loss_mlp": 1.0007596, "epoch": 0.9540357733353374, "flos": 68289097766400.0, "grad_norm": 0.7490144848402401, "language_loss": 0.61816251, "learning_rate": 2.079447443266269e-08, "loss": 0.63848835, "num_input_tokens_seen": 342277110, "router_z_loss_clip": 0.01293945, "router_z_loss_mlp": 0.2109375, "step": 15868, "time_per_iteration": 3.3638505935668945 }, { "auxiliary_loss_clip": 0.01129837, "auxiliary_loss_mlp": 0.01028303, "balance_loss_clip": 1.01550579, "balance_loss_mlp": 1.03468108, "epoch": 0.9540958965880054, "flos": 21288276915840.0, "grad_norm": 1.760722284223094, "language_loss": 0.59560788, "learning_rate": 2.074016847947413e-08, "loss": 0.61718929, "num_input_tokens_seen": 342294695, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 15869, "time_per_iteration": 2.6897294521331787 }, { "auxiliary_loss_clip": 0.01129201, "auxiliary_loss_mlp": 0.01280555, "balance_loss_clip": 1.02295792, "balance_loss_mlp": 1.03478813, "epoch": 0.9541560198406733, "flos": 19751407649280.0, "grad_norm": 1.6819836082586936, "language_loss": 0.71182281, "learning_rate": 2.0685933160977665e-08, "loss": 0.73592037, "num_input_tokens_seen": 342314970, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.67578125, "step": 15870, "time_per_iteration": 2.6857082843780518 }, { "auxiliary_loss_clip": 0.01127067, "auxiliary_loss_mlp": 0.01029249, "balance_loss_clip": 1.01678014, "balance_loss_mlp": 1.03268802, "epoch": 0.9542161430933414, "flos": 24715626888960.0, "grad_norm": 1.9358754735530899, "language_loss": 0.77278179, "learning_rate": 2.063176847910908e-08, "loss": 0.79434496, "num_input_tokens_seen": 342334255, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 15871, "time_per_iteration": 2.6874330043792725 }, { "auxiliary_loss_clip": 0.01113127, "auxiliary_loss_mlp": 0.01034063, "balance_loss_clip": 1.02102184, "balance_loss_mlp": 1.03458178, "epoch": 0.9542762663460094, "flos": 31065818359680.0, "grad_norm": 2.0460113641290785, "language_loss": 0.58687663, "learning_rate": 2.0577674435800608e-08, "loss": 0.60834855, "num_input_tokens_seen": 342354730, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 15872, "time_per_iteration": 2.664391279220581 }, { "auxiliary_loss_clip": 0.01122957, "auxiliary_loss_mlp": 0.01032373, "balance_loss_clip": 1.01860487, "balance_loss_mlp": 1.03386772, "epoch": 0.9543363895986773, "flos": 20776729374720.0, "grad_norm": 1.3876942447315437, "language_loss": 0.74933541, "learning_rate": 2.0523651032983592e-08, "loss": 0.77088875, "num_input_tokens_seen": 342374565, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7109375, "step": 15873, "time_per_iteration": 2.7033398151397705 }, { "auxiliary_loss_clip": 0.01103293, "auxiliary_loss_mlp": 0.0103356, "balance_loss_clip": 1.02047038, "balance_loss_mlp": 1.03428566, "epoch": 0.9543965128513453, "flos": 24462744163200.0, "grad_norm": 1.7994825226670752, "language_loss": 0.62490797, "learning_rate": 2.0469698272585824e-08, "loss": 0.64627653, "num_input_tokens_seen": 342394590, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 15874, "time_per_iteration": 2.741105079650879 }, { "auxiliary_loss_clip": 0.01117207, "auxiliary_loss_mlp": 0.0103637, "balance_loss_clip": 1.02411556, "balance_loss_mlp": 1.03291655, "epoch": 0.9544566361040132, "flos": 15261532439040.0, "grad_norm": 2.0810478957124943, "language_loss": 0.89102125, "learning_rate": 2.0415816156532205e-08, "loss": 0.91255707, "num_input_tokens_seen": 342410445, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.66796875, "step": 15875, "time_per_iteration": 2.71364164352417 }, { "auxiliary_loss_clip": 0.01106163, "auxiliary_loss_mlp": 0.01031216, "balance_loss_clip": 1.0178225, "balance_loss_mlp": 1.0342803, "epoch": 0.9545167593566812, "flos": 25918777872000.0, "grad_norm": 2.1649599122332166, "language_loss": 0.67860824, "learning_rate": 2.0362004686746092e-08, "loss": 0.69998205, "num_input_tokens_seen": 342430970, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 15876, "time_per_iteration": 2.6957149505615234 }, { "auxiliary_loss_clip": 0.0111245, "auxiliary_loss_mlp": 0.01030934, "balance_loss_clip": 1.0179882, "balance_loss_mlp": 1.03355885, "epoch": 0.9545768826093491, "flos": 25628188844160.0, "grad_norm": 1.9003901843193747, "language_loss": 0.69075894, "learning_rate": 2.0308263865148166e-08, "loss": 0.71219283, "num_input_tokens_seen": 342449505, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 15877, "time_per_iteration": 2.783026933670044 }, { "auxiliary_loss_clip": 0.01131661, "auxiliary_loss_mlp": 0.01031032, "balance_loss_clip": 1.01829433, "balance_loss_mlp": 1.03452516, "epoch": 0.9546370058620172, "flos": 22491499726080.0, "grad_norm": 1.8194589661217928, "language_loss": 0.70630479, "learning_rate": 2.0254593693655342e-08, "loss": 0.72793168, "num_input_tokens_seen": 342470390, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 15878, "time_per_iteration": 2.8254053592681885 }, { "auxiliary_loss_clip": 0.01099283, "auxiliary_loss_mlp": 0.01027745, "balance_loss_clip": 1.01531196, "balance_loss_mlp": 1.03226352, "epoch": 0.9546971291146851, "flos": 24609582961920.0, "grad_norm": 1.592439263986295, "language_loss": 0.68555886, "learning_rate": 2.020099417418386e-08, "loss": 0.70682907, "num_input_tokens_seen": 342492560, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.671875, "step": 15879, "time_per_iteration": 2.619614839553833 }, { "auxiliary_loss_clip": 0.01126288, "auxiliary_loss_mlp": 0.01277544, "balance_loss_clip": 1.0187124, "balance_loss_mlp": 1.03287303, "epoch": 0.9547572523673531, "flos": 28657756627200.0, "grad_norm": 3.944568396297446, "language_loss": 0.85177141, "learning_rate": 2.0147465308646416e-08, "loss": 0.87580979, "num_input_tokens_seen": 342512315, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.66796875, "step": 15880, "time_per_iteration": 2.7411129474639893 }, { "auxiliary_loss_clip": 0.01147891, "auxiliary_loss_mlp": 0.01029365, "balance_loss_clip": 1.01660395, "balance_loss_mlp": 1.03358793, "epoch": 0.954817375620021, "flos": 33802606385280.0, "grad_norm": 1.899601499781989, "language_loss": 0.72131968, "learning_rate": 2.0094007098952814e-08, "loss": 0.74309218, "num_input_tokens_seen": 342533060, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 15881, "time_per_iteration": 2.6866061687469482 }, { "auxiliary_loss_clip": 0.01110325, "auxiliary_loss_mlp": 0.01033627, "balance_loss_clip": 1.02132428, "balance_loss_mlp": 1.03420651, "epoch": 0.954877498872689, "flos": 14428225843200.0, "grad_norm": 2.0293945011394463, "language_loss": 0.71746784, "learning_rate": 2.0040619547011305e-08, "loss": 0.7389074, "num_input_tokens_seen": 342550830, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 15882, "time_per_iteration": 2.7641685009002686 }, { "auxiliary_loss_clip": 0.01120067, "auxiliary_loss_mlp": 0.01031971, "balance_loss_clip": 1.02005601, "balance_loss_mlp": 1.03414059, "epoch": 0.9549376221253569, "flos": 59269447336320.0, "grad_norm": 1.6322006738371895, "language_loss": 0.65425956, "learning_rate": 1.9987302654727033e-08, "loss": 0.67577994, "num_input_tokens_seen": 342575070, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 15883, "time_per_iteration": 4.486612558364868 }, { "auxiliary_loss_clip": 0.01114692, "auxiliary_loss_mlp": 0.01029351, "balance_loss_clip": 1.01616073, "balance_loss_mlp": 1.03608799, "epoch": 0.954997745378025, "flos": 17274397760640.0, "grad_norm": 2.192156139858763, "language_loss": 0.78120852, "learning_rate": 1.99340564240027e-08, "loss": 0.80264896, "num_input_tokens_seen": 342592215, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69921875, "step": 15884, "time_per_iteration": 2.5404396057128906 }, { "auxiliary_loss_clip": 0.01103706, "auxiliary_loss_mlp": 0.01029778, "balance_loss_clip": 1.01696873, "balance_loss_mlp": 1.03288293, "epoch": 0.955057868630693, "flos": 13006378903680.0, "grad_norm": 1.7827810665793158, "language_loss": 0.77855438, "learning_rate": 1.9880880856738558e-08, "loss": 0.79988927, "num_input_tokens_seen": 342610030, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 15885, "time_per_iteration": 2.5787458419799805 }, { "auxiliary_loss_clip": 0.011023, "auxiliary_loss_mlp": 0.01033078, "balance_loss_clip": 1.02085316, "balance_loss_mlp": 1.03256547, "epoch": 0.9551179918833609, "flos": 22637692080000.0, "grad_norm": 1.7497884552688876, "language_loss": 0.70152056, "learning_rate": 1.9827775954832427e-08, "loss": 0.7228744, "num_input_tokens_seen": 342626475, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 15886, "time_per_iteration": 2.561643600463867 }, { "auxiliary_loss_clip": 0.01133775, "auxiliary_loss_mlp": 0.01036676, "balance_loss_clip": 1.02253819, "balance_loss_mlp": 1.03540015, "epoch": 0.9551781151360289, "flos": 21542811667200.0, "grad_norm": 1.6487700718037706, "language_loss": 0.7241652, "learning_rate": 1.9774741720179232e-08, "loss": 0.74586964, "num_input_tokens_seen": 342646645, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.71875, "step": 15887, "time_per_iteration": 2.7046194076538086 }, { "auxiliary_loss_clip": 0.0111677, "auxiliary_loss_mlp": 0.0102766, "balance_loss_clip": 1.01607931, "balance_loss_mlp": 1.032727, "epoch": 0.9552382383886968, "flos": 20702250524160.0, "grad_norm": 2.433543749504611, "language_loss": 0.57305443, "learning_rate": 1.972177815467191e-08, "loss": 0.59449875, "num_input_tokens_seen": 342663615, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66015625, "step": 15888, "time_per_iteration": 2.606128692626953 }, { "auxiliary_loss_clip": 0.01030771, "auxiliary_loss_mlp": 0.01003306, "balance_loss_clip": 1.00209606, "balance_loss_mlp": 1.00092244, "epoch": 0.9552983616413648, "flos": 67769792887680.0, "grad_norm": 0.7084446053891348, "language_loss": 0.57896864, "learning_rate": 1.9668885260200275e-08, "loss": 0.59930944, "num_input_tokens_seen": 342728275, "router_z_loss_clip": 0.01208496, "router_z_loss_mlp": 0.2109375, "step": 15889, "time_per_iteration": 3.222137928009033 }, { "auxiliary_loss_clip": 0.01102696, "auxiliary_loss_mlp": 0.01027283, "balance_loss_clip": 1.01447976, "balance_loss_mlp": 1.03197408, "epoch": 0.9553584848940327, "flos": 21579979265280.0, "grad_norm": 1.5446272992707932, "language_loss": 0.66914839, "learning_rate": 1.9616063038652154e-08, "loss": 0.69044816, "num_input_tokens_seen": 342748860, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 15890, "time_per_iteration": 2.5815036296844482 }, { "auxiliary_loss_clip": 0.01103243, "auxiliary_loss_mlp": 0.01028765, "balance_loss_clip": 1.01693308, "balance_loss_mlp": 1.03546405, "epoch": 0.9554186081467008, "flos": 24208173498240.0, "grad_norm": 2.7067270965243875, "language_loss": 0.73886377, "learning_rate": 1.9563311491912483e-08, "loss": 0.76018387, "num_input_tokens_seen": 342769705, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 15891, "time_per_iteration": 4.121787786483765 }, { "auxiliary_loss_clip": 0.0112408, "auxiliary_loss_mlp": 0.01029789, "balance_loss_clip": 1.01875663, "balance_loss_mlp": 1.03286529, "epoch": 0.9554787313993687, "flos": 16251554073600.0, "grad_norm": 1.8012841702227218, "language_loss": 0.78055012, "learning_rate": 1.95106306218642e-08, "loss": 0.80208886, "num_input_tokens_seen": 342787000, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6484375, "step": 15892, "time_per_iteration": 2.6602373123168945 }, { "auxiliary_loss_clip": 0.01101349, "auxiliary_loss_mlp": 0.01033502, "balance_loss_clip": 1.02078271, "balance_loss_mlp": 1.03359163, "epoch": 0.9555388546520367, "flos": 23404133508480.0, "grad_norm": 1.8124197658866879, "language_loss": 0.70083207, "learning_rate": 1.945802043038669e-08, "loss": 0.7221806, "num_input_tokens_seen": 342807795, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 15893, "time_per_iteration": 2.6037189960479736 }, { "auxiliary_loss_clip": 0.01121357, "auxiliary_loss_mlp": 0.0127608, "balance_loss_clip": 1.01576507, "balance_loss_mlp": 1.0363698, "epoch": 0.9555989779047046, "flos": 14794047907200.0, "grad_norm": 2.5289801965042735, "language_loss": 0.6561017, "learning_rate": 1.940548091935823e-08, "loss": 0.680076, "num_input_tokens_seen": 342825490, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.671875, "step": 15894, "time_per_iteration": 2.499194383621216 }, { "auxiliary_loss_clip": 0.01111255, "auxiliary_loss_mlp": 0.01030657, "balance_loss_clip": 1.01777661, "balance_loss_mlp": 1.03278446, "epoch": 0.9556591011573726, "flos": 22236749493120.0, "grad_norm": 1.8718456711913087, "language_loss": 0.81694561, "learning_rate": 1.935301209065332e-08, "loss": 0.83836472, "num_input_tokens_seen": 342844965, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 15895, "time_per_iteration": 2.515686511993408 }, { "auxiliary_loss_clip": 0.01121012, "auxiliary_loss_mlp": 0.01030497, "balance_loss_clip": 1.01813495, "balance_loss_mlp": 1.03417087, "epoch": 0.9557192244100405, "flos": 27855296835840.0, "grad_norm": 2.8607662838443804, "language_loss": 0.7227757, "learning_rate": 1.9300613946144462e-08, "loss": 0.74429077, "num_input_tokens_seen": 342865915, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 15896, "time_per_iteration": 2.5872039794921875 }, { "auxiliary_loss_clip": 0.01099947, "auxiliary_loss_mlp": 0.01034483, "balance_loss_clip": 1.02220416, "balance_loss_mlp": 1.03279161, "epoch": 0.9557793476627086, "flos": 17602800831360.0, "grad_norm": 1.8532067613846592, "language_loss": 0.79166591, "learning_rate": 1.9248286487701937e-08, "loss": 0.81301022, "num_input_tokens_seen": 342884000, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 15897, "time_per_iteration": 2.487666368484497 }, { "auxiliary_loss_clip": 0.01119686, "auxiliary_loss_mlp": 0.01032098, "balance_loss_clip": 1.01865673, "balance_loss_mlp": 1.03312325, "epoch": 0.9558394709153766, "flos": 23875496709120.0, "grad_norm": 1.5521049826467843, "language_loss": 0.72880596, "learning_rate": 1.919602971719292e-08, "loss": 0.75032377, "num_input_tokens_seen": 342903095, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6875, "step": 15898, "time_per_iteration": 5.566342830657959 }, { "auxiliary_loss_clip": 0.01048216, "auxiliary_loss_mlp": 0.0100239, "balance_loss_clip": 1.00128698, "balance_loss_mlp": 1.00059414, "epoch": 0.9558995941680445, "flos": 53682001171200.0, "grad_norm": 0.8923990011903463, "language_loss": 0.52369863, "learning_rate": 1.9143843636482138e-08, "loss": 0.54420465, "num_input_tokens_seen": 342958155, "router_z_loss_clip": 0.01104736, "router_z_loss_mlp": 0.21191406, "step": 15899, "time_per_iteration": 3.0721726417541504 }, { "auxiliary_loss_clip": 0.01116213, "auxiliary_loss_mlp": 0.01030824, "balance_loss_clip": 1.01710916, "balance_loss_mlp": 1.03483713, "epoch": 0.9559597174207125, "flos": 13764488376960.0, "grad_norm": 2.686235701209808, "language_loss": 0.68443334, "learning_rate": 1.9091728247432547e-08, "loss": 0.70590377, "num_input_tokens_seen": 342972500, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 15900, "time_per_iteration": 2.7035796642303467 }, { "auxiliary_loss_clip": 0.01113237, "auxiliary_loss_mlp": 0.01274787, "balance_loss_clip": 1.01555216, "balance_loss_mlp": 1.03417897, "epoch": 0.9560198406733804, "flos": 19936347799680.0, "grad_norm": 1.9159445754591442, "language_loss": 0.83377051, "learning_rate": 1.9039683551903552e-08, "loss": 0.85765076, "num_input_tokens_seen": 342989035, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 15901, "time_per_iteration": 2.6153006553649902 }, { "auxiliary_loss_clip": 0.01049295, "auxiliary_loss_mlp": 0.01002107, "balance_loss_clip": 1.00082004, "balance_loss_mlp": 1.00076056, "epoch": 0.9560799639260484, "flos": 57289550699520.0, "grad_norm": 0.72085219142679, "language_loss": 0.54323995, "learning_rate": 1.898770955175255e-08, "loss": 0.56375396, "num_input_tokens_seen": 343051675, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.2109375, "step": 15902, "time_per_iteration": 3.2770071029663086 }, { "auxiliary_loss_clip": 0.01113901, "auxiliary_loss_mlp": 0.0102858, "balance_loss_clip": 1.01851308, "balance_loss_mlp": 1.03277278, "epoch": 0.9561400871787163, "flos": 18917167299840.0, "grad_norm": 1.62428170899242, "language_loss": 0.85143697, "learning_rate": 1.8935806248834506e-08, "loss": 0.8728618, "num_input_tokens_seen": 343068895, "router_z_loss_clip": 0.10058594, "router_z_loss_mlp": 0.6328125, "step": 15903, "time_per_iteration": 2.533296585083008 }, { "auxiliary_loss_clip": 0.01118526, "auxiliary_loss_mlp": 0.01271828, "balance_loss_clip": 1.01385391, "balance_loss_mlp": 1.03333938, "epoch": 0.9562002104313844, "flos": 18038576632320.0, "grad_norm": 1.7161350351746587, "language_loss": 0.80898595, "learning_rate": 1.8883973645001494e-08, "loss": 0.8328895, "num_input_tokens_seen": 343087115, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.67578125, "step": 15904, "time_per_iteration": 2.8945348262786865 }, { "auxiliary_loss_clip": 0.01114431, "auxiliary_loss_mlp": 0.01030924, "balance_loss_clip": 1.01792455, "balance_loss_mlp": 1.03533256, "epoch": 0.9562603336840523, "flos": 24717673964160.0, "grad_norm": 2.331557079833809, "language_loss": 0.5958482, "learning_rate": 1.8832211742103588e-08, "loss": 0.61730182, "num_input_tokens_seen": 343105575, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 15905, "time_per_iteration": 2.6731152534484863 }, { "auxiliary_loss_clip": 0.01110703, "auxiliary_loss_mlp": 0.01032113, "balance_loss_clip": 1.02000129, "balance_loss_mlp": 1.03367555, "epoch": 0.9563204569367203, "flos": 21177205084800.0, "grad_norm": 1.9387245107773732, "language_loss": 0.70548975, "learning_rate": 1.878052054198753e-08, "loss": 0.72691786, "num_input_tokens_seen": 343123025, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 15906, "time_per_iteration": 2.59745192527771 }, { "auxiliary_loss_clip": 0.01142227, "auxiliary_loss_mlp": 0.01028697, "balance_loss_clip": 1.01607895, "balance_loss_mlp": 1.03590655, "epoch": 0.9563805801893882, "flos": 20229738088320.0, "grad_norm": 1.6870292299032408, "language_loss": 0.71015012, "learning_rate": 1.872890004649874e-08, "loss": 0.73185945, "num_input_tokens_seen": 343141625, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 15907, "time_per_iteration": 2.827427387237549 }, { "auxiliary_loss_clip": 0.01128453, "auxiliary_loss_mlp": 0.01029459, "balance_loss_clip": 1.01729333, "balance_loss_mlp": 1.03425932, "epoch": 0.9564407034420562, "flos": 25411001258880.0, "grad_norm": 2.332634517748582, "language_loss": 0.69840777, "learning_rate": 1.8677350257479075e-08, "loss": 0.71998692, "num_input_tokens_seen": 343161300, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 15908, "time_per_iteration": 2.8259456157684326 }, { "auxiliary_loss_clip": 0.01120064, "auxiliary_loss_mlp": 0.01276902, "balance_loss_clip": 1.01799321, "balance_loss_mlp": 1.03556824, "epoch": 0.9565008266947241, "flos": 18623884752000.0, "grad_norm": 1.7249458462999607, "language_loss": 0.82885504, "learning_rate": 1.8625871176768172e-08, "loss": 0.85282469, "num_input_tokens_seen": 343177815, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.671875, "step": 15909, "time_per_iteration": 2.5927541255950928 }, { "auxiliary_loss_clip": 0.01109477, "auxiliary_loss_mlp": 0.01030986, "balance_loss_clip": 1.01891577, "balance_loss_mlp": 1.03191829, "epoch": 0.9565609499473922, "flos": 24862142465280.0, "grad_norm": 1.849406559826124, "language_loss": 0.67529035, "learning_rate": 1.8574462806203008e-08, "loss": 0.69669503, "num_input_tokens_seen": 343198140, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 15910, "time_per_iteration": 2.9053232669830322 }, { "auxiliary_loss_clip": 0.01116084, "auxiliary_loss_mlp": 0.01034528, "balance_loss_clip": 1.02200556, "balance_loss_mlp": 1.03568089, "epoch": 0.9566210732000601, "flos": 21798459740160.0, "grad_norm": 1.831760129857091, "language_loss": 0.74130219, "learning_rate": 1.8523125147618778e-08, "loss": 0.76280826, "num_input_tokens_seen": 343218280, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 15911, "time_per_iteration": 2.5955114364624023 }, { "auxiliary_loss_clip": 0.01110882, "auxiliary_loss_mlp": 0.01030308, "balance_loss_clip": 1.01844072, "balance_loss_mlp": 1.03325415, "epoch": 0.9566811964527281, "flos": 18697609416960.0, "grad_norm": 1.8485369124708362, "language_loss": 0.69253802, "learning_rate": 1.8471858202846914e-08, "loss": 0.71394992, "num_input_tokens_seen": 343236850, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 15912, "time_per_iteration": 2.846472978591919 }, { "auxiliary_loss_clip": 0.01121015, "auxiliary_loss_mlp": 0.01035687, "balance_loss_clip": 1.02307463, "balance_loss_mlp": 1.03450441, "epoch": 0.9567413197053961, "flos": 13000632727680.0, "grad_norm": 3.349273637951904, "language_loss": 0.72472262, "learning_rate": 1.84206619737175e-08, "loss": 0.74628967, "num_input_tokens_seen": 343253065, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 15913, "time_per_iteration": 2.660844087600708 }, { "auxiliary_loss_clip": 0.01154595, "auxiliary_loss_mlp": 0.01028316, "balance_loss_clip": 1.0172658, "balance_loss_mlp": 1.03396964, "epoch": 0.956801442958064, "flos": 19719267955200.0, "grad_norm": 1.8400851074335263, "language_loss": 0.73189807, "learning_rate": 1.83695364620573e-08, "loss": 0.7537272, "num_input_tokens_seen": 343270330, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.671875, "step": 15914, "time_per_iteration": 2.8602871894836426 }, { "auxiliary_loss_clip": 0.01100987, "auxiliary_loss_mlp": 0.0103024, "balance_loss_clip": 1.01816416, "balance_loss_mlp": 1.03382444, "epoch": 0.956861566210732, "flos": 18222834424320.0, "grad_norm": 1.412728805045343, "language_loss": 0.67296243, "learning_rate": 1.831848166969108e-08, "loss": 0.69427466, "num_input_tokens_seen": 343289625, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 15915, "time_per_iteration": 2.631723403930664 }, { "auxiliary_loss_clip": 0.01109611, "auxiliary_loss_mlp": 0.01026564, "balance_loss_clip": 1.01471996, "balance_loss_mlp": 1.03403974, "epoch": 0.9569216894634, "flos": 22196960202240.0, "grad_norm": 1.7329967931760542, "language_loss": 0.64172465, "learning_rate": 1.8267497598440927e-08, "loss": 0.66308641, "num_input_tokens_seen": 343309200, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.66796875, "step": 15916, "time_per_iteration": 2.8056488037109375 }, { "auxiliary_loss_clip": 0.01096013, "auxiliary_loss_mlp": 0.01027213, "balance_loss_clip": 1.01607919, "balance_loss_mlp": 1.0328846, "epoch": 0.956981812716068, "flos": 21689291329920.0, "grad_norm": 1.7996055315237502, "language_loss": 0.80857086, "learning_rate": 1.8216584250125845e-08, "loss": 0.82980311, "num_input_tokens_seen": 343326270, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6328125, "step": 15917, "time_per_iteration": 2.824650287628174 }, { "auxiliary_loss_clip": 0.01119688, "auxiliary_loss_mlp": 0.01034388, "balance_loss_clip": 1.02214539, "balance_loss_mlp": 1.03482175, "epoch": 0.9570419359687359, "flos": 13990905757440.0, "grad_norm": 1.7588229273294873, "language_loss": 0.72689128, "learning_rate": 1.816574162656348e-08, "loss": 0.74843204, "num_input_tokens_seen": 343344430, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.671875, "step": 15918, "time_per_iteration": 2.740734815597534 }, { "auxiliary_loss_clip": 0.01049196, "auxiliary_loss_mlp": 0.01001694, "balance_loss_clip": 1.00050187, "balance_loss_mlp": 1.00100541, "epoch": 0.9571020592214039, "flos": 65196938534400.0, "grad_norm": 0.6347876931330936, "language_loss": 0.53134406, "learning_rate": 1.8114969729567497e-08, "loss": 0.55185294, "num_input_tokens_seen": 343416155, "router_z_loss_clip": 0.01190186, "router_z_loss_mlp": 0.2109375, "step": 15919, "time_per_iteration": 3.5788357257843018 }, { "auxiliary_loss_clip": 0.01118344, "auxiliary_loss_mlp": 0.01035781, "balance_loss_clip": 1.02239966, "balance_loss_mlp": 1.03760421, "epoch": 0.9571621824740718, "flos": 17384068961280.0, "grad_norm": 2.888050591185971, "language_loss": 0.75862336, "learning_rate": 1.8064268560950446e-08, "loss": 0.7801646, "num_input_tokens_seen": 343431715, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 15920, "time_per_iteration": 2.5974836349487305 }, { "auxiliary_loss_clip": 0.01133403, "auxiliary_loss_mlp": 0.01031598, "balance_loss_clip": 1.01871157, "balance_loss_mlp": 1.03580022, "epoch": 0.9572223057267398, "flos": 14538184352640.0, "grad_norm": 1.7496711139208225, "language_loss": 0.79328233, "learning_rate": 1.8013638122521548e-08, "loss": 0.81493235, "num_input_tokens_seen": 343450425, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 15921, "time_per_iteration": 2.753955364227295 }, { "auxiliary_loss_clip": 0.01114582, "auxiliary_loss_mlp": 0.01029776, "balance_loss_clip": 1.01653743, "balance_loss_mlp": 1.03305566, "epoch": 0.9572824289794077, "flos": 33947793158400.0, "grad_norm": 3.4720502174804793, "language_loss": 0.51044977, "learning_rate": 1.7963078416087573e-08, "loss": 0.53189331, "num_input_tokens_seen": 343470445, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 15922, "time_per_iteration": 2.787595510482788 }, { "auxiliary_loss_clip": 0.01138422, "auxiliary_loss_mlp": 0.01031082, "balance_loss_clip": 1.01892853, "balance_loss_mlp": 1.03341854, "epoch": 0.9573425522320758, "flos": 18694915896960.0, "grad_norm": 1.8847307687206338, "language_loss": 0.83490515, "learning_rate": 1.7912589443452642e-08, "loss": 0.85660017, "num_input_tokens_seen": 343485200, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 15923, "time_per_iteration": 2.761324644088745 }, { "auxiliary_loss_clip": 0.01117816, "auxiliary_loss_mlp": 0.01031386, "balance_loss_clip": 1.01993608, "balance_loss_mlp": 1.03250515, "epoch": 0.9574026754847437, "flos": 28510307297280.0, "grad_norm": 1.6206011278541268, "language_loss": 0.75090635, "learning_rate": 1.786217120641931e-08, "loss": 0.77239835, "num_input_tokens_seen": 343505080, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 15924, "time_per_iteration": 2.7571113109588623 }, { "auxiliary_loss_clip": 0.0111939, "auxiliary_loss_mlp": 0.01029371, "balance_loss_clip": 1.01684213, "balance_loss_mlp": 1.03264523, "epoch": 0.9574627987374117, "flos": 24352390604160.0, "grad_norm": 2.73276924923853, "language_loss": 0.86266184, "learning_rate": 1.7811823706786133e-08, "loss": 0.88414943, "num_input_tokens_seen": 343523995, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 15925, "time_per_iteration": 4.324306011199951 }, { "auxiliary_loss_clip": 0.01133868, "auxiliary_loss_mlp": 0.01031255, "balance_loss_clip": 1.01849329, "balance_loss_mlp": 1.03492308, "epoch": 0.9575229219900797, "flos": 23510680225920.0, "grad_norm": 3.5246902863743697, "language_loss": 0.75364864, "learning_rate": 1.7761546946350348e-08, "loss": 0.77529985, "num_input_tokens_seen": 343542015, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 15926, "time_per_iteration": 2.772242307662964 }, { "auxiliary_loss_clip": 0.0112081, "auxiliary_loss_mlp": 0.01028901, "balance_loss_clip": 1.0175935, "balance_loss_mlp": 1.03434181, "epoch": 0.9575830452427476, "flos": 22674823764480.0, "grad_norm": 1.8606134534668208, "language_loss": 0.78161091, "learning_rate": 1.771134092690585e-08, "loss": 0.80310798, "num_input_tokens_seen": 343561680, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.68359375, "step": 15927, "time_per_iteration": 2.9393365383148193 }, { "auxiliary_loss_clip": 0.01140155, "auxiliary_loss_mlp": 0.01033057, "balance_loss_clip": 1.01927006, "balance_loss_mlp": 1.03348076, "epoch": 0.9576431684954156, "flos": 30485250835200.0, "grad_norm": 2.2581605005794314, "language_loss": 0.68212318, "learning_rate": 1.7661205650244758e-08, "loss": 0.70385534, "num_input_tokens_seen": 343585290, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.70703125, "step": 15928, "time_per_iteration": 2.7211954593658447 }, { "auxiliary_loss_clip": 0.01120097, "auxiliary_loss_mlp": 0.01031533, "balance_loss_clip": 1.01906919, "balance_loss_mlp": 1.03486037, "epoch": 0.9577032917480836, "flos": 21687387909120.0, "grad_norm": 1.7232111235211578, "language_loss": 0.82057673, "learning_rate": 1.761114111815587e-08, "loss": 0.84209305, "num_input_tokens_seen": 343604045, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.67578125, "step": 15929, "time_per_iteration": 2.770632743835449 }, { "auxiliary_loss_clip": 0.01133199, "auxiliary_loss_mlp": 0.01042074, "balance_loss_clip": 1.0300163, "balance_loss_mlp": 1.03449225, "epoch": 0.9577634150007516, "flos": 29023147728000.0, "grad_norm": 1.4154022079924387, "language_loss": 0.72167879, "learning_rate": 1.7561147332426195e-08, "loss": 0.74343145, "num_input_tokens_seen": 343626595, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.71875, "step": 15930, "time_per_iteration": 2.8615612983703613 }, { "auxiliary_loss_clip": 0.01145906, "auxiliary_loss_mlp": 0.01028899, "balance_loss_clip": 1.01707935, "balance_loss_mlp": 1.03409064, "epoch": 0.9578235382534195, "flos": 19282235178240.0, "grad_norm": 1.454738413572605, "language_loss": 0.62167299, "learning_rate": 1.7511224294839644e-08, "loss": 0.64342099, "num_input_tokens_seen": 343646195, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66796875, "step": 15931, "time_per_iteration": 2.8907299041748047 }, { "auxiliary_loss_clip": 0.01099889, "auxiliary_loss_mlp": 0.01027485, "balance_loss_clip": 1.01589179, "balance_loss_mlp": 1.03406549, "epoch": 0.9578836615060875, "flos": 13699275235200.0, "grad_norm": 1.9893528113935368, "language_loss": 0.68751264, "learning_rate": 1.74613720071779e-08, "loss": 0.70878637, "num_input_tokens_seen": 343663665, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.65625, "step": 15932, "time_per_iteration": 2.9324562549591064 }, { "auxiliary_loss_clip": 0.01133213, "auxiliary_loss_mlp": 0.01036109, "balance_loss_clip": 1.02220345, "balance_loss_mlp": 1.03442287, "epoch": 0.9579437847587554, "flos": 17054516655360.0, "grad_norm": 1.7469178992038974, "language_loss": 0.75033033, "learning_rate": 1.7411590471219982e-08, "loss": 0.77202356, "num_input_tokens_seen": 343682145, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 15933, "time_per_iteration": 4.2953102588653564 }, { "auxiliary_loss_clip": 0.01057355, "auxiliary_loss_mlp": 0.01000459, "balance_loss_clip": 0.99918979, "balance_loss_mlp": 1.00051129, "epoch": 0.9580039080114234, "flos": 60570887886720.0, "grad_norm": 0.7049215818774838, "language_loss": 0.57272649, "learning_rate": 1.7361879688742697e-08, "loss": 0.59330463, "num_input_tokens_seen": 343744685, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.21191406, "step": 15934, "time_per_iteration": 3.3668267726898193 }, { "auxiliary_loss_clip": 0.01022025, "auxiliary_loss_mlp": 0.01249231, "balance_loss_clip": 1.0021174, "balance_loss_mlp": 1.00087488, "epoch": 0.9580640312640913, "flos": 66235365745920.0, "grad_norm": 0.9205371933666086, "language_loss": 0.66038895, "learning_rate": 1.731223966152018e-08, "loss": 0.68310153, "num_input_tokens_seen": 343801835, "router_z_loss_clip": 0.01202393, "router_z_loss_mlp": 0.2109375, "step": 15935, "time_per_iteration": 3.360121250152588 }, { "auxiliary_loss_clip": 0.01039948, "auxiliary_loss_mlp": 0.01000374, "balance_loss_clip": 0.99925357, "balance_loss_mlp": 1.00088978, "epoch": 0.9581241545167594, "flos": 62218002971520.0, "grad_norm": 0.7899953288173183, "language_loss": 0.56980836, "learning_rate": 1.7262670391323452e-08, "loss": 0.59021151, "num_input_tokens_seen": 343861515, "router_z_loss_clip": 0.01123047, "router_z_loss_mlp": 0.2109375, "step": 15936, "time_per_iteration": 3.283298969268799 }, { "auxiliary_loss_clip": 0.01147402, "auxiliary_loss_mlp": 0.01029828, "balance_loss_clip": 1.01713872, "balance_loss_mlp": 1.03344023, "epoch": 0.9581842777694273, "flos": 23768088065280.0, "grad_norm": 1.6549646035599153, "language_loss": 0.78467309, "learning_rate": 1.7213171879921994e-08, "loss": 0.80644536, "num_input_tokens_seen": 343881240, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 15937, "time_per_iteration": 2.7591910362243652 }, { "auxiliary_loss_clip": 0.011117, "auxiliary_loss_mlp": 0.01030843, "balance_loss_clip": 1.01780117, "balance_loss_mlp": 1.03341401, "epoch": 0.9582444010220953, "flos": 21213079793280.0, "grad_norm": 1.6225398584806454, "language_loss": 0.68341857, "learning_rate": 1.7163744129081947e-08, "loss": 0.704844, "num_input_tokens_seen": 343900885, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 15938, "time_per_iteration": 2.87636137008667 }, { "auxiliary_loss_clip": 0.01128693, "auxiliary_loss_mlp": 0.01030662, "balance_loss_clip": 1.01847315, "balance_loss_mlp": 1.03303528, "epoch": 0.9583045242747633, "flos": 23805147922560.0, "grad_norm": 2.5529583865563676, "language_loss": 0.66119868, "learning_rate": 1.7114387140567455e-08, "loss": 0.68279219, "num_input_tokens_seen": 343918460, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 15939, "time_per_iteration": 4.230712413787842 }, { "auxiliary_loss_clip": 0.01118409, "auxiliary_loss_mlp": 0.01033287, "balance_loss_clip": 1.02096033, "balance_loss_mlp": 1.03248191, "epoch": 0.9583646475274312, "flos": 24131468004480.0, "grad_norm": 1.6384264697108433, "language_loss": 0.73676717, "learning_rate": 1.7065100916139774e-08, "loss": 0.75828409, "num_input_tokens_seen": 343938030, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 15940, "time_per_iteration": 4.47626519203186 }, { "auxiliary_loss_clip": 0.01103064, "auxiliary_loss_mlp": 0.0103269, "balance_loss_clip": 1.01976764, "balance_loss_mlp": 1.03433263, "epoch": 0.9584247707800992, "flos": 20886651970560.0, "grad_norm": 1.651406049215207, "language_loss": 0.72985721, "learning_rate": 1.7015885457558167e-08, "loss": 0.75121474, "num_input_tokens_seen": 343956635, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 15941, "time_per_iteration": 2.964790105819702 }, { "auxiliary_loss_clip": 0.01120394, "auxiliary_loss_mlp": 0.01035708, "balance_loss_clip": 1.02368009, "balance_loss_mlp": 1.03423524, "epoch": 0.9584848940327672, "flos": 26067591918720.0, "grad_norm": 1.5236120775325084, "language_loss": 0.71207702, "learning_rate": 1.6966740766578557e-08, "loss": 0.73363805, "num_input_tokens_seen": 343976625, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 15942, "time_per_iteration": 2.8584892749786377 }, { "auxiliary_loss_clip": 0.01103149, "auxiliary_loss_mlp": 0.01034371, "balance_loss_clip": 1.02130032, "balance_loss_mlp": 1.03324342, "epoch": 0.9585450172854352, "flos": 34492988764800.0, "grad_norm": 1.4666131645370089, "language_loss": 0.71942848, "learning_rate": 1.6917666844955102e-08, "loss": 0.74080372, "num_input_tokens_seen": 343997790, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 15943, "time_per_iteration": 2.7225468158721924 }, { "auxiliary_loss_clip": 0.01114498, "auxiliary_loss_mlp": 0.01035238, "balance_loss_clip": 1.02224422, "balance_loss_mlp": 1.03482604, "epoch": 0.9586051405381031, "flos": 23110743219840.0, "grad_norm": 1.7555374287040961, "language_loss": 0.68261003, "learning_rate": 1.6868663694439067e-08, "loss": 0.70410734, "num_input_tokens_seen": 344016935, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 15944, "time_per_iteration": 3.0163471698760986 }, { "auxiliary_loss_clip": 0.01102942, "auxiliary_loss_mlp": 0.01033794, "balance_loss_clip": 1.0212357, "balance_loss_mlp": 1.03328681, "epoch": 0.9586652637907711, "flos": 19603994232960.0, "grad_norm": 1.5695749335611773, "language_loss": 0.66007209, "learning_rate": 1.6819731316779272e-08, "loss": 0.6814394, "num_input_tokens_seen": 344035590, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 15945, "time_per_iteration": 2.5795366764068604 }, { "auxiliary_loss_clip": 0.01075097, "auxiliary_loss_mlp": 0.01004471, "balance_loss_clip": 1.00318944, "balance_loss_mlp": 1.00085425, "epoch": 0.958725387043439, "flos": 72073327317120.0, "grad_norm": 0.7449045466329192, "language_loss": 0.61943573, "learning_rate": 1.6770869713721657e-08, "loss": 0.64023137, "num_input_tokens_seen": 344100845, "router_z_loss_clip": 0.01281738, "router_z_loss_mlp": 0.21289062, "step": 15946, "time_per_iteration": 3.5191187858581543 }, { "auxiliary_loss_clip": 0.01100017, "auxiliary_loss_mlp": 0.01029565, "balance_loss_clip": 1.01770377, "balance_loss_mlp": 1.0334264, "epoch": 0.958785510296107, "flos": 28911932242560.0, "grad_norm": 1.83057167620418, "language_loss": 0.75116706, "learning_rate": 1.6722078887010382e-08, "loss": 0.7724629, "num_input_tokens_seen": 344121780, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6640625, "step": 15947, "time_per_iteration": 2.6650726795196533 }, { "auxiliary_loss_clip": 0.01117693, "auxiliary_loss_mlp": 0.01025965, "balance_loss_clip": 1.01475883, "balance_loss_mlp": 1.03174782, "epoch": 0.958845633548775, "flos": 18477189607680.0, "grad_norm": 2.4121337386974613, "language_loss": 0.69821286, "learning_rate": 1.667335883838672e-08, "loss": 0.71964943, "num_input_tokens_seen": 344140150, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6796875, "step": 15948, "time_per_iteration": 2.653338670730591 }, { "auxiliary_loss_clip": 0.01135835, "auxiliary_loss_mlp": 0.01030555, "balance_loss_clip": 1.01860976, "balance_loss_mlp": 1.0327332, "epoch": 0.958905756801443, "flos": 24206916522240.0, "grad_norm": 2.039336255012693, "language_loss": 0.78704625, "learning_rate": 1.6624709569588836e-08, "loss": 0.80871016, "num_input_tokens_seen": 344158200, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 15949, "time_per_iteration": 2.958310127258301 }, { "auxiliary_loss_clip": 0.01104754, "auxiliary_loss_mlp": 0.01031232, "balance_loss_clip": 1.0186615, "balance_loss_mlp": 1.03556168, "epoch": 0.9589658800541109, "flos": 25007939769600.0, "grad_norm": 1.767764049016594, "language_loss": 0.68578362, "learning_rate": 1.657613108235334e-08, "loss": 0.70714355, "num_input_tokens_seen": 344174720, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 15950, "time_per_iteration": 2.6073126792907715 }, { "auxiliary_loss_clip": 0.01120537, "auxiliary_loss_mlp": 0.01030674, "balance_loss_clip": 1.01877689, "balance_loss_mlp": 1.0339694, "epoch": 0.9590260033067789, "flos": 23514558894720.0, "grad_norm": 1.643283662660506, "language_loss": 0.85921711, "learning_rate": 1.6527623378413514e-08, "loss": 0.8807292, "num_input_tokens_seen": 344192580, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 15951, "time_per_iteration": 2.878619909286499 }, { "auxiliary_loss_clip": 0.0112426, "auxiliary_loss_mlp": 0.01036482, "balance_loss_clip": 1.02410221, "balance_loss_mlp": 1.03434896, "epoch": 0.9590861265594469, "flos": 25520349237120.0, "grad_norm": 2.078939743782005, "language_loss": 0.79877627, "learning_rate": 1.647918645950108e-08, "loss": 0.82038367, "num_input_tokens_seen": 344210345, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71875, "step": 15952, "time_per_iteration": 2.7242846488952637 }, { "auxiliary_loss_clip": 0.01112436, "auxiliary_loss_mlp": 0.01030948, "balance_loss_clip": 1.01843119, "balance_loss_mlp": 1.03467977, "epoch": 0.9591462498121148, "flos": 21179323987200.0, "grad_norm": 1.5273321742668093, "language_loss": 0.6957401, "learning_rate": 1.6430820327343998e-08, "loss": 0.71717393, "num_input_tokens_seen": 344229540, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 15953, "time_per_iteration": 2.6137681007385254 }, { "auxiliary_loss_clip": 0.01149278, "auxiliary_loss_mlp": 0.01032932, "balance_loss_clip": 1.01945591, "balance_loss_mlp": 1.03435445, "epoch": 0.9592063730647828, "flos": 21723047136000.0, "grad_norm": 4.672875967044846, "language_loss": 0.58289337, "learning_rate": 1.6382524983668655e-08, "loss": 0.60471547, "num_input_tokens_seen": 344247830, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 15954, "time_per_iteration": 3.0829110145568848 }, { "auxiliary_loss_clip": 0.01113219, "auxiliary_loss_mlp": 0.01033595, "balance_loss_clip": 1.01983798, "balance_loss_mlp": 1.03404307, "epoch": 0.9592664963174508, "flos": 29891395278720.0, "grad_norm": 1.9526285182963135, "language_loss": 0.73859894, "learning_rate": 1.6334300430198567e-08, "loss": 0.76006711, "num_input_tokens_seen": 344267760, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.703125, "step": 15955, "time_per_iteration": 2.652738571166992 }, { "auxiliary_loss_clip": 0.01120676, "auxiliary_loss_mlp": 0.01035465, "balance_loss_clip": 1.02235794, "balance_loss_mlp": 1.03328443, "epoch": 0.9593266195701188, "flos": 17999613354240.0, "grad_norm": 1.7833023285702163, "language_loss": 0.62676513, "learning_rate": 1.6286146668654798e-08, "loss": 0.64832652, "num_input_tokens_seen": 344284905, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 15956, "time_per_iteration": 2.636106491088867 }, { "auxiliary_loss_clip": 0.01129818, "auxiliary_loss_mlp": 0.01031574, "balance_loss_clip": 1.01887858, "balance_loss_mlp": 1.03384924, "epoch": 0.9593867428227867, "flos": 18838271076480.0, "grad_norm": 2.309011383503862, "language_loss": 0.59672076, "learning_rate": 1.6238063700755532e-08, "loss": 0.61833471, "num_input_tokens_seen": 344302025, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 15957, "time_per_iteration": 3.1404435634613037 }, { "auxiliary_loss_clip": 0.01131495, "auxiliary_loss_mlp": 0.01038529, "balance_loss_clip": 1.02570772, "balance_loss_mlp": 1.03482163, "epoch": 0.9594468660754547, "flos": 29567050444800.0, "grad_norm": 2.4545512915449756, "language_loss": 0.74182987, "learning_rate": 1.6190051528217175e-08, "loss": 0.76353014, "num_input_tokens_seen": 344321935, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 15958, "time_per_iteration": 2.716322660446167 }, { "auxiliary_loss_clip": 0.01118879, "auxiliary_loss_mlp": 0.01275865, "balance_loss_clip": 1.01694643, "balance_loss_mlp": 1.03328955, "epoch": 0.9595069893281226, "flos": 20703256104960.0, "grad_norm": 1.4519060987663097, "language_loss": 0.74491459, "learning_rate": 1.6142110152752574e-08, "loss": 0.76886207, "num_input_tokens_seen": 344340405, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 15959, "time_per_iteration": 2.677001953125 }, { "auxiliary_loss_clip": 0.011134, "auxiliary_loss_mlp": 0.01031777, "balance_loss_clip": 1.01903939, "balance_loss_mlp": 1.03433776, "epoch": 0.9595671125807906, "flos": 22453613856000.0, "grad_norm": 2.4848750862797915, "language_loss": 0.6519419, "learning_rate": 1.6094239576073254e-08, "loss": 0.67339367, "num_input_tokens_seen": 344359925, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 15960, "time_per_iteration": 2.941073179244995 }, { "auxiliary_loss_clip": 0.01099935, "auxiliary_loss_mlp": 0.0103145, "balance_loss_clip": 1.01861084, "balance_loss_mlp": 1.0337534, "epoch": 0.9596272358334585, "flos": 23915214172800.0, "grad_norm": 1.713046021065904, "language_loss": 0.77918822, "learning_rate": 1.604643979988718e-08, "loss": 0.800502, "num_input_tokens_seen": 344379100, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6640625, "step": 15961, "time_per_iteration": 2.6368155479431152 }, { "auxiliary_loss_clip": 0.01136644, "auxiliary_loss_mlp": 0.01026433, "balance_loss_clip": 1.01495314, "balance_loss_mlp": 1.03356564, "epoch": 0.9596873590861266, "flos": 17672539086720.0, "grad_norm": 1.9709346291395784, "language_loss": 0.75637603, "learning_rate": 1.599871082590032e-08, "loss": 0.77800679, "num_input_tokens_seen": 344396895, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.671875, "step": 15962, "time_per_iteration": 2.7664473056793213 }, { "auxiliary_loss_clip": 0.01123141, "auxiliary_loss_mlp": 0.01031347, "balance_loss_clip": 1.01858521, "balance_loss_mlp": 1.03433096, "epoch": 0.9597474823387945, "flos": 25808532053760.0, "grad_norm": 2.191446111974534, "language_loss": 0.71522588, "learning_rate": 1.59510526558162e-08, "loss": 0.73677081, "num_input_tokens_seen": 344415115, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 15963, "time_per_iteration": 3.0357611179351807 }, { "auxiliary_loss_clip": 0.01117767, "auxiliary_loss_mlp": 0.01030424, "balance_loss_clip": 1.01874793, "balance_loss_mlp": 1.03465962, "epoch": 0.9598076055914625, "flos": 23768519028480.0, "grad_norm": 1.8353380216959752, "language_loss": 0.73866224, "learning_rate": 1.5903465291335016e-08, "loss": 0.76014423, "num_input_tokens_seen": 344435185, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.65234375, "step": 15964, "time_per_iteration": 2.7931764125823975 }, { "auxiliary_loss_clip": 0.0104924, "auxiliary_loss_mlp": 0.0099998, "balance_loss_clip": 0.99877596, "balance_loss_mlp": 1.00112224, "epoch": 0.9598677288441305, "flos": 56515962464640.0, "grad_norm": 0.9372206607607159, "language_loss": 0.57662809, "learning_rate": 1.5855948734155854e-08, "loss": 0.59712029, "num_input_tokens_seen": 344488950, "router_z_loss_clip": 0.01202393, "router_z_loss_mlp": 0.2109375, "step": 15965, "time_per_iteration": 3.33957576751709 }, { "auxiliary_loss_clip": 0.01101914, "auxiliary_loss_mlp": 0.01029183, "balance_loss_clip": 1.01745892, "balance_loss_mlp": 1.03288066, "epoch": 0.9599278520967984, "flos": 22997480659200.0, "grad_norm": 1.8233120235592486, "language_loss": 0.78718251, "learning_rate": 1.5808502985973803e-08, "loss": 0.80849349, "num_input_tokens_seen": 344506740, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 15966, "time_per_iteration": 3.964458703994751 }, { "auxiliary_loss_clip": 0.01116375, "auxiliary_loss_mlp": 0.01028143, "balance_loss_clip": 1.01607287, "balance_loss_mlp": 1.03164411, "epoch": 0.9599879753494664, "flos": 23039676161280.0, "grad_norm": 1.5681567974543897, "language_loss": 0.79375052, "learning_rate": 1.576112804848262e-08, "loss": 0.81519574, "num_input_tokens_seen": 344526670, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.66796875, "step": 15967, "time_per_iteration": 2.6659209728240967 }, { "auxiliary_loss_clip": 0.0114279, "auxiliary_loss_mlp": 0.01027074, "balance_loss_clip": 1.01417005, "balance_loss_mlp": 1.03556895, "epoch": 0.9600480986021344, "flos": 16392287560320.0, "grad_norm": 2.856494709194183, "language_loss": 0.80690837, "learning_rate": 1.5713823923372504e-08, "loss": 0.82860696, "num_input_tokens_seen": 344541995, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 15968, "time_per_iteration": 2.9625024795532227 }, { "auxiliary_loss_clip": 0.01143736, "auxiliary_loss_mlp": 0.0103976, "balance_loss_clip": 1.02550292, "balance_loss_mlp": 1.03630161, "epoch": 0.9601082218548024, "flos": 24276439296000.0, "grad_norm": 2.3905039838565236, "language_loss": 0.67341852, "learning_rate": 1.5666590612331886e-08, "loss": 0.69525349, "num_input_tokens_seen": 344559980, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.71875, "step": 15969, "time_per_iteration": 2.6469058990478516 }, { "auxiliary_loss_clip": 0.01111354, "auxiliary_loss_mlp": 0.01034395, "balance_loss_clip": 1.02299905, "balance_loss_mlp": 1.03500819, "epoch": 0.9601683451074703, "flos": 19609991804160.0, "grad_norm": 3.2404212651620092, "language_loss": 0.80071759, "learning_rate": 1.5619428117046086e-08, "loss": 0.82217503, "num_input_tokens_seen": 344577765, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.671875, "step": 15970, "time_per_iteration": 2.621417284011841 }, { "auxiliary_loss_clip": 0.01109308, "auxiliary_loss_mlp": 0.01029389, "balance_loss_clip": 1.01669276, "balance_loss_mlp": 1.03445101, "epoch": 0.9602284683601383, "flos": 26651104358400.0, "grad_norm": 1.4897157935442285, "language_loss": 0.77081627, "learning_rate": 1.557233643919864e-08, "loss": 0.79220325, "num_input_tokens_seen": 344597650, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6640625, "step": 15971, "time_per_iteration": 2.9364089965820312 }, { "auxiliary_loss_clip": 0.01123077, "auxiliary_loss_mlp": 0.01028906, "balance_loss_clip": 1.01593041, "balance_loss_mlp": 1.0337007, "epoch": 0.9602885916128062, "flos": 15554096714880.0, "grad_norm": 1.9060771201571896, "language_loss": 0.67452621, "learning_rate": 1.5525315580469988e-08, "loss": 0.69604611, "num_input_tokens_seen": 344613580, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 15972, "time_per_iteration": 2.6903252601623535 }, { "auxiliary_loss_clip": 0.01109166, "auxiliary_loss_mlp": 0.01274057, "balance_loss_clip": 1.01519346, "balance_loss_mlp": 1.03266239, "epoch": 0.9603487148654742, "flos": 16502353810560.0, "grad_norm": 2.144899077735457, "language_loss": 0.76266408, "learning_rate": 1.5478365542538117e-08, "loss": 0.78649628, "num_input_tokens_seen": 344626910, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 15973, "time_per_iteration": 2.525782823562622 }, { "auxiliary_loss_clip": 0.0113911, "auxiliary_loss_mlp": 0.01035246, "balance_loss_clip": 1.0231576, "balance_loss_mlp": 1.03650832, "epoch": 0.9604088381181421, "flos": 20845354308480.0, "grad_norm": 1.6826745259721496, "language_loss": 0.69345713, "learning_rate": 1.543148632707858e-08, "loss": 0.71520066, "num_input_tokens_seen": 344644330, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 15974, "time_per_iteration": 4.462371110916138 }, { "auxiliary_loss_clip": 0.01111782, "auxiliary_loss_mlp": 0.01028943, "balance_loss_clip": 1.01625872, "balance_loss_mlp": 1.03271389, "epoch": 0.9604689613708102, "flos": 19683105937920.0, "grad_norm": 2.175587859581649, "language_loss": 0.68047351, "learning_rate": 1.5384677935764477e-08, "loss": 0.70188069, "num_input_tokens_seen": 344663910, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 15975, "time_per_iteration": 2.51027774810791 }, { "auxiliary_loss_clip": 0.01099784, "auxiliary_loss_mlp": 0.01027974, "balance_loss_clip": 1.01642883, "balance_loss_mlp": 1.03430915, "epoch": 0.9605290846234781, "flos": 24097568544000.0, "grad_norm": 3.132525274161879, "language_loss": 0.55579323, "learning_rate": 1.5337940370266257e-08, "loss": 0.57707083, "num_input_tokens_seen": 344682320, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.65625, "step": 15976, "time_per_iteration": 2.6073098182678223 }, { "auxiliary_loss_clip": 0.01116947, "auxiliary_loss_mlp": 0.01027256, "balance_loss_clip": 1.0156033, "balance_loss_mlp": 1.03254521, "epoch": 0.9605892078761461, "flos": 27122575299840.0, "grad_norm": 1.885193297338401, "language_loss": 0.68255728, "learning_rate": 1.5291273632251467e-08, "loss": 0.70399934, "num_input_tokens_seen": 344701355, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 15977, "time_per_iteration": 3.0448904037475586 }, { "auxiliary_loss_clip": 0.01110128, "auxiliary_loss_mlp": 0.01034066, "balance_loss_clip": 1.02153134, "balance_loss_mlp": 1.03274298, "epoch": 0.9606493311288141, "flos": 14136918543360.0, "grad_norm": 2.0770553229025834, "language_loss": 0.82100803, "learning_rate": 1.524467772338589e-08, "loss": 0.84244996, "num_input_tokens_seen": 344717980, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 15978, "time_per_iteration": 2.5304007530212402 }, { "auxiliary_loss_clip": 0.01114993, "auxiliary_loss_mlp": 0.01032702, "balance_loss_clip": 1.02025688, "balance_loss_mlp": 1.03498852, "epoch": 0.960709454381482, "flos": 15813336147840.0, "grad_norm": 2.1910624661610623, "language_loss": 0.82954299, "learning_rate": 1.519815264533264e-08, "loss": 0.85101992, "num_input_tokens_seen": 344733480, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 15979, "time_per_iteration": 2.553581953048706 }, { "auxiliary_loss_clip": 0.01103195, "auxiliary_loss_mlp": 0.01039465, "balance_loss_clip": 1.02783024, "balance_loss_mlp": 1.03371453, "epoch": 0.96076957763415, "flos": 22565403959040.0, "grad_norm": 1.5452086661442792, "language_loss": 0.80257738, "learning_rate": 1.51516983997515e-08, "loss": 0.82400399, "num_input_tokens_seen": 344752130, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6953125, "step": 15980, "time_per_iteration": 2.9688422679901123 }, { "auxiliary_loss_clip": 0.0111217, "auxiliary_loss_mlp": 0.01028392, "balance_loss_clip": 1.01613176, "balance_loss_mlp": 1.03295505, "epoch": 0.960829700886818, "flos": 24681260551680.0, "grad_norm": 1.9428734719942202, "language_loss": 0.68377584, "learning_rate": 1.5105314988300698e-08, "loss": 0.70518148, "num_input_tokens_seen": 344771195, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 15981, "time_per_iteration": 6.254152774810791 }, { "auxiliary_loss_clip": 0.01109139, "auxiliary_loss_mlp": 0.01271727, "balance_loss_clip": 1.01364315, "balance_loss_mlp": 1.03350902, "epoch": 0.960889824139486, "flos": 26542223256960.0, "grad_norm": 1.3923322896080281, "language_loss": 0.69348168, "learning_rate": 1.505900241263558e-08, "loss": 0.71729034, "num_input_tokens_seen": 344793150, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.66796875, "step": 15982, "time_per_iteration": 3.0031826496124268 }, { "auxiliary_loss_clip": 0.01048615, "auxiliary_loss_mlp": 0.01000782, "balance_loss_clip": 0.99944699, "balance_loss_mlp": 1.00119221, "epoch": 0.9609499473921539, "flos": 71114942586240.0, "grad_norm": 0.7326414158759013, "language_loss": 0.53320396, "learning_rate": 1.50127606744086e-08, "loss": 0.55369794, "num_input_tokens_seen": 344852855, "router_z_loss_clip": 0.0133667, "router_z_loss_mlp": 0.2109375, "step": 15983, "time_per_iteration": 3.303877115249634 }, { "auxiliary_loss_clip": 0.01102044, "auxiliary_loss_mlp": 0.01033902, "balance_loss_clip": 1.02189207, "balance_loss_mlp": 1.03401995, "epoch": 0.9610100706448219, "flos": 20552466810240.0, "grad_norm": 4.451226158274185, "language_loss": 0.67947912, "learning_rate": 1.4966589775270432e-08, "loss": 0.70083863, "num_input_tokens_seen": 344869830, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 15984, "time_per_iteration": 2.614015817642212 }, { "auxiliary_loss_clip": 0.01124697, "auxiliary_loss_mlp": 0.01030861, "balance_loss_clip": 1.01948273, "balance_loss_mlp": 1.03334153, "epoch": 0.9610701938974898, "flos": 14064199459200.0, "grad_norm": 1.4544307537924195, "language_loss": 0.67194331, "learning_rate": 1.492048971686821e-08, "loss": 0.69349885, "num_input_tokens_seen": 344888905, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6484375, "step": 15985, "time_per_iteration": 2.948617458343506 }, { "auxiliary_loss_clip": 0.01104491, "auxiliary_loss_mlp": 0.01032076, "balance_loss_clip": 1.01994658, "balance_loss_mlp": 1.0346669, "epoch": 0.9611303171501578, "flos": 20229989483520.0, "grad_norm": 1.7951743152286124, "language_loss": 0.78884041, "learning_rate": 1.487446050084773e-08, "loss": 0.81020612, "num_input_tokens_seen": 344907160, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 15986, "time_per_iteration": 2.63736629486084 }, { "auxiliary_loss_clip": 0.01138382, "auxiliary_loss_mlp": 0.01032331, "balance_loss_clip": 1.01844311, "balance_loss_mlp": 1.03470278, "epoch": 0.9611904404028258, "flos": 20951075013120.0, "grad_norm": 2.1134196525166953, "language_loss": 0.63509309, "learning_rate": 1.4828502128851228e-08, "loss": 0.65680027, "num_input_tokens_seen": 344922400, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.6875, "step": 15987, "time_per_iteration": 2.571106195449829 }, { "auxiliary_loss_clip": 0.01098406, "auxiliary_loss_mlp": 0.01028529, "balance_loss_clip": 1.01755559, "balance_loss_mlp": 1.03365004, "epoch": 0.9612505636554938, "flos": 24827740214400.0, "grad_norm": 1.6178139884670864, "language_loss": 0.66734922, "learning_rate": 1.4782614602519172e-08, "loss": 0.6886186, "num_input_tokens_seen": 344941910, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.6484375, "step": 15988, "time_per_iteration": 2.8310840129852295 }, { "auxiliary_loss_clip": 0.01110155, "auxiliary_loss_mlp": 0.01035999, "balance_loss_clip": 1.02439952, "balance_loss_mlp": 1.03352642, "epoch": 0.9613106869081617, "flos": 17164977955200.0, "grad_norm": 2.018601924191725, "language_loss": 0.74729025, "learning_rate": 1.4736797923489142e-08, "loss": 0.76875174, "num_input_tokens_seen": 344960020, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 15989, "time_per_iteration": 2.979994535446167 }, { "auxiliary_loss_clip": 0.01039729, "auxiliary_loss_mlp": 0.01000648, "balance_loss_clip": 0.99931282, "balance_loss_mlp": 1.00071216, "epoch": 0.9613708101608297, "flos": 61563818522880.0, "grad_norm": 0.831312402877839, "language_loss": 0.63137567, "learning_rate": 1.4691052093395829e-08, "loss": 0.65177941, "num_input_tokens_seen": 345018290, "router_z_loss_clip": 0.0133667, "router_z_loss_mlp": 0.21191406, "step": 15990, "time_per_iteration": 3.154850482940674 }, { "auxiliary_loss_clip": 0.01110008, "auxiliary_loss_mlp": 0.01030941, "balance_loss_clip": 1.01929402, "balance_loss_mlp": 1.0324825, "epoch": 0.9614309334134977, "flos": 27417904922880.0, "grad_norm": 1.7778642220594014, "language_loss": 0.77760291, "learning_rate": 1.4645377113872149e-08, "loss": 0.7990123, "num_input_tokens_seen": 345040235, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 15991, "time_per_iteration": 2.697277069091797 }, { "auxiliary_loss_clip": 0.01108013, "auxiliary_loss_mlp": 0.01033681, "balance_loss_clip": 1.0201683, "balance_loss_mlp": 1.03493774, "epoch": 0.9614910566661656, "flos": 22819148611200.0, "grad_norm": 5.8779122095519725, "language_loss": 0.84526646, "learning_rate": 1.4599772986548131e-08, "loss": 0.86668342, "num_input_tokens_seen": 345054540, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 15992, "time_per_iteration": 3.135270357131958 }, { "auxiliary_loss_clip": 0.01119414, "auxiliary_loss_mlp": 0.01031777, "balance_loss_clip": 1.0188905, "balance_loss_mlp": 1.03237987, "epoch": 0.9615511799188337, "flos": 20667812359680.0, "grad_norm": 1.7468332626242373, "language_loss": 0.72498548, "learning_rate": 1.455423971305092e-08, "loss": 0.74649739, "num_input_tokens_seen": 345074035, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 15993, "time_per_iteration": 2.8042426109313965 }, { "auxiliary_loss_clip": 0.01129108, "auxiliary_loss_mlp": 0.01031594, "balance_loss_clip": 1.01874876, "balance_loss_mlp": 1.03248513, "epoch": 0.9616113031715016, "flos": 33149212035840.0, "grad_norm": 3.097379456928448, "language_loss": 0.68071246, "learning_rate": 1.450877729500588e-08, "loss": 0.70231938, "num_input_tokens_seen": 345099270, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 15994, "time_per_iteration": 2.7283718585968018 }, { "auxiliary_loss_clip": 0.01130238, "auxiliary_loss_mlp": 0.01030502, "balance_loss_clip": 1.01811635, "balance_loss_mlp": 1.03483295, "epoch": 0.9616714264241696, "flos": 25009807276800.0, "grad_norm": 1.8584248540505222, "language_loss": 0.84513712, "learning_rate": 1.4463385734035272e-08, "loss": 0.86674452, "num_input_tokens_seen": 345116975, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 15995, "time_per_iteration": 2.896843433380127 }, { "auxiliary_loss_clip": 0.01113687, "auxiliary_loss_mlp": 0.01034131, "balance_loss_clip": 1.02129197, "balance_loss_mlp": 1.03421116, "epoch": 0.9617315496768375, "flos": 13547480359680.0, "grad_norm": 1.993108784692113, "language_loss": 0.75861073, "learning_rate": 1.4418065031758908e-08, "loss": 0.7800889, "num_input_tokens_seen": 345133645, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 15996, "time_per_iteration": 2.8639140129089355 }, { "auxiliary_loss_clip": 0.01118303, "auxiliary_loss_mlp": 0.01032688, "balance_loss_clip": 1.02034998, "balance_loss_mlp": 1.0322994, "epoch": 0.9617916729295055, "flos": 11254512781440.0, "grad_norm": 1.9827988553421017, "language_loss": 0.76780117, "learning_rate": 1.4372815189794384e-08, "loss": 0.78931111, "num_input_tokens_seen": 345150740, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 15997, "time_per_iteration": 2.5744688510894775 }, { "auxiliary_loss_clip": 0.010983, "auxiliary_loss_mlp": 0.01273464, "balance_loss_clip": 1.01518703, "balance_loss_mlp": 1.03283954, "epoch": 0.9618517961821734, "flos": 22819723228800.0, "grad_norm": 1.5083698242862449, "language_loss": 0.66934156, "learning_rate": 1.4327636209756189e-08, "loss": 0.69305909, "num_input_tokens_seen": 345170365, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.65625, "step": 15998, "time_per_iteration": 2.733471632003784 }, { "auxiliary_loss_clip": 0.01117434, "auxiliary_loss_mlp": 0.01029145, "balance_loss_clip": 1.01641941, "balance_loss_mlp": 1.03171229, "epoch": 0.9619119194348414, "flos": 16617340224000.0, "grad_norm": 2.055964925299781, "language_loss": 0.72837621, "learning_rate": 1.4282528093257252e-08, "loss": 0.74984199, "num_input_tokens_seen": 345188930, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.67578125, "step": 15999, "time_per_iteration": 3.1724891662597656 }, { "auxiliary_loss_clip": 0.01116079, "auxiliary_loss_mlp": 0.0102621, "balance_loss_clip": 1.01514149, "balance_loss_mlp": 1.03306794, "epoch": 0.9619720426875094, "flos": 24535140024960.0, "grad_norm": 1.8088915082927732, "language_loss": 0.65244967, "learning_rate": 1.4237490841906951e-08, "loss": 0.67387253, "num_input_tokens_seen": 345209615, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.65625, "step": 16000, "time_per_iteration": 2.637876033782959 }, { "auxiliary_loss_clip": 0.01128174, "auxiliary_loss_mlp": 0.01029484, "balance_loss_clip": 1.01687741, "balance_loss_mlp": 1.03334773, "epoch": 0.9620321659401774, "flos": 20632224960000.0, "grad_norm": 1.5738933154344026, "language_loss": 0.7539221, "learning_rate": 1.4192524457312449e-08, "loss": 0.77549863, "num_input_tokens_seen": 345229175, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 16001, "time_per_iteration": 2.558159828186035 }, { "auxiliary_loss_clip": 0.01108875, "auxiliary_loss_mlp": 0.01031195, "balance_loss_clip": 1.01875579, "balance_loss_mlp": 1.03155792, "epoch": 0.9620922891928453, "flos": 18515290959360.0, "grad_norm": 1.5277602675741957, "language_loss": 0.68439585, "learning_rate": 1.4147628941078682e-08, "loss": 0.70579654, "num_input_tokens_seen": 345247815, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 16002, "time_per_iteration": 2.567310333251953 }, { "auxiliary_loss_clip": 0.01112028, "auxiliary_loss_mlp": 0.01031278, "balance_loss_clip": 1.0190413, "balance_loss_mlp": 1.03539705, "epoch": 0.9621524124455133, "flos": 28767391914240.0, "grad_norm": 1.2545622636983895, "language_loss": 0.64634776, "learning_rate": 1.4102804294807924e-08, "loss": 0.66778082, "num_input_tokens_seen": 345269935, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 16003, "time_per_iteration": 3.2757747173309326 }, { "auxiliary_loss_clip": 0.01139409, "auxiliary_loss_mlp": 0.01038314, "balance_loss_clip": 1.02656555, "balance_loss_mlp": 1.03482664, "epoch": 0.9622125356981813, "flos": 19098875226240.0, "grad_norm": 1.7446971813126355, "language_loss": 0.75903678, "learning_rate": 1.4058050520099563e-08, "loss": 0.78081405, "num_input_tokens_seen": 345288310, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 16004, "time_per_iteration": 2.7775158882141113 }, { "auxiliary_loss_clip": 0.01109196, "auxiliary_loss_mlp": 0.01029952, "balance_loss_clip": 1.01704776, "balance_loss_mlp": 1.03353596, "epoch": 0.9622726589508492, "flos": 20302816308480.0, "grad_norm": 1.8862446832207824, "language_loss": 0.79494667, "learning_rate": 1.4013367618551209e-08, "loss": 0.81633818, "num_input_tokens_seen": 345306615, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.66796875, "step": 16005, "time_per_iteration": 2.5453362464904785 }, { "auxiliary_loss_clip": 0.01107407, "auxiliary_loss_mlp": 0.01026177, "balance_loss_clip": 1.01506114, "balance_loss_mlp": 1.03287315, "epoch": 0.9623327822035173, "flos": 54929750889600.0, "grad_norm": 2.511719540318051, "language_loss": 0.67703342, "learning_rate": 1.3968755591757143e-08, "loss": 0.69836926, "num_input_tokens_seen": 345331935, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.65625, "step": 16006, "time_per_iteration": 2.9185426235198975 }, { "auxiliary_loss_clip": 0.01115718, "auxiliary_loss_mlp": 0.0103271, "balance_loss_clip": 1.02104568, "balance_loss_mlp": 1.03411841, "epoch": 0.9623929054561852, "flos": 21759029585280.0, "grad_norm": 1.5414012898307157, "language_loss": 0.78257346, "learning_rate": 1.3924214441309201e-08, "loss": 0.80405772, "num_input_tokens_seen": 345351510, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6328125, "step": 16007, "time_per_iteration": 3.2830281257629395 }, { "auxiliary_loss_clip": 0.01104762, "auxiliary_loss_mlp": 0.0103245, "balance_loss_clip": 1.01969504, "balance_loss_mlp": 1.03514063, "epoch": 0.9624530287088532, "flos": 17931563038080.0, "grad_norm": 2.014760476507654, "language_loss": 0.68125027, "learning_rate": 1.3879744168797447e-08, "loss": 0.70262235, "num_input_tokens_seen": 345367750, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 16008, "time_per_iteration": 4.097310543060303 }, { "auxiliary_loss_clip": 0.01122794, "auxiliary_loss_mlp": 0.01031874, "balance_loss_clip": 1.01855826, "balance_loss_mlp": 1.03471756, "epoch": 0.9625131519615211, "flos": 23253739263360.0, "grad_norm": 2.188061496902615, "language_loss": 0.72636914, "learning_rate": 1.3835344775808832e-08, "loss": 0.74791586, "num_input_tokens_seen": 345384790, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 16009, "time_per_iteration": 2.6930360794067383 }, { "auxiliary_loss_clip": 0.01107696, "auxiliary_loss_mlp": 0.01032173, "balance_loss_clip": 1.02035332, "balance_loss_mlp": 1.03195071, "epoch": 0.9625732752141891, "flos": 18916628595840.0, "grad_norm": 1.7084692956449319, "language_loss": 0.75032461, "learning_rate": 1.3791016263927646e-08, "loss": 0.77172327, "num_input_tokens_seen": 345403390, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 16010, "time_per_iteration": 3.1762356758117676 }, { "auxiliary_loss_clip": 0.0111815, "auxiliary_loss_mlp": 0.01033801, "balance_loss_clip": 1.01874506, "balance_loss_mlp": 1.03592396, "epoch": 0.962633398466857, "flos": 16252918790400.0, "grad_norm": 1.7361761738066945, "language_loss": 0.69674855, "learning_rate": 1.3746758634735734e-08, "loss": 0.71826804, "num_input_tokens_seen": 345418685, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.734375, "step": 16011, "time_per_iteration": 2.6310977935791016 }, { "auxiliary_loss_clip": 0.01110156, "auxiliary_loss_mlp": 0.01030029, "balance_loss_clip": 1.01868641, "balance_loss_mlp": 1.03453839, "epoch": 0.962693521719525, "flos": 54197424403200.0, "grad_norm": 1.7765529747474085, "language_loss": 0.68480361, "learning_rate": 1.3702571889813164e-08, "loss": 0.70620549, "num_input_tokens_seen": 345442380, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66796875, "step": 16012, "time_per_iteration": 2.970712661743164 }, { "auxiliary_loss_clip": 0.01127379, "auxiliary_loss_mlp": 0.01031776, "balance_loss_clip": 1.01827574, "balance_loss_mlp": 1.0364337, "epoch": 0.962753644972193, "flos": 33105795471360.0, "grad_norm": 1.9559181487620207, "language_loss": 0.72068799, "learning_rate": 1.365845603073601e-08, "loss": 0.74227953, "num_input_tokens_seen": 345463815, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 16013, "time_per_iteration": 2.7655375003814697 }, { "auxiliary_loss_clip": 0.01120495, "auxiliary_loss_mlp": 0.01031662, "balance_loss_clip": 1.01838779, "balance_loss_mlp": 1.03410017, "epoch": 0.962813768224861, "flos": 42230660837760.0, "grad_norm": 2.256058128866377, "language_loss": 0.63565034, "learning_rate": 1.3614411059079234e-08, "loss": 0.65717191, "num_input_tokens_seen": 345484525, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6796875, "step": 16014, "time_per_iteration": 3.242217779159546 }, { "auxiliary_loss_clip": 0.01115495, "auxiliary_loss_mlp": 0.01028862, "balance_loss_clip": 1.01823449, "balance_loss_mlp": 1.03349781, "epoch": 0.9628738914775289, "flos": 43944677003520.0, "grad_norm": 1.4764032757202867, "language_loss": 0.71902382, "learning_rate": 1.3570436976414689e-08, "loss": 0.74046743, "num_input_tokens_seen": 345508295, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.64453125, "step": 16015, "time_per_iteration": 2.799093723297119 }, { "auxiliary_loss_clip": 0.01127625, "auxiliary_loss_mlp": 0.01028486, "balance_loss_clip": 1.01640415, "balance_loss_mlp": 1.03284287, "epoch": 0.9629340147301969, "flos": 15596184476160.0, "grad_norm": 1.777694381710141, "language_loss": 0.7720651, "learning_rate": 1.3526533784311345e-08, "loss": 0.79362619, "num_input_tokens_seen": 345525155, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 16016, "time_per_iteration": 4.212409019470215 }, { "auxiliary_loss_clip": 0.01021933, "auxiliary_loss_mlp": 0.01000999, "balance_loss_clip": 0.99977154, "balance_loss_mlp": 1.0007863, "epoch": 0.9629941379828649, "flos": 62951011816320.0, "grad_norm": 0.8110642457557045, "language_loss": 0.63120139, "learning_rate": 1.348270148433639e-08, "loss": 0.65143073, "num_input_tokens_seen": 345578905, "router_z_loss_clip": 0.01226807, "router_z_loss_mlp": 0.2109375, "step": 16017, "time_per_iteration": 3.393202543258667 }, { "auxiliary_loss_clip": 0.01121455, "auxiliary_loss_mlp": 0.01026667, "balance_loss_clip": 1.01454949, "balance_loss_mlp": 1.03419948, "epoch": 0.9630542612355328, "flos": 29899116702720.0, "grad_norm": 2.5595685795165526, "language_loss": 0.65987986, "learning_rate": 1.3438940078053684e-08, "loss": 0.68136108, "num_input_tokens_seen": 345598965, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 16018, "time_per_iteration": 2.83027982711792 }, { "auxiliary_loss_clip": 0.01137699, "auxiliary_loss_mlp": 0.01036605, "balance_loss_clip": 1.02484536, "balance_loss_mlp": 1.03420925, "epoch": 0.9631143844882009, "flos": 17894575008000.0, "grad_norm": 2.2458632205167794, "language_loss": 0.79436994, "learning_rate": 1.3395249567025314e-08, "loss": 0.816113, "num_input_tokens_seen": 345617945, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.67578125, "step": 16019, "time_per_iteration": 2.649982213973999 }, { "auxiliary_loss_clip": 0.01125398, "auxiliary_loss_mlp": 0.01028127, "balance_loss_clip": 1.0174582, "balance_loss_mlp": 1.03198111, "epoch": 0.9631745077408688, "flos": 26139161767680.0, "grad_norm": 1.3704361482940377, "language_loss": 0.71217924, "learning_rate": 1.335162995281025e-08, "loss": 0.73371452, "num_input_tokens_seen": 345637920, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.6640625, "step": 16020, "time_per_iteration": 2.6598963737487793 }, { "auxiliary_loss_clip": 0.01119383, "auxiliary_loss_mlp": 0.01027634, "balance_loss_clip": 1.01605248, "balance_loss_mlp": 1.03373981, "epoch": 0.9632346309935368, "flos": 24973645259520.0, "grad_norm": 2.115961355444434, "language_loss": 0.76753604, "learning_rate": 1.3308081236965253e-08, "loss": 0.78900623, "num_input_tokens_seen": 345656195, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 16021, "time_per_iteration": 2.9572763442993164 }, { "auxiliary_loss_clip": 0.01134449, "auxiliary_loss_mlp": 0.01033123, "balance_loss_clip": 1.01875854, "balance_loss_mlp": 1.03495061, "epoch": 0.9632947542462047, "flos": 23617226943360.0, "grad_norm": 3.7780408907191494, "language_loss": 0.64953381, "learning_rate": 1.3264603421044407e-08, "loss": 0.67120951, "num_input_tokens_seen": 345676700, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7265625, "step": 16022, "time_per_iteration": 5.347781658172607 }, { "auxiliary_loss_clip": 0.01116895, "auxiliary_loss_mlp": 0.01034662, "balance_loss_clip": 1.02317595, "balance_loss_mlp": 1.03258562, "epoch": 0.9633548774988727, "flos": 26395599939840.0, "grad_norm": 1.874038244612469, "language_loss": 0.73170376, "learning_rate": 1.3221196506599586e-08, "loss": 0.75321931, "num_input_tokens_seen": 345696725, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6640625, "step": 16023, "time_per_iteration": 4.872462511062622 }, { "auxiliary_loss_clip": 0.01109728, "auxiliary_loss_mlp": 0.01026806, "balance_loss_clip": 1.0153203, "balance_loss_mlp": 1.03318417, "epoch": 0.9634150007515406, "flos": 23767728929280.0, "grad_norm": 1.4966236935533987, "language_loss": 0.8158865, "learning_rate": 1.3177860495179328e-08, "loss": 0.8372519, "num_input_tokens_seen": 345716245, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.67578125, "step": 16024, "time_per_iteration": 2.925429582595825 }, { "auxiliary_loss_clip": 0.01116441, "auxiliary_loss_mlp": 0.01033543, "balance_loss_clip": 1.02073383, "balance_loss_mlp": 1.0360049, "epoch": 0.9634751240042086, "flos": 24135346673280.0, "grad_norm": 1.4545963886209337, "language_loss": 0.81336784, "learning_rate": 1.3134595388330837e-08, "loss": 0.83486766, "num_input_tokens_seen": 345739060, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 16025, "time_per_iteration": 2.973825693130493 }, { "auxiliary_loss_clip": 0.01112761, "auxiliary_loss_mlp": 0.01029406, "balance_loss_clip": 1.01694298, "balance_loss_mlp": 1.03332567, "epoch": 0.9635352472568766, "flos": 24349086552960.0, "grad_norm": 1.7945177127703857, "language_loss": 0.76479936, "learning_rate": 1.309140118759755e-08, "loss": 0.78622103, "num_input_tokens_seen": 345758325, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 16026, "time_per_iteration": 2.6438260078430176 }, { "auxiliary_loss_clip": 0.01130639, "auxiliary_loss_mlp": 0.01280774, "balance_loss_clip": 1.02082086, "balance_loss_mlp": 1.03375983, "epoch": 0.9635953705095446, "flos": 23984772860160.0, "grad_norm": 2.5342534628122757, "language_loss": 0.63047522, "learning_rate": 1.3048277894521343e-08, "loss": 0.65458935, "num_input_tokens_seen": 345778530, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 16027, "time_per_iteration": 2.614441394805908 }, { "auxiliary_loss_clip": 0.01114848, "auxiliary_loss_mlp": 0.01030544, "balance_loss_clip": 1.01754451, "balance_loss_mlp": 1.0359273, "epoch": 0.9636554937622125, "flos": 19828436365440.0, "grad_norm": 1.7803972217810256, "language_loss": 0.87314141, "learning_rate": 1.3005225510640982e-08, "loss": 0.89459532, "num_input_tokens_seen": 345796535, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 16028, "time_per_iteration": 2.601747512817383 }, { "auxiliary_loss_clip": 0.01057841, "auxiliary_loss_mlp": 0.01002068, "balance_loss_clip": 1.00077438, "balance_loss_mlp": 1.00084698, "epoch": 0.9637156170148805, "flos": 67435499986560.0, "grad_norm": 0.6960708099008898, "language_loss": 0.53239501, "learning_rate": 1.2962244037493019e-08, "loss": 0.55299407, "num_input_tokens_seen": 345859700, "router_z_loss_clip": 0.01293945, "router_z_loss_mlp": 0.2109375, "step": 16029, "time_per_iteration": 3.6483280658721924 }, { "auxiliary_loss_clip": 0.01104204, "auxiliary_loss_mlp": 0.01030539, "balance_loss_clip": 1.01799858, "balance_loss_mlp": 1.03449941, "epoch": 0.9637757402675484, "flos": 20300912887680.0, "grad_norm": 1.6111501940002317, "language_loss": 0.73931485, "learning_rate": 1.2919333476611338e-08, "loss": 0.76066232, "num_input_tokens_seen": 345878760, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 16030, "time_per_iteration": 2.669487714767456 }, { "auxiliary_loss_clip": 0.01117956, "auxiliary_loss_mlp": 0.01032267, "balance_loss_clip": 1.02069139, "balance_loss_mlp": 1.03343666, "epoch": 0.9638358635202164, "flos": 27234544970880.0, "grad_norm": 1.7349079783091947, "language_loss": 0.66295552, "learning_rate": 1.2876493829527157e-08, "loss": 0.68445778, "num_input_tokens_seen": 345900445, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6640625, "step": 16031, "time_per_iteration": 2.6497957706451416 }, { "auxiliary_loss_clip": 0.01030793, "auxiliary_loss_mlp": 0.01248437, "balance_loss_clip": 1.00135601, "balance_loss_mlp": 1.00098085, "epoch": 0.9638959867728845, "flos": 70288998278400.0, "grad_norm": 1.8542825588452632, "language_loss": 0.60723317, "learning_rate": 1.2833725097769477e-08, "loss": 0.63002545, "num_input_tokens_seen": 345961020, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.21289062, "step": 16032, "time_per_iteration": 3.2447216510772705 }, { "auxiliary_loss_clip": 0.01104375, "auxiliary_loss_mlp": 0.01031001, "balance_loss_clip": 1.02000427, "balance_loss_mlp": 1.03191686, "epoch": 0.9639561100255524, "flos": 25407517639680.0, "grad_norm": 1.6939820472814564, "language_loss": 0.66318405, "learning_rate": 1.279102728286463e-08, "loss": 0.68453777, "num_input_tokens_seen": 345980210, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.6328125, "step": 16033, "time_per_iteration": 3.0901496410369873 }, { "auxiliary_loss_clip": 0.01146755, "auxiliary_loss_mlp": 0.01030263, "balance_loss_clip": 1.01816905, "balance_loss_mlp": 1.03522086, "epoch": 0.9640162332782204, "flos": 18113881495680.0, "grad_norm": 1.5952004453812196, "language_loss": 0.65686893, "learning_rate": 1.2748400386336066e-08, "loss": 0.67863911, "num_input_tokens_seen": 345998280, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.67578125, "step": 16034, "time_per_iteration": 3.00816011428833 }, { "auxiliary_loss_clip": 0.01131123, "auxiliary_loss_mlp": 0.01029649, "balance_loss_clip": 1.01729286, "balance_loss_mlp": 1.03483486, "epoch": 0.9640763565308883, "flos": 23440295525760.0, "grad_norm": 2.370037703793977, "language_loss": 0.73762715, "learning_rate": 1.2705844409705457e-08, "loss": 0.75923485, "num_input_tokens_seen": 346015545, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 16035, "time_per_iteration": 2.8165206909179688 }, { "auxiliary_loss_clip": 0.01104413, "auxiliary_loss_mlp": 0.01027914, "balance_loss_clip": 1.01558185, "balance_loss_mlp": 1.03675556, "epoch": 0.9641364797835563, "flos": 22419355259520.0, "grad_norm": 1.8278408090859337, "language_loss": 0.82083529, "learning_rate": 1.2663359354491366e-08, "loss": 0.84215856, "num_input_tokens_seen": 346034055, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.67578125, "step": 16036, "time_per_iteration": 2.664640426635742 }, { "auxiliary_loss_clip": 0.0111922, "auxiliary_loss_mlp": 0.01030456, "balance_loss_clip": 1.01786137, "balance_loss_mlp": 1.03247571, "epoch": 0.9641966030362242, "flos": 30622357048320.0, "grad_norm": 1.9588018052746319, "language_loss": 0.70054638, "learning_rate": 1.262094522220991e-08, "loss": 0.72204316, "num_input_tokens_seen": 346054130, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 16037, "time_per_iteration": 2.6299407482147217 }, { "auxiliary_loss_clip": 0.01101174, "auxiliary_loss_mlp": 0.01025743, "balance_loss_clip": 1.01348233, "balance_loss_mlp": 1.0320847, "epoch": 0.9642567262888923, "flos": 20953122088320.0, "grad_norm": 1.5846405664236842, "language_loss": 0.6310128, "learning_rate": 1.2578602014374551e-08, "loss": 0.65228194, "num_input_tokens_seen": 346072990, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69140625, "step": 16038, "time_per_iteration": 3.0685617923736572 }, { "auxiliary_loss_clip": 0.01130536, "auxiliary_loss_mlp": 0.01276785, "balance_loss_clip": 1.01765418, "balance_loss_mlp": 1.03342855, "epoch": 0.9643168495415602, "flos": 22639415932800.0, "grad_norm": 1.7765771380936677, "language_loss": 0.71059, "learning_rate": 1.2536329732496741e-08, "loss": 0.73466319, "num_input_tokens_seen": 346093745, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 16039, "time_per_iteration": 3.044734239578247 }, { "auxiliary_loss_clip": 0.01139368, "auxiliary_loss_mlp": 0.01031001, "balance_loss_clip": 1.01846588, "balance_loss_mlp": 1.03481007, "epoch": 0.9643769727942282, "flos": 20266259241600.0, "grad_norm": 2.6457035010078025, "language_loss": 0.73250741, "learning_rate": 1.2494128378085278e-08, "loss": 0.75421107, "num_input_tokens_seen": 346110115, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 16040, "time_per_iteration": 2.7411210536956787 }, { "auxiliary_loss_clip": 0.01112252, "auxiliary_loss_mlp": 0.0103106, "balance_loss_clip": 1.01844203, "balance_loss_mlp": 1.03361499, "epoch": 0.9644370960468961, "flos": 13881845088000.0, "grad_norm": 2.3934542826679146, "language_loss": 0.73265213, "learning_rate": 1.24519979526454e-08, "loss": 0.75408524, "num_input_tokens_seen": 346127165, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 16041, "time_per_iteration": 2.5812439918518066 }, { "auxiliary_loss_clip": 0.01115605, "auxiliary_loss_mlp": 0.01027259, "balance_loss_clip": 1.0156064, "balance_loss_mlp": 1.03311837, "epoch": 0.9644972192995641, "flos": 17238199829760.0, "grad_norm": 2.0652400727822893, "language_loss": 0.71829164, "learning_rate": 1.2409938457681457e-08, "loss": 0.73972023, "num_input_tokens_seen": 346145950, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.64453125, "step": 16042, "time_per_iteration": 2.6080989837646484 }, { "auxiliary_loss_clip": 0.011321, "auxiliary_loss_mlp": 0.01029725, "balance_loss_clip": 1.01822162, "balance_loss_mlp": 1.03431559, "epoch": 0.964557342552232, "flos": 23840340272640.0, "grad_norm": 1.7198886469027614, "language_loss": 0.80945641, "learning_rate": 1.236794989469403e-08, "loss": 0.83107471, "num_input_tokens_seen": 346165005, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.70703125, "step": 16043, "time_per_iteration": 2.742713451385498 }, { "auxiliary_loss_clip": 0.01119152, "auxiliary_loss_mlp": 0.01029751, "balance_loss_clip": 1.01851606, "balance_loss_mlp": 1.03444588, "epoch": 0.9646174658049, "flos": 21653129312640.0, "grad_norm": 1.5621723991385958, "language_loss": 0.71857053, "learning_rate": 1.2326032265181696e-08, "loss": 0.74005955, "num_input_tokens_seen": 346185095, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 16044, "time_per_iteration": 3.3485584259033203 }, { "auxiliary_loss_clip": 0.01117649, "auxiliary_loss_mlp": 0.01024825, "balance_loss_clip": 1.01251066, "balance_loss_mlp": 1.03264642, "epoch": 0.9646775890575681, "flos": 17129570123520.0, "grad_norm": 1.75701526680872, "language_loss": 0.69737792, "learning_rate": 1.2284185570640371e-08, "loss": 0.71880263, "num_input_tokens_seen": 346202580, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 16045, "time_per_iteration": 2.7255024909973145 }, { "auxiliary_loss_clip": 0.01150562, "auxiliary_loss_mlp": 0.01032088, "balance_loss_clip": 1.01889729, "balance_loss_mlp": 1.03508341, "epoch": 0.964737712310236, "flos": 13005732458880.0, "grad_norm": 2.0952641849500133, "language_loss": 0.75138974, "learning_rate": 1.2242409812563525e-08, "loss": 0.77321625, "num_input_tokens_seen": 346219395, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 16046, "time_per_iteration": 2.6979901790618896 }, { "auxiliary_loss_clip": 0.01113473, "auxiliary_loss_mlp": 0.01033658, "balance_loss_clip": 1.02029443, "balance_loss_mlp": 1.03426445, "epoch": 0.964797835562904, "flos": 24279240556800.0, "grad_norm": 1.8737502984961472, "language_loss": 0.6268332, "learning_rate": 1.2200704992441746e-08, "loss": 0.64830446, "num_input_tokens_seen": 346239715, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 16047, "time_per_iteration": 2.596656084060669 }, { "auxiliary_loss_clip": 0.01121937, "auxiliary_loss_mlp": 0.01035006, "balance_loss_clip": 1.02242923, "balance_loss_mlp": 1.0363121, "epoch": 0.9648579588155719, "flos": 20522697413760.0, "grad_norm": 1.8430527717622192, "language_loss": 0.69120789, "learning_rate": 1.215907111176362e-08, "loss": 0.71277738, "num_input_tokens_seen": 346258500, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 16048, "time_per_iteration": 2.5556421279907227 }, { "auxiliary_loss_clip": 0.0111244, "auxiliary_loss_mlp": 0.01030443, "balance_loss_clip": 1.01786661, "balance_loss_mlp": 1.03382349, "epoch": 0.9649180820682399, "flos": 32154844855680.0, "grad_norm": 1.8457898996301794, "language_loss": 0.63629609, "learning_rate": 1.2117508172014845e-08, "loss": 0.65772492, "num_input_tokens_seen": 346279110, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 16049, "time_per_iteration": 2.7705018520355225 }, { "auxiliary_loss_clip": 0.01113101, "auxiliary_loss_mlp": 0.01029861, "balance_loss_clip": 1.01796961, "balance_loss_mlp": 1.03579712, "epoch": 0.9649782053209078, "flos": 20522589672960.0, "grad_norm": 1.5806795411186996, "language_loss": 0.70809031, "learning_rate": 1.2076016174679127e-08, "loss": 0.72951996, "num_input_tokens_seen": 346297860, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 16050, "time_per_iteration": 4.54584002494812 }, { "auxiliary_loss_clip": 0.01114971, "auxiliary_loss_mlp": 0.01035089, "balance_loss_clip": 1.02140999, "balance_loss_mlp": 1.03302038, "epoch": 0.9650383285735759, "flos": 43067953843200.0, "grad_norm": 1.7174354213761682, "language_loss": 0.69615352, "learning_rate": 1.2034595121236613e-08, "loss": 0.71765411, "num_input_tokens_seen": 346319860, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 16051, "time_per_iteration": 2.743175506591797 }, { "auxiliary_loss_clip": 0.01030452, "auxiliary_loss_mlp": 0.00998612, "balance_loss_clip": 0.99734211, "balance_loss_mlp": 1.0005765, "epoch": 0.9650984518262438, "flos": 61748255882880.0, "grad_norm": 0.8446852787550861, "language_loss": 0.59145045, "learning_rate": 1.1993245013165454e-08, "loss": 0.61174107, "num_input_tokens_seen": 346379025, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.2109375, "step": 16052, "time_per_iteration": 3.179063558578491 }, { "auxiliary_loss_clip": 0.01098122, "auxiliary_loss_mlp": 0.01026781, "balance_loss_clip": 1.01490772, "balance_loss_mlp": 1.03215158, "epoch": 0.9651585750789118, "flos": 29789337761280.0, "grad_norm": 1.8768677509353695, "language_loss": 0.74663961, "learning_rate": 1.1951965851942025e-08, "loss": 0.76788867, "num_input_tokens_seen": 346402250, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.65625, "step": 16053, "time_per_iteration": 2.7098047733306885 }, { "auxiliary_loss_clip": 0.01122991, "auxiliary_loss_mlp": 0.01029849, "balance_loss_clip": 1.01741576, "balance_loss_mlp": 1.03516769, "epoch": 0.9652186983315797, "flos": 16873060124160.0, "grad_norm": 2.1731484436902613, "language_loss": 0.67755544, "learning_rate": 1.1910757639038927e-08, "loss": 0.69908386, "num_input_tokens_seen": 346419555, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69921875, "step": 16054, "time_per_iteration": 2.47070050239563 }, { "auxiliary_loss_clip": 0.01110275, "auxiliary_loss_mlp": 0.0128451, "balance_loss_clip": 1.02608538, "balance_loss_mlp": 1.0341723, "epoch": 0.9652788215842477, "flos": 31649761762560.0, "grad_norm": 1.457782783390283, "language_loss": 0.61908203, "learning_rate": 1.186962037592698e-08, "loss": 0.64302993, "num_input_tokens_seen": 346441245, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 16055, "time_per_iteration": 2.845877170562744 }, { "auxiliary_loss_clip": 0.0112579, "auxiliary_loss_mlp": 0.01032121, "balance_loss_clip": 1.0200336, "balance_loss_mlp": 1.03336668, "epoch": 0.9653389448369156, "flos": 24754266944640.0, "grad_norm": 1.8393668043806122, "language_loss": 0.77002782, "learning_rate": 1.1828554064074126e-08, "loss": 0.7916069, "num_input_tokens_seen": 346460065, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.65625, "step": 16056, "time_per_iteration": 3.3259437084198 }, { "auxiliary_loss_clip": 0.01119931, "auxiliary_loss_mlp": 0.0102892, "balance_loss_clip": 1.01690388, "balance_loss_mlp": 1.03251553, "epoch": 0.9653990680895836, "flos": 20297249700480.0, "grad_norm": 5.198781501479335, "language_loss": 0.71805024, "learning_rate": 1.1787558704945855e-08, "loss": 0.73953873, "num_input_tokens_seen": 346478005, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 16057, "time_per_iteration": 4.201760768890381 }, { "auxiliary_loss_clip": 0.01098785, "auxiliary_loss_mlp": 0.01032953, "balance_loss_clip": 1.0210619, "balance_loss_mlp": 1.03271937, "epoch": 0.9654591913422517, "flos": 22528775064960.0, "grad_norm": 1.7489881676628187, "language_loss": 0.71965849, "learning_rate": 1.1746634300005443e-08, "loss": 0.74097592, "num_input_tokens_seen": 346497575, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66015625, "step": 16058, "time_per_iteration": 2.5898866653442383 }, { "auxiliary_loss_clip": 0.01108455, "auxiliary_loss_mlp": 0.01033944, "balance_loss_clip": 1.02197528, "balance_loss_mlp": 1.03268635, "epoch": 0.9655193145949196, "flos": 26763002202240.0, "grad_norm": 1.3755954162310013, "language_loss": 0.73977095, "learning_rate": 1.1705780850713276e-08, "loss": 0.76119494, "num_input_tokens_seen": 346520000, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 16059, "time_per_iteration": 2.552598476409912 }, { "auxiliary_loss_clip": 0.01124198, "auxiliary_loss_mlp": 0.01034095, "balance_loss_clip": 1.02176309, "balance_loss_mlp": 1.03503203, "epoch": 0.9655794378475876, "flos": 41970703132800.0, "grad_norm": 2.6293016216421052, "language_loss": 0.73587394, "learning_rate": 1.16649983585273e-08, "loss": 0.7574569, "num_input_tokens_seen": 346541605, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71484375, "step": 16060, "time_per_iteration": 2.734605312347412 }, { "auxiliary_loss_clip": 0.01115452, "auxiliary_loss_mlp": 0.01029734, "balance_loss_clip": 1.01862955, "balance_loss_mlp": 1.03361869, "epoch": 0.9656395611002555, "flos": 27709427704320.0, "grad_norm": 1.5763940824306102, "language_loss": 0.76508236, "learning_rate": 1.1624286824903018e-08, "loss": 0.78653425, "num_input_tokens_seen": 346560955, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.64453125, "step": 16061, "time_per_iteration": 2.8421685695648193 }, { "auxiliary_loss_clip": 0.01039514, "auxiliary_loss_mlp": 0.01002807, "balance_loss_clip": 1.00155485, "balance_loss_mlp": 1.00110555, "epoch": 0.9656996843529235, "flos": 68968562411520.0, "grad_norm": 0.7231553150359465, "language_loss": 0.64174378, "learning_rate": 1.1583646251293044e-08, "loss": 0.66216695, "num_input_tokens_seen": 346621615, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.2109375, "step": 16062, "time_per_iteration": 3.5975236892700195 }, { "auxiliary_loss_clip": 0.01132904, "auxiliary_loss_mlp": 0.01027367, "balance_loss_clip": 1.01640606, "balance_loss_mlp": 1.03190506, "epoch": 0.9657598076055914, "flos": 20631327120000.0, "grad_norm": 2.5305471178414543, "language_loss": 0.93123233, "learning_rate": 1.1543076639147997e-08, "loss": 0.95283508, "num_input_tokens_seen": 346637460, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.65625, "step": 16063, "time_per_iteration": 3.1030406951904297 }, { "auxiliary_loss_clip": 0.01113899, "auxiliary_loss_mlp": 0.01032285, "balance_loss_clip": 1.01991081, "balance_loss_mlp": 1.03557038, "epoch": 0.9658199308582595, "flos": 20448577699200.0, "grad_norm": 1.8107719752566078, "language_loss": 0.82551217, "learning_rate": 1.1502577989915608e-08, "loss": 0.84697402, "num_input_tokens_seen": 346655625, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 16064, "time_per_iteration": 6.41461181640625 }, { "auxiliary_loss_clip": 0.01113239, "auxiliary_loss_mlp": 0.01274843, "balance_loss_clip": 1.01682496, "balance_loss_mlp": 1.03666341, "epoch": 0.9658800541109274, "flos": 24718033100160.0, "grad_norm": 1.548654455809715, "language_loss": 0.83964097, "learning_rate": 1.1462150305041163e-08, "loss": 0.86352181, "num_input_tokens_seen": 346675220, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.67578125, "step": 16065, "time_per_iteration": 2.7680552005767822 }, { "auxiliary_loss_clip": 0.01117441, "auxiliary_loss_mlp": 0.01028927, "balance_loss_clip": 1.01673818, "balance_loss_mlp": 1.03241491, "epoch": 0.9659401773635954, "flos": 21610035970560.0, "grad_norm": 1.6453375133467922, "language_loss": 0.67296571, "learning_rate": 1.142179358596751e-08, "loss": 0.6944294, "num_input_tokens_seen": 346694710, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 16066, "time_per_iteration": 2.578687906265259 }, { "auxiliary_loss_clip": 0.01100831, "auxiliary_loss_mlp": 0.01024622, "balance_loss_clip": 1.01293302, "balance_loss_mlp": 1.03279734, "epoch": 0.9660003006162633, "flos": 20301200196480.0, "grad_norm": 1.8921217107237613, "language_loss": 0.81944633, "learning_rate": 1.138150783413483e-08, "loss": 0.84070086, "num_input_tokens_seen": 346712645, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 16067, "time_per_iteration": 2.7746431827545166 }, { "auxiliary_loss_clip": 0.01118218, "auxiliary_loss_mlp": 0.01028502, "balance_loss_clip": 1.01618814, "balance_loss_mlp": 1.03345478, "epoch": 0.9660604238689313, "flos": 24461954064000.0, "grad_norm": 2.467783132184093, "language_loss": 0.68775749, "learning_rate": 1.1341293050980638e-08, "loss": 0.7092247, "num_input_tokens_seen": 346732375, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 16068, "time_per_iteration": 2.6490671634674072 }, { "auxiliary_loss_clip": 0.01105752, "auxiliary_loss_mlp": 0.01026992, "balance_loss_clip": 1.01649594, "balance_loss_mlp": 1.0325489, "epoch": 0.9661205471215992, "flos": 19864023765120.0, "grad_norm": 1.6862646365328078, "language_loss": 0.67808169, "learning_rate": 1.1301149237940233e-08, "loss": 0.69940913, "num_input_tokens_seen": 346750430, "router_z_loss_clip": 0.10546875, "router_z_loss_mlp": 0.640625, "step": 16069, "time_per_iteration": 2.631877899169922 }, { "auxiliary_loss_clip": 0.01122664, "auxiliary_loss_mlp": 0.01027951, "balance_loss_clip": 1.01600599, "balance_loss_mlp": 1.03645635, "epoch": 0.9661806703742672, "flos": 20557889763840.0, "grad_norm": 2.0481063297965143, "language_loss": 0.8896327, "learning_rate": 1.1261076396446467e-08, "loss": 0.91113883, "num_input_tokens_seen": 346768455, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 16070, "time_per_iteration": 2.6465232372283936 }, { "auxiliary_loss_clip": 0.01105568, "auxiliary_loss_mlp": 0.01034721, "balance_loss_clip": 1.02213264, "balance_loss_mlp": 1.0367837, "epoch": 0.9662407936269353, "flos": 21430949736960.0, "grad_norm": 6.379321377779408, "language_loss": 0.77353847, "learning_rate": 1.1221074527929086e-08, "loss": 0.79494143, "num_input_tokens_seen": 346786530, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 16071, "time_per_iteration": 2.6730339527130127 }, { "auxiliary_loss_clip": 0.01118883, "auxiliary_loss_mlp": 0.01030678, "balance_loss_clip": 1.01748765, "balance_loss_mlp": 1.03345919, "epoch": 0.9663009168796032, "flos": 14902893095040.0, "grad_norm": 1.7078916546783243, "language_loss": 0.65708017, "learning_rate": 1.1181143633816059e-08, "loss": 0.67857587, "num_input_tokens_seen": 346804635, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.671875, "step": 16072, "time_per_iteration": 2.537292718887329 }, { "auxiliary_loss_clip": 0.01113044, "auxiliary_loss_mlp": 0.01031392, "balance_loss_clip": 1.01922059, "balance_loss_mlp": 1.03456557, "epoch": 0.9663610401322712, "flos": 24310877460480.0, "grad_norm": 2.1237604351872594, "language_loss": 0.77492416, "learning_rate": 1.1141283715532023e-08, "loss": 0.79636848, "num_input_tokens_seen": 346823070, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 16073, "time_per_iteration": 2.5972628593444824 }, { "auxiliary_loss_clip": 0.01105601, "auxiliary_loss_mlp": 0.01034293, "balance_loss_clip": 1.02156138, "balance_loss_mlp": 1.03481543, "epoch": 0.9664211633849391, "flos": 17637849527040.0, "grad_norm": 2.154478392688768, "language_loss": 0.76022303, "learning_rate": 1.1101494774499398e-08, "loss": 0.78162199, "num_input_tokens_seen": 346841180, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 16074, "time_per_iteration": 2.6155264377593994 }, { "auxiliary_loss_clip": 0.01122373, "auxiliary_loss_mlp": 0.01027366, "balance_loss_clip": 1.01502156, "balance_loss_mlp": 1.0345248, "epoch": 0.9664812866376071, "flos": 29789409588480.0, "grad_norm": 1.8088716222814962, "language_loss": 0.75662446, "learning_rate": 1.1061776812138601e-08, "loss": 0.77812189, "num_input_tokens_seen": 346864250, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 16075, "time_per_iteration": 2.7112081050872803 }, { "auxiliary_loss_clip": 0.01131011, "auxiliary_loss_mlp": 0.01032549, "balance_loss_clip": 1.02032995, "balance_loss_mlp": 1.03591251, "epoch": 0.966541409890275, "flos": 14282320798080.0, "grad_norm": 1.945219739677659, "language_loss": 0.79043758, "learning_rate": 1.1022129829866722e-08, "loss": 0.81207323, "num_input_tokens_seen": 346881955, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 16076, "time_per_iteration": 2.685199737548828 }, { "auxiliary_loss_clip": 0.01122365, "auxiliary_loss_mlp": 0.01038944, "balance_loss_clip": 1.0255388, "balance_loss_mlp": 1.03506994, "epoch": 0.9666015331429431, "flos": 19860432405120.0, "grad_norm": 1.7507154259528819, "language_loss": 0.72209561, "learning_rate": 1.098255382909885e-08, "loss": 0.74370873, "num_input_tokens_seen": 346900445, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.69921875, "step": 16077, "time_per_iteration": 2.581267833709717 }, { "auxiliary_loss_clip": 0.01111136, "auxiliary_loss_mlp": 0.01032425, "balance_loss_clip": 1.02007508, "balance_loss_mlp": 1.03365934, "epoch": 0.966661656395611, "flos": 27125951178240.0, "grad_norm": 1.771355481237918, "language_loss": 0.59915739, "learning_rate": 1.0943048811247191e-08, "loss": 0.62059295, "num_input_tokens_seen": 346920135, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 16078, "time_per_iteration": 2.586325168609619 }, { "auxiliary_loss_clip": 0.01115272, "auxiliary_loss_mlp": 0.01033757, "balance_loss_clip": 1.02113891, "balance_loss_mlp": 1.03542578, "epoch": 0.966721779648279, "flos": 21508229848320.0, "grad_norm": 1.8606740168743725, "language_loss": 0.72354525, "learning_rate": 1.0903614777721726e-08, "loss": 0.74503553, "num_input_tokens_seen": 346940450, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 16079, "time_per_iteration": 2.5260329246520996 }, { "auxiliary_loss_clip": 0.01107291, "auxiliary_loss_mlp": 0.01026917, "balance_loss_clip": 1.01552677, "balance_loss_mlp": 1.03268051, "epoch": 0.9667819029009469, "flos": 23878118401920.0, "grad_norm": 1.5308664183571568, "language_loss": 0.72185552, "learning_rate": 1.0864251729929775e-08, "loss": 0.74319768, "num_input_tokens_seen": 346960935, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.65625, "step": 16080, "time_per_iteration": 2.6267011165618896 }, { "auxiliary_loss_clip": 0.011205, "auxiliary_loss_mlp": 0.01029225, "balance_loss_clip": 1.01753664, "balance_loss_mlp": 1.03456247, "epoch": 0.9668420261536149, "flos": 21507224267520.0, "grad_norm": 2.388827138369643, "language_loss": 0.73748016, "learning_rate": 1.0824959669275769e-08, "loss": 0.75897741, "num_input_tokens_seen": 346980100, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 16081, "time_per_iteration": 2.616485357284546 }, { "auxiliary_loss_clip": 0.01118964, "auxiliary_loss_mlp": 0.01028419, "balance_loss_clip": 1.01637888, "balance_loss_mlp": 1.03230786, "epoch": 0.9669021494062828, "flos": 26687266375680.0, "grad_norm": 1.6760168629442518, "language_loss": 0.67269951, "learning_rate": 1.0785738597162364e-08, "loss": 0.69417334, "num_input_tokens_seen": 347001250, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 16082, "time_per_iteration": 2.7254233360290527 }, { "auxiliary_loss_clip": 0.01126905, "auxiliary_loss_mlp": 0.01026431, "balance_loss_clip": 1.01468897, "balance_loss_mlp": 1.03269184, "epoch": 0.9669622726589508, "flos": 29825032901760.0, "grad_norm": 2.582155544162998, "language_loss": 0.76945466, "learning_rate": 1.0746588514989108e-08, "loss": 0.79098803, "num_input_tokens_seen": 347022975, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 16083, "time_per_iteration": 2.6239097118377686 }, { "auxiliary_loss_clip": 0.01139351, "auxiliary_loss_mlp": 0.01033404, "balance_loss_clip": 1.02074373, "balance_loss_mlp": 1.03437412, "epoch": 0.9670223959116189, "flos": 22922499018240.0, "grad_norm": 2.2274393035762627, "language_loss": 0.79582703, "learning_rate": 1.0707509424152883e-08, "loss": 0.81755459, "num_input_tokens_seen": 347038780, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 16084, "time_per_iteration": 2.5167956352233887 }, { "auxiliary_loss_clip": 0.01114501, "auxiliary_loss_mlp": 0.01029204, "balance_loss_clip": 1.01644826, "balance_loss_mlp": 1.0335871, "epoch": 0.9670825191642868, "flos": 18624495283200.0, "grad_norm": 2.327054938717909, "language_loss": 0.67670095, "learning_rate": 1.0668501326048795e-08, "loss": 0.698138, "num_input_tokens_seen": 347056705, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 16085, "time_per_iteration": 2.5493600368499756 }, { "auxiliary_loss_clip": 0.01125206, "auxiliary_loss_mlp": 0.0103207, "balance_loss_clip": 1.01896286, "balance_loss_mlp": 1.03549433, "epoch": 0.9671426424169548, "flos": 24497936513280.0, "grad_norm": 2.008375576248639, "language_loss": 0.6853019, "learning_rate": 1.0629564222068621e-08, "loss": 0.70687461, "num_input_tokens_seen": 347075710, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 16086, "time_per_iteration": 2.5401575565338135 }, { "auxiliary_loss_clip": 0.01110028, "auxiliary_loss_mlp": 0.01033211, "balance_loss_clip": 1.02080083, "balance_loss_mlp": 1.03413725, "epoch": 0.9672027656696227, "flos": 20371189847040.0, "grad_norm": 2.0015539849594046, "language_loss": 0.78451586, "learning_rate": 1.0590698113602137e-08, "loss": 0.80594826, "num_input_tokens_seen": 347092325, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.66796875, "step": 16087, "time_per_iteration": 2.5494496822357178 }, { "auxiliary_loss_clip": 0.01119494, "auxiliary_loss_mlp": 0.01025248, "balance_loss_clip": 1.0135777, "balance_loss_mlp": 1.03333831, "epoch": 0.9672628889222907, "flos": 18880179269760.0, "grad_norm": 1.793386885347901, "language_loss": 0.71242344, "learning_rate": 1.0551903002036234e-08, "loss": 0.73387086, "num_input_tokens_seen": 347110595, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 16088, "time_per_iteration": 2.722344398498535 }, { "auxiliary_loss_clip": 0.01099958, "auxiliary_loss_mlp": 0.01277665, "balance_loss_clip": 1.01948023, "balance_loss_mlp": 1.03395426, "epoch": 0.9673230121749586, "flos": 28695247447680.0, "grad_norm": 1.4808930216551484, "language_loss": 0.70497406, "learning_rate": 1.0513178888755358e-08, "loss": 0.72875029, "num_input_tokens_seen": 347131625, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66015625, "step": 16089, "time_per_iteration": 2.718536615371704 }, { "auxiliary_loss_clip": 0.01104014, "auxiliary_loss_mlp": 0.01030471, "balance_loss_clip": 1.01812112, "balance_loss_mlp": 1.03461337, "epoch": 0.9673831354276267, "flos": 20484452407680.0, "grad_norm": 2.4303318450760085, "language_loss": 0.74811047, "learning_rate": 1.047452577514174e-08, "loss": 0.76945531, "num_input_tokens_seen": 347147910, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 16090, "time_per_iteration": 2.541569709777832 }, { "auxiliary_loss_clip": 0.01124279, "auxiliary_loss_mlp": 0.01031376, "balance_loss_clip": 1.01843572, "balance_loss_mlp": 1.03385711, "epoch": 0.9674432586802946, "flos": 26797548107520.0, "grad_norm": 1.7983822748545741, "language_loss": 0.690723, "learning_rate": 1.0435943662574498e-08, "loss": 0.71227956, "num_input_tokens_seen": 347168805, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 16091, "time_per_iteration": 3.9693992137908936 }, { "auxiliary_loss_clip": 0.0110021, "auxiliary_loss_mlp": 0.01033616, "balance_loss_clip": 1.02049732, "balance_loss_mlp": 1.03256917, "epoch": 0.9675033819329626, "flos": 22310941034880.0, "grad_norm": 1.837935725360841, "language_loss": 0.7693342, "learning_rate": 1.039743255243053e-08, "loss": 0.79067254, "num_input_tokens_seen": 347189455, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.67578125, "step": 16092, "time_per_iteration": 2.534492015838623 }, { "auxiliary_loss_clip": 0.01102308, "auxiliary_loss_mlp": 0.01026206, "balance_loss_clip": 1.01468408, "balance_loss_mlp": 1.03512025, "epoch": 0.9675635051856305, "flos": 12675713276160.0, "grad_norm": 1.7262721510312142, "language_loss": 0.76327693, "learning_rate": 1.0358992446084513e-08, "loss": 0.78456211, "num_input_tokens_seen": 347206030, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 16093, "time_per_iteration": 2.5738351345062256 }, { "auxiliary_loss_clip": 0.01120325, "auxiliary_loss_mlp": 0.01029334, "balance_loss_clip": 1.01579201, "balance_loss_mlp": 1.03481627, "epoch": 0.9676236284382985, "flos": 24608469640320.0, "grad_norm": 1.415836215978823, "language_loss": 0.69317335, "learning_rate": 1.0320623344908019e-08, "loss": 0.71466994, "num_input_tokens_seen": 347226250, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.68359375, "step": 16094, "time_per_iteration": 2.687023878097534 }, { "auxiliary_loss_clip": 0.01113055, "auxiliary_loss_mlp": 0.01030363, "balance_loss_clip": 1.01856685, "balance_loss_mlp": 1.03365135, "epoch": 0.9676837516909664, "flos": 23367145478400.0, "grad_norm": 1.46528962419296, "language_loss": 0.75530529, "learning_rate": 1.0282325250270396e-08, "loss": 0.77673948, "num_input_tokens_seen": 347247350, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.70703125, "step": 16095, "time_per_iteration": 2.5745043754577637 }, { "auxiliary_loss_clip": 0.01108187, "auxiliary_loss_mlp": 0.01035626, "balance_loss_clip": 1.02265573, "balance_loss_mlp": 1.03667688, "epoch": 0.9677438749436345, "flos": 20486894532480.0, "grad_norm": 1.79052901879715, "language_loss": 0.70649505, "learning_rate": 1.024409816353855e-08, "loss": 0.72793317, "num_input_tokens_seen": 347266870, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 16096, "time_per_iteration": 2.5633742809295654 }, { "auxiliary_loss_clip": 0.01121865, "auxiliary_loss_mlp": 0.01029245, "balance_loss_clip": 1.01698399, "balance_loss_mlp": 1.03495407, "epoch": 0.9678039981963025, "flos": 47555889719040.0, "grad_norm": 1.9941023872007755, "language_loss": 0.72049952, "learning_rate": 1.0205942086076502e-08, "loss": 0.74201065, "num_input_tokens_seen": 347290120, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 16097, "time_per_iteration": 2.7642743587493896 }, { "auxiliary_loss_clip": 0.01100474, "auxiliary_loss_mlp": 0.01034999, "balance_loss_clip": 1.02260721, "balance_loss_mlp": 1.03355968, "epoch": 0.9678641214489704, "flos": 25040474513280.0, "grad_norm": 1.5383454581722982, "language_loss": 0.77976853, "learning_rate": 1.016785701924605e-08, "loss": 0.80112326, "num_input_tokens_seen": 347308785, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.66796875, "step": 16098, "time_per_iteration": 2.58768892288208 }, { "auxiliary_loss_clip": 0.01118321, "auxiliary_loss_mlp": 0.01026987, "balance_loss_clip": 1.0155195, "balance_loss_mlp": 1.03333044, "epoch": 0.9679242447016384, "flos": 19240937516160.0, "grad_norm": 1.8043672427640294, "language_loss": 0.90791857, "learning_rate": 1.012984296440611e-08, "loss": 0.92937171, "num_input_tokens_seen": 347326375, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 16099, "time_per_iteration": 3.988123893737793 }, { "auxiliary_loss_clip": 0.0111246, "auxiliary_loss_mlp": 0.01030375, "balance_loss_clip": 1.01773834, "balance_loss_mlp": 1.03307116, "epoch": 0.9679843679543063, "flos": 33254681345280.0, "grad_norm": 1.7693400147221998, "language_loss": 0.66202909, "learning_rate": 1.0091899922913816e-08, "loss": 0.68345749, "num_input_tokens_seen": 347348250, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 16100, "time_per_iteration": 2.6287920475006104 }, { "auxiliary_loss_clip": 0.01131294, "auxiliary_loss_mlp": 0.01032289, "balance_loss_clip": 1.01982546, "balance_loss_mlp": 1.03511643, "epoch": 0.9680444912069743, "flos": 22783633038720.0, "grad_norm": 4.192457227574072, "language_loss": 0.73484439, "learning_rate": 1.0054027896122752e-08, "loss": 0.75648022, "num_input_tokens_seen": 347367400, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 16101, "time_per_iteration": 2.610476493835449 }, { "auxiliary_loss_clip": 0.01119327, "auxiliary_loss_mlp": 0.01030719, "balance_loss_clip": 1.01799929, "balance_loss_mlp": 1.03376365, "epoch": 0.9681046144596422, "flos": 24024095274240.0, "grad_norm": 1.5675072721401628, "language_loss": 0.72993755, "learning_rate": 1.0016226885384726e-08, "loss": 0.75143802, "num_input_tokens_seen": 347387600, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 16102, "time_per_iteration": 2.6134355068206787 }, { "auxiliary_loss_clip": 0.01132901, "auxiliary_loss_mlp": 0.01036385, "balance_loss_clip": 1.02238965, "balance_loss_mlp": 1.03652573, "epoch": 0.9681647377123103, "flos": 23441013797760.0, "grad_norm": 1.6372693811060173, "language_loss": 0.77518904, "learning_rate": 9.97849689204866e-09, "loss": 0.79688191, "num_input_tokens_seen": 347406915, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.6953125, "step": 16103, "time_per_iteration": 2.651230573654175 }, { "auxiliary_loss_clip": 0.01120908, "auxiliary_loss_mlp": 0.0102728, "balance_loss_clip": 1.01565659, "balance_loss_mlp": 1.03462815, "epoch": 0.9682248609649782, "flos": 22675075159680.0, "grad_norm": 1.6167117248354521, "language_loss": 0.80756372, "learning_rate": 9.940837917461254e-09, "loss": 0.82904565, "num_input_tokens_seen": 347425140, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 16104, "time_per_iteration": 2.655764579772949 }, { "auxiliary_loss_clip": 0.01106596, "auxiliary_loss_mlp": 0.01030189, "balance_loss_clip": 1.01948357, "balance_loss_mlp": 1.03118968, "epoch": 0.9682849842176462, "flos": 24428413739520.0, "grad_norm": 1.5459968342079196, "language_loss": 0.77626693, "learning_rate": 9.903249962966098e-09, "loss": 0.79763472, "num_input_tokens_seen": 347446350, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.6640625, "step": 16105, "time_per_iteration": 2.6013705730438232 }, { "auxiliary_loss_clip": 0.0104039, "auxiliary_loss_mlp": 0.01001595, "balance_loss_clip": 1.00042081, "balance_loss_mlp": 1.00088406, "epoch": 0.9683451074703141, "flos": 59995132784640.0, "grad_norm": 0.8172496838071918, "language_loss": 0.56741333, "learning_rate": 9.86573302990501e-09, "loss": 0.58783317, "num_input_tokens_seen": 347510135, "router_z_loss_clip": 0.01171875, "router_z_loss_mlp": 0.2109375, "step": 16106, "time_per_iteration": 6.795946359634399 }, { "auxiliary_loss_clip": 0.01049004, "auxiliary_loss_mlp": 0.01001749, "balance_loss_clip": 1.00061607, "balance_loss_mlp": 1.00112891, "epoch": 0.9684052307229821, "flos": 52696145514240.0, "grad_norm": 0.8810891616621582, "language_loss": 0.61667651, "learning_rate": 9.828287119616473e-09, "loss": 0.63718408, "num_input_tokens_seen": 347562505, "router_z_loss_clip": 0.01135254, "router_z_loss_mlp": 0.2109375, "step": 16107, "time_per_iteration": 3.0989387035369873 }, { "auxiliary_loss_clip": 0.01112807, "auxiliary_loss_mlp": 0.01038163, "balance_loss_clip": 1.02433467, "balance_loss_mlp": 1.03512049, "epoch": 0.96846535397565, "flos": 22783848520320.0, "grad_norm": 2.005284099037003, "language_loss": 0.73936778, "learning_rate": 9.790912233437198e-09, "loss": 0.76087749, "num_input_tokens_seen": 347579150, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.6875, "step": 16108, "time_per_iteration": 2.5950117111206055 }, { "auxiliary_loss_clip": 0.01113417, "auxiliary_loss_mlp": 0.0102876, "balance_loss_clip": 1.01629043, "balance_loss_mlp": 1.03495288, "epoch": 0.968525477228318, "flos": 23323980309120.0, "grad_norm": 1.9218704562659668, "language_loss": 0.57108378, "learning_rate": 9.753608372700783e-09, "loss": 0.59250557, "num_input_tokens_seen": 347596705, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 16109, "time_per_iteration": 2.712536096572876 }, { "auxiliary_loss_clip": 0.01031264, "auxiliary_loss_mlp": 0.01002088, "balance_loss_clip": 1.00093794, "balance_loss_mlp": 1.00094652, "epoch": 0.9685856004809861, "flos": 67882947707520.0, "grad_norm": 0.6671369878125459, "language_loss": 0.54243708, "learning_rate": 9.716375538738608e-09, "loss": 0.56277061, "num_input_tokens_seen": 347661870, "router_z_loss_clip": 0.01147461, "router_z_loss_mlp": 0.2109375, "step": 16110, "time_per_iteration": 3.098203659057617 }, { "auxiliary_loss_clip": 0.01152009, "auxiliary_loss_mlp": 0.01039855, "balance_loss_clip": 1.02608061, "balance_loss_mlp": 1.03508031, "epoch": 0.968645723733654, "flos": 33947900899200.0, "grad_norm": 1.8304911160726747, "language_loss": 0.62667632, "learning_rate": 9.679213732879388e-09, "loss": 0.64859498, "num_input_tokens_seen": 347684295, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 16111, "time_per_iteration": 2.7173988819122314 }, { "auxiliary_loss_clip": 0.01118197, "auxiliary_loss_mlp": 0.01028247, "balance_loss_clip": 1.01707101, "balance_loss_mlp": 1.03308415, "epoch": 0.968705846986322, "flos": 18551488890240.0, "grad_norm": 1.8391305745340947, "language_loss": 0.74719381, "learning_rate": 9.642122956449173e-09, "loss": 0.76865828, "num_input_tokens_seen": 347702585, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.671875, "step": 16112, "time_per_iteration": 2.5150887966156006 }, { "auxiliary_loss_clip": 0.011073, "auxiliary_loss_mlp": 0.01028598, "balance_loss_clip": 1.01723123, "balance_loss_mlp": 1.03201592, "epoch": 0.9687659702389899, "flos": 17420913336960.0, "grad_norm": 3.6470711447119277, "language_loss": 0.66767806, "learning_rate": 9.605103210771793e-09, "loss": 0.68903702, "num_input_tokens_seen": 347721810, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6640625, "step": 16113, "time_per_iteration": 2.569800615310669 }, { "auxiliary_loss_clip": 0.01030615, "auxiliary_loss_mlp": 0.01002706, "balance_loss_clip": 1.00144243, "balance_loss_mlp": 1.00096071, "epoch": 0.9688260934916579, "flos": 62047176865920.0, "grad_norm": 0.7003169274055714, "language_loss": 0.56558418, "learning_rate": 9.568154497168635e-09, "loss": 0.58591735, "num_input_tokens_seen": 347782330, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.2109375, "step": 16114, "time_per_iteration": 3.223607301712036 }, { "auxiliary_loss_clip": 0.01120949, "auxiliary_loss_mlp": 0.01034399, "balance_loss_clip": 1.0219003, "balance_loss_mlp": 1.03459084, "epoch": 0.9688862167443258, "flos": 15076520461440.0, "grad_norm": 1.874296331725046, "language_loss": 0.82466602, "learning_rate": 9.531276816957757e-09, "loss": 0.84621954, "num_input_tokens_seen": 347794835, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 16115, "time_per_iteration": 2.505248785018921 }, { "auxiliary_loss_clip": 0.01105236, "auxiliary_loss_mlp": 0.0102671, "balance_loss_clip": 1.01579642, "balance_loss_mlp": 1.03275347, "epoch": 0.9689463399969939, "flos": 19938215306880.0, "grad_norm": 1.6291755606514726, "language_loss": 0.72097731, "learning_rate": 9.494470171455438e-09, "loss": 0.74229681, "num_input_tokens_seen": 347814320, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.63671875, "step": 16116, "time_per_iteration": 2.5631325244903564 }, { "auxiliary_loss_clip": 0.01130833, "auxiliary_loss_mlp": 0.01030218, "balance_loss_clip": 1.01751661, "balance_loss_mlp": 1.03455305, "epoch": 0.9690064632496618, "flos": 19573039687680.0, "grad_norm": 2.1065530865633146, "language_loss": 0.75305492, "learning_rate": 9.457734561975739e-09, "loss": 0.77466547, "num_input_tokens_seen": 347832125, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 16117, "time_per_iteration": 2.5597212314605713 }, { "auxiliary_loss_clip": 0.01103522, "auxiliary_loss_mlp": 0.01030517, "balance_loss_clip": 1.0181495, "balance_loss_mlp": 1.03338218, "epoch": 0.9690665865023298, "flos": 21872292145920.0, "grad_norm": 2.616998732956204, "language_loss": 0.77739179, "learning_rate": 9.421069989828722e-09, "loss": 0.79873216, "num_input_tokens_seen": 347850765, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 16118, "time_per_iteration": 2.5987539291381836 }, { "auxiliary_loss_clip": 0.01123139, "auxiliary_loss_mlp": 0.01277205, "balance_loss_clip": 1.01807857, "balance_loss_mlp": 1.0358299, "epoch": 0.9691267097549977, "flos": 20994491577600.0, "grad_norm": 2.0020171512672955, "language_loss": 0.78218198, "learning_rate": 9.384476456323565e-09, "loss": 0.80618536, "num_input_tokens_seen": 347870125, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 16119, "time_per_iteration": 2.587646961212158 }, { "auxiliary_loss_clip": 0.01134304, "auxiliary_loss_mlp": 0.01028465, "balance_loss_clip": 1.01748586, "balance_loss_mlp": 1.03362119, "epoch": 0.9691868330076657, "flos": 24279132816000.0, "grad_norm": 1.4056679643018284, "language_loss": 0.75578868, "learning_rate": 9.347953962765886e-09, "loss": 0.77741629, "num_input_tokens_seen": 347890615, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.65234375, "step": 16120, "time_per_iteration": 2.682447910308838 }, { "auxiliary_loss_clip": 0.01110478, "auxiliary_loss_mlp": 0.01031456, "balance_loss_clip": 1.0201906, "balance_loss_mlp": 1.03393602, "epoch": 0.9692469562603336, "flos": 17675699483520.0, "grad_norm": 1.8129247349730013, "language_loss": 0.6993928, "learning_rate": 9.311502510459313e-09, "loss": 0.72081208, "num_input_tokens_seen": 347908685, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.671875, "step": 16121, "time_per_iteration": 2.583998680114746 }, { "auxiliary_loss_clip": 0.01109607, "auxiliary_loss_mlp": 0.01030885, "balance_loss_clip": 1.01893985, "balance_loss_mlp": 1.03321171, "epoch": 0.9693070795130017, "flos": 26834392483200.0, "grad_norm": 2.151798578083138, "language_loss": 0.68955266, "learning_rate": 9.275122100704357e-09, "loss": 0.71095765, "num_input_tokens_seen": 347926385, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 16122, "time_per_iteration": 2.5289323329925537 }, { "auxiliary_loss_clip": 0.01127311, "auxiliary_loss_mlp": 0.01026354, "balance_loss_clip": 1.01537514, "balance_loss_mlp": 1.03204632, "epoch": 0.9693672027656697, "flos": 18077288515200.0, "grad_norm": 1.7533465892142908, "language_loss": 0.75886792, "learning_rate": 9.238812734799762e-09, "loss": 0.78040457, "num_input_tokens_seen": 347945290, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6796875, "step": 16123, "time_per_iteration": 2.53078031539917 }, { "auxiliary_loss_clip": 0.01128322, "auxiliary_loss_mlp": 0.01029956, "balance_loss_clip": 1.01757586, "balance_loss_mlp": 1.032547, "epoch": 0.9694273260183376, "flos": 21763015994880.0, "grad_norm": 1.5788626656047162, "language_loss": 0.74484229, "learning_rate": 9.20257441404093e-09, "loss": 0.76642513, "num_input_tokens_seen": 347966330, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 16124, "time_per_iteration": 2.5406100749969482 }, { "auxiliary_loss_clip": 0.01111829, "auxiliary_loss_mlp": 0.01033678, "balance_loss_clip": 1.02140486, "balance_loss_mlp": 1.03363037, "epoch": 0.9694874492710056, "flos": 24426115269120.0, "grad_norm": 2.1692906254940554, "language_loss": 0.74572271, "learning_rate": 9.166407139721277e-09, "loss": 0.7671777, "num_input_tokens_seen": 347982590, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 16125, "time_per_iteration": 2.5313987731933594 }, { "auxiliary_loss_clip": 0.0103103, "auxiliary_loss_mlp": 0.01001515, "balance_loss_clip": 1.00027525, "balance_loss_mlp": 1.00116587, "epoch": 0.9695475725236735, "flos": 67650748237440.0, "grad_norm": 0.8797022929536058, "language_loss": 0.6148479, "learning_rate": 9.130310913131545e-09, "loss": 0.63517332, "num_input_tokens_seen": 348043310, "router_z_loss_clip": 0.01239014, "router_z_loss_mlp": 0.2109375, "step": 16126, "time_per_iteration": 3.0916330814361572 }, { "auxiliary_loss_clip": 0.01110088, "auxiliary_loss_mlp": 0.01033403, "balance_loss_clip": 1.02135646, "balance_loss_mlp": 1.03442574, "epoch": 0.9696076957763415, "flos": 19129326981120.0, "grad_norm": 1.731050634704256, "language_loss": 0.74885452, "learning_rate": 9.094285735559815e-09, "loss": 0.77028942, "num_input_tokens_seen": 348062200, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66796875, "step": 16127, "time_per_iteration": 2.666013479232788 }, { "auxiliary_loss_clip": 0.01138397, "auxiliary_loss_mlp": 0.01034629, "balance_loss_clip": 1.02192688, "balance_loss_mlp": 1.03327656, "epoch": 0.9696678190290094, "flos": 15486836497920.0, "grad_norm": 1.5740988764403407, "language_loss": 0.69157082, "learning_rate": 9.058331608291947e-09, "loss": 0.71330106, "num_input_tokens_seen": 348080685, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 16128, "time_per_iteration": 2.599975347518921 }, { "auxiliary_loss_clip": 0.01130167, "auxiliary_loss_mlp": 0.01033544, "balance_loss_clip": 1.02075243, "balance_loss_mlp": 1.03551555, "epoch": 0.9697279422816775, "flos": 36208692869760.0, "grad_norm": 1.6188361444970512, "language_loss": 0.65464205, "learning_rate": 9.022448532610916e-09, "loss": 0.67627919, "num_input_tokens_seen": 348102500, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 16129, "time_per_iteration": 2.71112060546875 }, { "auxiliary_loss_clip": 0.01132395, "auxiliary_loss_mlp": 0.01029312, "balance_loss_clip": 1.01665175, "balance_loss_mlp": 1.03378844, "epoch": 0.9697880655343454, "flos": 25484007651840.0, "grad_norm": 3.3083431194257265, "language_loss": 0.6290701, "learning_rate": 8.986636509797253e-09, "loss": 0.6506871, "num_input_tokens_seen": 348122515, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 16130, "time_per_iteration": 2.604879856109619 }, { "auxiliary_loss_clip": 0.01115217, "auxiliary_loss_mlp": 0.0103138, "balance_loss_clip": 1.01914275, "balance_loss_mlp": 1.03511953, "epoch": 0.9698481887870134, "flos": 15333533251200.0, "grad_norm": 2.7861485252271416, "language_loss": 0.70082915, "learning_rate": 8.950895541128823e-09, "loss": 0.72229511, "num_input_tokens_seen": 348138775, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 16131, "time_per_iteration": 2.519237756729126 }, { "auxiliary_loss_clip": 0.01103968, "auxiliary_loss_mlp": 0.01276679, "balance_loss_clip": 1.01800096, "balance_loss_mlp": 1.03413486, "epoch": 0.9699083120396813, "flos": 21982250655360.0, "grad_norm": 1.6635790051293196, "language_loss": 0.76430213, "learning_rate": 8.915225627881717e-09, "loss": 0.78810859, "num_input_tokens_seen": 348157115, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69921875, "step": 16132, "time_per_iteration": 4.019611597061157 }, { "auxiliary_loss_clip": 0.01132898, "auxiliary_loss_mlp": 0.01037536, "balance_loss_clip": 1.02466726, "balance_loss_mlp": 1.03632605, "epoch": 0.9699684352923493, "flos": 15664055224320.0, "grad_norm": 3.7426096586557933, "language_loss": 0.79392743, "learning_rate": 8.879626771328252e-09, "loss": 0.81563175, "num_input_tokens_seen": 348173035, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 16133, "time_per_iteration": 2.6689088344573975 }, { "auxiliary_loss_clip": 0.01117254, "auxiliary_loss_mlp": 0.01027235, "balance_loss_clip": 1.01574326, "balance_loss_mlp": 1.03304696, "epoch": 0.9700285585450172, "flos": 21908382336000.0, "grad_norm": 3.27974118827183, "language_loss": 0.64771116, "learning_rate": 8.844098972739189e-09, "loss": 0.66915607, "num_input_tokens_seen": 348192960, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 16134, "time_per_iteration": 2.6354517936706543 }, { "auxiliary_loss_clip": 0.01119661, "auxiliary_loss_mlp": 0.01031482, "balance_loss_clip": 1.0191741, "balance_loss_mlp": 1.03311896, "epoch": 0.9700886817976853, "flos": 23914890950400.0, "grad_norm": 3.4392810843973787, "language_loss": 0.80997372, "learning_rate": 8.808642233382402e-09, "loss": 0.83148515, "num_input_tokens_seen": 348212805, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 16135, "time_per_iteration": 2.6206655502319336 }, { "auxiliary_loss_clip": 0.01113509, "auxiliary_loss_mlp": 0.01034189, "balance_loss_clip": 1.02056372, "balance_loss_mlp": 1.03327703, "epoch": 0.9701488050503532, "flos": 25447845634560.0, "grad_norm": 1.7349213904088099, "language_loss": 0.7309345, "learning_rate": 8.773256554523323e-09, "loss": 0.75241148, "num_input_tokens_seen": 348232900, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 16136, "time_per_iteration": 2.609377861022949 }, { "auxiliary_loss_clip": 0.01100247, "auxiliary_loss_mlp": 0.01032335, "balance_loss_clip": 1.02145767, "balance_loss_mlp": 1.0333612, "epoch": 0.9702089283030212, "flos": 20590855470720.0, "grad_norm": 1.437169538230184, "language_loss": 0.76090509, "learning_rate": 8.737941937424276e-09, "loss": 0.78223085, "num_input_tokens_seen": 348253065, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.66796875, "step": 16137, "time_per_iteration": 2.6262927055358887 }, { "auxiliary_loss_clip": 0.0105812, "auxiliary_loss_mlp": 0.01001937, "balance_loss_clip": 1.00068557, "balance_loss_mlp": 1.00113726, "epoch": 0.9702690515556892, "flos": 70651516291200.0, "grad_norm": 0.7984988915590808, "language_loss": 0.54949272, "learning_rate": 8.702698383346474e-09, "loss": 0.57009327, "num_input_tokens_seen": 348316075, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.20996094, "step": 16138, "time_per_iteration": 3.3279306888580322 }, { "auxiliary_loss_clip": 0.01119351, "auxiliary_loss_mlp": 0.0102977, "balance_loss_clip": 1.01802826, "balance_loss_mlp": 1.03287411, "epoch": 0.9703291748083571, "flos": 35881439034240.0, "grad_norm": 1.675826285812541, "language_loss": 0.70627403, "learning_rate": 8.66752589354669e-09, "loss": 0.7277652, "num_input_tokens_seen": 348337605, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 16139, "time_per_iteration": 2.840691566467285 }, { "auxiliary_loss_clip": 0.01113931, "auxiliary_loss_mlp": 0.0102861, "balance_loss_clip": 1.01620018, "balance_loss_mlp": 1.03532338, "epoch": 0.9703892980610251, "flos": 24316479982080.0, "grad_norm": 1.4479263274494965, "language_loss": 0.72445631, "learning_rate": 8.632424469280807e-09, "loss": 0.74588174, "num_input_tokens_seen": 348359430, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 16140, "time_per_iteration": 4.07818603515625 }, { "auxiliary_loss_clip": 0.01115332, "auxiliary_loss_mlp": 0.01287112, "balance_loss_clip": 1.02711868, "balance_loss_mlp": 1.03506994, "epoch": 0.970449421313693, "flos": 18003743418240.0, "grad_norm": 2.5403097427070738, "language_loss": 0.68263787, "learning_rate": 8.597394111801382e-09, "loss": 0.7066623, "num_input_tokens_seen": 348377890, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 16141, "time_per_iteration": 2.5374951362609863 }, { "auxiliary_loss_clip": 0.01107007, "auxiliary_loss_mlp": 0.01031225, "balance_loss_clip": 1.01945281, "balance_loss_mlp": 1.03194857, "epoch": 0.9705095445663611, "flos": 25337994865920.0, "grad_norm": 2.331852683389298, "language_loss": 0.68404222, "learning_rate": 8.5624348223583e-09, "loss": 0.70542449, "num_input_tokens_seen": 348396550, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 16142, "time_per_iteration": 2.5964019298553467 }, { "auxiliary_loss_clip": 0.01135408, "auxiliary_loss_mlp": 0.01027216, "balance_loss_clip": 1.01522994, "balance_loss_mlp": 1.03197074, "epoch": 0.970569667819029, "flos": 26833602384000.0, "grad_norm": 1.7594104778803932, "language_loss": 0.55441356, "learning_rate": 8.527546602199453e-09, "loss": 0.57603979, "num_input_tokens_seen": 348417120, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 16143, "time_per_iteration": 2.674731969833374 }, { "auxiliary_loss_clip": 0.01111906, "auxiliary_loss_mlp": 0.01031594, "balance_loss_clip": 1.02004242, "balance_loss_mlp": 1.03400016, "epoch": 0.970629791071697, "flos": 25848644567040.0, "grad_norm": 2.189548542737318, "language_loss": 0.6822319, "learning_rate": 8.492729452569624e-09, "loss": 0.70366693, "num_input_tokens_seen": 348437750, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6953125, "step": 16144, "time_per_iteration": 2.6929845809936523 }, { "auxiliary_loss_clip": 0.01111056, "auxiliary_loss_mlp": 0.01038036, "balance_loss_clip": 1.02672863, "balance_loss_mlp": 1.03485894, "epoch": 0.9706899143243649, "flos": 36540184510080.0, "grad_norm": 1.7679213547559673, "language_loss": 0.72339535, "learning_rate": 8.45798337471182e-09, "loss": 0.74488628, "num_input_tokens_seen": 348460935, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.67578125, "step": 16145, "time_per_iteration": 2.775803327560425 }, { "auxiliary_loss_clip": 0.01121033, "auxiliary_loss_mlp": 0.01030377, "balance_loss_clip": 1.01705575, "balance_loss_mlp": 1.03248203, "epoch": 0.9707500375770329, "flos": 12268234414080.0, "grad_norm": 2.8592265763740614, "language_loss": 0.80446196, "learning_rate": 8.423308369865267e-09, "loss": 0.82597601, "num_input_tokens_seen": 348474480, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 16146, "time_per_iteration": 2.502317428588867 }, { "auxiliary_loss_clip": 0.01048142, "auxiliary_loss_mlp": 0.0100253, "balance_loss_clip": 1.00128984, "balance_loss_mlp": 1.00109148, "epoch": 0.9708101608297008, "flos": 60853040196480.0, "grad_norm": 0.7722170944350474, "language_loss": 0.54561931, "learning_rate": 8.388704439268313e-09, "loss": 0.56612599, "num_input_tokens_seen": 348541220, "router_z_loss_clip": 0.01239014, "router_z_loss_mlp": 0.21191406, "step": 16147, "time_per_iteration": 5.353432893753052 }, { "auxiliary_loss_clip": 0.01132951, "auxiliary_loss_mlp": 0.01279508, "balance_loss_clip": 1.01899481, "balance_loss_mlp": 1.0343852, "epoch": 0.9708702840823689, "flos": 27124766029440.0, "grad_norm": 3.004259814014648, "language_loss": 0.61397564, "learning_rate": 8.3541715841553e-09, "loss": 0.63810021, "num_input_tokens_seen": 348559230, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71484375, "step": 16148, "time_per_iteration": 4.091493606567383 }, { "auxiliary_loss_clip": 0.01127501, "auxiliary_loss_mlp": 0.01037237, "balance_loss_clip": 1.02522087, "balance_loss_mlp": 1.03364182, "epoch": 0.9709304073350368, "flos": 20777699041920.0, "grad_norm": 1.8707252186885723, "language_loss": 0.73463154, "learning_rate": 8.319709805758801e-09, "loss": 0.75627893, "num_input_tokens_seen": 348577850, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 16149, "time_per_iteration": 2.6840693950653076 }, { "auxiliary_loss_clip": 0.01109449, "auxiliary_loss_mlp": 0.01038585, "balance_loss_clip": 1.02612209, "balance_loss_mlp": 1.03133106, "epoch": 0.9709905305877048, "flos": 41934541115520.0, "grad_norm": 2.121772827667487, "language_loss": 0.75355875, "learning_rate": 8.285319105308496e-09, "loss": 0.77503914, "num_input_tokens_seen": 348598345, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 16150, "time_per_iteration": 2.763049602508545 }, { "auxiliary_loss_clip": 0.01126498, "auxiliary_loss_mlp": 0.01032107, "balance_loss_clip": 1.0175693, "balance_loss_mlp": 1.03579593, "epoch": 0.9710506538403728, "flos": 17165588486400.0, "grad_norm": 2.0107614115933647, "language_loss": 0.74228275, "learning_rate": 8.250999484032073e-09, "loss": 0.76386875, "num_input_tokens_seen": 348616300, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.734375, "step": 16151, "time_per_iteration": 2.5870862007141113 }, { "auxiliary_loss_clip": 0.0111032, "auxiliary_loss_mlp": 0.01029071, "balance_loss_clip": 1.01694143, "balance_loss_mlp": 1.03451478, "epoch": 0.9711107770930407, "flos": 21173470070400.0, "grad_norm": 1.878252630302768, "language_loss": 0.74544692, "learning_rate": 8.216750943153884e-09, "loss": 0.76684082, "num_input_tokens_seen": 348633845, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 16152, "time_per_iteration": 2.538478136062622 }, { "auxiliary_loss_clip": 0.01128096, "auxiliary_loss_mlp": 0.01034579, "balance_loss_clip": 1.02231872, "balance_loss_mlp": 1.03387558, "epoch": 0.9711709003457087, "flos": 22237072715520.0, "grad_norm": 3.9621658167151885, "language_loss": 0.67308736, "learning_rate": 8.182573483896505e-09, "loss": 0.69471407, "num_input_tokens_seen": 348653070, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.67578125, "step": 16153, "time_per_iteration": 2.6798157691955566 }, { "auxiliary_loss_clip": 0.01145795, "auxiliary_loss_mlp": 0.01029716, "balance_loss_clip": 1.01855218, "balance_loss_mlp": 1.03298092, "epoch": 0.9712310235983767, "flos": 26213856099840.0, "grad_norm": 2.76181547579473, "language_loss": 0.7053175, "learning_rate": 8.148467107479406e-09, "loss": 0.7270726, "num_input_tokens_seen": 348672145, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6796875, "step": 16154, "time_per_iteration": 2.6394383907318115 }, { "auxiliary_loss_clip": 0.01151005, "auxiliary_loss_mlp": 0.01032781, "balance_loss_clip": 1.01946568, "balance_loss_mlp": 1.03684473, "epoch": 0.9712911468510447, "flos": 20668171495680.0, "grad_norm": 2.5037627098757933, "language_loss": 0.81057668, "learning_rate": 8.11443181512006e-09, "loss": 0.83241451, "num_input_tokens_seen": 348690615, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 16155, "time_per_iteration": 2.765947103500366 }, { "auxiliary_loss_clip": 0.01115784, "auxiliary_loss_mlp": 0.01287371, "balance_loss_clip": 1.02711022, "balance_loss_mlp": 1.03485608, "epoch": 0.9713512701037126, "flos": 13552903313280.0, "grad_norm": 2.098869761792586, "language_loss": 0.67226875, "learning_rate": 8.080467608032826e-09, "loss": 0.69630039, "num_input_tokens_seen": 348708665, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 16156, "time_per_iteration": 2.631984233856201 }, { "auxiliary_loss_clip": 0.01112283, "auxiliary_loss_mlp": 0.01035159, "balance_loss_clip": 1.02320254, "balance_loss_mlp": 1.03274083, "epoch": 0.9714113933563806, "flos": 25848752307840.0, "grad_norm": 2.397403141782249, "language_loss": 0.90883756, "learning_rate": 8.046574487429625e-09, "loss": 0.93031204, "num_input_tokens_seen": 348726105, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.70703125, "step": 16157, "time_per_iteration": 2.6669037342071533 }, { "auxiliary_loss_clip": 0.0113223, "auxiliary_loss_mlp": 0.0127769, "balance_loss_clip": 1.01922369, "balance_loss_mlp": 1.03576469, "epoch": 0.9714715166090485, "flos": 12743081233920.0, "grad_norm": 1.8503935993653289, "language_loss": 0.72697395, "learning_rate": 8.0127524545206e-09, "loss": 0.75107312, "num_input_tokens_seen": 348743360, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6953125, "step": 16158, "time_per_iteration": 2.572038412094116 }, { "auxiliary_loss_clip": 0.01136392, "auxiliary_loss_mlp": 0.01034241, "balance_loss_clip": 1.02209949, "balance_loss_mlp": 1.03390193, "epoch": 0.9715316398617165, "flos": 18405547931520.0, "grad_norm": 1.9886262561334744, "language_loss": 0.59694284, "learning_rate": 7.97900151051234e-09, "loss": 0.61864918, "num_input_tokens_seen": 348759045, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.66796875, "step": 16159, "time_per_iteration": 2.6831448078155518 }, { "auxiliary_loss_clip": 0.01097118, "auxiliary_loss_mlp": 0.01026722, "balance_loss_clip": 1.01577318, "balance_loss_mlp": 1.03260064, "epoch": 0.9715917631143844, "flos": 28913799749760.0, "grad_norm": 1.678451188983067, "language_loss": 0.79058528, "learning_rate": 7.945321656609216e-09, "loss": 0.81182367, "num_input_tokens_seen": 348779910, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.64453125, "step": 16160, "time_per_iteration": 2.6043124198913574 }, { "auxiliary_loss_clip": 0.01022235, "auxiliary_loss_mlp": 0.01002992, "balance_loss_clip": 1.00169301, "balance_loss_mlp": 1.00106072, "epoch": 0.9716518863670525, "flos": 65939712900480.0, "grad_norm": 0.7743234406558969, "language_loss": 0.54266453, "learning_rate": 7.9117128940136e-09, "loss": 0.56291676, "num_input_tokens_seen": 348838995, "router_z_loss_clip": 0.01300049, "router_z_loss_mlp": 0.2109375, "step": 16161, "time_per_iteration": 3.1411447525024414 }, { "auxiliary_loss_clip": 0.01107521, "auxiliary_loss_mlp": 0.01277218, "balance_loss_clip": 1.01892042, "balance_loss_mlp": 1.03368187, "epoch": 0.9717120096197204, "flos": 17712759340800.0, "grad_norm": 1.6659132099428946, "language_loss": 0.71614337, "learning_rate": 7.878175223924754e-09, "loss": 0.73999071, "num_input_tokens_seen": 348858090, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6484375, "step": 16162, "time_per_iteration": 2.7079131603240967 }, { "auxiliary_loss_clip": 0.0112557, "auxiliary_loss_mlp": 0.0102837, "balance_loss_clip": 1.01542974, "balance_loss_mlp": 1.03178191, "epoch": 0.9717721328723884, "flos": 24463426521600.0, "grad_norm": 1.545230382068068, "language_loss": 0.78517818, "learning_rate": 7.844708647539277e-09, "loss": 0.80671757, "num_input_tokens_seen": 348877885, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.671875, "step": 16163, "time_per_iteration": 2.586048126220703 }, { "auxiliary_loss_clip": 0.01119428, "auxiliary_loss_mlp": 0.0102703, "balance_loss_clip": 1.01559186, "balance_loss_mlp": 1.03405178, "epoch": 0.9718322561250564, "flos": 20776477979520.0, "grad_norm": 1.8034096598145335, "language_loss": 0.72757876, "learning_rate": 7.81131316605177e-09, "loss": 0.74904341, "num_input_tokens_seen": 348897720, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 16164, "time_per_iteration": 2.5815839767456055 }, { "auxiliary_loss_clip": 0.01147566, "auxiliary_loss_mlp": 0.01034535, "balance_loss_clip": 1.02219713, "balance_loss_mlp": 1.03380048, "epoch": 0.9718923793777243, "flos": 19025904746880.0, "grad_norm": 2.1578889209459957, "language_loss": 0.83970726, "learning_rate": 7.777988780654166e-09, "loss": 0.86152828, "num_input_tokens_seen": 348915410, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 16165, "time_per_iteration": 2.511948823928833 }, { "auxiliary_loss_clip": 0.01127182, "auxiliary_loss_mlp": 0.01024108, "balance_loss_clip": 1.01314652, "balance_loss_mlp": 1.03318155, "epoch": 0.9719525026303923, "flos": 21871717528320.0, "grad_norm": 1.545056120765243, "language_loss": 0.78967756, "learning_rate": 7.744735492535293e-09, "loss": 0.81119049, "num_input_tokens_seen": 348934335, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.671875, "step": 16166, "time_per_iteration": 2.583380937576294 }, { "auxiliary_loss_clip": 0.01120088, "auxiliary_loss_mlp": 0.01027241, "balance_loss_clip": 1.01591575, "balance_loss_mlp": 1.03427386, "epoch": 0.9720126258830603, "flos": 14429303251200.0, "grad_norm": 2.8062503973744515, "language_loss": 0.7059198, "learning_rate": 7.711553302882201e-09, "loss": 0.72739309, "num_input_tokens_seen": 348952405, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 16167, "time_per_iteration": 2.4876785278320312 }, { "auxiliary_loss_clip": 0.01118736, "auxiliary_loss_mlp": 0.01035826, "balance_loss_clip": 1.02425659, "balance_loss_mlp": 1.03480971, "epoch": 0.9720727491357283, "flos": 26251167352320.0, "grad_norm": 2.1556272172076216, "language_loss": 0.75640553, "learning_rate": 7.678442212879055e-09, "loss": 0.77795112, "num_input_tokens_seen": 348973580, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66015625, "step": 16168, "time_per_iteration": 2.633422613143921 }, { "auxiliary_loss_clip": 0.01140358, "auxiliary_loss_mlp": 0.01039375, "balance_loss_clip": 1.0264647, "balance_loss_mlp": 1.03502607, "epoch": 0.9721328723883962, "flos": 22674105492480.0, "grad_norm": 2.7032581270016722, "language_loss": 0.73101914, "learning_rate": 7.645402223707353e-09, "loss": 0.75281644, "num_input_tokens_seen": 348992035, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 16169, "time_per_iteration": 2.7033066749572754 }, { "auxiliary_loss_clip": 0.01130656, "auxiliary_loss_mlp": 0.0103171, "balance_loss_clip": 1.01885355, "balance_loss_mlp": 1.03259277, "epoch": 0.9721929956410642, "flos": 28074172360320.0, "grad_norm": 2.943746104367844, "language_loss": 0.5766288, "learning_rate": 7.612433336546376e-09, "loss": 0.59825242, "num_input_tokens_seen": 349013160, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 16170, "time_per_iteration": 2.7162880897521973 }, { "auxiliary_loss_clip": 0.0113018, "auxiliary_loss_mlp": 0.01032023, "balance_loss_clip": 1.01889253, "balance_loss_mlp": 1.03416848, "epoch": 0.9722531188937321, "flos": 20996251344000.0, "grad_norm": 1.7760171986769089, "language_loss": 0.71667415, "learning_rate": 7.579535552572514e-09, "loss": 0.73829615, "num_input_tokens_seen": 349033485, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 16171, "time_per_iteration": 2.647542953491211 }, { "auxiliary_loss_clip": 0.01128544, "auxiliary_loss_mlp": 0.01034527, "balance_loss_clip": 1.02302301, "balance_loss_mlp": 1.03319705, "epoch": 0.9723132421464001, "flos": 16070600332800.0, "grad_norm": 1.8834636910317428, "language_loss": 0.68498874, "learning_rate": 7.546708872960162e-09, "loss": 0.70661944, "num_input_tokens_seen": 349051705, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.68359375, "step": 16172, "time_per_iteration": 2.5967631340026855 }, { "auxiliary_loss_clip": 0.01110848, "auxiliary_loss_mlp": 0.01027836, "balance_loss_clip": 1.01577258, "balance_loss_mlp": 1.03426731, "epoch": 0.972373365399068, "flos": 27745769289600.0, "grad_norm": 1.7733237840838123, "language_loss": 0.86033261, "learning_rate": 7.513953298880382e-09, "loss": 0.88171947, "num_input_tokens_seen": 349070825, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.671875, "step": 16173, "time_per_iteration": 2.6447856426239014 }, { "auxiliary_loss_clip": 0.01143312, "auxiliary_loss_mlp": 0.01030163, "balance_loss_clip": 1.01650119, "balance_loss_mlp": 1.03461289, "epoch": 0.9724334886517361, "flos": 23002939526400.0, "grad_norm": 2.632492132304802, "language_loss": 0.64195031, "learning_rate": 7.481268831502241e-09, "loss": 0.66368508, "num_input_tokens_seen": 349089730, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 16174, "time_per_iteration": 4.13923978805542 }, { "auxiliary_loss_clip": 0.0111078, "auxiliary_loss_mlp": 0.01027188, "balance_loss_clip": 1.015553, "balance_loss_mlp": 1.03456342, "epoch": 0.972493611904404, "flos": 25447055535360.0, "grad_norm": 1.5130450539520413, "language_loss": 0.78157949, "learning_rate": 7.44865547199236e-09, "loss": 0.8029592, "num_input_tokens_seen": 349111315, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 16175, "time_per_iteration": 2.6896915435791016 }, { "auxiliary_loss_clip": 0.01110776, "auxiliary_loss_mlp": 0.01032138, "balance_loss_clip": 1.01894116, "balance_loss_mlp": 1.03341985, "epoch": 0.972553735157072, "flos": 16983054547200.0, "grad_norm": 2.060041792972681, "language_loss": 0.56245506, "learning_rate": 7.416113221514475e-09, "loss": 0.58388418, "num_input_tokens_seen": 349129495, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.68359375, "step": 16176, "time_per_iteration": 2.479069232940674 }, { "auxiliary_loss_clip": 0.01121297, "auxiliary_loss_mlp": 0.01029239, "balance_loss_clip": 1.01802135, "balance_loss_mlp": 1.0361619, "epoch": 0.97261385840974, "flos": 26104651776000.0, "grad_norm": 2.389445062006945, "language_loss": 0.72548091, "learning_rate": 7.383642081230102e-09, "loss": 0.74698627, "num_input_tokens_seen": 349148850, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.671875, "step": 16177, "time_per_iteration": 2.539135694503784 }, { "auxiliary_loss_clip": 0.01119932, "auxiliary_loss_mlp": 0.01027959, "balance_loss_clip": 1.01582992, "balance_loss_mlp": 1.03252316, "epoch": 0.9726739816624079, "flos": 25447881548160.0, "grad_norm": 1.7059346503500201, "language_loss": 0.68196875, "learning_rate": 7.351242052297646e-09, "loss": 0.7034477, "num_input_tokens_seen": 349167620, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 16178, "time_per_iteration": 2.5514323711395264 }, { "auxiliary_loss_clip": 0.0114808, "auxiliary_loss_mlp": 0.01029115, "balance_loss_clip": 1.01638353, "balance_loss_mlp": 1.03316629, "epoch": 0.972734104915076, "flos": 29014923513600.0, "grad_norm": 2.053670179528779, "language_loss": 0.67398345, "learning_rate": 7.318913135873961e-09, "loss": 0.69575536, "num_input_tokens_seen": 349185845, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 16179, "time_per_iteration": 2.598569631576538 }, { "auxiliary_loss_clip": 0.01117708, "auxiliary_loss_mlp": 0.01282265, "balance_loss_clip": 1.02396631, "balance_loss_mlp": 1.03431511, "epoch": 0.9727942281677439, "flos": 23437637919360.0, "grad_norm": 1.671423704903274, "language_loss": 0.76960021, "learning_rate": 7.286655333112124e-09, "loss": 0.79359996, "num_input_tokens_seen": 349204525, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 16180, "time_per_iteration": 2.5744683742523193 }, { "auxiliary_loss_clip": 0.01113855, "auxiliary_loss_mlp": 0.01031232, "balance_loss_clip": 1.01902485, "balance_loss_mlp": 1.03356361, "epoch": 0.9728543514204119, "flos": 31724599749120.0, "grad_norm": 1.8321197767265722, "language_loss": 0.76661956, "learning_rate": 7.2544686451641024e-09, "loss": 0.78807044, "num_input_tokens_seen": 349228075, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71875, "step": 16181, "time_per_iteration": 2.738847255706787 }, { "auxiliary_loss_clip": 0.01100207, "auxiliary_loss_mlp": 0.01030544, "balance_loss_clip": 1.01864076, "balance_loss_mlp": 1.03263211, "epoch": 0.9729144746730798, "flos": 16289368116480.0, "grad_norm": 2.902051743677603, "language_loss": 0.63187909, "learning_rate": 7.222353073177867e-09, "loss": 0.65318662, "num_input_tokens_seen": 349246990, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 16182, "time_per_iteration": 3.8670499324798584 }, { "auxiliary_loss_clip": 0.01166677, "auxiliary_loss_mlp": 0.01033921, "balance_loss_clip": 1.02217865, "balance_loss_mlp": 1.03526807, "epoch": 0.9729745979257478, "flos": 25041408266880.0, "grad_norm": 2.3136510422957124, "language_loss": 0.6229651, "learning_rate": 7.190308618299834e-09, "loss": 0.64497101, "num_input_tokens_seen": 349265890, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 16183, "time_per_iteration": 2.627042531967163 }, { "auxiliary_loss_clip": 0.01118094, "auxiliary_loss_mlp": 0.01034667, "balance_loss_clip": 1.02231061, "balance_loss_mlp": 1.03365099, "epoch": 0.9730347211784157, "flos": 22638733574400.0, "grad_norm": 1.6705574965919097, "language_loss": 0.78331888, "learning_rate": 7.158335281673533e-09, "loss": 0.80484647, "num_input_tokens_seen": 349285275, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.66796875, "step": 16184, "time_per_iteration": 2.6130173206329346 }, { "auxiliary_loss_clip": 0.01126286, "auxiliary_loss_mlp": 0.01028623, "balance_loss_clip": 1.01638639, "balance_loss_mlp": 1.03234291, "epoch": 0.9730948444310837, "flos": 10998613313280.0, "grad_norm": 2.526961175713559, "language_loss": 0.79140639, "learning_rate": 7.1264330644402734e-09, "loss": 0.8129555, "num_input_tokens_seen": 349301515, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 16185, "time_per_iteration": 2.512036085128784 }, { "auxiliary_loss_clip": 0.01129523, "auxiliary_loss_mlp": 0.01028658, "balance_loss_clip": 1.01713085, "balance_loss_mlp": 1.03408849, "epoch": 0.9731549676837516, "flos": 16799479113600.0, "grad_norm": 1.7251249828569428, "language_loss": 0.77554774, "learning_rate": 7.094601967738256e-09, "loss": 0.79712957, "num_input_tokens_seen": 349319590, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 16186, "time_per_iteration": 2.5542168617248535 }, { "auxiliary_loss_clip": 0.01116998, "auxiliary_loss_mlp": 0.01027252, "balance_loss_clip": 1.01525366, "balance_loss_mlp": 1.03246439, "epoch": 0.9732150909364197, "flos": 21141761339520.0, "grad_norm": 1.7204457468972028, "language_loss": 0.65498555, "learning_rate": 7.062841992703461e-09, "loss": 0.67642808, "num_input_tokens_seen": 349339230, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 16187, "time_per_iteration": 2.6380035877227783 }, { "auxiliary_loss_clip": 0.01117539, "auxiliary_loss_mlp": 0.01027766, "balance_loss_clip": 1.0157913, "balance_loss_mlp": 1.03273487, "epoch": 0.9732752141890876, "flos": 33727337435520.0, "grad_norm": 1.6675033407606685, "language_loss": 0.80224431, "learning_rate": 7.031153140469425e-09, "loss": 0.82369733, "num_input_tokens_seen": 349361155, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66796875, "step": 16188, "time_per_iteration": 2.6922190189361572 }, { "auxiliary_loss_clip": 0.01124837, "auxiliary_loss_mlp": 0.01029994, "balance_loss_clip": 1.01864481, "balance_loss_mlp": 1.03217685, "epoch": 0.9733353374417556, "flos": 23404384903680.0, "grad_norm": 1.7889777335784824, "language_loss": 0.78195143, "learning_rate": 6.999535412166802e-09, "loss": 0.8034997, "num_input_tokens_seen": 349379335, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66015625, "step": 16189, "time_per_iteration": 5.525079250335693 }, { "auxiliary_loss_clip": 0.01108065, "auxiliary_loss_mlp": 0.01025108, "balance_loss_clip": 1.01299655, "balance_loss_mlp": 1.03143358, "epoch": 0.9733954606944236, "flos": 39165792963840.0, "grad_norm": 2.171029029871213, "language_loss": 0.51079309, "learning_rate": 6.967988808924463e-09, "loss": 0.53212488, "num_input_tokens_seen": 349401575, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.67578125, "step": 16190, "time_per_iteration": 2.7238121032714844 }, { "auxiliary_loss_clip": 0.01109619, "auxiliary_loss_mlp": 0.01027511, "balance_loss_clip": 1.0152266, "balance_loss_mlp": 1.03241634, "epoch": 0.9734555839470915, "flos": 21981819692160.0, "grad_norm": 1.5934256243833214, "language_loss": 0.81168318, "learning_rate": 6.936513331867733e-09, "loss": 0.83305448, "num_input_tokens_seen": 349420650, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 16191, "time_per_iteration": 2.5447165966033936 }, { "auxiliary_loss_clip": 0.01108681, "auxiliary_loss_mlp": 0.01031147, "balance_loss_clip": 1.01977408, "balance_loss_mlp": 1.0319829, "epoch": 0.9735157071997596, "flos": 16575539771520.0, "grad_norm": 1.6248488347000198, "language_loss": 0.82820255, "learning_rate": 6.905108982120156e-09, "loss": 0.84960085, "num_input_tokens_seen": 349436830, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6796875, "step": 16192, "time_per_iteration": 2.515082359313965 }, { "auxiliary_loss_clip": 0.01150288, "auxiliary_loss_mlp": 0.01030865, "balance_loss_clip": 1.01877129, "balance_loss_mlp": 1.03723383, "epoch": 0.9735758304524275, "flos": 20223237726720.0, "grad_norm": 2.3785242320486013, "language_loss": 0.75119334, "learning_rate": 6.8737757608021695e-09, "loss": 0.77300489, "num_input_tokens_seen": 349454325, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 16193, "time_per_iteration": 2.685973882675171 }, { "auxiliary_loss_clip": 0.01116465, "auxiliary_loss_mlp": 0.0103245, "balance_loss_clip": 1.02104747, "balance_loss_mlp": 1.03248537, "epoch": 0.9736359537050955, "flos": 23653353047040.0, "grad_norm": 1.7793413295319618, "language_loss": 0.8538124, "learning_rate": 6.842513669032435e-09, "loss": 0.8753016, "num_input_tokens_seen": 349470230, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66015625, "step": 16194, "time_per_iteration": 2.523879289627075 }, { "auxiliary_loss_clip": 0.01128621, "auxiliary_loss_mlp": 0.01034296, "balance_loss_clip": 1.02207077, "balance_loss_mlp": 1.03529215, "epoch": 0.9736960769577634, "flos": 15560202026880.0, "grad_norm": 1.5599668289385515, "language_loss": 0.75789595, "learning_rate": 6.811322707926281e-09, "loss": 0.77952504, "num_input_tokens_seen": 349486250, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6640625, "step": 16195, "time_per_iteration": 2.5023906230926514 }, { "auxiliary_loss_clip": 0.01048494, "auxiliary_loss_mlp": 0.01001081, "balance_loss_clip": 0.99983543, "balance_loss_mlp": 1.0007962, "epoch": 0.9737562002104314, "flos": 65351783088000.0, "grad_norm": 0.7055043989305859, "language_loss": 0.52442622, "learning_rate": 6.780202878597041e-09, "loss": 0.54492199, "num_input_tokens_seen": 349545865, "router_z_loss_clip": 0.01245117, "router_z_loss_mlp": 0.2109375, "step": 16196, "time_per_iteration": 3.1096887588500977 }, { "auxiliary_loss_clip": 0.01118673, "auxiliary_loss_mlp": 0.01029852, "balance_loss_clip": 1.01791942, "balance_loss_mlp": 1.03327763, "epoch": 0.9738163234630993, "flos": 27196730928000.0, "grad_norm": 2.497127570374081, "language_loss": 0.76282907, "learning_rate": 6.749154182154937e-09, "loss": 0.78431427, "num_input_tokens_seen": 349566080, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 16197, "time_per_iteration": 2.6259748935699463 }, { "auxiliary_loss_clip": 0.01105093, "auxiliary_loss_mlp": 0.01028901, "balance_loss_clip": 1.01627672, "balance_loss_mlp": 1.03347349, "epoch": 0.9738764467157673, "flos": 21069365477760.0, "grad_norm": 2.7360937295321017, "language_loss": 0.67822504, "learning_rate": 6.7181766197084154e-09, "loss": 0.69956499, "num_input_tokens_seen": 349585665, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 16198, "time_per_iteration": 2.511733055114746 }, { "auxiliary_loss_clip": 0.01100978, "auxiliary_loss_mlp": 0.01027871, "balance_loss_clip": 1.01523519, "balance_loss_mlp": 1.0318656, "epoch": 0.9739365699684353, "flos": 21361211481600.0, "grad_norm": 2.0707306995028265, "language_loss": 0.7806527, "learning_rate": 6.687270192363037e-09, "loss": 0.80194116, "num_input_tokens_seen": 349605125, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 16199, "time_per_iteration": 2.6570253372192383 }, { "auxiliary_loss_clip": 0.01119173, "auxiliary_loss_mlp": 0.0103505, "balance_loss_clip": 1.02264678, "balance_loss_mlp": 1.0334785, "epoch": 0.9739966932211033, "flos": 21902061542400.0, "grad_norm": 1.6750157315359893, "language_loss": 0.79344285, "learning_rate": 6.656434901221253e-09, "loss": 0.81498504, "num_input_tokens_seen": 349623360, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 16200, "time_per_iteration": 2.56459379196167 }, { "auxiliary_loss_clip": 0.01153661, "auxiliary_loss_mlp": 0.01037097, "balance_loss_clip": 1.0237273, "balance_loss_mlp": 1.03580046, "epoch": 0.9740568164737712, "flos": 24827345164800.0, "grad_norm": 2.1386220841515993, "language_loss": 0.68698573, "learning_rate": 6.6256707473839604e-09, "loss": 0.70889336, "num_input_tokens_seen": 349644390, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 16201, "time_per_iteration": 2.5993430614471436 }, { "auxiliary_loss_clip": 0.01128525, "auxiliary_loss_mlp": 0.01030156, "balance_loss_clip": 1.01843166, "balance_loss_mlp": 1.03275895, "epoch": 0.9741169397264392, "flos": 23623583650560.0, "grad_norm": 1.4265720017849004, "language_loss": 0.72683799, "learning_rate": 6.594977731948948e-09, "loss": 0.74842483, "num_input_tokens_seen": 349663200, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 16202, "time_per_iteration": 2.568986177444458 }, { "auxiliary_loss_clip": 0.0112306, "auxiliary_loss_mlp": 0.01032198, "balance_loss_clip": 1.01972878, "balance_loss_mlp": 1.03588378, "epoch": 0.9741770629791072, "flos": 18841144164480.0, "grad_norm": 2.0553666289307295, "language_loss": 0.72700894, "learning_rate": 6.564355856011561e-09, "loss": 0.7485615, "num_input_tokens_seen": 349681975, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 16203, "time_per_iteration": 2.495438575744629 }, { "auxiliary_loss_clip": 0.01102294, "auxiliary_loss_mlp": 0.01030861, "balance_loss_clip": 1.0189817, "balance_loss_mlp": 1.03407407, "epoch": 0.9742371862317751, "flos": 22346241125760.0, "grad_norm": 2.157202712231575, "language_loss": 0.77427709, "learning_rate": 6.533805120664482e-09, "loss": 0.79560858, "num_input_tokens_seen": 349701185, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 16204, "time_per_iteration": 2.54093337059021 }, { "auxiliary_loss_clip": 0.01122402, "auxiliary_loss_mlp": 0.01033847, "balance_loss_clip": 1.02060258, "balance_loss_mlp": 1.03478134, "epoch": 0.9742973094844432, "flos": 21762764599680.0, "grad_norm": 2.1779079257289853, "language_loss": 0.79437554, "learning_rate": 6.503325526998171e-09, "loss": 0.815938, "num_input_tokens_seen": 349720360, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 16205, "time_per_iteration": 2.547654867172241 }, { "auxiliary_loss_clip": 0.01101177, "auxiliary_loss_mlp": 0.01028425, "balance_loss_clip": 1.01654601, "balance_loss_mlp": 1.03421295, "epoch": 0.9743574327371111, "flos": 26248725227520.0, "grad_norm": 1.5291928228559144, "language_loss": 0.74265456, "learning_rate": 6.472917076099982e-09, "loss": 0.76395053, "num_input_tokens_seen": 349741040, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 16206, "time_per_iteration": 2.762212038040161 }, { "auxiliary_loss_clip": 0.01146255, "auxiliary_loss_mlp": 0.01030097, "balance_loss_clip": 1.01699042, "balance_loss_mlp": 1.03200269, "epoch": 0.9744175559897791, "flos": 12349321367040.0, "grad_norm": 20.2352582157215, "language_loss": 0.83855426, "learning_rate": 6.442579769055711e-09, "loss": 0.86031777, "num_input_tokens_seen": 349758895, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 16207, "time_per_iteration": 2.621980667114258 }, { "auxiliary_loss_clip": 0.01109159, "auxiliary_loss_mlp": 0.01031808, "balance_loss_clip": 1.01950574, "balance_loss_mlp": 1.0327816, "epoch": 0.974477679242447, "flos": 28397834835840.0, "grad_norm": 6.697795462371614, "language_loss": 0.70531529, "learning_rate": 6.4123136069476054e-09, "loss": 0.72672498, "num_input_tokens_seen": 349779740, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.67578125, "step": 16208, "time_per_iteration": 2.588914394378662 }, { "auxiliary_loss_clip": 0.01122729, "auxiliary_loss_mlp": 0.01025942, "balance_loss_clip": 1.01561832, "balance_loss_mlp": 1.03325486, "epoch": 0.974537802495115, "flos": 17785370684160.0, "grad_norm": 1.9808458228904948, "language_loss": 0.77537978, "learning_rate": 6.382118590856134e-09, "loss": 0.79686642, "num_input_tokens_seen": 349796820, "router_z_loss_clip": 0.10351562, "router_z_loss_mlp": 0.62890625, "step": 16209, "time_per_iteration": 2.583343505859375 }, { "auxiliary_loss_clip": 0.0113163, "auxiliary_loss_mlp": 0.01279016, "balance_loss_clip": 1.01968741, "balance_loss_mlp": 1.03447616, "epoch": 0.9745979257477829, "flos": 23842315520640.0, "grad_norm": 1.6043528737411532, "language_loss": 0.78637809, "learning_rate": 6.351994721858433e-09, "loss": 0.81048459, "num_input_tokens_seen": 349816550, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.70703125, "step": 16210, "time_per_iteration": 2.630814552307129 }, { "auxiliary_loss_clip": 0.01120573, "auxiliary_loss_mlp": 0.01033455, "balance_loss_clip": 1.02182651, "balance_loss_mlp": 1.03326762, "epoch": 0.9746580490004509, "flos": 27016172236800.0, "grad_norm": 2.50092903776312, "language_loss": 0.78081942, "learning_rate": 6.321942001029867e-09, "loss": 0.8023597, "num_input_tokens_seen": 349834350, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 16211, "time_per_iteration": 2.602246046066284 }, { "auxiliary_loss_clip": 0.01134394, "auxiliary_loss_mlp": 0.01033667, "balance_loss_clip": 1.01989818, "balance_loss_mlp": 1.03474534, "epoch": 0.9747181722531189, "flos": 19792022952960.0, "grad_norm": 3.0120923043490224, "language_loss": 0.78212351, "learning_rate": 6.291960429442911e-09, "loss": 0.8038041, "num_input_tokens_seen": 349853460, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 16212, "time_per_iteration": 2.5717387199401855 }, { "auxiliary_loss_clip": 0.01118133, "auxiliary_loss_mlp": 0.01031233, "balance_loss_clip": 1.01983047, "balance_loss_mlp": 1.03359818, "epoch": 0.9747782955057869, "flos": 31430598929280.0, "grad_norm": 1.434975977986914, "language_loss": 0.80154371, "learning_rate": 6.262050008167152e-09, "loss": 0.82303739, "num_input_tokens_seen": 349874830, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 16213, "time_per_iteration": 2.7252988815307617 }, { "auxiliary_loss_clip": 0.01135389, "auxiliary_loss_mlp": 0.01028213, "balance_loss_clip": 1.01603603, "balance_loss_mlp": 1.03166211, "epoch": 0.9748384187584548, "flos": 15961288268160.0, "grad_norm": 1.9546711982518914, "language_loss": 0.6667788, "learning_rate": 6.232210738270627e-09, "loss": 0.68841481, "num_input_tokens_seen": 349893690, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 16214, "time_per_iteration": 2.6760761737823486 }, { "auxiliary_loss_clip": 0.01123105, "auxiliary_loss_mlp": 0.01028132, "balance_loss_clip": 1.01566291, "balance_loss_mlp": 1.0354625, "epoch": 0.9748985420111228, "flos": 20558715776640.0, "grad_norm": 1.5416127046063195, "language_loss": 0.74125201, "learning_rate": 6.202442620817816e-09, "loss": 0.76276433, "num_input_tokens_seen": 349912480, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 16215, "time_per_iteration": 4.051563501358032 }, { "auxiliary_loss_clip": 0.01135193, "auxiliary_loss_mlp": 0.01031994, "balance_loss_clip": 1.02032316, "balance_loss_mlp": 1.03251672, "epoch": 0.9749586652637908, "flos": 36721605127680.0, "grad_norm": 1.710823007499204, "language_loss": 0.66875327, "learning_rate": 6.172745656871203e-09, "loss": 0.69042516, "num_input_tokens_seen": 349932470, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 16216, "time_per_iteration": 2.729447603225708 }, { "auxiliary_loss_clip": 0.01149267, "auxiliary_loss_mlp": 0.01032741, "balance_loss_clip": 1.02028418, "balance_loss_mlp": 1.03447616, "epoch": 0.9750187885164587, "flos": 10999223844480.0, "grad_norm": 1.889530190427015, "language_loss": 0.72033119, "learning_rate": 6.1431198474903854e-09, "loss": 0.74215132, "num_input_tokens_seen": 349949060, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 16217, "time_per_iteration": 2.6057071685791016 }, { "auxiliary_loss_clip": 0.0112738, "auxiliary_loss_mlp": 0.01028129, "balance_loss_clip": 1.0161128, "balance_loss_mlp": 1.03288662, "epoch": 0.9750789117691268, "flos": 25739512070400.0, "grad_norm": 1.6250818910125955, "language_loss": 0.78464502, "learning_rate": 6.113565193732961e-09, "loss": 0.80620015, "num_input_tokens_seen": 349968010, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 16218, "time_per_iteration": 2.602464437484741 }, { "auxiliary_loss_clip": 0.01116185, "auxiliary_loss_mlp": 0.01030178, "balance_loss_clip": 1.01794147, "balance_loss_mlp": 1.03234458, "epoch": 0.9751390350217947, "flos": 13333955961600.0, "grad_norm": 2.4857797472117142, "language_loss": 0.77823293, "learning_rate": 6.084081696653642e-09, "loss": 0.79969656, "num_input_tokens_seen": 349985270, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 16219, "time_per_iteration": 2.6305718421936035 }, { "auxiliary_loss_clip": 0.01120652, "auxiliary_loss_mlp": 0.01031196, "balance_loss_clip": 1.01923943, "balance_loss_mlp": 1.0333817, "epoch": 0.9751991582744627, "flos": 22820621068800.0, "grad_norm": 1.8706039072560179, "language_loss": 0.81125283, "learning_rate": 6.054669357304254e-09, "loss": 0.83277124, "num_input_tokens_seen": 350003935, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 16220, "time_per_iteration": 2.6676080226898193 }, { "auxiliary_loss_clip": 0.01100758, "auxiliary_loss_mlp": 0.0102897, "balance_loss_clip": 1.01726389, "balance_loss_mlp": 1.0334177, "epoch": 0.9752592815271306, "flos": 19646189735040.0, "grad_norm": 4.977560493786492, "language_loss": 0.75471592, "learning_rate": 6.025328176734623e-09, "loss": 0.7760132, "num_input_tokens_seen": 350023595, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.67578125, "step": 16221, "time_per_iteration": 2.6607813835144043 }, { "auxiliary_loss_clip": 0.01124693, "auxiliary_loss_mlp": 0.01029822, "balance_loss_clip": 1.0176208, "balance_loss_mlp": 1.0319891, "epoch": 0.9753194047797986, "flos": 26690462686080.0, "grad_norm": 1.5450734111071514, "language_loss": 0.6666581, "learning_rate": 5.996058155992134e-09, "loss": 0.68820322, "num_input_tokens_seen": 350045920, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.66015625, "step": 16222, "time_per_iteration": 2.655042886734009 }, { "auxiliary_loss_clip": 0.01108946, "auxiliary_loss_mlp": 0.0103089, "balance_loss_clip": 1.01924908, "balance_loss_mlp": 1.03290308, "epoch": 0.9753795280324665, "flos": 23221779137280.0, "grad_norm": 1.722467215271691, "language_loss": 0.88262886, "learning_rate": 5.966859296121063e-09, "loss": 0.90402722, "num_input_tokens_seen": 350063925, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 16223, "time_per_iteration": 2.5020902156829834 }, { "auxiliary_loss_clip": 0.01144667, "auxiliary_loss_mlp": 0.01028874, "balance_loss_clip": 1.01703095, "balance_loss_mlp": 1.03311813, "epoch": 0.9754396512851345, "flos": 19463835363840.0, "grad_norm": 1.6835394873474794, "language_loss": 0.74492896, "learning_rate": 5.937731598163464e-09, "loss": 0.76666439, "num_input_tokens_seen": 350080900, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 16224, "time_per_iteration": 4.006216526031494 }, { "auxiliary_loss_clip": 0.01040141, "auxiliary_loss_mlp": 0.01000072, "balance_loss_clip": 0.99886763, "balance_loss_mlp": 1.00140619, "epoch": 0.9754997745378025, "flos": 68459313340800.0, "grad_norm": 0.811037051495023, "language_loss": 0.59161144, "learning_rate": 5.908675063158952e-09, "loss": 0.61201352, "num_input_tokens_seen": 350144550, "router_z_loss_clip": 0.01202393, "router_z_loss_mlp": 0.20996094, "step": 16225, "time_per_iteration": 3.0973551273345947 }, { "auxiliary_loss_clip": 0.01127732, "auxiliary_loss_mlp": 0.01283597, "balance_loss_clip": 1.02485716, "balance_loss_mlp": 1.03416121, "epoch": 0.9755598977904705, "flos": 26395168976640.0, "grad_norm": 1.6673808578034819, "language_loss": 0.68905461, "learning_rate": 5.879689692144474e-09, "loss": 0.71316791, "num_input_tokens_seen": 350164050, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 16226, "time_per_iteration": 2.8127965927124023 }, { "auxiliary_loss_clip": 0.01111677, "auxiliary_loss_mlp": 0.01035988, "balance_loss_clip": 1.02326274, "balance_loss_mlp": 1.03387308, "epoch": 0.9756200210431384, "flos": 16617663446400.0, "grad_norm": 1.8241334838811907, "language_loss": 0.82615113, "learning_rate": 5.8507754861543136e-09, "loss": 0.84762776, "num_input_tokens_seen": 350181350, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 16227, "time_per_iteration": 2.489121913909912 }, { "auxiliary_loss_clip": 0.01099698, "auxiliary_loss_mlp": 0.01026627, "balance_loss_clip": 1.01518321, "balance_loss_mlp": 1.03302097, "epoch": 0.9756801442958064, "flos": 23623044946560.0, "grad_norm": 2.0261459827914376, "language_loss": 0.77908683, "learning_rate": 5.821932446220312e-09, "loss": 0.80035007, "num_input_tokens_seen": 350199765, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66796875, "step": 16228, "time_per_iteration": 2.512266159057617 }, { "auxiliary_loss_clip": 0.01030833, "auxiliary_loss_mlp": 0.00999517, "balance_loss_clip": 0.99826527, "balance_loss_mlp": 1.0007813, "epoch": 0.9757402675484744, "flos": 61625799440640.0, "grad_norm": 0.8497385043448709, "language_loss": 0.55865872, "learning_rate": 5.793160573371869e-09, "loss": 0.57896221, "num_input_tokens_seen": 350256420, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.21191406, "step": 16229, "time_per_iteration": 3.0869269371032715 }, { "auxiliary_loss_clip": 0.01111644, "auxiliary_loss_mlp": 0.01031054, "balance_loss_clip": 1.01841784, "balance_loss_mlp": 1.03413868, "epoch": 0.9758003908011423, "flos": 24058964401920.0, "grad_norm": 1.5632429029563657, "language_loss": 0.75335407, "learning_rate": 5.7644598686359405e-09, "loss": 0.77478099, "num_input_tokens_seen": 350276270, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 16230, "time_per_iteration": 4.058481216430664 }, { "auxiliary_loss_clip": 0.01115953, "auxiliary_loss_mlp": 0.01029473, "balance_loss_clip": 1.01824903, "balance_loss_mlp": 1.03377831, "epoch": 0.9758605140538104, "flos": 17493093717120.0, "grad_norm": 1.7357068357165704, "language_loss": 0.71926677, "learning_rate": 5.7358303330368175e-09, "loss": 0.74072099, "num_input_tokens_seen": 350295000, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.64453125, "step": 16231, "time_per_iteration": 2.51902174949646 }, { "auxiliary_loss_clip": 0.0112314, "auxiliary_loss_mlp": 0.01030205, "balance_loss_clip": 1.01777697, "balance_loss_mlp": 1.03555059, "epoch": 0.9759206373064783, "flos": 24826950115200.0, "grad_norm": 1.785966117734236, "language_loss": 0.76964676, "learning_rate": 5.707271967595906e-09, "loss": 0.79118025, "num_input_tokens_seen": 350314980, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 16232, "time_per_iteration": 4.145403146743774 }, { "auxiliary_loss_clip": 0.01040338, "auxiliary_loss_mlp": 0.01000331, "balance_loss_clip": 0.99910879, "balance_loss_mlp": 1.00109577, "epoch": 0.9759807605591463, "flos": 68161182456960.0, "grad_norm": 0.7461927566538905, "language_loss": 0.5379104, "learning_rate": 5.67878477333239e-09, "loss": 0.55831712, "num_input_tokens_seen": 350371985, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.2109375, "step": 16233, "time_per_iteration": 3.042694330215454 }, { "auxiliary_loss_clip": 0.01101862, "auxiliary_loss_mlp": 0.0103243, "balance_loss_clip": 1.02050269, "balance_loss_mlp": 1.032969, "epoch": 0.9760408838118142, "flos": 18989239939200.0, "grad_norm": 1.6698141050902928, "language_loss": 0.71397406, "learning_rate": 5.650368751263012e-09, "loss": 0.73531699, "num_input_tokens_seen": 350390590, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 16234, "time_per_iteration": 2.601243257522583 }, { "auxiliary_loss_clip": 0.01113772, "auxiliary_loss_mlp": 0.01033182, "balance_loss_clip": 1.02008688, "balance_loss_mlp": 1.03318667, "epoch": 0.9761010070644822, "flos": 17125978763520.0, "grad_norm": 2.4913848447475107, "language_loss": 0.78553629, "learning_rate": 5.622023902402073e-09, "loss": 0.80700582, "num_input_tokens_seen": 350403770, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 16235, "time_per_iteration": 2.5620689392089844 }, { "auxiliary_loss_clip": 0.0111115, "auxiliary_loss_mlp": 0.01030264, "balance_loss_clip": 1.0168885, "balance_loss_mlp": 1.03407276, "epoch": 0.9761611303171501, "flos": 22052599441920.0, "grad_norm": 1.7370080839300086, "language_loss": 0.77090436, "learning_rate": 5.593750227760985e-09, "loss": 0.79231852, "num_input_tokens_seen": 350421870, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6796875, "step": 16236, "time_per_iteration": 2.6148085594177246 }, { "auxiliary_loss_clip": 0.01109375, "auxiliary_loss_mlp": 0.01025091, "balance_loss_clip": 1.01377773, "balance_loss_mlp": 1.03327644, "epoch": 0.9762212535698181, "flos": 21757521214080.0, "grad_norm": 1.8612197348779096, "language_loss": 0.75385475, "learning_rate": 5.5655477283487185e-09, "loss": 0.77519941, "num_input_tokens_seen": 350440025, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.67578125, "step": 16237, "time_per_iteration": 2.6205859184265137 }, { "auxiliary_loss_clip": 0.01138125, "auxiliary_loss_mlp": 0.01031111, "balance_loss_clip": 1.0169785, "balance_loss_mlp": 1.03271437, "epoch": 0.9762813768224861, "flos": 25921615046400.0, "grad_norm": 1.685025021160443, "language_loss": 0.73023373, "learning_rate": 5.53741640517158e-09, "loss": 0.75192606, "num_input_tokens_seen": 350459435, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.6953125, "step": 16238, "time_per_iteration": 2.6310837268829346 }, { "auxiliary_loss_clip": 0.01120036, "auxiliary_loss_mlp": 0.01277993, "balance_loss_clip": 1.01828575, "balance_loss_mlp": 1.03434396, "epoch": 0.9763415000751541, "flos": 24051853509120.0, "grad_norm": 2.157912505740932, "language_loss": 0.84476769, "learning_rate": 5.509356259233877e-09, "loss": 0.86874795, "num_input_tokens_seen": 350472655, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 16239, "time_per_iteration": 2.8109452724456787 }, { "auxiliary_loss_clip": 0.01130678, "auxiliary_loss_mlp": 0.01030563, "balance_loss_clip": 1.0178318, "balance_loss_mlp": 1.03495455, "epoch": 0.976401623327822, "flos": 17018677860480.0, "grad_norm": 3.4398444289500807, "language_loss": 0.60364854, "learning_rate": 5.4813672915365874e-09, "loss": 0.62526095, "num_input_tokens_seen": 350488160, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 16240, "time_per_iteration": 2.6021337509155273 }, { "auxiliary_loss_clip": 0.01127999, "auxiliary_loss_mlp": 0.01030135, "balance_loss_clip": 1.0181185, "balance_loss_mlp": 1.03454983, "epoch": 0.97646174658049, "flos": 16106941918080.0, "grad_norm": 1.8822551205595468, "language_loss": 0.82348514, "learning_rate": 5.4534495030789104e-09, "loss": 0.84506649, "num_input_tokens_seen": 350506065, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6640625, "step": 16241, "time_per_iteration": 2.6203343868255615 }, { "auxiliary_loss_clip": 0.0112896, "auxiliary_loss_mlp": 0.01032853, "balance_loss_clip": 1.02113473, "balance_loss_mlp": 1.03421259, "epoch": 0.9765218698331579, "flos": 21252725429760.0, "grad_norm": 1.7669077525587105, "language_loss": 0.82917249, "learning_rate": 5.42560289485694e-09, "loss": 0.85079062, "num_input_tokens_seen": 350524495, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 16242, "time_per_iteration": 2.6256206035614014 }, { "auxiliary_loss_clip": 0.01113523, "auxiliary_loss_mlp": 0.0102778, "balance_loss_clip": 1.01574552, "balance_loss_mlp": 1.03546369, "epoch": 0.9765819930858259, "flos": 18588045957120.0, "grad_norm": 1.5258255614555407, "language_loss": 0.75643069, "learning_rate": 5.397827467864769e-09, "loss": 0.77784371, "num_input_tokens_seen": 350544185, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 16243, "time_per_iteration": 2.596818685531616 }, { "auxiliary_loss_clip": 0.01111555, "auxiliary_loss_mlp": 0.01036583, "balance_loss_clip": 1.02360058, "balance_loss_mlp": 1.03362226, "epoch": 0.976642116338494, "flos": 20266833859200.0, "grad_norm": 1.5631553450057516, "language_loss": 0.69909686, "learning_rate": 5.3701232230931595e-09, "loss": 0.72057825, "num_input_tokens_seen": 350562675, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 16244, "time_per_iteration": 2.58203387260437 }, { "auxiliary_loss_clip": 0.01123854, "auxiliary_loss_mlp": 0.01030496, "balance_loss_clip": 1.01777673, "balance_loss_mlp": 1.03707623, "epoch": 0.9767022395911619, "flos": 25550477769600.0, "grad_norm": 3.0204253144209945, "language_loss": 0.83605534, "learning_rate": 5.3424901615310994e-09, "loss": 0.85759884, "num_input_tokens_seen": 350581535, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 16245, "time_per_iteration": 2.6211230754852295 }, { "auxiliary_loss_clip": 0.0113335, "auxiliary_loss_mlp": 0.01023289, "balance_loss_clip": 1.01189232, "balance_loss_mlp": 1.031618, "epoch": 0.9767623628438299, "flos": 21762656858880.0, "grad_norm": 1.417749556662514, "language_loss": 0.78304476, "learning_rate": 5.31492828416491e-09, "loss": 0.80461109, "num_input_tokens_seen": 350601615, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66015625, "step": 16246, "time_per_iteration": 2.707132577896118 }, { "auxiliary_loss_clip": 0.01095321, "auxiliary_loss_mlp": 0.01031185, "balance_loss_clip": 1.02062893, "balance_loss_mlp": 1.03290224, "epoch": 0.9768224860964978, "flos": 15851114277120.0, "grad_norm": 1.5617550083519147, "language_loss": 0.73958176, "learning_rate": 5.287437591977806e-09, "loss": 0.76084679, "num_input_tokens_seen": 350619580, "router_z_loss_clip": 0.10546875, "router_z_loss_mlp": 0.625, "step": 16247, "time_per_iteration": 2.707943916320801 }, { "auxiliary_loss_clip": 0.01146597, "auxiliary_loss_mlp": 0.01030754, "balance_loss_clip": 1.01842821, "balance_loss_mlp": 1.03344154, "epoch": 0.9768826093491658, "flos": 25151151294720.0, "grad_norm": 6.079295457537669, "language_loss": 0.79723263, "learning_rate": 5.260018085951001e-09, "loss": 0.8190062, "num_input_tokens_seen": 350640015, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 16248, "time_per_iteration": 2.7137534618377686 }, { "auxiliary_loss_clip": 0.01154522, "auxiliary_loss_mlp": 0.01041511, "balance_loss_clip": 1.02823722, "balance_loss_mlp": 1.03790259, "epoch": 0.9769427326018337, "flos": 22967028904320.0, "grad_norm": 3.4548009475849275, "language_loss": 0.79401076, "learning_rate": 5.232669767063269e-09, "loss": 0.81597114, "num_input_tokens_seen": 350659155, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 16249, "time_per_iteration": 2.694551467895508 }, { "auxiliary_loss_clip": 0.01108239, "auxiliary_loss_mlp": 0.01031508, "balance_loss_clip": 1.01959312, "balance_loss_mlp": 1.03410172, "epoch": 0.9770028558545018, "flos": 15264297786240.0, "grad_norm": 1.7917791960564673, "language_loss": 0.6672793, "learning_rate": 5.205392636290273e-09, "loss": 0.68867677, "num_input_tokens_seen": 350676615, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.65625, "step": 16250, "time_per_iteration": 2.544926643371582 }, { "auxiliary_loss_clip": 0.01021984, "auxiliary_loss_mlp": 0.01000163, "balance_loss_clip": 0.99888176, "balance_loss_mlp": 1.00085235, "epoch": 0.9770629791071697, "flos": 71450348808960.0, "grad_norm": 0.7749458375764612, "language_loss": 0.59932923, "learning_rate": 5.178186694605457e-09, "loss": 0.61955076, "num_input_tokens_seen": 350736805, "router_z_loss_clip": 0.01281738, "router_z_loss_mlp": 0.2109375, "step": 16251, "time_per_iteration": 3.234764337539673 }, { "auxiliary_loss_clip": 0.01110706, "auxiliary_loss_mlp": 0.01030825, "balance_loss_clip": 1.01861155, "balance_loss_mlp": 1.03333783, "epoch": 0.9771231023598377, "flos": 22412854897920.0, "grad_norm": 1.775381684246788, "language_loss": 0.78534865, "learning_rate": 5.151051942980045e-09, "loss": 0.80676395, "num_input_tokens_seen": 350753600, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 16252, "time_per_iteration": 2.587420701980591 }, { "auxiliary_loss_clip": 0.01109935, "auxiliary_loss_mlp": 0.01031202, "balance_loss_clip": 1.01896513, "balance_loss_mlp": 1.03358996, "epoch": 0.9771832256125056, "flos": 19025940660480.0, "grad_norm": 2.7054148021536637, "language_loss": 0.63877141, "learning_rate": 5.1239883823821495e-09, "loss": 0.66018277, "num_input_tokens_seen": 350771225, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 16253, "time_per_iteration": 2.679442882537842 }, { "auxiliary_loss_clip": 0.01115906, "auxiliary_loss_mlp": 0.01030136, "balance_loss_clip": 1.01685596, "balance_loss_mlp": 1.03764689, "epoch": 0.9772433488651736, "flos": 17822143232640.0, "grad_norm": 2.0431066631068724, "language_loss": 0.77082139, "learning_rate": 5.0969960137776655e-09, "loss": 0.79228187, "num_input_tokens_seen": 350789100, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69140625, "step": 16254, "time_per_iteration": 2.5821726322174072 }, { "auxiliary_loss_clip": 0.01113527, "auxiliary_loss_mlp": 0.01030347, "balance_loss_clip": 1.01800275, "balance_loss_mlp": 1.03441668, "epoch": 0.9773034721178415, "flos": 37629785623680.0, "grad_norm": 1.8767283433168467, "language_loss": 0.64030623, "learning_rate": 5.070074838129823e-09, "loss": 0.66174507, "num_input_tokens_seen": 350811085, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 16255, "time_per_iteration": 2.6833958625793457 }, { "auxiliary_loss_clip": 0.01133924, "auxiliary_loss_mlp": 0.01280482, "balance_loss_clip": 1.02004313, "balance_loss_mlp": 1.03574777, "epoch": 0.9773635953705095, "flos": 20302457172480.0, "grad_norm": 1.5093866237461988, "language_loss": 0.75592446, "learning_rate": 5.043224856399631e-09, "loss": 0.78006846, "num_input_tokens_seen": 350831065, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 16256, "time_per_iteration": 2.572312355041504 }, { "auxiliary_loss_clip": 0.01100927, "auxiliary_loss_mlp": 0.01035169, "balance_loss_clip": 1.02227092, "balance_loss_mlp": 1.03470826, "epoch": 0.9774237186231776, "flos": 22309253095680.0, "grad_norm": 2.0516274382658195, "language_loss": 0.78166479, "learning_rate": 5.016446069544988e-09, "loss": 0.80302578, "num_input_tokens_seen": 350849675, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6640625, "step": 16257, "time_per_iteration": 3.956655263900757 }, { "auxiliary_loss_clip": 0.01110375, "auxiliary_loss_mlp": 0.01033534, "balance_loss_clip": 1.02096295, "balance_loss_mlp": 1.03367448, "epoch": 0.9774838418758455, "flos": 24204905360640.0, "grad_norm": 1.5439694648376323, "language_loss": 0.75487757, "learning_rate": 4.9897384785217985e-09, "loss": 0.7763167, "num_input_tokens_seen": 350868955, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.67578125, "step": 16258, "time_per_iteration": 2.5929136276245117 }, { "auxiliary_loss_clip": 0.01031109, "auxiliary_loss_mlp": 0.01003011, "balance_loss_clip": 1.00180745, "balance_loss_mlp": 1.00117159, "epoch": 0.9775439651285135, "flos": 66357139829760.0, "grad_norm": 0.6708238300600823, "language_loss": 0.59799159, "learning_rate": 4.963102084282855e-09, "loss": 0.61833274, "num_input_tokens_seen": 350935110, "router_z_loss_clip": 0.01202393, "router_z_loss_mlp": 0.21191406, "step": 16259, "time_per_iteration": 3.2650914192199707 }, { "auxiliary_loss_clip": 0.01123206, "auxiliary_loss_mlp": 0.010333, "balance_loss_clip": 1.0202527, "balance_loss_mlp": 1.03417921, "epoch": 0.9776040883811814, "flos": 22601565976320.0, "grad_norm": 1.8289213461230471, "language_loss": 0.73717636, "learning_rate": 4.936536887779174e-09, "loss": 0.75874144, "num_input_tokens_seen": 350953220, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 16260, "time_per_iteration": 2.678297519683838 }, { "auxiliary_loss_clip": 0.01111902, "auxiliary_loss_mlp": 0.01031879, "balance_loss_clip": 1.01912951, "balance_loss_mlp": 1.03341341, "epoch": 0.9776642116338494, "flos": 18442176825600.0, "grad_norm": 1.981245176010321, "language_loss": 0.79292524, "learning_rate": 4.910042889958444e-09, "loss": 0.81436312, "num_input_tokens_seen": 350971915, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 16261, "time_per_iteration": 2.6002864837646484 }, { "auxiliary_loss_clip": 0.0111206, "auxiliary_loss_mlp": 0.01025061, "balance_loss_clip": 1.01253808, "balance_loss_mlp": 1.03379965, "epoch": 0.9777243348865173, "flos": 22638446265600.0, "grad_norm": 2.2164337810677646, "language_loss": 0.74441588, "learning_rate": 4.883620091766127e-09, "loss": 0.76578707, "num_input_tokens_seen": 350990470, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 16262, "time_per_iteration": 2.627014636993408 }, { "auxiliary_loss_clip": 0.01112821, "auxiliary_loss_mlp": 0.01031913, "balance_loss_clip": 1.01783419, "balance_loss_mlp": 1.03406954, "epoch": 0.9777844581391854, "flos": 31321394605440.0, "grad_norm": 2.11912427668329, "language_loss": 0.69978642, "learning_rate": 4.857268494145694e-09, "loss": 0.72123373, "num_input_tokens_seen": 351010755, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.69921875, "step": 16263, "time_per_iteration": 2.6629581451416016 }, { "auxiliary_loss_clip": 0.01111773, "auxiliary_loss_mlp": 0.01028684, "balance_loss_clip": 1.01691198, "balance_loss_mlp": 1.03379703, "epoch": 0.9778445813918533, "flos": 23039101543680.0, "grad_norm": 1.6941717500396007, "language_loss": 0.66696566, "learning_rate": 4.830988098037059e-09, "loss": 0.68837023, "num_input_tokens_seen": 351029965, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6953125, "step": 16264, "time_per_iteration": 2.5436830520629883 }, { "auxiliary_loss_clip": 0.01142898, "auxiliary_loss_mlp": 0.01031771, "balance_loss_clip": 1.01827633, "balance_loss_mlp": 1.03518569, "epoch": 0.9779047046445213, "flos": 17566351505280.0, "grad_norm": 3.176364274298391, "language_loss": 0.73034132, "learning_rate": 4.804778904378138e-09, "loss": 0.75208795, "num_input_tokens_seen": 351046205, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 16265, "time_per_iteration": 3.912062406539917 }, { "auxiliary_loss_clip": 0.01108938, "auxiliary_loss_mlp": 0.01033628, "balance_loss_clip": 1.02094972, "balance_loss_mlp": 1.03278255, "epoch": 0.9779648278971892, "flos": 20741141975040.0, "grad_norm": 1.7338285718887168, "language_loss": 0.69144082, "learning_rate": 4.778640914104625e-09, "loss": 0.71286649, "num_input_tokens_seen": 351065390, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.671875, "step": 16266, "time_per_iteration": 2.5480709075927734 }, { "auxiliary_loss_clip": 0.01134088, "auxiliary_loss_mlp": 0.01029342, "balance_loss_clip": 1.01853561, "balance_loss_mlp": 1.0340116, "epoch": 0.9780249511498572, "flos": 21026954494080.0, "grad_norm": 1.7659558664210127, "language_loss": 0.86751467, "learning_rate": 4.752574128149111e-09, "loss": 0.88914895, "num_input_tokens_seen": 351084355, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.6484375, "step": 16267, "time_per_iteration": 2.852195978164673 }, { "auxiliary_loss_clip": 0.01157874, "auxiliary_loss_mlp": 0.01028344, "balance_loss_clip": 1.01518321, "balance_loss_mlp": 1.03453851, "epoch": 0.9780850744025251, "flos": 30774223751040.0, "grad_norm": 1.760083654456269, "language_loss": 0.70164996, "learning_rate": 4.72657854744174e-09, "loss": 0.72351217, "num_input_tokens_seen": 351105870, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 16268, "time_per_iteration": 2.7772600650787354 }, { "auxiliary_loss_clip": 0.01120329, "auxiliary_loss_mlp": 0.01028989, "balance_loss_clip": 1.01728892, "balance_loss_mlp": 1.03346443, "epoch": 0.9781451976551931, "flos": 20302995876480.0, "grad_norm": 2.8085613526359534, "language_loss": 0.7356115, "learning_rate": 4.700654172910212e-09, "loss": 0.75710464, "num_input_tokens_seen": 351124760, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 16269, "time_per_iteration": 2.635582447052002 }, { "auxiliary_loss_clip": 0.01123051, "auxiliary_loss_mlp": 0.01030312, "balance_loss_clip": 1.01724076, "balance_loss_mlp": 1.03556228, "epoch": 0.9782053209078612, "flos": 24316479982080.0, "grad_norm": 1.8620438762561842, "language_loss": 0.70919269, "learning_rate": 4.674801005480011e-09, "loss": 0.7307263, "num_input_tokens_seen": 351142820, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 16270, "time_per_iteration": 2.6419737339019775 }, { "auxiliary_loss_clip": 0.01104315, "auxiliary_loss_mlp": 0.01033997, "balance_loss_clip": 1.02182579, "balance_loss_mlp": 1.03476882, "epoch": 0.9782654441605291, "flos": 32489425065600.0, "grad_norm": 1.737762874034788, "language_loss": 0.63935518, "learning_rate": 4.64901904607351e-09, "loss": 0.66073835, "num_input_tokens_seen": 351164805, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 16271, "time_per_iteration": 2.6767282485961914 }, { "auxiliary_loss_clip": 0.01116481, "auxiliary_loss_mlp": 0.0102954, "balance_loss_clip": 1.01735675, "balance_loss_mlp": 1.03381753, "epoch": 0.9783255674131971, "flos": 26979076465920.0, "grad_norm": 1.4184621538838038, "language_loss": 0.70446813, "learning_rate": 4.6233082956108615e-09, "loss": 0.72592837, "num_input_tokens_seen": 351187005, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.65234375, "step": 16272, "time_per_iteration": 4.154935598373413 }, { "auxiliary_loss_clip": 0.01149007, "auxiliary_loss_mlp": 0.01035067, "balance_loss_clip": 1.02212679, "balance_loss_mlp": 1.03433156, "epoch": 0.978385690665865, "flos": 23112251591040.0, "grad_norm": 1.7043798238164856, "language_loss": 0.66547018, "learning_rate": 4.597668755009554e-09, "loss": 0.68731093, "num_input_tokens_seen": 351208450, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 16273, "time_per_iteration": 4.211086750030518 }, { "auxiliary_loss_clip": 0.01128626, "auxiliary_loss_mlp": 0.01021414, "balance_loss_clip": 1.00865877, "balance_loss_mlp": 1.0327642, "epoch": 0.978445813918533, "flos": 25409672455680.0, "grad_norm": 2.3365176849601106, "language_loss": 0.74001372, "learning_rate": 4.572100425184633e-09, "loss": 0.76151407, "num_input_tokens_seen": 351229585, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 16274, "time_per_iteration": 2.61930513381958 }, { "auxiliary_loss_clip": 0.01120239, "auxiliary_loss_mlp": 0.01032862, "balance_loss_clip": 1.02113235, "balance_loss_mlp": 1.03513408, "epoch": 0.9785059371712009, "flos": 23550218121600.0, "grad_norm": 1.6687203606335133, "language_loss": 0.77720529, "learning_rate": 4.546603307048702e-09, "loss": 0.79873633, "num_input_tokens_seen": 351249525, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 16275, "time_per_iteration": 2.611452341079712 }, { "auxiliary_loss_clip": 0.01158913, "auxiliary_loss_mlp": 0.01030798, "balance_loss_clip": 1.01813853, "balance_loss_mlp": 1.0351963, "epoch": 0.978566060423869, "flos": 34351177870080.0, "grad_norm": 1.4635339717248586, "language_loss": 0.71022856, "learning_rate": 4.5211774015117e-09, "loss": 0.73212564, "num_input_tokens_seen": 351272530, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 16276, "time_per_iteration": 2.728538751602173 }, { "auxiliary_loss_clip": 0.0111625, "auxiliary_loss_mlp": 0.0103015, "balance_loss_clip": 1.01859272, "balance_loss_mlp": 1.03273737, "epoch": 0.9786261836765369, "flos": 11618862387840.0, "grad_norm": 1.8933721057839035, "language_loss": 0.85679817, "learning_rate": 4.495822709480679e-09, "loss": 0.87826216, "num_input_tokens_seen": 351288530, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.65625, "step": 16277, "time_per_iteration": 2.493656873703003 }, { "auxiliary_loss_clip": 0.0111005, "auxiliary_loss_mlp": 0.01025976, "balance_loss_clip": 1.01435351, "balance_loss_mlp": 1.03438878, "epoch": 0.9786863069292049, "flos": 17420949250560.0, "grad_norm": 1.815303720119516, "language_loss": 0.7069816, "learning_rate": 4.470539231860693e-09, "loss": 0.72834188, "num_input_tokens_seen": 351305890, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 16278, "time_per_iteration": 2.5255625247955322 }, { "auxiliary_loss_clip": 0.01107734, "auxiliary_loss_mlp": 0.01029374, "balance_loss_clip": 1.01648724, "balance_loss_mlp": 1.03126633, "epoch": 0.9787464301818728, "flos": 20844923345280.0, "grad_norm": 1.871324234142462, "language_loss": 0.84205437, "learning_rate": 4.4453269695541306e-09, "loss": 0.86342537, "num_input_tokens_seen": 351325010, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.671875, "step": 16279, "time_per_iteration": 2.5302553176879883 }, { "auxiliary_loss_clip": 0.01111852, "auxiliary_loss_mlp": 0.01030264, "balance_loss_clip": 1.01744902, "balance_loss_mlp": 1.03381276, "epoch": 0.9788065534345408, "flos": 18222942165120.0, "grad_norm": 2.7859158803872734, "language_loss": 0.79267693, "learning_rate": 4.420185923460717e-09, "loss": 0.81409806, "num_input_tokens_seen": 351343060, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 16280, "time_per_iteration": 2.6312508583068848 }, { "auxiliary_loss_clip": 0.01125088, "auxiliary_loss_mlp": 0.01032427, "balance_loss_clip": 1.02119184, "balance_loss_mlp": 1.03382969, "epoch": 0.9788666766872087, "flos": 21578219498880.0, "grad_norm": 1.641887432148111, "language_loss": 0.79304886, "learning_rate": 4.3951160944775135e-09, "loss": 0.81462401, "num_input_tokens_seen": 351363260, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.640625, "step": 16281, "time_per_iteration": 2.7341933250427246 }, { "auxiliary_loss_clip": 0.01111402, "auxiliary_loss_mlp": 0.01031089, "balance_loss_clip": 1.01830387, "balance_loss_mlp": 1.03328204, "epoch": 0.9789267999398767, "flos": 24900495212160.0, "grad_norm": 1.6813618422404333, "language_loss": 0.80265093, "learning_rate": 4.370117483499358e-09, "loss": 0.82407582, "num_input_tokens_seen": 351382610, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 16282, "time_per_iteration": 2.649883985519409 }, { "auxiliary_loss_clip": 0.01120169, "auxiliary_loss_mlp": 0.01039615, "balance_loss_clip": 1.02709246, "balance_loss_mlp": 1.03410149, "epoch": 0.9789869231925448, "flos": 19573111514880.0, "grad_norm": 2.045732944257818, "language_loss": 0.83089232, "learning_rate": 4.345190091418427e-09, "loss": 0.85249019, "num_input_tokens_seen": 351401075, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 16283, "time_per_iteration": 2.58027720451355 }, { "auxiliary_loss_clip": 0.01111487, "auxiliary_loss_mlp": 0.01031038, "balance_loss_clip": 1.0189079, "balance_loss_mlp": 1.03359079, "epoch": 0.9790470464452127, "flos": 16796641939200.0, "grad_norm": 2.8149247447488452, "language_loss": 0.72016054, "learning_rate": 4.3203339191240085e-09, "loss": 0.74158579, "num_input_tokens_seen": 351419275, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 16284, "time_per_iteration": 2.5130984783172607 }, { "auxiliary_loss_clip": 0.01109608, "auxiliary_loss_mlp": 0.01032853, "balance_loss_clip": 1.02157617, "balance_loss_mlp": 1.03503835, "epoch": 0.9791071696978807, "flos": 18369350000640.0, "grad_norm": 2.0238223152931365, "language_loss": 0.64273703, "learning_rate": 4.295548967503615e-09, "loss": 0.66416168, "num_input_tokens_seen": 351437375, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.65625, "step": 16285, "time_per_iteration": 2.541325569152832 }, { "auxiliary_loss_clip": 0.01118978, "auxiliary_loss_mlp": 0.0102793, "balance_loss_clip": 1.01577663, "balance_loss_mlp": 1.03352237, "epoch": 0.9791672929505486, "flos": 24170323541760.0, "grad_norm": 1.609298884282045, "language_loss": 0.70930028, "learning_rate": 4.270835237441206e-09, "loss": 0.73076946, "num_input_tokens_seen": 351457810, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 16286, "time_per_iteration": 2.587256669998169 }, { "auxiliary_loss_clip": 0.01107093, "auxiliary_loss_mlp": 0.01026878, "balance_loss_clip": 1.01515341, "balance_loss_mlp": 1.03168368, "epoch": 0.9792274162032166, "flos": 24354114456960.0, "grad_norm": 1.3462665373725093, "language_loss": 0.58195114, "learning_rate": 4.246192729819409e-09, "loss": 0.60329092, "num_input_tokens_seen": 351478825, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 16287, "time_per_iteration": 2.584150791168213 }, { "auxiliary_loss_clip": 0.01111226, "auxiliary_loss_mlp": 0.01033788, "balance_loss_clip": 1.02150917, "balance_loss_mlp": 1.03423178, "epoch": 0.9792875394558845, "flos": 26395779507840.0, "grad_norm": 2.2127125741464386, "language_loss": 0.81406677, "learning_rate": 4.221621445517076e-09, "loss": 0.83551687, "num_input_tokens_seen": 351498785, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 16288, "time_per_iteration": 2.810713291168213 }, { "auxiliary_loss_clip": 0.01120874, "auxiliary_loss_mlp": 0.01277412, "balance_loss_clip": 1.01780307, "balance_loss_mlp": 1.0332613, "epoch": 0.9793476627085526, "flos": 28148004766080.0, "grad_norm": 1.5189666012715348, "language_loss": 0.71609008, "learning_rate": 4.1971213854112845e-09, "loss": 0.74007297, "num_input_tokens_seen": 351520235, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 16289, "time_per_iteration": 2.632115602493286 }, { "auxiliary_loss_clip": 0.0112797, "auxiliary_loss_mlp": 0.01032358, "balance_loss_clip": 1.01963878, "balance_loss_mlp": 1.03437674, "epoch": 0.9794077859612205, "flos": 16763927627520.0, "grad_norm": 2.567565591183779, "language_loss": 0.75287753, "learning_rate": 4.172692550376667e-09, "loss": 0.77448082, "num_input_tokens_seen": 351538900, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.671875, "step": 16290, "time_per_iteration": 2.591268539428711 }, { "auxiliary_loss_clip": 0.01112264, "auxiliary_loss_mlp": 0.01034127, "balance_loss_clip": 1.02116871, "balance_loss_mlp": 1.03356171, "epoch": 0.9794679092138885, "flos": 19280834547840.0, "grad_norm": 1.520285931426296, "language_loss": 0.67191976, "learning_rate": 4.148334941284526e-09, "loss": 0.69338369, "num_input_tokens_seen": 351558715, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 16291, "time_per_iteration": 2.544902801513672 }, { "auxiliary_loss_clip": 0.01104579, "auxiliary_loss_mlp": 0.01275552, "balance_loss_clip": 1.01595521, "balance_loss_mlp": 1.0334971, "epoch": 0.9795280324665564, "flos": 32156640535680.0, "grad_norm": 1.8198203316347812, "language_loss": 0.62412477, "learning_rate": 4.124048559004611e-09, "loss": 0.64792609, "num_input_tokens_seen": 351578450, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 16292, "time_per_iteration": 2.596756935119629 }, { "auxiliary_loss_clip": 0.01128844, "auxiliary_loss_mlp": 0.0103319, "balance_loss_clip": 1.02143574, "balance_loss_mlp": 1.0348177, "epoch": 0.9795881557192244, "flos": 19060953442560.0, "grad_norm": 1.9184779217718801, "language_loss": 0.73206091, "learning_rate": 4.099833404403119e-09, "loss": 0.75368118, "num_input_tokens_seen": 351597195, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 16293, "time_per_iteration": 2.557734727859497 }, { "auxiliary_loss_clip": 0.01112122, "auxiliary_loss_mlp": 0.01030401, "balance_loss_clip": 1.01774681, "balance_loss_mlp": 1.03309655, "epoch": 0.9796482789718923, "flos": 28329928174080.0, "grad_norm": 1.796810173386204, "language_loss": 0.84178519, "learning_rate": 4.07568947834469e-09, "loss": 0.86321044, "num_input_tokens_seen": 351617460, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 16294, "time_per_iteration": 2.6212849617004395 }, { "auxiliary_loss_clip": 0.01124876, "auxiliary_loss_mlp": 0.01032156, "balance_loss_clip": 1.02084923, "balance_loss_mlp": 1.03331077, "epoch": 0.9797084022245603, "flos": 17967976450560.0, "grad_norm": 1.8032978121708623, "language_loss": 0.72327065, "learning_rate": 4.051616781690637e-09, "loss": 0.74484098, "num_input_tokens_seen": 351635900, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6484375, "step": 16295, "time_per_iteration": 2.5952751636505127 }, { "auxiliary_loss_clip": 0.01117621, "auxiliary_loss_mlp": 0.0103361, "balance_loss_clip": 1.02101612, "balance_loss_mlp": 1.03674579, "epoch": 0.9797685254772284, "flos": 20266726118400.0, "grad_norm": 1.7879662500302616, "language_loss": 0.80804652, "learning_rate": 4.02761531530027e-09, "loss": 0.82955879, "num_input_tokens_seen": 351655400, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 16296, "time_per_iteration": 2.709261417388916 }, { "auxiliary_loss_clip": 0.01117525, "auxiliary_loss_mlp": 0.01033203, "balance_loss_clip": 1.02135324, "balance_loss_mlp": 1.03256476, "epoch": 0.9798286487298963, "flos": 26907147480960.0, "grad_norm": 1.8101828260664015, "language_loss": 0.75454295, "learning_rate": 4.003685080030239e-09, "loss": 0.77605021, "num_input_tokens_seen": 351675505, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 16297, "time_per_iteration": 2.6225955486297607 }, { "auxiliary_loss_clip": 0.01113446, "auxiliary_loss_mlp": 0.01032059, "balance_loss_clip": 1.01973271, "balance_loss_mlp": 1.03387916, "epoch": 0.9798887719825643, "flos": 27161071701120.0, "grad_norm": 1.561189416418288, "language_loss": 0.78375602, "learning_rate": 3.979826076734083e-09, "loss": 0.80521107, "num_input_tokens_seen": 351697920, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 16298, "time_per_iteration": 3.992065668106079 }, { "auxiliary_loss_clip": 0.0110684, "auxiliary_loss_mlp": 0.01026926, "balance_loss_clip": 1.01518416, "balance_loss_mlp": 1.03259444, "epoch": 0.9799488952352322, "flos": 20668422890880.0, "grad_norm": 1.491497003373796, "language_loss": 0.72594571, "learning_rate": 3.956038306263565e-09, "loss": 0.74728346, "num_input_tokens_seen": 351717615, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.65234375, "step": 16299, "time_per_iteration": 2.813184976577759 }, { "auxiliary_loss_clip": 0.01122692, "auxiliary_loss_mlp": 0.01030673, "balance_loss_clip": 1.01791155, "balance_loss_mlp": 1.03426802, "epoch": 0.9800090184879002, "flos": 21981209160960.0, "grad_norm": 2.178620691069522, "language_loss": 0.89124823, "learning_rate": 3.9323217694677835e-09, "loss": 0.91278195, "num_input_tokens_seen": 351735260, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 16300, "time_per_iteration": 2.5327723026275635 }, { "auxiliary_loss_clip": 0.01113417, "auxiliary_loss_mlp": 0.01031458, "balance_loss_clip": 1.01907849, "balance_loss_mlp": 1.03380752, "epoch": 0.9800691417405681, "flos": 21288420570240.0, "grad_norm": 2.0869762450633806, "language_loss": 0.78302205, "learning_rate": 3.9086764671929506e-09, "loss": 0.80447078, "num_input_tokens_seen": 351755800, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 16301, "time_per_iteration": 2.650322198867798 }, { "auxiliary_loss_clip": 0.01121491, "auxiliary_loss_mlp": 0.01034045, "balance_loss_clip": 1.02115273, "balance_loss_mlp": 1.03398633, "epoch": 0.9801292649932362, "flos": 18439878355200.0, "grad_norm": 2.106961300263227, "language_loss": 0.75048006, "learning_rate": 3.885102400282614e-09, "loss": 0.77203536, "num_input_tokens_seen": 351774790, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 16302, "time_per_iteration": 2.559540271759033 }, { "auxiliary_loss_clip": 0.01120331, "auxiliary_loss_mlp": 0.01029429, "balance_loss_clip": 1.01647067, "balance_loss_mlp": 1.03425407, "epoch": 0.9801893882459041, "flos": 25046364343680.0, "grad_norm": 1.5543817336378094, "language_loss": 0.79372036, "learning_rate": 3.861599569578544e-09, "loss": 0.81521797, "num_input_tokens_seen": 351792855, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6796875, "step": 16303, "time_per_iteration": 2.9841020107269287 }, { "auxiliary_loss_clip": 0.01132493, "auxiliary_loss_mlp": 0.01032297, "balance_loss_clip": 1.01954126, "balance_loss_mlp": 1.03469431, "epoch": 0.9802495114985721, "flos": 18511484117760.0, "grad_norm": 2.1340954635744684, "language_loss": 0.83071089, "learning_rate": 3.838167975919404e-09, "loss": 0.85235882, "num_input_tokens_seen": 351811450, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 16304, "time_per_iteration": 3.1824235916137695 }, { "auxiliary_loss_clip": 0.01123517, "auxiliary_loss_mlp": 0.01031278, "balance_loss_clip": 1.02056146, "balance_loss_mlp": 1.03310919, "epoch": 0.98030963475124, "flos": 21469841187840.0, "grad_norm": 2.0543838437944415, "language_loss": 0.70783007, "learning_rate": 3.814807620141192e-09, "loss": 0.72937799, "num_input_tokens_seen": 351831960, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.640625, "step": 16305, "time_per_iteration": 2.952155351638794 }, { "auxiliary_loss_clip": 0.01123623, "auxiliary_loss_mlp": 0.01035415, "balance_loss_clip": 1.0227015, "balance_loss_mlp": 1.03617954, "epoch": 0.980369758003908, "flos": 20412272027520.0, "grad_norm": 1.84898130264233, "language_loss": 0.71971726, "learning_rate": 3.791518503077684e-09, "loss": 0.74130762, "num_input_tokens_seen": 351851585, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 16306, "time_per_iteration": 2.990912675857544 }, { "auxiliary_loss_clip": 0.01115754, "auxiliary_loss_mlp": 0.01033, "balance_loss_clip": 1.02159727, "balance_loss_mlp": 1.03177118, "epoch": 0.9804298812565759, "flos": 23399177431680.0, "grad_norm": 1.4840070564563121, "language_loss": 0.7331416, "learning_rate": 3.768300625559994e-09, "loss": 0.75462914, "num_input_tokens_seen": 351871085, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66015625, "step": 16307, "time_per_iteration": 4.3332436084747314 }, { "auxiliary_loss_clip": 0.01123858, "auxiliary_loss_mlp": 0.01028725, "balance_loss_clip": 1.01773369, "balance_loss_mlp": 1.03404582, "epoch": 0.980490004509244, "flos": 23292666627840.0, "grad_norm": 1.5332243576369833, "language_loss": 0.75006783, "learning_rate": 3.745153988416794e-09, "loss": 0.77159369, "num_input_tokens_seen": 351891775, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.63671875, "step": 16308, "time_per_iteration": 2.990626335144043 }, { "auxiliary_loss_clip": 0.01039412, "auxiliary_loss_mlp": 0.01001459, "balance_loss_clip": 1.00020123, "balance_loss_mlp": 1.00097728, "epoch": 0.980550127761912, "flos": 56051027798400.0, "grad_norm": 0.7567502729845437, "language_loss": 0.57833427, "learning_rate": 3.722078592474087e-09, "loss": 0.59874296, "num_input_tokens_seen": 351946770, "router_z_loss_clip": 0.01257324, "router_z_loss_mlp": 0.21191406, "step": 16309, "time_per_iteration": 3.3333001136779785 }, { "auxiliary_loss_clip": 0.01137763, "auxiliary_loss_mlp": 0.01030936, "balance_loss_clip": 1.01847899, "balance_loss_mlp": 1.03258038, "epoch": 0.9806102510145799, "flos": 25333290184320.0, "grad_norm": 2.3330927874478657, "language_loss": 0.66222841, "learning_rate": 3.6990744385554383e-09, "loss": 0.68391544, "num_input_tokens_seen": 351966155, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 16310, "time_per_iteration": 2.9755160808563232 }, { "auxiliary_loss_clip": 0.01039893, "auxiliary_loss_mlp": 0.0099952, "balance_loss_clip": 0.99828643, "balance_loss_mlp": 1.00110984, "epoch": 0.9806703742672479, "flos": 67274837429760.0, "grad_norm": 0.819660019455141, "language_loss": 0.54590392, "learning_rate": 3.6761415274815242e-09, "loss": 0.56629807, "num_input_tokens_seen": 352031655, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.21289062, "step": 16311, "time_per_iteration": 3.5986907482147217 }, { "auxiliary_loss_clip": 0.01110569, "auxiliary_loss_mlp": 0.01026947, "balance_loss_clip": 1.0153718, "balance_loss_mlp": 1.03396916, "epoch": 0.9807304975199158, "flos": 25228970110080.0, "grad_norm": 2.5726712195383237, "language_loss": 0.7987268, "learning_rate": 3.6532798600710236e-09, "loss": 0.82010198, "num_input_tokens_seen": 352051920, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.67578125, "step": 16312, "time_per_iteration": 2.9419283866882324 }, { "auxiliary_loss_clip": 0.01118701, "auxiliary_loss_mlp": 0.01030503, "balance_loss_clip": 1.01880264, "balance_loss_mlp": 1.0344615, "epoch": 0.9807906207725838, "flos": 17456392995840.0, "grad_norm": 4.597435756507797, "language_loss": 0.63208288, "learning_rate": 3.6304894371397277e-09, "loss": 0.65357494, "num_input_tokens_seen": 352069315, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 16313, "time_per_iteration": 5.312643051147461 }, { "auxiliary_loss_clip": 0.0111189, "auxiliary_loss_mlp": 0.01032414, "balance_loss_clip": 1.01991463, "balance_loss_mlp": 1.03519392, "epoch": 0.9808507440252517, "flos": 21032413361280.0, "grad_norm": 1.9415100056835346, "language_loss": 0.72987902, "learning_rate": 3.607770259500986e-09, "loss": 0.75132209, "num_input_tokens_seen": 352089480, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 16314, "time_per_iteration": 4.30785346031189 }, { "auxiliary_loss_clip": 0.01047909, "auxiliary_loss_mlp": 0.01003285, "balance_loss_clip": 1.00195563, "balance_loss_mlp": 1.00096881, "epoch": 0.9809108672779198, "flos": 64044491598720.0, "grad_norm": 0.6934002126100957, "language_loss": 0.51720303, "learning_rate": 3.585122327965706e-09, "loss": 0.53771496, "num_input_tokens_seen": 352150000, "router_z_loss_clip": 0.01330566, "router_z_loss_mlp": 0.2109375, "step": 16315, "time_per_iteration": 3.4072606563568115 }, { "auxiliary_loss_clip": 0.01110096, "auxiliary_loss_mlp": 0.01029047, "balance_loss_clip": 1.01753736, "balance_loss_mlp": 1.03208649, "epoch": 0.9809709905305877, "flos": 22780616296320.0, "grad_norm": 1.3110676600160225, "language_loss": 0.69926745, "learning_rate": 3.5625456433419076e-09, "loss": 0.7206589, "num_input_tokens_seen": 352170990, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.69140625, "step": 16316, "time_per_iteration": 2.8116347789764404 }, { "auxiliary_loss_clip": 0.01102402, "auxiliary_loss_mlp": 0.01027937, "balance_loss_clip": 1.0159266, "balance_loss_mlp": 1.0329392, "epoch": 0.9810311137832557, "flos": 37013415217920.0, "grad_norm": 2.3595685588666706, "language_loss": 0.55694735, "learning_rate": 3.540040206435391e-09, "loss": 0.57825071, "num_input_tokens_seen": 352195335, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 16317, "time_per_iteration": 3.059998035430908 }, { "auxiliary_loss_clip": 0.01136022, "auxiliary_loss_mlp": 0.01027548, "balance_loss_clip": 1.01636052, "balance_loss_mlp": 1.03316653, "epoch": 0.9810912370359236, "flos": 18916305373440.0, "grad_norm": 1.5961787999296135, "language_loss": 0.72888595, "learning_rate": 3.5176060180495125e-09, "loss": 0.75052166, "num_input_tokens_seen": 352214170, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 16318, "time_per_iteration": 3.003999710083008 }, { "auxiliary_loss_clip": 0.01117101, "auxiliary_loss_mlp": 0.01026554, "balance_loss_clip": 1.01513362, "balance_loss_mlp": 1.03335238, "epoch": 0.9811513602885916, "flos": 18441602208000.0, "grad_norm": 2.0254487535856804, "language_loss": 0.82196993, "learning_rate": 3.495243078984522e-09, "loss": 0.84340644, "num_input_tokens_seen": 352231470, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 16319, "time_per_iteration": 2.8234927654266357 }, { "auxiliary_loss_clip": 0.01031294, "auxiliary_loss_mlp": 0.01001706, "balance_loss_clip": 1.00047767, "balance_loss_mlp": 1.00112569, "epoch": 0.9812114835412595, "flos": 68058945371520.0, "grad_norm": 0.7122454697035392, "language_loss": 0.53598905, "learning_rate": 3.472951390038892e-09, "loss": 0.556319, "num_input_tokens_seen": 352291770, "router_z_loss_clip": 0.01226807, "router_z_loss_mlp": 0.2109375, "step": 16320, "time_per_iteration": 3.2396326065063477 }, { "auxiliary_loss_clip": 0.01138759, "auxiliary_loss_mlp": 0.01033243, "balance_loss_clip": 1.02100635, "balance_loss_mlp": 1.03413844, "epoch": 0.9812716067939276, "flos": 25373007648000.0, "grad_norm": 1.6065345021238298, "language_loss": 0.72850007, "learning_rate": 3.450730952007763e-09, "loss": 0.75022006, "num_input_tokens_seen": 352310735, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 16321, "time_per_iteration": 2.8781516551971436 }, { "auxiliary_loss_clip": 0.01122316, "auxiliary_loss_mlp": 0.01033778, "balance_loss_clip": 1.0215652, "balance_loss_mlp": 1.03773975, "epoch": 0.9813317300465956, "flos": 22856818999680.0, "grad_norm": 1.5133144972389672, "language_loss": 0.78341502, "learning_rate": 3.42858176568428e-09, "loss": 0.80497599, "num_input_tokens_seen": 352329545, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.66796875, "step": 16322, "time_per_iteration": 2.7600369453430176 }, { "auxiliary_loss_clip": 0.01111186, "auxiliary_loss_mlp": 0.01029896, "balance_loss_clip": 1.01761174, "balance_loss_mlp": 1.03418911, "epoch": 0.9813918532992635, "flos": 23586954756480.0, "grad_norm": 1.4985391330173936, "language_loss": 0.80992246, "learning_rate": 3.40650383185892e-09, "loss": 0.83133334, "num_input_tokens_seen": 352352080, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 16323, "time_per_iteration": 2.8363776206970215 }, { "auxiliary_loss_clip": 0.01104284, "auxiliary_loss_mlp": 0.01033969, "balance_loss_clip": 1.02205372, "balance_loss_mlp": 1.03427827, "epoch": 0.9814519765519315, "flos": 19606328616960.0, "grad_norm": 2.077090970286965, "language_loss": 0.84301227, "learning_rate": 3.3844971513197207e-09, "loss": 0.86439478, "num_input_tokens_seen": 352366455, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69921875, "step": 16324, "time_per_iteration": 2.7071406841278076 }, { "auxiliary_loss_clip": 0.01119918, "auxiliary_loss_mlp": 0.01032376, "balance_loss_clip": 1.01927495, "balance_loss_mlp": 1.03413081, "epoch": 0.9815120998045994, "flos": 33946284787200.0, "grad_norm": 1.3540070339787234, "language_loss": 0.74657452, "learning_rate": 3.362561724851831e-09, "loss": 0.7680974, "num_input_tokens_seen": 352386090, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.68359375, "step": 16325, "time_per_iteration": 2.922903060913086 }, { "auxiliary_loss_clip": 0.01134098, "auxiliary_loss_mlp": 0.01031377, "balance_loss_clip": 1.02022505, "balance_loss_mlp": 1.03359818, "epoch": 0.9815722230572674, "flos": 20850023076480.0, "grad_norm": 2.305206747282499, "language_loss": 0.76674116, "learning_rate": 3.340697553238181e-09, "loss": 0.78839588, "num_input_tokens_seen": 352404000, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6484375, "step": 16326, "time_per_iteration": 2.8273236751556396 }, { "auxiliary_loss_clip": 0.01131838, "auxiliary_loss_mlp": 0.01027929, "balance_loss_clip": 1.01567459, "balance_loss_mlp": 1.03551292, "epoch": 0.9816323463099353, "flos": 22894525301760.0, "grad_norm": 2.1498397217834313, "language_loss": 0.67755759, "learning_rate": 3.3189046372590347e-09, "loss": 0.69915527, "num_input_tokens_seen": 352423540, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 16327, "time_per_iteration": 2.594757556915283 }, { "auxiliary_loss_clip": 0.01115884, "auxiliary_loss_mlp": 0.01035474, "balance_loss_clip": 1.02253938, "balance_loss_mlp": 1.03518522, "epoch": 0.9816924695626034, "flos": 31539444117120.0, "grad_norm": 1.4892974378781871, "language_loss": 0.73800695, "learning_rate": 3.2971829776919924e-09, "loss": 0.75952053, "num_input_tokens_seen": 352445530, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 16328, "time_per_iteration": 2.6484010219573975 }, { "auxiliary_loss_clip": 0.01129076, "auxiliary_loss_mlp": 0.01034648, "balance_loss_clip": 1.02120161, "balance_loss_mlp": 1.0371151, "epoch": 0.9817525928152713, "flos": 21506901045120.0, "grad_norm": 1.964284107648101, "language_loss": 0.81289941, "learning_rate": 3.275532575312434e-09, "loss": 0.83453661, "num_input_tokens_seen": 352466325, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73828125, "step": 16329, "time_per_iteration": 2.570936918258667 }, { "auxiliary_loss_clip": 0.01105644, "auxiliary_loss_mlp": 0.01033171, "balance_loss_clip": 1.01982582, "balance_loss_mlp": 1.03355861, "epoch": 0.9818127160679393, "flos": 25550513683200.0, "grad_norm": 1.9189928456907517, "language_loss": 0.76219451, "learning_rate": 3.253953430892853e-09, "loss": 0.78358269, "num_input_tokens_seen": 352485505, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 16330, "time_per_iteration": 2.666940212249756 }, { "auxiliary_loss_clip": 0.0111055, "auxiliary_loss_mlp": 0.01032372, "balance_loss_clip": 1.01726782, "balance_loss_mlp": 1.03499842, "epoch": 0.9818728393206072, "flos": 28803661672320.0, "grad_norm": 2.2516751501374035, "language_loss": 0.66837585, "learning_rate": 3.2324455452035214e-09, "loss": 0.68980503, "num_input_tokens_seen": 352505360, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.75390625, "step": 16331, "time_per_iteration": 2.6208155155181885 }, { "auxiliary_loss_clip": 0.01109176, "auxiliary_loss_mlp": 0.01027831, "balance_loss_clip": 1.016011, "balance_loss_mlp": 1.03240931, "epoch": 0.9819329625732752, "flos": 23222246014080.0, "grad_norm": 1.7334193778711058, "language_loss": 0.73533654, "learning_rate": 3.2110089190118263e-09, "loss": 0.75670671, "num_input_tokens_seen": 352524035, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 16332, "time_per_iteration": 2.5451128482818604 }, { "auxiliary_loss_clip": 0.01110372, "auxiliary_loss_mlp": 0.01025601, "balance_loss_clip": 1.01449668, "balance_loss_mlp": 1.03431451, "epoch": 0.9819930858259431, "flos": 17530440883200.0, "grad_norm": 1.6564130860708217, "language_loss": 0.77136087, "learning_rate": 3.1896435530829324e-09, "loss": 0.79272068, "num_input_tokens_seen": 352543210, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.671875, "step": 16333, "time_per_iteration": 2.5229437351226807 }, { "auxiliary_loss_clip": 0.01109464, "auxiliary_loss_mlp": 0.01273525, "balance_loss_clip": 1.01486397, "balance_loss_mlp": 1.03292298, "epoch": 0.9820532090786112, "flos": 12529915971840.0, "grad_norm": 2.6012181709823037, "language_loss": 0.73326361, "learning_rate": 3.1683494481793415e-09, "loss": 0.75709349, "num_input_tokens_seen": 352559770, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 16334, "time_per_iteration": 2.625802993774414 }, { "auxiliary_loss_clip": 0.01139329, "auxiliary_loss_mlp": 0.01038733, "balance_loss_clip": 1.02609706, "balance_loss_mlp": 1.03358531, "epoch": 0.9821133323312792, "flos": 18840174497280.0, "grad_norm": 1.8123959641406517, "language_loss": 0.78732628, "learning_rate": 3.147126605060668e-09, "loss": 0.80910695, "num_input_tokens_seen": 352577690, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 16335, "time_per_iteration": 2.5984323024749756 }, { "auxiliary_loss_clip": 0.01125504, "auxiliary_loss_mlp": 0.01035541, "balance_loss_clip": 1.02159941, "balance_loss_mlp": 1.03540885, "epoch": 0.9821734555839471, "flos": 34824013528320.0, "grad_norm": 1.9902745714556436, "language_loss": 0.64201534, "learning_rate": 3.1259750244847505e-09, "loss": 0.66362584, "num_input_tokens_seen": 352598850, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.72265625, "step": 16336, "time_per_iteration": 2.6829068660736084 }, { "auxiliary_loss_clip": 0.01133123, "auxiliary_loss_mlp": 0.01038523, "balance_loss_clip": 1.02464163, "balance_loss_mlp": 1.03411293, "epoch": 0.9822335788366151, "flos": 17128169493120.0, "grad_norm": 1.8117943460190937, "language_loss": 0.73284703, "learning_rate": 3.104894707206096e-09, "loss": 0.75456345, "num_input_tokens_seen": 352616130, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 16337, "time_per_iteration": 2.632981777191162 }, { "auxiliary_loss_clip": 0.01111183, "auxiliary_loss_mlp": 0.01028173, "balance_loss_clip": 1.01492882, "balance_loss_mlp": 1.03220034, "epoch": 0.982293702089283, "flos": 20813250528000.0, "grad_norm": 1.859879258862954, "language_loss": 0.73106503, "learning_rate": 3.083885653977214e-09, "loss": 0.75245863, "num_input_tokens_seen": 352636885, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 16338, "time_per_iteration": 2.587435483932495 }, { "auxiliary_loss_clip": 0.01144364, "auxiliary_loss_mlp": 0.01032326, "balance_loss_clip": 1.02010691, "balance_loss_mlp": 1.03140485, "epoch": 0.982353825341951, "flos": 24680829588480.0, "grad_norm": 2.093793618166461, "language_loss": 0.81594372, "learning_rate": 3.062947865547505e-09, "loss": 0.83771056, "num_input_tokens_seen": 352657905, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 16339, "time_per_iteration": 2.6977379322052 }, { "auxiliary_loss_clip": 0.01134659, "auxiliary_loss_mlp": 0.01033054, "balance_loss_clip": 1.02169323, "balance_loss_mlp": 1.03318906, "epoch": 0.982413948594619, "flos": 20850489953280.0, "grad_norm": 1.8295155475037568, "language_loss": 0.62820578, "learning_rate": 3.042081342664815e-09, "loss": 0.64988297, "num_input_tokens_seen": 352676320, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.65625, "step": 16340, "time_per_iteration": 3.992236852645874 }, { "auxiliary_loss_clip": 0.01105309, "auxiliary_loss_mlp": 0.01035062, "balance_loss_clip": 1.02211595, "balance_loss_mlp": 1.03486657, "epoch": 0.982474071847287, "flos": 15377380778880.0, "grad_norm": 2.2473442487306685, "language_loss": 0.85420322, "learning_rate": 3.021286086073216e-09, "loss": 0.87560689, "num_input_tokens_seen": 352692665, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 16341, "time_per_iteration": 2.6862378120422363 }, { "auxiliary_loss_clip": 0.01117244, "auxiliary_loss_mlp": 0.0102809, "balance_loss_clip": 1.01628828, "balance_loss_mlp": 1.03358924, "epoch": 0.9825341950999549, "flos": 23774732081280.0, "grad_norm": 1.8693873963649719, "language_loss": 0.6714561, "learning_rate": 3.0005620965152246e-09, "loss": 0.69290948, "num_input_tokens_seen": 352716130, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.65625, "step": 16342, "time_per_iteration": 2.64839768409729 }, { "auxiliary_loss_clip": 0.01102065, "auxiliary_loss_mlp": 0.01024655, "balance_loss_clip": 1.01266289, "balance_loss_mlp": 1.03272057, "epoch": 0.9825943183526229, "flos": 22746285872640.0, "grad_norm": 1.6778839192190513, "language_loss": 0.7709958, "learning_rate": 2.9799093747300274e-09, "loss": 0.79226303, "num_input_tokens_seen": 352734705, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 16343, "time_per_iteration": 2.4991016387939453 }, { "auxiliary_loss_clip": 0.0112637, "auxiliary_loss_mlp": 0.01029644, "balance_loss_clip": 1.01866519, "balance_loss_mlp": 1.03310108, "epoch": 0.9826544416052908, "flos": 27709966408320.0, "grad_norm": 1.6714662556299724, "language_loss": 0.75461268, "learning_rate": 2.959327921455257e-09, "loss": 0.77617288, "num_input_tokens_seen": 352756225, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.66796875, "step": 16344, "time_per_iteration": 2.6142284870147705 }, { "auxiliary_loss_clip": 0.0111891, "auxiliary_loss_mlp": 0.01030849, "balance_loss_clip": 1.01842141, "balance_loss_mlp": 1.03285933, "epoch": 0.9827145648579588, "flos": 26941657472640.0, "grad_norm": 1.8973027537626457, "language_loss": 0.66593504, "learning_rate": 2.9388177374247703e-09, "loss": 0.68743265, "num_input_tokens_seen": 352776210, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 16345, "time_per_iteration": 2.583892583847046 }, { "auxiliary_loss_clip": 0.0110597, "auxiliary_loss_mlp": 0.01027284, "balance_loss_clip": 1.01666808, "balance_loss_mlp": 1.03241181, "epoch": 0.9827746881106267, "flos": 21866545969920.0, "grad_norm": 1.3719705591343498, "language_loss": 0.79286224, "learning_rate": 2.918378823371093e-09, "loss": 0.8141948, "num_input_tokens_seen": 352795455, "router_z_loss_clip": 0.10595703, "router_z_loss_mlp": 0.6484375, "step": 16346, "time_per_iteration": 2.518808603286743 }, { "auxiliary_loss_clip": 0.01116929, "auxiliary_loss_mlp": 0.01032247, "balance_loss_clip": 1.01921153, "balance_loss_mlp": 1.03453732, "epoch": 0.9828348113632948, "flos": 20850777262080.0, "grad_norm": 2.022697690330983, "language_loss": 0.74799693, "learning_rate": 2.8980111800231965e-09, "loss": 0.76948869, "num_input_tokens_seen": 352812895, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 16347, "time_per_iteration": 2.5069894790649414 }, { "auxiliary_loss_clip": 0.01152536, "auxiliary_loss_mlp": 0.01035868, "balance_loss_clip": 1.02482271, "balance_loss_mlp": 1.03264093, "epoch": 0.9828949346159628, "flos": 34569227381760.0, "grad_norm": 2.0602393296887898, "language_loss": 0.66498828, "learning_rate": 2.877714808108056e-09, "loss": 0.68687236, "num_input_tokens_seen": 352835470, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66796875, "step": 16348, "time_per_iteration": 2.737954616546631 }, { "auxiliary_loss_clip": 0.0112, "auxiliary_loss_mlp": 0.01028997, "balance_loss_clip": 1.01652777, "balance_loss_mlp": 1.03396475, "epoch": 0.9829550578686307, "flos": 24457464864000.0, "grad_norm": 3.3183300313444564, "language_loss": 0.6922462, "learning_rate": 2.8574897083502027e-09, "loss": 0.71373624, "num_input_tokens_seen": 352854295, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.68359375, "step": 16349, "time_per_iteration": 4.133458852767944 }, { "auxiliary_loss_clip": 0.0111469, "auxiliary_loss_mlp": 0.01030777, "balance_loss_clip": 1.01937461, "balance_loss_mlp": 1.03251243, "epoch": 0.9830151811212987, "flos": 16910084067840.0, "grad_norm": 1.791084765349098, "language_loss": 0.75724399, "learning_rate": 2.8373358814712812e-09, "loss": 0.77869868, "num_input_tokens_seen": 352869695, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.640625, "step": 16350, "time_per_iteration": 2.536881923675537 }, { "auxiliary_loss_clip": 0.01129408, "auxiliary_loss_mlp": 0.01029885, "balance_loss_clip": 1.01720166, "balance_loss_mlp": 1.03387856, "epoch": 0.9830753043739666, "flos": 21288312829440.0, "grad_norm": 2.060977026137373, "language_loss": 0.8413552, "learning_rate": 2.817253328190494e-09, "loss": 0.86294806, "num_input_tokens_seen": 352887430, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 16351, "time_per_iteration": 2.582185983657837 }, { "auxiliary_loss_clip": 0.01127639, "auxiliary_loss_mlp": 0.01025046, "balance_loss_clip": 1.0127492, "balance_loss_mlp": 1.03291595, "epoch": 0.9831354276266346, "flos": 31723522341120.0, "grad_norm": 1.63924298115063, "language_loss": 0.68548328, "learning_rate": 2.7972420492243796e-09, "loss": 0.70701009, "num_input_tokens_seen": 352907555, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 16352, "time_per_iteration": 2.6721091270446777 }, { "auxiliary_loss_clip": 0.01109758, "auxiliary_loss_mlp": 0.01278728, "balance_loss_clip": 1.02047801, "balance_loss_mlp": 1.03442931, "epoch": 0.9831955508793025, "flos": 21619050284160.0, "grad_norm": 1.4714265680887988, "language_loss": 0.66370469, "learning_rate": 2.7773020452874773e-09, "loss": 0.68758953, "num_input_tokens_seen": 352928670, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66015625, "step": 16353, "time_per_iteration": 2.5942893028259277 }, { "auxiliary_loss_clip": 0.01115146, "auxiliary_loss_mlp": 0.01031707, "balance_loss_clip": 1.02054262, "balance_loss_mlp": 1.03126812, "epoch": 0.9832556741319706, "flos": 22968214053120.0, "grad_norm": 1.4271817976451018, "language_loss": 0.74411094, "learning_rate": 2.7574333170912177e-09, "loss": 0.76557946, "num_input_tokens_seen": 352948345, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6640625, "step": 16354, "time_per_iteration": 2.578749179840088 }, { "auxiliary_loss_clip": 0.01109869, "auxiliary_loss_mlp": 0.01031048, "balance_loss_clip": 1.01887059, "balance_loss_mlp": 1.033584, "epoch": 0.9833157973846385, "flos": 21323900229120.0, "grad_norm": 1.8959748816556607, "language_loss": 0.77705228, "learning_rate": 2.7376358653445895e-09, "loss": 0.79846144, "num_input_tokens_seen": 352967250, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 16355, "time_per_iteration": 4.073590517044067 }, { "auxiliary_loss_clip": 0.01108386, "auxiliary_loss_mlp": 0.01029665, "balance_loss_clip": 1.01761913, "balance_loss_mlp": 1.03276491, "epoch": 0.9833759206373065, "flos": 21068719032960.0, "grad_norm": 1.8780026369803036, "language_loss": 0.73332953, "learning_rate": 2.7179096907541387e-09, "loss": 0.75471002, "num_input_tokens_seen": 352984725, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.671875, "step": 16356, "time_per_iteration": 3.935202121734619 }, { "auxiliary_loss_clip": 0.01118764, "auxiliary_loss_mlp": 0.01029917, "balance_loss_clip": 1.01717401, "balance_loss_mlp": 1.03445554, "epoch": 0.9834360438899744, "flos": 18697322108160.0, "grad_norm": 1.6264668998614966, "language_loss": 0.76208568, "learning_rate": 2.6982547940239685e-09, "loss": 0.78357244, "num_input_tokens_seen": 353003480, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6640625, "step": 16357, "time_per_iteration": 2.7911596298217773 }, { "auxiliary_loss_clip": 0.0111811, "auxiliary_loss_mlp": 0.01024095, "balance_loss_clip": 1.01178622, "balance_loss_mlp": 1.03407669, "epoch": 0.9834961671426424, "flos": 21105240186240.0, "grad_norm": 1.9206235418322553, "language_loss": 0.80324918, "learning_rate": 2.678671175855296e-09, "loss": 0.82467121, "num_input_tokens_seen": 353021425, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6640625, "step": 16358, "time_per_iteration": 2.7491400241851807 }, { "auxiliary_loss_clip": 0.01109278, "auxiliary_loss_mlp": 0.01024976, "balance_loss_clip": 1.01357973, "balance_loss_mlp": 1.03277612, "epoch": 0.9835562903953103, "flos": 26687625511680.0, "grad_norm": 2.287337706661899, "language_loss": 0.67487264, "learning_rate": 2.6591588369471174e-09, "loss": 0.69621515, "num_input_tokens_seen": 353039870, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.67578125, "step": 16359, "time_per_iteration": 2.580958604812622 }, { "auxiliary_loss_clip": 0.01098609, "auxiliary_loss_mlp": 0.01032227, "balance_loss_clip": 1.02102184, "balance_loss_mlp": 1.03215098, "epoch": 0.9836164136479784, "flos": 22090162089600.0, "grad_norm": 3.944082007700481, "language_loss": 0.69645619, "learning_rate": 2.6397177779959867e-09, "loss": 0.71776456, "num_input_tokens_seen": 353059750, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6640625, "step": 16360, "time_per_iteration": 2.586991786956787 }, { "auxiliary_loss_clip": 0.01112304, "auxiliary_loss_mlp": 0.01032843, "balance_loss_clip": 1.02012908, "balance_loss_mlp": 1.03412676, "epoch": 0.9836765369006463, "flos": 23878405710720.0, "grad_norm": 1.7729053083086166, "language_loss": 0.84253961, "learning_rate": 2.620347999695349e-09, "loss": 0.86399102, "num_input_tokens_seen": 353079940, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 16361, "time_per_iteration": 2.597930431365967 }, { "auxiliary_loss_clip": 0.01100068, "auxiliary_loss_mlp": 0.01025885, "balance_loss_clip": 1.01464307, "balance_loss_mlp": 1.03196204, "epoch": 0.9837366601533143, "flos": 25845017293440.0, "grad_norm": 1.8863978333537346, "language_loss": 0.76132774, "learning_rate": 2.601049502736652e-09, "loss": 0.78258729, "num_input_tokens_seen": 353099990, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6796875, "step": 16362, "time_per_iteration": 2.6479711532592773 }, { "auxiliary_loss_clip": 0.01102891, "auxiliary_loss_mlp": 0.01032757, "balance_loss_clip": 1.02079439, "balance_loss_mlp": 1.03445101, "epoch": 0.9837967834059823, "flos": 21358015171200.0, "grad_norm": 1.5913164215744315, "language_loss": 0.70896089, "learning_rate": 2.581822287808455e-09, "loss": 0.73031735, "num_input_tokens_seen": 353118710, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 16363, "time_per_iteration": 2.6640264987945557 }, { "auxiliary_loss_clip": 0.011141, "auxiliary_loss_mlp": 0.01027005, "balance_loss_clip": 1.01562095, "balance_loss_mlp": 1.0319562, "epoch": 0.9838569066586502, "flos": 18515793749760.0, "grad_norm": 1.6935709622763564, "language_loss": 0.63008815, "learning_rate": 2.5626663555970985e-09, "loss": 0.65149921, "num_input_tokens_seen": 353136415, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.64453125, "step": 16364, "time_per_iteration": 2.6655867099761963 }, { "auxiliary_loss_clip": 0.01136193, "auxiliary_loss_mlp": 0.01034261, "balance_loss_clip": 1.02300119, "balance_loss_mlp": 1.0331378, "epoch": 0.9839170299113182, "flos": 22452392793600.0, "grad_norm": 1.8714016584569517, "language_loss": 0.75006133, "learning_rate": 2.5435817067862573e-09, "loss": 0.77176589, "num_input_tokens_seen": 353154650, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6796875, "step": 16365, "time_per_iteration": 2.8548014163970947 }, { "auxiliary_loss_clip": 0.0112702, "auxiliary_loss_mlp": 0.0102756, "balance_loss_clip": 1.01503694, "balance_loss_mlp": 1.03295493, "epoch": 0.9839771531639862, "flos": 27892320779520.0, "grad_norm": 1.6697671891913357, "language_loss": 0.76049244, "learning_rate": 2.524568342056943e-09, "loss": 0.78203821, "num_input_tokens_seen": 353174065, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.66796875, "step": 16366, "time_per_iteration": 2.6846203804016113 }, { "auxiliary_loss_clip": 0.01098945, "auxiliary_loss_mlp": 0.01028499, "balance_loss_clip": 1.0165782, "balance_loss_mlp": 1.03208029, "epoch": 0.9840372764166542, "flos": 28214510797440.0, "grad_norm": 2.784991226885224, "language_loss": 0.77156711, "learning_rate": 2.505626262087501e-09, "loss": 0.79284155, "num_input_tokens_seen": 353193560, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66796875, "step": 16367, "time_per_iteration": 2.6497950553894043 }, { "auxiliary_loss_clip": 0.01119935, "auxiliary_loss_mlp": 0.01034398, "balance_loss_clip": 1.02213168, "balance_loss_mlp": 1.03541148, "epoch": 0.9840973996693221, "flos": 23403989854080.0, "grad_norm": 1.7202568915823342, "language_loss": 0.61212373, "learning_rate": 2.48675546755428e-09, "loss": 0.63366711, "num_input_tokens_seen": 353213525, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6640625, "step": 16368, "time_per_iteration": 2.6086416244506836 }, { "auxiliary_loss_clip": 0.01106018, "auxiliary_loss_mlp": 0.01028449, "balance_loss_clip": 1.01779151, "balance_loss_mlp": 1.03287148, "epoch": 0.9841575229219901, "flos": 21395865127680.0, "grad_norm": 1.4636028415796878, "language_loss": 0.65435171, "learning_rate": 2.467955959130297e-09, "loss": 0.67569637, "num_input_tokens_seen": 353234000, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.64453125, "step": 16369, "time_per_iteration": 2.6135566234588623 }, { "auxiliary_loss_clip": 0.01112339, "auxiliary_loss_mlp": 0.01282023, "balance_loss_clip": 1.02265131, "balance_loss_mlp": 1.03407478, "epoch": 0.984217646174658, "flos": 24464072966400.0, "grad_norm": 1.617973918929108, "language_loss": 0.6849699, "learning_rate": 2.449227737486792e-09, "loss": 0.70891351, "num_input_tokens_seen": 353254940, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 16370, "time_per_iteration": 2.61963152885437 }, { "auxiliary_loss_clip": 0.01102831, "auxiliary_loss_mlp": 0.01030713, "balance_loss_clip": 1.01748037, "balance_loss_mlp": 1.03279662, "epoch": 0.984277769427326, "flos": 16435057680000.0, "grad_norm": 1.7536453684519775, "language_loss": 0.73588371, "learning_rate": 2.43057080329212e-09, "loss": 0.7572192, "num_input_tokens_seen": 353272590, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 16371, "time_per_iteration": 2.729139566421509 }, { "auxiliary_loss_clip": 0.01031198, "auxiliary_loss_mlp": 0.01002483, "balance_loss_clip": 1.001315, "balance_loss_mlp": 1.00095963, "epoch": 0.9843378926799939, "flos": 64530615288960.0, "grad_norm": 0.7792372426227384, "language_loss": 0.65171397, "learning_rate": 2.411985157211971e-09, "loss": 0.67205083, "num_input_tokens_seen": 353334380, "router_z_loss_clip": 0.01165771, "router_z_loss_mlp": 0.2109375, "step": 16372, "time_per_iteration": 3.220168113708496 }, { "auxiliary_loss_clip": 0.01108944, "auxiliary_loss_mlp": 0.0102959, "balance_loss_clip": 1.01741278, "balance_loss_mlp": 1.03273821, "epoch": 0.984398015932662, "flos": 17382811985280.0, "grad_norm": 1.8613593394633778, "language_loss": 0.70704532, "learning_rate": 2.3934707999095915e-09, "loss": 0.72843063, "num_input_tokens_seen": 353351640, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 16373, "time_per_iteration": 2.7799994945526123 }, { "auxiliary_loss_clip": 0.01101439, "auxiliary_loss_mlp": 0.01030788, "balance_loss_clip": 1.01891482, "balance_loss_mlp": 1.03423977, "epoch": 0.9844581391853299, "flos": 23879088069120.0, "grad_norm": 1.6913864844920654, "language_loss": 0.81448972, "learning_rate": 2.3750277320457866e-09, "loss": 0.83581197, "num_input_tokens_seen": 353372555, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 16374, "time_per_iteration": 2.9338841438293457 }, { "auxiliary_loss_clip": 0.01141079, "auxiliary_loss_mlp": 0.01031717, "balance_loss_clip": 1.02000427, "balance_loss_mlp": 1.03615713, "epoch": 0.9845182624379979, "flos": 19865352568320.0, "grad_norm": 1.7321542747026688, "language_loss": 0.69345218, "learning_rate": 2.356655954278919e-09, "loss": 0.71518016, "num_input_tokens_seen": 353391385, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 16375, "time_per_iteration": 2.73606538772583 }, { "auxiliary_loss_clip": 0.01123034, "auxiliary_loss_mlp": 0.01032626, "balance_loss_clip": 1.01960278, "balance_loss_mlp": 1.03404236, "epoch": 0.9845783856906659, "flos": 28254659224320.0, "grad_norm": 1.6730636578465108, "language_loss": 0.80684686, "learning_rate": 2.3383554672642412e-09, "loss": 0.82840341, "num_input_tokens_seen": 353411630, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 16376, "time_per_iteration": 2.757521390914917 }, { "auxiliary_loss_clip": 0.01098873, "auxiliary_loss_mlp": 0.01033282, "balance_loss_clip": 1.02100384, "balance_loss_mlp": 1.03259599, "epoch": 0.9846385089433338, "flos": 28328383889280.0, "grad_norm": 1.516148067534539, "language_loss": 0.62531769, "learning_rate": 2.3201262716552315e-09, "loss": 0.64663923, "num_input_tokens_seen": 353432895, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6640625, "step": 16377, "time_per_iteration": 2.731214761734009 }, { "auxiliary_loss_clip": 0.0112993, "auxiliary_loss_mlp": 0.01034219, "balance_loss_clip": 1.02120721, "balance_loss_mlp": 1.03402531, "epoch": 0.9846986321960018, "flos": 24316767290880.0, "grad_norm": 1.5937014359709474, "language_loss": 0.72925109, "learning_rate": 2.3019683681022583e-09, "loss": 0.75089258, "num_input_tokens_seen": 353454195, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 16378, "time_per_iteration": 2.8067445755004883 }, { "auxiliary_loss_clip": 0.01102187, "auxiliary_loss_mlp": 0.01032303, "balance_loss_clip": 1.02059102, "balance_loss_mlp": 1.03352118, "epoch": 0.9847587554486698, "flos": 27271999877760.0, "grad_norm": 1.5611911244203467, "language_loss": 0.71222496, "learning_rate": 2.2838817572532477e-09, "loss": 0.73356986, "num_input_tokens_seen": 353475125, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 16379, "time_per_iteration": 2.699770927429199 }, { "auxiliary_loss_clip": 0.01130063, "auxiliary_loss_mlp": 0.01033346, "balance_loss_clip": 1.02112138, "balance_loss_mlp": 1.03412676, "epoch": 0.9848188787013378, "flos": 16542717719040.0, "grad_norm": 1.9711771816644554, "language_loss": 0.68232346, "learning_rate": 2.265866439753683e-09, "loss": 0.70395756, "num_input_tokens_seen": 353493265, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 16380, "time_per_iteration": 2.7616095542907715 }, { "auxiliary_loss_clip": 0.01124143, "auxiliary_loss_mlp": 0.01032703, "balance_loss_clip": 1.02078235, "balance_loss_mlp": 1.03444195, "epoch": 0.9848790019540057, "flos": 20193647898240.0, "grad_norm": 2.0807367369299508, "language_loss": 0.790582, "learning_rate": 2.2479224162466058e-09, "loss": 0.81215048, "num_input_tokens_seen": 353511650, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 16381, "time_per_iteration": 4.137555122375488 }, { "auxiliary_loss_clip": 0.01110868, "auxiliary_loss_mlp": 0.01027677, "balance_loss_clip": 1.01557136, "balance_loss_mlp": 1.03319693, "epoch": 0.9849391252066737, "flos": 28259723041920.0, "grad_norm": 1.4778696608433148, "language_loss": 0.82436752, "learning_rate": 2.2300496873723928e-09, "loss": 0.84575307, "num_input_tokens_seen": 353534035, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 16382, "time_per_iteration": 2.8337628841400146 }, { "auxiliary_loss_clip": 0.01123598, "auxiliary_loss_mlp": 0.01032412, "balance_loss_clip": 1.01901889, "balance_loss_mlp": 1.03354347, "epoch": 0.9849992484593416, "flos": 22454942659200.0, "grad_norm": 3.0320466506789034, "language_loss": 0.7428003, "learning_rate": 2.212248253768978e-09, "loss": 0.76436043, "num_input_tokens_seen": 353549950, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 16383, "time_per_iteration": 2.7521562576293945 }, { "auxiliary_loss_clip": 0.01139932, "auxiliary_loss_mlp": 0.01029023, "balance_loss_clip": 1.01712608, "balance_loss_mlp": 1.03560567, "epoch": 0.9850593717120096, "flos": 34497190656000.0, "grad_norm": 3.019488395569163, "language_loss": 0.6617167, "learning_rate": 2.194518116071187e-09, "loss": 0.68340623, "num_input_tokens_seen": 353573745, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 16384, "time_per_iteration": 2.9089467525482178 }, { "auxiliary_loss_clip": 0.01125374, "auxiliary_loss_mlp": 0.01033445, "balance_loss_clip": 1.01939642, "balance_loss_mlp": 1.03664124, "epoch": 0.9851194949646775, "flos": 38837282152320.0, "grad_norm": 1.71714860135041, "language_loss": 0.7011745, "learning_rate": 2.1768592749122904e-09, "loss": 0.7227627, "num_input_tokens_seen": 353595335, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7109375, "step": 16385, "time_per_iteration": 2.850207805633545 }, { "auxiliary_loss_clip": 0.01123855, "auxiliary_loss_mlp": 0.01030074, "balance_loss_clip": 1.01789117, "balance_loss_mlp": 1.03738117, "epoch": 0.9851796182173456, "flos": 17712436118400.0, "grad_norm": 2.436418751548704, "language_loss": 0.80841458, "learning_rate": 2.1592717309222296e-09, "loss": 0.82995379, "num_input_tokens_seen": 353614270, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 16386, "time_per_iteration": 2.6722207069396973 }, { "auxiliary_loss_clip": 0.01129868, "auxiliary_loss_mlp": 0.0102896, "balance_loss_clip": 1.01669383, "balance_loss_mlp": 1.03425813, "epoch": 0.9852397414700135, "flos": 27454318335360.0, "grad_norm": 1.8364185121630614, "language_loss": 0.6748395, "learning_rate": 2.1417554847287244e-09, "loss": 0.69642776, "num_input_tokens_seen": 353634900, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 16387, "time_per_iteration": 2.73587703704834 }, { "auxiliary_loss_clip": 0.0110295, "auxiliary_loss_mlp": 0.01274082, "balance_loss_clip": 1.01611662, "balance_loss_mlp": 1.03049278, "epoch": 0.9852998647226815, "flos": 18296702743680.0, "grad_norm": 2.3564355301659266, "language_loss": 0.73481989, "learning_rate": 2.1243105369568302e-09, "loss": 0.75859016, "num_input_tokens_seen": 353652890, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.63671875, "step": 16388, "time_per_iteration": 2.648027181625366 }, { "auxiliary_loss_clip": 0.01109194, "auxiliary_loss_mlp": 0.01028696, "balance_loss_clip": 1.016662, "balance_loss_mlp": 1.03212357, "epoch": 0.9853599879753495, "flos": 19642562461440.0, "grad_norm": 1.642318146020486, "language_loss": 0.82177299, "learning_rate": 2.1069368882291603e-09, "loss": 0.84315187, "num_input_tokens_seen": 353671295, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 16389, "time_per_iteration": 2.6714000701904297 }, { "auxiliary_loss_clip": 0.01119228, "auxiliary_loss_mlp": 0.01272203, "balance_loss_clip": 1.01271081, "balance_loss_mlp": 1.03388071, "epoch": 0.9854201112280174, "flos": 23841956384640.0, "grad_norm": 2.012422482520637, "language_loss": 0.66944337, "learning_rate": 2.0896345391656634e-09, "loss": 0.69335771, "num_input_tokens_seen": 353690560, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 16390, "time_per_iteration": 4.256151914596558 }, { "auxiliary_loss_clip": 0.01130588, "auxiliary_loss_mlp": 0.01032991, "balance_loss_clip": 1.0197885, "balance_loss_mlp": 1.03513956, "epoch": 0.9854802344806854, "flos": 30080573233920.0, "grad_norm": 1.6635863366235082, "language_loss": 0.7720598, "learning_rate": 2.072403490383845e-09, "loss": 0.79369557, "num_input_tokens_seen": 353710660, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6875, "step": 16391, "time_per_iteration": 2.841198444366455 }, { "auxiliary_loss_clip": 0.01123916, "auxiliary_loss_mlp": 0.01275881, "balance_loss_clip": 1.0181098, "balance_loss_mlp": 1.03371191, "epoch": 0.9855403577333534, "flos": 21907412668800.0, "grad_norm": 7.345804923524808, "language_loss": 0.68095773, "learning_rate": 2.055243742498769e-09, "loss": 0.70495564, "num_input_tokens_seen": 353730440, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6328125, "step": 16392, "time_per_iteration": 2.7592475414276123 }, { "auxiliary_loss_clip": 0.01132131, "auxiliary_loss_mlp": 0.01027317, "balance_loss_clip": 1.01425791, "balance_loss_mlp": 1.03371334, "epoch": 0.9856004809860214, "flos": 15413794191360.0, "grad_norm": 1.956537502054207, "language_loss": 0.55857998, "learning_rate": 2.0381552961226125e-09, "loss": 0.58017445, "num_input_tokens_seen": 353748360, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 16393, "time_per_iteration": 2.7452292442321777 }, { "auxiliary_loss_clip": 0.0113471, "auxiliary_loss_mlp": 0.01030545, "balance_loss_clip": 1.0174973, "balance_loss_mlp": 1.03551888, "epoch": 0.9856606042386893, "flos": 13653201064320.0, "grad_norm": 4.959734244316165, "language_loss": 0.78823638, "learning_rate": 2.0211381518653313e-09, "loss": 0.80988896, "num_input_tokens_seen": 353760880, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 16394, "time_per_iteration": 2.7783920764923096 }, { "auxiliary_loss_clip": 0.01113801, "auxiliary_loss_mlp": 0.01034095, "balance_loss_clip": 1.02176309, "balance_loss_mlp": 1.03346467, "epoch": 0.9857207274913573, "flos": 23479151063040.0, "grad_norm": 1.5746749847171135, "language_loss": 0.76057756, "learning_rate": 2.0041923103344403e-09, "loss": 0.78205657, "num_input_tokens_seen": 353782255, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71484375, "step": 16395, "time_per_iteration": 2.804408311843872 }, { "auxiliary_loss_clip": 0.0111049, "auxiliary_loss_mlp": 0.0103034, "balance_loss_clip": 1.01853216, "balance_loss_mlp": 1.03426743, "epoch": 0.9857808507440252, "flos": 21065486808960.0, "grad_norm": 1.8868910668113217, "language_loss": 0.75216442, "learning_rate": 1.9873177721341226e-09, "loss": 0.77357274, "num_input_tokens_seen": 353803580, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 16396, "time_per_iteration": 2.7995011806488037 }, { "auxiliary_loss_clip": 0.01127611, "auxiliary_loss_mlp": 0.01027619, "balance_loss_clip": 1.01567459, "balance_loss_mlp": 1.03375351, "epoch": 0.9858409739966932, "flos": 25301365971840.0, "grad_norm": 1.4847512670256406, "language_loss": 0.70891476, "learning_rate": 1.9705145378672294e-09, "loss": 0.73046708, "num_input_tokens_seen": 353824200, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 16397, "time_per_iteration": 7.700043439865112 }, { "auxiliary_loss_clip": 0.011174, "auxiliary_loss_mlp": 0.01032374, "balance_loss_clip": 1.02067399, "balance_loss_mlp": 1.03535056, "epoch": 0.9859010972493611, "flos": 20558751690240.0, "grad_norm": 1.5228895352824874, "language_loss": 0.71238029, "learning_rate": 1.953782608133059e-09, "loss": 0.73387814, "num_input_tokens_seen": 353843350, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.640625, "step": 16398, "time_per_iteration": 2.684917449951172 }, { "auxiliary_loss_clip": 0.01127082, "auxiliary_loss_mlp": 0.01025188, "balance_loss_clip": 1.01354694, "balance_loss_mlp": 1.03416121, "epoch": 0.9859612205020292, "flos": 24754985216640.0, "grad_norm": 1.369351815345371, "language_loss": 0.73947895, "learning_rate": 1.937121983528689e-09, "loss": 0.76100159, "num_input_tokens_seen": 353864520, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 16399, "time_per_iteration": 2.844630002975464 }, { "auxiliary_loss_clip": 0.01102636, "auxiliary_loss_mlp": 0.01028736, "balance_loss_clip": 1.01614118, "balance_loss_mlp": 1.03314471, "epoch": 0.9860213437546971, "flos": 21105850717440.0, "grad_norm": 2.43950860400978, "language_loss": 0.57277048, "learning_rate": 1.920532664648755e-09, "loss": 0.59408426, "num_input_tokens_seen": 353882240, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 16400, "time_per_iteration": 2.7728772163391113 }, { "auxiliary_loss_clip": 0.01125473, "auxiliary_loss_mlp": 0.0102926, "balance_loss_clip": 1.01608145, "balance_loss_mlp": 1.03387797, "epoch": 0.9860814670073651, "flos": 19136078737920.0, "grad_norm": 1.6536206625734655, "language_loss": 0.806306, "learning_rate": 1.9040146520854507e-09, "loss": 0.82785332, "num_input_tokens_seen": 353901590, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 16401, "time_per_iteration": 2.8688650131225586 }, { "auxiliary_loss_clip": 0.01110143, "auxiliary_loss_mlp": 0.01032141, "balance_loss_clip": 1.01916504, "balance_loss_mlp": 1.03429842, "epoch": 0.9861415902600331, "flos": 17237050594560.0, "grad_norm": 6.1416309623774366, "language_loss": 0.77994883, "learning_rate": 1.887567946428081e-09, "loss": 0.80137169, "num_input_tokens_seen": 353918785, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.671875, "step": 16402, "time_per_iteration": 2.632594108581543 }, { "auxiliary_loss_clip": 0.01113231, "auxiliary_loss_mlp": 0.01031128, "balance_loss_clip": 1.01911175, "balance_loss_mlp": 1.0354414, "epoch": 0.986201713512701, "flos": 27782577751680.0, "grad_norm": 1.5934968231241629, "language_loss": 0.70369869, "learning_rate": 1.8711925482637334e-09, "loss": 0.72514224, "num_input_tokens_seen": 353940390, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 16403, "time_per_iteration": 2.791858673095703 }, { "auxiliary_loss_clip": 0.01106904, "auxiliary_loss_mlp": 0.01028195, "balance_loss_clip": 1.01707244, "balance_loss_mlp": 1.0309217, "epoch": 0.986261836765369, "flos": 28730403884160.0, "grad_norm": 2.0772039493874184, "language_loss": 0.74510133, "learning_rate": 1.8548884581766066e-09, "loss": 0.76645231, "num_input_tokens_seen": 353962180, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.671875, "step": 16404, "time_per_iteration": 2.764270782470703 }, { "auxiliary_loss_clip": 0.0111092, "auxiliary_loss_mlp": 0.01275881, "balance_loss_clip": 1.01702785, "balance_loss_mlp": 1.03388321, "epoch": 0.986321960018037, "flos": 34313471568000.0, "grad_norm": 1.8615017529509144, "language_loss": 0.70185995, "learning_rate": 1.8386556767489014e-09, "loss": 0.72572803, "num_input_tokens_seen": 353984305, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 16405, "time_per_iteration": 2.674654006958008 }, { "auxiliary_loss_clip": 0.01120081, "auxiliary_loss_mlp": 0.01030889, "balance_loss_clip": 1.0198319, "balance_loss_mlp": 1.0362668, "epoch": 0.986382083270705, "flos": 25189755436800.0, "grad_norm": 1.6814555519369647, "language_loss": 0.69535565, "learning_rate": 1.8224942045594883e-09, "loss": 0.71686542, "num_input_tokens_seen": 354004495, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6640625, "step": 16406, "time_per_iteration": 2.936879873275757 }, { "auxiliary_loss_clip": 0.01146304, "auxiliary_loss_mlp": 0.01032728, "balance_loss_clip": 1.02044916, "balance_loss_mlp": 1.03463399, "epoch": 0.9864422065233729, "flos": 11025904671360.0, "grad_norm": 5.290523414208324, "language_loss": 0.74807739, "learning_rate": 1.806404042185461e-09, "loss": 0.76986778, "num_input_tokens_seen": 354015985, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6640625, "step": 16407, "time_per_iteration": 2.701260566711426 }, { "auxiliary_loss_clip": 0.01116581, "auxiliary_loss_mlp": 0.01031, "balance_loss_clip": 1.01905537, "balance_loss_mlp": 1.0321573, "epoch": 0.9865023297760409, "flos": 16545590807040.0, "grad_norm": 2.198641861303079, "language_loss": 0.77551365, "learning_rate": 1.7903851902008049e-09, "loss": 0.79698944, "num_input_tokens_seen": 354033260, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66796875, "step": 16408, "time_per_iteration": 2.700191020965576 }, { "auxiliary_loss_clip": 0.01113832, "auxiliary_loss_mlp": 0.01029247, "balance_loss_clip": 1.01543069, "balance_loss_mlp": 1.03435564, "epoch": 0.9865624530287088, "flos": 18880179269760.0, "grad_norm": 2.088754250217972, "language_loss": 0.68022859, "learning_rate": 1.774437649177285e-09, "loss": 0.70165938, "num_input_tokens_seen": 354052825, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.70703125, "step": 16409, "time_per_iteration": 2.6187491416931152 }, { "auxiliary_loss_clip": 0.0103981, "auxiliary_loss_mlp": 0.01000861, "balance_loss_clip": 0.99954969, "balance_loss_mlp": 1.00096846, "epoch": 0.9866225762813768, "flos": 68887798680960.0, "grad_norm": 0.8017150122487829, "language_loss": 0.61042011, "learning_rate": 1.7585614196840016e-09, "loss": 0.63082683, "num_input_tokens_seen": 354113920, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.2109375, "step": 16410, "time_per_iteration": 3.297924280166626 }, { "auxiliary_loss_clip": 0.01113454, "auxiliary_loss_mlp": 0.01030489, "balance_loss_clip": 1.01850891, "balance_loss_mlp": 1.03492618, "epoch": 0.9866826995340447, "flos": 29023111814400.0, "grad_norm": 2.116702911074417, "language_loss": 0.65871251, "learning_rate": 1.7427565022876122e-09, "loss": 0.68015194, "num_input_tokens_seen": 354134210, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 16411, "time_per_iteration": 2.807163953781128 }, { "auxiliary_loss_clip": 0.01022223, "auxiliary_loss_mlp": 0.01002076, "balance_loss_clip": 1.00087821, "balance_loss_mlp": 1.00105405, "epoch": 0.9867428227867128, "flos": 65376814867200.0, "grad_norm": 0.7215251667103534, "language_loss": 0.56279868, "learning_rate": 1.7270228975521105e-09, "loss": 0.58304161, "num_input_tokens_seen": 354198010, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.2109375, "step": 16412, "time_per_iteration": 3.351513624191284 }, { "auxiliary_loss_clip": 0.01131663, "auxiliary_loss_mlp": 0.0103265, "balance_loss_clip": 1.0196085, "balance_loss_mlp": 1.03451967, "epoch": 0.9868029460393807, "flos": 26506312634880.0, "grad_norm": 2.8926540006734003, "language_loss": 0.73001081, "learning_rate": 1.7113606060390473e-09, "loss": 0.75165391, "num_input_tokens_seen": 354220000, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 16413, "time_per_iteration": 2.7459728717803955 }, { "auxiliary_loss_clip": 0.01136949, "auxiliary_loss_mlp": 0.01027816, "balance_loss_clip": 1.01579952, "balance_loss_mlp": 1.03185415, "epoch": 0.9868630692920487, "flos": 22967280299520.0, "grad_norm": 1.5013812203045425, "language_loss": 0.71275759, "learning_rate": 1.6957696283073087e-09, "loss": 0.73440528, "num_input_tokens_seen": 354240910, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69921875, "step": 16414, "time_per_iteration": 3.088778495788574 }, { "auxiliary_loss_clip": 0.01110244, "auxiliary_loss_mlp": 0.01032311, "balance_loss_clip": 1.02052712, "balance_loss_mlp": 1.03602791, "epoch": 0.9869231925447167, "flos": 19828687760640.0, "grad_norm": 1.7237803104615712, "language_loss": 0.70333171, "learning_rate": 1.6802499649133384e-09, "loss": 0.72475719, "num_input_tokens_seen": 354259430, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6484375, "step": 16415, "time_per_iteration": 2.6565983295440674 }, { "auxiliary_loss_clip": 0.01122467, "auxiliary_loss_mlp": 0.01027009, "balance_loss_clip": 1.01598167, "balance_loss_mlp": 1.03186679, "epoch": 0.9869833157973846, "flos": 20195228096640.0, "grad_norm": 1.3841016156706853, "language_loss": 0.75559485, "learning_rate": 1.6648016164109157e-09, "loss": 0.77708966, "num_input_tokens_seen": 354279490, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.63671875, "step": 16416, "time_per_iteration": 2.7240452766418457 }, { "auxiliary_loss_clip": 0.01118817, "auxiliary_loss_mlp": 0.01029821, "balance_loss_clip": 1.01792395, "balance_loss_mlp": 1.03459609, "epoch": 0.9870434390500527, "flos": 16099507802880.0, "grad_norm": 1.8975861182716505, "language_loss": 0.70769519, "learning_rate": 1.6494245833513775e-09, "loss": 0.72918153, "num_input_tokens_seen": 354295080, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6640625, "step": 16417, "time_per_iteration": 2.7069056034088135 }, { "auxiliary_loss_clip": 0.01125936, "auxiliary_loss_mlp": 0.01035549, "balance_loss_clip": 1.02296042, "balance_loss_mlp": 1.03628445, "epoch": 0.9871035623027206, "flos": 21760753438080.0, "grad_norm": 1.4131557385782814, "language_loss": 0.70770609, "learning_rate": 1.634118866283396e-09, "loss": 0.72932094, "num_input_tokens_seen": 354314610, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 16418, "time_per_iteration": 2.7366108894348145 }, { "auxiliary_loss_clip": 0.01022306, "auxiliary_loss_mlp": 0.01000309, "balance_loss_clip": 0.99907476, "balance_loss_mlp": 1.00122583, "epoch": 0.9871636855553886, "flos": 70219583245440.0, "grad_norm": 0.6632594096723128, "language_loss": 0.53647542, "learning_rate": 1.6188844657534228e-09, "loss": 0.55670154, "num_input_tokens_seen": 354383115, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.2109375, "step": 16419, "time_per_iteration": 3.2690224647521973 }, { "auxiliary_loss_clip": 0.01122312, "auxiliary_loss_mlp": 0.01036807, "balance_loss_clip": 1.02341378, "balance_loss_mlp": 1.03310156, "epoch": 0.9872238088080565, "flos": 25045825639680.0, "grad_norm": 2.838309496499425, "language_loss": 0.78403419, "learning_rate": 1.6037213823050232e-09, "loss": 0.80562538, "num_input_tokens_seen": 354403115, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 16420, "time_per_iteration": 2.760593891143799 }, { "auxiliary_loss_clip": 0.01129756, "auxiliary_loss_mlp": 0.01028936, "balance_loss_clip": 1.01643693, "balance_loss_mlp": 1.0363059, "epoch": 0.9872839320607245, "flos": 19465846525440.0, "grad_norm": 1.5465460407612983, "language_loss": 0.70875567, "learning_rate": 1.588629616479098e-09, "loss": 0.73034263, "num_input_tokens_seen": 354424520, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6640625, "step": 16421, "time_per_iteration": 2.77721905708313 }, { "auxiliary_loss_clip": 0.01112867, "auxiliary_loss_mlp": 0.01034111, "balance_loss_clip": 1.02078938, "balance_loss_mlp": 1.03332186, "epoch": 0.9873440553133924, "flos": 26942914448640.0, "grad_norm": 1.9137463875604734, "language_loss": 0.81958044, "learning_rate": 1.5736091688147713e-09, "loss": 0.84105027, "num_input_tokens_seen": 354444800, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 16422, "time_per_iteration": 2.735544443130493 }, { "auxiliary_loss_clip": 0.01128076, "auxiliary_loss_mlp": 0.01028994, "balance_loss_clip": 1.01745498, "balance_loss_mlp": 1.03448892, "epoch": 0.9874041785660604, "flos": 19062210418560.0, "grad_norm": 1.7771243306316304, "language_loss": 0.86064631, "learning_rate": 1.5586600398476146e-09, "loss": 0.88221705, "num_input_tokens_seen": 354464590, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 16423, "time_per_iteration": 4.40385627746582 }, { "auxiliary_loss_clip": 0.01112234, "auxiliary_loss_mlp": 0.01028727, "balance_loss_clip": 1.01657999, "balance_loss_mlp": 1.03396392, "epoch": 0.9874643018187284, "flos": 21105814803840.0, "grad_norm": 1.7389933483573463, "language_loss": 0.70252436, "learning_rate": 1.5437822301112014e-09, "loss": 0.72393405, "num_input_tokens_seen": 354484145, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 16424, "time_per_iteration": 2.643998861312866 }, { "auxiliary_loss_clip": 0.01110769, "auxiliary_loss_mlp": 0.0127738, "balance_loss_clip": 1.01826274, "balance_loss_mlp": 1.03290129, "epoch": 0.9875244250713964, "flos": 24426043441920.0, "grad_norm": 2.0204692443288375, "language_loss": 0.80907166, "learning_rate": 1.528975740136662e-09, "loss": 0.8329531, "num_input_tokens_seen": 354502475, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 16425, "time_per_iteration": 2.8325018882751465 }, { "auxiliary_loss_clip": 0.01139153, "auxiliary_loss_mlp": 0.01033093, "balance_loss_clip": 1.02023005, "balance_loss_mlp": 1.03417516, "epoch": 0.9875845483240643, "flos": 25117610970240.0, "grad_norm": 1.969672881207626, "language_loss": 0.80021298, "learning_rate": 1.514240570452463e-09, "loss": 0.82193542, "num_input_tokens_seen": 354521855, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 16426, "time_per_iteration": 2.7454335689544678 }, { "auxiliary_loss_clip": 0.01112859, "auxiliary_loss_mlp": 0.01033062, "balance_loss_clip": 1.02100396, "balance_loss_mlp": 1.03592741, "epoch": 0.9876446715767323, "flos": 16581788737920.0, "grad_norm": 2.3398777183373465, "language_loss": 0.84907895, "learning_rate": 1.4995767215839617e-09, "loss": 0.87053812, "num_input_tokens_seen": 354539535, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 16427, "time_per_iteration": 2.706364154815674 }, { "auxiliary_loss_clip": 0.01100529, "auxiliary_loss_mlp": 0.01030645, "balance_loss_clip": 1.01852107, "balance_loss_mlp": 1.0343554, "epoch": 0.9877047948294003, "flos": 21616141282560.0, "grad_norm": 1.9864245102102243, "language_loss": 0.70726746, "learning_rate": 1.4849841940549612e-09, "loss": 0.72857922, "num_input_tokens_seen": 354557430, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6640625, "step": 16428, "time_per_iteration": 2.8247532844543457 }, { "auxiliary_loss_clip": 0.01126748, "auxiliary_loss_mlp": 0.01033069, "balance_loss_clip": 1.02116632, "balance_loss_mlp": 1.03394437, "epoch": 0.9877649180820682, "flos": 21178497974400.0, "grad_norm": 2.0514333152109008, "language_loss": 0.79893899, "learning_rate": 1.4704629883861563e-09, "loss": 0.82053721, "num_input_tokens_seen": 354574735, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66015625, "step": 16429, "time_per_iteration": 2.736708402633667 }, { "auxiliary_loss_clip": 0.0110047, "auxiliary_loss_mlp": 0.01028896, "balance_loss_clip": 1.01702237, "balance_loss_mlp": 1.03290117, "epoch": 0.9878250413347363, "flos": 39749233576320.0, "grad_norm": 1.5390419731065739, "language_loss": 0.61600506, "learning_rate": 1.456013105095577e-09, "loss": 0.63729876, "num_input_tokens_seen": 354597050, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 16430, "time_per_iteration": 2.839909315109253 }, { "auxiliary_loss_clip": 0.01103492, "auxiliary_loss_mlp": 0.01033484, "balance_loss_clip": 1.02149129, "balance_loss_mlp": 1.03457236, "epoch": 0.9878851645874042, "flos": 29425634599680.0, "grad_norm": 1.3449330584867667, "language_loss": 0.73129225, "learning_rate": 1.441634544699033e-09, "loss": 0.752662, "num_input_tokens_seen": 354619095, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 16431, "time_per_iteration": 4.242112398147583 }, { "auxiliary_loss_clip": 0.01103007, "auxiliary_loss_mlp": 0.01033855, "balance_loss_clip": 1.02149868, "balance_loss_mlp": 1.034958, "epoch": 0.9879452878400722, "flos": 15806261168640.0, "grad_norm": 2.4171308703685948, "language_loss": 0.80547464, "learning_rate": 1.4273273077098912e-09, "loss": 0.8268432, "num_input_tokens_seen": 354633790, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 16432, "time_per_iteration": 2.845752477645874 }, { "auxiliary_loss_clip": 0.01107717, "auxiliary_loss_mlp": 0.01029507, "balance_loss_clip": 1.01866484, "balance_loss_mlp": 1.0323503, "epoch": 0.9880054110927401, "flos": 22233912318720.0, "grad_norm": 1.4839886832128684, "language_loss": 0.80220056, "learning_rate": 1.4130913946381883e-09, "loss": 0.82357281, "num_input_tokens_seen": 354653180, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.671875, "step": 16433, "time_per_iteration": 2.6578619480133057 }, { "auxiliary_loss_clip": 0.01099441, "auxiliary_loss_mlp": 0.01029494, "balance_loss_clip": 1.0173285, "balance_loss_mlp": 1.0338124, "epoch": 0.9880655343454081, "flos": 28763836467840.0, "grad_norm": 2.607029630081806, "language_loss": 0.65153939, "learning_rate": 1.3989268059924064e-09, "loss": 0.67282873, "num_input_tokens_seen": 354669900, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.65625, "step": 16434, "time_per_iteration": 2.726179361343384 }, { "auxiliary_loss_clip": 0.01110848, "auxiliary_loss_mlp": 0.01032601, "balance_loss_clip": 1.02007198, "balance_loss_mlp": 1.03204262, "epoch": 0.988125657598076, "flos": 32853379622400.0, "grad_norm": 1.8029411015074646, "language_loss": 0.69272548, "learning_rate": 1.3848335422779188e-09, "loss": 0.71415997, "num_input_tokens_seen": 354693165, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 16435, "time_per_iteration": 2.8163814544677734 }, { "auxiliary_loss_clip": 0.0113864, "auxiliary_loss_mlp": 0.01033747, "balance_loss_clip": 1.02141452, "balance_loss_mlp": 1.03559077, "epoch": 0.988185780850744, "flos": 19390685316480.0, "grad_norm": 1.7106126653415359, "language_loss": 0.7516793, "learning_rate": 1.370811603997657e-09, "loss": 0.77340317, "num_input_tokens_seen": 354711915, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.671875, "step": 16436, "time_per_iteration": 2.6770853996276855 }, { "auxiliary_loss_clip": 0.01119752, "auxiliary_loss_mlp": 0.01028832, "balance_loss_clip": 1.01614833, "balance_loss_mlp": 1.03398836, "epoch": 0.988245904103412, "flos": 22528415928960.0, "grad_norm": 2.4619339090935326, "language_loss": 0.74145681, "learning_rate": 1.356860991651887e-09, "loss": 0.76294267, "num_input_tokens_seen": 354729135, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.67578125, "step": 16437, "time_per_iteration": 2.6509721279144287 }, { "auxiliary_loss_clip": 0.01124294, "auxiliary_loss_mlp": 0.01031823, "balance_loss_clip": 1.01989603, "balance_loss_mlp": 1.03228188, "epoch": 0.98830602735608, "flos": 28659193171200.0, "grad_norm": 1.858610045164547, "language_loss": 0.60188317, "learning_rate": 1.3429817057388771e-09, "loss": 0.62344432, "num_input_tokens_seen": 354752530, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.65625, "step": 16438, "time_per_iteration": 4.313032627105713 }, { "auxiliary_loss_clip": 0.01119103, "auxiliary_loss_mlp": 0.01029625, "balance_loss_clip": 1.01687598, "balance_loss_mlp": 1.03394568, "epoch": 0.9883661506087479, "flos": 20996035862400.0, "grad_norm": 1.9026006373840743, "language_loss": 0.72385252, "learning_rate": 1.3291737467535647e-09, "loss": 0.74533981, "num_input_tokens_seen": 354771135, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.671875, "step": 16439, "time_per_iteration": 4.425504207611084 }, { "auxiliary_loss_clip": 0.01109674, "auxiliary_loss_mlp": 0.01029831, "balance_loss_clip": 1.01766562, "balance_loss_mlp": 1.03386641, "epoch": 0.9884262738614159, "flos": 32706109860480.0, "grad_norm": 1.6875580375078567, "language_loss": 0.59912407, "learning_rate": 1.3154371151886667e-09, "loss": 0.62051916, "num_input_tokens_seen": 354791800, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 16440, "time_per_iteration": 2.840402126312256 }, { "auxiliary_loss_clip": 0.01102487, "auxiliary_loss_mlp": 0.01030625, "balance_loss_clip": 1.01854324, "balance_loss_mlp": 1.03249788, "epoch": 0.9884863971140839, "flos": 17564699479680.0, "grad_norm": 4.348428555063609, "language_loss": 0.76593584, "learning_rate": 1.3017718115349018e-09, "loss": 0.78726697, "num_input_tokens_seen": 354809200, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 16441, "time_per_iteration": 2.695985794067383 }, { "auxiliary_loss_clip": 0.01120612, "auxiliary_loss_mlp": 0.01026644, "balance_loss_clip": 1.01493728, "balance_loss_mlp": 1.03411889, "epoch": 0.9885465203667518, "flos": 40552519380480.0, "grad_norm": 1.84811138892191, "language_loss": 0.67608905, "learning_rate": 1.288177836279436e-09, "loss": 0.69756156, "num_input_tokens_seen": 354829945, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 16442, "time_per_iteration": 2.9366064071655273 }, { "auxiliary_loss_clip": 0.01108429, "auxiliary_loss_mlp": 0.01027044, "balance_loss_clip": 1.01482534, "balance_loss_mlp": 1.03274798, "epoch": 0.9886066436194199, "flos": 13807976768640.0, "grad_norm": 2.101035894587599, "language_loss": 0.74077755, "learning_rate": 1.2746551899076586e-09, "loss": 0.76213229, "num_input_tokens_seen": 354845055, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 16443, "time_per_iteration": 2.720616102218628 }, { "auxiliary_loss_clip": 0.01120646, "auxiliary_loss_mlp": 0.01029608, "balance_loss_clip": 1.01647758, "balance_loss_mlp": 1.03371131, "epoch": 0.9886667668720878, "flos": 23325129544320.0, "grad_norm": 1.6478763286622216, "language_loss": 0.73602951, "learning_rate": 1.2612038729020724e-09, "loss": 0.75753206, "num_input_tokens_seen": 354864680, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 16444, "time_per_iteration": 2.6784536838531494 }, { "auxiliary_loss_clip": 0.01137009, "auxiliary_loss_mlp": 0.01029598, "balance_loss_clip": 1.01789188, "balance_loss_mlp": 1.03567851, "epoch": 0.9887268901247558, "flos": 22706029704960.0, "grad_norm": 1.921497495473376, "language_loss": 0.69361281, "learning_rate": 1.2478238857429602e-09, "loss": 0.71527886, "num_input_tokens_seen": 354885685, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 16445, "time_per_iteration": 2.7251358032226562 }, { "auxiliary_loss_clip": 0.01112394, "auxiliary_loss_mlp": 0.01027302, "balance_loss_clip": 1.01496375, "balance_loss_mlp": 1.03452682, "epoch": 0.9887870133774237, "flos": 13041283944960.0, "grad_norm": 2.742029178370426, "language_loss": 0.61023831, "learning_rate": 1.2345152289072736e-09, "loss": 0.63163531, "num_input_tokens_seen": 354901505, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 16446, "time_per_iteration": 2.5362820625305176 }, { "auxiliary_loss_clip": 0.01118672, "auxiliary_loss_mlp": 0.01030467, "balance_loss_clip": 1.01902306, "balance_loss_mlp": 1.03256679, "epoch": 0.9888471366300917, "flos": 15158864390400.0, "grad_norm": 1.8976286827731046, "language_loss": 0.70613003, "learning_rate": 1.2212779028706322e-09, "loss": 0.72762144, "num_input_tokens_seen": 354920060, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 16447, "time_per_iteration": 2.7402939796447754 }, { "auxiliary_loss_clip": 0.01101294, "auxiliary_loss_mlp": 0.01273079, "balance_loss_clip": 1.01444387, "balance_loss_mlp": 1.03350973, "epoch": 0.9889072598827596, "flos": 25118796119040.0, "grad_norm": 1.7520162639195462, "language_loss": 0.83813339, "learning_rate": 1.2081119081048807e-09, "loss": 0.86187708, "num_input_tokens_seen": 354938690, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 16448, "time_per_iteration": 2.595822811126709 }, { "auxiliary_loss_clip": 0.01107033, "auxiliary_loss_mlp": 0.01026077, "balance_loss_clip": 1.01520538, "balance_loss_mlp": 1.03263986, "epoch": 0.9889673831354276, "flos": 16728663450240.0, "grad_norm": 1.6791367577323861, "language_loss": 0.70147657, "learning_rate": 1.1950172450800877e-09, "loss": 0.72280765, "num_input_tokens_seen": 354956955, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.65625, "step": 16449, "time_per_iteration": 2.8647620677948 }, { "auxiliary_loss_clip": 0.01111437, "auxiliary_loss_mlp": 0.01029039, "balance_loss_clip": 1.01682019, "balance_loss_mlp": 1.03285265, "epoch": 0.9890275063880956, "flos": 35585175657600.0, "grad_norm": 1.992648698422126, "language_loss": 0.73777121, "learning_rate": 1.181993914263657e-09, "loss": 0.75917602, "num_input_tokens_seen": 354976800, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 16450, "time_per_iteration": 2.7352192401885986 }, { "auxiliary_loss_clip": 0.01103804, "auxiliary_loss_mlp": 0.01032451, "balance_loss_clip": 1.01991606, "balance_loss_mlp": 1.03234684, "epoch": 0.9890876296407636, "flos": 18952359649920.0, "grad_norm": 1.6886599085386564, "language_loss": 0.79137444, "learning_rate": 1.169041916120328e-09, "loss": 0.81273693, "num_input_tokens_seen": 354996625, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 16451, "time_per_iteration": 2.63865327835083 }, { "auxiliary_loss_clip": 0.01129335, "auxiliary_loss_mlp": 0.01033285, "balance_loss_clip": 1.02060688, "balance_loss_mlp": 1.03247476, "epoch": 0.9891477528934315, "flos": 23769309127680.0, "grad_norm": 2.7507689033083396, "language_loss": 0.69960439, "learning_rate": 1.1561612511123975e-09, "loss": 0.72123063, "num_input_tokens_seen": 355014535, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 16452, "time_per_iteration": 2.7089314460754395 }, { "auxiliary_loss_clip": 0.01099488, "auxiliary_loss_mlp": 0.01026466, "balance_loss_clip": 1.01536751, "balance_loss_mlp": 1.03433776, "epoch": 0.9892078761460995, "flos": 20772922533120.0, "grad_norm": 1.6044045539920324, "language_loss": 0.7404266, "learning_rate": 1.1433519196992759e-09, "loss": 0.76168615, "num_input_tokens_seen": 355033280, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6484375, "step": 16453, "time_per_iteration": 2.5830190181732178 }, { "auxiliary_loss_clip": 0.01137927, "auxiliary_loss_mlp": 0.01034356, "balance_loss_clip": 1.02269149, "balance_loss_mlp": 1.03304291, "epoch": 0.9892679993987675, "flos": 23367827836800.0, "grad_norm": 2.2841103927598088, "language_loss": 0.69519579, "learning_rate": 1.1306139223381527e-09, "loss": 0.71691865, "num_input_tokens_seen": 355053320, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.69140625, "step": 16454, "time_per_iteration": 2.758946180343628 }, { "auxiliary_loss_clip": 0.01116335, "auxiliary_loss_mlp": 0.01031576, "balance_loss_clip": 1.02009046, "balance_loss_mlp": 1.03372121, "epoch": 0.9893281226514354, "flos": 22705419173760.0, "grad_norm": 1.9489395007410415, "language_loss": 0.75812745, "learning_rate": 1.1179472594839978e-09, "loss": 0.77960658, "num_input_tokens_seen": 355070230, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6484375, "step": 16455, "time_per_iteration": 2.7631571292877197 }, { "auxiliary_loss_clip": 0.01118507, "auxiliary_loss_mlp": 0.01026081, "balance_loss_clip": 1.01403546, "balance_loss_mlp": 1.03415799, "epoch": 0.9893882459041035, "flos": 21796664060160.0, "grad_norm": 1.6329691883844202, "language_loss": 0.65353668, "learning_rate": 1.1053519315882275e-09, "loss": 0.67498255, "num_input_tokens_seen": 355090125, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6640625, "step": 16456, "time_per_iteration": 2.6633830070495605 }, { "auxiliary_loss_clip": 0.01113514, "auxiliary_loss_mlp": 0.010289, "balance_loss_clip": 1.01687813, "balance_loss_mlp": 1.03556347, "epoch": 0.9894483691567714, "flos": 18113773754880.0, "grad_norm": 2.1209116313245673, "language_loss": 0.73834336, "learning_rate": 1.0928279391009266e-09, "loss": 0.75976753, "num_input_tokens_seen": 355107890, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 16457, "time_per_iteration": 2.5173721313476562 }, { "auxiliary_loss_clip": 0.01103266, "auxiliary_loss_mlp": 0.01027851, "balance_loss_clip": 1.01588213, "balance_loss_mlp": 1.03508449, "epoch": 0.9895084924094394, "flos": 31211615664000.0, "grad_norm": 2.212041616914288, "language_loss": 0.68478096, "learning_rate": 1.0803752824688483e-09, "loss": 0.70609212, "num_input_tokens_seen": 355126340, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 16458, "time_per_iteration": 2.820234775543213 }, { "auxiliary_loss_clip": 0.0110833, "auxiliary_loss_mlp": 0.0103098, "balance_loss_clip": 1.01879048, "balance_loss_mlp": 1.03318048, "epoch": 0.9895686156621073, "flos": 19678042120320.0, "grad_norm": 1.6852315842882875, "language_loss": 0.79075527, "learning_rate": 1.0679939621360823e-09, "loss": 0.81214833, "num_input_tokens_seen": 355144025, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 16459, "time_per_iteration": 2.5094287395477295 }, { "auxiliary_loss_clip": 0.0111925, "auxiliary_loss_mlp": 0.01032127, "balance_loss_clip": 1.02018857, "balance_loss_mlp": 1.03284931, "epoch": 0.9896287389147753, "flos": 23581675457280.0, "grad_norm": 1.9536486614307327, "language_loss": 0.70539087, "learning_rate": 1.055683978544941e-09, "loss": 0.72690469, "num_input_tokens_seen": 355163125, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 16460, "time_per_iteration": 2.6022095680236816 }, { "auxiliary_loss_clip": 0.01121111, "auxiliary_loss_mlp": 0.01026947, "balance_loss_clip": 1.0141325, "balance_loss_mlp": 1.03326797, "epoch": 0.9896888621674432, "flos": 29605331364480.0, "grad_norm": 1.6775216004357993, "language_loss": 0.8723402, "learning_rate": 1.0434453321344073e-09, "loss": 0.89382076, "num_input_tokens_seen": 355184060, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 16461, "time_per_iteration": 2.708244562149048 }, { "auxiliary_loss_clip": 0.01110072, "auxiliary_loss_mlp": 0.01031972, "balance_loss_clip": 1.01885295, "balance_loss_mlp": 1.03363323, "epoch": 0.9897489854201112, "flos": 23695045758720.0, "grad_norm": 1.8796292860470825, "language_loss": 0.63006043, "learning_rate": 1.0312780233414642e-09, "loss": 0.65148091, "num_input_tokens_seen": 355204505, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6796875, "step": 16462, "time_per_iteration": 2.6607980728149414 }, { "auxiliary_loss_clip": 0.01113471, "auxiliary_loss_mlp": 0.01029818, "balance_loss_clip": 1.01821911, "balance_loss_mlp": 1.03654957, "epoch": 0.9898091086727792, "flos": 13225146687360.0, "grad_norm": 1.7906711028999711, "language_loss": 0.72745025, "learning_rate": 1.0191820526002092e-09, "loss": 0.74888313, "num_input_tokens_seen": 355223055, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 16463, "time_per_iteration": 2.544640302658081 }, { "auxiliary_loss_clip": 0.01109268, "auxiliary_loss_mlp": 0.01028861, "balance_loss_clip": 1.0157721, "balance_loss_mlp": 1.03280592, "epoch": 0.9898692319254472, "flos": 22930400010240.0, "grad_norm": 1.712876069951914, "language_loss": 0.70036864, "learning_rate": 1.0071574203425193e-09, "loss": 0.7217499, "num_input_tokens_seen": 355242000, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6796875, "step": 16464, "time_per_iteration": 2.549816846847534 }, { "auxiliary_loss_clip": 0.01128937, "auxiliary_loss_mlp": 0.0102861, "balance_loss_clip": 1.01617098, "balance_loss_mlp": 1.03277874, "epoch": 0.9899293551781151, "flos": 12458346122880.0, "grad_norm": 2.2276476921955077, "language_loss": 0.73931545, "learning_rate": 9.952041269971624e-10, "loss": 0.76089096, "num_input_tokens_seen": 355260175, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 16465, "time_per_iteration": 4.149223566055298 }, { "auxiliary_loss_clip": 0.0109857, "auxiliary_loss_mlp": 0.01033438, "balance_loss_clip": 1.02273941, "balance_loss_mlp": 1.03152502, "epoch": 0.9899894784307831, "flos": 26871129118080.0, "grad_norm": 1.8115330781107268, "language_loss": 0.86662745, "learning_rate": 9.83322172990908e-10, "loss": 0.88794756, "num_input_tokens_seen": 355281930, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.671875, "step": 16466, "time_per_iteration": 2.727264165878296 }, { "auxiliary_loss_clip": 0.01129712, "auxiliary_loss_mlp": 0.01023998, "balance_loss_clip": 1.01183832, "balance_loss_mlp": 1.03450501, "epoch": 0.990049601683451, "flos": 21542093395200.0, "grad_norm": 2.031192471038236, "language_loss": 0.71628356, "learning_rate": 9.715115587478618e-10, "loss": 0.73782063, "num_input_tokens_seen": 355301555, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 16467, "time_per_iteration": 2.8936712741851807 }, { "auxiliary_loss_clip": 0.0111895, "auxiliary_loss_mlp": 0.01029963, "balance_loss_clip": 1.01825666, "balance_loss_mlp": 1.03384805, "epoch": 0.990109724936119, "flos": 28771809287040.0, "grad_norm": 1.2906346090934342, "language_loss": 0.65068507, "learning_rate": 9.59772284689464e-10, "loss": 0.67217422, "num_input_tokens_seen": 355324925, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 16468, "time_per_iteration": 3.0555269718170166 }, { "auxiliary_loss_clip": 0.01132016, "auxiliary_loss_mlp": 0.01029153, "balance_loss_clip": 1.01698208, "balance_loss_mlp": 1.03400981, "epoch": 0.9901698481887871, "flos": 29274270687360.0, "grad_norm": 1.8775125362768899, "language_loss": 0.62170547, "learning_rate": 9.48104351234713e-10, "loss": 0.64331716, "num_input_tokens_seen": 355343875, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.71875, "step": 16469, "time_per_iteration": 2.8452067375183105 }, { "auxiliary_loss_clip": 0.01131431, "auxiliary_loss_mlp": 0.0102961, "balance_loss_clip": 1.01736116, "balance_loss_mlp": 1.0341692, "epoch": 0.990229971441455, "flos": 15959025711360.0, "grad_norm": 3.2809261291949423, "language_loss": 0.70243239, "learning_rate": 9.365077587997205e-10, "loss": 0.72404289, "num_input_tokens_seen": 355358835, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 16470, "time_per_iteration": 2.7502646446228027 }, { "auxiliary_loss_clip": 0.01030105, "auxiliary_loss_mlp": 0.0100139, "balance_loss_clip": 1.00005496, "balance_loss_mlp": 1.00058818, "epoch": 0.990290094694123, "flos": 69780287911680.0, "grad_norm": 0.6637516402394708, "language_loss": 0.55480564, "learning_rate": 9.249825077988216e-10, "loss": 0.57512057, "num_input_tokens_seen": 355431225, "router_z_loss_clip": 0.0133667, "router_z_loss_mlp": 0.2109375, "step": 16471, "time_per_iteration": 3.4444491863250732 }, { "auxiliary_loss_clip": 0.01120918, "auxiliary_loss_mlp": 0.01033762, "balance_loss_clip": 1.021191, "balance_loss_mlp": 1.03504646, "epoch": 0.9903502179467909, "flos": 16252451913600.0, "grad_norm": 3.1128313581486537, "language_loss": 0.84120119, "learning_rate": 9.135285986427988e-10, "loss": 0.86274797, "num_input_tokens_seen": 355448250, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 16472, "time_per_iteration": 2.686340093612671 }, { "auxiliary_loss_clip": 0.01110736, "auxiliary_loss_mlp": 0.01026658, "balance_loss_clip": 1.01470685, "balance_loss_mlp": 1.03424466, "epoch": 0.9904103411994589, "flos": 21688393489920.0, "grad_norm": 1.8022914399710894, "language_loss": 0.85681558, "learning_rate": 9.021460317408802e-10, "loss": 0.8781895, "num_input_tokens_seen": 355467040, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 16473, "time_per_iteration": 4.151953458786011 }, { "auxiliary_loss_clip": 0.01099397, "auxiliary_loss_mlp": 0.01029752, "balance_loss_clip": 1.01846933, "balance_loss_mlp": 1.03213418, "epoch": 0.9904704644521268, "flos": 25739440243200.0, "grad_norm": 1.6818833240953575, "language_loss": 0.8416537, "learning_rate": 8.908348074989635e-10, "loss": 0.8629452, "num_input_tokens_seen": 355487825, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.671875, "step": 16474, "time_per_iteration": 2.7079198360443115 }, { "auxiliary_loss_clip": 0.01115595, "auxiliary_loss_mlp": 0.0103556, "balance_loss_clip": 1.02371705, "balance_loss_mlp": 1.03034127, "epoch": 0.9905305877047949, "flos": 21908346422400.0, "grad_norm": 1.6756295165594985, "language_loss": 0.764467, "learning_rate": 8.795949263209479e-10, "loss": 0.78597856, "num_input_tokens_seen": 355507445, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 16475, "time_per_iteration": 2.7821054458618164 }, { "auxiliary_loss_clip": 0.01127411, "auxiliary_loss_mlp": 0.01028711, "balance_loss_clip": 1.01659369, "balance_loss_mlp": 1.03131747, "epoch": 0.9905907109574628, "flos": 21392417422080.0, "grad_norm": 1.5025082585859246, "language_loss": 0.75584304, "learning_rate": 8.684263886078457e-10, "loss": 0.77740425, "num_input_tokens_seen": 355527205, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 16476, "time_per_iteration": 3.0172278881073 }, { "auxiliary_loss_clip": 0.01114689, "auxiliary_loss_mlp": 0.01276607, "balance_loss_clip": 1.015872, "balance_loss_mlp": 1.03365004, "epoch": 0.9906508342101308, "flos": 20521620005760.0, "grad_norm": 2.383777607298443, "language_loss": 0.67410207, "learning_rate": 8.573291947582273e-10, "loss": 0.69801503, "num_input_tokens_seen": 355544740, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 16477, "time_per_iteration": 2.866055965423584 }, { "auxiliary_loss_clip": 0.01124473, "auxiliary_loss_mlp": 0.01032547, "balance_loss_clip": 1.01958263, "balance_loss_mlp": 1.03602135, "epoch": 0.9907109574627987, "flos": 21361211481600.0, "grad_norm": 1.9684964601167783, "language_loss": 0.71655309, "learning_rate": 8.46303345167998e-10, "loss": 0.7381233, "num_input_tokens_seen": 355564385, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 16478, "time_per_iteration": 2.7848355770111084 }, { "auxiliary_loss_clip": 0.0110326, "auxiliary_loss_mlp": 0.01036938, "balance_loss_clip": 1.02367592, "balance_loss_mlp": 1.03413415, "epoch": 0.9907710807154667, "flos": 17338605321600.0, "grad_norm": 3.770561744867791, "language_loss": 0.80799121, "learning_rate": 8.353488402308429e-10, "loss": 0.82939321, "num_input_tokens_seen": 355579260, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69140625, "step": 16479, "time_per_iteration": 2.7692666053771973 }, { "auxiliary_loss_clip": 0.01113497, "auxiliary_loss_mlp": 0.01033845, "balance_loss_clip": 1.02231181, "balance_loss_mlp": 1.03525794, "epoch": 0.9908312039681346, "flos": 28621881918720.0, "grad_norm": 1.9437604553799224, "language_loss": 0.66064858, "learning_rate": 8.244656803375605e-10, "loss": 0.68212199, "num_input_tokens_seen": 355599790, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.69140625, "step": 16480, "time_per_iteration": 7.2844648361206055 }, { "auxiliary_loss_clip": 0.01109723, "auxiliary_loss_mlp": 0.01027808, "balance_loss_clip": 1.01546335, "balance_loss_mlp": 1.03329623, "epoch": 0.9908913272208026, "flos": 35770654512000.0, "grad_norm": 1.6844739228770769, "language_loss": 0.7183637, "learning_rate": 8.136538658765069e-10, "loss": 0.739739, "num_input_tokens_seen": 355620925, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.67578125, "step": 16481, "time_per_iteration": 2.9309892654418945 }, { "auxiliary_loss_clip": 0.011098, "auxiliary_loss_mlp": 0.010245, "balance_loss_clip": 1.01293647, "balance_loss_mlp": 1.0340873, "epoch": 0.9909514504734707, "flos": 19902196944000.0, "grad_norm": 2.7843814171263337, "language_loss": 0.77442884, "learning_rate": 8.029133972338176e-10, "loss": 0.79577184, "num_input_tokens_seen": 355639165, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66796875, "step": 16482, "time_per_iteration": 2.917923927307129 }, { "auxiliary_loss_clip": 0.01113414, "auxiliary_loss_mlp": 0.01031879, "balance_loss_clip": 1.0201906, "balance_loss_mlp": 1.03456819, "epoch": 0.9910115737261386, "flos": 20004793165440.0, "grad_norm": 2.088284410493829, "language_loss": 0.75325906, "learning_rate": 7.922442747925195e-10, "loss": 0.77471203, "num_input_tokens_seen": 355657320, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.703125, "step": 16483, "time_per_iteration": 2.762382984161377 }, { "auxiliary_loss_clip": 0.0103141, "auxiliary_loss_mlp": 0.01000623, "balance_loss_clip": 0.99935895, "balance_loss_mlp": 1.00120187, "epoch": 0.9910716969788066, "flos": 70688432494080.0, "grad_norm": 0.6896657527770718, "language_loss": 0.53660178, "learning_rate": 7.816464989334193e-10, "loss": 0.5569222, "num_input_tokens_seen": 355726370, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.2109375, "step": 16484, "time_per_iteration": 3.5563652515411377 }, { "auxiliary_loss_clip": 0.01098065, "auxiliary_loss_mlp": 0.01029497, "balance_loss_clip": 1.01785588, "balance_loss_mlp": 1.0327822, "epoch": 0.9911318202314745, "flos": 21434038306560.0, "grad_norm": 1.9602245741662285, "language_loss": 0.82103986, "learning_rate": 7.711200700348808e-10, "loss": 0.8423155, "num_input_tokens_seen": 355745840, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.65234375, "step": 16485, "time_per_iteration": 2.783531427383423 }, { "auxiliary_loss_clip": 0.0112799, "auxiliary_loss_mlp": 0.01035141, "balance_loss_clip": 1.02256405, "balance_loss_mlp": 1.03491664, "epoch": 0.9911919434841425, "flos": 19826820253440.0, "grad_norm": 1.7262248022970226, "language_loss": 0.81370717, "learning_rate": 7.606649884723815e-10, "loss": 0.83533847, "num_input_tokens_seen": 355763385, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.66015625, "step": 16486, "time_per_iteration": 2.693006992340088 }, { "auxiliary_loss_clip": 0.01111639, "auxiliary_loss_mlp": 0.010276, "balance_loss_clip": 1.01586354, "balance_loss_mlp": 1.03381777, "epoch": 0.9912520667368104, "flos": 41719364691840.0, "grad_norm": 1.4108908215013112, "language_loss": 0.6603843, "learning_rate": 7.502812546189563e-10, "loss": 0.68177664, "num_input_tokens_seen": 355786075, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 16487, "time_per_iteration": 2.9476869106292725 }, { "auxiliary_loss_clip": 0.01108428, "auxiliary_loss_mlp": 0.01275908, "balance_loss_clip": 1.01761484, "balance_loss_mlp": 1.03295302, "epoch": 0.9913121899894785, "flos": 23769668263680.0, "grad_norm": 1.9251792960638452, "language_loss": 0.76607966, "learning_rate": 7.399688688454198e-10, "loss": 0.78992307, "num_input_tokens_seen": 355806295, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66796875, "step": 16488, "time_per_iteration": 2.8899922370910645 }, { "auxiliary_loss_clip": 0.01132327, "auxiliary_loss_mlp": 0.01029121, "balance_loss_clip": 1.01627052, "balance_loss_mlp": 1.03418255, "epoch": 0.9913723132421464, "flos": 23769668263680.0, "grad_norm": 2.2430626598356853, "language_loss": 0.68715191, "learning_rate": 7.297278315196997e-10, "loss": 0.7087664, "num_input_tokens_seen": 355825730, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 16489, "time_per_iteration": 2.9538049697875977 }, { "auxiliary_loss_clip": 0.01049781, "auxiliary_loss_mlp": 0.01002979, "balance_loss_clip": 1.00172746, "balance_loss_mlp": 1.00137782, "epoch": 0.9914324364948144, "flos": 71267419820160.0, "grad_norm": 0.6084691522746102, "language_loss": 0.5259403, "learning_rate": 7.195581430072816e-10, "loss": 0.5464679, "num_input_tokens_seen": 355891545, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.2109375, "step": 16490, "time_per_iteration": 3.4086239337921143 }, { "auxiliary_loss_clip": 0.01133543, "auxiliary_loss_mlp": 0.01037714, "balance_loss_clip": 1.02423191, "balance_loss_mlp": 1.03479242, "epoch": 0.9914925597474823, "flos": 23695440808320.0, "grad_norm": 1.5180183997552266, "language_loss": 0.75516343, "learning_rate": 7.094598036709865e-10, "loss": 0.77687597, "num_input_tokens_seen": 355909920, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 16491, "time_per_iteration": 2.8452889919281006 }, { "auxiliary_loss_clip": 0.01143719, "auxiliary_loss_mlp": 0.01033426, "balance_loss_clip": 1.02030158, "balance_loss_mlp": 1.03710341, "epoch": 0.9915526830001503, "flos": 13433822749440.0, "grad_norm": 2.4307648386068843, "language_loss": 0.70351797, "learning_rate": 6.994328138714145e-10, "loss": 0.7252894, "num_input_tokens_seen": 355923130, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 16492, "time_per_iteration": 2.7730095386505127 }, { "auxiliary_loss_clip": 0.0112771, "auxiliary_loss_mlp": 0.01029991, "balance_loss_clip": 1.01679492, "balance_loss_mlp": 1.03279924, "epoch": 0.9916128062528182, "flos": 20740962407040.0, "grad_norm": 5.569669549044397, "language_loss": 0.68143368, "learning_rate": 6.894771739662797e-10, "loss": 0.70301068, "num_input_tokens_seen": 355941960, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.68359375, "step": 16493, "time_per_iteration": 2.858405351638794 }, { "auxiliary_loss_clip": 0.01123251, "auxiliary_loss_mlp": 0.01035119, "balance_loss_clip": 1.02301371, "balance_loss_mlp": 1.03420556, "epoch": 0.9916729295054862, "flos": 22487082353280.0, "grad_norm": 1.9231415071446123, "language_loss": 0.71409297, "learning_rate": 6.795928843106314e-10, "loss": 0.73567671, "num_input_tokens_seen": 355961640, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7109375, "step": 16494, "time_per_iteration": 2.9974706172943115 }, { "auxiliary_loss_clip": 0.01112338, "auxiliary_loss_mlp": 0.01034128, "balance_loss_clip": 1.02083015, "balance_loss_mlp": 1.03382373, "epoch": 0.9917330527581543, "flos": 14792467708800.0, "grad_norm": 3.10625089973945, "language_loss": 0.68058342, "learning_rate": 6.697799452575203e-10, "loss": 0.70204806, "num_input_tokens_seen": 355977980, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 16495, "time_per_iteration": 2.8455638885498047 }, { "auxiliary_loss_clip": 0.01040138, "auxiliary_loss_mlp": 0.01001959, "balance_loss_clip": 1.00073135, "balance_loss_mlp": 1.00115705, "epoch": 0.9917931760108222, "flos": 59191595585280.0, "grad_norm": 0.7247491921338304, "language_loss": 0.53545922, "learning_rate": 6.600383571571111e-10, "loss": 0.55588019, "num_input_tokens_seen": 356042900, "router_z_loss_clip": 0.01226807, "router_z_loss_mlp": 0.2109375, "step": 16496, "time_per_iteration": 3.4586400985717773 }, { "auxiliary_loss_clip": 0.01122283, "auxiliary_loss_mlp": 0.01277011, "balance_loss_clip": 1.01730156, "balance_loss_mlp": 1.03454363, "epoch": 0.9918532992634902, "flos": 26761637485440.0, "grad_norm": 1.7974961459299734, "language_loss": 0.71428442, "learning_rate": 6.503681203571254e-10, "loss": 0.73827738, "num_input_tokens_seen": 356063000, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 16497, "time_per_iteration": 2.739778757095337 }, { "auxiliary_loss_clip": 0.01132448, "auxiliary_loss_mlp": 0.01035723, "balance_loss_clip": 1.02200222, "balance_loss_mlp": 1.03450096, "epoch": 0.9919134225161581, "flos": 14975719920000.0, "grad_norm": 2.9047281811552788, "language_loss": 0.82084417, "learning_rate": 6.407692352023985e-10, "loss": 0.8425259, "num_input_tokens_seen": 356078130, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71484375, "step": 16498, "time_per_iteration": 2.837700605392456 }, { "auxiliary_loss_clip": 0.011338, "auxiliary_loss_mlp": 0.01034287, "balance_loss_clip": 1.02173412, "balance_loss_mlp": 1.03657317, "epoch": 0.9919735457688261, "flos": 27818201064960.0, "grad_norm": 1.7519989170272032, "language_loss": 0.68311405, "learning_rate": 6.312417020357674e-10, "loss": 0.70479488, "num_input_tokens_seen": 356101655, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 16499, "time_per_iteration": 3.0580320358276367 }, { "auxiliary_loss_clip": 0.0110533, "auxiliary_loss_mlp": 0.01028299, "balance_loss_clip": 1.01657474, "balance_loss_mlp": 1.0355494, "epoch": 0.992033669021494, "flos": 22562782266240.0, "grad_norm": 1.8430335120556371, "language_loss": 0.81675291, "learning_rate": 6.217855211971823e-10, "loss": 0.83808923, "num_input_tokens_seen": 356121425, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 16500, "time_per_iteration": 2.658388376235962 }, { "auxiliary_loss_clip": 0.01123447, "auxiliary_loss_mlp": 0.01025414, "balance_loss_clip": 1.01274204, "balance_loss_mlp": 1.03465223, "epoch": 0.9920937922741621, "flos": 25374587846400.0, "grad_norm": 1.7758018099315542, "language_loss": 0.8178556, "learning_rate": 6.12400693023929e-10, "loss": 0.83934414, "num_input_tokens_seen": 356140710, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 16501, "time_per_iteration": 2.9872703552246094 }, { "auxiliary_loss_clip": 0.01111565, "auxiliary_loss_mlp": 0.01028463, "balance_loss_clip": 1.01622033, "balance_loss_mlp": 1.0335052, "epoch": 0.99215391552683, "flos": 22054466949120.0, "grad_norm": 1.8343731476288052, "language_loss": 0.77191007, "learning_rate": 6.030872178510726e-10, "loss": 0.79331034, "num_input_tokens_seen": 356159835, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 16502, "time_per_iteration": 2.8098788261413574 }, { "auxiliary_loss_clip": 0.01123065, "auxiliary_loss_mlp": 0.01031713, "balance_loss_clip": 1.01830208, "balance_loss_mlp": 1.03418589, "epoch": 0.992214038779498, "flos": 15413937845760.0, "grad_norm": 2.1983747183741316, "language_loss": 0.71227014, "learning_rate": 5.938450960110142e-10, "loss": 0.73381793, "num_input_tokens_seen": 356177555, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 16503, "time_per_iteration": 2.6683831214904785 }, { "auxiliary_loss_clip": 0.0103159, "auxiliary_loss_mlp": 0.00999352, "balance_loss_clip": 0.99807662, "balance_loss_mlp": 1.00130415, "epoch": 0.9922741620321659, "flos": 62014498467840.0, "grad_norm": 0.6500449144693634, "language_loss": 0.55090183, "learning_rate": 5.846743278334898e-10, "loss": 0.57121122, "num_input_tokens_seen": 356244975, "router_z_loss_clip": 0.01275635, "router_z_loss_mlp": 0.21191406, "step": 16504, "time_per_iteration": 3.6097021102905273 }, { "auxiliary_loss_clip": 0.01125589, "auxiliary_loss_mlp": 0.0102396, "balance_loss_clip": 1.0127182, "balance_loss_mlp": 1.03199387, "epoch": 0.9923342852848339, "flos": 17165480745600.0, "grad_norm": 1.6682208880120846, "language_loss": 0.69700521, "learning_rate": 5.755749136457932e-10, "loss": 0.71850073, "num_input_tokens_seen": 356262605, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.671875, "step": 16505, "time_per_iteration": 2.7473888397216797 }, { "auxiliary_loss_clip": 0.01130955, "auxiliary_loss_mlp": 0.01032745, "balance_loss_clip": 1.01937568, "balance_loss_mlp": 1.03274822, "epoch": 0.9923944085375018, "flos": 23183210908800.0, "grad_norm": 3.719286418341305, "language_loss": 0.65580606, "learning_rate": 5.665468537727757e-10, "loss": 0.67744309, "num_input_tokens_seen": 356278935, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 16506, "time_per_iteration": 4.230038404464722 }, { "auxiliary_loss_clip": 0.01145422, "auxiliary_loss_mlp": 0.01029367, "balance_loss_clip": 1.01632547, "balance_loss_mlp": 1.03293324, "epoch": 0.9924545317901698, "flos": 20813861059200.0, "grad_norm": 1.759841872065172, "language_loss": 0.62780792, "learning_rate": 5.575901485366241e-10, "loss": 0.6495558, "num_input_tokens_seen": 356295675, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6796875, "step": 16507, "time_per_iteration": 2.951641321182251 }, { "auxiliary_loss_clip": 0.01102966, "auxiliary_loss_mlp": 0.01034561, "balance_loss_clip": 1.02189565, "balance_loss_mlp": 1.03385091, "epoch": 0.9925146550428379, "flos": 20083437993600.0, "grad_norm": 2.049045699389441, "language_loss": 0.72802484, "learning_rate": 5.487047982566384e-10, "loss": 0.74940014, "num_input_tokens_seen": 356312885, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 16508, "time_per_iteration": 2.7229068279266357 }, { "auxiliary_loss_clip": 0.0110133, "auxiliary_loss_mlp": 0.0102939, "balance_loss_clip": 1.01679575, "balance_loss_mlp": 1.03269458, "epoch": 0.9925747782955058, "flos": 24973717086720.0, "grad_norm": 1.6072925642737015, "language_loss": 0.70444024, "learning_rate": 5.398908032503425e-10, "loss": 0.72574747, "num_input_tokens_seen": 356334070, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 16509, "time_per_iteration": 2.798149824142456 }, { "auxiliary_loss_clip": 0.0111078, "auxiliary_loss_mlp": 0.0103289, "balance_loss_clip": 1.02051592, "balance_loss_mlp": 1.03257334, "epoch": 0.9926349015481738, "flos": 60472526492160.0, "grad_norm": 2.016502254273266, "language_loss": 0.68554866, "learning_rate": 5.311481638321513e-10, "loss": 0.70698535, "num_input_tokens_seen": 356359410, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 16510, "time_per_iteration": 2.9719412326812744 }, { "auxiliary_loss_clip": 0.01131448, "auxiliary_loss_mlp": 0.01036683, "balance_loss_clip": 1.02417147, "balance_loss_mlp": 1.03474617, "epoch": 0.9926950248008417, "flos": 20741716592640.0, "grad_norm": 2.2442952579649904, "language_loss": 0.81343925, "learning_rate": 5.224768803140378e-10, "loss": 0.83512056, "num_input_tokens_seen": 356378345, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 16511, "time_per_iteration": 2.792386531829834 }, { "auxiliary_loss_clip": 0.01126607, "auxiliary_loss_mlp": 0.01028589, "balance_loss_clip": 1.01705527, "balance_loss_mlp": 1.03279448, "epoch": 0.9927551480535097, "flos": 24352965221760.0, "grad_norm": 1.8479915605180273, "language_loss": 0.91490382, "learning_rate": 5.1387695300531e-10, "loss": 0.93645579, "num_input_tokens_seen": 356397345, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 16512, "time_per_iteration": 2.735077381134033 }, { "auxiliary_loss_clip": 0.01048577, "auxiliary_loss_mlp": 0.01001224, "balance_loss_clip": 1.00000191, "balance_loss_mlp": 1.0011853, "epoch": 0.9928152713061776, "flos": 71275572207360.0, "grad_norm": 0.9413501016542841, "language_loss": 0.55225372, "learning_rate": 5.053483822132776e-10, "loss": 0.57275176, "num_input_tokens_seen": 356459160, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.20996094, "step": 16513, "time_per_iteration": 3.4661002159118652 }, { "auxiliary_loss_clip": 0.0111813, "auxiliary_loss_mlp": 0.01029808, "balance_loss_clip": 1.0172317, "balance_loss_mlp": 1.03303266, "epoch": 0.9928753945588457, "flos": 57809499045120.0, "grad_norm": 1.4068134098413576, "language_loss": 0.65135652, "learning_rate": 4.968911682419197e-10, "loss": 0.67283589, "num_input_tokens_seen": 356486405, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.671875, "step": 16514, "time_per_iteration": 4.678213119506836 }, { "auxiliary_loss_clip": 0.01132568, "auxiliary_loss_mlp": 0.01027641, "balance_loss_clip": 1.01462948, "balance_loss_mlp": 1.03515363, "epoch": 0.9929355178115136, "flos": 19568981450880.0, "grad_norm": 2.9193557207324443, "language_loss": 0.73410535, "learning_rate": 4.885053113929949e-10, "loss": 0.7557075, "num_input_tokens_seen": 356502905, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 16515, "time_per_iteration": 2.8141379356384277 }, { "auxiliary_loss_clip": 0.01130184, "auxiliary_loss_mlp": 0.01030807, "balance_loss_clip": 1.01877284, "balance_loss_mlp": 1.03489161, "epoch": 0.9929956410641816, "flos": 22964658606720.0, "grad_norm": 1.9512705248134037, "language_loss": 0.77191544, "learning_rate": 4.801908119660414e-10, "loss": 0.79352534, "num_input_tokens_seen": 356523830, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 16516, "time_per_iteration": 2.755868911743164 }, { "auxiliary_loss_clip": 0.01126429, "auxiliary_loss_mlp": 0.01029717, "balance_loss_clip": 1.01810038, "balance_loss_mlp": 1.03262758, "epoch": 0.9930557643168495, "flos": 22566409539840.0, "grad_norm": 1.4216251952654644, "language_loss": 0.78122455, "learning_rate": 4.719476702579328e-10, "loss": 0.80278599, "num_input_tokens_seen": 356543965, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 16517, "time_per_iteration": 2.8287103176116943 }, { "auxiliary_loss_clip": 0.01116055, "auxiliary_loss_mlp": 0.01037063, "balance_loss_clip": 1.02300191, "balance_loss_mlp": 1.03508234, "epoch": 0.9931158875695175, "flos": 17201032231680.0, "grad_norm": 1.9690661986717102, "language_loss": 0.6739493, "learning_rate": 4.6377588656243416e-10, "loss": 0.69548047, "num_input_tokens_seen": 356561530, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71875, "step": 16518, "time_per_iteration": 2.6520133018493652 }, { "auxiliary_loss_clip": 0.01040007, "auxiliary_loss_mlp": 0.01000971, "balance_loss_clip": 0.99974281, "balance_loss_mlp": 1.00095785, "epoch": 0.9931760108221854, "flos": 63711204278400.0, "grad_norm": 0.7296296113952097, "language_loss": 0.53498161, "learning_rate": 4.5567546117131206e-10, "loss": 0.55539137, "num_input_tokens_seen": 356616845, "router_z_loss_clip": 0.01226807, "router_z_loss_mlp": 0.20898438, "step": 16519, "time_per_iteration": 3.225369691848755 }, { "auxiliary_loss_clip": 0.0112765, "auxiliary_loss_mlp": 0.01028751, "balance_loss_clip": 1.01665115, "balance_loss_mlp": 1.03388011, "epoch": 0.9932361340748534, "flos": 15304805349120.0, "grad_norm": 2.3577342269127293, "language_loss": 0.60085893, "learning_rate": 4.476463943738906e-10, "loss": 0.62242287, "num_input_tokens_seen": 356633560, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.66796875, "step": 16520, "time_per_iteration": 2.776840925216675 }, { "auxiliary_loss_clip": 0.011207, "auxiliary_loss_mlp": 0.01031751, "balance_loss_clip": 1.01988375, "balance_loss_mlp": 1.03445172, "epoch": 0.9932962573275215, "flos": 36064906727040.0, "grad_norm": 1.667920348632349, "language_loss": 0.62063169, "learning_rate": 4.396886864561633e-10, "loss": 0.64215618, "num_input_tokens_seen": 356657600, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 16521, "time_per_iteration": 4.514011859893799 }, { "auxiliary_loss_clip": 0.01123555, "auxiliary_loss_mlp": 0.01030926, "balance_loss_clip": 1.01814055, "balance_loss_mlp": 1.03478801, "epoch": 0.9933563805801894, "flos": 21470523546240.0, "grad_norm": 1.7085013160768887, "language_loss": 0.74354601, "learning_rate": 4.318023377027913e-10, "loss": 0.76509088, "num_input_tokens_seen": 356675880, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 16522, "time_per_iteration": 5.524011850357056 }, { "auxiliary_loss_clip": 0.01127499, "auxiliary_loss_mlp": 0.01029415, "balance_loss_clip": 1.01747632, "balance_loss_mlp": 1.03389621, "epoch": 0.9934165038328574, "flos": 23986532626560.0, "grad_norm": 1.424819121356992, "language_loss": 0.73210096, "learning_rate": 4.23987348394661e-10, "loss": 0.7536701, "num_input_tokens_seen": 356696000, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 16523, "time_per_iteration": 2.7217905521392822 }, { "auxiliary_loss_clip": 0.01117582, "auxiliary_loss_mlp": 0.0127566, "balance_loss_clip": 1.01674438, "balance_loss_mlp": 1.03222322, "epoch": 0.9934766270855253, "flos": 21907807718400.0, "grad_norm": 1.4911330252200758, "language_loss": 0.71037805, "learning_rate": 4.1624371881110455e-10, "loss": 0.73431051, "num_input_tokens_seen": 356716845, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 16524, "time_per_iteration": 2.775564432144165 }, { "auxiliary_loss_clip": 0.01129417, "auxiliary_loss_mlp": 0.01030757, "balance_loss_clip": 1.01897359, "balance_loss_mlp": 1.03442788, "epoch": 0.9935367503381933, "flos": 17129139160320.0, "grad_norm": 2.5487620311184505, "language_loss": 0.79357779, "learning_rate": 4.0857144922812335e-10, "loss": 0.81517959, "num_input_tokens_seen": 356732100, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 16525, "time_per_iteration": 2.7425975799560547 }, { "auxiliary_loss_clip": 0.01127136, "auxiliary_loss_mlp": 0.01029528, "balance_loss_clip": 1.01720166, "balance_loss_mlp": 1.03337312, "epoch": 0.9935968735908612, "flos": 22346241125760.0, "grad_norm": 1.8155395078069059, "language_loss": 0.75102341, "learning_rate": 4.009705399197205e-10, "loss": 0.77259004, "num_input_tokens_seen": 356751480, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6640625, "step": 16526, "time_per_iteration": 2.692892551422119 }, { "auxiliary_loss_clip": 0.01097824, "auxiliary_loss_mlp": 0.01274224, "balance_loss_clip": 1.01607263, "balance_loss_mlp": 1.03193533, "epoch": 0.9936569968435293, "flos": 29460539640960.0, "grad_norm": 1.5671560272490053, "language_loss": 0.72457206, "learning_rate": 3.934409911570125e-10, "loss": 0.74829257, "num_input_tokens_seen": 356772650, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.65625, "step": 16527, "time_per_iteration": 2.6548402309417725 }, { "auxiliary_loss_clip": 0.01120616, "auxiliary_loss_mlp": 0.01029216, "balance_loss_clip": 1.01737249, "balance_loss_mlp": 1.03300607, "epoch": 0.9937171200961972, "flos": 16544046522240.0, "grad_norm": 2.204451237672497, "language_loss": 0.75994849, "learning_rate": 3.859828032088952e-10, "loss": 0.78144681, "num_input_tokens_seen": 356788510, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.703125, "step": 16528, "time_per_iteration": 2.631263017654419 }, { "auxiliary_loss_clip": 0.01121303, "auxiliary_loss_mlp": 0.01030216, "balance_loss_clip": 1.01654267, "balance_loss_mlp": 1.03309226, "epoch": 0.9937772433488652, "flos": 24390276474240.0, "grad_norm": 5.413186181831092, "language_loss": 0.67795324, "learning_rate": 3.785959763413782e-10, "loss": 0.69946843, "num_input_tokens_seen": 356809115, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 16529, "time_per_iteration": 2.805274724960327 }, { "auxiliary_loss_clip": 0.01116127, "auxiliary_loss_mlp": 0.01034082, "balance_loss_clip": 1.02006269, "balance_loss_mlp": 1.03523982, "epoch": 0.9938373666015331, "flos": 15669909141120.0, "grad_norm": 3.14166832356461, "language_loss": 0.65330303, "learning_rate": 3.712805108182504e-10, "loss": 0.67480516, "num_input_tokens_seen": 356826410, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71875, "step": 16530, "time_per_iteration": 2.639852285385132 }, { "auxiliary_loss_clip": 0.01113799, "auxiliary_loss_mlp": 0.0102896, "balance_loss_clip": 1.01701534, "balance_loss_mlp": 1.03530884, "epoch": 0.9938974898542011, "flos": 19496190539520.0, "grad_norm": 2.4259125232448095, "language_loss": 0.7091167, "learning_rate": 3.640364069001922e-10, "loss": 0.73054433, "num_input_tokens_seen": 356844990, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 16531, "time_per_iteration": 2.657249927520752 }, { "auxiliary_loss_clip": 0.01114507, "auxiliary_loss_mlp": 0.01031641, "balance_loss_clip": 1.01903486, "balance_loss_mlp": 1.03484905, "epoch": 0.993957613106869, "flos": 26906896085760.0, "grad_norm": 1.9714193579483132, "language_loss": 0.73998797, "learning_rate": 3.568636648463297e-10, "loss": 0.76144946, "num_input_tokens_seen": 356866530, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 16532, "time_per_iteration": 2.751068115234375 }, { "auxiliary_loss_clip": 0.01131008, "auxiliary_loss_mlp": 0.01030155, "balance_loss_clip": 1.01946235, "balance_loss_mlp": 1.03220546, "epoch": 0.994017736359537, "flos": 14939593816320.0, "grad_norm": 1.6652153605192692, "language_loss": 0.70379007, "learning_rate": 3.497622849120141e-10, "loss": 0.7254017, "num_input_tokens_seen": 356884660, "router_z_loss_clip": 0.10693359, "router_z_loss_mlp": 0.6328125, "step": 16533, "time_per_iteration": 2.901458263397217 }, { "auxiliary_loss_clip": 0.01113186, "auxiliary_loss_mlp": 0.01027816, "balance_loss_clip": 1.0150491, "balance_loss_mlp": 1.03419006, "epoch": 0.9940778596122051, "flos": 15377883569280.0, "grad_norm": 2.1111134678469616, "language_loss": 0.84471744, "learning_rate": 3.427322673512645e-10, "loss": 0.86612749, "num_input_tokens_seen": 356900895, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 16534, "time_per_iteration": 2.6646006107330322 }, { "auxiliary_loss_clip": 0.01124364, "auxiliary_loss_mlp": 0.01027059, "balance_loss_clip": 1.01579332, "balance_loss_mlp": 1.03317368, "epoch": 0.994137982864873, "flos": 25228108183680.0, "grad_norm": 1.3486176268567753, "language_loss": 0.65820384, "learning_rate": 3.357736124143251e-10, "loss": 0.67971802, "num_input_tokens_seen": 356920985, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6484375, "step": 16535, "time_per_iteration": 2.8999667167663574 }, { "auxiliary_loss_clip": 0.0112712, "auxiliary_loss_mlp": 0.01028701, "balance_loss_clip": 1.01742399, "balance_loss_mlp": 1.03447843, "epoch": 0.994198106117541, "flos": 18442140912000.0, "grad_norm": 1.5841902721609815, "language_loss": 0.65081578, "learning_rate": 3.28886320350108e-10, "loss": 0.67237401, "num_input_tokens_seen": 356939800, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6640625, "step": 16536, "time_per_iteration": 2.592996120452881 }, { "auxiliary_loss_clip": 0.01127562, "auxiliary_loss_mlp": 0.01028883, "balance_loss_clip": 1.01628828, "balance_loss_mlp": 1.03371119, "epoch": 0.9942582293702089, "flos": 16654112772480.0, "grad_norm": 1.8496038756353343, "language_loss": 0.78681403, "learning_rate": 3.2207039140419444e-10, "loss": 0.80837846, "num_input_tokens_seen": 356957780, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.66796875, "step": 16537, "time_per_iteration": 2.722907066345215 }, { "auxiliary_loss_clip": 0.01116728, "auxiliary_loss_mlp": 0.01033749, "balance_loss_clip": 1.02103531, "balance_loss_mlp": 1.03496206, "epoch": 0.9943183526228769, "flos": 21944580266880.0, "grad_norm": 1.7738541483273007, "language_loss": 0.69064605, "learning_rate": 3.153258258197233e-10, "loss": 0.71215081, "num_input_tokens_seen": 356979185, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73046875, "step": 16538, "time_per_iteration": 2.561228036880493 }, { "auxiliary_loss_clip": 0.01123207, "auxiliary_loss_mlp": 0.01028457, "balance_loss_clip": 1.01713181, "balance_loss_mlp": 1.0309577, "epoch": 0.9943784758755448, "flos": 23842566915840.0, "grad_norm": 2.1175949300763777, "language_loss": 0.75022554, "learning_rate": 3.08652623837391e-10, "loss": 0.77174222, "num_input_tokens_seen": 356997735, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66015625, "step": 16539, "time_per_iteration": 2.558892011642456 }, { "auxiliary_loss_clip": 0.01140504, "auxiliary_loss_mlp": 0.01033012, "balance_loss_clip": 1.02095962, "balance_loss_mlp": 1.03522921, "epoch": 0.9944385991282129, "flos": 21469984842240.0, "grad_norm": 2.2697469600771414, "language_loss": 0.70381939, "learning_rate": 3.0205078569545126e-10, "loss": 0.72555459, "num_input_tokens_seen": 357015660, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6953125, "step": 16540, "time_per_iteration": 2.5512161254882812 }, { "auxiliary_loss_clip": 0.01108147, "auxiliary_loss_mlp": 0.01024257, "balance_loss_clip": 1.01342106, "balance_loss_mlp": 1.03209209, "epoch": 0.9944987223808808, "flos": 22927024131840.0, "grad_norm": 1.603112200187755, "language_loss": 0.74998486, "learning_rate": 2.9552031162949355e-10, "loss": 0.7713089, "num_input_tokens_seen": 357034800, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.671875, "step": 16541, "time_per_iteration": 2.581455945968628 }, { "auxiliary_loss_clip": 0.01118708, "auxiliary_loss_mlp": 0.01033728, "balance_loss_clip": 1.02140749, "balance_loss_mlp": 1.0325532, "epoch": 0.9945588456335488, "flos": 22383013674240.0, "grad_norm": 1.7088089312979369, "language_loss": 0.76517344, "learning_rate": 2.890612018726646e-10, "loss": 0.78669775, "num_input_tokens_seen": 357053785, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 16542, "time_per_iteration": 2.574172019958496 }, { "auxiliary_loss_clip": 0.0111442, "auxiliary_loss_mlp": 0.01031921, "balance_loss_clip": 1.01871264, "balance_loss_mlp": 1.03465414, "epoch": 0.9946189688862167, "flos": 21397517153280.0, "grad_norm": 2.5457155277601005, "language_loss": 0.74274802, "learning_rate": 2.826734566552247e-10, "loss": 0.76421142, "num_input_tokens_seen": 357072025, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 16543, "time_per_iteration": 2.601248025894165 }, { "auxiliary_loss_clip": 0.01115314, "auxiliary_loss_mlp": 0.01029156, "balance_loss_clip": 1.01632309, "balance_loss_mlp": 1.03520739, "epoch": 0.9946790921388847, "flos": 12416545670400.0, "grad_norm": 2.8148656757629262, "language_loss": 0.81385636, "learning_rate": 2.763570762054357e-10, "loss": 0.83530104, "num_input_tokens_seen": 357086960, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 16544, "time_per_iteration": 2.7745120525360107 }, { "auxiliary_loss_clip": 0.01100946, "auxiliary_loss_mlp": 0.01027869, "balance_loss_clip": 1.01568604, "balance_loss_mlp": 1.03353691, "epoch": 0.9947392153915526, "flos": 19058295836160.0, "grad_norm": 1.9049254341315818, "language_loss": 0.78701031, "learning_rate": 2.7011206074845084e-10, "loss": 0.80829847, "num_input_tokens_seen": 357105095, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 16545, "time_per_iteration": 2.550276041030884 }, { "auxiliary_loss_clip": 0.01133749, "auxiliary_loss_mlp": 0.01029604, "balance_loss_clip": 1.01739693, "balance_loss_mlp": 1.03632557, "epoch": 0.9947993386442207, "flos": 27308808339840.0, "grad_norm": 2.1305842686225915, "language_loss": 0.72360528, "learning_rate": 2.639384105074249e-10, "loss": 0.74523878, "num_input_tokens_seen": 357125065, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 16546, "time_per_iteration": 2.7077958583831787 }, { "auxiliary_loss_clip": 0.01108368, "auxiliary_loss_mlp": 0.01031365, "balance_loss_clip": 1.02016568, "balance_loss_mlp": 1.03302073, "epoch": 0.9948594618968887, "flos": 20806498771200.0, "grad_norm": 2.152855240458268, "language_loss": 0.77966487, "learning_rate": 2.578361257024042e-10, "loss": 0.80106217, "num_input_tokens_seen": 357141600, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6640625, "step": 16547, "time_per_iteration": 2.5881869792938232 }, { "auxiliary_loss_clip": 0.01119731, "auxiliary_loss_mlp": 0.01030442, "balance_loss_clip": 1.01909924, "balance_loss_mlp": 1.03397906, "epoch": 0.9949195851495566, "flos": 23292953936640.0, "grad_norm": 2.0585812901601455, "language_loss": 0.70469475, "learning_rate": 2.518052065512144e-10, "loss": 0.72619653, "num_input_tokens_seen": 357157880, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 16548, "time_per_iteration": 4.044821739196777 }, { "auxiliary_loss_clip": 0.0111192, "auxiliary_loss_mlp": 0.01031455, "balance_loss_clip": 1.01937902, "balance_loss_mlp": 1.03461766, "epoch": 0.9949797084022246, "flos": 18515470527360.0, "grad_norm": 1.847497659421736, "language_loss": 0.75536162, "learning_rate": 2.4584565326901674e-10, "loss": 0.77679533, "num_input_tokens_seen": 357176705, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.68359375, "step": 16549, "time_per_iteration": 2.5445239543914795 }, { "auxiliary_loss_clip": 0.01131654, "auxiliary_loss_mlp": 0.01034745, "balance_loss_clip": 1.02160192, "balance_loss_mlp": 1.03497469, "epoch": 0.9950398316548925, "flos": 30407719328640.0, "grad_norm": 1.6588871866110146, "language_loss": 0.74580216, "learning_rate": 2.39957466068752e-10, "loss": 0.76746613, "num_input_tokens_seen": 357197630, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 16550, "time_per_iteration": 2.634700298309326 }, { "auxiliary_loss_clip": 0.01137719, "auxiliary_loss_mlp": 0.01274972, "balance_loss_clip": 1.01634884, "balance_loss_mlp": 1.03340948, "epoch": 0.9950999549075605, "flos": 19900868140800.0, "grad_norm": 1.5978191525839491, "language_loss": 0.78255665, "learning_rate": 2.341406451604744e-10, "loss": 0.8066836, "num_input_tokens_seen": 357215445, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 16551, "time_per_iteration": 2.7396390438079834 }, { "auxiliary_loss_clip": 0.01104994, "auxiliary_loss_mlp": 0.01032097, "balance_loss_clip": 1.02151108, "balance_loss_mlp": 1.03291154, "epoch": 0.9951600781602284, "flos": 17603555016960.0, "grad_norm": 1.361994165234439, "language_loss": 0.66819805, "learning_rate": 2.2839519075157355e-10, "loss": 0.689569, "num_input_tokens_seen": 357234285, "router_z_loss_clip": 0.10595703, "router_z_loss_mlp": 0.63671875, "step": 16552, "time_per_iteration": 2.5390801429748535 }, { "auxiliary_loss_clip": 0.01103341, "auxiliary_loss_mlp": 0.01026643, "balance_loss_clip": 1.01443017, "balance_loss_mlp": 1.03374767, "epoch": 0.9952202014128965, "flos": 28950715952640.0, "grad_norm": 1.4785041785522874, "language_loss": 0.81487834, "learning_rate": 2.227211030472187e-10, "loss": 0.83617818, "num_input_tokens_seen": 357257565, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 16553, "time_per_iteration": 2.864543914794922 }, { "auxiliary_loss_clip": 0.01031067, "auxiliary_loss_mlp": 0.01000932, "balance_loss_clip": 0.99977607, "balance_loss_mlp": 1.00146639, "epoch": 0.9952803246655644, "flos": 70810386145920.0, "grad_norm": 0.7430622755142258, "language_loss": 0.57325011, "learning_rate": 2.1711838224991453e-10, "loss": 0.59357011, "num_input_tokens_seen": 357320205, "router_z_loss_clip": 0.01153564, "router_z_loss_mlp": 0.20996094, "step": 16554, "time_per_iteration": 3.3037633895874023 }, { "auxiliary_loss_clip": 0.01112321, "auxiliary_loss_mlp": 0.01036217, "balance_loss_clip": 1.02414107, "balance_loss_mlp": 1.03394747, "epoch": 0.9953404479182324, "flos": 21799070271360.0, "grad_norm": 1.6394239484064, "language_loss": 0.76914793, "learning_rate": 2.1158702855972322e-10, "loss": 0.79063332, "num_input_tokens_seen": 357340695, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69140625, "step": 16555, "time_per_iteration": 2.674875020980835 }, { "auxiliary_loss_clip": 0.01129124, "auxiliary_loss_mlp": 0.01031899, "balance_loss_clip": 1.01929259, "balance_loss_mlp": 1.03335571, "epoch": 0.9954005711709003, "flos": 21937397546880.0, "grad_norm": 2.0251011124369986, "language_loss": 0.86107171, "learning_rate": 2.0612704217382038e-10, "loss": 0.88268197, "num_input_tokens_seen": 357357505, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 16556, "time_per_iteration": 4.50895094871521 }, { "auxiliary_loss_clip": 0.01134364, "auxiliary_loss_mlp": 0.01031312, "balance_loss_clip": 1.01894999, "balance_loss_mlp": 1.03388643, "epoch": 0.9954606944235683, "flos": 19354559212800.0, "grad_norm": 1.6304780968904868, "language_loss": 0.73402381, "learning_rate": 2.0073842328716118e-10, "loss": 0.75568056, "num_input_tokens_seen": 357375395, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.65625, "step": 16557, "time_per_iteration": 2.9722518920898438 }, { "auxiliary_loss_clip": 0.01107709, "auxiliary_loss_mlp": 0.01032123, "balance_loss_clip": 1.02057147, "balance_loss_mlp": 1.03312421, "epoch": 0.9955208176762362, "flos": 30518611591680.0, "grad_norm": 1.7166745522906184, "language_loss": 0.76108611, "learning_rate": 1.9542117209203624e-10, "loss": 0.78248441, "num_input_tokens_seen": 357397375, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.65625, "step": 16558, "time_per_iteration": 3.1140408515930176 }, { "auxiliary_loss_clip": 0.01119783, "auxiliary_loss_mlp": 0.0103217, "balance_loss_clip": 1.01954007, "balance_loss_mlp": 1.03240407, "epoch": 0.9955809409289043, "flos": 30008249199360.0, "grad_norm": 2.181627519130476, "language_loss": 0.63201916, "learning_rate": 1.901752887782937e-10, "loss": 0.6535387, "num_input_tokens_seen": 357418880, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 16559, "time_per_iteration": 3.137378215789795 }, { "auxiliary_loss_clip": 0.01110155, "auxiliary_loss_mlp": 0.01028288, "balance_loss_clip": 1.01633716, "balance_loss_mlp": 1.03419256, "epoch": 0.9956410641815723, "flos": 21543278544000.0, "grad_norm": 1.8047015145861083, "language_loss": 0.73979139, "learning_rate": 1.8500077353289512e-10, "loss": 0.76117587, "num_input_tokens_seen": 357438310, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66796875, "step": 16560, "time_per_iteration": 3.0513973236083984 }, { "auxiliary_loss_clip": 0.01143712, "auxiliary_loss_mlp": 0.01028322, "balance_loss_clip": 1.016222, "balance_loss_mlp": 1.03282857, "epoch": 0.9957011874342402, "flos": 21689470897920.0, "grad_norm": 1.7182250717374703, "language_loss": 0.79066283, "learning_rate": 1.7989762654080366e-10, "loss": 0.81238317, "num_input_tokens_seen": 357457155, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 16561, "time_per_iteration": 2.9471700191497803 }, { "auxiliary_loss_clip": 0.01099965, "auxiliary_loss_mlp": 0.01027995, "balance_loss_clip": 1.01696861, "balance_loss_mlp": 1.03403926, "epoch": 0.9957613106869082, "flos": 17702667619200.0, "grad_norm": 2.3332969783353787, "language_loss": 0.6555329, "learning_rate": 1.7486584798409587e-10, "loss": 0.67681247, "num_input_tokens_seen": 357468060, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66015625, "step": 16562, "time_per_iteration": 2.7988030910491943 }, { "auxiliary_loss_clip": 0.01123445, "auxiliary_loss_mlp": 0.01279768, "balance_loss_clip": 1.02011096, "balance_loss_mlp": 1.03434849, "epoch": 0.9958214339395761, "flos": 30555994671360.0, "grad_norm": 1.7951461895348952, "language_loss": 0.64521599, "learning_rate": 1.6990543804218383e-10, "loss": 0.6692481, "num_input_tokens_seen": 357489665, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 16563, "time_per_iteration": 5.1066882610321045 }, { "auxiliary_loss_clip": 0.01117344, "auxiliary_loss_mlp": 0.01033614, "balance_loss_clip": 1.02101982, "balance_loss_mlp": 1.03301573, "epoch": 0.9958815571922441, "flos": 24169174306560.0, "grad_norm": 1.5786424906310597, "language_loss": 0.64610636, "learning_rate": 1.6501639689203706e-10, "loss": 0.66761601, "num_input_tokens_seen": 357511975, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.66796875, "step": 16564, "time_per_iteration": 5.164210557937622 }, { "auxiliary_loss_clip": 0.01118403, "auxiliary_loss_mlp": 0.01273008, "balance_loss_clip": 1.014359, "balance_loss_mlp": 1.03392339, "epoch": 0.995941680444912, "flos": 15487016065920.0, "grad_norm": 1.7535108782982205, "language_loss": 0.73814481, "learning_rate": 1.6019872470840468e-10, "loss": 0.76205891, "num_input_tokens_seen": 357529345, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66796875, "step": 16565, "time_per_iteration": 2.9280588626861572 }, { "auxiliary_loss_clip": 0.01120915, "auxiliary_loss_mlp": 0.01030085, "balance_loss_clip": 1.01845598, "balance_loss_mlp": 1.0334785, "epoch": 0.9960018036975801, "flos": 18621227145600.0, "grad_norm": 2.1578876212972107, "language_loss": 0.79301345, "learning_rate": 1.554524216631492e-10, "loss": 0.81452346, "num_input_tokens_seen": 357547615, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 16566, "time_per_iteration": 3.1057209968566895 }, { "auxiliary_loss_clip": 0.01135368, "auxiliary_loss_mlp": 0.01276347, "balance_loss_clip": 1.01809692, "balance_loss_mlp": 1.03274584, "epoch": 0.996061926950248, "flos": 20084120352000.0, "grad_norm": 1.700827855377297, "language_loss": 0.70675302, "learning_rate": 1.5077748792546862e-10, "loss": 0.73087013, "num_input_tokens_seen": 357567380, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.66796875, "step": 16567, "time_per_iteration": 3.0197558403015137 }, { "auxiliary_loss_clip": 0.01113149, "auxiliary_loss_mlp": 0.01033926, "balance_loss_clip": 1.02105784, "balance_loss_mlp": 1.03431821, "epoch": 0.996122050202916, "flos": 24347829576960.0, "grad_norm": 5.335021227376871, "language_loss": 0.78874421, "learning_rate": 1.461739236623405e-10, "loss": 0.81021494, "num_input_tokens_seen": 357586435, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 16568, "time_per_iteration": 3.0384504795074463 }, { "auxiliary_loss_clip": 0.01106421, "auxiliary_loss_mlp": 0.01026616, "balance_loss_clip": 1.01575017, "balance_loss_mlp": 1.03286695, "epoch": 0.9961821734555839, "flos": 24199302839040.0, "grad_norm": 1.6602989813163724, "language_loss": 0.81972522, "learning_rate": 1.416417290380778e-10, "loss": 0.84105557, "num_input_tokens_seen": 357604720, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.6484375, "step": 16569, "time_per_iteration": 3.0510573387145996 }, { "auxiliary_loss_clip": 0.01128158, "auxiliary_loss_mlp": 0.01282316, "balance_loss_clip": 1.0233897, "balance_loss_mlp": 1.03513169, "epoch": 0.9962422967082519, "flos": 22633741584000.0, "grad_norm": 1.9704839474480333, "language_loss": 0.70151436, "learning_rate": 1.3718090421432905e-10, "loss": 0.72561908, "num_input_tokens_seen": 357622345, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6640625, "step": 16570, "time_per_iteration": 2.964836597442627 }, { "auxiliary_loss_clip": 0.01128789, "auxiliary_loss_mlp": 0.01025702, "balance_loss_clip": 1.01354909, "balance_loss_mlp": 1.03372204, "epoch": 0.9963024199609198, "flos": 26396030903040.0, "grad_norm": 2.161891707522905, "language_loss": 0.74855697, "learning_rate": 1.3279144935030018e-10, "loss": 0.77010185, "num_input_tokens_seen": 357642710, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 16571, "time_per_iteration": 3.0432844161987305 }, { "auxiliary_loss_clip": 0.01119307, "auxiliary_loss_mlp": 0.01033539, "balance_loss_clip": 1.0212009, "balance_loss_mlp": 1.03352427, "epoch": 0.9963625432135879, "flos": 16581537342720.0, "grad_norm": 1.7855043093437448, "language_loss": 0.79765767, "learning_rate": 1.284733646027547e-10, "loss": 0.81918609, "num_input_tokens_seen": 357659870, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 16572, "time_per_iteration": 2.9169600009918213 }, { "auxiliary_loss_clip": 0.0111933, "auxiliary_loss_mlp": 0.01032667, "balance_loss_clip": 1.02046013, "balance_loss_mlp": 1.03441858, "epoch": 0.9964226664662559, "flos": 26468534505600.0, "grad_norm": 1.6923184638252466, "language_loss": 0.70332909, "learning_rate": 1.2422665012556954e-10, "loss": 0.7248491, "num_input_tokens_seen": 357677075, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 16573, "time_per_iteration": 3.097241163253784 }, { "auxiliary_loss_clip": 0.01122342, "auxiliary_loss_mlp": 0.01036549, "balance_loss_clip": 1.02353144, "balance_loss_mlp": 1.0340867, "epoch": 0.9964827897189238, "flos": 27448320764160.0, "grad_norm": 1.9522059082868013, "language_loss": 0.62889051, "learning_rate": 1.200513060706232e-10, "loss": 0.65047944, "num_input_tokens_seen": 357696715, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 16574, "time_per_iteration": 3.052757978439331 }, { "auxiliary_loss_clip": 0.01134892, "auxiliary_loss_mlp": 0.01034624, "balance_loss_clip": 1.02141571, "balance_loss_mlp": 1.03577065, "epoch": 0.9965429129715918, "flos": 11721566350080.0, "grad_norm": 2.216955835930519, "language_loss": 0.7615701, "learning_rate": 1.1594733258668555e-10, "loss": 0.78326529, "num_input_tokens_seen": 357712345, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 16575, "time_per_iteration": 2.921813726425171 }, { "auxiliary_loss_clip": 0.01121699, "auxiliary_loss_mlp": 0.01029059, "balance_loss_clip": 1.01645851, "balance_loss_mlp": 1.03485405, "epoch": 0.9966030362242597, "flos": 19756004590080.0, "grad_norm": 2.227843996288061, "language_loss": 0.70063579, "learning_rate": 1.1191472982008399e-10, "loss": 0.72214335, "num_input_tokens_seen": 357731815, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 16576, "time_per_iteration": 3.009296178817749 }, { "auxiliary_loss_clip": 0.01109255, "auxiliary_loss_mlp": 0.01025993, "balance_loss_clip": 1.01353526, "balance_loss_mlp": 1.03398418, "epoch": 0.9966631594769277, "flos": 23915178259200.0, "grad_norm": 1.6327865910382349, "language_loss": 0.71741366, "learning_rate": 1.0795349791514752e-10, "loss": 0.73876613, "num_input_tokens_seen": 357751640, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6640625, "step": 16577, "time_per_iteration": 2.9697134494781494 }, { "auxiliary_loss_clip": 0.01119068, "auxiliary_loss_mlp": 0.01035343, "balance_loss_clip": 1.02347589, "balance_loss_mlp": 1.03286326, "epoch": 0.9967232827295956, "flos": 15559591495680.0, "grad_norm": 2.220111322119519, "language_loss": 0.78872651, "learning_rate": 1.0406363701287446e-10, "loss": 0.81027061, "num_input_tokens_seen": 357769850, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 16578, "time_per_iteration": 3.0969977378845215 }, { "auxiliary_loss_clip": 0.01137375, "auxiliary_loss_mlp": 0.01278282, "balance_loss_clip": 1.01974154, "balance_loss_mlp": 1.03389049, "epoch": 0.9967834059822637, "flos": 20813035046400.0, "grad_norm": 2.791626303544583, "language_loss": 0.76229608, "learning_rate": 1.0024514725226474e-10, "loss": 0.78645265, "num_input_tokens_seen": 357789550, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.68359375, "step": 16579, "time_per_iteration": 2.9955782890319824 }, { "auxiliary_loss_clip": 0.01123847, "auxiliary_loss_mlp": 0.01033453, "balance_loss_clip": 1.0194813, "balance_loss_mlp": 1.03609574, "epoch": 0.9968435292349316, "flos": 36719234830080.0, "grad_norm": 2.299401539422543, "language_loss": 0.69257963, "learning_rate": 9.649802876965374e-11, "loss": 0.71415263, "num_input_tokens_seen": 357809525, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.69921875, "step": 16580, "time_per_iteration": 3.261763572692871 }, { "auxiliary_loss_clip": 0.01137565, "auxiliary_loss_mlp": 0.01030439, "balance_loss_clip": 1.01863766, "balance_loss_mlp": 1.03294694, "epoch": 0.9969036524875996, "flos": 26760919213440.0, "grad_norm": 1.8024946872881842, "language_loss": 0.80023038, "learning_rate": 9.282228169849027e-11, "loss": 0.82191044, "num_input_tokens_seen": 357829795, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 16581, "time_per_iteration": 3.023622512817383 }, { "auxiliary_loss_clip": 0.01116931, "auxiliary_loss_mlp": 0.01025903, "balance_loss_clip": 1.01489425, "balance_loss_mlp": 1.03346562, "epoch": 0.9969637757402675, "flos": 24827237424000.0, "grad_norm": 1.692505969679038, "language_loss": 0.79925942, "learning_rate": 8.921790617022473e-11, "loss": 0.82068777, "num_input_tokens_seen": 357851655, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.65625, "step": 16582, "time_per_iteration": 2.994018316268921 }, { "auxiliary_loss_clip": 0.01106043, "auxiliary_loss_mlp": 0.01026952, "balance_loss_clip": 1.01538324, "balance_loss_mlp": 1.03261423, "epoch": 0.9970238989929355, "flos": 23038742407680.0, "grad_norm": 1.659509119021916, "language_loss": 0.60531145, "learning_rate": 8.568490231342096e-11, "loss": 0.62664139, "num_input_tokens_seen": 357871205, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.64453125, "step": 16583, "time_per_iteration": 3.0114097595214844 }, { "auxiliary_loss_clip": 0.01120339, "auxiliary_loss_mlp": 0.01034908, "balance_loss_clip": 1.02267134, "balance_loss_mlp": 1.03380525, "epoch": 0.9970840222456034, "flos": 25298816106240.0, "grad_norm": 1.6364915130403503, "language_loss": 0.77851605, "learning_rate": 8.222327025397824e-11, "loss": 0.80006856, "num_input_tokens_seen": 357892145, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 16584, "time_per_iteration": 3.0771260261535645 }, { "auxiliary_loss_clip": 0.01114751, "auxiliary_loss_mlp": 0.01031145, "balance_loss_clip": 1.01994503, "balance_loss_mlp": 1.03184438, "epoch": 0.9971441454982715, "flos": 21615602578560.0, "grad_norm": 1.6383442772516987, "language_loss": 0.69297087, "learning_rate": 7.883301011579746e-11, "loss": 0.7144298, "num_input_tokens_seen": 357911205, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.65234375, "step": 16585, "time_per_iteration": 3.041546106338501 }, { "auxiliary_loss_clip": 0.01101685, "auxiliary_loss_mlp": 0.01028649, "balance_loss_clip": 1.01625752, "balance_loss_mlp": 1.03498268, "epoch": 0.9972042687509394, "flos": 14975612179200.0, "grad_norm": 2.315336691143497, "language_loss": 0.8123343, "learning_rate": 7.551412201944885e-11, "loss": 0.83363771, "num_input_tokens_seen": 357928190, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.66796875, "step": 16586, "time_per_iteration": 2.894984245300293 }, { "auxiliary_loss_clip": 0.01120523, "auxiliary_loss_mlp": 0.01034954, "balance_loss_clip": 1.02322423, "balance_loss_mlp": 1.03343368, "epoch": 0.9972643920036074, "flos": 25806664546560.0, "grad_norm": 1.7028614857156419, "language_loss": 0.77700233, "learning_rate": 7.226660608372626e-11, "loss": 0.79855716, "num_input_tokens_seen": 357946985, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 16587, "time_per_iteration": 3.0579729080200195 }, { "auxiliary_loss_clip": 0.01122229, "auxiliary_loss_mlp": 0.01033073, "balance_loss_clip": 1.02078843, "balance_loss_mlp": 1.03444266, "epoch": 0.9973245152562754, "flos": 23326242865920.0, "grad_norm": 2.069580413080229, "language_loss": 0.72635484, "learning_rate": 6.909046242431493e-11, "loss": 0.74790788, "num_input_tokens_seen": 357966720, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 16588, "time_per_iteration": 3.217871904373169 }, { "auxiliary_loss_clip": 0.01107451, "auxiliary_loss_mlp": 0.01029682, "balance_loss_clip": 1.01723647, "balance_loss_mlp": 1.03483367, "epoch": 0.9973846385089433, "flos": 12166212810240.0, "grad_norm": 2.681768537031433, "language_loss": 0.83330536, "learning_rate": 6.598569115467967e-11, "loss": 0.85467666, "num_input_tokens_seen": 357981375, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7265625, "step": 16589, "time_per_iteration": 4.38031530380249 }, { "auxiliary_loss_clip": 0.01115249, "auxiliary_loss_mlp": 0.01034839, "balance_loss_clip": 1.0213145, "balance_loss_mlp": 1.03542888, "epoch": 0.9974447617616113, "flos": 20045157073920.0, "grad_norm": 2.138732078861646, "language_loss": 0.7027117, "learning_rate": 6.295229238562072e-11, "loss": 0.72421253, "num_input_tokens_seen": 358000290, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 16590, "time_per_iteration": 2.9951558113098145 }, { "auxiliary_loss_clip": 0.01121337, "auxiliary_loss_mlp": 0.01030386, "balance_loss_clip": 1.01757097, "balance_loss_mlp": 1.03425312, "epoch": 0.9975048850142793, "flos": 32014614159360.0, "grad_norm": 1.8159373521841975, "language_loss": 0.63354576, "learning_rate": 5.999026622527382e-11, "loss": 0.65506303, "num_input_tokens_seen": 358022075, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 16591, "time_per_iteration": 3.062386989593506 }, { "auxiliary_loss_clip": 0.01118628, "auxiliary_loss_mlp": 0.0102629, "balance_loss_clip": 1.01452971, "balance_loss_mlp": 1.03201795, "epoch": 0.9975650082669473, "flos": 36933728895360.0, "grad_norm": 1.675099568573648, "language_loss": 0.73020518, "learning_rate": 5.709961277933217e-11, "loss": 0.75165439, "num_input_tokens_seen": 358043940, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 16592, "time_per_iteration": 2.769958972930908 }, { "auxiliary_loss_clip": 0.01119765, "auxiliary_loss_mlp": 0.01029904, "balance_loss_clip": 1.01705956, "balance_loss_mlp": 1.03368413, "epoch": 0.9976251315196152, "flos": 16472117537280.0, "grad_norm": 1.445447261156064, "language_loss": 0.85092443, "learning_rate": 5.428033215104655e-11, "loss": 0.87242115, "num_input_tokens_seen": 358062720, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 16593, "time_per_iteration": 2.725900888442993 }, { "auxiliary_loss_clip": 0.01097986, "auxiliary_loss_mlp": 0.0127489, "balance_loss_clip": 1.01655698, "balance_loss_mlp": 1.03283787, "epoch": 0.9976852547722832, "flos": 19646836179840.0, "grad_norm": 2.4020456378882145, "language_loss": 0.69274163, "learning_rate": 5.153242444122519e-11, "loss": 0.71647036, "num_input_tokens_seen": 358081560, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6484375, "step": 16594, "time_per_iteration": 2.6473097801208496 }, { "auxiliary_loss_clip": 0.01140822, "auxiliary_loss_mlp": 0.01026312, "balance_loss_clip": 1.01424825, "balance_loss_mlp": 1.03518724, "epoch": 0.9977453780249511, "flos": 20448434044800.0, "grad_norm": 3.89436135539184, "language_loss": 0.72561228, "learning_rate": 4.8855889747567715e-11, "loss": 0.7472837, "num_input_tokens_seen": 358099065, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.70703125, "step": 16595, "time_per_iteration": 2.6054418087005615 }, { "auxiliary_loss_clip": 0.01031767, "auxiliary_loss_mlp": 0.0099947, "balance_loss_clip": 0.99826032, "balance_loss_mlp": 1.00135016, "epoch": 0.9978055012776191, "flos": 65455097581440.0, "grad_norm": 0.7846330311837608, "language_loss": 0.60379905, "learning_rate": 4.625072816577535e-11, "loss": 0.62411141, "num_input_tokens_seen": 358156095, "router_z_loss_clip": 0.01208496, "router_z_loss_mlp": 0.21191406, "step": 16596, "time_per_iteration": 3.0867698192596436 }, { "auxiliary_loss_clip": 0.01111589, "auxiliary_loss_mlp": 0.01033726, "balance_loss_clip": 1.02148914, "balance_loss_mlp": 1.03507423, "epoch": 0.997865624530287, "flos": 20631506688000.0, "grad_norm": 1.7860445321232432, "language_loss": 0.77654159, "learning_rate": 4.3716939788662756e-11, "loss": 0.79799479, "num_input_tokens_seen": 358175230, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 16597, "time_per_iteration": 4.058161497116089 }, { "auxiliary_loss_clip": 0.01111691, "auxiliary_loss_mlp": 0.01030122, "balance_loss_clip": 1.01746249, "balance_loss_mlp": 1.03303814, "epoch": 0.9979257477829551, "flos": 29387102284800.0, "grad_norm": 2.2167491478026657, "language_loss": 0.82145095, "learning_rate": 4.1254524707046155e-11, "loss": 0.8428691, "num_input_tokens_seen": 358197075, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 16598, "time_per_iteration": 2.6091957092285156 }, { "auxiliary_loss_clip": 0.01123787, "auxiliary_loss_mlp": 0.01038358, "balance_loss_clip": 1.0252986, "balance_loss_mlp": 1.03361034, "epoch": 0.997985871035623, "flos": 22635070387200.0, "grad_norm": 1.947232603726535, "language_loss": 0.64505827, "learning_rate": 3.886348300841114e-11, "loss": 0.66667974, "num_input_tokens_seen": 358215925, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 16599, "time_per_iteration": 2.595008134841919 }, { "auxiliary_loss_clip": 0.01039164, "auxiliary_loss_mlp": 0.01001239, "balance_loss_clip": 1.00001144, "balance_loss_mlp": 1.00089061, "epoch": 0.998045994288291, "flos": 61806968663040.0, "grad_norm": 0.8509797154709493, "language_loss": 0.62419993, "learning_rate": 3.654381477824486e-11, "loss": 0.64460391, "num_input_tokens_seen": 358269035, "router_z_loss_clip": 0.01226807, "router_z_loss_mlp": 0.21191406, "step": 16600, "time_per_iteration": 3.09812331199646 }, { "auxiliary_loss_clip": 0.01118833, "auxiliary_loss_mlp": 0.01030389, "balance_loss_clip": 1.01937401, "balance_loss_mlp": 1.03443289, "epoch": 0.998106117540959, "flos": 19245534456960.0, "grad_norm": 1.5173273640463667, "language_loss": 0.78440064, "learning_rate": 3.4295520099147935e-11, "loss": 0.80589283, "num_input_tokens_seen": 358287680, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66796875, "step": 16601, "time_per_iteration": 2.574692487716675 }, { "auxiliary_loss_clip": 0.01132518, "auxiliary_loss_mlp": 0.01031532, "balance_loss_clip": 1.01912856, "balance_loss_mlp": 1.03391385, "epoch": 0.9981662407936269, "flos": 21106209853440.0, "grad_norm": 1.8382735383591187, "language_loss": 0.82553828, "learning_rate": 3.211859905172254e-11, "loss": 0.8471787, "num_input_tokens_seen": 358304080, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 16602, "time_per_iteration": 2.7607009410858154 }, { "auxiliary_loss_clip": 0.01129265, "auxiliary_loss_mlp": 0.01033712, "balance_loss_clip": 1.02173197, "balance_loss_mlp": 1.033988, "epoch": 0.998226364046295, "flos": 24316839118080.0, "grad_norm": 1.9982118119391494, "language_loss": 0.62447667, "learning_rate": 3.0013051713462244e-11, "loss": 0.64610636, "num_input_tokens_seen": 358323670, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 16603, "time_per_iteration": 2.6423957347869873 }, { "auxiliary_loss_clip": 0.01121297, "auxiliary_loss_mlp": 0.01028099, "balance_loss_clip": 1.01602316, "balance_loss_mlp": 1.03418314, "epoch": 0.9982864872989629, "flos": 23836389776640.0, "grad_norm": 1.755114814712391, "language_loss": 0.71195883, "learning_rate": 2.797887815941813e-11, "loss": 0.7334528, "num_input_tokens_seen": 358341980, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69140625, "step": 16604, "time_per_iteration": 4.285475254058838 }, { "auxiliary_loss_clip": 0.01101463, "auxiliary_loss_mlp": 0.01026606, "balance_loss_clip": 1.01384449, "balance_loss_mlp": 1.03292489, "epoch": 0.9983466105516309, "flos": 18333116156160.0, "grad_norm": 1.7646059458124628, "language_loss": 0.64453417, "learning_rate": 2.6016078462420822e-11, "loss": 0.66581488, "num_input_tokens_seen": 358360400, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.68359375, "step": 16605, "time_per_iteration": 3.9736692905426025 }, { "auxiliary_loss_clip": 0.01126035, "auxiliary_loss_mlp": 0.01025246, "balance_loss_clip": 1.01353335, "balance_loss_mlp": 1.03319895, "epoch": 0.9984067338042988, "flos": 17236763285760.0, "grad_norm": 1.7298491663999562, "language_loss": 0.71420211, "learning_rate": 2.4124652692192327e-11, "loss": 0.73571491, "num_input_tokens_seen": 358378990, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 16606, "time_per_iteration": 2.587059497833252 }, { "auxiliary_loss_clip": 0.01110487, "auxiliary_loss_mlp": 0.01279173, "balance_loss_clip": 1.02042556, "balance_loss_mlp": 1.03337204, "epoch": 0.9984668570569668, "flos": 20667884186880.0, "grad_norm": 1.9168765606373832, "language_loss": 0.8184247, "learning_rate": 2.230460091645625e-11, "loss": 0.84232128, "num_input_tokens_seen": 358395970, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 16607, "time_per_iteration": 2.5942060947418213 }, { "auxiliary_loss_clip": 0.01128815, "auxiliary_loss_mlp": 0.01030762, "balance_loss_clip": 1.01763093, "balance_loss_mlp": 1.03187191, "epoch": 0.9985269803096347, "flos": 30262532555520.0, "grad_norm": 1.5861199876125187, "language_loss": 0.67093301, "learning_rate": 2.0555923200049618e-11, "loss": 0.69252884, "num_input_tokens_seen": 358417355, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 16608, "time_per_iteration": 2.576659679412842 }, { "auxiliary_loss_clip": 0.01140169, "auxiliary_loss_mlp": 0.01031046, "balance_loss_clip": 1.01882124, "balance_loss_mlp": 1.03560793, "epoch": 0.9985871035623027, "flos": 10560970005120.0, "grad_norm": 4.515909260926855, "language_loss": 0.8087815, "learning_rate": 1.8878619605366963e-11, "loss": 0.83049369, "num_input_tokens_seen": 358434345, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 16609, "time_per_iteration": 2.7473902702331543 }, { "auxiliary_loss_clip": 0.01118847, "auxiliary_loss_mlp": 0.01029423, "balance_loss_clip": 1.01710916, "balance_loss_mlp": 1.03258908, "epoch": 0.9986472268149706, "flos": 16873455173760.0, "grad_norm": 1.4320110880770163, "language_loss": 0.62782031, "learning_rate": 1.7272690192582372e-11, "loss": 0.64930302, "num_input_tokens_seen": 358452870, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 16610, "time_per_iteration": 2.5818755626678467 }, { "auxiliary_loss_clip": 0.01110984, "auxiliary_loss_mlp": 0.0103439, "balance_loss_clip": 1.02243912, "balance_loss_mlp": 1.03302169, "epoch": 0.9987073500676387, "flos": 22054538776320.0, "grad_norm": 2.3127018674056403, "language_loss": 0.67810464, "learning_rate": 1.5738135018539268e-11, "loss": 0.69955838, "num_input_tokens_seen": 358472210, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 16611, "time_per_iteration": 2.630730390548706 }, { "auxiliary_loss_clip": 0.01129716, "auxiliary_loss_mlp": 0.01032823, "balance_loss_clip": 1.02043724, "balance_loss_mlp": 1.03410637, "epoch": 0.9987674733203066, "flos": 30482880537600.0, "grad_norm": 1.4584395500362588, "language_loss": 0.695921, "learning_rate": 1.4274954138304706e-11, "loss": 0.71754646, "num_input_tokens_seen": 358493840, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 16612, "time_per_iteration": 2.8087422847747803 }, { "auxiliary_loss_clip": 0.01117261, "auxiliary_loss_mlp": 0.01029761, "balance_loss_clip": 1.01844776, "balance_loss_mlp": 1.03360665, "epoch": 0.9988275965729746, "flos": 27745230585600.0, "grad_norm": 1.858696467301308, "language_loss": 0.73809022, "learning_rate": 1.2883147604059175e-11, "loss": 0.75956047, "num_input_tokens_seen": 358515060, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66015625, "step": 16613, "time_per_iteration": 3.1133580207824707 }, { "auxiliary_loss_clip": 0.01129586, "auxiliary_loss_mlp": 0.01273547, "balance_loss_clip": 1.01404643, "balance_loss_mlp": 1.03397822, "epoch": 0.9988877198256426, "flos": 17524191916800.0, "grad_norm": 2.225690806324155, "language_loss": 0.73771036, "learning_rate": 1.1562715465540663e-11, "loss": 0.7617417, "num_input_tokens_seen": 358528200, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 16614, "time_per_iteration": 2.8847012519836426 }, { "auxiliary_loss_clip": 0.01111802, "auxiliary_loss_mlp": 0.01031779, "balance_loss_clip": 1.02008486, "balance_loss_mlp": 1.03473067, "epoch": 0.9989478430783105, "flos": 20996502739200.0, "grad_norm": 1.719617905002304, "language_loss": 0.72652817, "learning_rate": 1.031365776960058e-11, "loss": 0.74796396, "num_input_tokens_seen": 358548360, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 16615, "time_per_iteration": 2.910912036895752 }, { "auxiliary_loss_clip": 0.01105485, "auxiliary_loss_mlp": 0.01276874, "balance_loss_clip": 1.01783597, "balance_loss_mlp": 1.03557062, "epoch": 0.9990079663309785, "flos": 13370620769280.0, "grad_norm": 2.040156191562389, "language_loss": 0.77305174, "learning_rate": 9.13597456109194e-12, "loss": 0.79687536, "num_input_tokens_seen": 358566270, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 16616, "time_per_iteration": 2.8666787147521973 }, { "auxiliary_loss_clip": 0.01120383, "auxiliary_loss_mlp": 0.01027963, "balance_loss_clip": 1.01566672, "balance_loss_mlp": 1.03421259, "epoch": 0.9990680895836465, "flos": 17310236555520.0, "grad_norm": 2.1674939288373567, "language_loss": 0.82859039, "learning_rate": 8.029665881759129e-12, "loss": 0.85007387, "num_input_tokens_seen": 358584710, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 16617, "time_per_iteration": 2.880540132522583 }, { "auxiliary_loss_clip": 0.01118432, "auxiliary_loss_mlp": 0.01026124, "balance_loss_clip": 1.01454258, "balance_loss_mlp": 1.03271508, "epoch": 0.9991282128363145, "flos": 24207993930240.0, "grad_norm": 1.6248154064521216, "language_loss": 0.78732812, "learning_rate": 6.994731771348128e-12, "loss": 0.8087737, "num_input_tokens_seen": 358606750, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 16618, "time_per_iteration": 2.8845980167388916 }, { "auxiliary_loss_clip": 0.0110005, "auxiliary_loss_mlp": 0.01029989, "balance_loss_clip": 1.01816368, "balance_loss_mlp": 1.03275669, "epoch": 0.9991883360889824, "flos": 21175301664000.0, "grad_norm": 1.850000728243277, "language_loss": 0.74490631, "learning_rate": 6.0311722667183426e-12, "loss": 0.76620668, "num_input_tokens_seen": 358624675, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 16619, "time_per_iteration": 2.842557191848755 }, { "auxiliary_loss_clip": 0.01107486, "auxiliary_loss_mlp": 0.01026689, "balance_loss_clip": 1.01574516, "balance_loss_mlp": 1.03301716, "epoch": 0.9992484593416504, "flos": 19901155449600.0, "grad_norm": 2.1796200955559715, "language_loss": 0.86988056, "learning_rate": 5.138987402286687e-12, "loss": 0.8912223, "num_input_tokens_seen": 358640715, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.65625, "step": 16620, "time_per_iteration": 2.864694595336914 }, { "auxiliary_loss_clip": 0.01124175, "auxiliary_loss_mlp": 0.01026428, "balance_loss_clip": 1.01479292, "balance_loss_mlp": 1.03183317, "epoch": 0.9993085825943183, "flos": 24857832833280.0, "grad_norm": 1.898392870131174, "language_loss": 0.72209656, "learning_rate": 4.31817720980554e-12, "loss": 0.74360251, "num_input_tokens_seen": 358659630, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66015625, "step": 16621, "time_per_iteration": 2.911081552505493 }, { "auxiliary_loss_clip": 0.01047716, "auxiliary_loss_mlp": 0.00999608, "balance_loss_clip": 0.99840349, "balance_loss_mlp": 1.0008409, "epoch": 0.9993687058469863, "flos": 71284478780160.0, "grad_norm": 0.7480424689218098, "language_loss": 0.56571198, "learning_rate": 3.568741718584789e-12, "loss": 0.58618522, "num_input_tokens_seen": 358727840, "router_z_loss_clip": 0.01202393, "router_z_loss_mlp": 0.2109375, "step": 16622, "time_per_iteration": 3.680511474609375 }, { "auxiliary_loss_clip": 0.01128861, "auxiliary_loss_mlp": 0.01027559, "balance_loss_clip": 1.01514316, "balance_loss_mlp": 1.03336513, "epoch": 0.9994288290996542, "flos": 12199573566720.0, "grad_norm": 1.9840421878934964, "language_loss": 0.7096355, "learning_rate": 2.890680955491831e-12, "loss": 0.73119974, "num_input_tokens_seen": 358744125, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 16623, "time_per_iteration": 2.858522415161133 }, { "auxiliary_loss_clip": 0.01127517, "auxiliary_loss_mlp": 0.01030604, "balance_loss_clip": 1.01740181, "balance_loss_mlp": 1.0329566, "epoch": 0.9994889523523223, "flos": 17889942153600.0, "grad_norm": 1.8418051218937452, "language_loss": 0.74797851, "learning_rate": 2.2839949445074834e-12, "loss": 0.76955974, "num_input_tokens_seen": 358761420, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.671875, "step": 16624, "time_per_iteration": 2.8325438499450684 }, { "auxiliary_loss_clip": 0.01128639, "auxiliary_loss_mlp": 0.01030009, "balance_loss_clip": 1.0173732, "balance_loss_mlp": 1.03265893, "epoch": 0.9995490756049902, "flos": 26578888064640.0, "grad_norm": 1.6338302419563473, "language_loss": 0.73501772, "learning_rate": 1.7486837073921178e-12, "loss": 0.75660419, "num_input_tokens_seen": 358782600, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 16625, "time_per_iteration": 2.9909346103668213 }, { "auxiliary_loss_clip": 0.01107796, "auxiliary_loss_mlp": 0.01031548, "balance_loss_clip": 1.02027094, "balance_loss_mlp": 1.03301573, "epoch": 0.9996091988576582, "flos": 22200048771840.0, "grad_norm": 2.1297074845231707, "language_loss": 0.76932216, "learning_rate": 1.28474726324157e-12, "loss": 0.79071558, "num_input_tokens_seen": 358801220, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66015625, "step": 16626, "time_per_iteration": 3.1152639389038086 }, { "auxiliary_loss_clip": 0.01101088, "auxiliary_loss_mlp": 0.01033725, "balance_loss_clip": 1.02150059, "balance_loss_mlp": 1.03472805, "epoch": 0.9996693221103262, "flos": 27373195468800.0, "grad_norm": 1.687938783584354, "language_loss": 0.82237411, "learning_rate": 8.921856287091856e-13, "loss": 0.84372222, "num_input_tokens_seen": 358819190, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 16627, "time_per_iteration": 2.9614155292510986 }, { "auxiliary_loss_clip": 0.01115298, "auxiliary_loss_mlp": 0.01035657, "balance_loss_clip": 1.02260971, "balance_loss_mlp": 1.03451455, "epoch": 0.9997294453629941, "flos": 26870410846080.0, "grad_norm": 2.0927416250137933, "language_loss": 0.70712441, "learning_rate": 5.709988175617297e-13, "loss": 0.72863394, "num_input_tokens_seen": 358839850, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 16628, "time_per_iteration": 3.0120882987976074 }, { "auxiliary_loss_clip": 0.01110821, "auxiliary_loss_mlp": 0.01027762, "balance_loss_clip": 1.01535273, "balance_loss_mlp": 1.03245425, "epoch": 0.9997895686156621, "flos": 23476996247040.0, "grad_norm": 1.3245222067993916, "language_loss": 0.75045115, "learning_rate": 3.211868415675667e-13, "loss": 0.77183694, "num_input_tokens_seen": 358859805, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 16629, "time_per_iteration": 2.8829541206359863 }, { "auxiliary_loss_clip": 0.01123727, "auxiliary_loss_mlp": 0.01035973, "balance_loss_clip": 1.02356315, "balance_loss_mlp": 1.03515863, "epoch": 0.9998496918683301, "flos": 20224961579520.0, "grad_norm": 3.7359091274914027, "language_loss": 0.60471928, "learning_rate": 1.4274970960848065e-13, "loss": 0.62631625, "num_input_tokens_seen": 358877900, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 16630, "time_per_iteration": 4.309621095657349 }, { "auxiliary_loss_clip": 0.01140426, "auxiliary_loss_mlp": 0.01026822, "balance_loss_clip": 1.01419187, "balance_loss_mlp": 1.03502584, "epoch": 0.9999098151209981, "flos": 21652913831040.0, "grad_norm": 2.1123220952257085, "language_loss": 0.60554433, "learning_rate": 3.5687427679675917e-14, "loss": 0.62721682, "num_input_tokens_seen": 358897285, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 16631, "time_per_iteration": 2.9355034828186035 }, { "auxiliary_loss_clip": 0.0113647, "auxiliary_loss_mlp": 0.01274443, "balance_loss_clip": 1.0166285, "balance_loss_mlp": 1.03429055, "epoch": 0.999969938373666, "flos": 11544599018880.0, "grad_norm": 1.8022607409401779, "language_loss": 0.72702289, "learning_rate": 0.0, "loss": 0.75113201, "num_input_tokens_seen": 358911570, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.65625, "step": 16632, "time_per_iteration": 2.763507604598999 }, { "epoch": 0.999969938373666, "num_input_tokens_seen": 358911570, "step": 16632, "total_flos": 1.3992169073237033e+18, "train_loss": 0.7687115174444493, "train_runtime": 47670.9027, "train_samples_per_second": 13.956, "train_steps_per_second": 0.349 } ], "logging_steps": 1.0, "max_steps": 16632, "num_input_tokens_seen": 358911570, "num_train_epochs": 1, "save_steps": 3328, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3992169073237033e+18, "train_batch_size": 5, "trial_name": null, "trial_params": null }